From 7cbd3aa7137fe3b9d165cb409036848af05b0a38 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Wed, 11 Feb 2026 09:44:11 +0000 Subject: [PATCH 01/40] feat(supervisor): add ComputeWorkloadManager for compute gateway Add a third WorkloadManager implementation that creates sandboxes via the compute gateway HTTP API (POST /api/sandboxes). Uses native fetch with no new dependencies. Enabled by setting COMPUTE_GATEWAY_URL, which takes priority over Kubernetes and Docker providers. --- apps/supervisor/src/env.ts | 4 + apps/supervisor/src/index.ts | 13 +- .../supervisor/src/workloadManager/compute.ts | 116 ++++++++++++++++++ 3 files changed, 130 insertions(+), 3 deletions(-) create mode 100644 apps/supervisor/src/workloadManager/compute.ts diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index faf34bcd025..77f34d04867 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -77,6 +77,10 @@ const Env = z.object({ */ DOCKER_RUNNER_NETWORKS: z.string().default("host"), + // Compute settings + COMPUTE_GATEWAY_URL: z.string().url().optional(), + COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), + // Kubernetes settings KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), KUBERNETES_NAMESPACE: z.string().default("default"), diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index 0e274b30390..bd16f54dd2b 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -14,6 +14,7 @@ import { } from "./resourceMonitor.js"; import { KubernetesWorkloadManager } from "./workloadManager/kubernetes.js"; import { DockerWorkloadManager } from "./workloadManager/docker.js"; +import { ComputeWorkloadManager } from "./workloadManager/compute.js"; import { HttpServer, CheckpointClient, @@ -77,9 +78,15 @@ class ManagedSupervisor { : new DockerResourceMonitor(new Docker()) : new NoopResourceMonitor(); - this.workloadManager = this.isKubernetes - ? 
new KubernetesWorkloadManager(workloadManagerOptions) - : new DockerWorkloadManager(workloadManagerOptions); + this.workloadManager = env.COMPUTE_GATEWAY_URL + ? new ComputeWorkloadManager({ + ...workloadManagerOptions, + gatewayUrl: env.COMPUTE_GATEWAY_URL, + gatewayAuthToken: env.COMPUTE_GATEWAY_AUTH_TOKEN, + }) + : this.isKubernetes + ? new KubernetesWorkloadManager(workloadManagerOptions) + : new DockerWorkloadManager(workloadManagerOptions); if (this.isKubernetes) { if (env.POD_CLEANER_ENABLED) { diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts new file mode 100644 index 00000000000..a984ca7794a --- /dev/null +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -0,0 +1,116 @@ +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { + type WorkloadManager, + type WorkloadManagerCreateOptions, + type WorkloadManagerOptions, +} from "./types.js"; +import { env } from "../env.js"; +import { getRunnerId } from "../util.js"; +import { tryCatch } from "@trigger.dev/core"; + +type ComputeWorkloadManagerOptions = WorkloadManagerOptions & { + gatewayUrl: string; + gatewayAuthToken?: string; +}; + +export class ComputeWorkloadManager implements WorkloadManager { + private readonly logger = new SimpleStructuredLogger("compute-workload-manager"); + + constructor(private opts: ComputeWorkloadManagerOptions) { + if (!opts.workloadApiDomain) { + this.logger.warn( + "⚠️ workloadApiDomain is unset — VMs need an explicit host IP to reach the supervisor" + ); + } + } + + async create(opts: WorkloadManagerCreateOptions) { + this.logger.log("create()", { opts }); + + const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); + + const envVars: Record = { + OTEL_EXPORTER_OTLP_ENDPOINT: env.OTEL_EXPORTER_OTLP_ENDPOINT, + TRIGGER_DEQUEUED_AT_MS: String(opts.dequeuedAt.getTime()), + TRIGGER_POD_SCHEDULED_AT_MS: String(Date.now()), + TRIGGER_ENV_ID: opts.envId, + 
TRIGGER_DEPLOYMENT_ID: opts.deploymentFriendlyId, + TRIGGER_DEPLOYMENT_VERSION: opts.deploymentVersion, + TRIGGER_RUN_ID: opts.runFriendlyId, + TRIGGER_SNAPSHOT_ID: opts.snapshotFriendlyId, + TRIGGER_SUPERVISOR_API_PROTOCOL: this.opts.workloadApiProtocol, + TRIGGER_SUPERVISOR_API_PORT: String(this.opts.workloadApiPort), + TRIGGER_SUPERVISOR_API_DOMAIN: this.opts.workloadApiDomain ?? "", + TRIGGER_WORKER_INSTANCE_NAME: env.TRIGGER_WORKER_INSTANCE_NAME, + TRIGGER_RUNNER_ID: runnerId, + TRIGGER_MACHINE_CPU: String(opts.machine.cpu), + TRIGGER_MACHINE_MEMORY: String(opts.machine.memory), + PRETTY_LOGS: String(env.RUNNER_PRETTY_LOGS), + }; + + if (this.opts.warmStartUrl) { + envVars.TRIGGER_WARM_START_URL = this.opts.warmStartUrl; + } + + if (this.opts.metadataUrl) { + envVars.TRIGGER_METADATA_URL = this.opts.metadataUrl; + } + + if (this.opts.heartbeatIntervalSeconds) { + envVars.TRIGGER_HEARTBEAT_INTERVAL_SECONDS = String(this.opts.heartbeatIntervalSeconds); + } + + if (this.opts.snapshotPollIntervalSeconds) { + envVars.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS = String(this.opts.snapshotPollIntervalSeconds); + } + + if (this.opts.additionalEnvVars) { + Object.assign(envVars, this.opts.additionalEnvVars); + } + + const headers: Record = { + "Content-Type": "application/json", + }; + + if (this.opts.gatewayAuthToken) { + headers["Authorization"] = `Bearer ${this.opts.gatewayAuthToken}`; + } + + const url = `${this.opts.gatewayUrl}/api/sandboxes`; + + const [fetchError, response] = await tryCatch( + fetch(url, { + method: "POST", + headers, + body: JSON.stringify({ + image: opts.image, + env: envVars, + }), + }) + ); + + if (fetchError) { + this.logger.error("Failed to create sandbox", { error: fetchError, url }); + return; + } + + if (!response.ok) { + const [bodyError, body] = await tryCatch(response.text()); + this.logger.error("Gateway returned error", { + status: response.status, + body: bodyError ? 
undefined : body, + url, + }); + return; + } + + const [parseError, data] = await tryCatch(response.json()); + + if (parseError) { + this.logger.error("Failed to parse gateway response", { error: parseError }); + return; + } + + this.logger.debug("create succeeded", { sandboxId: data.id, runnerId }); + } +} From 3175a10c73947cda95d7dfc5d37a28357194f5a2 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 12 Feb 2026 11:49:13 +0000 Subject: [PATCH 02/40] fix(supervisor): strip image digest in ComputeWorkloadManager --- apps/supervisor/src/workloadManager/compute.ts | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index a984ca7794a..ad01d7b6225 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -17,10 +17,10 @@ export class ComputeWorkloadManager implements WorkloadManager { private readonly logger = new SimpleStructuredLogger("compute-workload-manager"); constructor(private opts: ComputeWorkloadManagerOptions) { - if (!opts.workloadApiDomain) { - this.logger.warn( - "⚠️ workloadApiDomain is unset — VMs need an explicit host IP to reach the supervisor" - ); + if (opts.workloadApiDomain) { + this.logger.warn("⚠️ Custom workload API domain", { + domain: opts.workloadApiDomain, + }); } } @@ -76,6 +76,9 @@ export class ComputeWorkloadManager implements WorkloadManager { headers["Authorization"] = `Bearer ${this.opts.gatewayAuthToken}`; } + // Strip image digest — resolve by tag, not digest + const imageRef = opts.image.split("@")[0]!; + const url = `${this.opts.gatewayUrl}/api/sandboxes`; const [fetchError, response] = await tryCatch( @@ -83,7 +86,7 @@ export class ComputeWorkloadManager implements WorkloadManager { method: "POST", headers, body: JSON.stringify({ - image: opts.image, + image: imageRef, env: envVars, }), }) From 
56ef39f813ccff71bee51ff30957849751a84a21 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 12 Feb 2026 21:57:56 +0000 Subject: [PATCH 03/40] fix: add fetch timeout and wide event logging to ComputeWorkloadManager The fetch() call had no timeout, causing infinite hangs when the gateway accepted requests but never returned responses. Adds AbortSignal.timeout (30s) and consolidates all logging into a single structured event per create() call with timing, status, and error context. --- .../supervisor/src/workloadManager/compute.ts | 41 ++++++++++++++----- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index ad01d7b6225..a35cd951d6e 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -25,8 +25,6 @@ export class ComputeWorkloadManager implements WorkloadManager { } async create(opts: WorkloadManagerCreateOptions) { - this.logger.log("create()", { opts }); - const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); const envVars: Record = { @@ -81,10 +79,20 @@ export class ComputeWorkloadManager implements WorkloadManager { const url = `${this.opts.gatewayUrl}/api/sandboxes`; + const event: Record = { + runId: opts.runFriendlyId, + runnerId, + image: imageRef, + url, + }; + + const startMs = performance.now(); + const [fetchError, response] = await tryCatch( fetch(url, { method: "POST", headers, + signal: AbortSignal.timeout(30_000), body: JSON.stringify({ image: imageRef, env: envVars, @@ -92,28 +100,41 @@ export class ComputeWorkloadManager implements WorkloadManager { }) ); + event.durationMs = Math.round(performance.now() - startMs); + if (fetchError) { - this.logger.error("Failed to create sandbox", { error: fetchError, url }); + event.ok = false; + event.error = fetchError instanceof Error ? 
fetchError.message : String(fetchError); + event.errorType = + fetchError instanceof DOMException && fetchError.name === "TimeoutError" + ? "timeout" + : "fetch"; + this.logger.error("create sandbox", event); return; } + event.status = response.status; + if (!response.ok) { const [bodyError, body] = await tryCatch(response.text()); - this.logger.error("Gateway returned error", { - status: response.status, - body: bodyError ? undefined : body, - url, - }); + event.ok = false; + event.responseBody = bodyError ? undefined : body; + this.logger.error("create sandbox", event); return; } const [parseError, data] = await tryCatch(response.json()); if (parseError) { - this.logger.error("Failed to parse gateway response", { error: parseError }); + event.ok = false; + event.error = parseError instanceof Error ? parseError.message : String(parseError); + event.errorType = "parse"; + this.logger.error("create sandbox", event); return; } - this.logger.debug("create succeeded", { sandboxId: data.id, runnerId }); + event.ok = true; + event.sandboxId = data.id; + this.logger.log("create sandbox", event); } } From 1bccd1eb8eeddf2e825c2f941aa58f3a45401b6d Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 12 Feb 2026 22:00:22 +0000 Subject: [PATCH 04/40] feat: make gateway fetch timeout configurable --- apps/supervisor/src/env.ts | 1 + apps/supervisor/src/index.ts | 1 + apps/supervisor/src/workloadManager/compute.ts | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index 77f34d04867..a8750221a87 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -80,6 +80,7 @@ const Env = z.object({ // Compute settings COMPUTE_GATEWAY_URL: z.string().url().optional(), COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), + COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000), // Kubernetes settings KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), 
diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index bd16f54dd2b..bdf3332406a 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -83,6 +83,7 @@ class ManagedSupervisor { ...workloadManagerOptions, gatewayUrl: env.COMPUTE_GATEWAY_URL, gatewayAuthToken: env.COMPUTE_GATEWAY_AUTH_TOKEN, + gatewayTimeoutMs: env.COMPUTE_GATEWAY_TIMEOUT_MS, }) : this.isKubernetes ? new KubernetesWorkloadManager(workloadManagerOptions) diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index a35cd951d6e..26fbc99caf6 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -11,6 +11,7 @@ import { tryCatch } from "@trigger.dev/core"; type ComputeWorkloadManagerOptions = WorkloadManagerOptions & { gatewayUrl: string; gatewayAuthToken?: string; + gatewayTimeoutMs: number; }; export class ComputeWorkloadManager implements WorkloadManager { @@ -92,7 +93,7 @@ export class ComputeWorkloadManager implements WorkloadManager { fetch(url, { method: "POST", headers, - signal: AbortSignal.timeout(30_000), + signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), body: JSON.stringify({ image: imageRef, env: envVars, From 74817d75211bd371a1567767b1f28095c680e756 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 12 Feb 2026 23:54:34 +0000 Subject: [PATCH 05/40] refactor(supervisor): improve ComputeWorkloadManager wide event logging Emit a single canonical log line in a finally block instead of scattered log calls at each early return. Adds business context (envId, envType, orgId, projectId, deploymentVersion, machine) and instanceName to the event. Always emits at info level with ok=true/false for queryability. 
--- .../supervisor/src/workloadManager/compute.ts | 102 ++++++++++-------- 1 file changed, 55 insertions(+), 47 deletions(-) diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 26fbc99caf6..85a43112f0d 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -80,62 +80,70 @@ export class ComputeWorkloadManager implements WorkloadManager { const url = `${this.opts.gatewayUrl}/api/sandboxes`; + // Wide event: single canonical log line emitted in finally const event: Record = { + // High-cardinality identifiers runId: opts.runFriendlyId, runnerId, + envId: opts.envId, + envType: opts.envType, + orgId: opts.orgId, + projectId: opts.projectId, + deploymentVersion: opts.deploymentVersion, + machine: opts.machine.name, + // Environment + instanceName: env.TRIGGER_WORKER_INSTANCE_NAME, + // Request image: imageRef, url, }; const startMs = performance.now(); - const [fetchError, response] = await tryCatch( - fetch(url, { - method: "POST", - headers, - signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), - body: JSON.stringify({ - image: imageRef, - env: envVars, - }), - }) - ); - - event.durationMs = Math.round(performance.now() - startMs); - - if (fetchError) { - event.ok = false; - event.error = fetchError instanceof Error ? fetchError.message : String(fetchError); - event.errorType = - fetchError instanceof DOMException && fetchError.name === "TimeoutError" - ? "timeout" - : "fetch"; - this.logger.error("create sandbox", event); - return; + try { + const [fetchError, response] = await tryCatch( + fetch(url, { + method: "POST", + headers, + signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), + body: JSON.stringify({ + image: imageRef, + env: envVars, + }), + }) + ); + + if (fetchError) { + event.error = fetchError instanceof Error ? 
fetchError.message : String(fetchError); + event.errorType = + fetchError instanceof DOMException && fetchError.name === "TimeoutError" + ? "timeout" + : "fetch"; + return; + } + + event.status = response.status; + + if (!response.ok) { + const [bodyError, body] = await tryCatch(response.text()); + event.responseBody = bodyError ? undefined : body; + return; + } + + const [parseError, data] = await tryCatch(response.json()); + + if (parseError) { + event.error = parseError instanceof Error ? parseError.message : String(parseError); + event.errorType = "parse"; + return; + } + + event.sandboxId = data.id; + event.ok = true; + } finally { + event.durationMs = Math.round(performance.now() - startMs); + event.ok ??= false; + this.logger.info("create sandbox", event); } - - event.status = response.status; - - if (!response.ok) { - const [bodyError, body] = await tryCatch(response.text()); - event.ok = false; - event.responseBody = bodyError ? undefined : body; - this.logger.error("create sandbox", event); - return; - } - - const [parseError, data] = await tryCatch(response.json()); - - if (parseError) { - event.ok = false; - event.error = parseError instanceof Error ? parseError.message : String(parseError); - event.errorType = "parse"; - this.logger.error("create sandbox", event); - return; - } - - event.ok = true; - event.sandboxId = data.id; - this.logger.log("create sandbox", event); } } From a538735ac3c873b9cf0db1037b5d3fe6b6db0f53 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 13 Feb 2026 00:08:54 +0000 Subject: [PATCH 06/40] feat(supervisor): send metadata in compute sandbox creation requests Pass business context (runId, envId, orgId, projectId, machine, etc.) as metadata on CreateSandboxRequest instead of relying on env vars. This enables wide event logging in the compute stack without parsing env or leaking secrets. 
--- apps/supervisor/src/workloadManager/compute.ts | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 85a43112f0d..2cdd84a5ef4 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -109,6 +109,15 @@ export class ComputeWorkloadManager implements WorkloadManager { body: JSON.stringify({ image: imageRef, env: envVars, + metadata: { + runId: opts.runFriendlyId, + envId: opts.envId, + envType: opts.envType, + orgId: opts.orgId, + projectId: opts.projectId, + deploymentVersion: opts.deploymentVersion, + machine: opts.machine.name, + }, }), }) ); From ac3dadfc5b9aedf53647a8bab002de1246a650dd Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 13 Feb 2026 00:19:52 +0000 Subject: [PATCH 07/40] feat(supervisor): send machine cpu/memory in compute sandbox requests Passes machine preset cpu and memory as top-level fields on the CreateSandboxRequest so the compute stack can use them for admission control and resource allocation. 
--- apps/supervisor/src/workloadManager/compute.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 2cdd84a5ef4..355f8ae1760 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -109,6 +109,8 @@ export class ComputeWorkloadManager implements WorkloadManager { body: JSON.stringify({ image: imageRef, env: envVars, + cpu: opts.machine.cpu, + memory_gb: opts.machine.memory, metadata: { runId: opts.runFriendlyId, envId: opts.envId, From 5a7b8ce550d6ca2f512ab6fbac455635f33427aa Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Wed, 18 Feb 2026 16:26:28 +0000 Subject: [PATCH 08/40] feat(supervisor): add dequeue and warm start timing to wide event Thread timing context from queue consumer through to the compute workload manager's wide event: - dequeueResponseMs: platform dequeue HTTP round-trip - pollingIntervalMs: which polling interval was active (idle vs active) - warmStartCheckMs: warm start check duration All fields are optional to avoid breaking existing consumers. 
--- apps/supervisor/src/index.ts | 7 ++++++- apps/supervisor/src/workloadManager/compute.ts | 4 ++++ apps/supervisor/src/workloadManager/types.ts | 4 ++++ .../src/v3/runEngineWorker/supervisor/consumerPool.ts | 4 ++-- .../core/src/v3/runEngineWorker/supervisor/events.ts | 2 ++ .../v3/runEngineWorker/supervisor/queueConsumer.ts | 11 ++++++++--- .../core/src/v3/runEngineWorker/supervisor/session.ts | 4 +++- 7 files changed, 29 insertions(+), 7 deletions(-) diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index bdf3332406a..ede628f82a4 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -195,7 +195,7 @@ class ManagedSupervisor { this.workloadServer.notifyRun({ run }); }); - this.workerSession.on("runQueueMessage", async ({ time, message }) => { + this.workerSession.on("runQueueMessage", async ({ time, message, dequeueResponseMs, pollingIntervalMs }) => { this.logger.log(`Received message with timestamp ${time.toLocaleString()}`, message); if (message.completedWaitpoints.length > 0) { @@ -244,7 +244,9 @@ class ManagedSupervisor { this.logger.log("Scheduling run", { runId: message.run.id }); + const warmStartStart = performance.now(); const didWarmStart = await this.tryWarmStart(message); + const warmStartCheckMs = Math.round(performance.now() - warmStartStart); if (didWarmStart) { this.logger.log("Warm start successful", { runId: message.run.id }); @@ -260,6 +262,9 @@ class ManagedSupervisor { await this.workloadManager.create({ dequeuedAt: message.dequeuedAt, + dequeueResponseMs, + pollingIntervalMs, + warmStartCheckMs, envId: message.environment.id, envType: message.environment.type, image: message.image, diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 355f8ae1760..3363236ff49 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -93,6 +93,10 @@ export class ComputeWorkloadManager 
implements WorkloadManager { machine: opts.machine.name, // Environment instanceName: env.TRIGGER_WORKER_INSTANCE_NAME, + // Supervisor timing + dequeueResponseMs: opts.dequeueResponseMs, + pollingIntervalMs: opts.pollingIntervalMs, + warmStartCheckMs: opts.warmStartCheckMs, // Request image: imageRef, url, diff --git a/apps/supervisor/src/workloadManager/types.ts b/apps/supervisor/src/workloadManager/types.ts index 90b61957795..82c7ea7b4c0 100644 --- a/apps/supervisor/src/workloadManager/types.ts +++ b/apps/supervisor/src/workloadManager/types.ts @@ -24,6 +24,10 @@ export interface WorkloadManagerCreateOptions { nextAttemptNumber?: number; dequeuedAt: Date; placementTags?: PlacementTag[]; + // Timing context (populated by supervisor handler, included in wide event) + dequeueResponseMs?: number; + pollingIntervalMs?: number; + warmStartCheckMs?: number; // identifiers envId: string; envType: EnvironmentType; diff --git a/packages/core/src/v3/runEngineWorker/supervisor/consumerPool.ts b/packages/core/src/v3/runEngineWorker/supervisor/consumerPool.ts index 2dd3d1b898b..d72cef75c7c 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/consumerPool.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/consumerPool.ts @@ -351,12 +351,12 @@ export class RunQueueConsumerPool { const consumer = this.consumerFactory({ ...this.consumerOptions, - onDequeue: async (messages) => { + onDequeue: async (messages, timing) => { // Always update queue length, default to 0 for empty dequeues or missing value this.updateQueueLength(messages[0]?.workerQueueLength ?? 
0); // Forward to the original handler - await this.consumerOptions.onDequeue(messages); + await this.consumerOptions.onDequeue(messages, timing); }, }); diff --git a/packages/core/src/v3/runEngineWorker/supervisor/events.ts b/packages/core/src/v3/runEngineWorker/supervisor/events.ts index a51c504a3e6..df4a93686a9 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/events.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/events.ts @@ -6,6 +6,8 @@ export type WorkerEvents = { { time: Date; message: DequeuedMessage; + dequeueResponseMs?: number; + pollingIntervalMs?: number; }, ]; requestRunAttemptStart: [ diff --git a/packages/core/src/v3/runEngineWorker/supervisor/queueConsumer.ts b/packages/core/src/v3/runEngineWorker/supervisor/queueConsumer.ts index 4379eb54f37..76faee40809 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/queueConsumer.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/queueConsumer.ts @@ -15,7 +15,7 @@ export type RunQueueConsumerOptions = { preDequeue?: PreDequeueFn; preSkip?: PreSkipFn; maxRunCount?: number; - onDequeue: (messages: WorkerApiDequeueResponseBody) => Promise; + onDequeue: (messages: WorkerApiDequeueResponseBody, timing?: { dequeueResponseMs: number; pollingIntervalMs: number }) => Promise; }; export class RunQueueConsumer implements QueueConsumer { @@ -23,13 +23,14 @@ export class RunQueueConsumer implements QueueConsumer { private readonly preDequeue?: PreDequeueFn; private readonly preSkip?: PreSkipFn; private readonly maxRunCount?: number; - private readonly onDequeue: (messages: WorkerApiDequeueResponseBody) => Promise; + private readonly onDequeue: (messages: WorkerApiDequeueResponseBody, timing?: { dequeueResponseMs: number; pollingIntervalMs: number }) => Promise; private readonly logger = new SimpleStructuredLogger("queue-consumer"); private intervalMs: number; private idleIntervalMs: number; private isEnabled: boolean; + private lastScheduledIntervalMs: number; constructor(opts: 
RunQueueConsumerOptions) { this.isEnabled = false; @@ -38,6 +39,7 @@ export class RunQueueConsumer implements QueueConsumer { this.preDequeue = opts.preDequeue; this.preSkip = opts.preSkip; this.maxRunCount = opts.maxRunCount; + this.lastScheduledIntervalMs = opts.idleIntervalMs; this.onDequeue = opts.onDequeue; this.client = opts.client; } @@ -111,16 +113,18 @@ export class RunQueueConsumer implements QueueConsumer { let nextIntervalMs = this.idleIntervalMs; try { + const dequeueStart = performance.now(); const response = await this.client.dequeue({ maxResources: preDequeueResult?.maxResources, maxRunCount: this.maxRunCount, }); + const dequeueResponseMs = Math.round(performance.now() - dequeueStart); if (!response.success) { this.logger.error("Failed to dequeue", { error: response.error }); } else { try { - await this.onDequeue(response.data); + await this.onDequeue(response.data, { dequeueResponseMs, pollingIntervalMs: this.lastScheduledIntervalMs }); if (response.data.length > 0) { nextIntervalMs = this.intervalMs; @@ -141,6 +145,7 @@ export class RunQueueConsumer implements QueueConsumer { this.logger.verbose("scheduled dequeue with idle interval", { delayMs }); } + this.lastScheduledIntervalMs = delayMs; setTimeout(this.dequeue.bind(this), delayMs); } } diff --git a/packages/core/src/v3/runEngineWorker/supervisor/session.ts b/packages/core/src/v3/runEngineWorker/supervisor/session.ts index e5a783b8d41..b2d344fb3dc 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/session.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/session.ts @@ -80,13 +80,15 @@ export class SupervisorSession extends EventEmitter { }); } - private async onDequeue(messages: WorkerApiDequeueResponseBody): Promise { + private async onDequeue(messages: WorkerApiDequeueResponseBody, timing?: { dequeueResponseMs: number; pollingIntervalMs: number }): Promise { this.logger.verbose("Dequeued messages with contents", { count: messages.length, messages }); for (const message of 
messages) { this.emit("runQueueMessage", { time: new Date(), message, + dequeueResponseMs: timing?.dequeueResponseMs, + pollingIntervalMs: timing?.pollingIntervalMs, }); } } From 4d603adf675f4ef67db5d7b87c61996d8a389d60 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 20 Feb 2026 17:46:45 +0000 Subject: [PATCH 09/40] feat(supervisor): add compute checkpoint/restore support - Fix instance creation URL from /api/sandboxes to /api/instances - Pass name: runnerId when creating compute instances - Add snapshot(), deleteInstance(), and restore() methods to ComputeWorkloadManager - Add /api/v1/compute/snapshot-complete callback endpoint to WorkloadServer - Handle suspend requests in compute mode via fire-and-forget snapshot with callback - Handle restore in compute mode by calling gateway restore API directly - Wire computeManager into WorkloadServer for compute mode suspend/restore --- apps/supervisor/src/index.ts | 41 ++++-- .../supervisor/src/workloadManager/compute.ts | 124 ++++++++++++++++- apps/supervisor/src/workloadServer/index.ts | 126 ++++++++++++++++-- 3 files changed, 269 insertions(+), 22 deletions(-) diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index ede628f82a4..13c1ddb7f82 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -36,9 +36,11 @@ class ManagedSupervisor { private readonly metricsServer?: HttpServer; private readonly workloadServer: WorkloadServer; private readonly workloadManager: WorkloadManager; + private readonly computeManager?: ComputeWorkloadManager; private readonly logger = new SimpleStructuredLogger("managed-supervisor"); private readonly resourceMonitor: ResourceMonitor; private readonly checkpointClient?: CheckpointClient; + private readonly isComputeMode: boolean; private readonly podCleaner?: PodCleaner; private readonly failedPodHandler?: FailedPodHandler; @@ -78,16 +80,22 @@ class ManagedSupervisor { : new DockerResourceMonitor(new 
Docker()) : new NoopResourceMonitor(); - this.workloadManager = env.COMPUTE_GATEWAY_URL - ? new ComputeWorkloadManager({ - ...workloadManagerOptions, - gatewayUrl: env.COMPUTE_GATEWAY_URL, - gatewayAuthToken: env.COMPUTE_GATEWAY_AUTH_TOKEN, - gatewayTimeoutMs: env.COMPUTE_GATEWAY_TIMEOUT_MS, - }) - : this.isKubernetes + this.isComputeMode = !!env.COMPUTE_GATEWAY_URL; + + if (env.COMPUTE_GATEWAY_URL) { + const computeManager = new ComputeWorkloadManager({ + ...workloadManagerOptions, + gatewayUrl: env.COMPUTE_GATEWAY_URL, + gatewayAuthToken: env.COMPUTE_GATEWAY_AUTH_TOKEN, + gatewayTimeoutMs: env.COMPUTE_GATEWAY_TIMEOUT_MS, + }); + this.computeManager = computeManager; + this.workloadManager = computeManager; + } else { + this.workloadManager = this.isKubernetes ? new KubernetesWorkloadManager(workloadManagerOptions) : new DockerWorkloadManager(workloadManagerOptions); + } if (this.isKubernetes) { if (env.POD_CLEANER_ENABLED) { @@ -215,6 +223,22 @@ class ManagedSupervisor { if (checkpoint) { this.logger.log("Restoring run", { runId: message.run.id }); + if (this.isComputeMode && this.computeManager) { + try { + const didRestore = await this.computeManager.restore(checkpoint.location); + + if (didRestore) { + this.logger.log("Compute restore successful", { runId: message.run.id }); + } else { + this.logger.error("Compute restore failed", { runId: message.run.id }); + } + } catch (error) { + this.logger.error("Failed to restore run (compute)", { error }); + } + + return; + } + if (!this.checkpointClient) { this.logger.error("No checkpoint client", { runId: message.run.id }); return; @@ -309,6 +333,7 @@ class ManagedSupervisor { host: env.TRIGGER_WORKLOAD_API_HOST_INTERNAL, workerClient: this.workerSession.httpClient, checkpointClient: this.checkpointClient, + computeManager: this.computeManager, }); this.workloadServer.on("runConnected", this.onRunConnected.bind(this)); diff --git a/apps/supervisor/src/workloadManager/compute.ts 
b/apps/supervisor/src/workloadManager/compute.ts index 3363236ff49..1e1a3462413 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -60,7 +60,9 @@ export class ComputeWorkloadManager implements WorkloadManager { } if (this.opts.snapshotPollIntervalSeconds) { - envVars.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS = String(this.opts.snapshotPollIntervalSeconds); + envVars.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS = String( + this.opts.snapshotPollIntervalSeconds + ); } if (this.opts.additionalEnvVars) { @@ -78,7 +80,7 @@ export class ComputeWorkloadManager implements WorkloadManager { // Strip image digest — resolve by tag, not digest const imageRef = opts.image.split("@")[0]!; - const url = `${this.opts.gatewayUrl}/api/sandboxes`; + const url = `${this.opts.gatewayUrl}/api/instances`; // Wide event: single canonical log line emitted in finally const event: Record = { @@ -111,6 +113,7 @@ export class ComputeWorkloadManager implements WorkloadManager { headers, signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), body: JSON.stringify({ + name: runnerId, image: imageRef, env: envVars, cpu: opts.machine.cpu, @@ -153,12 +156,125 @@ export class ComputeWorkloadManager implements WorkloadManager { return; } - event.sandboxId = data.id; + event.instanceId = data.id; event.ok = true; } finally { event.durationMs = Math.round(performance.now() - startMs); event.ok ??= false; - this.logger.info("create sandbox", event); + this.logger.info("create instance", event); + } + } + + private get authHeaders(): Record { + const headers: Record = { + "Content-Type": "application/json", + }; + if (this.opts.gatewayAuthToken) { + headers["Authorization"] = `Bearer ${this.opts.gatewayAuthToken}`; + } + return headers; + } + + async snapshot(opts: { + runnerId: string; + callbackUrl: string; + metadata: Record; + }): Promise { + const url = `${this.opts.gatewayUrl}/api/instances/${opts.runnerId}/snapshot`; + + const [error, 
response] = await tryCatch( + fetch(url, { + method: "POST", + headers: this.authHeaders, + signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), + body: JSON.stringify({ + callback: { + url: opts.callbackUrl, + metadata: opts.metadata, + }, + }), + }) + ); + + if (error) { + this.logger.error("snapshot request failed", { + runnerId: opts.runnerId, + error: error instanceof Error ? error.message : String(error), + }); + return false; } + + if (response.status !== 202) { + this.logger.error("snapshot request rejected", { + runnerId: opts.runnerId, + status: response.status, + }); + return false; + } + + this.logger.info("snapshot request accepted", { runnerId: opts.runnerId }); + return true; + } + + async deleteInstance(runnerId: string): Promise { + const url = `${this.opts.gatewayUrl}/api/instances/${runnerId}`; + + const [error, response] = await tryCatch( + fetch(url, { + method: "DELETE", + headers: this.authHeaders, + signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), + }) + ); + + if (error) { + this.logger.error("delete instance failed", { + runnerId, + error: error instanceof Error ? error.message : String(error), + }); + return false; + } + + if (!response.ok) { + this.logger.error("delete instance rejected", { + runnerId, + status: response.status, + }); + return false; + } + + this.logger.info("delete instance success", { runnerId }); + return true; + } + + async restore(snapshotId: string): Promise { + const url = `${this.opts.gatewayUrl}/api/snapshots/${snapshotId}/restore`; + + const [error, response] = await tryCatch( + fetch(url, { + method: "POST", + headers: this.authHeaders, + signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), + }) + ); + + if (error) { + this.logger.error("restore request failed", { + snapshotId, + error: error instanceof Error ? 
error.message : String(error), + }); + return false; + } + + if (!response.ok) { + this.logger.error("restore request rejected", { + snapshotId, + status: response.status, + }); + return false; + } + + this.logger.info("restore request success", { snapshotId }); + return true; } } diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index 35d53d36099..6598da76a34 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -24,6 +24,7 @@ import { HttpServer, type CheckpointClient } from "@trigger.dev/core/v3/serverOn import { type IncomingMessage } from "node:http"; import { register } from "../metrics.js"; import { env } from "../env.js"; +import type { ComputeWorkloadManager } from "../workloadManager/compute.js"; // Use the official export when upgrading to socket.io@4.8.0 interface DefaultEventsMap { @@ -53,15 +54,25 @@ type WorkloadServerEvents = { ]; }; +const ComputeSnapshotCallbackBody = z.object({ + snapshot_id: z.string(), + instance_id: z.string(), + status: z.enum(["completed", "failed"]), + error: z.string().optional(), + metadata: z.record(z.string()).optional(), +}); + type WorkloadServerOptions = { port: number; host?: string; workerClient: SupervisorHttpClient; checkpointClient?: CheckpointClient; + computeManager?: ComputeWorkloadManager; }; export class WorkloadServer extends EventEmitter { private checkpointClient?: CheckpointClient; + private computeManager?: ComputeWorkloadManager; private readonly logger = new SimpleStructuredLogger("workload-server"); @@ -93,6 +104,7 @@ export class WorkloadServer extends EventEmitter { this.workerClient = opts.workerClient; this.checkpointClient = opts.checkpointClient; + this.computeManager = opts.computeManager; this.httpServer = this.createHttpServer({ host, port }); this.websocketServer = this.createWebsocketServer(); @@ -231,11 +243,19 @@ export class WorkloadServer extends EventEmitter { handler: async 
({ reply, params, req }) => { this.logger.debug("Suspend request", { params, headers: req.headers }); - if (!this.checkpointClient) { + const runnerId = this.runnerIdFromRequest(req); + const deploymentVersion = this.deploymentVersionFromRequest(req); + const projectRef = this.projectRefFromRequest(req); + + if (!runnerId || !deploymentVersion || !projectRef) { + this.logger.error("Invalid headers for suspend request", { + ...params, + headers: req.headers, + }); reply.json( { ok: false, - error: "Checkpoints disabled", + error: "Invalid headers", } satisfies WorkloadSuspendRunResponseBody, false, 400 @@ -243,19 +263,35 @@ export class WorkloadServer extends EventEmitter { return; } - const runnerId = this.runnerIdFromRequest(req); - const deploymentVersion = this.deploymentVersionFromRequest(req); - const projectRef = this.projectRefFromRequest(req); + if (this.computeManager) { + // Compute mode: fire-and-forget snapshot with callback + reply.json({ ok: true } satisfies WorkloadSuspendRunResponseBody, false, 202); - if (!runnerId || !deploymentVersion || !projectRef) { - this.logger.error("Invalid headers for suspend request", { - ...params, - headers: req.headers, + const callbackUrl = `${env.TRIGGER_WORKLOAD_API_PROTOCOL}://${ + env.TRIGGER_WORKLOAD_API_DOMAIN ?? 
"localhost" + }:${env.TRIGGER_WORKLOAD_API_PORT_EXTERNAL}/api/v1/compute/snapshot-complete`; + + const snapshotResult = await this.computeManager.snapshot({ + runnerId, + callbackUrl, + metadata: { + runId: params.runFriendlyId, + snapshotFriendlyId: params.snapshotFriendlyId, + }, }); + + if (!snapshotResult) { + this.logger.error("Failed to request compute snapshot", { params, runnerId }); + } + + return; + } + + if (!this.checkpointClient) { reply.json( { ok: false, - error: "Invalid headers", + error: "Checkpoints disabled", } satisfies WorkloadSuspendRunResponseBody, false, 400 @@ -394,6 +430,76 @@ export class WorkloadServer extends EventEmitter { }); } + // Compute snapshot callback endpoint + httpServer.route("/api/v1/compute/snapshot-complete", "POST", { + bodySchema: ComputeSnapshotCallbackBody, + handler: async ({ reply, body }) => { + this.logger.info("Compute snapshot callback", { + snapshotId: body.snapshot_id, + instanceId: body.instance_id, + status: body.status, + error: body.error, + metadata: body.metadata, + }); + + const runId = body.metadata?.runId; + const snapshotFriendlyId = body.metadata?.snapshotFriendlyId; + + if (!runId || !snapshotFriendlyId) { + this.logger.error("Compute snapshot callback missing metadata", { body }); + reply.empty(400); + return; + } + + if (body.status === "completed") { + const result = await this.workerClient.submitSuspendCompletion({ + runId, + snapshotId: snapshotFriendlyId, + body: { + success: true, + checkpoint: { + type: "KUBERNETES", + location: body.snapshot_id, + }, + }, + }); + + if (result.success) { + this.logger.info("Suspend completion submitted, deleting instance", { + runId, + instanceId: body.instance_id, + }); + await this.computeManager?.deleteInstance(body.instance_id); + } else { + this.logger.error("Failed to submit suspend completion", { + runId, + snapshotFriendlyId, + error: result.error, + }); + } + } else { + const result = await this.workerClient.submitSuspendCompletion({ + runId, + 
snapshotId: snapshotFriendlyId, + body: { + success: false, + error: body.error ?? "Snapshot failed", + }, + }); + + if (!result.success) { + this.logger.error("Failed to submit suspend failure", { + runId, + snapshotFriendlyId, + error: result.error, + }); + } + } + + reply.empty(200); + }, + }); + return httpServer; } From c1511f9db1981980f0e1dd34ecc25d916cbb4610 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 20 Feb 2026 21:46:18 +0000 Subject: [PATCH 10/40] fix(cli): fix --load flag on local/self-hosted builds --- .changeset/fix-local-build-load.md | 5 +++++ packages/cli-v3/src/deploy/buildImage.ts | 26 +++++++++++++++++++----- 2 files changed, 26 insertions(+), 5 deletions(-) create mode 100644 .changeset/fix-local-build-load.md diff --git a/.changeset/fix-local-build-load.md b/.changeset/fix-local-build-load.md new file mode 100644 index 00000000000..13f91da9d6a --- /dev/null +++ b/.changeset/fix-local-build-load.md @@ -0,0 +1,5 @@ +--- +"trigger.dev": patch +--- + +Fix `--load` flag being silently ignored on local/self-hosted builds. diff --git a/packages/cli-v3/src/deploy/buildImage.ts b/packages/cli-v3/src/deploy/buildImage.ts index 2225d7db056..31a2b658545 100644 --- a/packages/cli-v3/src/deploy/buildImage.ts +++ b/packages/cli-v3/src/deploy/buildImage.ts @@ -205,6 +205,7 @@ async function remoteBuildImage(options: DepotBuildImageOptions): Promise Date: Sat, 21 Feb 2026 02:28:43 +0000 Subject: [PATCH 11/40] feat(supervisor): pass name, metadata, and resources in compute restore request Restore calls now send a request body with the runner name, env override metadata, cpu, and memory so the agent can inject them before the VM resumes. The runner fetches these overrides from TRIGGER_METADATA_URL at restore time. runnerId is derived per restore cycle as runner-{runIdShort}-{checkpointSuffix}, matching iceman's pattern. 
--- apps/supervisor/src/index.ts | 17 +++++++-- .../supervisor/src/workloadManager/compute.ts | 37 ++++++++++++++++--- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index 13c1ddb7f82..5ba2bbf7e2c 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -225,12 +225,23 @@ class ManagedSupervisor { if (this.isComputeMode && this.computeManager) { try { - const didRestore = await this.computeManager.restore(checkpoint.location); + // Derive runnerId unique per restore cycle (matches iceman's pattern) + const runIdShort = message.run.friendlyId.replace("run_", ""); + const checkpointSuffix = checkpoint.id.slice(-8); + const runnerId = `runner-${runIdShort}-${checkpointSuffix}`; + + const didRestore = await this.computeManager.restore({ + snapshotId: checkpoint.location, + runnerId, + runFriendlyId: message.run.friendlyId, + snapshotFriendlyId: message.snapshot.friendlyId, + machine: message.run.machine, + }); if (didRestore) { - this.logger.log("Compute restore successful", { runId: message.run.id }); + this.logger.log("Compute restore successful", { runId: message.run.id, runnerId }); } else { - this.logger.error("Compute restore failed", { runId: message.run.id }); + this.logger.error("Compute restore failed", { runId: message.run.id, runnerId }); } } catch (error) { this.logger.error("Failed to restore run (compute)", { error }); diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 1e1a3462413..1d9e905ce5d 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -247,20 +247,43 @@ export class ComputeWorkloadManager implements WorkloadManager { return true; } - async restore(snapshotId: string): Promise { - const url = `${this.opts.gatewayUrl}/api/snapshots/${snapshotId}/restore`; + async restore(opts: { + snapshotId: string; + runnerId: string; + 
runFriendlyId: string; + snapshotFriendlyId: string; + machine: { cpu: number; memory: number }; + }): Promise { + const url = `${this.opts.gatewayUrl}/api/snapshots/${opts.snapshotId}/restore`; + + const metadata: Record = { + TRIGGER_RUNNER_ID: opts.runnerId, + TRIGGER_RUN_ID: opts.runFriendlyId, + TRIGGER_SNAPSHOT_ID: opts.snapshotFriendlyId, + TRIGGER_SUPERVISOR_API_PROTOCOL: this.opts.workloadApiProtocol, + TRIGGER_SUPERVISOR_API_PORT: String(this.opts.workloadApiPort), + TRIGGER_SUPERVISOR_API_DOMAIN: this.opts.workloadApiDomain ?? "", + TRIGGER_WORKER_INSTANCE_NAME: env.TRIGGER_WORKER_INSTANCE_NAME, + }; const [error, response] = await tryCatch( fetch(url, { method: "POST", headers: this.authHeaders, signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), + body: JSON.stringify({ + name: opts.runnerId, + metadata, + cpu: opts.machine.cpu, + memory_mb: opts.machine.memory * 1024, + }), }) ); if (error) { this.logger.error("restore request failed", { - snapshotId, + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, error: error instanceof Error ? error.message : String(error), }); return false; @@ -268,13 +291,17 @@ export class ComputeWorkloadManager implements WorkloadManager { if (!response.ok) { this.logger.error("restore request rejected", { - snapshotId, + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, status: response.status, }); return false; } - this.logger.info("restore request success", { snapshotId }); + this.logger.info("restore request success", { + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, + }); return true; } } From 43327439445436824492ff32adf0e280db89223d Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Sat, 21 Feb 2026 11:28:26 +0000 Subject: [PATCH 12/40] feat(supervisor): add flag to enable compute snapshots Gates snapshot/restore behaviour independently of compute mode. When disabled, VMs won't receive the metadata URL and suspend/restore are no-ops. 
Defaults to off so compute mode can be used without snapshots. --- apps/supervisor/src/env.ts | 1 + apps/supervisor/src/index.ts | 2 +- apps/supervisor/src/workloadManager/compute.ts | 18 +++++++++++------- apps/supervisor/src/workloadServer/index.ts | 2 +- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index a8750221a87..3cf69513818 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -81,6 +81,7 @@ const Env = z.object({ COMPUTE_GATEWAY_URL: z.string().url().optional(), COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000), + COMPUTE_SNAPSHOTS_ENABLED: BoolEnv.default(false), // Kubernetes settings KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index 5ba2bbf7e2c..dd91591aba6 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -223,7 +223,7 @@ class ManagedSupervisor { if (checkpoint) { this.logger.log("Restoring run", { runId: message.run.id }); - if (this.isComputeMode && this.computeManager) { + if (this.isComputeMode && this.computeManager && env.COMPUTE_SNAPSHOTS_ENABLED) { try { // Derive runnerId unique per restore cycle (matches iceman's pattern) const runIdShort = message.run.friendlyId.replace("run_", ""); diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 1d9e905ce5d..ea7be55d1a9 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -51,7 +51,7 @@ export class ComputeWorkloadManager implements WorkloadManager { envVars.TRIGGER_WARM_START_URL = this.opts.warmStartUrl; } - if (this.opts.metadataUrl) { + if (env.COMPUTE_SNAPSHOTS_ENABLED && this.opts.metadataUrl) { envVars.TRIGGER_METADATA_URL = this.opts.metadataUrl; } @@ -266,17 +266,21 @@ export class 
ComputeWorkloadManager implements WorkloadManager { TRIGGER_WORKER_INSTANCE_NAME: env.TRIGGER_WORKER_INSTANCE_NAME, }; + const body = { + name: opts.runnerId, + metadata, + cpu: opts.machine.cpu, + memory_mb: opts.machine.memory * 1024, + }; + + this.logger.debug("restore request body", { url, body }); + const [error, response] = await tryCatch( fetch(url, { method: "POST", headers: this.authHeaders, signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), - body: JSON.stringify({ - name: opts.runnerId, - metadata, - cpu: opts.machine.cpu, - memory_mb: opts.machine.memory * 1024, - }), + body: JSON.stringify(body), }) ); diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index 6598da76a34..0586c259a39 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -263,7 +263,7 @@ export class WorkloadServer extends EventEmitter { return; } - if (this.computeManager) { + if (this.computeManager && env.COMPUTE_SNAPSHOTS_ENABLED) { // Compute mode: fire-and-forget snapshot with callback reply.json({ ok: true } satisfies WorkloadSuspendRunResponseBody, false, 202); From 5089bba33577d68db15321b4e9ab20d1e1c184b5 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Mon, 23 Feb 2026 11:49:55 +0000 Subject: [PATCH 13/40] feat(supervisor): require metadata URL when compute snapshots enabled --- apps/supervisor/src/env.ts | 300 +++++++++++++++++++------------------ 1 file changed, 157 insertions(+), 143 deletions(-) diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index 3cf69513818..33478bab64b 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -3,148 +3,162 @@ import { env as stdEnv } from "std-env"; import { z } from "zod"; import { AdditionalEnvVars, BoolEnv } from "./envUtil.js"; -const Env = z.object({ - // This will come from `spec.nodeName` in k8s - TRIGGER_WORKER_INSTANCE_NAME: 
z.string().default(randomUUID()), - TRIGGER_WORKER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().default(30), - - // Required settings - TRIGGER_API_URL: z.string().url(), - TRIGGER_WORKER_TOKEN: z.string(), // accepts file:// path to read from a file - MANAGED_WORKER_SECRET: z.string(), - OTEL_EXPORTER_OTLP_ENDPOINT: z.string().url(), // set on the runners - - // Workload API settings (coordinator mode) - the workload API is what the run controller connects to - TRIGGER_WORKLOAD_API_ENABLED: BoolEnv.default(true), - TRIGGER_WORKLOAD_API_PROTOCOL: z - .string() - .transform((s) => z.enum(["http", "https"]).parse(s.toLowerCase())) - .default("http"), - TRIGGER_WORKLOAD_API_DOMAIN: z.string().optional(), // If unset, will use orchestrator-specific default - TRIGGER_WORKLOAD_API_HOST_INTERNAL: z.string().default("0.0.0.0"), - TRIGGER_WORKLOAD_API_PORT_INTERNAL: z.coerce.number().default(8020), // This is the port the workload API listens on - TRIGGER_WORKLOAD_API_PORT_EXTERNAL: z.coerce.number().default(8020), // This is the exposed port passed to the run controller - - // Runner settings - RUNNER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().optional(), - RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS: z.coerce.number().optional(), - RUNNER_ADDITIONAL_ENV_VARS: AdditionalEnvVars, // optional (csv) - RUNNER_PRETTY_LOGS: BoolEnv.default(false), - - // Dequeue settings (provider mode) - TRIGGER_DEQUEUE_ENABLED: BoolEnv.default(true), - TRIGGER_DEQUEUE_INTERVAL_MS: z.coerce.number().int().default(250), - TRIGGER_DEQUEUE_IDLE_INTERVAL_MS: z.coerce.number().int().default(1000), - TRIGGER_DEQUEUE_MAX_RUN_COUNT: z.coerce.number().int().default(1), - TRIGGER_DEQUEUE_MIN_CONSUMER_COUNT: z.coerce.number().int().default(1), - TRIGGER_DEQUEUE_MAX_CONSUMER_COUNT: z.coerce.number().int().default(10), - TRIGGER_DEQUEUE_SCALING_STRATEGY: z.enum(["none", "smooth", "aggressive"]).default("none"), - TRIGGER_DEQUEUE_SCALING_UP_COOLDOWN_MS: z.coerce.number().int().default(5000), // 5 seconds - 
TRIGGER_DEQUEUE_SCALING_DOWN_COOLDOWN_MS: z.coerce.number().int().default(30000), // 30 seconds - TRIGGER_DEQUEUE_SCALING_TARGET_RATIO: z.coerce.number().default(1.0), // Target ratio of queue items to consumers (1.0 = 1 item per consumer) - TRIGGER_DEQUEUE_SCALING_EWMA_ALPHA: z.coerce.number().min(0).max(1).default(0.3), // Smooths queue length measurements (0=historical, 1=current) - TRIGGER_DEQUEUE_SCALING_BATCH_WINDOW_MS: z.coerce.number().int().positive().default(1000), // Batch window for metrics processing (ms) - TRIGGER_DEQUEUE_SCALING_DAMPING_FACTOR: z.coerce.number().min(0).max(1).default(0.7), // Smooths consumer count changes after EWMA (0=no scaling, 1=immediate) - - // Optional services - TRIGGER_WARM_START_URL: z.string().optional(), - TRIGGER_CHECKPOINT_URL: z.string().optional(), - TRIGGER_METADATA_URL: z.string().optional(), - - // Used by the resource monitor - RESOURCE_MONITOR_ENABLED: BoolEnv.default(false), - RESOURCE_MONITOR_OVERRIDE_CPU_TOTAL: z.coerce.number().optional(), - RESOURCE_MONITOR_OVERRIDE_MEMORY_TOTAL_GB: z.coerce.number().optional(), - - // Docker settings - DOCKER_API_VERSION: z.string().optional(), - DOCKER_PLATFORM: z.string().optional(), // e.g. linux/amd64, linux/arm64 - DOCKER_STRIP_IMAGE_DIGEST: BoolEnv.default(true), - DOCKER_REGISTRY_USERNAME: z.string().optional(), - DOCKER_REGISTRY_PASSWORD: z.string().optional(), - DOCKER_REGISTRY_URL: z.string().optional(), // e.g. https://index.docker.io/v1 - DOCKER_ENFORCE_MACHINE_PRESETS: BoolEnv.default(true), - DOCKER_AUTOREMOVE_EXITED_CONTAINERS: BoolEnv.default(true), - /** - * Network mode to use for all runners. Supported standard values are: `bridge`, `host`, `none`, and `container:`. - * Any other value is taken as a custom network's name to which all runners should connect to. - * - * Accepts a list of comma-separated values to attach to multiple networks. Additional networks are interpreted as network names and will be attached after container creation. 
- * - * **WARNING**: Specifying multiple networks will slightly increase startup times. - * - * @default "host" - */ - DOCKER_RUNNER_NETWORKS: z.string().default("host"), - - // Compute settings - COMPUTE_GATEWAY_URL: z.string().url().optional(), - COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), - COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000), - COMPUTE_SNAPSHOTS_ENABLED: BoolEnv.default(false), - - // Kubernetes settings - KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), - KUBERNETES_NAMESPACE: z.string().default("default"), - KUBERNETES_WORKER_NODETYPE_LABEL: z.string().default("v4-worker"), - KUBERNETES_IMAGE_PULL_SECRETS: z.string().optional(), // csv - KUBERNETES_EPHEMERAL_STORAGE_SIZE_LIMIT: z.string().default("10Gi"), - KUBERNETES_EPHEMERAL_STORAGE_SIZE_REQUEST: z.string().default("2Gi"), - KUBERNETES_STRIP_IMAGE_DIGEST: BoolEnv.default(false), - KUBERNETES_CPU_REQUEST_MIN_CORES: z.coerce.number().min(0).default(0), - KUBERNETES_CPU_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(0.75), // Ratio of CPU limit, so 0.75 = 75% of CPU limit - KUBERNETES_MEMORY_REQUEST_MIN_GB: z.coerce.number().min(0).default(0), - KUBERNETES_MEMORY_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(1), // Ratio of memory limit, so 1 = 100% of memory limit - - // Per-preset overrides of the global KUBERNETES_CPU_REQUEST_RATIO - KUBERNETES_CPU_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), - - // Per-preset overrides of the 
global KUBERNETES_MEMORY_REQUEST_RATIO - KUBERNETES_MEMORY_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), - - KUBERNETES_MEMORY_OVERHEAD_GB: z.coerce.number().min(0).optional(), // Optional memory overhead to add to the limit in GB - KUBERNETES_SCHEDULER_NAME: z.string().optional(), // Custom scheduler name for pods - KUBERNETES_LARGE_MACHINE_POOL_LABEL: z.string().optional(), // if set, large-* presets affinity for machinepool= - - // Project affinity settings - pods from the same project prefer the same node - KUBERNETES_PROJECT_AFFINITY_ENABLED: BoolEnv.default(false), - KUBERNETES_PROJECT_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(50), - KUBERNETES_PROJECT_AFFINITY_TOPOLOGY_KEY: z.string().trim().min(1).default("kubernetes.io/hostname"), - - // Placement tags settings - PLACEMENT_TAGS_ENABLED: BoolEnv.default(false), - PLACEMENT_TAGS_PREFIX: z.string().default("node.cluster.x-k8s.io"), - - // Metrics - METRICS_ENABLED: BoolEnv.default(true), - METRICS_COLLECT_DEFAULTS: BoolEnv.default(true), - METRICS_HOST: z.string().default("127.0.0.1"), - METRICS_PORT: z.coerce.number().int().default(9090), - - // Pod cleaner - POD_CLEANER_ENABLED: BoolEnv.default(true), - POD_CLEANER_INTERVAL_MS: z.coerce.number().int().default(10000), - POD_CLEANER_BATCH_SIZE: z.coerce.number().int().default(500), - - // Failed pod handler - FAILED_POD_HANDLER_ENABLED: BoolEnv.default(true), - 
FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS: z.coerce.number().int().default(1000), - - // Debug - DEBUG: BoolEnv.default(false), - SEND_RUN_DEBUG_LOGS: BoolEnv.default(false), -}); +const Env = z + .object({ + // This will come from `spec.nodeName` in k8s + TRIGGER_WORKER_INSTANCE_NAME: z.string().default(randomUUID()), + TRIGGER_WORKER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().default(30), + + // Required settings + TRIGGER_API_URL: z.string().url(), + TRIGGER_WORKER_TOKEN: z.string(), // accepts file:// path to read from a file + MANAGED_WORKER_SECRET: z.string(), + OTEL_EXPORTER_OTLP_ENDPOINT: z.string().url(), // set on the runners + + // Workload API settings (coordinator mode) - the workload API is what the run controller connects to + TRIGGER_WORKLOAD_API_ENABLED: BoolEnv.default(true), + TRIGGER_WORKLOAD_API_PROTOCOL: z + .string() + .transform((s) => z.enum(["http", "https"]).parse(s.toLowerCase())) + .default("http"), + TRIGGER_WORKLOAD_API_DOMAIN: z.string().optional(), // If unset, will use orchestrator-specific default + TRIGGER_WORKLOAD_API_HOST_INTERNAL: z.string().default("0.0.0.0"), + TRIGGER_WORKLOAD_API_PORT_INTERNAL: z.coerce.number().default(8020), // This is the port the workload API listens on + TRIGGER_WORKLOAD_API_PORT_EXTERNAL: z.coerce.number().default(8020), // This is the exposed port passed to the run controller + + // Runner settings + RUNNER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().optional(), + RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS: z.coerce.number().optional(), + RUNNER_ADDITIONAL_ENV_VARS: AdditionalEnvVars, // optional (csv) + RUNNER_PRETTY_LOGS: BoolEnv.default(false), + + // Dequeue settings (provider mode) + TRIGGER_DEQUEUE_ENABLED: BoolEnv.default(true), + TRIGGER_DEQUEUE_INTERVAL_MS: z.coerce.number().int().default(250), + TRIGGER_DEQUEUE_IDLE_INTERVAL_MS: z.coerce.number().int().default(1000), + TRIGGER_DEQUEUE_MAX_RUN_COUNT: z.coerce.number().int().default(1), + TRIGGER_DEQUEUE_MIN_CONSUMER_COUNT: 
z.coerce.number().int().default(1), + TRIGGER_DEQUEUE_MAX_CONSUMER_COUNT: z.coerce.number().int().default(10), + TRIGGER_DEQUEUE_SCALING_STRATEGY: z.enum(["none", "smooth", "aggressive"]).default("none"), + TRIGGER_DEQUEUE_SCALING_UP_COOLDOWN_MS: z.coerce.number().int().default(5000), // 5 seconds + TRIGGER_DEQUEUE_SCALING_DOWN_COOLDOWN_MS: z.coerce.number().int().default(30000), // 30 seconds + TRIGGER_DEQUEUE_SCALING_TARGET_RATIO: z.coerce.number().default(1.0), // Target ratio of queue items to consumers (1.0 = 1 item per consumer) + TRIGGER_DEQUEUE_SCALING_EWMA_ALPHA: z.coerce.number().min(0).max(1).default(0.3), // Smooths queue length measurements (0=historical, 1=current) + TRIGGER_DEQUEUE_SCALING_BATCH_WINDOW_MS: z.coerce.number().int().positive().default(1000), // Batch window for metrics processing (ms) + TRIGGER_DEQUEUE_SCALING_DAMPING_FACTOR: z.coerce.number().min(0).max(1).default(0.7), // Smooths consumer count changes after EWMA (0=no scaling, 1=immediate) + + // Optional services + TRIGGER_WARM_START_URL: z.string().optional(), + TRIGGER_CHECKPOINT_URL: z.string().optional(), + TRIGGER_METADATA_URL: z.string().optional(), + + // Used by the resource monitor + RESOURCE_MONITOR_ENABLED: BoolEnv.default(false), + RESOURCE_MONITOR_OVERRIDE_CPU_TOTAL: z.coerce.number().optional(), + RESOURCE_MONITOR_OVERRIDE_MEMORY_TOTAL_GB: z.coerce.number().optional(), + + // Docker settings + DOCKER_API_VERSION: z.string().optional(), + DOCKER_PLATFORM: z.string().optional(), // e.g. linux/amd64, linux/arm64 + DOCKER_STRIP_IMAGE_DIGEST: BoolEnv.default(true), + DOCKER_REGISTRY_USERNAME: z.string().optional(), + DOCKER_REGISTRY_PASSWORD: z.string().optional(), + DOCKER_REGISTRY_URL: z.string().optional(), // e.g. https://index.docker.io/v1 + DOCKER_ENFORCE_MACHINE_PRESETS: BoolEnv.default(true), + DOCKER_AUTOREMOVE_EXITED_CONTAINERS: BoolEnv.default(true), + /** + * Network mode to use for all runners. 
Supported standard values are: `bridge`, `host`, `none`, and `container:`. + * Any other value is taken as a custom network's name to which all runners should connect to. + * + * Accepts a list of comma-separated values to attach to multiple networks. Additional networks are interpreted as network names and will be attached after container creation. + * + * **WARNING**: Specifying multiple networks will slightly increase startup times. + * + * @default "host" + */ + DOCKER_RUNNER_NETWORKS: z.string().default("host"), + + // Compute settings + COMPUTE_GATEWAY_URL: z.string().url().optional(), + COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), + COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000), + COMPUTE_SNAPSHOTS_ENABLED: BoolEnv.default(false), + + // Kubernetes settings + KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), + KUBERNETES_NAMESPACE: z.string().default("default"), + KUBERNETES_WORKER_NODETYPE_LABEL: z.string().default("v4-worker"), + KUBERNETES_IMAGE_PULL_SECRETS: z.string().optional(), // csv + KUBERNETES_EPHEMERAL_STORAGE_SIZE_LIMIT: z.string().default("10Gi"), + KUBERNETES_EPHEMERAL_STORAGE_SIZE_REQUEST: z.string().default("2Gi"), + KUBERNETES_STRIP_IMAGE_DIGEST: BoolEnv.default(false), + KUBERNETES_CPU_REQUEST_MIN_CORES: z.coerce.number().min(0).default(0), + KUBERNETES_CPU_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(0.75), // Ratio of CPU limit, so 0.75 = 75% of CPU limit + KUBERNETES_MEMORY_REQUEST_MIN_GB: z.coerce.number().min(0).default(0), + KUBERNETES_MEMORY_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(1), // Ratio of memory limit, so 1 = 100% of memory limit + + // Per-preset overrides of the global KUBERNETES_CPU_REQUEST_RATIO + KUBERNETES_CPU_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), + 
KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), + + // Per-preset overrides of the global KUBERNETES_MEMORY_REQUEST_RATIO + KUBERNETES_MEMORY_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), + + KUBERNETES_MEMORY_OVERHEAD_GB: z.coerce.number().min(0).optional(), // Optional memory overhead to add to the limit in GB + KUBERNETES_SCHEDULER_NAME: z.string().optional(), // Custom scheduler name for pods + KUBERNETES_LARGE_MACHINE_POOL_LABEL: z.string().optional(), // if set, large-* presets affinity for machinepool= + + // Project affinity settings - pods from the same project prefer the same node + KUBERNETES_PROJECT_AFFINITY_ENABLED: BoolEnv.default(false), + KUBERNETES_PROJECT_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(50), + KUBERNETES_PROJECT_AFFINITY_TOPOLOGY_KEY: z + .string() + .trim() + .min(1) + .default("kubernetes.io/hostname"), + + // Placement tags settings + PLACEMENT_TAGS_ENABLED: BoolEnv.default(false), + PLACEMENT_TAGS_PREFIX: z.string().default("node.cluster.x-k8s.io"), + + // Metrics + METRICS_ENABLED: BoolEnv.default(true), + METRICS_COLLECT_DEFAULTS: BoolEnv.default(true), + METRICS_HOST: 
z.string().default("127.0.0.1"), + METRICS_PORT: z.coerce.number().int().default(9090), + + // Pod cleaner + POD_CLEANER_ENABLED: BoolEnv.default(true), + POD_CLEANER_INTERVAL_MS: z.coerce.number().int().default(10000), + POD_CLEANER_BATCH_SIZE: z.coerce.number().int().default(500), + + // Failed pod handler + FAILED_POD_HANDLER_ENABLED: BoolEnv.default(true), + FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS: z.coerce.number().int().default(1000), + + // Debug + DEBUG: BoolEnv.default(false), + SEND_RUN_DEBUG_LOGS: BoolEnv.default(false), + }) + .superRefine((data, ctx) => { + if (data.COMPUTE_SNAPSHOTS_ENABLED && !data.TRIGGER_METADATA_URL) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: "TRIGGER_METADATA_URL is required when COMPUTE_SNAPSHOTS_ENABLED is true", + path: ["TRIGGER_METADATA_URL"], + }); + } + }); export const env = Env.parse(stdEnv); From 7ed92216627dc09268c582f46e33d4b65456bff4 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Mon, 23 Feb 2026 13:05:22 +0000 Subject: [PATCH 14/40] fix(supervisor): require workload API domain when compute snapshots enabled Remove the silent `localhost` fallback for the snapshot callback URL, which would be unreachable from external compute gateways. Add env validation and a runtime guard matching the existing metadata URL pattern. 
--- apps/supervisor/src/env.ts | 7 +++++++ apps/supervisor/src/workloadServer/index.ts | 12 +++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index 33478bab64b..da7ea5b91f1 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -159,6 +159,13 @@ const Env = z path: ["TRIGGER_METADATA_URL"], }); } + if (data.COMPUTE_SNAPSHOTS_ENABLED && !data.TRIGGER_WORKLOAD_API_DOMAIN) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: "TRIGGER_WORKLOAD_API_DOMAIN is required when COMPUTE_SNAPSHOTS_ENABLED is true", + path: ["TRIGGER_WORKLOAD_API_DOMAIN"], + }); + } }); export const env = Env.parse(stdEnv); diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index 0586c259a39..07801682bc0 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -264,12 +264,18 @@ export class WorkloadServer extends EventEmitter { } if (this.computeManager && env.COMPUTE_SNAPSHOTS_ENABLED) { + if (!env.TRIGGER_WORKLOAD_API_DOMAIN) { + this.logger.error( + "TRIGGER_WORKLOAD_API_DOMAIN is not set, cannot create snapshot callback URL" + ); + reply.json({ error: "Snapshot callbacks not configured" }, false, 500); + return; + } + // Compute mode: fire-and-forget snapshot with callback reply.json({ ok: true } satisfies WorkloadSuspendRunResponseBody, false, 202); - const callbackUrl = `${env.TRIGGER_WORKLOAD_API_PROTOCOL}://${ - env.TRIGGER_WORKLOAD_API_DOMAIN ?? 
"localhost" - }:${env.TRIGGER_WORKLOAD_API_PORT_EXTERNAL}/api/v1/compute/snapshot-complete`; + const callbackUrl = `${env.TRIGGER_WORKLOAD_API_PROTOCOL}://${env.TRIGGER_WORKLOAD_API_DOMAIN}:${env.TRIGGER_WORKLOAD_API_PORT_EXTERNAL}/api/v1/compute/snapshot-complete`; const snapshotResult = await this.computeManager.snapshot({ runnerId, From e9b5fd384663c83772292350411f0fcea947aea5 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Mon, 2 Mar 2026 19:35:15 +0000 Subject: [PATCH 15/40] fix(supervisor): don't destroy compute instance after snapshot --- apps/supervisor/src/workloadServer/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index 07801682bc0..d857b0da677 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -471,11 +471,11 @@ export class WorkloadServer extends EventEmitter { }); if (result.success) { - this.logger.info("Suspend completion submitted, deleting instance", { + this.logger.info("Suspend completion submitted", { runId, instanceId: body.instance_id, + snapshotId: body.snapshot_id, }); - await this.computeManager?.deleteInstance(body.instance_id); } else { this.logger.error("Failed to submit suspend completion", { runId, From 63424fa6c77b3083950cd7c6c850ff73dad7af3e Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 10 Mar 2026 12:36:54 +0000 Subject: [PATCH 16/40] feat(supervisor): add snapshot delay for compute path via timer wheel delay compute snapshot requests to avoid wasted work on short-lived waitpoints (e.g. triggerAndWait resolving in <5s). configurable via COMPUTE_SNAPSHOT_DELAY_MS (default 5s). 
--- apps/supervisor/src/env.ts | 1 + apps/supervisor/src/index.ts | 1 + .../src/services/timerWheel.test.ts | 254 ++++++++++++++++++ apps/supervisor/src/services/timerWheel.ts | 160 +++++++++++ apps/supervisor/src/workloadServer/index.ts | 79 +++++- 5 files changed, 482 insertions(+), 13 deletions(-) create mode 100644 apps/supervisor/src/services/timerWheel.test.ts create mode 100644 apps/supervisor/src/services/timerWheel.ts diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index da7ea5b91f1..73d703da2fe 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -83,6 +83,7 @@ const Env = z COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000), COMPUTE_SNAPSHOTS_ENABLED: BoolEnv.default(false), + COMPUTE_SNAPSHOT_DELAY_MS: z.coerce.number().int().min(0).max(60_000).default(5_000), // Kubernetes settings KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index dd91591aba6..71fd4b83796 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -429,6 +429,7 @@ class ManagedSupervisor { async stop() { this.logger.log("Shutting down"); + await this.workloadServer.stop(); await this.workerSession.stop(); // Optional services diff --git a/apps/supervisor/src/services/timerWheel.test.ts b/apps/supervisor/src/services/timerWheel.test.ts new file mode 100644 index 00000000000..3f6bb9aa19b --- /dev/null +++ b/apps/supervisor/src/services/timerWheel.test.ts @@ -0,0 +1,254 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; +import { TimerWheel } from "./timerWheel.js"; + +describe("TimerWheel", () => { + beforeEach(() => { + vi.useFakeTimers(); + }); + + afterEach(() => { + vi.useRealTimers(); + }); + + it("dispatches item after delay", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => 
dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "snapshot-data"); + + // Not yet + vi.advanceTimersByTime(2900); + expect(dispatched).toEqual([]); + + // After delay + vi.advanceTimersByTime(200); + expect(dispatched).toEqual(["run-1"]); + + wheel.stop(); + }); + + it("cancels item before it fires", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "data"); + + vi.advanceTimersByTime(1000); + expect(wheel.cancel("run-1")).toBe(true); + + vi.advanceTimersByTime(5000); + expect(dispatched).toEqual([]); + expect(wheel.size).toBe(0); + + wheel.stop(); + }); + + it("cancel returns false for unknown key", () => { + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: () => {}, + }); + expect(wheel.cancel("nonexistent")).toBe(false); + }); + + it("deduplicates: resubmitting same key replaces the entry", () => { + const dispatched: { key: string; data: string }[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push({ key: item.key, data: item.data }), + }); + + wheel.start(); + wheel.submit("run-1", "old-data"); + + vi.advanceTimersByTime(1000); + wheel.submit("run-1", "new-data"); + + // Original would have fired at t=3000, but was replaced + // New one fires at t=1000+3000=4000 + vi.advanceTimersByTime(2100); + expect(dispatched).toEqual([]); + + vi.advanceTimersByTime(1000); + expect(dispatched).toEqual([{ key: "run-1", data: "new-data" }]); + + wheel.stop(); + }); + + it("handles many concurrent items", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + + for (let i = 0; i < 1000; i++) { + wheel.submit(`run-${i}`, `data-${i}`); + } + expect(wheel.size).toBe(1000); + + vi.advanceTimersByTime(3100); + expect(dispatched.length).toBe(1000); + 
expect(wheel.size).toBe(0); + + wheel.stop(); + }); + + it("handles items submitted at different times", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + + wheel.submit("run-1", "data"); + vi.advanceTimersByTime(1000); + wheel.submit("run-2", "data"); + vi.advanceTimersByTime(1000); + wheel.submit("run-3", "data"); + + // t=2000: nothing yet + expect(dispatched).toEqual([]); + + // t=3100: run-1 fires + vi.advanceTimersByTime(1100); + expect(dispatched).toEqual(["run-1"]); + + // t=4100: run-2 fires + vi.advanceTimersByTime(1000); + expect(dispatched).toEqual(["run-1", "run-2"]); + + // t=5100: run-3 fires + vi.advanceTimersByTime(1000); + expect(dispatched).toEqual(["run-1", "run-2", "run-3"]); + + wheel.stop(); + }); + + it("setDelay changes delay for new items only", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + + wheel.submit("run-1", "data"); // 3s delay + + vi.advanceTimersByTime(500); + wheel.setDelay(1000); + wheel.submit("run-2", "data"); // 1s delay + + // t=1500: run-2 should have fired (submitted at t=500 with 1s delay) + vi.advanceTimersByTime(1100); + expect(dispatched).toEqual(["run-2"]); + + // t=3100: run-1 fires at its original 3s delay + vi.advanceTimersByTime(1500); + expect(dispatched).toEqual(["run-2", "run-1"]); + + wheel.stop(); + }); + + it("stop returns unprocessed items", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "data-1"); + wheel.submit("run-2", "data-2"); + wheel.submit("run-3", "data-3"); + + const remaining = wheel.stop(); + expect(dispatched).toEqual([]); + expect(wheel.size).toBe(0); + expect(remaining.length).toBe(3); + expect(remaining.map((r) 
=> r.key).sort()).toEqual(["run-1", "run-2", "run-3"]); + expect(remaining.find((r) => r.key === "run-1")?.data).toBe("data-1"); + }); + + it("after stop, new submissions are silently dropped", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.stop(); + + wheel.submit("run-late", "data"); + expect(dispatched).toEqual([]); + expect(wheel.size).toBe(0); + }); + + it("tracks size correctly through submit/cancel/dispatch", () => { + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: () => {}, + }); + + wheel.start(); + + wheel.submit("a", "data"); + wheel.submit("b", "data"); + expect(wheel.size).toBe(2); + + wheel.cancel("a"); + expect(wheel.size).toBe(1); + + vi.advanceTimersByTime(3100); + expect(wheel.size).toBe(0); + + wheel.stop(); + }); + + it("clamps delay to valid range", () => { + const dispatched: string[] = []; + + // Very small delay (should be at least 1 tick = 100ms) + const wheel = new TimerWheel({ + delayMs: 0, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "data"); + + vi.advanceTimersByTime(200); + expect(dispatched).toEqual(["run-1"]); + + wheel.stop(); + }); + + it("multiple cancel calls are safe", () => { + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: () => {}, + }); + + wheel.start(); + wheel.submit("run-1", "data"); + + expect(wheel.cancel("run-1")).toBe(true); + expect(wheel.cancel("run-1")).toBe(false); + + wheel.stop(); + }); +}); diff --git a/apps/supervisor/src/services/timerWheel.ts b/apps/supervisor/src/services/timerWheel.ts new file mode 100644 index 00000000000..4a95e216b25 --- /dev/null +++ b/apps/supervisor/src/services/timerWheel.ts @@ -0,0 +1,160 @@ +/** + * TimerWheel implements a hashed timer wheel for efficiently managing large numbers + * of delayed operations with O(1) submit, cancel, and per-item dispatch. 
+ * + * Used by the supervisor to delay snapshot requests so that short-lived waitpoints + * (e.g. triggerAndWait that resolves before the configured delay expires) skip the snapshot entirely. + * + * The wheel is a ring buffer of slots. A single setInterval advances a cursor. + * When the cursor reaches a slot, all items in that slot are dispatched. + * + * Fixed capacity: 600 slots at 100ms tick = 60s max delay. + */ + +const TICK_MS = 100; +const NUM_SLOTS = 600; // 60s max delay at 100ms tick + +export type TimerWheelItem = { + key: string; + data: T; +}; + +export type TimerWheelOptions = { + /** Called when an item's delay expires. */ + onExpire: (item: TimerWheelItem) => void; + /** Delay in milliseconds before items fire. Clamped to [100, 60000]. */ + delayMs: number; +}; + +type Entry = { + key: string; + data: T; + slotIndex: number; +}; + +export class TimerWheel { + private slots: Set[]; + private entries: Map>; + private cursor: number; + private intervalId: ReturnType | null; + private onExpire: (item: TimerWheelItem) => void; + private delaySlots: number; + + constructor(opts: TimerWheelOptions) { + this.slots = Array.from({ length: NUM_SLOTS }, () => new Set()); + this.entries = new Map(); + this.cursor = 0; + this.intervalId = null; + this.onExpire = opts.onExpire; + this.delaySlots = Math.max(1, Math.min(NUM_SLOTS, Math.round(opts.delayMs / TICK_MS))); + } + + /** Start the timer wheel. Must be called before submitting items. */ + start(): void { + if (this.intervalId) return; + this.intervalId = setInterval(() => this.tick(), TICK_MS); + // Don't hold the process open just for the timer wheel + if (this.intervalId && typeof this.intervalId === "object" && "unref" in this.intervalId) { + this.intervalId.unref(); + } + } + + /** + * Stop the timer wheel and return all unprocessed items. + * The wheel keeps running normally during graceful shutdown - call stop() + * only when you're ready to tear down. Caller decides what to do with leftovers. 
+ */ + stop(): TimerWheelItem[] { + if (this.intervalId) { + clearInterval(this.intervalId); + this.intervalId = null; + } + + const remaining: TimerWheelItem[] = []; + for (const [key, entry] of this.entries) { + remaining.push({ key, data: entry.data }); + } + + for (const slot of this.slots) { + slot.clear(); + } + this.entries.clear(); + + return remaining; + } + + /** + * Update the delay for future submissions. Already-queued items keep their original timing. + * Clamped to [TICK_MS, 60000ms]. + */ + setDelay(delayMs: number): void { + this.delaySlots = Math.max(1, Math.min(NUM_SLOTS, Math.round(delayMs / TICK_MS))); + } + + /** + * Submit an item to be dispatched after the configured delay. + * If an item with the same key already exists, it is replaced (dedup). + * No-op if the wheel is stopped. + */ + submit(key: string, data: T): void { + if (!this.intervalId) return; + + // Dedup: remove existing entry for this key + this.cancel(key); + + const slotIndex = (this.cursor + this.delaySlots) % NUM_SLOTS; + const entry: Entry = { key, data, slotIndex }; + + this.entries.set(key, entry); + this.slot(slotIndex).add(key); + } + + /** + * Cancel a pending item. Returns true if the item was found and removed. + */ + cancel(key: string): boolean { + const entry = this.entries.get(key); + if (!entry) return false; + + this.slot(entry.slotIndex).delete(key); + this.entries.delete(key); + return true; + } + + /** Number of pending items in the wheel. */ + get size(): number { + return this.entries.size; + } + + /** Whether the wheel is running. */ + get running(): boolean { + return this.intervalId !== null; + } + + /** Get a slot by index. The array is fully initialized so this always returns a Set. */ + private slot(index: number): Set { + const s = this.slots[index]; + if (!s) throw new Error(`TimerWheel: invalid slot index ${index}`); + return s; + } + + /** Advance the cursor and dispatch all items in the current slot. 
*/ + private tick(): void { + this.cursor = (this.cursor + 1) % NUM_SLOTS; + const slot = this.slot(this.cursor); + + if (slot.size === 0) return; + + // Collect items to dispatch (copy keys since we mutate during iteration) + const keys = [...slot]; + slot.clear(); + + for (const key of keys) { + const entry = this.entries.get(key); + if (!entry) continue; + + this.entries.delete(key); + this.onExpire({ key, data: entry.data }); + } + } +} diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index d857b0da677..fcb7297d340 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -25,6 +25,7 @@ import { type IncomingMessage } from "node:http"; import { register } from "../metrics.js"; import { env } from "../env.js"; import type { ComputeWorkloadManager } from "../workloadManager/compute.js"; +import { TimerWheel } from "../services/timerWheel.js"; // Use the official export when upgrading to socket.io@4.8.0 interface DefaultEventsMap { @@ -62,6 +63,12 @@ const ComputeSnapshotCallbackBody = z.object({ metadata: z.record(z.string()).optional(), }); +type DelayedSnapshot = { + runnerId: string; + runFriendlyId: string; + snapshotFriendlyId: string; +}; + type WorkloadServerOptions = { port: number; host?: string; @@ -95,6 +102,7 @@ export class WorkloadServer extends EventEmitter { >(); private readonly workerClient: SupervisorHttpClient; + private readonly snapshotDelayWheel?: TimerWheel; constructor(opts: WorkloadServerOptions) { super(); @@ -106,6 +114,14 @@ export class WorkloadServer extends EventEmitter { this.checkpointClient = opts.checkpointClient; this.computeManager = opts.computeManager; + if (this.computeManager && env.COMPUTE_SNAPSHOTS_ENABLED) { + this.snapshotDelayWheel = new TimerWheel({ + delayMs: env.COMPUTE_SNAPSHOT_DELAY_MS, + onExpire: (item) => this.dispatchComputeSnapshot(item.data), + }); + this.snapshotDelayWheel.start(); + } + 
this.httpServer = this.createHttpServer({ host, port }); this.websocketServer = this.createWebsocketServer(); } @@ -263,7 +279,7 @@ export class WorkloadServer extends EventEmitter { return; } - if (this.computeManager && env.COMPUTE_SNAPSHOTS_ENABLED) { + if (this.snapshotDelayWheel && this.computeManager && env.COMPUTE_SNAPSHOTS_ENABLED) { if (!env.TRIGGER_WORKLOAD_API_DOMAIN) { this.logger.error( "TRIGGER_WORKLOAD_API_DOMAIN is not set, cannot create snapshot callback URL" @@ -272,23 +288,20 @@ export class WorkloadServer extends EventEmitter { return; } - // Compute mode: fire-and-forget snapshot with callback + // Compute mode: delay snapshot to avoid wasted work on short-lived waitpoints. + // If the run continues before the delay expires, the snapshot is cancelled. reply.json({ ok: true } satisfies WorkloadSuspendRunResponseBody, false, 202); - const callbackUrl = `${env.TRIGGER_WORKLOAD_API_PROTOCOL}://${env.TRIGGER_WORKLOAD_API_DOMAIN}:${env.TRIGGER_WORKLOAD_API_PORT_EXTERNAL}/api/v1/compute/snapshot-complete`; - - const snapshotResult = await this.computeManager.snapshot({ + this.snapshotDelayWheel.submit(params.runFriendlyId, { runnerId, - callbackUrl, - metadata: { - runId: params.runFriendlyId, - snapshotFriendlyId: params.snapshotFriendlyId, - }, + runFriendlyId: params.runFriendlyId, + snapshotFriendlyId: params.snapshotFriendlyId, }); - if (!snapshotResult) { - this.logger.error("Failed to request compute snapshot", { params, runnerId }); - } + this.logger.debug("Snapshot delayed", { + runId: params.runFriendlyId, + delayMs: env.COMPUTE_SNAPSHOT_DELAY_MS, + }); return; } @@ -340,6 +353,11 @@ export class WorkloadServer extends EventEmitter { handler: async ({ req, reply, params }) => { this.logger.debug("Run continuation request", { params }); + // Cancel any pending delayed snapshot for this run + if (this.snapshotDelayWheel?.cancel(params.runFriendlyId)) { + this.logger.debug("Cancelled delayed snapshot", { runId: params.runFriendlyId }); + } + 
const continuationResult = await this.workerClient.continueRunExecution( params.runFriendlyId, params.snapshotFriendlyId, @@ -700,11 +718,46 @@ export class WorkloadServer extends EventEmitter { } } + /** + * Dispatch a compute snapshot request to the gateway. Called by the timer wheel + * when the delay expires, or immediately during drain. + */ + private async dispatchComputeSnapshot(snapshot: DelayedSnapshot): Promise { + if (!this.computeManager) return; + + const callbackUrl = `${env.TRIGGER_WORKLOAD_API_PROTOCOL}://${env.TRIGGER_WORKLOAD_API_DOMAIN}:${env.TRIGGER_WORKLOAD_API_PORT_EXTERNAL}/api/v1/compute/snapshot-complete`; + + const result = await this.computeManager.snapshot({ + runnerId: snapshot.runnerId, + callbackUrl, + metadata: { + runId: snapshot.runFriendlyId, + snapshotFriendlyId: snapshot.snapshotFriendlyId, + }, + }); + + if (!result) { + this.logger.error("Failed to request compute snapshot", { + runId: snapshot.runFriendlyId, + runnerId: snapshot.runnerId, + }); + } + } + async start() { await this.httpServer.start(); } async stop() { + const remaining = this.snapshotDelayWheel?.stop() ?? []; + if (remaining.length > 0) { + this.logger.info("Snapshot delay wheel stopped, dropped pending snapshots", { + count: remaining.length, + }); + this.logger.debug("Dropped snapshot details", { + runs: remaining.map((item) => item.key), + }); + } await this.httpServer.stop(); } } From 8b4c6bf284fde190c5ec95a8afb162688972405e Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 24 Mar 2026 13:41:13 +0000 Subject: [PATCH 17/40] feat: emit compute OTel spans (provision, restore, snapshot) in run traces Supervisor emits OTel spans for compute lifecycle events so they appear in the run's trace view with per-stage timing breakdowns. 
Spans: - compute.provision: emitted after gateway create returns, includes gateway/agent/fcrun timing and cache indicators from _timing response - compute.restore: emitted after gateway restore returns (supervisor-side timing only, gateway restore timing not yet surfaced) - compute.snapshot: emitted from snapshot callback handler using duration_ms from the agent, trace context from in-memory map (best-effort, does not survive restarts - TRI-7992) Implementation: - Hand-rolled OTLP JSON client (otlpTrace.ts) - builds ExportTraceServiceRequest payload and fire-and-forget POSTs to TRIGGER_API_URL/otel - Trace context (traceparent) from DequeuedMessage links spans to run trace - Resource attributes (ctx.environment.id, ctx.run.id, etc.) link to the correct run in the trace view - COMPUTE_TRACE_SPANS_ENABLED env var (default true) to disable in prod - Span start time offset by -1ms to ensure stable sort order before attempt Also adds .claude/rules/span-timeline-events.md documenting how the trace view timeline events system works (trigger.dev/ prefix, admin visibility, ClickHouse SPAN_EVENT storage, start_time filter constraint). 
--- .claude/rules/span-timeline-events.md | 72 +++++++++ apps/supervisor/src/env.ts | 1 + apps/supervisor/src/index.ts | 24 +++ apps/supervisor/src/otlpTrace.test.ts | 114 ++++++++++++++ apps/supervisor/src/otlpTrace.ts | 83 +++++++++++ .../supervisor/src/workloadManager/compute.ts | 139 ++++++++++++++++++ apps/supervisor/src/workloadManager/types.ts | 2 + apps/supervisor/src/workloadServer/index.ts | 62 ++++++++ 8 files changed, 497 insertions(+) create mode 100644 .claude/rules/span-timeline-events.md create mode 100644 apps/supervisor/src/otlpTrace.test.ts create mode 100644 apps/supervisor/src/otlpTrace.ts diff --git a/.claude/rules/span-timeline-events.md b/.claude/rules/span-timeline-events.md new file mode 100644 index 00000000000..f4e36717780 --- /dev/null +++ b/.claude/rules/span-timeline-events.md @@ -0,0 +1,72 @@ +# Span Timeline Events + +The trace view's right panel shows a timeline of events for the selected span. These are OTel span events rendered by `app/utils/timelineSpanEvents.ts` and the `SpanTimeline` component. + +## How They Work + +1. **Span events** in OTel are attached to a parent span. In ClickHouse, they're stored as separate rows with `kind: "SPAN_EVENT"` sharing the parent span's `span_id`. The `#mergeRecordsIntoSpanDetail` method reassembles them into the span's `events` array at query time. +2. The timeline only renders events whose `name` starts with `trigger.dev/` - all others are silently filtered out. +3. The **display name** comes from `properties.event` (not the span event name), mapped through `getFriendlyNameForEvent()`. +4. Events are shown on the **span they belong to** - events on one span don't appear in another span's timeline. + +## ClickHouse Storage Constraint + +When events are written to ClickHouse, `spanEventsToTaskEventV1Input()` filters out events whose `start_time` is not greater than the parent span's `startTime`. Events at or before the span start are silently dropped. 
This means span events must have timestamps strictly after the span's own `startTimeUnixNano`. + +## Timeline Rendering (SpanTimeline component) + +The `SpanTimeline` component in `app/components/run/RunTimeline.tsx` renders: + +1. **Events** (thin 1px line with hollow dots) - all events from `createTimelineSpanEventsFromSpanEvents()` +2. **"Started"** marker (thick cap) - at the span's `startTime` +3. **Duration bar** (thick 7px line) - from "Started" to "Finished" +4. **"Finished"** marker (thick cap) - at `startTime + duration` + +The thin line before "Started" only appears when there are events with timestamps between the span start and the first child span. For the Attempt span this works well (Dequeued → Pod scheduled → Launched → etc. all happen before execution starts). Events all get `lineVariant: "light"` (thin) while the execution bar gets `variant: "normal"` (thick). + +## Trace View Sort Order + +Sibling spans (same parent) are sorted by `start_time ASC` from the ClickHouse query. The `createTreeFromFlatItems` function preserves this order. Event timestamps don't affect sort order - only the span's own `start_time`. + +## Event Structure + +```typescript +// OTel span event format +{ + name: "trigger.dev/run", // Must start with "trigger.dev/" to render + timeUnixNano: "1711200000000000000", + attributes: [ + { key: "event", value: { stringValue: "dequeue" } }, // The actual event type + { key: "duration", value: { intValue: 150 } }, // Optional: duration in ms + ] +} +``` + +## Admin-Only Events + +`getAdminOnlyForEvent()` controls visibility. Events default to **admin-only** (`true`). 
+ +| Event | Admin-only | Friendly name | +|-------|-----------|---------------| +| `dequeue` | No | Dequeued | +| `fork` | No | Launched | +| `import` | No (if no fork event) | Importing task file | +| `create_attempt` | Yes | Attempt created | +| `lazy_payload` | Yes | Lazy attempt initialized | +| `pod_scheduled` | Yes | Pod scheduled | +| (default) | Yes | (raw event name) | + +## Adding New Timeline Events + +1. Add an OTLP span event whose `name` starts with `trigger.dev/` (e.g. `"trigger.dev/run"`) and whose `properties.event` is the event type (e.g. `"dequeue"`) +2. Event timestamp must be strictly after the parent span's `startTimeUnixNano` (ClickHouse drops earlier events) +3. Add friendly name in `getFriendlyNameForEvent()` in `app/utils/timelineSpanEvents.ts` +4. Set admin visibility in `getAdminOnlyForEvent()` +5. Optionally add help text in `getHelpTextForEvent()` + +## Key Files + +- `app/utils/timelineSpanEvents.ts` - filtering, naming, admin logic +- `app/components/run/RunTimeline.tsx` - `SpanTimeline` component (thin line + thick bar rendering) +- `app/presenters/v3/SpanPresenter.server.ts` - loads span data including events +- `app/v3/eventRepository/clickhouseEventRepository.server.ts` - `spanEventsToTaskEventV1Input()` (storage filter), `#mergeRecordsIntoSpanDetail` (reassembly) diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index 73d703da2fe..74ae5d1b11f 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -83,6 +83,7 @@ const Env = z COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000), COMPUTE_SNAPSHOTS_ENABLED: BoolEnv.default(false), + COMPUTE_TRACE_SPANS_ENABLED: BoolEnv.default(true), COMPUTE_SNAPSHOT_DELAY_MS: z.coerce.number().int().min(0).max(60_000).default(5_000), // Kubernetes settings diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index 71fd4b83796..f1784cc5642 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -236,6 +236,11 @@ class 
ManagedSupervisor { runFriendlyId: message.run.friendlyId, snapshotFriendlyId: message.snapshot.friendlyId, machine: message.run.machine, + traceContext: message.run.traceContext, + envId: message.environment.id, + orgId: message.organization.id, + projectId: message.project.id, + dequeuedAt: message.dequeuedAt, }); if (didRestore) { @@ -288,6 +293,24 @@ class ManagedSupervisor { return; } + if (env.COMPUTE_TRACE_SPANS_ENABLED) { + const traceparent = + message.run.traceContext && + "traceparent" in message.run.traceContext && + typeof message.run.traceContext.traceparent === "string" + ? message.run.traceContext.traceparent + : undefined; + + if (traceparent) { + this.workloadServer.registerRunTraceContext(message.run.friendlyId, { + traceparent, + envId: message.environment.id, + orgId: message.organization.id, + projectId: message.project.id, + }); + } + } + try { if (!message.deployment.friendlyId) { // mostly a type guard, deployments always exists for deployed environments @@ -315,6 +338,7 @@ class ManagedSupervisor { snapshotId: message.snapshot.id, snapshotFriendlyId: message.snapshot.friendlyId, placementTags: message.placementTags, + traceContext: message.run.traceContext, }); // Disabled for now diff --git a/apps/supervisor/src/otlpTrace.test.ts b/apps/supervisor/src/otlpTrace.test.ts new file mode 100644 index 00000000000..765ed028216 --- /dev/null +++ b/apps/supervisor/src/otlpTrace.test.ts @@ -0,0 +1,114 @@ +import { describe, it, expect } from "vitest"; +import { buildOtlpTracePayload } from "./otlpTrace.js"; + +describe("buildOtlpTracePayload", () => { + it("builds valid OTLP JSON with timing attributes", () => { + const payload = buildOtlpTracePayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + parentSpanId: "1234567890abcdef", + spanName: "compute.provision", + startTimeMs: 1000, + endTimeMs: 1250, + resourceAttributes: { + "ctx.environment.id": "env_123", + "ctx.organization.id": "org_456", + "ctx.project.id": "proj_789", + "ctx.run.id": 
"run_abc", + }, + spanAttributes: { + "compute.total_ms": 250, + "compute.gateway.schedule_ms": 1, + "compute.cache.image_cached": true, + }, + }); + + expect(payload.resourceSpans).toHaveLength(1); + + const resourceSpan = payload.resourceSpans[0]!; + + // $trigger=true so the webapp accepts it + const triggerAttr = resourceSpan.resource.attributes.find((a) => a.key === "$trigger"); + expect(triggerAttr).toEqual({ key: "$trigger", value: { boolValue: true } }); + + // Resource attributes + const envAttr = resourceSpan.resource.attributes.find( + (a) => a.key === "ctx.environment.id" + ); + expect(envAttr).toEqual({ + key: "ctx.environment.id", + value: { stringValue: "env_123" }, + }); + + // Span basics + const span = resourceSpan.scopeSpans[0]!.spans[0]!; + expect(span.name).toBe("compute.provision"); + expect(span.traceId).toBe("abcd1234abcd1234abcd1234abcd1234"); + expect(span.parentSpanId).toBe("1234567890abcdef"); + + // Integer attribute + const totalMs = span.attributes.find((a) => a.key === "compute.total_ms"); + expect(totalMs).toEqual({ key: "compute.total_ms", value: { intValue: 250 } }); + + // Boolean attribute + const cached = span.attributes.find((a) => a.key === "compute.cache.image_cached"); + expect(cached).toEqual({ key: "compute.cache.image_cached", value: { boolValue: true } }); + }); + + it("generates a valid 16-char hex span ID", () => { + const payload = buildOtlpTracePayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 1000, + endTimeMs: 1001, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.spanId).toMatch(/^[0-9a-f]{16}$/); + }); + + it("converts timestamps to nanoseconds", () => { + const payload = buildOtlpTracePayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 1000, + endTimeMs: 1250, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = 
payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.startTimeUnixNano).toBe("1000000000"); + expect(span.endTimeUnixNano).toBe("1250000000"); + }); + + it("omits parentSpanId when not provided", () => { + const payload = buildOtlpTracePayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 1000, + endTimeMs: 1001, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.parentSpanId).toBeUndefined(); + }); + + it("handles double values for non-integer numbers", () => { + const payload = buildOtlpTracePayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 1000, + endTimeMs: 1001, + resourceAttributes: {}, + spanAttributes: { "compute.cpu": 0.25 }, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + const cpu = span.attributes.find((a) => a.key === "compute.cpu"); + expect(cpu).toEqual({ key: "compute.cpu", value: { doubleValue: 0.25 } }); + }); +}); diff --git a/apps/supervisor/src/otlpTrace.ts b/apps/supervisor/src/otlpTrace.ts new file mode 100644 index 00000000000..7a87c056f0d --- /dev/null +++ b/apps/supervisor/src/otlpTrace.ts @@ -0,0 +1,83 @@ +import { randomBytes } from "crypto"; +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; + +const logger = new SimpleStructuredLogger("otlp-trace"); + +export interface OtlpTraceOptions { + traceId: string; + parentSpanId?: string; + spanName: string; + startTimeMs: number; + endTimeMs: number; + resourceAttributes: Record; + spanAttributes: Record; +} + +/** Build an OTLP JSON ExportTraceServiceRequest payload */ +export function buildOtlpTracePayload(opts: OtlpTraceOptions) { + const spanId = randomBytes(8).toString("hex"); + + return { + resourceSpans: [ + { + resource: { + attributes: [ + { key: "$trigger", value: { boolValue: true } }, + ...toOtlpAttributes(opts.resourceAttributes), + ], + }, 
+ scopeSpans: [ + { + scope: { name: "supervisor.compute" }, + spans: [ + { + traceId: opts.traceId, + spanId, + parentSpanId: opts.parentSpanId, + name: opts.spanName, + kind: 3, // SPAN_KIND_CLIENT + startTimeUnixNano: String(opts.startTimeMs * 1_000_000), + endTimeUnixNano: String(opts.endTimeMs * 1_000_000), + attributes: toOtlpAttributes(opts.spanAttributes), + status: { code: 1 }, // STATUS_CODE_OK + }, + ], + }, + ], + }, + ], + }; +} + +/** Fire-and-forget: send an OTLP trace payload to the collector */ +export function sendOtlpTrace( + endpoint: string, + payload: ReturnType +) { + fetch(`${endpoint}/v1/traces`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(payload), + signal: AbortSignal.timeout(5_000), + }).catch((err) => { + logger.warn("failed to send compute provision span", { + error: err instanceof Error ? err.message : String(err), + }); + }); +} + +function toOtlpAttributes( + attrs: Record +): Array<{ key: string; value: Record }> { + return Object.entries(attrs).map(([key, value]) => ({ + key, + value: toOtlpValue(value), + })); +} + +function toOtlpValue(value: string | number | boolean): Record { + if (typeof value === "string") return { stringValue: value }; + if (typeof value === "boolean") return { boolValue: value }; + if (Number.isInteger(value)) return { intValue: value }; + return { doubleValue: value }; +} diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index ea7be55d1a9..f0a126f909a 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -1,4 +1,6 @@ import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { parseTraceparent } from "@trigger.dev/core/v3/isomorphic"; +import { flattenAttributes } from "@trigger.dev/core/v3/utils/flattenAttributes"; import { type WorkloadManager, type WorkloadManagerCreateOptions, @@ -6,6 +8,7 @@ 
import { } from "./types.js"; import { env } from "../env.js"; import { getRunnerId } from "../util.js"; +import { buildOtlpTracePayload, sendOtlpTrace } from "../otlpTrace.js"; import { tryCatch } from "@trigger.dev/core"; type ComputeWorkloadManagerOptions = WorkloadManagerOptions & { @@ -158,6 +161,13 @@ export class ComputeWorkloadManager implements WorkloadManager { event.instanceId = data.id; event.ok = true; + + // Parse timing data from compute response (optional - requires gateway timing flag) + if (data._timing) { + event.timing = data._timing; + } + + this.#emitProvisionSpan(opts, startMs, data._timing); } finally { event.durationMs = Math.round(performance.now() - startMs); event.ok ??= false; @@ -247,12 +257,76 @@ export class ComputeWorkloadManager implements WorkloadManager { return true; } + #emitProvisionSpan( + opts: WorkloadManagerCreateOptions, + startMs: number, + timing?: unknown + ) { + if (!env.COMPUTE_TRACE_SPANS_ENABLED) return; + const traceparent = + opts.traceContext && + "traceparent" in opts.traceContext && + typeof opts.traceContext.traceparent === "string" + ? opts.traceContext.traceparent + : undefined; + + const parsed = parseTraceparent(traceparent); + if (!parsed) return; + + const endMs = performance.now(); + const now = Date.now(); + const provisionStartEpochMs = now - (endMs - startMs); + const endEpochMs = now; + + // Span starts at dequeue time so events (dequeue) render in the thin-line section + // before "Started". The actual provision call time is in provisionStartEpochMs. + // Subtract 1ms so compute span always sorts before the attempt span (same dequeue time) + const startEpochMs = opts.dequeuedAt.getTime() - 1; + + const spanAttributes: Record = { + "compute.type": "create", + "compute.provision_start_ms": provisionStartEpochMs, + ...(timing ? 
(flattenAttributes(timing, "compute") as Record) : {}), + }; + + if (opts.dequeueResponseMs !== undefined) { + spanAttributes["supervisor.dequeue_response_ms"] = opts.dequeueResponseMs; + } + if (opts.warmStartCheckMs !== undefined) { + spanAttributes["supervisor.warm_start_check_ms"] = opts.warmStartCheckMs; + } + + const payload = buildOtlpTracePayload({ + traceId: parsed.traceId, + parentSpanId: parsed.spanId, + spanName: "compute.provision", + startTimeMs: startEpochMs, + endTimeMs: endEpochMs, + resourceAttributes: { + "ctx.environment.id": opts.envId, + "ctx.organization.id": opts.orgId, + "ctx.project.id": opts.projectId, + "ctx.run.id": opts.runFriendlyId, + }, + spanAttributes, + }); + + // Use the platform API URL, not the runner OTLP endpoint (which may be a VM gateway IP) + sendOtlpTrace(`${env.TRIGGER_API_URL}/otel`, payload); + } + async restore(opts: { snapshotId: string; runnerId: string; runFriendlyId: string; snapshotFriendlyId: string; machine: { cpu: number; memory: number }; + // Trace context for OTel span emission + traceContext?: Record; + envId?: string; + orgId?: string; + projectId?: string; + dequeuedAt?: Date; }): Promise { const url = `${this.opts.gatewayUrl}/api/snapshots/${opts.snapshotId}/restore`; @@ -275,6 +349,8 @@ export class ComputeWorkloadManager implements WorkloadManager { this.logger.debug("restore request body", { url, body }); + const startMs = performance.now(); + const [error, response] = await tryCatch( fetch(url, { method: "POST", @@ -284,11 +360,14 @@ export class ComputeWorkloadManager implements WorkloadManager { }) ); + const durationMs = Math.round(performance.now() - startMs); + if (error) { this.logger.error("restore request failed", { snapshotId: opts.snapshotId, runnerId: opts.runnerId, error: error instanceof Error ? 
error.message : String(error), + durationMs, }); return false; } @@ -298,6 +377,7 @@ export class ComputeWorkloadManager implements WorkloadManager { snapshotId: opts.snapshotId, runnerId: opts.runnerId, status: response.status, + durationMs, }); return false; } @@ -305,7 +385,66 @@ export class ComputeWorkloadManager implements WorkloadManager { this.logger.info("restore request success", { snapshotId: opts.snapshotId, runnerId: opts.runnerId, + durationMs, }); + + this.#emitRestoreSpan(opts, startMs); + return true; } + + #emitRestoreSpan( + opts: { + snapshotId: string; + runnerId: string; + runFriendlyId: string; + traceContext?: Record; + envId?: string; + orgId?: string; + projectId?: string; + dequeuedAt?: Date; + }, + startMs: number + ) { + if (!env.COMPUTE_TRACE_SPANS_ENABLED) return; + + const traceparent = + opts.traceContext && + "traceparent" in opts.traceContext && + typeof opts.traceContext.traceparent === "string" + ? opts.traceContext.traceparent + : undefined; + + const parsed = parseTraceparent(traceparent); + if (!parsed || !opts.envId || !opts.orgId || !opts.projectId) return; + + const endMs = performance.now(); + const now = Date.now(); + const restoreStartEpochMs = now - (endMs - startMs); + const endEpochMs = now; + + // Subtract 1ms so restore span always sorts before the attempt span + const startEpochMs = (opts.dequeuedAt?.getTime() ?? 
restoreStartEpochMs) - 1; + + const payload = buildOtlpTracePayload({ + traceId: parsed.traceId, + parentSpanId: parsed.spanId, + spanName: "compute.restore", + startTimeMs: startEpochMs, + endTimeMs: endEpochMs, + resourceAttributes: { + "ctx.environment.id": opts.envId, + "ctx.organization.id": opts.orgId, + "ctx.project.id": opts.projectId, + "ctx.run.id": opts.runFriendlyId, + }, + spanAttributes: { + "compute.type": "restore", + "compute.snapshot_id": opts.snapshotId, + }, + }); + + sendOtlpTrace(`${env.TRIGGER_API_URL}/otel`, payload); + } } + diff --git a/apps/supervisor/src/workloadManager/types.ts b/apps/supervisor/src/workloadManager/types.ts index 82c7ea7b4c0..89ddc88aa31 100644 --- a/apps/supervisor/src/workloadManager/types.ts +++ b/apps/supervisor/src/workloadManager/types.ts @@ -39,4 +39,6 @@ export interface WorkloadManagerCreateOptions { runFriendlyId: string; snapshotId: string; snapshotFriendlyId: string; + // Trace context for OTel span emission (W3C format: { traceparent: "00-...", tracestate?: "..." 
}) + traceContext?: Record; } diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index fcb7297d340..02c320b8f1d 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -26,6 +26,8 @@ import { register } from "../metrics.js"; import { env } from "../env.js"; import type { ComputeWorkloadManager } from "../workloadManager/compute.js"; import { TimerWheel } from "../services/timerWheel.js"; +import { parseTraceparent } from "@trigger.dev/core/v3/isomorphic"; +import { buildOtlpTracePayload, sendOtlpTrace } from "../otlpTrace.js"; // Use the official export when upgrading to socket.io@4.8.0 interface DefaultEventsMap { @@ -61,6 +63,7 @@ const ComputeSnapshotCallbackBody = z.object({ status: z.enum(["completed", "failed"]), error: z.string().optional(), metadata: z.record(z.string()).optional(), + duration_ms: z.number().optional(), }); type DelayedSnapshot = { @@ -69,6 +72,13 @@ type DelayedSnapshot = { snapshotFriendlyId: string; }; +type RunTraceContext = { + traceparent: string; + envId: string; + orgId: string; + projectId: string; +}; + type WorkloadServerOptions = { port: number; host?: string; @@ -102,6 +112,7 @@ export class WorkloadServer extends EventEmitter { >(); private readonly workerClient: SupervisorHttpClient; + private readonly runTraceContexts = new Map(); private readonly snapshotDelayWheel?: TimerWheel; constructor(opts: WorkloadServerOptions) { @@ -464,6 +475,7 @@ export class WorkloadServer extends EventEmitter { status: body.status, error: body.error, metadata: body.metadata, + durationMs: body.duration_ms, }); const runId = body.metadata?.runId; @@ -475,6 +487,9 @@ export class WorkloadServer extends EventEmitter { return; } + // Emit snapshot span (best-effort - requires trace context from dequeue on this instance) + this.#emitSnapshotSpan(runId, body.duration_ms, body.snapshot_id); + if (body.status === "completed") { const result = await 
this.workerClient.submitSuspendCompletion({ runId, @@ -677,6 +692,7 @@ export class WorkloadServer extends EventEmitter { try { runDisconnected(message.run.friendlyId); + this.runTraceContexts.delete(message.run.friendlyId); } catch (error) { log.error("run:stop error", { error }); } @@ -744,6 +760,52 @@ export class WorkloadServer extends EventEmitter { } } + #emitSnapshotSpan(runFriendlyId: string, durationMs?: number, snapshotId?: string) { + if (!env.COMPUTE_TRACE_SPANS_ENABLED) return; + + const ctx = this.runTraceContexts.get(runFriendlyId); + if (!ctx) return; + + const parsed = parseTraceparent(ctx.traceparent); + if (!parsed) return; + + const endEpochMs = Date.now(); + const startEpochMs = durationMs ? endEpochMs - durationMs : endEpochMs; + + const spanAttributes: Record = { + "compute.type": "snapshot", + }; + + if (durationMs !== undefined) { + spanAttributes["compute.total_ms"] = durationMs; + } + + if (snapshotId) { + spanAttributes["compute.snapshot_id"] = snapshotId; + } + + const payload = buildOtlpTracePayload({ + traceId: parsed.traceId, + parentSpanId: parsed.spanId, + spanName: "compute.snapshot", + startTimeMs: startEpochMs, + endTimeMs: endEpochMs, + resourceAttributes: { + "ctx.environment.id": ctx.envId, + "ctx.organization.id": ctx.orgId, + "ctx.project.id": ctx.projectId, + "ctx.run.id": runFriendlyId, + }, + spanAttributes, + }); + + sendOtlpTrace(`${env.TRIGGER_API_URL}/otel`, payload); + } + + registerRunTraceContext(runFriendlyId: string, ctx: RunTraceContext) { + this.runTraceContexts.set(runFriendlyId, ctx); + } + async start() { await this.httpServer.start(); } From f70be68220c8b3bf79eb12358148af20aba6cb5f Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:26:55 +0000 Subject: [PATCH 18/40] refactor(supervisor): demote per-run logs to debug/verbose for quieter prod output --- apps/supervisor/src/index.ts | 14 +++++++------- apps/supervisor/src/services/failedPodHandler.ts | 
12 ++++++------ apps/supervisor/src/services/podCleaner.ts | 2 +- apps/supervisor/src/workloadManager/docker.ts | 2 +- apps/supervisor/src/workloadManager/kubernetes.ts | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index f1784cc5642..09c9bd87700 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -198,13 +198,13 @@ class ManagedSupervisor { } this.workerSession.on("runNotification", async ({ time, run }) => { - this.logger.log("runNotification", { time, run }); + this.logger.verbose("runNotification", { time, run }); this.workloadServer.notifyRun({ run }); }); this.workerSession.on("runQueueMessage", async ({ time, message, dequeueResponseMs, pollingIntervalMs }) => { - this.logger.log(`Received message with timestamp ${time.toLocaleString()}`, message); + this.logger.verbose(`Received message with timestamp ${time.toLocaleString()}`, message); if (message.completedWaitpoints.length > 0) { this.logger.debug("Run has completed waitpoints", { @@ -221,7 +221,7 @@ class ManagedSupervisor { const { checkpoint, ...rest } = message; if (checkpoint) { - this.logger.log("Restoring run", { runId: message.run.id }); + this.logger.debug("Restoring run", { runId: message.run.id }); if (this.isComputeMode && this.computeManager && env.COMPUTE_SNAPSHOTS_ENABLED) { try { @@ -244,7 +244,7 @@ class ManagedSupervisor { }); if (didRestore) { - this.logger.log("Compute restore successful", { runId: message.run.id, runnerId }); + this.logger.debug("Compute restore successful", { runId: message.run.id, runnerId }); } else { this.logger.error("Compute restore failed", { runId: message.run.id, runnerId }); } @@ -271,7 +271,7 @@ class ManagedSupervisor { }); if (didRestore) { - this.logger.log("Restore successful", { runId: message.run.id }); + this.logger.debug("Restore successful", { runId: message.run.id }); } else { this.logger.error("Restore failed", { runId: message.run.id 
}); } @@ -282,14 +282,14 @@ class ManagedSupervisor { return; } - this.logger.log("Scheduling run", { runId: message.run.id }); + this.logger.debug("Scheduling run", { runId: message.run.id }); const warmStartStart = performance.now(); const didWarmStart = await this.tryWarmStart(message); const warmStartCheckMs = Math.round(performance.now() - warmStartStart); if (didWarmStart) { - this.logger.log("Warm start successful", { runId: message.run.id }); + this.logger.debug("Warm start successful", { runId: message.run.id }); return; } diff --git a/apps/supervisor/src/services/failedPodHandler.ts b/apps/supervisor/src/services/failedPodHandler.ts index 07217243769..3d56c92b213 100644 --- a/apps/supervisor/src/services/failedPodHandler.ts +++ b/apps/supervisor/src/services/failedPodHandler.ts @@ -151,7 +151,7 @@ export class FailedPodHandler { } private async onPodCompleted(pod: V1Pod) { - this.logger.info("pod-completed", this.podSummary(pod)); + this.logger.debug("pod-completed", this.podSummary(pod)); this.informerEventsTotal.inc({ namespace: this.namespace, verb: "add" }); if (!pod.metadata?.name) { @@ -165,7 +165,7 @@ export class FailedPodHandler { } if (pod.metadata?.deletionTimestamp) { - this.logger.info("pod-completed: pod is being deleted", this.podSummary(pod)); + this.logger.verbose("pod-completed: pod is being deleted", this.podSummary(pod)); return; } @@ -188,7 +188,7 @@ export class FailedPodHandler { } private async onPodSucceeded(pod: V1Pod) { - this.logger.info("pod-succeeded", this.podSummary(pod)); + this.logger.debug("pod-succeeded", this.podSummary(pod)); this.processedPodsTotal.inc({ namespace: this.namespace, status: this.podStatus(pod), @@ -196,7 +196,7 @@ export class FailedPodHandler { } private async onPodFailed(pod: V1Pod) { - this.logger.info("pod-failed", this.podSummary(pod)); + this.logger.debug("pod-failed", this.podSummary(pod)); try { await this.processFailedPod(pod); @@ -208,7 +208,7 @@ export class FailedPodHandler { } private 
async processFailedPod(pod: V1Pod) { - this.logger.info("pod-failed: processing pod", this.podSummary(pod)); + this.logger.verbose("pod-failed: processing pod", this.podSummary(pod)); const mainContainer = pod.status?.containerStatuses?.find((c) => c.name === "run-controller"); @@ -231,7 +231,7 @@ export class FailedPodHandler { } private async deletePod(pod: V1Pod) { - this.logger.info("pod-failed: deleting pod", this.podSummary(pod)); + this.logger.verbose("pod-failed: deleting pod", this.podSummary(pod)); try { await this.k8s.core.deleteNamespacedPod({ name: pod.metadata!.name!, diff --git a/apps/supervisor/src/services/podCleaner.ts b/apps/supervisor/src/services/podCleaner.ts index 56eaaeb88af..3ac5da293df 100644 --- a/apps/supervisor/src/services/podCleaner.ts +++ b/apps/supervisor/src/services/podCleaner.ts @@ -90,7 +90,7 @@ export class PodCleaner { status: "succeeded", }); - this.logger.info("Deleted batch of pods", { continuationToken }); + this.logger.debug("Deleted batch of pods", { continuationToken }); } catch (err) { this.logger.error("Failed to delete batch of pods", { err: err instanceof Error ? 
err.message : String(err), diff --git a/apps/supervisor/src/workloadManager/docker.ts b/apps/supervisor/src/workloadManager/docker.ts index d6651d325a2..66405df9ba5 100644 --- a/apps/supervisor/src/workloadManager/docker.ts +++ b/apps/supervisor/src/workloadManager/docker.ts @@ -62,7 +62,7 @@ export class DockerWorkloadManager implements WorkloadManager { } async create(opts: WorkloadManagerCreateOptions) { - this.logger.log("create()", { opts }); + this.logger.verbose("create()", { opts }); const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); diff --git a/apps/supervisor/src/workloadManager/kubernetes.ts b/apps/supervisor/src/workloadManager/kubernetes.ts index 16c5eff9da1..891a94ba83a 100644 --- a/apps/supervisor/src/workloadManager/kubernetes.ts +++ b/apps/supervisor/src/workloadManager/kubernetes.ts @@ -100,7 +100,7 @@ export class KubernetesWorkloadManager implements WorkloadManager { } async create(opts: WorkloadManagerCreateOptions) { - this.logger.log("[KubernetesWorkloadManager] Creating container", { opts }); + this.logger.verbose("[KubernetesWorkloadManager] Creating container", { opts }); const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); From bc057054a38f1916ce43d231099cd22da2845aac Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:27:41 +0000 Subject: [PATCH 19/40] feat(supervisor): add COMPUTE_TRACE_OTLP_ENDPOINT override and demote remaining logs Add optional COMPUTE_TRACE_OTLP_ENDPOINT env var to override the OTLP endpoint for supervisor-emitted spans (defaults to TRIGGER_API_URL/otel). Useful for sending spans to an OTel collector instead of the webapp. Also demotes remaining per-run logs in compute workload manager and workload server to debug/verbose. 
--- apps/supervisor/src/env.ts | 7 +- apps/supervisor/src/otlpPayload.ts | 63 +++++++++++++++ apps/supervisor/src/otlpTrace.test.ts | 2 +- apps/supervisor/src/otlpTrace.ts | 76 ++----------------- .../supervisor/src/workloadManager/compute.ts | 17 +++-- apps/supervisor/src/workloadServer/index.ts | 25 +++--- 6 files changed, 99 insertions(+), 91 deletions(-) create mode 100644 apps/supervisor/src/otlpPayload.ts diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index 74ae5d1b11f..063d2293098 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -84,6 +84,7 @@ const Env = z COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000), COMPUTE_SNAPSHOTS_ENABLED: BoolEnv.default(false), COMPUTE_TRACE_SPANS_ENABLED: BoolEnv.default(true), + COMPUTE_TRACE_OTLP_ENDPOINT: z.string().url().optional(), // Override for span export (derived from TRIGGER_API_URL if unset) COMPUTE_SNAPSHOT_DELAY_MS: z.coerce.number().int().min(0).max(60_000).default(5_000), // Kubernetes settings @@ -168,6 +169,10 @@ const Env = z path: ["TRIGGER_WORKLOAD_API_DOMAIN"], }); } - }); + }) + .transform((data) => ({ + ...data, + COMPUTE_TRACE_OTLP_ENDPOINT: data.COMPUTE_TRACE_OTLP_ENDPOINT ?? 
`${data.TRIGGER_API_URL}/otel`, + })); export const env = Env.parse(stdEnv); diff --git a/apps/supervisor/src/otlpPayload.ts b/apps/supervisor/src/otlpPayload.ts new file mode 100644 index 00000000000..3e5b48b530f --- /dev/null +++ b/apps/supervisor/src/otlpPayload.ts @@ -0,0 +1,63 @@ +import { randomBytes } from "crypto"; + +export interface OtlpTraceOptions { + traceId: string; + parentSpanId?: string; + spanName: string; + startTimeMs: number; + endTimeMs: number; + resourceAttributes: Record; + spanAttributes: Record; +} + +/** Build an OTLP JSON ExportTraceServiceRequest payload */ +export function buildOtlpTracePayload(opts: OtlpTraceOptions) { + const spanId = randomBytes(8).toString("hex"); + + return { + resourceSpans: [ + { + resource: { + attributes: [ + { key: "$trigger", value: { boolValue: true } }, + ...toOtlpAttributes(opts.resourceAttributes), + ], + }, + scopeSpans: [ + { + scope: { name: "supervisor.compute" }, + spans: [ + { + traceId: opts.traceId, + spanId, + parentSpanId: opts.parentSpanId, + name: opts.spanName, + kind: 3, // SPAN_KIND_CLIENT + startTimeUnixNano: String(opts.startTimeMs * 1_000_000), + endTimeUnixNano: String(opts.endTimeMs * 1_000_000), + attributes: toOtlpAttributes(opts.spanAttributes), + status: { code: 1 }, // STATUS_CODE_OK + }, + ], + }, + ], + }, + ], + }; +} + +function toOtlpAttributes( + attrs: Record +): Array<{ key: string; value: Record }> { + return Object.entries(attrs).map(([key, value]) => ({ + key, + value: toOtlpValue(value), + })); +} + +function toOtlpValue(value: string | number | boolean): Record { + if (typeof value === "string") return { stringValue: value }; + if (typeof value === "boolean") return { boolValue: value }; + if (Number.isInteger(value)) return { intValue: value }; + return { doubleValue: value }; +} diff --git a/apps/supervisor/src/otlpTrace.test.ts b/apps/supervisor/src/otlpTrace.test.ts index 765ed028216..506a4d497d0 100644 --- a/apps/supervisor/src/otlpTrace.test.ts +++ 
b/apps/supervisor/src/otlpTrace.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from "vitest"; -import { buildOtlpTracePayload } from "./otlpTrace.js"; +import { buildOtlpTracePayload } from "./otlpPayload.js"; describe("buildOtlpTracePayload", () => { it("builds valid OTLP JSON with timing attributes", () => { diff --git a/apps/supervisor/src/otlpTrace.ts b/apps/supervisor/src/otlpTrace.ts index 7a87c056f0d..9cef2cb0d1f 100644 --- a/apps/supervisor/src/otlpTrace.ts +++ b/apps/supervisor/src/otlpTrace.ts @@ -1,83 +1,19 @@ -import { randomBytes } from "crypto"; import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { env } from "./env.js"; +import type { buildOtlpTracePayload } from "./otlpPayload.js"; const logger = new SimpleStructuredLogger("otlp-trace"); -export interface OtlpTraceOptions { - traceId: string; - parentSpanId?: string; - spanName: string; - startTimeMs: number; - endTimeMs: number; - resourceAttributes: Record; - spanAttributes: Record; -} - -/** Build an OTLP JSON ExportTraceServiceRequest payload */ -export function buildOtlpTracePayload(opts: OtlpTraceOptions) { - const spanId = randomBytes(8).toString("hex"); - - return { - resourceSpans: [ - { - resource: { - attributes: [ - { key: "$trigger", value: { boolValue: true } }, - ...toOtlpAttributes(opts.resourceAttributes), - ], - }, - scopeSpans: [ - { - scope: { name: "supervisor.compute" }, - spans: [ - { - traceId: opts.traceId, - spanId, - parentSpanId: opts.parentSpanId, - name: opts.spanName, - kind: 3, // SPAN_KIND_CLIENT - startTimeUnixNano: String(opts.startTimeMs * 1_000_000), - endTimeUnixNano: String(opts.endTimeMs * 1_000_000), - attributes: toOtlpAttributes(opts.spanAttributes), - status: { code: 1 }, // STATUS_CODE_OK - }, - ], - }, - ], - }, - ], - }; -} - -/** Fire-and-forget: send an OTLP trace payload to the collector */ -export function sendOtlpTrace( - endpoint: string, - payload: ReturnType -) { - fetch(`${endpoint}/v1/traces`, 
{ +/** Fire-and-forget: send an OTLP trace payload to the configured endpoint */ +export function sendOtlpTrace(payload: ReturnType) { + fetch(`${env.COMPUTE_TRACE_OTLP_ENDPOINT}/v1/traces`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify(payload), signal: AbortSignal.timeout(5_000), }).catch((err) => { - logger.warn("failed to send compute provision span", { + logger.warn("failed to send compute trace span", { error: err instanceof Error ? err.message : String(err), }); }); } - -function toOtlpAttributes( - attrs: Record -): Array<{ key: string; value: Record }> { - return Object.entries(attrs).map(([key, value]) => ({ - key, - value: toOtlpValue(value), - })); -} - -function toOtlpValue(value: string | number | boolean): Record { - if (typeof value === "string") return { stringValue: value }; - if (typeof value === "boolean") return { boolValue: value }; - if (Number.isInteger(value)) return { intValue: value }; - return { doubleValue: value }; -} diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index f0a126f909a..892f3ecbc08 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -8,7 +8,8 @@ import { } from "./types.js"; import { env } from "../env.js"; import { getRunnerId } from "../util.js"; -import { buildOtlpTracePayload, sendOtlpTrace } from "../otlpTrace.js"; +import { buildOtlpTracePayload } from "../otlpPayload.js"; +import { sendOtlpTrace } from "../otlpTrace.js"; import { tryCatch } from "@trigger.dev/core"; type ComputeWorkloadManagerOptions = WorkloadManagerOptions & { @@ -171,7 +172,7 @@ export class ComputeWorkloadManager implements WorkloadManager { } finally { event.durationMs = Math.round(performance.now() - startMs); event.ok ??= false; - this.logger.info("create instance", event); + this.logger.debug("create instance", event); } } @@ -222,7 +223,7 @@ export class 
ComputeWorkloadManager implements WorkloadManager { return false; } - this.logger.info("snapshot request accepted", { runnerId: opts.runnerId }); + this.logger.debug("snapshot request accepted", { runnerId: opts.runnerId }); return true; } @@ -253,7 +254,7 @@ export class ComputeWorkloadManager implements WorkloadManager { return false; } - this.logger.info("delete instance success", { runnerId }); + this.logger.debug("delete instance success", { runnerId }); return true; } @@ -312,7 +313,7 @@ export class ComputeWorkloadManager implements WorkloadManager { }); // Use the platform API URL, not the runner OTLP endpoint (which may be a VM gateway IP) - sendOtlpTrace(`${env.TRIGGER_API_URL}/otel`, payload); + sendOtlpTrace(payload); } async restore(opts: { @@ -347,7 +348,7 @@ export class ComputeWorkloadManager implements WorkloadManager { memory_mb: opts.machine.memory * 1024, }; - this.logger.debug("restore request body", { url, body }); + this.logger.verbose("restore request body", { url, body }); const startMs = performance.now(); @@ -382,7 +383,7 @@ export class ComputeWorkloadManager implements WorkloadManager { return false; } - this.logger.info("restore request success", { + this.logger.debug("restore request success", { snapshotId: opts.snapshotId, runnerId: opts.runnerId, durationMs, @@ -444,7 +445,7 @@ export class ComputeWorkloadManager implements WorkloadManager { }, }); - sendOtlpTrace(`${env.TRIGGER_API_URL}/otel`, payload); + sendOtlpTrace(payload); } } diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index 02c320b8f1d..10e85b628b2 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -27,7 +27,8 @@ import { env } from "../env.js"; import type { ComputeWorkloadManager } from "../workloadManager/compute.js"; import { TimerWheel } from "../services/timerWheel.js"; import { parseTraceparent } from "@trigger.dev/core/v3/isomorphic"; -import { 
buildOtlpTracePayload, sendOtlpTrace } from "../otlpTrace.js"; +import { buildOtlpTracePayload } from "../otlpPayload.js"; +import { sendOtlpTrace } from "../otlpTrace.js"; // Use the official export when upgrading to socket.io@4.8.0 interface DefaultEventsMap { @@ -469,7 +470,7 @@ export class WorkloadServer extends EventEmitter { httpServer.route("/api/v1/compute/snapshot-complete", "POST", { bodySchema: ComputeSnapshotCallbackBody, handler: async ({ reply, body }) => { - this.logger.info("Compute snapshot callback", { + this.logger.debug("Compute snapshot callback", { snapshotId: body.snapshot_id, instanceId: body.instance_id, status: body.status, @@ -504,7 +505,7 @@ export class WorkloadServer extends EventEmitter { }); if (result.success) { - this.logger.info("Suspend completion submitted", { + this.logger.debug("Suspend completion submitted", { runId, instanceId: body.instance_id, snapshotId: body.snapshot_id, @@ -553,7 +554,7 @@ export class WorkloadServer extends EventEmitter { > = io.of("/workload"); websocketServer.on("disconnect", (socket) => { - this.logger.log("[WS] disconnect", socket.id); + this.logger.verbose("[WS] disconnect", socket.id); }); websocketServer.use(async (socket, next) => { const setSocketDataFromHeader = ( @@ -635,7 +636,7 @@ export class WorkloadServer extends EventEmitter { socket.data.runFriendlyId = undefined; }; - socketLogger.log("wsServer socket connected", { ...getSocketMetadata() }); + socketLogger.debug("wsServer socket connected", { ...getSocketMetadata() }); // FIXME: where does this get set? 
if (socket.data.runFriendlyId) { @@ -643,7 +644,7 @@ export class WorkloadServer extends EventEmitter { } socket.on("disconnecting", (reason, description) => { - socketLogger.log("Socket disconnecting", { ...getSocketMetadata(), reason, description }); + socketLogger.verbose("Socket disconnecting", { ...getSocketMetadata(), reason, description }); if (socket.data.runFriendlyId) { runDisconnected(socket.data.runFriendlyId); @@ -651,7 +652,7 @@ export class WorkloadServer extends EventEmitter { }); socket.on("disconnect", (reason, description) => { - socketLogger.log("Socket disconnected", { ...getSocketMetadata(), reason, description }); + socketLogger.debug("Socket disconnected", { ...getSocketMetadata(), reason, description }); }); socket.on("error", (error) => { @@ -672,7 +673,7 @@ export class WorkloadServer extends EventEmitter { ...message, }); - log.log("Handling run:start"); + log.debug("Handling run:start"); try { runConnected(message.run.friendlyId); @@ -688,11 +689,13 @@ export class WorkloadServer extends EventEmitter { ...message, }); - log.log("Handling run:stop"); + log.debug("Handling run:stop"); try { runDisconnected(message.run.friendlyId); - this.runTraceContexts.delete(message.run.friendlyId); + // Don't delete trace context here - run:stop fires after each snapshot/shutdown + // but the run may be restored on a new VM and snapshot again. Trace context is + // re-populated on dequeue, and entries are small (4 strings per run). 
} catch (error) { log.error("run:stop error", { error }); } @@ -799,7 +802,7 @@ export class WorkloadServer extends EventEmitter { spanAttributes, }); - sendOtlpTrace(`${env.TRIGGER_API_URL}/otel`, payload); + sendOtlpTrace(payload); } registerRunTraceContext(runFriendlyId: string, ctx: RunTraceContext) { From 7b37b0c50c8e21a4fe1c592cf0af1bb36454a9ae Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 26 Mar 2026 15:59:12 +0000 Subject: [PATCH 20/40] feat(database): add WorkloadType enum and column to WorkerInstanceGroup --- .../migration.sql | 5 +++++ internal-packages/database/prisma/schema.prisma | 7 +++++++ 2 files changed, 12 insertions(+) create mode 100644 internal-packages/database/prisma/migrations/20260326150000_add_workload_type_to_worker_instance_group/migration.sql diff --git a/internal-packages/database/prisma/migrations/20260326150000_add_workload_type_to_worker_instance_group/migration.sql b/internal-packages/database/prisma/migrations/20260326150000_add_workload_type_to_worker_instance_group/migration.sql new file mode 100644 index 00000000000..4865ae070e3 --- /dev/null +++ b/internal-packages/database/prisma/migrations/20260326150000_add_workload_type_to_worker_instance_group/migration.sql @@ -0,0 +1,5 @@ +-- CreateEnum +CREATE TYPE "WorkloadType" AS ENUM ('CONTAINER', 'MICROVM'); + +-- AlterTable +ALTER TABLE "WorkerInstanceGroup" ADD COLUMN "workloadType" "WorkloadType" NOT NULL DEFAULT 'CONTAINER'; diff --git a/internal-packages/database/prisma/schema.prisma b/internal-packages/database/prisma/schema.prisma index 5ebc78508b9..42d05459369 100644 --- a/internal-packages/database/prisma/schema.prisma +++ b/internal-packages/database/prisma/schema.prisma @@ -1280,6 +1280,11 @@ enum WorkerInstanceGroupType { UNMANAGED } +enum WorkloadType { + CONTAINER + MICROVM +} + model WorkerInstanceGroup { id String @id @default(cuid()) type WorkerInstanceGroupType @@ -1314,6 +1319,8 @@ model WorkerInstanceGroup { 
location String? staticIPs String? + workloadType WorkloadType @default(CONTAINER) + createdAt DateTime @default(now()) updatedAt DateTime @updatedAt } From 441334ba1dbec2a2ac9c1108bb62b65d25f45ce0 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 26 Mar 2026 16:12:45 +0000 Subject: [PATCH 21/40] feat(core): add shared compute gateway client and template creation types --- packages/core/package.json | 14 +++++- packages/core/src/v3/compute/gatewayClient.ts | 46 +++++++++++++++++++ packages/core/src/v3/compute/index.ts | 7 +++ packages/core/src/v3/compute/types.ts | 24 ++++++++++ 4 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 packages/core/src/v3/compute/gatewayClient.ts create mode 100644 packages/core/src/v3/compute/index.ts create mode 100644 packages/core/src/v3/compute/types.ts diff --git a/packages/core/package.json b/packages/core/package.json index 1ecf044bf6e..58094913914 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -52,7 +52,8 @@ "./v3/runEngineWorker": "./src/v3/runEngineWorker/index.ts", "./v3/machines": "./src/v3/machines/index.ts", "./v3/serverOnly": "./src/v3/serverOnly/index.ts", - "./v3/isomorphic": "./src/v3/isomorphic/index.ts" + "./v3/isomorphic": "./src/v3/isomorphic/index.ts", + "./v3/compute": "./src/v3/compute/index.ts" }, "sourceDialects": [ "@triggerdotdev/source" @@ -577,6 +578,17 @@ "types": "./dist/commonjs/v3/isomorphic/index.d.ts", "default": "./dist/commonjs/v3/isomorphic/index.js" } + }, + "./v3/compute": { + "import": { + "@triggerdotdev/source": "./src/v3/compute/index.ts", + "types": "./dist/esm/v3/compute/index.d.ts", + "default": "./dist/esm/v3/compute/index.js" + }, + "require": { + "types": "./dist/commonjs/v3/compute/index.d.ts", + "default": "./dist/commonjs/v3/compute/index.js" + } } }, "type": "module", diff --git a/packages/core/src/v3/compute/gatewayClient.ts b/packages/core/src/v3/compute/gatewayClient.ts new file mode 100644 
index 00000000000..025a4ec0fbf --- /dev/null +++ b/packages/core/src/v3/compute/gatewayClient.ts @@ -0,0 +1,46 @@ +import type { TemplateCreateRequest } from "./types.js"; + +export type ComputeGatewayClientOptions = { + gatewayUrl: string; + authToken?: string; + timeoutMs: number; +}; + +export class ComputeGatewayClient { + constructor(private opts: ComputeGatewayClientOptions) {} + + async createTemplate( + req: TemplateCreateRequest, + options?: { signal?: AbortSignal } + ): Promise<{ accepted: boolean; templateId?: string }> { + const url = `${this.opts.gatewayUrl}/api/templates`; + + const headers: Record = { + "Content-Type": "application/json", + }; + if (this.opts.authToken) { + headers["Authorization"] = `Bearer ${this.opts.authToken}`; + } + + const signal = options?.signal ?? AbortSignal.timeout(this.opts.timeoutMs); + + const response = await fetch(url, { + method: "POST", + headers, + body: JSON.stringify(req), + signal, + }); + + if (!response.ok) { + const errorBody = await response.text().catch(() => "unknown error"); + throw new Error(`Gateway template creation failed (${response.status}): ${errorBody}`); + } + + if (response.status === 202) { + return { accepted: true }; + } + + const result = await response.json(); + return { accepted: false, templateId: result.template_id }; + } +} diff --git a/packages/core/src/v3/compute/index.ts b/packages/core/src/v3/compute/index.ts new file mode 100644 index 00000000000..20bb36a7bf6 --- /dev/null +++ b/packages/core/src/v3/compute/index.ts @@ -0,0 +1,7 @@ +export { ComputeGatewayClient } from "./gatewayClient.js"; +export type { ComputeGatewayClientOptions } from "./gatewayClient.js"; +export { + TemplateCreateRequestSchema, + TemplateCallbackPayloadSchema, +} from "./types.js"; +export type { TemplateCreateRequest, TemplateCallbackPayload } from "./types.js"; diff --git a/packages/core/src/v3/compute/types.ts b/packages/core/src/v3/compute/types.ts new file mode 100644 index 00000000000..b45bddfdaef --- 
/dev/null +++ b/packages/core/src/v3/compute/types.ts @@ -0,0 +1,24 @@ +import { z } from "zod"; + +export const TemplateCreateRequestSchema = z.object({ + image: z.string(), + cpu: z.number(), + memory_mb: z.number(), + callback: z + .object({ + url: z.string(), + metadata: z.record(z.string()).optional(), + }) + .optional(), +}); +export type TemplateCreateRequest = z.infer; + +export const TemplateCallbackPayloadSchema = z.object({ + template_id: z.string().optional(), + image: z.string(), + status: z.enum(["completed", "failed"]), + error: z.string().optional(), + metadata: z.record(z.string()).optional(), + duration_ms: z.number().optional(), +}); +export type TemplateCallbackPayload = z.infer; From d0149e98e4ec5af4c30fcb27691b37ddb205474a Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 26 Mar 2026 16:33:36 +0000 Subject: [PATCH 22/40] feat(webapp): add compute template creation service --- apps/webapp/app/env.server.ts | 5 ++ .../computeTemplateCreation.server.ts | 69 +++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 apps/webapp/app/v3/services/computeTemplateCreation.server.ts diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 2e6da79fdf1..318e128fbe6 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -333,6 +333,11 @@ const EnvironmentSchema = z .optional() .transform((v) => v ?? 
process.env.DEPLOY_REGISTRY_ECR_ASSUME_ROLE_EXTERNAL_ID), + // Compute gateway (template creation during deploy finalize) + COMPUTE_GATEWAY_URL: z.string().optional(), + COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), + COMPUTE_TEMPLATE_SHADOW_ROLLOUT_PCT: z.string().optional(), + DEPLOY_IMAGE_PLATFORM: z.string().default("linux/amd64"), DEPLOY_TIMEOUT_MS: z.coerce .number() diff --git a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts new file mode 100644 index 00000000000..3a283af6c0b --- /dev/null +++ b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts @@ -0,0 +1,69 @@ +import { ComputeGatewayClient } from "@trigger.dev/core/v3/compute"; +import { env } from "~/env.server"; +import { logger } from "~/services/logger.server"; +import type { PrismaClientOrTransaction } from "~/db.server"; + +type TemplateCreationMode = "required" | "shadow" | "skip"; + +export class ComputeTemplateCreationService { + private client: ComputeGatewayClient | undefined; + + constructor() { + if (env.COMPUTE_GATEWAY_URL) { + this.client = new ComputeGatewayClient({ + gatewayUrl: env.COMPUTE_GATEWAY_URL, + authToken: env.COMPUTE_GATEWAY_AUTH_TOKEN, + timeoutMs: 5 * 60 * 1000, // 5 minutes + }); + } + } + + async resolveMode( + projectId: string, + prisma: PrismaClientOrTransaction + ): Promise { + const project = await prisma.project.findFirst({ + where: { id: projectId }, + select: { + defaultWorkerGroup: { + select: { workloadType: true }, + }, + }, + }); + + if (project?.defaultWorkerGroup?.workloadType === "MICROVM") { + return "required"; + } + + // TODO: check private beta feature flag for org + + const rolloutPct = Number(env.COMPUTE_TEMPLATE_SHADOW_ROLLOUT_PCT ?? 
"0"); + if (rolloutPct > 0 && Math.random() * 100 < rolloutPct) { + return "shadow"; + } + + return "skip"; + } + + async createTemplate(imageReference: string): Promise<{ success: boolean; error?: string }> { + if (!this.client) { + return { success: false, error: "Compute gateway not configured" }; + } + + try { + await this.client.createTemplate({ + image: imageReference, + cpu: 0.5, + memory_mb: 512, + }); + return { success: true }; + } catch (error) { + const message = error instanceof Error ? error.message : "Unknown error"; + logger.error("Failed to create compute template", { + imageReference, + error: message, + }); + return { success: false, error: message }; + } + } +} From 50cf67237264d2d98ebbee9ba83fcfdfba8eb597 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 26 Mar 2026 16:33:44 +0000 Subject: [PATCH 23/40] feat(webapp): integrate template creation into deploy finalize flow --- .../services/finalizeDeploymentV2.server.ts | 103 +++++++++++++++++- 1 file changed, 102 insertions(+), 1 deletion(-) diff --git a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts index 2ad2b7b8258..f35ab9e5aa1 100644 --- a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts +++ b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts @@ -11,10 +11,12 @@ import { mkdtemp, writeFile } from "node:fs/promises"; import { env } from "~/env.server"; import { depot as execDepot } from "@depot/cli"; import { FinalizeDeploymentService } from "./finalizeDeployment.server"; +import { FailDeploymentService } from "./failDeployment.server"; import { remoteBuildsEnabled } from "../remoteImageBuilder.server"; import { getEcrAuthToken, isEcrRegistry } from "../getDeploymentImageRef.server"; import { tryCatch } from "@trigger.dev/core"; import { getRegistryConfig, type RegistryConfig } from "../registryConfig.server"; +import { ComputeTemplateCreationService } from 
"./computeTemplateCreation.server"; export class FinalizeDeploymentV2Service extends BaseService { public async call( @@ -77,7 +79,33 @@ export class FinalizeDeploymentV2Service extends BaseService { logger.debug("Skipping push to registry during deployment finalization", { deployment, }); - return await finalizeService.call(authenticatedEnv, id, body); + + let templateMode: "required" | "shadow" | "skip" = "skip"; + if (deployment.imageReference) { + templateMode = await this.#handleTemplateCreation({ + templateService: new ComputeTemplateCreationService(), + projectId: deployment.worker.project.id, + imageReference: deployment.imageReference, + deploymentFriendlyId: id, + authenticatedEnv, + writer, + }); + } + + const result = await finalizeService.call(authenticatedEnv, id, body); + + if (templateMode === "shadow" && deployment.imageReference) { + const shadowService = new ComputeTemplateCreationService(); + shadowService.createTemplate(deployment.imageReference).catch((error) => { + logger.error("Shadow compute template creation failed", { + id, + imageReference: deployment.imageReference, + error: error instanceof Error ? 
error.message : String(error), + }); + }); + } + + return result; } const externalBuildData = deployment.externalBuildData @@ -143,10 +171,83 @@ export class FinalizeDeploymentV2Service extends BaseService { pushedImage: pushResult.image, }); + const templateMode = await this.#handleTemplateCreation({ + templateService: new ComputeTemplateCreationService(), + projectId: deployment.worker.project.id, + imageReference: deployment.imageReference, + deploymentFriendlyId: id, + authenticatedEnv, + writer, + }); + const finalizedDeployment = await finalizeService.call(authenticatedEnv, id, body); + // Shadow mode: fire-and-forget template creation after deploy is finalized + if (templateMode === "shadow") { + const shadowService = new ComputeTemplateCreationService(); + shadowService.createTemplate(deployment.imageReference).catch((error) => { + logger.error("Shadow compute template creation failed", { + id, + imageReference: deployment.imageReference, + error: error instanceof Error ? error.message : String(error), + }); + }); + } + return finalizedDeployment; } + + async #handleTemplateCreation(options: { + templateService: ComputeTemplateCreationService; + projectId: string; + imageReference: string; + deploymentFriendlyId: string; + authenticatedEnv: AuthenticatedEnvironment; + writer?: WritableStreamDefaultWriter; + }): Promise<"required" | "shadow" | "skip"> { + const { templateService, projectId, imageReference, deploymentFriendlyId, authenticatedEnv, writer } = options; + + const mode = await templateService.resolveMode(projectId, this._prisma); + + if (mode !== "required") { + return mode; + } + + if (writer) { + await writer.write( + `event: log\ndata: ${JSON.stringify({ message: "Building compute template..." 
})}\n\n` + ); + } + + const templateResult = await templateService.createTemplate(imageReference); + + if (!templateResult.success) { + logger.error("Compute template creation failed", { + id: deploymentFriendlyId, + imageReference, + error: templateResult.error, + }); + + const failService = new FailDeploymentService(); + await failService.call(authenticatedEnv, deploymentFriendlyId, { + error: { + name: "TemplateCreationFailed", + message: `Failed to create compute template: ${templateResult.error}`, + }, + }); + + throw new ServiceValidationError( + `Compute template creation failed: ${templateResult.error}` + ); + } + + logger.debug("Compute template created", { + id: deploymentFriendlyId, + imageReference, + }); + + return mode; + } } type ExecutePushToRegistryOptions = { From 899a7fb13f0d382645b57a2fec455f3b3c26cb2b Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 26 Mar 2026 16:47:41 +0000 Subject: [PATCH 24/40] fix: deduplicate shadow template creation and reuse service instance --- .../services/finalizeDeploymentV2.server.ts | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts index f35ab9e5aa1..9d4200c891b 100644 --- a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts +++ b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts @@ -74,6 +74,7 @@ export class FinalizeDeploymentV2Service extends BaseService { } const finalizeService = new FinalizeDeploymentService(); + const templateService = new ComputeTemplateCreationService(); if (body.skipPushToRegistry) { logger.debug("Skipping push to registry during deployment finalization", { @@ -83,7 +84,7 @@ export class FinalizeDeploymentV2Service extends BaseService { let templateMode: "required" | "shadow" | "skip" = "skip"; if (deployment.imageReference) { templateMode = await 
this.#handleTemplateCreation({ - templateService: new ComputeTemplateCreationService(), + templateService, projectId: deployment.worker.project.id, imageReference: deployment.imageReference, deploymentFriendlyId: id, @@ -95,14 +96,7 @@ export class FinalizeDeploymentV2Service extends BaseService { const result = await finalizeService.call(authenticatedEnv, id, body); if (templateMode === "shadow" && deployment.imageReference) { - const shadowService = new ComputeTemplateCreationService(); - shadowService.createTemplate(deployment.imageReference).catch((error) => { - logger.error("Shadow compute template creation failed", { - id, - imageReference: deployment.imageReference, - error: error instanceof Error ? error.message : String(error), - }); - }); + this.#fireShadowTemplateCreation(templateService, deployment.imageReference, id); } return result; @@ -172,7 +166,7 @@ export class FinalizeDeploymentV2Service extends BaseService { }); const templateMode = await this.#handleTemplateCreation({ - templateService: new ComputeTemplateCreationService(), + templateService, projectId: deployment.worker.project.id, imageReference: deployment.imageReference, deploymentFriendlyId: id, @@ -184,14 +178,7 @@ export class FinalizeDeploymentV2Service extends BaseService { // Shadow mode: fire-and-forget template creation after deploy is finalized if (templateMode === "shadow") { - const shadowService = new ComputeTemplateCreationService(); - shadowService.createTemplate(deployment.imageReference).catch((error) => { - logger.error("Shadow compute template creation failed", { - id, - imageReference: deployment.imageReference, - error: error instanceof Error ? 
error.message : String(error), - }); - }); + this.#fireShadowTemplateCreation(templateService, deployment.imageReference, id); } return finalizedDeployment; @@ -248,6 +235,20 @@ export class FinalizeDeploymentV2Service extends BaseService { return mode; } + + #fireShadowTemplateCreation( + templateService: ComputeTemplateCreationService, + imageReference: string, + deploymentFriendlyId: string + ) { + templateService.createTemplate(imageReference).catch((error) => { + logger.error("Shadow compute template creation failed", { + id: deploymentFriendlyId, + imageReference, + error: error instanceof Error ? error.message : String(error), + }); + }); + } } type ExecutePushToRegistryOptions = { From 3cc7874b3e3787e6b67cafcc822007d683694a1f Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 26 Mar 2026 17:43:23 +0000 Subject: [PATCH 25/40] fix(webapp): add template creation to V1 finalize path (non-remote builds) --- .../v3/services/finalizeDeployment.server.ts | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/apps/webapp/app/v3/services/finalizeDeployment.server.ts b/apps/webapp/app/v3/services/finalizeDeployment.server.ts index 6cbfc323e7a..55a5907f8c1 100644 --- a/apps/webapp/app/v3/services/finalizeDeployment.server.ts +++ b/apps/webapp/app/v3/services/finalizeDeployment.server.ts @@ -12,6 +12,7 @@ import { TimeoutDeploymentService } from "./timeoutDeployment.server"; import { DeploymentService } from "./deployment.server"; import { engine } from "../runEngine.server"; import { tryCatch } from "@trigger.dev/core"; +import { ComputeTemplateCreationService } from "./computeTemplateCreation.server"; export class FinalizeDeploymentService extends BaseService { public async call( @@ -65,6 +66,47 @@ export class FinalizeDeploymentService extends BaseService { const imageDigest = validatedImageDigest(body.imageDigest); + // Compute template creation (before setting DEPLOYED) + const templateService = new 
ComputeTemplateCreationService(); + const templateMode = await templateService.resolveMode( + authenticatedEnv.projectId, + this._prisma + ); + + if (templateMode === "required" && deployment.imageReference) { + logger.info("Creating compute template (required mode)", { + id, + imageReference: deployment.imageReference, + }); + + const templateResult = await templateService.createTemplate(deployment.imageReference); + + if (!templateResult.success) { + logger.error("Compute template creation failed", { + id, + imageReference: deployment.imageReference, + error: templateResult.error, + }); + + const failService = new FailDeploymentService(); + await failService.call(authenticatedEnv, deployment.friendlyId, { + error: { + name: "TemplateCreationFailed", + message: `Failed to create compute template: ${templateResult.error}`, + }, + }); + + throw new ServiceValidationError( + `Compute template creation failed: ${templateResult.error}` + ); + } + + logger.info("Compute template created", { + id, + imageReference: deployment.imageReference, + }); + } + // Link the deployment with the background worker const finalizedDeployment = await this._prisma.workerDeployment.update({ where: { @@ -147,6 +189,17 @@ export class FinalizeDeploymentService extends BaseService { await PerformDeploymentAlertsService.enqueue(deployment.id); + // Shadow mode: fire-and-forget template creation after deploy is finalized + if (templateMode === "shadow" && deployment.imageReference) { + templateService.createTemplate(deployment.imageReference).catch((error) => { + logger.error("Shadow compute template creation failed", { + id, + imageReference: deployment.imageReference, + error: error instanceof Error ? 
error.message : String(error), + }); + }); + } + return finalizedDeployment; } } From 8f08403faf2701a2076e85f517b98df412a4186c Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 26 Mar 2026 22:43:03 +0000 Subject: [PATCH 26/40] refactor: move compute gateway client from @trigger.dev/core to @internal/compute --- .../computeTemplateCreation.server.ts | 2 +- apps/webapp/package.json | 1 + internal-packages/compute/package.json | 14 +++++++ .../compute/src}/gatewayClient.ts | 0 .../compute/src}/index.ts | 0 .../compute/src}/types.ts | 0 internal-packages/compute/tsconfig.json | 18 ++++++++ packages/core/package.json | 14 +------ pnpm-lock.yaml | 42 +++++++++++++------ 9 files changed, 64 insertions(+), 27 deletions(-) create mode 100644 internal-packages/compute/package.json rename {packages/core/src/v3/compute => internal-packages/compute/src}/gatewayClient.ts (100%) rename {packages/core/src/v3/compute => internal-packages/compute/src}/index.ts (100%) rename {packages/core/src/v3/compute => internal-packages/compute/src}/types.ts (100%) create mode 100644 internal-packages/compute/tsconfig.json diff --git a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts index 3a283af6c0b..f01e27fc956 100644 --- a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts +++ b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts @@ -1,4 +1,4 @@ -import { ComputeGatewayClient } from "@trigger.dev/core/v3/compute"; +import { ComputeGatewayClient } from "@internal/compute"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; import type { PrismaClientOrTransaction } from "~/db.server"; diff --git a/apps/webapp/package.json b/apps/webapp/package.json index a34b0cf7f90..a94889e2580 100644 --- a/apps/webapp/package.json +++ b/apps/webapp/package.json @@ -57,6 +57,7 @@ "@heroicons/react": "^2.0.12", 
"@jsonhero/schema-infer": "^0.1.5", "@internal/cache": "workspace:*", + "@internal/compute": "workspace:*", "@internal/llm-pricing": "workspace:*", "@internal/redis": "workspace:*", "@internal/run-engine": "workspace:*", diff --git a/internal-packages/compute/package.json b/internal-packages/compute/package.json new file mode 100644 index 00000000000..4671565936c --- /dev/null +++ b/internal-packages/compute/package.json @@ -0,0 +1,14 @@ +{ + "name": "@internal/compute", + "private": true, + "version": "0.0.1", + "main": "./src/index.ts", + "types": "./src/index.ts", + "type": "module", + "dependencies": { + "zod": "3.23.8" + }, + "scripts": { + "typecheck": "tsc --noEmit" + } +} diff --git a/packages/core/src/v3/compute/gatewayClient.ts b/internal-packages/compute/src/gatewayClient.ts similarity index 100% rename from packages/core/src/v3/compute/gatewayClient.ts rename to internal-packages/compute/src/gatewayClient.ts diff --git a/packages/core/src/v3/compute/index.ts b/internal-packages/compute/src/index.ts similarity index 100% rename from packages/core/src/v3/compute/index.ts rename to internal-packages/compute/src/index.ts diff --git a/packages/core/src/v3/compute/types.ts b/internal-packages/compute/src/types.ts similarity index 100% rename from packages/core/src/v3/compute/types.ts rename to internal-packages/compute/src/types.ts diff --git a/internal-packages/compute/tsconfig.json b/internal-packages/compute/tsconfig.json new file mode 100644 index 00000000000..ec9998c5e00 --- /dev/null +++ b/internal-packages/compute/tsconfig.json @@ -0,0 +1,18 @@ +{ + "compilerOptions": { + "target": "ES2019", + "lib": ["ES2019", "DOM", "DOM.Iterable", "DOM.AsyncIterable"], + "module": "Node16", + "moduleResolution": "Node16", + "moduleDetection": "force", + "verbatimModuleSyntax": false, + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "isolatedModules": true, + "preserveWatchOutput": true, + "skipLibCheck": true, + "noEmit": true, + "strict": 
true + }, + "exclude": ["node_modules"] +} diff --git a/packages/core/package.json b/packages/core/package.json index 58094913914..1ecf044bf6e 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -52,8 +52,7 @@ "./v3/runEngineWorker": "./src/v3/runEngineWorker/index.ts", "./v3/machines": "./src/v3/machines/index.ts", "./v3/serverOnly": "./src/v3/serverOnly/index.ts", - "./v3/isomorphic": "./src/v3/isomorphic/index.ts", - "./v3/compute": "./src/v3/compute/index.ts" + "./v3/isomorphic": "./src/v3/isomorphic/index.ts" }, "sourceDialects": [ "@triggerdotdev/source" @@ -578,17 +577,6 @@ "types": "./dist/commonjs/v3/isomorphic/index.d.ts", "default": "./dist/commonjs/v3/isomorphic/index.js" } - }, - "./v3/compute": { - "import": { - "@triggerdotdev/source": "./src/v3/compute/index.ts", - "types": "./dist/esm/v3/compute/index.d.ts", - "default": "./dist/esm/v3/compute/index.js" - }, - "require": { - "types": "./dist/commonjs/v3/compute/index.d.ts", - "default": "./dist/commonjs/v3/compute/index.js" - } } }, "type": "module", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index bc27428575a..93096ee823e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -305,6 +305,9 @@ importers: '@internal/cache': specifier: workspace:* version: link:../../internal-packages/cache + '@internal/compute': + specifier: workspace:* + version: link:../../internal-packages/compute '@internal/llm-pricing': specifier: workspace:* version: link:../../internal-packages/llm-pricing @@ -1075,6 +1078,12 @@ importers: specifier: 6.0.1 version: 6.0.1 + internal-packages/compute: + dependencies: + zod: + specifier: 3.23.8 + version: 3.23.8 + internal-packages/database: dependencies: '@prisma/client': @@ -1113,7 +1122,7 @@ importers: version: 18.3.1 react-email: specifier: ^2.1.1 - version: 2.1.2(@opentelemetry/api@1.9.0)(@swc/helpers@0.5.15)(eslint@8.31.0) + version: 2.1.2(@opentelemetry/api@1.9.0)(@swc/helpers@0.5.15)(bufferutil@4.0.9)(eslint@8.31.0) resend: specifier: ^3.2.0 
version: 3.2.0 @@ -11205,7 +11214,7 @@ packages: '@vercel/postgres@0.10.0': resolution: {integrity: sha512-fSD23DxGND40IzSkXjcFcxr53t3Tiym59Is0jSYIFpG4/0f0KO9SGtcp1sXiebvPaGe7N/tU05cH4yt2S6/IPg==} engines: {node: '>=18.14'} - deprecated: '@vercel/postgres is deprecated. You can either choose an alternate storage solution from the Vercel Marketplace if you want to set up a new database. Or you can follow this guide to migrate your existing Vercel Postgres db: https://neon.com/docs/guides/vercel-postgres-transition-guide' + deprecated: '@vercel/postgres is deprecated. If you are setting up a new database, you can choose an alternate storage solution from the Vercel Marketplace. If you had an existing Vercel Postgres database, it should have been migrated to Neon as a native Vercel integration. You can find more details and the guide to migrate to Neon''s SDKs here: https://neon.com/docs/guides/vercel-postgres-transition-guide' '@vercel/sdk@1.19.1': resolution: {integrity: sha512-K4rmtUT6t1vX06tiY44ot8A7W1FKN7g/tMkE7yZghCgNQ8b30SzljBd4ni8RNp2pJzM/HrZmphRDeIArO7oZuw==} @@ -11872,6 +11881,7 @@ packages: basic-ftp@5.0.3: resolution: {integrity: sha512-QHX8HLlncOLpy54mh+k/sWIFd0ThmRqwe9ZjELybGZK+tZ8rUb9VO0saKJUROTbE+KhzDUT7xziGpGrW8Kmd+g==} engines: {node: '>=10.0.0'} + deprecated: Security vulnerability fixed in 5.2.0, please upgrade bcrypt-pbkdf@1.0.2: resolution: {integrity: sha512-qeFIXtP4MSoi6NLqO12WfqARWWuCKi2Rn/9hJLEmtB5yTNr9DqFWkJRCf2qShWzPeAMRnOgCrq0sg/KLv5ES9w==} @@ -14311,25 +14321,29 @@ packages: glob@10.3.10: resolution: {integrity: sha512-fa46+tv1Ak0UPK1TOy/pZrIybNNt4HCv7SDzwyfiOZkvZLEbjsZkJBPtDHVshZjbecAoAGSC20MjLDG/qr679g==} engines: {node: '>=16 || 14 >=14.17'} + deprecated: Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. 
Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me hasBin: true glob@10.3.4: resolution: {integrity: sha512-6LFElP3A+i/Q8XQKEvZjkEWEOTgAIALR9AO2rwT8bgPhDd1anmqDJDZ6lLddI4ehxxxR1S5RIqKe1uapMQfYaQ==} engines: {node: '>=16 || 14 >=14.17'} + deprecated: Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me hasBin: true glob@10.4.5: resolution: {integrity: sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==} + deprecated: Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me hasBin: true glob@11.0.0: resolution: {integrity: sha512-9UiX/Bl6J2yaBbxKoEBRm4Cipxgok8kQYcOPEhScPwebu2I0HoQOuYdIO6S3hLuWoZgpDpwQZMzTFxgpkyT76g==} engines: {node: 20 || >=22} + deprecated: Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me hasBin: true glob@7.2.3: resolution: {integrity: sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==} - deprecated: Glob versions prior to v9 are no longer supported + deprecated: Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. 
Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me glob@9.3.5: resolution: {integrity: sha512-e1LleDykUz2Iu+MTYdkSsuWX8lvAjAcs0Xef0lNIu0S2wOAzuTxCJtcd9S3cijlwYF18EsU3rzb8jPVobxDh9Q==} @@ -17347,6 +17361,7 @@ packages: prebuild-install@7.1.3: resolution: {integrity: sha512-8Mf2cbV7x1cXPUILADGI3wuhfqWvtiLA1iclTDbFRZkgRQS0NqsPZphna9V+HyTEadheuPmjaJMsbzKQFOzLug==} engines: {node: '>=10'} + deprecated: No longer maintained. Please contact the author of the relevant native addon; alternatives are available. hasBin: true preferred-pm@3.0.3: @@ -19044,21 +19059,22 @@ packages: tar@6.1.13: resolution: {integrity: sha512-jdIBIN6LTIe2jqzay/2vtYLlBHa3JF42ot3h1dW8Q0PaAG4v8rm0cvpVePtau5C6OKXGGcgO9q2AMNSWxiLqKw==} engines: {node: '>=10'} - deprecated: Old versions of tar are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exhorbitant rates) by contacting i@izs.me + deprecated: Old versions of tar are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me tar@6.2.1: resolution: {integrity: sha512-DZ4yORTwrbTj/7MZYq2w+/ZFdI6OZ/f9SFHR+71gIVUZhOQPHzVCLpvRnPgyaMpfWxxk/4ONva3GQSyNIKRv6A==} engines: {node: '>=10'} - deprecated: Old versions of tar are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exhorbitant rates) by contacting i@izs.me + deprecated: Old versions of tar are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. 
Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me tar@7.4.3: resolution: {integrity: sha512-5S7Va8hKfV7W5U6g3aYxXmlPoZVAwUMy9AOKyF2fVuZa2UD3qZjg578OrLRt8PcNN1PleVaL/5/yYATNL0ICUw==} engines: {node: '>=18'} - deprecated: Old versions of tar are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exhorbitant rates) by contacting i@izs.me + deprecated: Old versions of tar are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me tar@7.5.6: resolution: {integrity: sha512-xqUeu2JAIJpXyvskvU3uvQW8PAmHrtXp2KDuMJwQqW8Sqq0CaZBAQ+dKS3RBXVhU4wC5NjAdKrmh84241gO9cA==} engines: {node: '>=18'} + deprecated: Old versions of tar are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. 
Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me tdigest@0.1.2: resolution: {integrity: sha512-+G0LLgjjo9BZX2MfdvPfH+MKLCrxlXSYec5DaPYP1fe6Iyhf0/fSmJ0bFiZ1F8BT6cGXl2LpltQptzjXKWEkKA==} @@ -23234,7 +23250,7 @@ snapshots: '@epic-web/test-server@0.1.0(bufferutil@4.0.9)': dependencies: '@hono/node-server': 1.12.2(hono@4.5.11) - '@hono/node-ws': 1.0.4(@hono/node-server@1.12.2(hono@4.11.8))(bufferutil@4.0.9) + '@hono/node-ws': 1.0.4(@hono/node-server@1.12.2(hono@4.5.11))(bufferutil@4.0.9) '@open-draft/deferred-promise': 2.2.0 '@types/ws': 8.5.12 hono: 4.5.11 @@ -23989,7 +24005,7 @@ snapshots: dependencies: hono: 4.11.8 - '@hono/node-ws@1.0.4(@hono/node-server@1.12.2(hono@4.11.8))(bufferutil@4.0.9)': + '@hono/node-ws@1.0.4(@hono/node-server@1.12.2(hono@4.5.11))(bufferutil@4.0.9)': dependencies: '@hono/node-server': 1.12.2(hono@4.5.11) ws: 8.18.3(bufferutil@4.0.9) @@ -39409,7 +39425,7 @@ snapshots: react: 18.2.0 react-dom: 18.2.0(react@18.2.0) - react-email@2.1.2(@opentelemetry/api@1.9.0)(@swc/helpers@0.5.15)(eslint@8.31.0): + react-email@2.1.2(@opentelemetry/api@1.9.0)(@swc/helpers@0.5.15)(bufferutil@4.0.9)(eslint@8.31.0): dependencies: '@babel/parser': 7.24.1 '@radix-ui/colors': 1.0.1 @@ -39446,8 +39462,8 @@ snapshots: react: 18.3.1 react-dom: 18.2.0(react@18.3.1) shelljs: 0.8.5 - socket.io: 4.7.3 - socket.io-client: 4.7.3 + socket.io: 4.7.3(bufferutil@4.0.9) + socket.io-client: 4.7.3(bufferutil@4.0.9) sonner: 1.3.1(react-dom@18.2.0(react@18.3.1))(react@18.3.1) source-map-js: 1.0.2 stacktrace-parser: 0.1.10 @@ -40674,7 +40690,7 @@ snapshots: - supports-color - utf-8-validate - socket.io-client@4.7.3: + socket.io-client@4.7.3(bufferutil@4.0.9): dependencies: '@socket.io/component-emitter': 3.1.0 debug: 4.3.7(supports-color@10.0.0) @@ -40703,7 +40719,7 @@ snapshots: transitivePeerDependencies: - supports-color - socket.io@4.7.3: + socket.io@4.7.3(bufferutil@4.0.9): dependencies: accepts: 1.3.8 base64id: 2.0.0 From 
d0445eff685d316306bbbfb04e2afc7b71daa69c Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 26 Mar 2026 23:07:28 +0000 Subject: [PATCH 27/40] fix: simplify gateway client return type, remove unused json parsing --- internal-packages/compute/src/gatewayClient.ts | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/internal-packages/compute/src/gatewayClient.ts b/internal-packages/compute/src/gatewayClient.ts index 025a4ec0fbf..48b3f9dc03c 100644 --- a/internal-packages/compute/src/gatewayClient.ts +++ b/internal-packages/compute/src/gatewayClient.ts @@ -12,7 +12,7 @@ export class ComputeGatewayClient { async createTemplate( req: TemplateCreateRequest, options?: { signal?: AbortSignal } - ): Promise<{ accepted: boolean; templateId?: string }> { + ): Promise<{ accepted: boolean }> { const url = `${this.opts.gatewayUrl}/api/templates`; const headers: Record = { @@ -36,11 +36,6 @@ export class ComputeGatewayClient { throw new Error(`Gateway template creation failed (${response.status}): ${errorBody}`); } - if (response.status === 202) { - return { accepted: true }; - } - - const result = await response.json(); - return { accepted: false, templateId: result.template_id }; + return { accepted: response.status === 202 }; } } From 1834da4ef32903e69ef1b70e4ec794958c3b324e Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 26 Mar 2026 23:26:06 +0000 Subject: [PATCH 28/40] fix(supervisor): address review feedback on compute workload manager --- apps/supervisor/src/services/timerWheel.ts | 4 ++-- apps/supervisor/src/workloadServer/index.ts | 19 +++++++++++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/apps/supervisor/src/services/timerWheel.ts b/apps/supervisor/src/services/timerWheel.ts index 4a95e216b25..9584423824d 100644 --- a/apps/supervisor/src/services/timerWheel.ts +++ b/apps/supervisor/src/services/timerWheel.ts @@ -46,7 +46,7 @@ export 
class TimerWheel { this.cursor = 0; this.intervalId = null; this.onExpire = opts.onExpire; - this.delaySlots = Math.max(1, Math.min(NUM_SLOTS, Math.round(opts.delayMs / TICK_MS))); + this.delaySlots = Math.max(1, Math.min(NUM_SLOTS, Math.ceil(opts.delayMs / TICK_MS))); } /** Start the timer wheel. Must be called before submitting items. */ @@ -88,7 +88,7 @@ export class TimerWheel { * Clamped to [TICK_MS, 60000ms]. */ setDelay(delayMs: number): void { - this.delaySlots = Math.max(1, Math.min(NUM_SLOTS, Math.round(delayMs / TICK_MS))); + this.delaySlots = Math.max(1, Math.min(NUM_SLOTS, Math.ceil(delayMs / TICK_MS))); } /** diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index 10e85b628b2..9d2e25b3cb4 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -129,7 +129,15 @@ export class WorkloadServer extends EventEmitter { if (this.computeManager && env.COMPUTE_SNAPSHOTS_ENABLED) { this.snapshotDelayWheel = new TimerWheel({ delayMs: env.COMPUTE_SNAPSHOT_DELAY_MS, - onExpire: (item) => this.dispatchComputeSnapshot(item.data), + onExpire: (item) => { + this.dispatchComputeSnapshot(item.data).catch((error) => { + this.logger.error("Compute snapshot dispatch failed", { + runId: item.data.runFriendlyId, + runnerId: item.data.runnerId, + error, + }); + }); + }, }); this.snapshotDelayWheel.start(); } @@ -296,7 +304,14 @@ export class WorkloadServer extends EventEmitter { this.logger.error( "TRIGGER_WORKLOAD_API_DOMAIN is not set, cannot create snapshot callback URL" ); - reply.json({ error: "Snapshot callbacks not configured" }, false, 500); + reply.json( + { + ok: false, + error: "Snapshot callbacks not configured", + } satisfies WorkloadSuspendRunResponseBody, + false, + 500 + ); return; } From 76e5715bfbae767723ccf20dfead38efcb5978a4 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 26 Mar 2026 23:30:54 +0000 Subject: 
[PATCH 29/40] refactor: convert span-timeline-events from always-loaded rule to on-demand skill --- .../span-timeline-events/SKILL.md} | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) rename .claude/{rules/span-timeline-events.md => skills/span-timeline-events/SKILL.md} (86%) diff --git a/.claude/rules/span-timeline-events.md b/.claude/skills/span-timeline-events/SKILL.md similarity index 86% rename from .claude/rules/span-timeline-events.md rename to .claude/skills/span-timeline-events/SKILL.md index f4e36717780..122f49912d7 100644 --- a/.claude/rules/span-timeline-events.md +++ b/.claude/skills/span-timeline-events/SKILL.md @@ -1,3 +1,9 @@ +--- +name: span-timeline-events +description: Use when adding, modifying, or debugging OTel span timeline events in the trace view. Covers event structure, ClickHouse storage constraints, rendering in SpanTimeline component, admin visibility, and the step-by-step process for adding new events. +allowed-tools: Read, Write, Edit, Glob, Grep, Bash +--- + # Span Timeline Events The trace view's right panel shows a timeline of events for the selected span. These are OTel span events rendered by `app/utils/timelineSpanEvents.ts` and the `SpanTimeline` component. @@ -22,7 +28,7 @@ The `SpanTimeline` component in `app/components/run/RunTimeline.tsx` renders: 3. **Duration bar** (thick 7px line) - from "Started" to "Finished" 4. **"Finished"** marker (thick cap) - at `startTime + duration` -The thin line before "Started" only appears when there are events with timestamps between the span start and the first child span. For the Attempt span this works well (Dequeued → Pod scheduled → Launched → etc. all happen before execution starts). Events all get `lineVariant: "light"` (thin) while the execution bar gets `variant: "normal"` (thick). +The thin line before "Started" only appears when there are events with timestamps between the span start and the first child span. 
For the Attempt span this works well (Dequeued -> Pod scheduled -> Launched -> etc. all happen before execution starts). Events all get `lineVariant: "light"` (thin) while the execution bar gets `variant: "normal"` (thick). ## Trace View Sort Order From 6e32bb74824cae399280e175f7b04ac3e77e4bb3 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 26 Mar 2026 23:44:49 +0000 Subject: [PATCH 30/40] feat(webapp): add hasComputeAccess feature flag for private beta org opt-in --- apps/webapp/app/v3/featureFlags.server.ts | 2 ++ .../v3/services/computeTemplateCreation.server.ts | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/apps/webapp/app/v3/featureFlags.server.ts b/apps/webapp/app/v3/featureFlags.server.ts index f32f34c64b8..8a2e879395d 100644 --- a/apps/webapp/app/v3/featureFlags.server.ts +++ b/apps/webapp/app/v3/featureFlags.server.ts @@ -9,6 +9,7 @@ export const FEATURE_FLAG = { hasLogsPageAccess: "hasLogsPageAccess", hasAiAccess: "hasAiAccess", hasAiModelsAccess: "hasAiModelsAccess", + hasComputeAccess: "hasComputeAccess", } as const; const FeatureFlagCatalog = { @@ -19,6 +20,7 @@ const FeatureFlagCatalog = { [FEATURE_FLAG.hasLogsPageAccess]: z.coerce.boolean(), [FEATURE_FLAG.hasAiAccess]: z.coerce.boolean(), [FEATURE_FLAG.hasAiModelsAccess]: z.coerce.boolean(), + [FEATURE_FLAG.hasComputeAccess]: z.coerce.boolean(), }; type FeatureFlagKey = keyof typeof FeatureFlagCatalog; diff --git a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts index f01e27fc956..81a8df984b0 100644 --- a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts +++ b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts @@ -2,6 +2,7 @@ import { ComputeGatewayClient } from "@internal/compute"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; import type { PrismaClientOrTransaction } from 
"~/db.server"; +import { FEATURE_FLAG, makeFlag } from "~/v3/featureFlags.server"; type TemplateCreationMode = "required" | "shadow" | "skip"; @@ -28,6 +29,9 @@ export class ComputeTemplateCreationService { defaultWorkerGroup: { select: { workloadType: true }, }, + organization: { + select: { featureFlags: true }, + }, }, }); @@ -35,7 +39,16 @@ export class ComputeTemplateCreationService { return "required"; } - // TODO: check private beta feature flag for org + const flag = makeFlag(prisma); + const hasComputeAccess = await flag({ + key: FEATURE_FLAG.hasComputeAccess, + defaultValue: false, + overrides: (project?.organization?.featureFlags as Record) ?? {}, + }); + + if (hasComputeAccess) { + return "required"; + } const rolloutPct = Number(env.COMPUTE_TEMPLATE_SHADOW_ROLLOUT_PCT ?? "0"); if (rolloutPct > 0 && Math.random() * 100 < rolloutPct) { From 64b08b9591911fa013eda77ec366f95ed929e4f6 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 27 Mar 2026 00:00:53 +0000 Subject: [PATCH 31/40] fix: skip resolveMode early when compute gateway is not configured --- apps/webapp/app/v3/services/computeTemplateCreation.server.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts index 81a8df984b0..072d01c819b 100644 --- a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts +++ b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts @@ -23,6 +23,10 @@ export class ComputeTemplateCreationService { projectId: string, prisma: PrismaClientOrTransaction ): Promise { + if (!this.client) { + return "skip"; + } + const project = await prisma.project.findFirst({ where: { id: projectId }, select: { From 6809f07706554217f1b5fc1e4d1646b460363d0b Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 27 Mar 2026 00:08:16 +0000 Subject: [PATCH 32/40] fix: shadow 
mode uses fire-and-forget HTTP to avoid holding connections --- .../computeTemplateCreation.server.ts | 16 ++++++++ .../v3/services/finalizeDeployment.server.ts | 10 ++--- .../services/finalizeDeploymentV2.server.ts | 10 ++--- .../compute/src/gatewayClient.ts | 39 +++++++++++++------ 4 files changed, 52 insertions(+), 23 deletions(-) diff --git a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts index 072d01c819b..ab0bda38e4a 100644 --- a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts +++ b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts @@ -83,4 +83,20 @@ export class ComputeTemplateCreationService { return { success: false, error: message }; } } + + /** + * Fire-and-forget template creation. No HTTP connection held open, + * no response awaited. Used for shadow rollout mode. + */ + createTemplateBackground(imageReference: string): void { + if (!this.client) { + return; + } + + this.client.createTemplateBackground({ + image: imageReference, + cpu: 0.5, + memory_mb: 512, + }); + } } diff --git a/apps/webapp/app/v3/services/finalizeDeployment.server.ts b/apps/webapp/app/v3/services/finalizeDeployment.server.ts index 55a5907f8c1..6a754841693 100644 --- a/apps/webapp/app/v3/services/finalizeDeployment.server.ts +++ b/apps/webapp/app/v3/services/finalizeDeployment.server.ts @@ -191,13 +191,11 @@ export class FinalizeDeploymentService extends BaseService { // Shadow mode: fire-and-forget template creation after deploy is finalized if (templateMode === "shadow" && deployment.imageReference) { - templateService.createTemplate(deployment.imageReference).catch((error) => { - logger.error("Shadow compute template creation failed", { - id, - imageReference: deployment.imageReference, - error: error instanceof Error ? 
error.message : String(error), - }); + logger.debug("Shadow compute template creation (background)", { + id, + imageReference: deployment.imageReference, }); + templateService.createTemplateBackground(deployment.imageReference); } return finalizedDeployment; diff --git a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts index 9d4200c891b..bb2ab98c234 100644 --- a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts +++ b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts @@ -241,13 +241,11 @@ export class FinalizeDeploymentV2Service extends BaseService { imageReference: string, deploymentFriendlyId: string ) { - templateService.createTemplate(imageReference).catch((error) => { - logger.error("Shadow compute template creation failed", { - id: deploymentFriendlyId, - imageReference, - error: error instanceof Error ? error.message : String(error), - }); + logger.debug("Shadow compute template creation (background)", { + id: deploymentFriendlyId, + imageReference, }); + templateService.createTemplateBackground(imageReference); } } diff --git a/internal-packages/compute/src/gatewayClient.ts b/internal-packages/compute/src/gatewayClient.ts index 48b3f9dc03c..17310d44e40 100644 --- a/internal-packages/compute/src/gatewayClient.ts +++ b/internal-packages/compute/src/gatewayClient.ts @@ -13,6 +13,32 @@ export class ComputeGatewayClient { req: TemplateCreateRequest, options?: { signal?: AbortSignal } ): Promise<{ accepted: boolean }> { + const response = await this.#fetch(req, options?.signal); + + if (!response.ok) { + const errorBody = await response.text().catch(() => "unknown error"); + throw new Error(`Gateway template creation failed (${response.status}): ${errorBody}`); + } + + return { accepted: response.status === 202 }; + } + + /** + * Fire-and-forget template creation. Sends the request but does not + * await the response, so no HTTP connection is held open. 
+ */ + createTemplateBackground(req: TemplateCreateRequest): void { + this.#fetch(req).then( + (response) => { + if (!response.ok) { + response.text().catch(() => {}); + } + }, + () => {} // swallow network errors + ); + } + + #fetch(req: TemplateCreateRequest, signal?: AbortSignal): Promise { const url = `${this.opts.gatewayUrl}/api/templates`; const headers: Record = { @@ -22,20 +48,11 @@ export class ComputeGatewayClient { headers["Authorization"] = `Bearer ${this.opts.authToken}`; } - const signal = options?.signal ?? AbortSignal.timeout(this.opts.timeoutMs); - - const response = await fetch(url, { + return fetch(url, { method: "POST", headers, body: JSON.stringify(req), - signal, + signal: signal ?? AbortSignal.timeout(this.opts.timeoutMs), }); - - if (!response.ok) { - const errorBody = await response.text().catch(() => "unknown error"); - throw new Error(`Gateway template creation failed (${response.status}): ${errorBody}`); - } - - return { accepted: response.status === 202 }; } } From 5d24dd18d2aa89dfbef108917b00937054e42f75 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 27 Mar 2026 00:32:05 +0000 Subject: [PATCH 33/40] feat(webapp): use background flag for shadow mode template creation --- .../computeTemplateCreation.server.ts | 22 +++++-------------- .../v3/services/finalizeDeployment.server.ts | 10 +++++---- .../services/finalizeDeploymentV2.server.ts | 10 +++++---- internal-packages/compute/src/types.ts | 1 + 4 files changed, 18 insertions(+), 25 deletions(-) diff --git a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts index ab0bda38e4a..da6b26a5398 100644 --- a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts +++ b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts @@ -62,7 +62,10 @@ export class ComputeTemplateCreationService { return "skip"; } - async createTemplate(imageReference: string): Promise<{ 
success: boolean; error?: string }> { + async createTemplate( + imageReference: string, + options?: { background?: boolean } + ): Promise<{ success: boolean; error?: string }> { if (!this.client) { return { success: false, error: "Compute gateway not configured" }; } @@ -72,6 +75,7 @@ export class ComputeTemplateCreationService { image: imageReference, cpu: 0.5, memory_mb: 512, + background: options?.background, }); return { success: true }; } catch (error) { @@ -83,20 +87,4 @@ export class ComputeTemplateCreationService { return { success: false, error: message }; } } - - /** - * Fire-and-forget template creation. No HTTP connection held open, - * no response awaited. Used for shadow rollout mode. - */ - createTemplateBackground(imageReference: string): void { - if (!this.client) { - return; - } - - this.client.createTemplateBackground({ - image: imageReference, - cpu: 0.5, - memory_mb: 512, - }); - } } diff --git a/apps/webapp/app/v3/services/finalizeDeployment.server.ts b/apps/webapp/app/v3/services/finalizeDeployment.server.ts index 6a754841693..0beb283cdf0 100644 --- a/apps/webapp/app/v3/services/finalizeDeployment.server.ts +++ b/apps/webapp/app/v3/services/finalizeDeployment.server.ts @@ -191,11 +191,13 @@ export class FinalizeDeploymentService extends BaseService { // Shadow mode: fire-and-forget template creation after deploy is finalized if (templateMode === "shadow" && deployment.imageReference) { - logger.debug("Shadow compute template creation (background)", { - id, - imageReference: deployment.imageReference, + templateService.createTemplate(deployment.imageReference, { background: true }).catch((error) => { + logger.error("Shadow compute template creation failed", { + id, + imageReference: deployment.imageReference, + error: error instanceof Error ? 
error.message : String(error), + }); }); - templateService.createTemplateBackground(deployment.imageReference); } return finalizedDeployment; diff --git a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts index bb2ab98c234..c1e3260ab2b 100644 --- a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts +++ b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts @@ -241,11 +241,13 @@ export class FinalizeDeploymentV2Service extends BaseService { imageReference: string, deploymentFriendlyId: string ) { - logger.debug("Shadow compute template creation (background)", { - id: deploymentFriendlyId, - imageReference, + templateService.createTemplate(imageReference, { background: true }).catch((error) => { + logger.error("Shadow compute template creation failed", { + id: deploymentFriendlyId, + imageReference, + error: error instanceof Error ? error.message : String(error), + }); }); - templateService.createTemplateBackground(imageReference); } } diff --git a/internal-packages/compute/src/types.ts b/internal-packages/compute/src/types.ts index b45bddfdaef..a84bb829e46 100644 --- a/internal-packages/compute/src/types.ts +++ b/internal-packages/compute/src/types.ts @@ -4,6 +4,7 @@ export const TemplateCreateRequestSchema = z.object({ image: z.string(), cpu: z.number(), memory_mb: z.number(), + background: z.boolean().optional(), callback: z .object({ url: z.string(), From 4cbbadb83631b01029d39d57a8157b56837b7154 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 27 Mar 2026 01:06:29 +0000 Subject: [PATCH 34/40] refactor: consolidate template creation logic into ComputeTemplateCreationService.handleDeployTemplate --- .../computeTemplateCreation.server.ts | 77 ++++++++++ .../v3/services/finalizeDeployment.server.ts | 53 ------- .../services/finalizeDeploymentV2.server.ts | 133 ++++-------------- 3 files changed, 102 insertions(+), 161 deletions(-) diff 
--git a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts index da6b26a5398..a18ed11e705 100644 --- a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts +++ b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts @@ -3,6 +3,9 @@ import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; import type { PrismaClientOrTransaction } from "~/db.server"; import { FEATURE_FLAG, makeFlag } from "~/v3/featureFlags.server"; +import type { AuthenticatedEnvironment } from "~/services/apiAuth.server"; +import { ServiceValidationError } from "./baseService.server"; +import { FailDeploymentService } from "./failDeployment.server"; type TemplateCreationMode = "required" | "shadow" | "skip"; @@ -19,6 +22,80 @@ export class ComputeTemplateCreationService { } } + /** + * Handle template creation for a deployment. Call this before setting DEPLOYED. + * + * - Required mode: creates template synchronously, fails deployment on error + * - Shadow mode: fires background template creation (returns immediately) + * - Skip: no-op + * + * Throws ServiceValidationError if required mode fails (caller should stop finalize). + */ + async handleDeployTemplate(options: { + projectId: string; + imageReference: string; + deploymentFriendlyId: string; + authenticatedEnv: AuthenticatedEnvironment; + prisma: PrismaClientOrTransaction; + writer?: WritableStreamDefaultWriter; + }): Promise { + const mode = await this.resolveMode(options.projectId, options.prisma); + + if (mode === "skip") { + return; + } + + if (mode === "shadow") { + this.createTemplate(options.imageReference, { background: true }).catch((error) => { + logger.error("Shadow compute template creation failed", { + id: options.deploymentFriendlyId, + imageReference: options.imageReference, + error: error instanceof Error ? 
error.message : String(error), + }); + }); + return; + } + + // Required mode + if (options.writer) { + await options.writer.write( + `event: log\ndata: ${JSON.stringify({ message: "Building compute template..." })}\n\n` + ); + } + + logger.info("Creating compute template (required mode)", { + id: options.deploymentFriendlyId, + imageReference: options.imageReference, + }); + + const result = await this.createTemplate(options.imageReference); + + if (!result.success) { + logger.error("Compute template creation failed", { + id: options.deploymentFriendlyId, + imageReference: options.imageReference, + error: result.error, + }); + + const failService = new FailDeploymentService(); + await failService.call(options.authenticatedEnv, options.deploymentFriendlyId, { + error: { + name: "TemplateCreationFailed", + message: `Failed to create compute template: ${result.error}`, + }, + }); + + throw new ServiceValidationError( + `Compute template creation failed: ${result.error}` + ); + } + + logger.info("Compute template created", { + id: options.deploymentFriendlyId, + imageReference: options.imageReference, + }); + } + async resolveMode( projectId: string, prisma: PrismaClientOrTransaction diff --git a/apps/webapp/app/v3/services/finalizeDeployment.server.ts b/apps/webapp/app/v3/services/finalizeDeployment.server.ts index 0beb283cdf0..6cbfc323e7a 100644 --- a/apps/webapp/app/v3/services/finalizeDeployment.server.ts +++ b/apps/webapp/app/v3/services/finalizeDeployment.server.ts @@ -12,7 +12,6 @@ import { TimeoutDeploymentService } from "./timeoutDeployment.server"; import { DeploymentService } from "./deployment.server"; import { engine } from "../runEngine.server"; import { tryCatch } from "@trigger.dev/core"; -import { ComputeTemplateCreationService } from "./computeTemplateCreation.server"; export class FinalizeDeploymentService extends BaseService { public async call( @@ -66,47 +65,6 @@ export class FinalizeDeploymentService extends BaseService { const imageDigest = 
validatedImageDigest(body.imageDigest); - // Compute template creation (before setting DEPLOYED) - const templateService = new ComputeTemplateCreationService(); - const templateMode = await templateService.resolveMode( - authenticatedEnv.projectId, - this._prisma - ); - - if (templateMode === "required" && deployment.imageReference) { - logger.info("Creating compute template (required mode)", { - id, - imageReference: deployment.imageReference, - }); - - const templateResult = await templateService.createTemplate(deployment.imageReference); - - if (!templateResult.success) { - logger.error("Compute template creation failed", { - id, - imageReference: deployment.imageReference, - error: templateResult.error, - }); - - const failService = new FailDeploymentService(); - await failService.call(authenticatedEnv, deployment.friendlyId, { - error: { - name: "TemplateCreationFailed", - message: `Failed to create compute template: ${templateResult.error}`, - }, - }); - - throw new ServiceValidationError( - `Compute template creation failed: ${templateResult.error}` - ); - } - - logger.info("Compute template created", { - id, - imageReference: deployment.imageReference, - }); - } - // Link the deployment with the background worker const finalizedDeployment = await this._prisma.workerDeployment.update({ where: { @@ -189,17 +147,6 @@ export class FinalizeDeploymentService extends BaseService { await PerformDeploymentAlertsService.enqueue(deployment.id); - // Shadow mode: fire-and-forget template creation after deploy is finalized - if (templateMode === "shadow" && deployment.imageReference) { - templateService.createTemplate(deployment.imageReference, { background: true }).catch((error) => { - logger.error("Shadow compute template creation failed", { - id, - imageReference: deployment.imageReference, - error: error instanceof Error ? 
error.message : String(error), - }); - }); - } - return finalizedDeployment; } } diff --git a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts index c1e3260ab2b..7ca8a379a3d 100644 --- a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts +++ b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts @@ -11,7 +11,6 @@ import { mkdtemp, writeFile } from "node:fs/promises"; import { env } from "~/env.server"; import { depot as execDepot } from "@depot/cli"; import { FinalizeDeploymentService } from "./finalizeDeployment.server"; -import { FailDeploymentService } from "./failDeployment.server"; import { remoteBuildsEnabled } from "../remoteImageBuilder.server"; import { getEcrAuthToken, isEcrRegistry } from "../getDeploymentImageRef.server"; import { tryCatch } from "@trigger.dev/core"; @@ -25,12 +24,6 @@ export class FinalizeDeploymentV2Service extends BaseService { body: FinalizeDeploymentRequestBody, writer?: WritableStreamDefaultWriter ) { - // If remote builds are not enabled, lets just use the v1 finalize deployment service - if (!remoteBuildsEnabled()) { - const finalizeService = new FinalizeDeploymentService(); - return finalizeService.call(authenticatedEnv, id, body); - } - const deployment = await this._prisma.workerDeployment.findFirst({ where: { friendlyId: id, @@ -64,7 +57,6 @@ export class FinalizeDeploymentV2Service extends BaseService { if (deployment.status === "DEPLOYED") { logger.debug("Worker deployment is already deployed", { id }); - return deployment; } @@ -74,32 +66,17 @@ export class FinalizeDeploymentV2Service extends BaseService { } const finalizeService = new FinalizeDeploymentService(); - const templateService = new ComputeTemplateCreationService(); - - if (body.skipPushToRegistry) { - logger.debug("Skipping push to registry during deployment finalization", { - deployment, - }); - let templateMode: "required" | "shadow" | "skip" = "skip"; - if 
(deployment.imageReference) { - templateMode = await this.#handleTemplateCreation({ - templateService, - projectId: deployment.worker.project.id, - imageReference: deployment.imageReference, - deploymentFriendlyId: id, - authenticatedEnv, - writer, + // If remote builds are not enabled, skip image push and go straight to template + finalize + if (!remoteBuildsEnabled() || body.skipPushToRegistry) { + if (body.skipPushToRegistry) { + logger.debug("Skipping push to registry during deployment finalization", { + deployment, }); } - const result = await finalizeService.call(authenticatedEnv, id, body); - - if (templateMode === "shadow" && deployment.imageReference) { - this.#fireShadowTemplateCreation(templateService, deployment.imageReference, id); - } - - return result; + await this.#createTemplateIfNeeded(deployment, id, authenticatedEnv, writer); + return finalizeService.call(authenticatedEnv, id, body); } const externalBuildData = deployment.externalBuildData @@ -165,88 +142,28 @@ export class FinalizeDeploymentV2Service extends BaseService { pushedImage: pushResult.image, }); - const templateMode = await this.#handleTemplateCreation({ - templateService, - projectId: deployment.worker.project.id, - imageReference: deployment.imageReference, - deploymentFriendlyId: id, - authenticatedEnv, - writer, - }); - - const finalizedDeployment = await finalizeService.call(authenticatedEnv, id, body); - - // Shadow mode: fire-and-forget template creation after deploy is finalized - if (templateMode === "shadow") { - this.#fireShadowTemplateCreation(templateService, deployment.imageReference, id); - } - - return finalizedDeployment; + await this.#createTemplateIfNeeded(deployment, id, authenticatedEnv, writer); + return finalizeService.call(authenticatedEnv, id, body); } - async #handleTemplateCreation(options: { - templateService: ComputeTemplateCreationService; - projectId: string; - imageReference: string; - deploymentFriendlyId: string; - authenticatedEnv: 
AuthenticatedEnvironment; - writer?: WritableStreamDefaultWriter; - }): Promise<"required" | "shadow" | "skip"> { - const { templateService, projectId, imageReference, deploymentFriendlyId, authenticatedEnv, writer } = options; - - const mode = await templateService.resolveMode(projectId, this._prisma); - - if (mode !== "required") { - return mode; - } - - if (writer) { - await writer.write( - `event: log\ndata: ${JSON.stringify({ message: "Building compute template..." })}\n\n` - ); - } - - const templateResult = await templateService.createTemplate(imageReference); - - if (!templateResult.success) { - logger.error("Compute template creation failed", { - id: deploymentFriendlyId, - imageReference, - error: templateResult.error, - }); - - const failService = new FailDeploymentService(); - await failService.call(authenticatedEnv, deploymentFriendlyId, { - error: { - name: "TemplateCreationFailed", - message: `Failed to create compute template: ${templateResult.error}`, - }, - }); - - throw new ServiceValidationError( - `Compute template creation failed: ${templateResult.error}` - ); + async #createTemplateIfNeeded( + deployment: { imageReference: string | null; worker: { project: { id: string } } | null }, + deploymentFriendlyId: string, + authenticatedEnv: AuthenticatedEnvironment, + writer?: WritableStreamDefaultWriter + ): Promise { + if (!deployment.imageReference || !deployment.worker) { + return; } - logger.debug("Compute template created", { - id: deploymentFriendlyId, - imageReference, - }); - - return mode; - } - - #fireShadowTemplateCreation( - templateService: ComputeTemplateCreationService, - imageReference: string, - deploymentFriendlyId: string - ) { - templateService.createTemplate(imageReference, { background: true }).catch((error) => { - logger.error("Shadow compute template creation failed", { - id: deploymentFriendlyId, - imageReference, - error: error instanceof Error ? 
error.message : String(error), - }); + const templateService = new ComputeTemplateCreationService(); + await templateService.handleDeployTemplate({ + projectId: deployment.worker.project.id, + imageReference: deployment.imageReference, + deploymentFriendlyId, + authenticatedEnv, + prisma: this._prisma, + writer, }); } } From eef782f882cbc51baf9729b2d8b8bac1606ef67f Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 27 Mar 2026 10:38:21 +0000 Subject: [PATCH 35/40] fix: use .then() instead of .catch() for shadow mode error logging --- .../computeTemplateCreation.server.ts | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts index a18ed11e705..4562c4ae16f 100644 --- a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts +++ b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts @@ -46,13 +46,23 @@ export class ComputeTemplateCreationService { } if (mode === "shadow") { - this.createTemplate(options.imageReference, { background: true }).catch((error) => { - logger.error("Shadow compute template creation failed", { - id: options.deploymentFriendlyId, - imageReference: options.imageReference, - error: error instanceof Error ? error.message : String(error), + this.createTemplate(options.imageReference, { background: true }) + .then((result) => { + if (!result.success) { + logger.error("Shadow template creation failed", { + id: options.deploymentFriendlyId, + imageReference: options.imageReference, + error: result.error, + }); + } + }) + .catch((error) => { + logger.error("Shadow template creation threw unexpectedly", { + id: options.deploymentFriendlyId, + imageReference: options.imageReference, + error: error instanceof Error ? 
error.message : String(error), + }); }); - }); return; } From 2219d11f4f902ab9679681cdc25c726bb31acb5b Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 27 Mar 2026 11:16:55 +0000 Subject: [PATCH 36/40] fix: update consumerPool test assertion for optional timing parameter --- .../core/src/v3/runEngineWorker/supervisor/consumerPool.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/core/src/v3/runEngineWorker/supervisor/consumerPool.test.ts b/packages/core/src/v3/runEngineWorker/supervisor/consumerPool.test.ts index 5f515b95b79..6093790b012 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/consumerPool.test.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/consumerPool.test.ts @@ -498,7 +498,7 @@ describe("RunQueueConsumerPool", () => { await testConsumers[0].onDequeue(messages); } - expect(mockOnDequeue).toHaveBeenCalledWith(messages); + expect(mockOnDequeue).toHaveBeenCalledWith(messages, undefined); advanceTimeAndProcessMetrics(1100); const metrics = pool.getMetrics(); From b7fa4207e564a34f3edc7d08eb86ca0650f0bb5a Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 27 Mar 2026 11:17:10 +0000 Subject: [PATCH 37/40] refactor: consolidate compute gateway clients into shared @internal/compute package --- apps/supervisor/package.json | 1 + .../supervisor/src/workloadManager/compute.ts | 174 +++++------------- .../computeTemplateCreation.server.ts | 8 +- internal-packages/compute/src/client.ts | 156 ++++++++++++++++ .../compute/src/gatewayClient.ts | 58 ------ internal-packages/compute/src/index.ts | 13 +- internal-packages/compute/src/types.ts | 34 ++++ pnpm-lock.yaml | 3 + 8 files changed, 253 insertions(+), 194 deletions(-) create mode 100644 internal-packages/compute/src/client.ts delete mode 100644 internal-packages/compute/src/gatewayClient.ts diff --git a/apps/supervisor/package.json b/apps/supervisor/package.json index 
e9609bf1541..092d9dcf604 100644 --- a/apps/supervisor/package.json +++ b/apps/supervisor/package.json @@ -15,6 +15,7 @@ "dependencies": { "@aws-sdk/client-ecr": "^3.839.0", "@kubernetes/client-node": "^1.0.0", + "@internal/compute": "workspace:*", "@trigger.dev/core": "workspace:*", "dockerode": "^4.0.6", "prom-client": "^15.1.0", diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 892f3ecbc08..8903681f67f 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -6,6 +6,7 @@ import { type WorkloadManagerCreateOptions, type WorkloadManagerOptions, } from "./types.js"; +import { ComputeClient } from "@internal/compute"; import { env } from "../env.js"; import { getRunnerId } from "../util.js"; import { buildOtlpTracePayload } from "../otlpPayload.js"; @@ -20,6 +21,7 @@ type ComputeWorkloadManagerOptions = WorkloadManagerOptions & { export class ComputeWorkloadManager implements WorkloadManager { private readonly logger = new SimpleStructuredLogger("compute-workload-manager"); + private readonly compute: ComputeClient; constructor(private opts: ComputeWorkloadManagerOptions) { if (opts.workloadApiDomain) { @@ -27,6 +29,12 @@ export class ComputeWorkloadManager implements WorkloadManager { domain: opts.workloadApiDomain, }); } + + this.compute = new ComputeClient({ + gatewayUrl: opts.gatewayUrl, + authToken: opts.gatewayAuthToken, + timeoutMs: opts.gatewayTimeoutMs, + }); } async create(opts: WorkloadManagerCreateOptions) { @@ -73,19 +81,9 @@ export class ComputeWorkloadManager implements WorkloadManager { Object.assign(envVars, this.opts.additionalEnvVars); } - const headers: Record = { - "Content-Type": "application/json", - }; - - if (this.opts.gatewayAuthToken) { - headers["Authorization"] = `Bearer ${this.opts.gatewayAuthToken}`; - } - - // Strip image digest — resolve by tag, not digest + // Strip image digest - resolve by tag, not digest 
const imageRef = opts.image.split("@")[0]!; - const url = `${this.opts.gatewayUrl}/api/instances`; - // Wide event: single canonical log line emitted in finally const event: Record = { // High-cardinality identifiers @@ -105,58 +103,34 @@ export class ComputeWorkloadManager implements WorkloadManager { warmStartCheckMs: opts.warmStartCheckMs, // Request image: imageRef, - url, }; const startMs = performance.now(); try { - const [fetchError, response] = await tryCatch( - fetch(url, { - method: "POST", - headers, - signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), - body: JSON.stringify({ - name: runnerId, - image: imageRef, - env: envVars, - cpu: opts.machine.cpu, - memory_gb: opts.machine.memory, - metadata: { - runId: opts.runFriendlyId, - envId: opts.envId, - envType: opts.envType, - orgId: opts.orgId, - projectId: opts.projectId, - deploymentVersion: opts.deploymentVersion, - machine: opts.machine.name, - }, - }), + const [error, data] = await tryCatch( + this.compute.instances.create({ + name: runnerId, + image: imageRef, + env: envVars, + cpu: opts.machine.cpu, + memory_gb: opts.machine.memory, + metadata: { + runId: opts.runFriendlyId, + envId: opts.envId, + envType: opts.envType, + orgId: opts.orgId, + projectId: opts.projectId, + deploymentVersion: opts.deploymentVersion, + machine: opts.machine.name, + }, }) ); - if (fetchError) { - event.error = fetchError instanceof Error ? fetchError.message : String(fetchError); + if (error) { + event.error = error instanceof Error ? error.message : String(error); event.errorType = - fetchError instanceof DOMException && fetchError.name === "TimeoutError" - ? "timeout" - : "fetch"; - return; - } - - event.status = response.status; - - if (!response.ok) { - const [bodyError, body] = await tryCatch(response.text()); - event.responseBody = bodyError ? undefined : body; - return; - } - - const [parseError, data] = await tryCatch(response.json()); - - if (parseError) { - event.error = parseError instanceof Error ? 
parseError.message : String(parseError); - event.errorType = "parse"; + error instanceof DOMException && error.name === "TimeoutError" ? "timeout" : "fetch"; return; } @@ -176,34 +150,17 @@ export class ComputeWorkloadManager implements WorkloadManager { } } - private get authHeaders(): Record { - const headers: Record = { - "Content-Type": "application/json", - }; - if (this.opts.gatewayAuthToken) { - headers["Authorization"] = `Bearer ${this.opts.gatewayAuthToken}`; - } - return headers; - } - async snapshot(opts: { runnerId: string; callbackUrl: string; metadata: Record; }): Promise { - const url = `${this.opts.gatewayUrl}/api/instances/${opts.runnerId}/snapshot`; - - const [error, response] = await tryCatch( - fetch(url, { - method: "POST", - headers: this.authHeaders, - signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), - body: JSON.stringify({ - callback: { - url: opts.callbackUrl, - metadata: opts.metadata, - }, - }), + const [error] = await tryCatch( + this.compute.instances.snapshot(opts.runnerId, { + callback: { + url: opts.callbackUrl, + metadata: opts.metadata, + }, }) ); @@ -215,28 +172,12 @@ export class ComputeWorkloadManager implements WorkloadManager { return false; } - if (response.status !== 202) { - this.logger.error("snapshot request rejected", { - runnerId: opts.runnerId, - status: response.status, - }); - return false; - } - this.logger.debug("snapshot request accepted", { runnerId: opts.runnerId }); return true; } async deleteInstance(runnerId: string): Promise { - const url = `${this.opts.gatewayUrl}/api/instances/${runnerId}`; - - const [error, response] = await tryCatch( - fetch(url, { - method: "DELETE", - headers: this.authHeaders, - signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), - }) - ); + const [error] = await tryCatch(this.compute.instances.delete(runnerId)); if (error) { this.logger.error("delete instance failed", { @@ -246,14 +187,6 @@ export class ComputeWorkloadManager implements WorkloadManager { return false; } 
- if (!response.ok) { - this.logger.error("delete instance rejected", { - runnerId, - status: response.status, - }); - return false; - } - this.logger.debug("delete instance success", { runnerId }); return true; } @@ -329,8 +262,6 @@ export class ComputeWorkloadManager implements WorkloadManager { projectId?: string; dequeuedAt?: Date; }): Promise { - const url = `${this.opts.gatewayUrl}/api/snapshots/${opts.snapshotId}/restore`; - const metadata: Record = { TRIGGER_RUNNER_ID: opts.runnerId, TRIGGER_RUN_ID: opts.runFriendlyId, @@ -341,23 +272,19 @@ export class ComputeWorkloadManager implements WorkloadManager { TRIGGER_WORKER_INSTANCE_NAME: env.TRIGGER_WORKER_INSTANCE_NAME, }; - const body = { - name: opts.runnerId, - metadata, - cpu: opts.machine.cpu, - memory_mb: opts.machine.memory * 1024, - }; - - this.logger.verbose("restore request body", { url, body }); + this.logger.verbose("restore request body", { + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, + }); const startMs = performance.now(); - const [error, response] = await tryCatch( - fetch(url, { - method: "POST", - headers: this.authHeaders, - signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs), - body: JSON.stringify(body), + const [error] = await tryCatch( + this.compute.snapshots.restore(opts.snapshotId, { + name: opts.runnerId, + metadata, + cpu: opts.machine.cpu, + memory_mb: opts.machine.memory * 1024, }) ); @@ -373,16 +300,6 @@ export class ComputeWorkloadManager implements WorkloadManager { return false; } - if (!response.ok) { - this.logger.error("restore request rejected", { - snapshotId: opts.snapshotId, - runnerId: opts.runnerId, - status: response.status, - durationMs, - }); - return false; - } - this.logger.debug("restore request success", { snapshotId: opts.snapshotId, runnerId: opts.runnerId, @@ -448,4 +365,3 @@ export class ComputeWorkloadManager implements WorkloadManager { sendOtlpTrace(payload); } } - diff --git 
a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts index 4562c4ae16f..65d704c5f5c 100644 --- a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts +++ b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts @@ -1,4 +1,4 @@ -import { ComputeGatewayClient } from "@internal/compute"; +import { ComputeClient } from "@internal/compute"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; import type { PrismaClientOrTransaction } from "~/db.server"; @@ -10,11 +10,11 @@ import { FailDeploymentService } from "./failDeployment.server"; type TemplateCreationMode = "required" | "shadow" | "skip"; export class ComputeTemplateCreationService { - private client: ComputeGatewayClient | undefined; + private client: ComputeClient | undefined; constructor() { if (env.COMPUTE_GATEWAY_URL) { - this.client = new ComputeGatewayClient({ + this.client = new ComputeClient({ gatewayUrl: env.COMPUTE_GATEWAY_URL, authToken: env.COMPUTE_GATEWAY_AUTH_TOKEN, timeoutMs: 5 * 60 * 1000, // 5 minutes @@ -158,7 +158,7 @@ export class ComputeTemplateCreationService { } try { - await this.client.createTemplate({ + await this.client.templates.create({ image: imageReference, cpu: 0.5, memory_mb: 512, diff --git a/internal-packages/compute/src/client.ts b/internal-packages/compute/src/client.ts new file mode 100644 index 00000000000..f4293af7197 --- /dev/null +++ b/internal-packages/compute/src/client.ts @@ -0,0 +1,156 @@ +import type { + TemplateCreateRequest, + InstanceCreateRequest, + InstanceCreateResponse, + InstanceSnapshotRequest, + SnapshotRestoreRequest, +} from "./types.js"; + +export type ComputeClientOptions = { + gatewayUrl: string; + authToken?: string; + timeoutMs: number; +}; + +export class ComputeClient { + readonly templates: TemplatesNamespace; + readonly instances: InstancesNamespace; + readonly snapshots: SnapshotsNamespace; + + constructor(private opts: 
ComputeClientOptions) { + const http = new HttpTransport(opts); + this.templates = new TemplatesNamespace(http); + this.instances = new InstancesNamespace(http); + this.snapshots = new SnapshotsNamespace(http); + } +} + +// ── HTTP transport (shared plumbing) ───────────────────────────────────────── + +type RequestOptions = { + signal?: AbortSignal; +}; + +class HttpTransport { + constructor(private opts: ComputeClientOptions) {} + + private get headers(): Record { + const h: Record = { "Content-Type": "application/json" }; + if (this.opts.authToken) { + h["Authorization"] = `Bearer ${this.opts.authToken}`; + } + return h; + } + + private signal(options?: RequestOptions): AbortSignal { + return options?.signal ?? AbortSignal.timeout(this.opts.timeoutMs); + } + + async post(path: string, body: unknown, options?: RequestOptions): Promise { + const url = `${this.opts.gatewayUrl}${path}`; + + const response = await fetch(url, { + method: "POST", + headers: this.headers, + body: JSON.stringify(body), + signal: this.signal(options), + }); + + if (!response.ok) { + const errorBody = await response.text().catch(() => "unknown error"); + throw new ComputeClientError(response.status, errorBody, url); + } + + // 202 Accepted or 204 No Content - no body to parse + if (response.status === 202 || response.status === 204) { + return undefined as T; + } + + return (await response.json()) as T; + } + + async delete(path: string, options?: RequestOptions): Promise { + const url = `${this.opts.gatewayUrl}${path}`; + + const response = await fetch(url, { + method: "DELETE", + headers: this.headers, + signal: this.signal(options), + }); + + if (!response.ok) { + const errorBody = await response.text().catch(() => "unknown error"); + throw new ComputeClientError(response.status, errorBody, url); + } + } +} + +// ── Error ──────────────────────────────────────────────────────────────────── + +export class ComputeClientError extends Error { + constructor( + public readonly status: 
number, + public readonly body: string, + public readonly url: string + ) { + super(`Compute gateway request failed (${status}): ${body}`); + this.name = "ComputeClientError"; + } +} + +// ── Namespaces ─────────────────────────────────────────────────────────────── + +class TemplatesNamespace { + constructor(private http: HttpTransport) {} + + async create( + req: TemplateCreateRequest, + options?: RequestOptions + ): Promise<{ accepted: boolean }> { + try { + await this.http.post("/api/templates", req, options); + // If we get here without error, the request was accepted (202) or succeeded + return { accepted: true }; + } catch (error) { + if (error instanceof ComputeClientError && error.status === 202) { + return { accepted: true }; + } + throw error; + } + } +} + +class InstancesNamespace { + constructor(private http: HttpTransport) {} + + async create( + req: InstanceCreateRequest, + options?: RequestOptions + ): Promise { + return this.http.post("/api/instances", req, options); + } + + async delete(runnerId: string, options?: RequestOptions): Promise { + return this.http.delete(`/api/instances/${runnerId}`, options); + } + + async snapshot( + runnerId: string, + req: InstanceSnapshotRequest, + options?: RequestOptions + ): Promise { + await this.http.post(`/api/instances/${runnerId}/snapshot`, req, options); + } +} + +class SnapshotsNamespace { + constructor(private http: HttpTransport) {} + + async restore( + snapshotId: string, + req: SnapshotRestoreRequest, + options?: RequestOptions + ): Promise { + await this.http.post(`/api/snapshots/${snapshotId}/restore`, req, options); + } +} diff --git a/internal-packages/compute/src/gatewayClient.ts b/internal-packages/compute/src/gatewayClient.ts deleted file mode 100644 index 17310d44e40..00000000000 --- a/internal-packages/compute/src/gatewayClient.ts +++ /dev/null @@ -1,58 +0,0 @@ -import type { TemplateCreateRequest } from "./types.js"; - -export type ComputeGatewayClientOptions = { - gatewayUrl: string; - 
authToken?: string; - timeoutMs: number; -}; - -export class ComputeGatewayClient { - constructor(private opts: ComputeGatewayClientOptions) {} - - async createTemplate( - req: TemplateCreateRequest, - options?: { signal?: AbortSignal } - ): Promise<{ accepted: boolean }> { - const response = await this.#fetch(req, options?.signal); - - if (!response.ok) { - const errorBody = await response.text().catch(() => "unknown error"); - throw new Error(`Gateway template creation failed (${response.status}): ${errorBody}`); - } - - return { accepted: response.status === 202 }; - } - - /** - * Fire-and-forget template creation. Sends the request but does not - * await the response, so no HTTP connection is held open. - */ - createTemplateBackground(req: TemplateCreateRequest): void { - this.#fetch(req).then( - (response) => { - if (!response.ok) { - response.text().catch(() => {}); - } - }, - () => {} // swallow network errors - ); - } - - #fetch(req: TemplateCreateRequest, signal?: AbortSignal): Promise { - const url = `${this.opts.gatewayUrl}/api/templates`; - - const headers: Record = { - "Content-Type": "application/json", - }; - if (this.opts.authToken) { - headers["Authorization"] = `Bearer ${this.opts.authToken}`; - } - - return fetch(url, { - method: "POST", - headers, - body: JSON.stringify(req), - signal: signal ?? 
AbortSignal.timeout(this.opts.timeoutMs), - }); - } -} diff --git a/internal-packages/compute/src/index.ts b/internal-packages/compute/src/index.ts index 20bb36a7bf6..d47b95af277 100644 --- a/internal-packages/compute/src/index.ts +++ b/internal-packages/compute/src/index.ts @@ -1,7 +1,14 @@ -export { ComputeGatewayClient } from "./gatewayClient.js"; -export type { ComputeGatewayClientOptions } from "./gatewayClient.js"; +export { ComputeClient, ComputeClientError } from "./client.js"; +export type { ComputeClientOptions } from "./client.js"; export { TemplateCreateRequestSchema, TemplateCallbackPayloadSchema, } from "./types.js"; -export type { TemplateCreateRequest, TemplateCallbackPayload } from "./types.js"; +export type { + TemplateCreateRequest, + TemplateCallbackPayload, + InstanceCreateRequest, + InstanceCreateResponse, + InstanceSnapshotRequest, + SnapshotRestoreRequest, +} from "./types.js"; diff --git a/internal-packages/compute/src/types.ts b/internal-packages/compute/src/types.ts index a84bb829e46..0cec64b15c1 100644 --- a/internal-packages/compute/src/types.ts +++ b/internal-packages/compute/src/types.ts @@ -1,5 +1,7 @@ import { z } from "zod"; +// ── Templates ──────────────────────────────────────────────────────────────── + export const TemplateCreateRequestSchema = z.object({ image: z.string(), cpu: z.number(), @@ -23,3 +25,35 @@ export const TemplateCallbackPayloadSchema = z.object({ duration_ms: z.number().optional(), }); export type TemplateCallbackPayload = z.infer; + +// ── Instances ──────────────────────────────────────────────────────────────── + +export type InstanceCreateRequest = { + name: string; + image: string; + env: Record; + cpu: number; + memory_gb: number; + metadata?: Record; +}; + +export type InstanceCreateResponse = { + id: string; + _timing?: unknown; +}; + +export type InstanceSnapshotRequest = { + callback: { + url: string; + metadata: Record; + }; +}; + +// ── Snapshots 
──────────────────────────────────────────────────────────────── + +export type SnapshotRestoreRequest = { + name: string; + metadata: Record; + cpu: number; + memory_mb: number; +}; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 427ef2d9489..1f2b01e5065 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -190,6 +190,9 @@ importers: '@aws-sdk/client-ecr': specifier: ^3.839.0 version: 3.839.0 + '@internal/compute': + specifier: workspace:* + version: link:../../internal-packages/compute '@kubernetes/client-node': specifier: ^1.0.0 version: 1.0.0(patch_hash=ba1a06f46256cdb8d6faf7167246692c0de2e7cd846a9dc0f13be0137e1c3745)(bufferutil@4.0.9)(encoding@0.1.13) From d8e478a415cab027ef37b94c3a58fd9086aa45b1 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 27 Mar 2026 11:48:18 +0000 Subject: [PATCH 38/40] fix: add type-safe post return, strip image digests consistently --- .../supervisor/src/workloadManager/compute.ts | 4 ++-- .../computeTemplateCreation.server.ts | 4 ++-- internal-packages/compute/src/client.ts | 23 ++++++++----------- internal-packages/compute/src/imageRef.ts | 11 +++++++++ internal-packages/compute/src/index.ts | 1 + 5 files changed, 25 insertions(+), 18 deletions(-) create mode 100644 internal-packages/compute/src/imageRef.ts diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 8903681f67f..44695a3766f 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -6,7 +6,7 @@ import { type WorkloadManagerCreateOptions, type WorkloadManagerOptions, } from "./types.js"; -import { ComputeClient } from "@internal/compute"; +import { ComputeClient, stripImageDigest } from "@internal/compute"; import { env } from "../env.js"; import { getRunnerId } from "../util.js"; import { buildOtlpTracePayload } from "../otlpPayload.js"; @@ -82,7 +82,7 @@ export class ComputeWorkloadManager implements 
WorkloadManager { } // Strip image digest - resolve by tag, not digest - const imageRef = opts.image.split("@")[0]!; + const imageRef = stripImageDigest(opts.image); // Wide event: single canonical log line emitted in finally const event: Record = { diff --git a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts index 65d704c5f5c..159ef43fb00 100644 --- a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts +++ b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts @@ -1,4 +1,4 @@ -import { ComputeClient } from "@internal/compute"; +import { ComputeClient, stripImageDigest } from "@internal/compute"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; import type { PrismaClientOrTransaction } from "~/db.server"; @@ -159,7 +159,7 @@ export class ComputeTemplateCreationService { try { await this.client.templates.create({ - image: imageReference, + image: stripImageDigest(imageReference), cpu: 0.5, memory_mb: 512, background: options?.background, diff --git a/internal-packages/compute/src/client.ts b/internal-packages/compute/src/client.ts index f4293af7197..4f627bd2830 100644 --- a/internal-packages/compute/src/client.ts +++ b/internal-packages/compute/src/client.ts @@ -46,7 +46,7 @@ class HttpTransport { return options?.signal ?? 
AbortSignal.timeout(this.opts.timeoutMs); } - async post(path: string, body: unknown, options?: RequestOptions): Promise { + async post(path: string, body: unknown, options?: RequestOptions): Promise { const url = `${this.opts.gatewayUrl}${path}`; const response = await fetch(url, { @@ -63,7 +63,7 @@ class HttpTransport { // 202 Accepted or 204 No Content - no body to parse if (response.status === 202 || response.status === 204) { - return undefined as T; + return undefined; } return (await response.json()) as T; @@ -106,17 +106,8 @@ class TemplatesNamespace { async create( req: TemplateCreateRequest, options?: RequestOptions - ): Promise<{ accepted: boolean }> { - try { - await this.http.post("/api/templates", req, options); - // If we get here without error, the request was accepted (202) or succeeded - return { accepted: true }; - } catch (error) { - if (error instanceof ComputeClientError && error.status === 202) { - return { accepted: true }; - } - throw error; - } + ): Promise { + await this.http.post("/api/templates", req, options); } } @@ -127,7 +118,11 @@ class InstancesNamespace { req: InstanceCreateRequest, options?: RequestOptions ): Promise { - return this.http.post("/api/instances", req, options); + const result = await this.http.post("/api/instances", req, options); + if (!result) { + throw new Error("Compute gateway returned no instance body"); + } + return result; } async delete(runnerId: string, options?: RequestOptions): Promise { diff --git a/internal-packages/compute/src/imageRef.ts b/internal-packages/compute/src/imageRef.ts new file mode 100644 index 00000000000..813f2a6a663 --- /dev/null +++ b/internal-packages/compute/src/imageRef.ts @@ -0,0 +1,11 @@ +/** + * Strip the digest suffix from a container image reference. + * Tags are immutable, so we resolve by tag rather than pinning to a digest. + * + * "ghcr.io/org/image:tag@sha256:abc..." -> "ghcr.io/org/image:tag" + * "ghcr.io/org/image@sha256:abc..." 
-> "ghcr.io/org/image" + * "ghcr.io/org/image:tag" -> "ghcr.io/org/image:tag" (unchanged) + */ +export function stripImageDigest(imageRef: string): string { + return imageRef.split("@")[0] ?? imageRef; +} diff --git a/internal-packages/compute/src/index.ts b/internal-packages/compute/src/index.ts index d47b95af277..f47bc0a65f4 100644 --- a/internal-packages/compute/src/index.ts +++ b/internal-packages/compute/src/index.ts @@ -1,5 +1,6 @@ export { ComputeClient, ComputeClientError } from "./client.js"; export type { ComputeClientOptions } from "./client.js"; +export { stripImageDigest } from "./imageRef.js"; export { TemplateCreateRequestSchema, TemplateCallbackPayloadSchema, From 641d6a31c027d2f0dde30482146ca93e579969fe Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 27 Mar 2026 12:30:40 +0000 Subject: [PATCH 39/40] refactor: convert remaining compute types to zod schemas --- internal-packages/compute/src/index.ts | 4 ++ internal-packages/compute/src/types.ts | 56 ++++++++++++++------------ 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/internal-packages/compute/src/index.ts b/internal-packages/compute/src/index.ts index f47bc0a65f4..a8f3e8edb5c 100644 --- a/internal-packages/compute/src/index.ts +++ b/internal-packages/compute/src/index.ts @@ -4,6 +4,10 @@ export { stripImageDigest } from "./imageRef.js"; export { TemplateCreateRequestSchema, TemplateCallbackPayloadSchema, + InstanceCreateRequestSchema, + InstanceCreateResponseSchema, + InstanceSnapshotRequestSchema, + SnapshotRestoreRequestSchema, } from "./types.js"; export type { TemplateCreateRequest, diff --git a/internal-packages/compute/src/types.ts b/internal-packages/compute/src/types.ts index 0cec64b15c1..6f97ad9847e 100644 --- a/internal-packages/compute/src/types.ts +++ b/internal-packages/compute/src/types.ts @@ -28,32 +28,36 @@ export type TemplateCallbackPayload = z.infer; - cpu: number; - memory_gb: number; - metadata?: Record; -}; - 
-export type InstanceCreateResponse = { - id: string; - _timing?: unknown; -}; - -export type InstanceSnapshotRequest = { - callback: { - url: string; - metadata: Record; - }; -}; +export const InstanceCreateRequestSchema = z.object({ + name: z.string(), + image: z.string(), + env: z.record(z.string()), + cpu: z.number(), + memory_gb: z.number(), + metadata: z.record(z.unknown()).optional(), +}); +export type InstanceCreateRequest = z.infer; + +export const InstanceCreateResponseSchema = z.object({ + id: z.string(), + _timing: z.unknown().optional(), +}); +export type InstanceCreateResponse = z.infer; + +export const InstanceSnapshotRequestSchema = z.object({ + callback: z.object({ + url: z.string(), + metadata: z.record(z.string()), + }), +}); +export type InstanceSnapshotRequest = z.infer; // ── Snapshots ──────────────────────────────────────────────────────────────── -export type SnapshotRestoreRequest = { - name: string; - metadata: Record; - cpu: number; - memory_mb: number; -}; +export const SnapshotRestoreRequestSchema = z.object({ + name: z.string(), + metadata: z.record(z.string()), + cpu: z.number(), + memory_mb: z.number(), +}); +export type SnapshotRestoreRequest = z.infer; From c1021f213b8129272752c81f3db77c3b18a6cb4c Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 27 Mar 2026 15:40:45 +0000 Subject: [PATCH 40/40] fix: bound trace context map, gate on compute mode, use machine preset for templates --- apps/supervisor/src/index.ts | 2 +- apps/supervisor/src/workloadServer/index.ts | 18 ++++++++++++++++++ .../services/computeTemplateCreation.server.ts | 8 ++++++-- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index 45c410d2e34..f148d181c57 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -293,7 +293,7 @@ class ManagedSupervisor { return; } - if (env.COMPUTE_TRACE_SPANS_ENABLED) { + if 
(this.isComputeMode && env.COMPUTE_TRACE_SPANS_ENABLED) { const traceparent = message.run.traceContext && "traceparent" in message.run.traceContext && diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index 9d2e25b3cb4..0ee28ec77ed 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -113,6 +113,11 @@ export class WorkloadServer extends EventEmitter { >(); private readonly workerClient: SupervisorHttpClient; + // Bounded map for trace contexts used by compute snapshot spans. + // Entries are added on dequeue and consumed on snapshot callback, which may arrive + // hours later after a checkpoint/restore cycle. Using a capped map avoids unbounded + // growth while keeping recent contexts available. Oldest entries are evicted first. + private static readonly MAX_TRACE_CONTEXTS = 10_000; private readonly runTraceContexts = new Map(); private readonly snapshotDelayWheel?: TimerWheel; @@ -821,6 +826,14 @@ export class WorkloadServer extends EventEmitter { } registerRunTraceContext(runFriendlyId: string, ctx: RunTraceContext) { + // Evict oldest entries if we've hit the cap + if (this.runTraceContexts.size >= WorkloadServer.MAX_TRACE_CONTEXTS) { + const firstKey = this.runTraceContexts.keys().next().value; + if (firstKey) { + this.runTraceContexts.delete(firstKey); + } + } + this.runTraceContexts.set(runFriendlyId, ctx); } @@ -829,6 +842,11 @@ export class WorkloadServer extends EventEmitter { } async stop() { + // Intentionally drop pending snapshots rather than dispatching them. The supervisor + // is shutting down, so our callback URL will be dead by the time the gateway responds. + // Runners detect the supervisor is gone and reconnect to a new instance, which + // re-triggers the snapshot workflow. Snapshots are an optimization, not a correctness + // requirement - runs continue fine without them. const remaining = this.snapshotDelayWheel?.stop() ?? 
[]; if (remaining.length > 0) { this.logger.info("Snapshot delay wheel stopped, dropped pending snapshots", { diff --git a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts index 159ef43fb00..873f2f089e3 100644 --- a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts +++ b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts @@ -1,4 +1,5 @@ import { ComputeClient, stripImageDigest } from "@internal/compute"; +import { machinePresetFromName } from "~/v3/machinePresets.server"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; import type { PrismaClientOrTransaction } from "~/db.server"; @@ -158,10 +159,13 @@ export class ComputeTemplateCreationService { } try { + // Templates are resource-agnostic - these values don't affect template content. + const machine = machinePresetFromName("small-1x"); + await this.client.templates.create({ image: stripImageDigest(imageReference), - cpu: 0.5, - memory_mb: 512, + cpu: machine.cpu, + memory_mb: machine.memory * 1024, background: options?.background, }); return { success: true };