From bee80b007200833eef4f87780a770092e95d7330 Mon Sep 17 00:00:00 2001 From: "exe.dev user" Date: Thu, 23 Apr 2026 15:12:02 +0000 Subject: [PATCH 1/3] fix(container): clear orphan heartbeat before spawn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After a container exits, its .heartbeat file is left behind with the mtime of its last SDK activity. When the same session spawns a new container, the host sweep's ceiling check reads that stale mtime and kills the freshly-spawned container within seconds — before the new instance has had time to touch the file itself. The sweep already has a carve-out for "no heartbeat file" (treated as a fresh spawn, given grace), so simply removing the orphan at spawn time restores the intended semantics. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/container-runner.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/container-runner.ts b/src/container-runner.ts index 71e2064..8815b11 100644 --- a/src/container-runner.ts +++ b/src/container-runner.ts @@ -36,7 +36,7 @@ import { type ProviderContainerContribution, type VolumeMount, } from './providers/provider-container-registry.js'; -import { markContainerRunning, markContainerStopped, sessionDir, writeSessionRouting } from './session-manager.js'; +import { heartbeatPath, markContainerRunning, markContainerStopped, sessionDir, writeSessionRouting } from './session-manager.js'; import type { AgentGroup, Session } from './types.js'; const onecli = new OneCLI({ url: ONECLI_URL, apiKey: ONECLI_API_KEY }); @@ -131,6 +131,12 @@ async function spawnContainer(session: Session): Promise { log.info('Spawning container', { sessionId: session.id, agentGroup: agentGroup.name, containerName }); + // Clear any orphan heartbeat from a previous container instance — the + // sweep's ceiling check treats a missing file as "fresh spawn, give grace" + // (host-sweep.ts line 87). Without this, the stale mtime can trigger an + // immediate kill before the new container touches the file itself. + fs.rmSync(heartbeatPath(agentGroup.id, session.id), { force: true }); + const container = spawn(CONTAINER_RUNTIME_BIN, args, { stdio: ['ignore', 'pipe', 'pipe'] }); activeContainers.set(session.id, { process: container, containerName }); From 209061f54f6a8804ad6fd50f4ddf7d5a140b408e Mon Sep 17 00:00:00 2001 From: "exe.dev user" Date: Thu, 23 Apr 2026 15:12:16 +0000 Subject: [PATCH 2/3] fix(sweep): wake before reset + idempotent retry for orphan claims MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a container exits with an unresolved processing_ack claim, the sweep's crashed-container cleanup would reset the matching inbound message with tries++ and a future process_after. dueCount then dropped to 0, so the wake step never fired — and the next sweep tick found the same orphan claim, bumped tries again, and pushed process_after further out. The message reached MAX_TRIES and was marked failed without any container ever being spawned. Two changes: 1. Reorder sweep so the wake step runs before crashed-container cleanup. A fresh container clears orphan 'processing' rows on its own startup (container/agent-runner/src/db/connection.ts), so once we get it running the claim resolves itself. 2. Make resetStuckProcessingRows idempotent: if a message already has process_after set to a future time, skip the retry bump. The wake path will pick it up when the backoff elapses. Requires returning process_after from getMessageForRetry. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/db/session-db.ts | 8 ++++---- src/host-sweep.ts | 34 ++++++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/db/session-db.ts b/src/db/session-db.ts index aea255d..48e9297 100644 --- a/src/db/session-db.ts +++ b/src/db/session-db.ts @@ -139,10 +139,10 @@ export function getMessageForRetry( db: Database.Database, messageId: string, status: string, -): { id: string; tries: number } | undefined { - return db.prepare('SELECT id, tries FROM messages_in WHERE id = ? AND status = ?').get(messageId, status) as - | { id: string; tries: number } - | undefined; +): { id: string; tries: number; processAfter: string | null } | undefined { + return db + .prepare('SELECT id, tries, process_after as processAfter FROM messages_in WHERE id = ? AND status = ?') + .get(messageId, status) as { id: string; tries: number; processAfter: string | null } | undefined; } export function syncProcessingAcks(inDb: Database.Database, outDb: Database.Database): void { diff --git a/src/host-sweep.ts b/src/host-sweep.ts index 1a2901c..4dc2fb7 100644 --- a/src/host-sweep.ts +++ b/src/host-sweep.ts @@ -159,23 +159,31 @@ async function sweepSession(session: Session): Promise { syncProcessingAcks(inDb, outDb); } - const alive = isContainerRunning(session.id); - - // 2. Crashed-container cleanup: processing rows left behind get retried. - if (!alive && outDb) { - resetStuckProcessingRows(inDb, outDb, session, 'container not running'); + // 2. Wake a container if work is due and nothing is running. Ordered + // before the crashed-container cleanup so a fresh container gets a chance + // to clean its own orphan processing_ack rows on startup (see + // container/agent-runner/src/db/connection.ts). Otherwise the reset path + // would keep bumping process_after into the future, dueCount would stay 0, + // and the wake would never fire. + const dueCount = countDueMessages(inDb); + if (dueCount > 0 && !isContainerRunning(session.id)) { + log.info('Waking container for due messages', { sessionId: session.id, count: dueCount }); + await wakeContainer(session); } + const alive = isContainerRunning(session.id); + // 3. Running-container SLA: absolute ceiling + per-claim stuck rules. if (alive && outDb) { enforceRunningContainerSla(inDb, outDb, session, agentGroup.id); } - // 4. Wake a container if new work is due and nothing is running. - const dueCount = countDueMessages(inDb); - if (dueCount > 0 && !isContainerRunning(session.id)) { - log.info('Waking container for due messages', { sessionId: session.id, count: dueCount }); - await wakeContainer(session); + // 4. Crashed-container cleanup: processing rows left behind get retried. + // Only fires when wake in step 2 didn't pick up the work (no due messages, + // or wake failed). resetStuckProcessingRows itself is idempotent — it + // skips messages already scheduled for a future retry. + if (!alive && outDb) { + resetStuckProcessingRows(inDb, outDb, session, 'container not running'); } // 5. Recurrence fanout for completed recurring tasks. @@ -246,10 +254,16 @@ function resetStuckProcessingRows( reason: string, ): void { const claims = getProcessingClaims(outDb); + const now = Date.now(); for (const { message_id } of claims) { const msg = getMessageForRetry(inDb, message_id, 'pending'); if (!msg) continue; + // Already rescheduled for a future retry — don't bump tries again. The + // wake path (sweep step 2) will fire when process_after elapses and a + // fresh container will clean the orphan claim on startup. + if (msg.processAfter && Date.parse(msg.processAfter) > now) continue; + if (msg.tries >= MAX_TRIES) { markMessageFailed(inDb, msg.id); log.warn('Message marked as failed after max retries', { From 237876c2c6f7012fcbd6d8505b8b8e5dea33b2d3 Mon Sep 17 00:00:00 2001 From: "exe.dev user" Date: Thu, 23 Apr 2026 15:12:56 +0000 Subject: [PATCH 3/3] chore(format): wrap session-manager import in container-runner Pre-commit prettier reformatted this in the working tree but didn't re-stage. Keeping it in a separate commit to avoid amending a prior commit. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/container-runner.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/container-runner.ts b/src/container-runner.ts index 8815b11..fca88c4 100644 --- a/src/container-runner.ts +++ b/src/container-runner.ts @@ -36,7 +36,13 @@ import { type ProviderContainerContribution, type VolumeMount, } from './providers/provider-container-registry.js'; -import { heartbeatPath, markContainerRunning, markContainerStopped, sessionDir, writeSessionRouting } from './session-manager.js'; +import { + heartbeatPath, + markContainerRunning, + markContainerStopped, + sessionDir, + writeSessionRouting, +} from './session-manager.js'; import type { AgentGroup, Session } from './types.js'; const onecli = new OneCLI({ url: ONECLI_URL, apiKey: ONECLI_API_KEY });