Merge pull request #1941 from qwibitai/fix/container-restart-recovery

fix: container restart recovery — stale heartbeat + orphan claim loop
2026-04-23 19:01:36 +03:00
parent 438dedad77 500353c182
commit a67b4abd79
3 changed files with 41 additions and 15 deletions
--- a/src/container-runner.ts
+++ b/src/container-runner.ts
@@ -36,7 +36,13 @@ import {
  type ProviderContainerContribution,
  type VolumeMount,
 } from './providers/provider-container-registry.js';
-import { markContainerRunning, markContainerStopped, sessionDir, writeSessionRouting } from './session-manager.js';
+import {
+  heartbeatPath,
+  markContainerRunning,
+  markContainerStopped,
+  sessionDir,
+  writeSessionRouting,
+} from './session-manager.js';
 import type { AgentGroup, Session } from './types.js';

 const onecli = new OneCLI({ url: ONECLI_URL, apiKey: ONECLI_API_KEY });
@@ -131,6 +137,12 @@ async function spawnContainer(session: Session): Promise<void> {

  log.info('Spawning container', { sessionId: session.id, agentGroup: agentGroup.name, containerName });

+  // Clear any orphan heartbeat from a previous container instance — the
+  // sweep's ceiling check treats a missing file as "fresh spawn, give grace"
+  // (host-sweep.ts line 87). Without this, the stale mtime can trigger an
+  // immediate kill before the new container touches the file itself.
+  fs.rmSync(heartbeatPath(agentGroup.id, session.id), { force: true });
+
  const container = spawn(CONTAINER_RUNTIME_BIN, args, { stdio: ['ignore', 'pipe', 'pipe'] });

  activeContainers.set(session.id, { process: container, containerName });
--- a/src/db/session-db.ts
+++ b/src/db/session-db.ts
@@ -139,10 +139,10 @@ export function getMessageForRetry(
  db: Database.Database,
  messageId: string,
  status: string,
-): { id: string; tries: number } | undefined {
-  return db.prepare('SELECT id, tries FROM messages_in WHERE id = ? AND status = ?').get(messageId, status) as
-    | { id: string; tries: number }
-    | undefined;
+): { id: string; tries: number; processAfter: string | null } | undefined {
+  return db
+    .prepare('SELECT id, tries, process_after as processAfter FROM messages_in WHERE id = ? AND status = ?')
+    .get(messageId, status) as { id: string; tries: number; processAfter: string | null } | undefined;
 }

 export function syncProcessingAcks(inDb: Database.Database, outDb: Database.Database): void {
--- a/src/host-sweep.ts
+++ b/src/host-sweep.ts
@@ -159,23 +159,31 @@ async function sweepSession(session: Session): Promise<void> {
      syncProcessingAcks(inDb, outDb);
    }

-    const alive = isContainerRunning(session.id);
-
-    // 2. Crashed-container cleanup: processing rows left behind get retried.
-    if (!alive && outDb) {
-      resetStuckProcessingRows(inDb, outDb, session, 'container not running');
+    // 2. Wake a container if work is due and nothing is running. Ordered
+    // before the crashed-container cleanup so a fresh container gets a chance
+    // to clean its own orphan processing_ack rows on startup (see
+    // container/agent-runner/src/db/connection.ts). Otherwise the reset path
+    // would keep bumping process_after into the future, dueCount would stay 0,
+    // and the wake would never fire.
+    const dueCount = countDueMessages(inDb);
+    if (dueCount > 0 && !isContainerRunning(session.id)) {
+      log.info('Waking container for due messages', { sessionId: session.id, count: dueCount });
+      await wakeContainer(session);
    }

+    const alive = isContainerRunning(session.id);
+
    // 3. Running-container SLA: absolute ceiling + per-claim stuck rules.
    if (alive && outDb) {
      enforceRunningContainerSla(inDb, outDb, session, agentGroup.id);
    }

-    // 4. Wake a container if new work is due and nothing is running.
-    const dueCount = countDueMessages(inDb);
-    if (dueCount > 0 && !isContainerRunning(session.id)) {
-      log.info('Waking container for due messages', { sessionId: session.id, count: dueCount });
-      await wakeContainer(session);
+    // 4. Crashed-container cleanup: processing rows left behind get retried.
+    // Only fires when wake in step 2 didn't pick up the work (no due messages,
+    // or wake failed). resetStuckProcessingRows itself is idempotent — it
+    // skips messages already scheduled for a future retry.
+    if (!alive && outDb) {
+      resetStuckProcessingRows(inDb, outDb, session, 'container not running');
    }

    // 5. Recurrence fanout for completed recurring tasks.
@@ -246,10 +254,16 @@ function resetStuckProcessingRows(
  reason: string,
 ): void {
  const claims = getProcessingClaims(outDb);
+  const now = Date.now();
  for (const { message_id } of claims) {
    const msg = getMessageForRetry(inDb, message_id, 'pending');
    if (!msg) continue;

+    // Already rescheduled for a future retry — don't bump tries again. The
+    // wake path (sweep step 2) will fire when process_after elapses and a
+    // fresh container will clean the orphan claim on startup.
+    if (msg.processAfter && Date.parse(msg.processAfter) > now) continue;
+
    if (msg.tries >= MAX_TRIES) {
      markMessageFailed(inDb, msg.id);
      log.warn('Message marked as failed after max retries', {