feat(lifecycle): stuck detection + heartbeat lifecycle + SDK tool blocklist
Replaces the two overlapping old mechanisms (30-min setTimeout kill in
container-runner, 10-min heartbeat STALE_THRESHOLD reset in host-sweep)
with message-scoped stuck detection anchored to the processing_ack claim
age, plus an absolute 30-min ceiling that is extended when a Bash tool
call declares a longer timeout.
Old model problems:
- IDLE_TIMEOUT setTimeout fired on plain wall-clock time; slow-but-alive
agents got killed at 30min regardless of activity
- 10-min STALE_THRESHOLD in the sweep was unreliable — the heartbeat is
only touched on SDK events, so legitimate silent tool work (sleep 30,
long WebFetch, npm install) looked identical to a hung container
- Two overlapping sources of truth for "when to let go of a container"
New model:
- Host sweep is the single source of truth.
- Container exposes a new `container_state` single-row table in outbound.db
(schema added; container writes, host reads). PreToolUse hook writes
current_tool + tool_declared_timeout_ms (read from Bash's tool_input);
PostToolUse / PostToolUseFailure clear it.
- Sweep decides with a pure helper `decideStuckAction`:
* absolute ceiling — kill if heartbeat age > max(30min, bash_timeout)
* per-claim stuck — kill if any processing_ack row has claim_age >
max(60s, bash_timeout) AND heartbeat hasn't been touched since claim
* otherwise ok
Kill paths reset leftover processing rows with exponential backoff,
reusing the existing retry machinery.
Tool blocklist expanded:
- AskUserQuestion (SDK placeholder; we have mcp__nanoclaw__ask_user_question)
- EnterPlanMode, ExitPlanMode, EnterWorktree, ExitWorktree (Claude Code UI
affordances; would hang in headless containers)
PreToolUse hook is also defense-in-depth: if a disallowed tool name slips
through, it returns `{ decision: 'block' }` so the agent sees a clear
error instead of appearing stuck.
Removed:
- container-runner.ts: IDLE_TIMEOUT setTimeout, resetIdle callback on
activeContainers entry, resetContainerIdleTimer export.
- delivery.ts: the resetContainerIdleTimer call on successful delivery.
- poll-loop.ts: IDLE_END_MS + its setInterval. Keeping the query open is
cheaper than close+reopen (it avoids restarting with a cold prompt
cache). Liveness is now a host-side concern.
- host-sweep.ts: the 10-min STALE_THRESHOLD_MS and the use of
getStuckProcessingIds in the stale-detection path (the function is still
exported and used for the kill-path reset).
Tests:
- src/host-sweep.test.ts — 9 tests for decideStuckAction covering: fresh
heartbeat, absolute ceiling, absent heartbeat, Bash-timeout extension
(both ceiling and per-claim), claim age below tolerance, heartbeat
touched after claim, unparseable timestamps.
Ref: docs/v1-vs-v2/ACTION-ITEMS.md items 9, 6a, 10.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -9,7 +9,7 @@ import path from 'path';
|
||||
|
||||
import { OneCLI } from '@onecli-sh/sdk';
|
||||
|
||||
import { CONTAINER_IMAGE, DATA_DIR, GROUPS_DIR, IDLE_TIMEOUT, ONECLI_URL, TIMEZONE } from './config.js';
|
||||
import { CONTAINER_IMAGE, DATA_DIR, GROUPS_DIR, ONECLI_URL, TIMEZONE } from './config.js';
|
||||
import { readContainerConfig, writeContainerConfig } from './container-config.js';
|
||||
import { CONTAINER_RUNTIME_BIN, hostGatewayArgs, readonlyMountArgs, stopContainer } from './container-runtime.js';
|
||||
import { getAgentGroup } from './db/agent-groups.js';
|
||||
@@ -26,12 +26,7 @@ import {
|
||||
type ProviderContainerContribution,
|
||||
type VolumeMount,
|
||||
} from './providers/provider-container-registry.js';
|
||||
import {
|
||||
markContainerRunning,
|
||||
markContainerStopped,
|
||||
sessionDir,
|
||||
writeSessionRouting,
|
||||
} from './session-manager.js';
|
||||
import { markContainerRunning, markContainerStopped, sessionDir, writeSessionRouting } from './session-manager.js';
|
||||
import type { AgentGroup, Session } from './types.js';
|
||||
|
||||
const onecli = new OneCLI({ url: ONECLI_URL });
|
||||
@@ -125,22 +120,12 @@ async function spawnContainer(session: Session): Promise<void> {
|
||||
// stdout is unused in v2 (all IO is via session DB)
|
||||
container.stdout?.on('data', () => {});
|
||||
|
||||
// Idle timeout: kill container after IDLE_TIMEOUT of no activity
|
||||
let idleTimer = setTimeout(() => killContainer(session.id, 'idle timeout'), IDLE_TIMEOUT);
|
||||
|
||||
const resetIdle = () => {
|
||||
clearTimeout(idleTimer);
|
||||
idleTimer = setTimeout(() => killContainer(session.id, 'idle timeout'), IDLE_TIMEOUT);
|
||||
};
|
||||
|
||||
// Reset idle timer when the host detects new messages_out (called by delivery.ts)
|
||||
const entry = activeContainers.get(session.id);
|
||||
if (entry) {
|
||||
(entry as { resetIdle?: () => void }).resetIdle = resetIdle;
|
||||
}
|
||||
// No host-side idle timeout. Stale/stuck detection is driven by the host
|
||||
// sweep reading heartbeat mtime + processing_ack claim age + container_state
|
||||
// (see src/host-sweep.ts). This avoids killing long-running legitimate work
|
||||
// on a wall-clock timer.
|
||||
|
||||
container.on('close', (code) => {
|
||||
clearTimeout(idleTimer);
|
||||
activeContainers.delete(session.id);
|
||||
markContainerStopped(session.id);
|
||||
stopTypingRefresh(session.id);
|
||||
@@ -148,7 +133,6 @@ async function spawnContainer(session: Session): Promise<void> {
|
||||
});
|
||||
|
||||
container.on('error', (err) => {
|
||||
clearTimeout(idleTimer);
|
||||
activeContainers.delete(session.id);
|
||||
markContainerStopped(session.id);
|
||||
stopTypingRefresh(session.id);
|
||||
@@ -156,12 +140,6 @@ async function spawnContainer(session: Session): Promise<void> {
|
||||
});
|
||||
}
|
||||
|
||||
/** Reset the idle timer for a session's container (called when messages_out are delivered). */
|
||||
export function resetContainerIdleTimer(sessionId: string): void {
|
||||
const entry = activeContainers.get(sessionId) as { resetIdle?: () => void } | undefined;
|
||||
entry?.resetIdle?.();
|
||||
}
|
||||
|
||||
/** Kill a container for a session. */
|
||||
export function killContainer(sessionId: string, reason: string): void {
|
||||
const entry = activeContainers.get(sessionId);
|
||||
|
||||
Reference in New Issue
Block a user