feat(lifecycle): stuck detection + heartbeat lifecycle + SDK tool blocklist

Replaces the two overlapping old mechanisms (30-min setTimeout kill in
container-runner, 10-min heartbeat STALE_THRESHOLD reset in host-sweep)
with message-scoped stuck detection anchored to the processing_ack claim
age + an absolute 30-min ceiling that extends for long-declared Bash
tools.

Old model problems:
- IDLE_TIMEOUT setTimeout fired on plain wall-clock time; slow-but-alive
  agents got killed at 30min regardless of activity
- 10-min STALE_THRESHOLD in the sweep was unreliable — the heartbeat is
  only touched on SDK events, so legitimate silent tool work (sleep 30,
  long WebFetch, npm install) looked identical to a hung container
- Two overlapping sources of truth for "when to let go of a container"

New model:
- Host sweep is the single source of truth.
- Container exposes a new `container_state` single-row table in outbound.db
  (schema added; container writes, host reads). PreToolUse hook writes
  current_tool + tool_declared_timeout_ms (read from Bash's tool_input);
  PostToolUse / PostToolUseFailure clear it.
- Sweep decides with a pure helper `decideStuckAction`:
    * absolute ceiling — kill if heartbeat age > max(30min, bash_timeout)
    * per-claim stuck  — kill if any processing_ack row has claim_age >
      max(60s, bash_timeout) AND heartbeat hasn't been touched since claim
    * otherwise ok
  Kill paths reset leftover processing rows with exponential backoff,
  reusing the existing retry machinery.

Tool blocklist expanded:
- AskUserQuestion (SDK placeholder; we have mcp__nanoclaw__ask_user_question)
- EnterPlanMode, ExitPlanMode, EnterWorktree, ExitWorktree (Claude Code UI
  affordances; would hang in headless containers)
PreToolUse hook is also defense-in-depth: if a disallowed tool name slips
through, it returns `{ decision: 'block' }` so the agent sees a clear
error instead of appearing stuck.

Removed:
- container-runner.ts: IDLE_TIMEOUT setTimeout, resetIdle callback on
  activeContainers entry, resetContainerIdleTimer export.
- delivery.ts: the resetContainerIdleTimer call on successful delivery.
- poll-loop.ts: IDLE_END_MS + its setInterval. Keeping the query open is
  cheaper than close+reopen (no cold prompt cache). Liveness is now a
  host-side concern.
- host-sweep.ts: the 10-min STALE_THRESHOLD_MS, plus the use of
  getStuckProcessingIds in the stale-detection path (the function itself is
  still exported for the kill reset).

Tests:
- src/host-sweep.test.ts — 9 tests for decideStuckAction covering: fresh
  heartbeat, absolute ceiling, absent heartbeat, Bash-timeout extension
  (both ceiling and per-claim), claim age below tolerance, heartbeat
  touched after claim, unparseable timestamps.

Ref: docs/v1-vs-v2/ACTION-ITEMS.md items 9, 6a, 10.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
gavrielc
2026-04-20 01:16:57 +03:00
parent dcfa12ea06
commit 6a815190c0
12 changed files with 459 additions and 86 deletions

View File

@@ -64,10 +64,58 @@ export function getOutboundDb(): Database {
if (!cols.has('updated_at')) {
_outbound.exec(`ALTER TABLE session_state ADD COLUMN updated_at TEXT NOT NULL DEFAULT ''`);
}
// container_state: tracks the current tool in flight (if any) so the host
// sweep can widen its stuck tolerance when Bash is running with a user-
// declared long timeout. Forward-compat for older outbound.db files.
_outbound.exec(`
CREATE TABLE IF NOT EXISTS container_state (
id INTEGER PRIMARY KEY CHECK (id = 1),
current_tool TEXT,
tool_declared_timeout_ms INTEGER,
tool_started_at TEXT,
updated_at TEXT NOT NULL
);
`);
}
return _outbound;
}
/**
 * Mark a tool call as in flight by upserting the single `container_state`
 * row (id = 1) in the outbound DB.
 *
 * @param tool - Name of the tool that is starting.
 * @param declaredTimeoutMs - The tool's own timeout hint in milliseconds, or
 *   `null` when the tool declares none.
 */
export function setContainerToolInFlight(tool: string, declaredTimeoutMs: number | null): void {
  // One timestamp feeds both tool_started_at and updated_at.
  const startedAt = new Date().toISOString();
  const upsertSql = `INSERT INTO container_state (id, current_tool, tool_declared_timeout_ms, tool_started_at, updated_at)
VALUES (1, ?, ?, ?, ?)
ON CONFLICT(id) DO UPDATE SET
current_tool = excluded.current_tool,
tool_declared_timeout_ms = excluded.tool_declared_timeout_ms,
tool_started_at = excluded.tool_started_at,
updated_at = excluded.updated_at`;
  const statement = getOutboundDb().prepare(upsertSql);
  statement.run(tool, declaredTimeoutMs, startedAt, startedAt);
}
/**
 * Reset the single `container_state` row (id = 1) so it reports no tool in
 * flight: the tool name, declared timeout and start time become NULL and
 * `updated_at` is refreshed. The row is created if it does not exist yet.
 * Intended to run on PostToolUse / PostToolUseFailure.
 */
export function clearContainerToolInFlight(): void {
  const clearedAt = new Date().toISOString();
  const clearSql = `INSERT INTO container_state (id, current_tool, tool_declared_timeout_ms, tool_started_at, updated_at)
VALUES (1, NULL, NULL, NULL, ?)
ON CONFLICT(id) DO UPDATE SET
current_tool = NULL,
tool_declared_timeout_ms = NULL,
tool_started_at = NULL,
updated_at = excluded.updated_at`;
  const statement = getOutboundDb().prepare(clearSql);
  statement.run(clearedAt);
}
/**
* Touch the heartbeat file — replaces the old touchProcessing() DB writes.
* The host checks this file's mtime for stale container detection.
@@ -157,6 +205,13 @@ export function initTestSessionDb(): { inbound: Database; outbound: Database } {
value TEXT NOT NULL,
updated_at TEXT NOT NULL
);
CREATE TABLE container_state (
id INTEGER PRIMARY KEY CHECK (id = 1),
current_tool TEXT,
tool_declared_timeout_ms INTEGER,
tool_started_at TEXT,
updated_at TEXT NOT NULL
);
`);
return { inbound: _inbound, outbound: _outbound };

View File

@@ -8,7 +8,6 @@ import type { AgentProvider, AgentQuery, ProviderEvent } from './providers/types
const POLL_INTERVAL_MS = 1000;
const ACTIVE_POLL_INTERVAL_MS = 500;
const IDLE_END_MS = 20_000; // End stream after 20s with no SDK events
function log(msg: string): void {
console.error(`[poll-loop] ${msg}`);
@@ -267,9 +266,13 @@ interface QueryResult {
async function processQuery(query: AgentQuery, routing: RoutingContext): Promise<QueryResult> {
let queryContinuation: string | undefined;
let done = false;
let lastEventTime = Date.now();
// Concurrent polling: push follow-ups, checkpoint WAL, detect idle
// Concurrent polling: push follow-ups into the active query as they arrive.
// We do NOT force-end the stream on silence — keeping the query open is
// strictly cheaper than close+reopen (no cold prompt cache, no reconnect).
// Stream liveness is decided host-side via the heartbeat file + processing
// claim age (see src/host-sweep.ts); if something is truly stuck, the host
// will kill the container and messages get reset to pending.
const pollHandle = setInterval(() => {
if (done) return;
@@ -296,19 +299,11 @@ async function processQuery(query: AgentQuery, routing: RoutingContext): Promise
query.push(prompt);
markCompleted(newIds);
lastEventTime = Date.now(); // new input counts as activity
}
// End stream when agent is idle: no SDK events and no pending messages
if (Date.now() - lastEventTime > IDLE_END_MS) {
log(`No SDK events for ${IDLE_END_MS / 1000}s, ending query`);
query.end();
}
}, ACTIVE_POLL_INTERVAL_MS);
try {
for await (const event of query.events) {
lastEventTime = Date.now();
handleEvent(event, routing);
touchHeartbeat();

View File

@@ -3,6 +3,7 @@ import path from 'path';
import { query as sdkQuery, type HookCallback, type PreCompactHookInput } from '@anthropic-ai/claude-agent-sdk';
import { clearContainerToolInFlight, setContainerToolInFlight } from '../db/connection.js';
import { registerProvider } from './provider-registry.js';
import type { AgentProvider, AgentQuery, McpServerConfig, ProviderEvent, ProviderOptions, QueryInput } from './types.js';
@@ -10,10 +11,28 @@ function log(msg: string): void {
console.error(`[claude-provider] ${msg}`);
}
// Deferred SDK builtins that would sidestep nanoclaw's own scheduling.
// Scheduling goes through mcp__nanoclaw__schedule_task so that tasks are
// durable across sessions/restarts and gated by our pre-task script hook.
const SDK_DISALLOWED_TOOLS = ['CronCreate', 'CronDelete', 'CronList', 'ScheduleWakeup'];
// Deferred SDK builtins that either sidestep nanoclaw's own scheduling or
// don't fit our async message-passing model (they're designed for Claude
// Code's interactive UI and would hang here).
//
// - CronCreate / CronDelete / CronList / ScheduleWakeup: we have durable
// scheduling via mcp__nanoclaw__schedule_task.
// - AskUserQuestion: SDK returns a placeholder instead of blocking on a
// real answer — we have mcp__nanoclaw__ask_user_question that persists
// the question and blocks on the real reply.
// - EnterPlanMode / ExitPlanMode / EnterWorktree / ExitWorktree: Claude
// Code UI affordances; in a headless container they'd appear stuck.
//
// preToolUseHook also checks tool names against this list and blocks any
// match, so an entry added here is covered by that runtime guard as well.
// Names are compared with an exact, case-sensitive match.
const SDK_DISALLOWED_TOOLS = [
'CronCreate',
'CronDelete',
'CronList',
'ScheduleWakeup',
'AskUserQuestion',
'EnterPlanMode',
'ExitPlanMode',
'EnterWorktree',
'ExitWorktree',
];
// Tool allowlist for NanoClaw agent containers
const TOOL_ALLOWLIST = [
@@ -122,6 +141,43 @@ function formatTranscriptMarkdown(messages: ParsedMessage[], title?: string | nu
return lines.join('\n');
}
/**
 * PreToolUse hook: record the current tool + its declared timeout so the host
 * sweep can widen its stuck tolerance while Bash is running a long-declared
 * script. Defense-in-depth: if a tool listed in SDK_DISALLOWED_TOOLS slips
 * through somehow, deny the call here instead of letting the agent hang.
 *
 * Returns a deny result (with a reason the model can read) for disallowed
 * tools, and `{ continue: true }` for everything else. Failures while
 * recording container_state are logged and never block the tool call.
 */
const preToolUseHook: HookCallback = async (input) => {
  const hookInput = input as { tool_name?: string; tool_input?: Record<string, unknown> };
  const toolName = hookInput.tool_name ?? '';

  if (SDK_DISALLOWED_TOOLS.includes(toolName)) {
    const reason = `Tool '${toolName}' is not available in this environment — use the nanoclaw equivalent.`;
    // The explanation must travel in `reason` / `permissionDecisionReason`:
    // `stopReason` is only consulted together with `continue: false`, so
    // putting the text there would leave the agent without any error message.
    return {
      decision: 'block',
      reason,
      hookSpecificOutput: {
        hookEventName: 'PreToolUse',
        permissionDecision: 'deny',
        permissionDecisionReason: reason,
      },
    };
  }

  // Bash exposes its timeout via the tool_input.timeout field (ms). Any other
  // tool: no declared timeout. Only a finite, positive number is accepted so
  // a malformed value is never recorded as the declared timeout.
  const rawTimeout = toolName === 'Bash' ? hookInput.tool_input?.timeout : undefined;
  const declaredTimeoutMs =
    typeof rawTimeout === 'number' && Number.isFinite(rawTimeout) && rawTimeout > 0 ? rawTimeout : null;

  try {
    setContainerToolInFlight(toolName, declaredTimeoutMs);
  } catch (err) {
    // Best-effort bookkeeping: a DB write failure must not fail the tool call.
    log(`PreToolUse: failed to record container_state: ${err instanceof Error ? err.message : String(err)}`);
  }
  return { continue: true };
};
/**
 * PostToolUse / PostToolUseFailure hook: clears the recorded in-flight tool.
 * A failure to clear is logged rather than propagated, and the hook always
 * tells the SDK to continue.
 */
const postToolUseHook: HookCallback = async () => {
  try {
    clearContainerToolInFlight();
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    log(`PostToolUse: failed to clear container_state: ${detail}`);
  }
  return { continue: true };
};
function createPreCompactHook(assistantName?: string): HookCallback {
return async (input) => {
const preCompact = input as PreCompactHookInput;
@@ -224,6 +280,9 @@ export class ClaudeProvider implements AgentProvider {
settingSources: ['project', 'user'],
mcpServers: this.mcpServers,
hooks: {
PreToolUse: [{ hooks: [preToolUseHook] }],
PostToolUse: [{ hooks: [postToolUseHook] }],
PostToolUseFailure: [{ hooks: [postToolUseHook] }],
PreCompact: [{ hooks: [createPreCompactHook(this.assistantName)] }],
},
},