Replaces the two overlapping old mechanisms (the 30-min setTimeout kill in
container-runner and the 10-min heartbeat STALE_THRESHOLD reset in
host-sweep) with message-scoped stuck detection anchored to the
processing_ack claim age, plus an absolute 30-min ceiling that is extended
when a running Bash tool call declares a longer timeout.
Old model problems:
- IDLE_TIMEOUT setTimeout fired on plain wall-clock time; slow-but-alive
agents got killed at 30min regardless of activity
- 10-min STALE_THRESHOLD in the sweep was unreliable — the heartbeat is
only touched on SDK events, so legitimate silent tool work (sleep 30,
long WebFetch, npm install) looked identical to a hung container
- Two overlapping sources of truth for "when to let go of a container"
New model:
- Host sweep is the single source of truth.
- Container exposes a new `container_state` single-row table in outbound.db
(schema added; container writes, host reads). PreToolUse hook writes
current_tool + tool_declared_timeout_ms (read from Bash's tool_input);
PostToolUse / PostToolUseFailure clear it.
- Sweep decides with a pure helper `decideStuckAction`:
  * absolute ceiling — kill if heartbeat age > max(30min, bash_timeout)
  * per-claim stuck — kill if any processing_ack row has claim_age >
    max(60s, bash_timeout) AND the heartbeat hasn't been touched since the
    claim
  * otherwise ok
Kill paths reset leftover processing rows with exponential backoff,
reusing the existing retry machinery.
Tool blocklist expanded:
- AskUserQuestion (SDK placeholder; we have mcp__nanoclaw__ask_user_question)
- EnterPlanMode, ExitPlanMode, EnterWorktree, ExitWorktree (Claude Code UI
affordances; would hang in headless containers)
PreToolUse hook is also defense-in-depth: if a disallowed tool name slips
through, it returns `{ decision: 'block' }` so the agent sees a clear
error instead of appearing stuck.
Removed:
- container-runner.ts: IDLE_TIMEOUT setTimeout, resetIdle callback on
activeContainers entry, resetContainerIdleTimer export.
- delivery.ts: the resetContainerIdleTimer call on successful delivery.
- poll-loop.ts: IDLE_END_MS + its setInterval. Keeping the query open is
cheaper than close+reopen (no cold prompt cache). Liveness is now a
host-side concern.
- host-sweep.ts: the 10-min STALE_THRESHOLD_MS and the use of
getStuckProcessingIds in the stale-detection path (the function is still
exported for the kill-path reset).
Tests:
- src/host-sweep.test.ts — 9 tests for decideStuckAction covering: fresh
heartbeat, absolute ceiling, absent heartbeat, Bash-timeout extension
(both ceiling and per-claim), per-claim kill, claim age below tolerance,
heartbeat touched after claim, unparseable timestamps.
Ref: docs/v1-vs-v2/ACTION-ITEMS.md items 9, 6a, 10.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
129 lines
4.1 KiB
TypeScript
129 lines
4.1 KiB
TypeScript
/**
 * Unit tests for the stuck-container decision logic introduced by
 * ACTION-ITEMS item 9. They target the pure helper `decideStuckAction` so we
 * don't have to mock the filesystem or the container runner.
 */
|
|
import { describe, expect, it } from 'vitest';
|
|
|
|
import { ABSOLUTE_CEILING_MS, CLAIM_STUCK_MS, decideStuckAction } from './host-sweep.js';
|
|
|
|
// Fixed reference instant (epoch milliseconds). Every test passes it as `now`
// and expresses heartbeat/claim timestamps as offsets back from it.
const BASE = Date.parse('2026-04-20T12:00:00.000Z');
|
|
|
|
/**
 * Builds a claim row for the tests.
 *
 * @param id - Value used for the row's `message_id`.
 * @param offsetMs - How many milliseconds before `BASE` the claim was made.
 * @returns A row whose `status_changed` is the ISO-8601 string for
 *   `BASE - offsetMs`.
 */
function claim(id: string, offsetMs: number): { message_id: string; status_changed: string } {
  return { message_id: id, status_changed: new Date(BASE - offsetMs).toISOString() };
}
|
|
|
|
// Test suite for `decideStuckAction`. Each case fixes `now` to BASE and
// varies the heartbeat mtime, the container state, and the claim rows.
describe('decideStuckAction', () => {
  it('returns ok when heartbeat is fresh and no claims', () => {
    expect(
      decideStuckAction({
        now: BASE,
        heartbeatMtimeMs: BASE - 5_000,
        containerState: null,
        claims: [],
      }),
    ).toEqual({ action: 'ok' });
  });

  it('returns kill-ceiling when heartbeat older than 30 min', () => {
    const heartbeatMtimeMs = BASE - ABSOLUTE_CEILING_MS - 1_000;
    const res = decideStuckAction({
      now: BASE,
      heartbeatMtimeMs,
      containerState: null,
      claims: [],
    });
    expect(res.action).toBe('kill-ceiling');
    // Narrow the result type so the kill-ceiling fields can be read below.
    if (res.action !== 'kill-ceiling') return;
    expect(res.ceilingMs).toBe(ABSOLUTE_CEILING_MS);
    expect(res.heartbeatAgeMs).toBeGreaterThan(ABSOLUTE_CEILING_MS);
  });

  it('treats an absent heartbeat file as infinitely stale', () => {
    const res = decideStuckAction({
      now: BASE,
      // An mtime of 0 stands in for a heartbeat file that does not exist.
      heartbeatMtimeMs: 0,
      containerState: null,
      claims: [],
    });
    expect(res.action).toBe('kill-ceiling');
  });

  it('extends the ceiling when Bash has a declared timeout longer than 30 min', () => {
    const twoHrMs = 2 * 60 * 60 * 1000;
    const res = decideStuckAction({
      now: BASE,
      // 45 min — over the default ceiling, but under the Bash timeout
      heartbeatMtimeMs: BASE - 45 * 60 * 1000,
      containerState: {
        current_tool: 'Bash',
        tool_declared_timeout_ms: twoHrMs,
        tool_started_at: new Date(BASE - 45 * 60 * 1000).toISOString(),
      },
      claims: [],
    });
    expect(res.action).toBe('ok');
  });

  it('returns kill-claim when a claim is past 60s and heartbeat has not moved', () => {
    const claimedAgeMs = CLAIM_STUCK_MS + 10_000;
    const res = decideStuckAction({
      now: BASE,
      heartbeatMtimeMs: BASE - claimedAgeMs - 5_000, // older than the claim
      containerState: null,
      claims: [claim('msg-1', claimedAgeMs)],
    });
    expect(res.action).toBe('kill-claim');
    // Narrow the result type so the kill-claim fields can be read below.
    if (res.action !== 'kill-claim') return;
    expect(res.messageId).toBe('msg-1');
    expect(res.toleranceMs).toBe(CLAIM_STUCK_MS);
  });

  it('does not kill when heartbeat has been touched since the claim', () => {
    const claimedAgeMs = CLAIM_STUCK_MS + 10_000;
    const res = decideStuckAction({
      now: BASE,
      heartbeatMtimeMs: BASE - 2_000, // fresh, updated after the claim
      containerState: null,
      claims: [claim('msg-1', claimedAgeMs)],
    });
    expect(res.action).toBe('ok');
  });

  it('does not kill when claim age is below tolerance', () => {
    const res = decideStuckAction({
      now: BASE,
      heartbeatMtimeMs: BASE - CLAIM_STUCK_MS - 10_000, // old, but claim is recent
      containerState: null,
      claims: [claim('msg-1', 5_000)],
    });
    expect(res.action).toBe('ok');
  });

  it('widens per-claim tolerance for a running Bash with long timeout', () => {
    const tenMinMs = 10 * 60 * 1000;
    const res = decideStuckAction({
      now: BASE,
      // 5 min since claim, over the 60s default but under the declared Bash timeout
      heartbeatMtimeMs: BASE - (5 * 60 * 1000) - 5_000,
      containerState: {
        current_tool: 'Bash',
        tool_declared_timeout_ms: tenMinMs,
        tool_started_at: new Date(BASE - 5 * 60 * 1000).toISOString(),
      },
      claims: [claim('msg-1', 5 * 60 * 1000)],
    });
    expect(res.action).toBe('ok');
  });

  it('ignores claims with unparseable timestamps', () => {
    const res = decideStuckAction({
      now: BASE,
      heartbeatMtimeMs: BASE - 5_000,
      containerState: null,
      claims: [{ message_id: 'x', status_changed: 'not-a-date' }],
    });
    expect(res.action).toBe('ok');
  });
});
|