Files
nanoclaw/src/host-sweep.test.ts
gavrielc 0105de0257 fix(host-sweep): skip ceiling check when heartbeat file is absent
decideStuckAction treated a missing heartbeat file as heartbeatAge =
Infinity, which always exceeded the 30-minute ceiling. Result: every
freshly-spawned container got killed within seconds of spawn on the
first sweep pass because it hadn't produced an SDK event yet (heartbeat
is only touched on SDK events inside processQuery, not on boot).

Skip the ceiling branch when heartbeatMtimeMs === 0. Containers that
genuinely never wrote a heartbeat because they died are caught by the
separate "container process not running" cleanup path. Containers that
boot, claim a message, but hang at the gate are caught by the
claim-stuck check below — which correctly fires regardless of heartbeat
presence once claimAge exceeds tolerance.

Updates the "absent heartbeat → kill-ceiling" test (which was encoding
the bug) and adds a companion test asserting that the claim-stuck path
still fires for absent-heartbeat containers with aged claims.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 12:15:52 +03:00

147 lines
5.0 KiB
TypeScript

/**
 * Unit tests for the stuck-container decision logic introduced by
 * ACTION-ITEMS item 9. They target the pure helper `decideStuckAction`, so
 * neither the filesystem nor the container runner has to be mocked.
 */
import { describe, expect, it } from 'vitest';
import { ABSOLUTE_CEILING_MS, CLAIM_STUCK_MS, decideStuckAction } from './host-sweep.js';
// Fixed reference instant used as "now" by every scenario in this file.
const BASE = Date.parse('2026-04-20T12:00:00.000Z');

/**
 * Builds a claim row whose `status_changed` timestamp lies `offsetMs`
 * milliseconds before BASE.
 *
 * @param id - value stored as the row's `message_id`
 * @param offsetMs - how far before BASE the claim was recorded, in ms
 * @returns an object with `message_id` and an ISO-8601 `status_changed`
 */
function claim(id: string, offsetMs: number) {
  const changedAt = new Date(BASE - offsetMs);
  const status_changed = changedAt.toISOString();
  return { message_id: id, status_changed };
}
describe('decideStuckAction', () => {
  // Duration helpers so each scenario can be read in human units.
  const SECOND_MS = 1000;
  const MINUTE_MS = 60 * SECOND_MS;

  it('returns ok when heartbeat is fresh and no claims', () => {
    // Heartbeat touched five seconds before "now"; nothing is claimed.
    const decision = decideStuckAction({
      now: BASE,
      heartbeatMtimeMs: BASE - 5 * SECOND_MS,
      containerState: null,
      claims: [],
    });

    expect(decision).toEqual({ action: 'ok' });
  });

  it('returns kill-ceiling when heartbeat older than 30 min', () => {
    // One second beyond the absolute ceiling.
    const staleByMs = ABSOLUTE_CEILING_MS + SECOND_MS;
    const decision = decideStuckAction({
      now: BASE,
      heartbeatMtimeMs: BASE - staleByMs,
      containerState: null,
      claims: [],
    });

    expect(decision.action).toBe('kill-ceiling');
    // Narrow the result before reading the ceiling-specific fields.
    if (decision.action === 'kill-ceiling') {
      expect(decision.ceilingMs).toBe(ABSOLUTE_CEILING_MS);
      expect(decision.heartbeatAgeMs).toBeGreaterThan(ABSOLUTE_CEILING_MS);
    }
  });

  it('skips the ceiling check when no heartbeat file exists (fresh container not yet ticked)', () => {
    // heartbeatMtimeMs of 0 stands for "no heartbeat file". A container that
    // was only just spawned has not emitted an SDK event yet, so it has no
    // heartbeat; earlier behavior read that as infinitely stale and killed
    // the container seconds after spawn. With no claims outstanding either,
    // the expected verdict is that nothing is wrong.
    const decision = decideStuckAction({
      now: BASE,
      heartbeatMtimeMs: 0,
      containerState: null,
      claims: [],
    });

    expect(decision.action).toBe('ok');
  });

  it('kills on claim-stuck when heartbeat is absent AND a claim has aged past tolerance', () => {
    // A fresh container that hangs: it picked up a message (the claim is
    // recorded in processing_ack) but never wrote a heartbeat. The ceiling
    // check is skipped, and the claim-stuck check is expected to fire.
    const agedClaim = claim('msg-1', CLAIM_STUCK_MS + 5 * SECOND_MS);
    const decision = decideStuckAction({
      now: BASE,
      heartbeatMtimeMs: 0,
      containerState: null,
      claims: [agedClaim],
    });

    expect(decision.action).toBe('kill-claim');
  });

  it('extends the ceiling when Bash has a declared timeout longer than 30 min', () => {
    const declaredTimeoutMs = 2 * 60 * MINUTE_MS;
    // 45 minutes of silence: past the default ceiling, within the Bash timeout.
    const silentForMs = 45 * MINUTE_MS;
    const bashState = {
      current_tool: 'Bash',
      tool_declared_timeout_ms: declaredTimeoutMs,
      tool_started_at: new Date(BASE - silentForMs).toISOString(),
    };
    const decision = decideStuckAction({
      now: BASE,
      heartbeatMtimeMs: BASE - silentForMs,
      containerState: bashState,
      claims: [],
    });

    expect(decision.action).toBe('ok');
  });

  it('returns kill-claim when a claim is past 60s and heartbeat has not moved', () => {
    const claimAgeMs = CLAIM_STUCK_MS + 10 * SECOND_MS;
    // The heartbeat predates the claim by five seconds.
    const heartbeatAt = BASE - claimAgeMs - 5 * SECOND_MS;
    const decision = decideStuckAction({
      now: BASE,
      heartbeatMtimeMs: heartbeatAt,
      containerState: null,
      claims: [claim('msg-1', claimAgeMs)],
    });

    expect(decision.action).toBe('kill-claim');
    // Narrow the result before reading the claim-specific fields.
    if (decision.action === 'kill-claim') {
      expect(decision.messageId).toBe('msg-1');
      expect(decision.toleranceMs).toBe(CLAIM_STUCK_MS);
    }
  });

  it('does not kill when heartbeat has been touched since the claim', () => {
    const claimAgeMs = CLAIM_STUCK_MS + 10 * SECOND_MS;
    const decision = decideStuckAction({
      now: BASE,
      // Fresh heartbeat, written after the claim was taken.
      heartbeatMtimeMs: BASE - 2 * SECOND_MS,
      containerState: null,
      claims: [claim('msg-1', claimAgeMs)],
    });

    expect(decision.action).toBe('ok');
  });

  it('does not kill when claim age is below tolerance', () => {
    // The heartbeat is old, but the only claim is five seconds young.
    const oldHeartbeatAt = BASE - CLAIM_STUCK_MS - 10 * SECOND_MS;
    const recentClaim = claim('msg-1', 5 * SECOND_MS);
    const decision = decideStuckAction({
      now: BASE,
      heartbeatMtimeMs: oldHeartbeatAt,
      containerState: null,
      claims: [recentClaim],
    });

    expect(decision.action).toBe('ok');
  });

  it('widens per-claim tolerance for a running Bash with long timeout', () => {
    const declaredTimeoutMs = 10 * MINUTE_MS;
    // Five minutes since the claim: beyond the 60s default, but inside the
    // timeout declared for the running Bash tool.
    const sinceClaimMs = 5 * MINUTE_MS;
    const bashState = {
      current_tool: 'Bash',
      tool_declared_timeout_ms: declaredTimeoutMs,
      tool_started_at: new Date(BASE - sinceClaimMs).toISOString(),
    };
    const decision = decideStuckAction({
      now: BASE,
      heartbeatMtimeMs: BASE - sinceClaimMs - 5 * SECOND_MS,
      containerState: bashState,
      claims: [claim('msg-1', sinceClaimMs)],
    });

    expect(decision.action).toBe('ok');
  });

  it('ignores claims with unparseable timestamps', () => {
    // status_changed deliberately holds a string that is not a date.
    const malformedClaim = { message_id: 'x', status_changed: 'not-a-date' };
    const decision = decideStuckAction({
      now: BASE,
      heartbeatMtimeMs: BASE - 5 * SECOND_MS,
      containerState: null,
      claims: [malformedClaim],
    });

    expect(decision.action).toBe('ok');
  });
});