Replaces the two overlapping old mechanisms (the 30-min setTimeout kill in
container-runner and the 10-min heartbeat STALE_THRESHOLD reset in
host-sweep) with message-scoped stuck detection anchored to the
processing_ack claim age, plus an absolute 30-min ceiling that is extended
when a Bash tool call declares a longer timeout.
Old model problems:
- IDLE_TIMEOUT setTimeout fired on plain wall-clock time; slow-but-alive
agents got killed at 30min regardless of activity
- 10-min STALE_THRESHOLD in the sweep was unreliable — the heartbeat is
only touched on SDK events, so legitimate silent tool work (sleep 30,
long WebFetch, npm install) looked identical to a hung container
- Two overlapping sources of truth for "when to let go of a container"
New model:
- Host sweep is the single source of truth.
- Container exposes a new `container_state` single-row table in outbound.db
(schema added; container writes, host reads). PreToolUse hook writes
current_tool + tool_declared_timeout_ms (read from Bash's tool_input);
PostToolUse / PostToolUseFailure clear it.
- Sweep decides with a pure helper `decideStuckAction`:
  * absolute ceiling — kill if heartbeat age > max(30min, bash_timeout)
  * per-claim stuck — kill if any processing_ack row has claim_age >
    max(60s, bash_timeout) AND the heartbeat hasn't been touched since
    the claim
  * otherwise ok
Kill paths reset leftover processing rows with exponential backoff,
reusing the existing retry machinery.
Tool blocklist expanded:
- AskUserQuestion (SDK placeholder; we have mcp__nanoclaw__ask_user_question)
- EnterPlanMode, ExitPlanMode, EnterWorktree, ExitWorktree (Claude Code UI
affordances; would hang in headless containers)
PreToolUse hook is also defense-in-depth: if a disallowed tool name slips
through, it returns `{ decision: 'block' }` so the agent sees a clear
error instead of appearing stuck.
Removed:
- container-runner.ts: IDLE_TIMEOUT setTimeout, resetIdle callback on
activeContainers entry, resetContainerIdleTimer export.
- delivery.ts: the resetContainerIdleTimer call on successful delivery.
- poll-loop.ts: IDLE_END_MS + its setInterval. Keeping the query open is
cheaper than close+reopen (no cold prompt cache). Liveness is now a
host-side concern.
- host-sweep.ts: 10-min STALE_THRESHOLD_MS + getStuckProcessingIds in the
stale-detection path (still exported for kill reset).
Tests:
- src/host-sweep.test.ts — 9 tests for decideStuckAction covering: fresh
heartbeat, absolute ceiling, absent heartbeat, Bash-timeout extension
(both ceiling and per-claim), claim age below tolerance, heartbeat
touched after claim, unparseable timestamps.
Ref: docs/v1-vs-v2/ACTION-ITEMS.md items 9, 6a, 10.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
149 lines
5.1 KiB
TypeScript
149 lines
5.1 KiB
TypeScript
/**
|
|
* Delivery race tests.
|
|
*
|
|
* The active poll (1s, running sessions) and the sweep poll (60s, all
|
|
* active sessions) both call deliverSessionMessages. A running session
|
|
* sits in both result sets, so the two timer chains can race on the same
|
|
* outbound row — read-undelivered → call channel API → markDelivered. The
|
|
* INSERT OR IGNORE in markDelivered makes the DB write idempotent, but
|
|
* the channel API has already fired twice → user sees the message twice.
|
|
*/
|
|
import Database from 'better-sqlite3';
|
|
import fs from 'fs';
|
|
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
|
|
|
|
// Replace the container runner with inert stubs so these tests never touch a
// real container: async entry points resolve to undefined, and the session is
// always reported as not running.
vi.mock('./container-runner.js', () => {
  return {
    wakeContainer: vi.fn().mockResolvedValue(undefined),
    isContainerRunning: vi.fn().mockReturnValue(false),
    killContainer: vi.fn(),
    buildAgentGroupImage: vi.fn().mockResolvedValue(undefined),
  };
});
|
|
|
|
// Keep the real config module but point DATA_DIR at a throwaway directory.
// NOTE: the path literal is repeated here rather than referencing TEST_DIR
// because vitest hoists vi.mock() calls above other top-level statements —
// keep the two literals in sync.
vi.mock('./config.js', async () => {
  const realConfig = await vi.importActual<typeof import('./config.js')>('./config.js');
  const patchedConfig = { ...realConfig, DATA_DIR: '/tmp/nanoclaw-test-delivery' };
  return patchedConfig;
});

/** Scratch directory created before and removed after every test. */
const TEST_DIR = '/tmp/nanoclaw-test-delivery';
|
|
|
|
import { initTestDb, closeDb, runMigrations, createAgentGroup, createMessagingGroup } from './db/index.js';
|
|
import { resolveSession, outboundDbPath } from './session-manager.js';
|
|
import { deliverSessionMessages, setDeliveryAdapter } from './delivery.js';
|
|
|
|
/**
 * Current wall-clock time as an ISO-8601 string (UTC, millisecond precision).
 *
 * @returns the result of `Date#toISOString()` for the moment of the call
 */
function now(): string {
  const current = new Date();
  return current.toISOString();
}
|
|
|
|
/**
 * Inserts the fixture rows every test in this file relies on: agent group
 * `ag-1` and messaging group `mg-1` (a non-group telegram chat with the
 * 'public' unknown-sender policy). Each row gets its own creation timestamp.
 */
function seedAgentAndChannel(): void {
  const agentCreatedAt = now();
  createAgentGroup({
    id: 'ag-1',
    name: 'Test Agent',
    folder: 'test-agent',
    agent_provider: null,
    created_at: agentCreatedAt,
  });

  const channelCreatedAt = now();
  createMessagingGroup({
    id: 'mg-1',
    channel_type: 'telegram',
    platform_id: 'telegram:123',
    name: 'Test Chat',
    is_group: 0,
    unknown_sender_policy: 'public',
    created_at: channelCreatedAt,
  });
}
|
|
|
|
/**
 * Writes one 'chat' row into the `messages_out` table of a session's
 * outbound database.
 *
 * The database file is located via `outboundDbPath(agentGroupId, sessionId)`
 * and opened with its own short-lived connection. The row is addressed to
 * `telegram:123` on the `telegram` channel and carries the JSON content
 * `{"text":"hello"}`.
 *
 * @param agentGroupId - agent group used to locate the outbound database
 * @param sessionId - session used to locate the outbound database
 * @param msgId - value stored in the row's `id` column
 */
function insertOutbound(agentGroupId: string, sessionId: string, msgId: string): void {
  const db = new Database(outboundDbPath(agentGroupId, sessionId));
  try {
    db.prepare(
      `INSERT INTO messages_out (id, timestamp, kind, platform_id, channel_type, content)
       VALUES (?, datetime('now'), 'chat', 'telegram:123', 'telegram', ?)`,
    ).run(msgId, JSON.stringify({ text: 'hello' }));
  } finally {
    // Release the handle even when the insert throws (e.g. duplicate id or
    // missing table) so a failing test does not leave the file open for the
    // directory cleanup that follows.
    db.close();
  }
}
|
|
|
|
// Per-test setup: start from an empty scratch directory, then initialise the
// test database and apply migrations to it.
beforeEach(() => {
  // `force: true` makes a missing directory a no-op, replacing the separate
  // existsSync check-then-remove step.
  fs.rmSync(TEST_DIR, { recursive: true, force: true });
  fs.mkdirSync(TEST_DIR, { recursive: true });

  const db = initTestDb();
  runMigrations(db);
});
|
|
|
|
// Per-test teardown: close the test database, then delete the scratch
// directory.
afterEach(() => {
  try {
    closeDb();
  } finally {
    // Remove the scratch directory even if closing the database throws, so a
    // failure in one test does not leak files into the next. `force: true`
    // tolerates the directory already being gone.
    fs.rmSync(TEST_DIR, { recursive: true, force: true });
  }
});
|
|
|
|
describe('deliverSessionMessages — concurrent invocations', () => {
  /**
   * Shared arrange step: seeds the agent/channel fixtures, resolves the
   * 'shared' session for ag-1 / mg-1 and queues one outbound message in it.
   */
  const openSessionWithOutbound = (firstMsgId: string) => {
    seedAgentAndChannel();
    const resolved = resolveSession('ag-1', 'mg-1', null, 'shared');
    insertOutbound('ag-1', resolved.session.id, firstMsgId);
    return resolved.session;
  };

  it('delivers a message exactly once when active and sweep polls overlap', async () => {
    const session = openSessionWithOutbound('out-1');

    const delivered: string[] = [];
    setDeliveryAdapter({
      async deliver(_channelType, _platformId, _threadId, _kind, content) {
        delivered.push(content);
        // Stay inside the adapter for a while so the other concurrent caller
        // has time to enter the read-undelivered → markDelivered window.
        await new Promise((resolve) => setTimeout(resolve, 100));
        return 'plat-msg-1';
      },
    });

    // Start both deliveries before awaiting either — this mimics the active
    // (1s) and sweep (60s) polls landing on the same running session at once.
    const activePoll = deliverSessionMessages(session);
    const sweepPoll = deliverSessionMessages(session);
    await Promise.all([activePoll, sweepPoll]);

    expect(delivered).toHaveLength(1);
  });

  it('still delivers on a subsequent call after the first finishes', async () => {
    const session = openSessionWithOutbound('out-first');

    const delivered: string[] = [];
    setDeliveryAdapter({
      async deliver(_channelType, _platformId, _threadId, _kind, content) {
        delivered.push(content);
        return 'plat-msg-id';
      },
    });

    await deliverSessionMessages(session);
    expect(delivered).toHaveLength(1);

    // Queue another message and run delivery again: whatever lock the first
    // call took must have been released by now.
    insertOutbound('ag-1', session.id, 'out-second');
    await deliverSessionMessages(session);
    expect(delivered).toHaveLength(2);
  });

  it('does not re-deliver when retried after a successful send (cleanup-after-send safety)', async () => {
    // Once the adapter has succeeded, the message is already on the user's
    // screen. Even if a later step (e.g. outbox cleanup) were to throw, a
    // retry must not send it again. Simulated here by a always-succeeding
    // adapter whose invocations are counted across two delivery attempts.
    const session = openSessionWithOutbound('out-once');

    let sendCount = 0;
    setDeliveryAdapter({
      async deliver() {
        sendCount += 1;
        return 'plat-msg-id';
      },
    });

    await deliverSessionMessages(session);
    // Second attempt: the message is now recorded as delivered, so the
    // channel adapter must stay untouched.
    await deliverSessionMessages(session);

    expect(sendCount).toBe(1);
  });
});
|