diff --git a/CLAUDE.md b/CLAUDE.md index 7115c4c..6565e8f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -186,7 +186,17 @@ launchctl kickstart -k gui/$(id -u)/com.nanoclaw # restart systemctl --user start|stop|restart nanoclaw ``` -Host logs: `logs/nanoclaw.log` (normal) and `logs/nanoclaw.error.log` (errors only — some delivery/approval failures only show up here). +## Troubleshooting + +Check these first when something goes wrong: + +| What | Where | +|------|-------| +| Host logs | `logs/nanoclaw.error.log` first (delivery failures, crash-loop backoff, warnings), then `logs/nanoclaw.log` for the full routing chain | +| Setup logs | `logs/setup.log` (overall), `logs/setup-steps/*.log` (per-step: bootstrap, environment, container, onecli, mounts, service, etc.) | +| Session DBs | `data/v2-sessions///` — `inbound.db` (`messages_in`: did the message reach the container?), `outbound.db` (`messages_out`: did the agent produce a response?) | + +Note: container logs are lost after the container exits (`--rm` flag). If the agent silently failed inside the container, there's no persistent log to inspect. ## Supply Chain Security (pnpm) diff --git a/src/circuit-breaker.test.ts b/src/circuit-breaker.test.ts new file mode 100644 index 0000000..d8c996c --- /dev/null +++ b/src/circuit-breaker.test.ts @@ -0,0 +1,197 @@ +/** + * Unit tests for the startup circuit breaker. + * + * Covers state transitions, the documented backoff schedule, and the + * fresh-install case where DATA_DIR doesn't exist yet (the breaker runs + * before initDb, so it has to create the dir itself). + */ +import fs from 'fs'; +import os from 'os'; +import path from 'path'; +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; + +// vi.mock factories are hoisted above imports, so they can't close over local +// consts. vi.hoisted is hoisted alongside the mock and runs before any +// `import` — so it can only use globals (no path/os modules). Use require() +// inside the callback to compute the test dir. +const { TEST_DIR } = vi.hoisted(() => { + const nodePath = require('path') as typeof import('path'); + const nodeOs = require('os') as typeof import('os'); + return { TEST_DIR: nodePath.join(nodeOs.tmpdir(), 'nanoclaw-cb-test') }; +}); +const CB_PATH = path.join(TEST_DIR, 'circuit-breaker.json'); + +vi.mock('./config.js', async () => { + const actual = await vi.importActual('./config.js'); + return { ...actual, DATA_DIR: TEST_DIR }; +}); + +vi.mock('./log.js', () => ({ + log: { + debug: vi.fn(), + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + fatal: vi.fn(), + }, +})); + +import { enforceStartupBackoff, resetCircuitBreaker } from './circuit-breaker.js'; + +function readState(): { attempt: number; timestamp: string } { + return JSON.parse(fs.readFileSync(CB_PATH, 'utf-8')); +} + +function seedState(attempt: number, timestamp = new Date().toISOString()): void { + fs.writeFileSync(CB_PATH, JSON.stringify({ attempt, timestamp })); +} + +beforeEach(() => { + if (fs.existsSync(TEST_DIR)) fs.rmSync(TEST_DIR, { recursive: true }); + fs.mkdirSync(TEST_DIR, { recursive: true }); +}); + +afterEach(() => { + vi.useRealTimers(); + if (fs.existsSync(TEST_DIR)) fs.rmSync(TEST_DIR, { recursive: true }); +}); + +describe('resetCircuitBreaker', () => { + it('deletes the state file', () => { + seedState(3); + expect(fs.existsSync(CB_PATH)).toBe(true); + resetCircuitBreaker(); + expect(fs.existsSync(CB_PATH)).toBe(false); + }); + + it('is a no-op when the file does not exist', () => { + expect(fs.existsSync(CB_PATH)).toBe(false); + expect(() => resetCircuitBreaker()).not.toThrow(); + }); +}); + +describe('enforceStartupBackoff — state transitions', () => { + it('first run writes attempt=1 and does not delay', async () => { + vi.useFakeTimers(); + const start = Date.now(); + await enforceStartupBackoff(); + // No timers should have been queued — clean first start is 0s. + expect(Date.now() - start).toBe(0); + expect(readState().attempt).toBe(1); + }); + + it('within reset window, attempt is incremented', async () => { + seedState(1); + vi.useFakeTimers(); + const promise = enforceStartupBackoff(); + await vi.runAllTimersAsync(); + await promise; + expect(readState().attempt).toBe(2); + }); + + it('outside reset window (>1h), attempt resets to 1', async () => { + const longAgo = new Date(Date.now() - 2 * 60 * 60 * 1000).toISOString(); + seedState(5, longAgo); + await enforceStartupBackoff(); + expect(readState().attempt).toBe(1); + }); + + it('exactly at the reset window boundary still counts as "within"', async () => { + // RESET_WINDOW_MS = 60min. Use 59min59s to stay inside even if the test + // takes a few ms to execute. + const justInside = new Date(Date.now() - (60 * 60 * 1000 - 1000)).toISOString(); + seedState(2, justInside); + vi.useFakeTimers(); + const promise = enforceStartupBackoff(); + await vi.runAllTimersAsync(); + await promise; + expect(readState().attempt).toBe(3); + }); + + it('treats a malformed state file as no prior state', async () => { + fs.writeFileSync(CB_PATH, '{ this is not json'); + await enforceStartupBackoff(); + expect(readState().attempt).toBe(1); + }); + + it('resetCircuitBreaker after a startup actually clears the counter for the next startup', async () => { + // Simulate: crash, restart (attempt=2), graceful shutdown, restart again. + seedState(1); + vi.useFakeTimers(); + const p1 = enforceStartupBackoff(); + await vi.runAllTimersAsync(); + await p1; + expect(readState().attempt).toBe(2); + + resetCircuitBreaker(); + expect(fs.existsSync(CB_PATH)).toBe(false); + + await enforceStartupBackoff(); + expect(readState().attempt).toBe(1); + }); +}); + +describe('enforceStartupBackoff — backoff schedule', () => { + /** + * Documented schedule: + * + * clean start → 1 crash → 2 crash → 3 crash → 4 crash → 5 crash → 6+ crash + * 0s → 0s → 10s → 30s → 2min → 5min → 15min cap + * + * Each row is [priorAttempt seeded in the file, expected delay this run + * produces in seconds]. priorAttempt=null = no file = very first start. + * + * To assert the *requested* delay (not just observed elapsed real time), + * we spy on global.setTimeout and look at the longest call. runAllTimersAsync + * lets the function complete so we can move on. + */ + const cases: Array<{ label: string; priorAttempt: number | null; expectedDelaySec: number }> = [ + { label: 'clean first start (no file)', priorAttempt: null, expectedDelaySec: 0 }, + { label: 'first crash (attempt=2)', priorAttempt: 1, expectedDelaySec: 0 }, + { label: 'second crash (attempt=3)', priorAttempt: 2, expectedDelaySec: 10 }, + { label: 'third crash (attempt=4)', priorAttempt: 3, expectedDelaySec: 30 }, + { label: 'fourth crash (attempt=5)', priorAttempt: 4, expectedDelaySec: 120 }, + { label: 'fifth crash (attempt=6)', priorAttempt: 5, expectedDelaySec: 300 }, + { label: 'sixth crash (attempt=7) — cap', priorAttempt: 6, expectedDelaySec: 900 }, + { label: 'far past cap (attempt=20)', priorAttempt: 19, expectedDelaySec: 900 }, + ]; + + for (const { label, priorAttempt, expectedDelaySec } of cases) { + it(`${label}: delays ${expectedDelaySec}s`, async () => { + if (priorAttempt !== null) seedState(priorAttempt); + + vi.useFakeTimers(); + const setTimeoutSpy = vi.spyOn(global, 'setTimeout'); + + const promise = enforceStartupBackoff(); + await vi.runAllTimersAsync(); + await promise; + + // enforceStartupBackoff only calls setTimeout when delaySec > 0. Pick + // the longest delay it requested (vitest may queue small internal + // timers we don't care about). + const requestedDelays = setTimeoutSpy.mock.calls.map((c) => c[1] ?? 0); + const maxDelayMs = requestedDelays.length ? Math.max(...requestedDelays) : 0; + + expect(maxDelayMs).toBe(expectedDelaySec * 1000); + }); + } +}); + +describe('enforceStartupBackoff — fresh install (DATA_DIR missing)', () => { + /** + * The breaker runs before initDb (which is what creates DATA_DIR). On a + * fresh checkout the dir doesn't exist yet, so write() must create it + * before writing the state file — otherwise the host crashes on its very + * first start. + */ + it('creates DATA_DIR on demand and does not throw', async () => { + fs.rmSync(TEST_DIR, { recursive: true }); + expect(fs.existsSync(TEST_DIR)).toBe(false); + + await expect(enforceStartupBackoff()).resolves.toBeUndefined(); + expect(fs.existsSync(TEST_DIR)).toBe(true); + expect(fs.existsSync(CB_PATH)).toBe(true); + expect(readState().attempt).toBe(1); + }); +}); diff --git a/src/circuit-breaker.ts b/src/circuit-breaker.ts new file mode 100644 index 0000000..20211f0 --- /dev/null +++ b/src/circuit-breaker.ts @@ -0,0 +1,84 @@ +import fs from 'fs'; +import path from 'path'; + +import { DATA_DIR } from './config.js'; +import { log } from './log.js'; + +const CB_PATH = path.join(DATA_DIR, 'circuit-breaker.json'); +const RESET_WINDOW_MS = 60 * 60 * 1000; // 1 hour +// Index = number of consecutive crashes (0 = clean start, attempt 1). +// 6+ crashes capped at 15min. +const BACKOFF_SCHEDULE_S = [0, 0, 10, 30, 120, 300, 900]; + +interface CircuitBreakerState { + attempt: number; + timestamp: string; +} + +function read(): CircuitBreakerState | null { + try { + const raw = fs.readFileSync(CB_PATH, 'utf-8'); + return JSON.parse(raw) as CircuitBreakerState; + } catch { + return null; + } +} + +function write(state: CircuitBreakerState): void { + // The breaker runs before initDb (which is what creates DATA_DIR), so on a + // fresh checkout the dir may not exist yet. + fs.mkdirSync(DATA_DIR, { recursive: true }); + fs.writeFileSync(CB_PATH, JSON.stringify(state, null, 2) + '\n'); +} + +function getDelay(attempt: number): number { + const idx = Math.min(attempt - 1, BACKOFF_SCHEDULE_S.length - 1); + return BACKOFF_SCHEDULE_S[idx]; +} + +export function resetCircuitBreaker(): void { + try { + fs.unlinkSync(CB_PATH); + log.info('Circuit breaker reset on clean shutdown'); + } catch {} +} + +export async function enforceStartupBackoff(): Promise { + const now = new Date(); + const prev = read(); + + let attempt: number; + if (!prev) { + attempt = 1; + } else { + const elapsedMs = now.getTime() - new Date(prev.timestamp).getTime(); + if (elapsedMs < RESET_WINDOW_MS) { + attempt = prev.attempt + 1; + log.warn('Previous startup was not a clean shutdown', { + previousAttempt: prev.attempt, + previousTimestamp: prev.timestamp, + elapsedSec: Math.round(elapsedMs / 1000), + }); + } else { + attempt = 1; + log.info('Circuit breaker reset — last startup was over 1h ago', { + previousAttempt: prev.attempt, + previousTimestamp: prev.timestamp, + }); + } + } + + write({ attempt, timestamp: now.toISOString() }); + + const delaySec = getDelay(attempt); + if (delaySec > 0) { + const resumeAt = new Date(now.getTime() + delaySec * 1000).toISOString(); + log.warn('Circuit breaker: delaying startup due to repeated crashes', { + attempt, + delaySec, + resumeAt, + }); + await new Promise((resolve) => setTimeout(resolve, delaySec * 1000)); + log.info('Circuit breaker: backoff complete, resuming startup', { attempt }); + } +} diff --git a/src/index.ts b/src/index.ts index ea9fba6..9ded3d6 100644 --- a/src/index.ts +++ b/src/index.ts @@ -7,6 +7,7 @@ import path from 'path'; import { DATA_DIR } from './config.js'; +import { enforceStartupBackoff, resetCircuitBreaker } from './circuit-breaker.js'; import { migrateGroupsToClaudeLocal } from './claude-md-compose.js'; import { initDb } from './db/connection.js'; import { runMigrations } from './db/migrations/index.js'; @@ -58,6 +59,9 @@ import { initChannelAdapters, teardownChannelAdapters, getChannelAdapter } from async function main(): Promise { log.info('NanoClaw starting'); + // 0. Circuit breaker — backoff on rapid restarts + await enforceStartupBackoff(); + // 1. Init central DB const dbPath = path.join(DATA_DIR, 'v2.db'); const db = initDb(dbPath); @@ -174,8 +178,15 @@ async function shutdown(signal: string): Promise { } stopDeliveryPolls(); stopHostSweep(); - await teardownChannelAdapters(); - process.exit(0); + try { + await teardownChannelAdapters(); + } finally { + // Always reset on graceful shutdown — even if teardown threw, we got here + // via SIGTERM/SIGINT, not a crash, so the next start shouldn't be counted + // as one. + resetCircuitBreaker(); + process.exit(0); + } } process.on('SIGTERM', () => shutdown('SIGTERM'));