From 2bf296b04a3802b25edd08e39692d042a3d7868f Mon Sep 17 00:00:00 2001 From: Daniel Milliner Date: Tue, 28 Apr 2026 14:01:32 +0000 Subject: [PATCH 1/2] add startup circuit breaker and troubleshooting docs Backs off on rapid restarts to avoid exhausting Discord gateway identify limits and triggering Cloudflare IP bans. Resets on clean shutdown so only crashes accumulate the counter. Also adds a troubleshooting section to CLAUDE.md with the most useful diagnostic locations. Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 12 ++++++- src/circuit-breaker.ts | 79 ++++++++++++++++++++++++++++++++++++++++++ src/index.ts | 5 +++ 3 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 src/circuit-breaker.ts diff --git a/CLAUDE.md b/CLAUDE.md index 7115c4c..6565e8f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -186,7 +186,17 @@ launchctl kickstart -k gui/$(id -u)/com.nanoclaw # restart systemctl --user start|stop|restart nanoclaw ``` -Host logs: `logs/nanoclaw.log` (normal) and `logs/nanoclaw.error.log` (errors only — some delivery/approval failures only show up here). +## Troubleshooting + +Check these first when something goes wrong: + +| What | Where | +|------|-------| +| Host logs | `logs/nanoclaw.error.log` first (delivery failures, crash-loop backoff, warnings), then `logs/nanoclaw.log` for the full routing chain | +| Setup logs | `logs/setup.log` (overall), `logs/setup-steps/*.log` (per-step: bootstrap, environment, container, onecli, mounts, service, etc.) | +| Session DBs | `data/v2-sessions///` — `inbound.db` (`messages_in`: did the message reach the container?), `outbound.db` (`messages_out`: did the agent produce a response?) | + +Note: container logs are lost after the container exits (`--rm` flag). If the agent silently failed inside the container, there's no persistent log to inspect. ## Supply Chain Security (pnpm) diff --git a/src/circuit-breaker.ts b/src/circuit-breaker.ts new file mode 100644 index 0000000..4288eb4 --- /dev/null +++ b/src/circuit-breaker.ts @@ -0,0 +1,79 @@ +import fs from 'fs'; +import path from 'path'; + +import { DATA_DIR } from './config.js'; +import { log } from './log.js'; + +const CB_PATH = path.join(DATA_DIR, 'circuit-breaker.json'); +const RESET_WINDOW_MS = 60 * 60 * 1000; // 1 hour +const BACKOFF_SCHEDULE_S = [0, 0, 10, 30, 120, 300, 900]; // index = attempt number, 6+ capped at 15min + +interface CircuitBreakerState { + attempt: number; + timestamp: string; +} + +function read(): CircuitBreakerState | null { + try { + const raw = fs.readFileSync(CB_PATH, 'utf-8'); + return JSON.parse(raw) as CircuitBreakerState; + } catch { + return null; + } +} + +function write(state: CircuitBreakerState): void { + fs.writeFileSync(CB_PATH, JSON.stringify(state, null, 2) + '\n'); +} + +function getDelay(attempt: number): number { + const idx = Math.min(attempt, BACKOFF_SCHEDULE_S.length - 1); + return BACKOFF_SCHEDULE_S[idx]; +} + +export function resetCircuitBreaker(): void { + try { + fs.unlinkSync(CB_PATH); + log.info('Circuit breaker reset on clean shutdown'); + } catch {} +} + +export async function enforceStartupBackoff(): Promise { + const now = new Date(); + const prev = read(); + + let attempt: number; + if (!prev) { + attempt = 1; + } else { + const elapsedMs = now.getTime() - new Date(prev.timestamp).getTime(); + if (elapsedMs < RESET_WINDOW_MS) { + attempt = prev.attempt + 1; + log.warn('Previous startup was not a clean shutdown', { + previousAttempt: prev.attempt, + previousTimestamp: prev.timestamp, + elapsedSec: Math.round(elapsedMs / 1000), + }); + } else { + attempt = 1; + log.info('Circuit breaker reset — last startup was over 1h ago', { + previousAttempt: prev.attempt, + previousTimestamp: prev.timestamp, + }); + } + } + + write({ attempt, timestamp: now.toISOString() }); + + const delaySec = getDelay(attempt); + if (delaySec > 0) { + const resumeAt = new Date(now.getTime() + delaySec * 1000).toISOString(); + log.warn('Circuit breaker: delaying startup due to repeated crashes', { + attempt, + delaySec, + resumeAt, + }); + await new Promise((resolve) => setTimeout(resolve, delaySec * 1000)); + log.info('Circuit breaker: backoff complete, resuming startup', { attempt }); + } +} diff --git a/src/index.ts b/src/index.ts index ea9fba6..6235525 100644 --- a/src/index.ts +++ b/src/index.ts @@ -7,6 +7,7 @@ import path from 'path'; import { DATA_DIR } from './config.js'; +import { enforceStartupBackoff, resetCircuitBreaker } from './circuit-breaker.js'; import { migrateGroupsToClaudeLocal } from './claude-md-compose.js'; import { initDb } from './db/connection.js'; import { runMigrations } from './db/migrations/index.js'; @@ -58,6 +59,9 @@ import { initChannelAdapters, teardownChannelAdapters, getChannelAdapter } from async function main(): Promise { log.info('NanoClaw starting'); + // 0. Circuit breaker — backoff on rapid restarts + await enforceStartupBackoff(); + // 1. Init central DB const dbPath = path.join(DATA_DIR, 'v2.db'); const db = initDb(dbPath); @@ -175,6 +179,7 @@ async function shutdown(signal: string): Promise { stopDeliveryPolls(); stopHostSweep(); await teardownChannelAdapters(); + resetCircuitBreaker(); process.exit(0); } From 336e01d2a1f1014d87da4f1d00a0f67e1e811cad Mon Sep 17 00:00:00 2001 From: gavrielc Date: Tue, 28 Apr 2026 22:51:11 +0300 Subject: [PATCH 2/2] fix circuit-breaker off-by-one, ENOENT, and reset-on-throw + tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - getDelay indexed by attempt (1-based) into a 0-indexed array, so the leading 0 was unreachable and every "after a crash" delay was shifted up one slot. Use attempt - 1 so the documented schedule (0s → 0s → 10s → 30s → 2min → 5min → 15min cap) actually holds. - enforceStartupBackoff runs before initDb (which creates DATA_DIR), so on a fresh checkout fs.writeFileSync hit ENOENT. write() now mkdirSync's DATA_DIR first. - shutdown() didn't run resetCircuitBreaker if teardownChannelAdapters threw, so a graceful exit with a teardown error would be counted as a crash on the next start. Wrap teardown in try/finally. - Adds src/circuit-breaker.test.ts: state transitions, full schedule (parameterized), reset-window expiry, malformed file, and the fresh-install path. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/circuit-breaker.test.ts | 197 ++++++++++++++++++++++++++++++++++++ src/circuit-breaker.ts | 9 +- src/index.ts | 12 ++- 3 files changed, 213 insertions(+), 5 deletions(-) create mode 100644 src/circuit-breaker.test.ts diff --git a/src/circuit-breaker.test.ts b/src/circuit-breaker.test.ts new file mode 100644 index 0000000..d8c996c --- /dev/null +++ b/src/circuit-breaker.test.ts @@ -0,0 +1,197 @@ +/** + * Unit tests for the startup circuit breaker. + * + * Covers state transitions, the documented backoff schedule, and the + * fresh-install case where DATA_DIR doesn't exist yet (the breaker runs + * before initDb, so it has to create the dir itself). + */ +import fs from 'fs'; +import os from 'os'; +import path from 'path'; +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; + +// vi.mock factories are hoisted above imports, so they can't close over local +// consts. vi.hoisted is hoisted alongside the mock and runs before any +// `import` — so it can only use globals (no path/os modules). Use require() +// inside the callback to compute the test dir. +const { TEST_DIR } = vi.hoisted(() => { + const nodePath = require('path') as typeof import('path'); + const nodeOs = require('os') as typeof import('os'); + return { TEST_DIR: nodePath.join(nodeOs.tmpdir(), 'nanoclaw-cb-test') }; +}); +const CB_PATH = path.join(TEST_DIR, 'circuit-breaker.json'); + +vi.mock('./config.js', async () => { + const actual = await vi.importActual('./config.js'); + return { ...actual, DATA_DIR: TEST_DIR }; +}); + +vi.mock('./log.js', () => ({ + log: { + debug: vi.fn(), + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + fatal: vi.fn(), + }, +})); + +import { enforceStartupBackoff, resetCircuitBreaker } from './circuit-breaker.js'; + +function readState(): { attempt: number; timestamp: string } { + return JSON.parse(fs.readFileSync(CB_PATH, 'utf-8')); +} + +function seedState(attempt: number, timestamp = new Date().toISOString()): void { + fs.writeFileSync(CB_PATH, JSON.stringify({ attempt, timestamp })); +} + +beforeEach(() => { + if (fs.existsSync(TEST_DIR)) fs.rmSync(TEST_DIR, { recursive: true }); + fs.mkdirSync(TEST_DIR, { recursive: true }); +}); + +afterEach(() => { + vi.useRealTimers(); + if (fs.existsSync(TEST_DIR)) fs.rmSync(TEST_DIR, { recursive: true }); +}); + +describe('resetCircuitBreaker', () => { + it('deletes the state file', () => { + seedState(3); + expect(fs.existsSync(CB_PATH)).toBe(true); + resetCircuitBreaker(); + expect(fs.existsSync(CB_PATH)).toBe(false); + }); + + it('is a no-op when the file does not exist', () => { + expect(fs.existsSync(CB_PATH)).toBe(false); + expect(() => resetCircuitBreaker()).not.toThrow(); + }); +}); + +describe('enforceStartupBackoff — state transitions', () => { + it('first run writes attempt=1 and does not delay', async () => { + vi.useFakeTimers(); + const start = Date.now(); + await enforceStartupBackoff(); + // No timers should have been queued — clean first start is 0s. + expect(Date.now() - start).toBe(0); + expect(readState().attempt).toBe(1); + }); + + it('within reset window, attempt is incremented', async () => { + seedState(1); + vi.useFakeTimers(); + const promise = enforceStartupBackoff(); + await vi.runAllTimersAsync(); + await promise; + expect(readState().attempt).toBe(2); + }); + + it('outside reset window (>1h), attempt resets to 1', async () => { + const longAgo = new Date(Date.now() - 2 * 60 * 60 * 1000).toISOString(); + seedState(5, longAgo); + await enforceStartupBackoff(); + expect(readState().attempt).toBe(1); + }); + + it('exactly at the reset window boundary still counts as "within"', async () => { + // RESET_WINDOW_MS = 60min. Use 59min59s to stay inside even if the test + // takes a few ms to execute. + const justInside = new Date(Date.now() - (60 * 60 * 1000 - 1000)).toISOString(); + seedState(2, justInside); + vi.useFakeTimers(); + const promise = enforceStartupBackoff(); + await vi.runAllTimersAsync(); + await promise; + expect(readState().attempt).toBe(3); + }); + + it('treats a malformed state file as no prior state', async () => { + fs.writeFileSync(CB_PATH, '{ this is not json'); + await enforceStartupBackoff(); + expect(readState().attempt).toBe(1); + }); + + it('resetCircuitBreaker after a startup actually clears the counter for the next startup', async () => { + // Simulate: crash, restart (attempt=2), graceful shutdown, restart again. + seedState(1); + vi.useFakeTimers(); + const p1 = enforceStartupBackoff(); + await vi.runAllTimersAsync(); + await p1; + expect(readState().attempt).toBe(2); + + resetCircuitBreaker(); + expect(fs.existsSync(CB_PATH)).toBe(false); + + await enforceStartupBackoff(); + expect(readState().attempt).toBe(1); + }); +}); + +describe('enforceStartupBackoff — backoff schedule', () => { + /** + * Documented schedule: + * + * clean start → 1 crash → 2 crash → 3 crash → 4 crash → 5 crash → 6+ crash + * 0s → 0s → 10s → 30s → 2min → 5min → 15min cap + * + * Each row is [priorAttempt seeded in the file, expected delay this run + * produces in seconds]. priorAttempt=null = no file = very first start. + * + * To assert the *requested* delay (not just observed elapsed real time), + * we spy on global.setTimeout and look at the longest call. runAllTimersAsync + * lets the function complete so we can move on. + */ + const cases: Array<{ label: string; priorAttempt: number | null; expectedDelaySec: number }> = [ + { label: 'clean first start (no file)', priorAttempt: null, expectedDelaySec: 0 }, + { label: 'first crash (attempt=2)', priorAttempt: 1, expectedDelaySec: 0 }, + { label: 'second crash (attempt=3)', priorAttempt: 2, expectedDelaySec: 10 }, + { label: 'third crash (attempt=4)', priorAttempt: 3, expectedDelaySec: 30 }, + { label: 'fourth crash (attempt=5)', priorAttempt: 4, expectedDelaySec: 120 }, + { label: 'fifth crash (attempt=6)', priorAttempt: 5, expectedDelaySec: 300 }, + { label: 'sixth crash (attempt=7) — cap', priorAttempt: 6, expectedDelaySec: 900 }, + { label: 'far past cap (attempt=20)', priorAttempt: 19, expectedDelaySec: 900 }, + ]; + + for (const { label, priorAttempt, expectedDelaySec } of cases) { + it(`${label}: delays ${expectedDelaySec}s`, async () => { + if (priorAttempt !== null) seedState(priorAttempt); + + vi.useFakeTimers(); + const setTimeoutSpy = vi.spyOn(global, 'setTimeout'); + + const promise = enforceStartupBackoff(); + await vi.runAllTimersAsync(); + await promise; + + // enforceStartupBackoff only calls setTimeout when delaySec > 0. Pick + // the longest delay it requested (vitest may queue small internal + // timers we don't care about). + const requestedDelays = setTimeoutSpy.mock.calls.map((c) => c[1] ?? 0); + const maxDelayMs = requestedDelays.length ? Math.max(...requestedDelays) : 0; + + expect(maxDelayMs).toBe(expectedDelaySec * 1000); + }); + } +}); + +describe('enforceStartupBackoff — fresh install (DATA_DIR missing)', () => { + /** + * The breaker runs before initDb (which is what creates DATA_DIR). On a + * fresh checkout the dir doesn't exist yet, so write() must create it + * before writing the state file — otherwise the host crashes on its very + * first start. + */ + it('creates DATA_DIR on demand and does not throw', async () => { + fs.rmSync(TEST_DIR, { recursive: true }); + expect(fs.existsSync(TEST_DIR)).toBe(false); + + await expect(enforceStartupBackoff()).resolves.toBeUndefined(); + expect(fs.existsSync(TEST_DIR)).toBe(true); + expect(fs.existsSync(CB_PATH)).toBe(true); + expect(readState().attempt).toBe(1); + }); +}); diff --git a/src/circuit-breaker.ts b/src/circuit-breaker.ts index 4288eb4..20211f0 100644 --- a/src/circuit-breaker.ts +++ b/src/circuit-breaker.ts @@ -6,7 +6,9 @@ import { log } from './log.js'; const CB_PATH = path.join(DATA_DIR, 'circuit-breaker.json'); const RESET_WINDOW_MS = 60 * 60 * 1000; // 1 hour -const BACKOFF_SCHEDULE_S = [0, 0, 10, 30, 120, 300, 900]; // index = attempt number, 6+ capped at 15min +// Index = number of consecutive crashes (0 = clean start, attempt 1). +// 6+ crashes capped at 15min. +const BACKOFF_SCHEDULE_S = [0, 0, 10, 30, 120, 300, 900]; interface CircuitBreakerState { attempt: number; @@ -23,11 +25,14 @@ function read(): CircuitBreakerState | null { } function write(state: CircuitBreakerState): void { + // The breaker runs before initDb (which is what creates DATA_DIR), so on a + // fresh checkout the dir may not exist yet. + fs.mkdirSync(DATA_DIR, { recursive: true }); fs.writeFileSync(CB_PATH, JSON.stringify(state, null, 2) + '\n'); } function getDelay(attempt: number): number { - const idx = Math.min(attempt, BACKOFF_SCHEDULE_S.length - 1); + const idx = Math.min(attempt - 1, BACKOFF_SCHEDULE_S.length - 1); return BACKOFF_SCHEDULE_S[idx]; } diff --git a/src/index.ts b/src/index.ts index 6235525..9ded3d6 100644 --- a/src/index.ts +++ b/src/index.ts @@ -178,9 +178,15 @@ async function shutdown(signal: string): Promise { } stopDeliveryPolls(); stopHostSweep(); - await teardownChannelAdapters(); - resetCircuitBreaker(); - process.exit(0); + try { + await teardownChannelAdapters(); + } finally { + // Always reset on graceful shutdown — even if teardown threw, we got here + // via SIGTERM/SIGINT, not a crash, so the next start shouldn't be counted + // as one. + resetCircuitBreaker(); + process.exit(0); + } } process.on('SIGTERM', () => shutdown('SIGTERM'));