Merge pull request #2080 from Koshkoshinsk/circuit-breaker

Add startup circuit breaker for crash loop protection
This commit is contained in:
gavrielc
2026-04-28 22:53:06 +03:00
committed by GitHub
4 changed files with 305 additions and 3 deletions

View File

@@ -186,7 +186,17 @@ launchctl kickstart -k gui/$(id -u)/com.nanoclaw # restart
systemctl --user start|stop|restart nanoclaw
```
Host logs: `logs/nanoclaw.log` (normal) and `logs/nanoclaw.error.log` (errors only — some delivery/approval failures only show up here).
## Troubleshooting
Check these first when something goes wrong:
| What | Where |
|------|-------|
| Host logs | `logs/nanoclaw.error.log` first (delivery failures, crash-loop backoff, warnings), then `logs/nanoclaw.log` for the full routing chain |
| Setup logs | `logs/setup.log` (overall), `logs/setup-steps/*.log` (per-step: bootstrap, environment, container, onecli, mounts, service, etc.) |
| Session DBs | `data/v2-sessions/<agent-group>/<session>/``inbound.db` (`messages_in`: did the message reach the container?), `outbound.db` (`messages_out`: did the agent produce a response?) |
Note: container logs are lost after the container exits (`--rm` flag). If the agent silently failed inside the container, there's no persistent log to inspect.
## Supply Chain Security (pnpm)

197
src/circuit-breaker.test.ts Normal file
View File

@@ -0,0 +1,197 @@
/**
* Unit tests for the startup circuit breaker.
*
* Covers state transitions, the documented backoff schedule, and the
* fresh-install case where DATA_DIR doesn't exist yet (the breaker runs
* before initDb, so it has to create the dir itself).
*/
import fs from 'fs';
import os from 'os';
import path from 'path';
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
// vi.mock factories are hoisted above imports, so they can't close over local
// consts. vi.hoisted is hoisted alongside the mock and runs before any
// `import` — so it can only use globals (no path/os modules). Use require()
// inside the callback to compute the test dir.
const { TEST_DIR } = vi.hoisted(() => {
const nodePath = require('path') as typeof import('path');
const nodeOs = require('os') as typeof import('os');
return { TEST_DIR: nodePath.join(nodeOs.tmpdir(), 'nanoclaw-cb-test') };
});
const CB_PATH = path.join(TEST_DIR, 'circuit-breaker.json');
vi.mock('./config.js', async () => {
const actual = await vi.importActual<typeof import('./config.js')>('./config.js');
return { ...actual, DATA_DIR: TEST_DIR };
});
vi.mock('./log.js', () => ({
log: {
debug: vi.fn(),
info: vi.fn(),
warn: vi.fn(),
error: vi.fn(),
fatal: vi.fn(),
},
}));
import { enforceStartupBackoff, resetCircuitBreaker } from './circuit-breaker.js';
function readState(): { attempt: number; timestamp: string } {
return JSON.parse(fs.readFileSync(CB_PATH, 'utf-8'));
}
function seedState(attempt: number, timestamp = new Date().toISOString()): void {
fs.writeFileSync(CB_PATH, JSON.stringify({ attempt, timestamp }));
}
beforeEach(() => {
if (fs.existsSync(TEST_DIR)) fs.rmSync(TEST_DIR, { recursive: true });
fs.mkdirSync(TEST_DIR, { recursive: true });
});
afterEach(() => {
vi.useRealTimers();
if (fs.existsSync(TEST_DIR)) fs.rmSync(TEST_DIR, { recursive: true });
});
describe('resetCircuitBreaker', () => {
it('deletes the state file', () => {
seedState(3);
expect(fs.existsSync(CB_PATH)).toBe(true);
resetCircuitBreaker();
expect(fs.existsSync(CB_PATH)).toBe(false);
});
it('is a no-op when the file does not exist', () => {
expect(fs.existsSync(CB_PATH)).toBe(false);
expect(() => resetCircuitBreaker()).not.toThrow();
});
});
describe('enforceStartupBackoff — state transitions', () => {
it('first run writes attempt=1 and does not delay', async () => {
vi.useFakeTimers();
const start = Date.now();
await enforceStartupBackoff();
// No timers should have been queued — clean first start is 0s.
expect(Date.now() - start).toBe(0);
expect(readState().attempt).toBe(1);
});
it('within reset window, attempt is incremented', async () => {
seedState(1);
vi.useFakeTimers();
const promise = enforceStartupBackoff();
await vi.runAllTimersAsync();
await promise;
expect(readState().attempt).toBe(2);
});
it('outside reset window (>1h), attempt resets to 1', async () => {
const longAgo = new Date(Date.now() - 2 * 60 * 60 * 1000).toISOString();
seedState(5, longAgo);
await enforceStartupBackoff();
expect(readState().attempt).toBe(1);
});
it('exactly at the reset window boundary still counts as "within"', async () => {
// RESET_WINDOW_MS = 60min. Use 59min59s to stay inside even if the test
// takes a few ms to execute.
const justInside = new Date(Date.now() - (60 * 60 * 1000 - 1000)).toISOString();
seedState(2, justInside);
vi.useFakeTimers();
const promise = enforceStartupBackoff();
await vi.runAllTimersAsync();
await promise;
expect(readState().attempt).toBe(3);
});
it('treats a malformed state file as no prior state', async () => {
fs.writeFileSync(CB_PATH, '{ this is not json');
await enforceStartupBackoff();
expect(readState().attempt).toBe(1);
});
it('resetCircuitBreaker after a startup actually clears the counter for the next startup', async () => {
// Simulate: crash, restart (attempt=2), graceful shutdown, restart again.
seedState(1);
vi.useFakeTimers();
const p1 = enforceStartupBackoff();
await vi.runAllTimersAsync();
await p1;
expect(readState().attempt).toBe(2);
resetCircuitBreaker();
expect(fs.existsSync(CB_PATH)).toBe(false);
await enforceStartupBackoff();
expect(readState().attempt).toBe(1);
});
});
describe('enforceStartupBackoff — backoff schedule', () => {
/**
* Documented schedule:
*
* clean start → 1 crash → 2 crash → 3 crash → 4 crash → 5 crash → 6+ crash
* 0s → 0s → 10s → 30s → 2min → 5min → 15min cap
*
* Each row is [priorAttempt seeded in the file, expected delay this run
* produces in seconds]. priorAttempt=null = no file = very first start.
*
* To assert the *requested* delay (not just observed elapsed real time),
* we spy on global.setTimeout and look at the longest call. runAllTimersAsync
* lets the function complete so we can move on.
*/
const cases: Array<{ label: string; priorAttempt: number | null; expectedDelaySec: number }> = [
{ label: 'clean first start (no file)', priorAttempt: null, expectedDelaySec: 0 },
{ label: 'first crash (attempt=2)', priorAttempt: 1, expectedDelaySec: 0 },
{ label: 'second crash (attempt=3)', priorAttempt: 2, expectedDelaySec: 10 },
{ label: 'third crash (attempt=4)', priorAttempt: 3, expectedDelaySec: 30 },
{ label: 'fourth crash (attempt=5)', priorAttempt: 4, expectedDelaySec: 120 },
{ label: 'fifth crash (attempt=6)', priorAttempt: 5, expectedDelaySec: 300 },
{ label: 'sixth crash (attempt=7) — cap', priorAttempt: 6, expectedDelaySec: 900 },
{ label: 'far past cap (attempt=20)', priorAttempt: 19, expectedDelaySec: 900 },
];
for (const { label, priorAttempt, expectedDelaySec } of cases) {
it(`${label}: delays ${expectedDelaySec}s`, async () => {
if (priorAttempt !== null) seedState(priorAttempt);
vi.useFakeTimers();
const setTimeoutSpy = vi.spyOn(global, 'setTimeout');
const promise = enforceStartupBackoff();
await vi.runAllTimersAsync();
await promise;
// enforceStartupBackoff only calls setTimeout when delaySec > 0. Pick
// the longest delay it requested (vitest may queue small internal
// timers we don't care about).
const requestedDelays = setTimeoutSpy.mock.calls.map((c) => c[1] ?? 0);
const maxDelayMs = requestedDelays.length ? Math.max(...requestedDelays) : 0;
expect(maxDelayMs).toBe(expectedDelaySec * 1000);
});
}
});
describe('enforceStartupBackoff — fresh install (DATA_DIR missing)', () => {
/**
* The breaker runs before initDb (which is what creates DATA_DIR). On a
* fresh checkout the dir doesn't exist yet, so write() must create it
* before writing the state file — otherwise the host crashes on its very
* first start.
*/
it('creates DATA_DIR on demand and does not throw', async () => {
fs.rmSync(TEST_DIR, { recursive: true });
expect(fs.existsSync(TEST_DIR)).toBe(false);
await expect(enforceStartupBackoff()).resolves.toBeUndefined();
expect(fs.existsSync(TEST_DIR)).toBe(true);
expect(fs.existsSync(CB_PATH)).toBe(true);
expect(readState().attempt).toBe(1);
});
});

84
src/circuit-breaker.ts Normal file
View File

@@ -0,0 +1,84 @@
import fs from 'fs';
import path from 'path';
import { DATA_DIR } from './config.js';
import { log } from './log.js';
const CB_PATH = path.join(DATA_DIR, 'circuit-breaker.json');
const RESET_WINDOW_MS = 60 * 60 * 1000; // 1 hour
// Index = number of consecutive crashes (0 = clean start, attempt 1).
// 6+ crashes capped at 15min.
const BACKOFF_SCHEDULE_S = [0, 0, 10, 30, 120, 300, 900];
interface CircuitBreakerState {
attempt: number;
timestamp: string;
}
function read(): CircuitBreakerState | null {
try {
const raw = fs.readFileSync(CB_PATH, 'utf-8');
return JSON.parse(raw) as CircuitBreakerState;
} catch {
return null;
}
}
function write(state: CircuitBreakerState): void {
// The breaker runs before initDb (which is what creates DATA_DIR), so on a
// fresh checkout the dir may not exist yet.
fs.mkdirSync(DATA_DIR, { recursive: true });
fs.writeFileSync(CB_PATH, JSON.stringify(state, null, 2) + '\n');
}
function getDelay(attempt: number): number {
const idx = Math.min(attempt - 1, BACKOFF_SCHEDULE_S.length - 1);
return BACKOFF_SCHEDULE_S[idx];
}
export function resetCircuitBreaker(): void {
try {
fs.unlinkSync(CB_PATH);
log.info('Circuit breaker reset on clean shutdown');
} catch {}
}
export async function enforceStartupBackoff(): Promise<void> {
const now = new Date();
const prev = read();
let attempt: number;
if (!prev) {
attempt = 1;
} else {
const elapsedMs = now.getTime() - new Date(prev.timestamp).getTime();
if (elapsedMs < RESET_WINDOW_MS) {
attempt = prev.attempt + 1;
log.warn('Previous startup was not a clean shutdown', {
previousAttempt: prev.attempt,
previousTimestamp: prev.timestamp,
elapsedSec: Math.round(elapsedMs / 1000),
});
} else {
attempt = 1;
log.info('Circuit breaker reset — last startup was over 1h ago', {
previousAttempt: prev.attempt,
previousTimestamp: prev.timestamp,
});
}
}
write({ attempt, timestamp: now.toISOString() });
const delaySec = getDelay(attempt);
if (delaySec > 0) {
const resumeAt = new Date(now.getTime() + delaySec * 1000).toISOString();
log.warn('Circuit breaker: delaying startup due to repeated crashes', {
attempt,
delaySec,
resumeAt,
});
await new Promise((resolve) => setTimeout(resolve, delaySec * 1000));
log.info('Circuit breaker: backoff complete, resuming startup', { attempt });
}
}

View File

@@ -7,6 +7,7 @@
import path from 'path';
import { DATA_DIR } from './config.js';
import { enforceStartupBackoff, resetCircuitBreaker } from './circuit-breaker.js';
import { migrateGroupsToClaudeLocal } from './claude-md-compose.js';
import { initDb } from './db/connection.js';
import { runMigrations } from './db/migrations/index.js';
@@ -58,6 +59,9 @@ import { initChannelAdapters, teardownChannelAdapters, getChannelAdapter } from
async function main(): Promise<void> {
log.info('NanoClaw starting');
// 0. Circuit breaker — backoff on rapid restarts
await enforceStartupBackoff();
// 1. Init central DB
const dbPath = path.join(DATA_DIR, 'v2.db');
const db = initDb(dbPath);
@@ -174,8 +178,15 @@ async function shutdown(signal: string): Promise<void> {
}
stopDeliveryPolls();
stopHostSweep();
await teardownChannelAdapters();
process.exit(0);
try {
await teardownChannelAdapters();
} finally {
// Always reset on graceful shutdown — even if teardown threw, we got here
// via SIGTERM/SIGINT, not a crash, so the next start shouldn't be counted
// as one.
resetCircuitBreaker();
process.exit(0);
}
}
process.on('SIGTERM', () => shutdown('SIGTERM'));