fix circuit-breaker off-by-one, ENOENT, and reset-on-throw + tests

- getDelay indexed by attempt (1-based) into a 0-indexed array, so the
  leading 0 was unreachable and every "after a crash" delay was shifted
  up one slot. Use attempt - 1 so the documented schedule (0s → 0s →
  10s → 30s → 2min → 5min → 15min cap) actually holds.
- enforceStartupBackoff runs before initDb (which creates DATA_DIR), so
  on a fresh checkout fs.writeFileSync hit ENOENT. write() now
  mkdirSync's DATA_DIR first.
- shutdown() didn't run resetCircuitBreaker if teardownChannelAdapters
  threw, so a graceful exit with a teardown error would be counted as a
  crash on the next start. Wrap teardown in try/finally.
- Adds src/circuit-breaker.test.ts: state transitions, full schedule
  (parameterized), reset-window expiry, malformed file, and the
  fresh-install path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
gavrielc
2026-04-28 22:51:11 +03:00
parent 2bf296b04a
commit 336e01d2a1
3 changed files with 213 additions and 5 deletions

197
src/circuit-breaker.test.ts Normal file
View File

@@ -0,0 +1,197 @@
/**
* Unit tests for the startup circuit breaker.
*
* Covers state transitions, the documented backoff schedule, and the
* fresh-install case where DATA_DIR doesn't exist yet (the breaker runs
* before initDb, so it has to create the dir itself).
*/
import fs from 'fs';
import os from 'os';
import path from 'path';
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
// vi.mock factories are hoisted above imports, so they can't close over local
// consts. vi.hoisted is hoisted alongside the mock and runs before any
// `import` — so it can only use globals (no path/os modules). Use require()
// inside the callback to compute the test dir.
const { TEST_DIR } = vi.hoisted(() => {
const nodePath = require('path') as typeof import('path');
const nodeOs = require('os') as typeof import('os');
return { TEST_DIR: nodePath.join(nodeOs.tmpdir(), 'nanoclaw-cb-test') };
});
const CB_PATH = path.join(TEST_DIR, 'circuit-breaker.json');
vi.mock('./config.js', async () => {
const actual = await vi.importActual<typeof import('./config.js')>('./config.js');
return { ...actual, DATA_DIR: TEST_DIR };
});
vi.mock('./log.js', () => ({
log: {
debug: vi.fn(),
info: vi.fn(),
warn: vi.fn(),
error: vi.fn(),
fatal: vi.fn(),
},
}));
import { enforceStartupBackoff, resetCircuitBreaker } from './circuit-breaker.js';
function readState(): { attempt: number; timestamp: string } {
return JSON.parse(fs.readFileSync(CB_PATH, 'utf-8'));
}
function seedState(attempt: number, timestamp = new Date().toISOString()): void {
fs.writeFileSync(CB_PATH, JSON.stringify({ attempt, timestamp }));
}
beforeEach(() => {
if (fs.existsSync(TEST_DIR)) fs.rmSync(TEST_DIR, { recursive: true });
fs.mkdirSync(TEST_DIR, { recursive: true });
});
afterEach(() => {
vi.useRealTimers();
if (fs.existsSync(TEST_DIR)) fs.rmSync(TEST_DIR, { recursive: true });
});
describe('resetCircuitBreaker', () => {
it('deletes the state file', () => {
seedState(3);
expect(fs.existsSync(CB_PATH)).toBe(true);
resetCircuitBreaker();
expect(fs.existsSync(CB_PATH)).toBe(false);
});
it('is a no-op when the file does not exist', () => {
expect(fs.existsSync(CB_PATH)).toBe(false);
expect(() => resetCircuitBreaker()).not.toThrow();
});
});
describe('enforceStartupBackoff — state transitions', () => {
it('first run writes attempt=1 and does not delay', async () => {
vi.useFakeTimers();
const start = Date.now();
await enforceStartupBackoff();
// No timers should have been queued — clean first start is 0s.
expect(Date.now() - start).toBe(0);
expect(readState().attempt).toBe(1);
});
it('within reset window, attempt is incremented', async () => {
seedState(1);
vi.useFakeTimers();
const promise = enforceStartupBackoff();
await vi.runAllTimersAsync();
await promise;
expect(readState().attempt).toBe(2);
});
it('outside reset window (>1h), attempt resets to 1', async () => {
const longAgo = new Date(Date.now() - 2 * 60 * 60 * 1000).toISOString();
seedState(5, longAgo);
await enforceStartupBackoff();
expect(readState().attempt).toBe(1);
});
it('exactly at the reset window boundary still counts as "within"', async () => {
// RESET_WINDOW_MS = 60min. Use 59min59s to stay inside even if the test
// takes a few ms to execute.
const justInside = new Date(Date.now() - (60 * 60 * 1000 - 1000)).toISOString();
seedState(2, justInside);
vi.useFakeTimers();
const promise = enforceStartupBackoff();
await vi.runAllTimersAsync();
await promise;
expect(readState().attempt).toBe(3);
});
it('treats a malformed state file as no prior state', async () => {
fs.writeFileSync(CB_PATH, '{ this is not json');
await enforceStartupBackoff();
expect(readState().attempt).toBe(1);
});
it('resetCircuitBreaker after a startup actually clears the counter for the next startup', async () => {
// Simulate: crash, restart (attempt=2), graceful shutdown, restart again.
seedState(1);
vi.useFakeTimers();
const p1 = enforceStartupBackoff();
await vi.runAllTimersAsync();
await p1;
expect(readState().attempt).toBe(2);
resetCircuitBreaker();
expect(fs.existsSync(CB_PATH)).toBe(false);
await enforceStartupBackoff();
expect(readState().attempt).toBe(1);
});
});
describe('enforceStartupBackoff — backoff schedule', () => {
/**
* Documented schedule:
*
* clean start → 1 crash → 2 crash → 3 crash → 4 crash → 5 crash → 6+ crash
* 0s → 0s → 10s → 30s → 2min → 5min → 15min cap
*
* Each row is [priorAttempt seeded in the file, expected delay this run
* produces in seconds]. priorAttempt=null = no file = very first start.
*
* To assert the *requested* delay (not just observed elapsed real time),
* we spy on global.setTimeout and look at the longest call. runAllTimersAsync
* lets the function complete so we can move on.
*/
const cases: Array<{ label: string; priorAttempt: number | null; expectedDelaySec: number }> = [
{ label: 'clean first start (no file)', priorAttempt: null, expectedDelaySec: 0 },
{ label: 'first crash (attempt=2)', priorAttempt: 1, expectedDelaySec: 0 },
{ label: 'second crash (attempt=3)', priorAttempt: 2, expectedDelaySec: 10 },
{ label: 'third crash (attempt=4)', priorAttempt: 3, expectedDelaySec: 30 },
{ label: 'fourth crash (attempt=5)', priorAttempt: 4, expectedDelaySec: 120 },
{ label: 'fifth crash (attempt=6)', priorAttempt: 5, expectedDelaySec: 300 },
{ label: 'sixth crash (attempt=7) — cap', priorAttempt: 6, expectedDelaySec: 900 },
{ label: 'far past cap (attempt=20)', priorAttempt: 19, expectedDelaySec: 900 },
];
for (const { label, priorAttempt, expectedDelaySec } of cases) {
it(`${label}: delays ${expectedDelaySec}s`, async () => {
if (priorAttempt !== null) seedState(priorAttempt);
vi.useFakeTimers();
const setTimeoutSpy = vi.spyOn(global, 'setTimeout');
const promise = enforceStartupBackoff();
await vi.runAllTimersAsync();
await promise;
// enforceStartupBackoff only calls setTimeout when delaySec > 0. Pick
// the longest delay it requested (vitest may queue small internal
// timers we don't care about).
const requestedDelays = setTimeoutSpy.mock.calls.map((c) => c[1] ?? 0);
const maxDelayMs = requestedDelays.length ? Math.max(...requestedDelays) : 0;
expect(maxDelayMs).toBe(expectedDelaySec * 1000);
});
}
});
describe('enforceStartupBackoff — fresh install (DATA_DIR missing)', () => {
/**
* The breaker runs before initDb (which is what creates DATA_DIR). On a
* fresh checkout the dir doesn't exist yet, so write() must create it
* before writing the state file — otherwise the host crashes on its very
* first start.
*/
it('creates DATA_DIR on demand and does not throw', async () => {
fs.rmSync(TEST_DIR, { recursive: true });
expect(fs.existsSync(TEST_DIR)).toBe(false);
await expect(enforceStartupBackoff()).resolves.toBeUndefined();
expect(fs.existsSync(TEST_DIR)).toBe(true);
expect(fs.existsSync(CB_PATH)).toBe(true);
expect(readState().attempt).toBe(1);
});
});

View File

@@ -6,7 +6,9 @@ import { log } from './log.js';
const CB_PATH = path.join(DATA_DIR, 'circuit-breaker.json');
const RESET_WINDOW_MS = 60 * 60 * 1000; // 1 hour
const BACKOFF_SCHEDULE_S = [0, 0, 10, 30, 120, 300, 900]; // index = attempt number, 6+ capped at 15min
// Index = number of consecutive crashes (0 = clean start, attempt 1).
// 6+ crashes capped at 15min.
const BACKOFF_SCHEDULE_S = [0, 0, 10, 30, 120, 300, 900];
interface CircuitBreakerState {
attempt: number;
@@ -23,11 +25,14 @@ function read(): CircuitBreakerState | null {
}
function write(state: CircuitBreakerState): void {
// The breaker runs before initDb (which is what creates DATA_DIR), so on a
// fresh checkout the dir may not exist yet.
fs.mkdirSync(DATA_DIR, { recursive: true });
fs.writeFileSync(CB_PATH, JSON.stringify(state, null, 2) + '\n');
}
function getDelay(attempt: number): number {
const idx = Math.min(attempt, BACKOFF_SCHEDULE_S.length - 1);
const idx = Math.min(attempt - 1, BACKOFF_SCHEDULE_S.length - 1);
return BACKOFF_SCHEDULE_S[idx];
}

View File

@@ -178,9 +178,15 @@ async function shutdown(signal: string): Promise<void> {
}
stopDeliveryPolls();
stopHostSweep();
await teardownChannelAdapters();
resetCircuitBreaker();
process.exit(0);
try {
await teardownChannelAdapters();
} finally {
// Always reset on graceful shutdown — even if teardown threw, we got here
// via SIGTERM/SIGINT, not a crash, so the next start shouldn't be counted
// as one.
resetCircuitBreaker();
process.exit(0);
}
}
process.on('SIGTERM', () => shutdown('SIGTERM'));