diff --git a/CLAUDE.md b/CLAUDE.md index 7115c4c..6565e8f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -186,7 +186,17 @@ launchctl kickstart -k gui/$(id -u)/com.nanoclaw # restart systemctl --user start|stop|restart nanoclaw ``` -Host logs: `logs/nanoclaw.log` (normal) and `logs/nanoclaw.error.log` (errors only — some delivery/approval failures only show up here). +## Troubleshooting + +Check these first when something goes wrong: + +| What | Where | +|------|-------| +| Host logs | `logs/nanoclaw.error.log` first (delivery failures, crash-loop backoff, warnings), then `logs/nanoclaw.log` for the full routing chain | +| Setup logs | `logs/setup.log` (overall), `logs/setup-steps/*.log` (per-step: bootstrap, environment, container, onecli, mounts, service, etc.) | +| Session DBs | `data/v2-sessions///` — `inbound.db` (`messages_in`: did the message reach the container?), `outbound.db` (`messages_out`: did the agent produce a response?) | + +Note: container logs are lost after the container exits (`--rm` flag). If the agent silently failed inside the container, there's no persistent log to inspect. ## Supply Chain Security (pnpm) diff --git a/src/circuit-breaker.ts b/src/circuit-breaker.ts new file mode 100644 index 0000000..4288eb4 --- /dev/null +++ b/src/circuit-breaker.ts @@ -0,0 +1,79 @@ +import fs from 'fs'; +import path from 'path'; + +import { DATA_DIR } from './config.js'; +import { log } from './log.js'; + +const CB_PATH = path.join(DATA_DIR, 'circuit-breaker.json'); +const RESET_WINDOW_MS = 60 * 60 * 1000; // 1 hour +const BACKOFF_SCHEDULE_S = [0, 0, 10, 30, 120, 300, 900]; // index = attempt number, 6+ capped at 15min + +interface CircuitBreakerState { + attempt: number; + timestamp: string; +} + +function read(): CircuitBreakerState | null { + try { + const raw = fs.readFileSync(CB_PATH, 'utf-8'); + return JSON.parse(raw) as CircuitBreakerState; + } catch { + return null; + } +} + +function write(state: CircuitBreakerState): void { + fs.writeFileSync(CB_PATH, JSON.stringify(state, null, 2) + '\n'); +} + +function getDelay(attempt: number): number { + const idx = Math.min(attempt, BACKOFF_SCHEDULE_S.length - 1); + return BACKOFF_SCHEDULE_S[idx]; +} + +export function resetCircuitBreaker(): void { + try { + fs.unlinkSync(CB_PATH); + log.info('Circuit breaker reset on clean shutdown'); + } catch {} +} + +export async function enforceStartupBackoff(): Promise { + const now = new Date(); + const prev = read(); + + let attempt: number; + if (!prev) { + attempt = 1; + } else { + const elapsedMs = now.getTime() - new Date(prev.timestamp).getTime(); + if (elapsedMs < RESET_WINDOW_MS) { + attempt = prev.attempt + 1; + log.warn('Previous startup was not a clean shutdown', { + previousAttempt: prev.attempt, + previousTimestamp: prev.timestamp, + elapsedSec: Math.round(elapsedMs / 1000), + }); + } else { + attempt = 1; + log.info('Circuit breaker reset — last startup was over 1h ago', { + previousAttempt: prev.attempt, + previousTimestamp: prev.timestamp, + }); + } + } + + write({ attempt, timestamp: now.toISOString() }); + + const delaySec = getDelay(attempt); + if (delaySec > 0) { + const resumeAt = new Date(now.getTime() + delaySec * 1000).toISOString(); + log.warn('Circuit breaker: delaying startup due to repeated crashes', { + attempt, + delaySec, + resumeAt, + }); + await new Promise((resolve) => setTimeout(resolve, delaySec * 1000)); + log.info('Circuit breaker: backoff complete, resuming startup', { attempt }); + } +} diff --git a/src/index.ts b/src/index.ts index ea9fba6..6235525 100644 --- a/src/index.ts +++ b/src/index.ts @@ -7,6 +7,7 @@ import path from 'path'; import { DATA_DIR } from './config.js'; +import { enforceStartupBackoff, resetCircuitBreaker } from './circuit-breaker.js'; import { migrateGroupsToClaudeLocal } from './claude-md-compose.js'; import { initDb } from './db/connection.js'; import { runMigrations } from './db/migrations/index.js'; @@ -58,6 +59,9 @@ import { initChannelAdapters, teardownChannelAdapters, getChannelAdapter } from async function main(): Promise { log.info('NanoClaw starting'); + // 0. Circuit breaker — backoff on rapid restarts + await enforceStartupBackoff(); + // 1. Init central DB const dbPath = path.join(DATA_DIR, 'v2.db'); const db = initDb(dbPath); @@ -175,6 +179,7 @@ async function shutdown(signal: string): Promise { stopDeliveryPolls(); stopHostSweep(); await teardownChannelAdapters(); + resetCircuitBreaker(); process.exit(0); }