feat(setup): Claude-assisted error recovery with resume-at-step retry

When a setup step fails — whether hard via fail() or soft via the
"What's left" / "Skipping the first chat" notes — offer to ask Claude
to diagnose. On consent, spawn `claude -p --output-format stream-json`
with a scrolling 3-line action window ("Reading x", "Running y") so
the 1–4 minute investigations feel active rather than hung. No hard
timeout: debugging can take time, Ctrl-C is the escape hatch.

The prompt is minimal: one-paragraph framing, failed step name + msg +
hint, and a list of file references (not contents). Claude's Read/Grep
tools fetch what they need. A per-step map in claude-assist.ts gives
the most relevant files per step; the rest is README + auto.ts +
logs/setup.log + the per-step raw log.

Claude responds with REASON + COMMAND lines. We show the reason in a
clack note, prefill the command via setup/run-suggested.sh (bash 4+
readline; bash 3.x falls back to Enter-to-run), and eval it once the
user confirms.

When the user runs a fix, fail() now offers to retry the failing step
rather than aborting. setup/logs.ts tracks successfully-completed step
names in-memory; fail() threads those as NANOCLAW_SKIP on a spawnSync
retry, so the child picks up exactly where the parent left off — no
rebuilding containers or reinstalling OneCLI.

Other polish in this change:
- fitToWidth + dimWrap in lib/theme.ts to prevent long spinner labels
  from soft-wrapping (each terminal row stacks a stale copy otherwise).
- Shorter container step label ("Preparing your assistant's sandbox…")
  so it fits on narrow terminals.
- Wordmark anchored in the clack intro line on every run.
- All 25 existing fail() call sites updated to await fail(...) since
  fail is now async.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
gavrielc
2026-04-22 12:42:32 +03:00
parent dfcbab5364
commit 4859d8fb2d
8 changed files with 589 additions and 37 deletions

View File

@@ -11,13 +11,15 @@
*
* See docs/setup-flow.md for the three-level output contract.
*/
import { spawn } from 'child_process';
import { spawn, spawnSync } from 'child_process';
import fs from 'fs';
import * as p from '@clack/prompts';
import k from 'kleur';
import * as setupLog from '../logs.js';
import { offerClaudeAssist } from './claude-assist.js';
import { fitToWidth } from './theme.js';
/** String-keyed map of string values. */
export type Fields = Record<string, string>;
/** A `type` tag paired with its associated `Fields`. */
export type Block = { type: string; fields: Fields };
@@ -261,23 +263,25 @@ async function runUnderSpinner<
): Promise<T> {
const s = p.spinner();
const start = Date.now();
s.start(labels.running);
s.start(fitToWidth(labels.running, ' (999s)'));
const tick = setInterval(() => {
const elapsed = Math.round((Date.now() - start) / 1000);
s.message(`${labels.running} ${k.dim(`(${elapsed}s)`)}`);
const suffix = ` (${elapsed}s)`;
s.message(`${fitToWidth(labels.running, suffix)}${k.dim(suffix)}`);
}, 1000);
const result = await work();
clearInterval(tick);
const elapsed = Math.round((Date.now() - start) / 1000);
const suffix = ` (${elapsed}s)`;
if (result.ok) {
const isSkipped = result.terminal?.fields.STATUS === 'skipped';
const msg = isSkipped && labels.skipped ? labels.skipped : labels.done;
s.stop(`${msg} ${k.dim(`(${elapsed}s)`)}`);
s.stop(`${fitToWidth(msg, suffix)}${k.dim(suffix)}`);
} else {
const failMsg = labels.failed ?? labels.running.replace(/…$/, ' failed');
s.stop(`${failMsg} ${k.dim(`(${elapsed}s)`)}`, 1);
s.stop(`${fitToWidth(failMsg, suffix)}${k.dim(suffix)}`, 1);
dumpTranscriptOnFailure(result.transcript);
}
return result;
@@ -301,12 +305,53 @@ export function dumpTranscriptOnFailure(transcript: string): void {
* Abort the setup run with a user-facing error, logging the abort to the
* progression log. Takes the step name explicitly so callers are clear
* about which step they're failing from — no hidden module state.
*
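* Before aborting we offer Claude-assisted debugging. Callers must
* `await fail(...)` so the offer can actually run before we call
* process.exit. The return type is `Promise<never>`, but note that
* TypeScript's control-flow analysis only treats a direct call to a
* `never`-returning function as terminal, not an `await` of
* `Promise<never>`; callers that need narrowing after the call should
* write `return await fail(...)`.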
*/
export function fail(stepName: string, msg: string, hint?: string): never {
export async function fail(
  stepName: string,
  msg: string,
  hint?: string,
  rawLogPath?: string,
): Promise<never> {
  // Record the abort in the progression log, then surface the error, the
  // optional hint, and the log locations to the user.
  setupLog.abort(stepName, msg);
  p.log.error(msg);
  if (hint) p.log.message(k.dim(hint));
  p.log.message(k.dim('Logs: logs/setup.log · Raw: logs/setup-steps/'));

  const ranFix = await offerClaudeAssist({ stepName, msg, hint, rawLogPath });

  // If the user just ran a Claude-suggested fix, offer to resume the flow
  // at the step that failed instead of aborting. We re-exec via spawnSync
  // and pass NANOCLAW_SKIP with every step that already completed so the
  // child skips them and picks up where we left off.
  if (ranFix) {
    const retry = ensureAnswer(
      await p.confirm({
        message: `Fix applied. Retry the ${stepName} step?`,
        initialValue: true,
      }),
    );
    if (retry) {
      // Merge any skip list we inherited with the steps completed in this
      // process; the Set removes duplicates.
      const inheritedSkip = (process.env.NANOCLAW_SKIP ?? '')
        .split(',')
        .map((s) => s.trim())
        .filter(Boolean);
      const skipList = [
        ...new Set([...inheritedSkip, ...setupLog.completedStepNames()]),
      ].join(',');

      p.log.step(`Retrying from ${stepName}`);
      const result = spawnSync('pnpm', ['--silent', 'run', 'setup:auto'], {
        stdio: 'inherit',
        env: { ...process.env, NANOCLAW_SKIP: skipList },
      });

      // spawnSync reports a child that could not be started via `error`
      // and leaves `status` null; say why instead of exiting silently.
      if (result.error) {
        p.log.error(`Could not restart setup: ${result.error.message}`);
      }
      // `status` is also null when the child was killed by a signal. Either
      // way the retry did not complete, so never fall back to exit code 0.
      process.exit(result.status ?? 1);
    }
  }

  p.cancel('Setup aborted.');
  process.exit(1);
}