diff --git a/.claude/skills/add-atomic-chat-tool/SKILL.md b/.claude/skills/add-atomic-chat-tool/SKILL.md new file mode 100644 index 0000000..6a6d858 --- /dev/null +++ b/.claude/skills/add-atomic-chat-tool/SKILL.md @@ -0,0 +1,243 @@ +--- +name: add-atomic-chat-tool +description: Add Atomic Chat MCP server so the container agent can call local models served by the Atomic Chat desktop app via its OpenAI-compatible API. +--- + +# Add Atomic Chat Integration + +This skill adds a stdio-based MCP server that exposes models running in the local [Atomic Chat](https://github.com/AtomicBot-ai/Atomic-Chat) desktop app as tools for the container agent. Claude remains the orchestrator but can offload work to local models served by Atomic Chat on `http://127.0.0.1:1337/v1` (OpenAI-compatible). + +Tools exposed: +- `atomic_chat_list_models` — list models currently available in Atomic Chat (`GET /v1/models`) +- `atomic_chat_generate` — send a prompt to a specified model and return the response (`POST /v1/chat/completions`) + +Model management (download, delete) is done through the **Atomic Chat desktop UI** — the app is a fork of Jan and manages its own model library. + +The skill ships the MCP server source in this folder and copies it into the agent-runner tree at install time, then wires it up with small edits to `index.ts`, `providers/claude.ts`, and `container-runner.ts`. No branch merge — all edits are additive and idempotent. + +## Phase 1: Pre-flight + +### Check if already applied + +Check if `container/agent-runner/src/atomic-chat-mcp-stdio.ts` exists. If it does, skip to Phase 3 (Configure). + +### Check prerequisites + +Verify Atomic Chat is installed and its local API server is running. On the host: + +```bash +curl -s http://127.0.0.1:1337/v1/models | head +``` + +If the request fails: + +1. Install Atomic Chat from the [latest release](https://github.com/AtomicBot-ai/Atomic-Chat/releases) (macOS only for now — `atomic-chat.dmg`). +2. Open the app. +3. Open **Settings → Local API Server** and make sure it's enabled on port `1337`. +4. Go to the **Hub** (or **Models**) tab and download at least one model (e.g. Llama 3.2 3B, Qwen 2.5 Coder 7B). +5. Load the model once by sending any message in Atomic Chat's UI to warm it up. + +## Phase 2: Apply Code Changes + +### Copy the MCP server source + +```bash +cp .claude/skills/add-atomic-chat-tool/atomic-chat-mcp-stdio.ts container/agent-runner/src/atomic-chat-mcp-stdio.ts +``` + +### Register the MCP server in the agent-runner + +Edit `container/agent-runner/src/index.ts`. Find the `mcpServers` object that currently looks like this: + +```ts + const mcpServers: Record }> = { + nanoclaw: { + command: 'bun', + args: ['run', mcpServerPath], + env: {}, + }, + }; +``` + +Add an `atomic_chat` entry alongside `nanoclaw`: + +```ts + const mcpServers: Record }> = { + nanoclaw: { + command: 'bun', + args: ['run', mcpServerPath], + env: {}, + }, + atomic_chat: { + command: 'bun', + args: ['run', path.join(__dirname, 'atomic-chat-mcp-stdio.ts')], + env: { + ...(process.env.ATOMIC_CHAT_HOST ? { ATOMIC_CHAT_HOST: process.env.ATOMIC_CHAT_HOST } : {}), + ...(process.env.ATOMIC_CHAT_API_KEY ? { ATOMIC_CHAT_API_KEY: process.env.ATOMIC_CHAT_API_KEY } : {}), + }, + }, + }; +``` + +### Add the tool glob to the allowlist + +Edit `container/agent-runner/src/providers/claude.ts`. Find `'mcp__nanoclaw__*',` in the `TOOL_ALLOWLIST` array and add `'mcp__atomic_chat__*',` on the following line: + +```ts + 'mcp__nanoclaw__*', + 'mcp__atomic_chat__*', +]; +``` + +### Forward host env vars into the container + +Edit `src/container-runner.ts` in `buildContainerArgs`. Find the `TZ` env line: + +```ts + args.push('-e', `TZ=${TIMEZONE}`); +``` + +Add ATOMIC_CHAT forwarding right after it: + +```ts + args.push('-e', `TZ=${TIMEZONE}`); + + // Atomic Chat MCP tool: forward host overrides if set (default is host.docker.internal:1337). + if (process.env.ATOMIC_CHAT_HOST) { + args.push('-e', `ATOMIC_CHAT_HOST=${process.env.ATOMIC_CHAT_HOST}`); + } + if (process.env.ATOMIC_CHAT_API_KEY) { + args.push('-e', `ATOMIC_CHAT_API_KEY=${process.env.ATOMIC_CHAT_API_KEY}`); + } +``` + +### Surface `[ATOMIC]` log lines at info level + +In the same file, find the stderr logger: + +```ts + container.stderr?.on('data', (data) => { + for (const line of data.toString().trim().split('\n')) { + if (line) log.debug(line, { container: agentGroup.folder }); + } + }); +``` + +Replace it with: + +```ts + container.stderr?.on('data', (data) => { + for (const line of data.toString().trim().split('\n')) { + if (!line) continue; + if (line.includes('[ATOMIC]')) { + log.info(line, { container: agentGroup.folder }); + } else { + log.debug(line, { container: agentGroup.folder }); + } + } + }); +``` + +### Add env-var stubs to `.env.example` + +Append to `.env.example`: + +```bash +# Atomic Chat MCP tool (.claude/skills/add-atomic-chat-tool) +# Override the host where Atomic Chat exposes its OpenAI-compatible API. +# Default: http://host.docker.internal:1337 (with fallback to localhost) +# ATOMIC_CHAT_HOST=http://host.docker.internal:1337 + +# Optional API key. Leave unset for a local Atomic Chat install — it does not require auth. +# ATOMIC_CHAT_API_KEY= +``` + +### Validate code changes + +```bash +pnpm run build +pnpm exec tsc -p container/agent-runner/tsconfig.json --noEmit +./container/build.sh +``` + +All three must be clean before proceeding. + +## Phase 3: Configure + +### Set Atomic Chat host (optional) + +By default, the MCP server connects to `http://host.docker.internal:1337` (Docker Desktop) with a fallback to `localhost`. To use a custom host, add to `.env`: + +```bash +ATOMIC_CHAT_HOST=http://your-atomic-chat-host:1337 +``` + +### Set API key (optional) + +Atomic Chat does **not require authentication** when running locally — leave this unset. Only set it if you've put Atomic Chat behind a reverse proxy that enforces auth: + +```bash +ATOMIC_CHAT_API_KEY=sk-... +``` + +### Restart the service + +```bash +launchctl kickstart -k gui/$(id -u)/com.nanoclaw # macOS +# Linux: systemctl --user restart nanoclaw +``` + +## Phase 4: Verify + +### Test inference + +Tell the user: + +> Send a message like: "use atomic chat to tell me the capital of France" +> +> The agent should use `atomic_chat_list_models` to find available models, then `atomic_chat_generate` to get a response. + +### Check logs if needed + +```bash +tail -f logs/nanoclaw.log | grep -i atomic +``` + +Look for: +- `[ATOMIC] Listing models...` — list request started +- `[ATOMIC] Found N models` — models discovered +- `[ATOMIC] >>> Generating with ` — generation started +- `[ATOMIC] <<< Done: | Xs | N tokens | M chars` — generation completed + +## Troubleshooting + +### Agent says "Atomic Chat is not installed" or tries to run a CLI + +The agent is looking for a CLI that doesn't exist instead of using the MCP tools. This means: +1. The MCP server wasn't copied — check `container/agent-runner/src/atomic-chat-mcp-stdio.ts` exists +2. The MCP server wasn't registered — check `container/agent-runner/src/index.ts` has the `atomic_chat` entry in `mcpServers` +3. The allowlist wasn't updated — check `container/agent-runner/src/providers/claude.ts` includes `mcp__atomic_chat__*` in `TOOL_ALLOWLIST` +4. The container wasn't rebuilt — run `./container/build.sh` + +### "Failed to connect to Atomic Chat" + +1. Verify the host API is reachable: `curl http://127.0.0.1:1337/v1/models` +2. Confirm the Local API Server is enabled in Atomic Chat's settings +3. Check Docker can reach the host: `docker run --rm curlimages/curl curl -s http://host.docker.internal:1337/v1/models` +4. If using a custom host, check `ATOMIC_CHAT_HOST` in `.env` + +### `model not found` / 404 on generate + +The model ID passed to `atomic_chat_generate` must exactly match one of the IDs returned by `atomic_chat_list_models`. Ask the agent to list models first, then pick one from that list. + +### Slow first response + +Atomic Chat lazy-loads models into memory on first use. The initial call may take longer while the model warms up. Subsequent calls against the same model are fast. + +### Agent doesn't use Atomic Chat tools + +The agent may not know about the tools. Try being explicit: "use the atomic_chat_generate tool with llama3.2-3b-instruct to answer: ..." + +### Context window or output size issues + +Atomic Chat respects each model's native context length. If you hit limits, pass `max_tokens` explicitly when calling `atomic_chat_generate`, or switch to a model with a larger context window in the Atomic Chat UI. diff --git a/.claude/skills/add-atomic-chat-tool/atomic-chat-mcp-stdio.ts b/.claude/skills/add-atomic-chat-tool/atomic-chat-mcp-stdio.ts new file mode 100644 index 0000000..0198644 --- /dev/null +++ b/.claude/skills/add-atomic-chat-tool/atomic-chat-mcp-stdio.ts @@ -0,0 +1,229 @@ +/** + * Atomic Chat MCP Server for NanoClaw + * Exposes local Atomic Chat models (OpenAI-compatible, /v1) as tools for the container agent. + * Uses host.docker.internal to reach the host's Atomic Chat desktop app from Docker. + */ + +import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'; +import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'; +import { z } from 'zod'; + +import fs from 'fs'; +import path from 'path'; + +const ATOMIC_CHAT_HOST = + process.env.ATOMIC_CHAT_HOST || 'http://host.docker.internal:1337'; +const ATOMIC_CHAT_API_KEY = process.env.ATOMIC_CHAT_API_KEY || ''; +const ATOMIC_CHAT_STATUS_FILE = '/workspace/ipc/atomic_chat_status.json'; + +function log(msg: string): void { + console.error(`[ATOMIC] ${msg}`); +} + +function writeStatus(status: string, detail?: string): void { + try { + const data = { status, detail, timestamp: new Date().toISOString() }; + const tmpPath = `${ATOMIC_CHAT_STATUS_FILE}.tmp`; + fs.mkdirSync(path.dirname(ATOMIC_CHAT_STATUS_FILE), { recursive: true }); + fs.writeFileSync(tmpPath, JSON.stringify(data)); + fs.renameSync(tmpPath, ATOMIC_CHAT_STATUS_FILE); + } catch { + /* best-effort */ + } +} + +async function atomicFetch( + apiPath: string, + options?: RequestInit, +): Promise { + const url = `${ATOMIC_CHAT_HOST}${apiPath}`; + const headers: Record = { + ...((options?.headers as Record) || {}), + }; + if (ATOMIC_CHAT_API_KEY) { + headers.Authorization = `Bearer ${ATOMIC_CHAT_API_KEY}`; + } + const finalOptions: RequestInit = { ...options, headers }; + try { + return await fetch(url, finalOptions); + } catch (err) { + // Fallback to localhost if host.docker.internal fails + if (ATOMIC_CHAT_HOST.includes('host.docker.internal')) { + const fallbackUrl = url.replace('host.docker.internal', 'localhost'); + return await fetch(fallbackUrl, finalOptions); + } + throw err; + } +} + +const server = new McpServer({ + name: 'atomic_chat', + version: '1.0.0', +}); + +server.tool( + 'atomic_chat_list_models', + 'List all models available in the local Atomic Chat desktop app. Use this to see which models are loaded before calling atomic_chat_generate.', + {}, + async () => { + log('Listing models...'); + writeStatus('listing', 'Listing available models'); + try { + const res = await atomicFetch('/v1/models'); + if (!res.ok) { + return { + content: [ + { + type: 'text' as const, + text: `Atomic Chat API error: ${res.status} ${res.statusText}`, + }, + ], + isError: true, + }; + } + + const data = (await res.json()) as { + data?: Array<{ id: string; owned_by?: string }>; + }; + const models = data.data || []; + + if (models.length === 0) { + return { + content: [ + { + type: 'text' as const, + text: 'No models available. Open Atomic Chat on the host and download a model from the Hub.', + }, + ], + }; + } + + const list = models + .map((m) => `- ${m.id}${m.owned_by ? ` (${m.owned_by})` : ''}`) + .join('\n'); + + log(`Found ${models.length} models`); + return { + content: [ + { type: 'text' as const, text: `Available models:\n${list}` }, + ], + }; + } catch (err) { + return { + content: [ + { + type: 'text' as const, + text: `Failed to connect to Atomic Chat at ${ATOMIC_CHAT_HOST}: ${err instanceof Error ? err.message : String(err)}`, + }, + ], + isError: true, + }; + } + }, +); + +server.tool( + 'atomic_chat_generate', + 'Send a prompt to a local Atomic Chat model and get a response. Good for cheaper/faster tasks like summarization, translation, or general queries. Use atomic_chat_list_models first to see available models.', + { + model: z + .string() + .describe( + 'The model ID as returned by atomic_chat_list_models (e.g. "llama3.2-3b-instruct")', + ), + prompt: z.string().describe('The prompt to send to the model'), + system: z + .string() + .optional() + .describe('Optional system prompt to set model behavior'), + temperature: z + .number() + .optional() + .describe('Sampling temperature (0.0–2.0). Defaults to model default.'), + max_tokens: z + .number() + .optional() + .describe('Maximum number of tokens to generate in the response.'), + }, + async (args) => { + log(`>>> Generating with ${args.model} (${args.prompt.length} chars)...`); + writeStatus('generating', `Generating with ${args.model}`); + try { + const messages: Array<{ role: string; content: string }> = []; + if (args.system) { + messages.push({ role: 'system', content: args.system }); + } + messages.push({ role: 'user', content: args.prompt }); + + const body: Record = { + model: args.model, + messages, + stream: false, + }; + if (args.temperature !== undefined) body.temperature = args.temperature; + if (args.max_tokens !== undefined) body.max_tokens = args.max_tokens; + + const startedAt = Date.now(); + const res = await atomicFetch('/v1/chat/completions', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(body), + }); + + if (!res.ok) { + const errorText = await res.text(); + return { + content: [ + { + type: 'text' as const, + text: `Atomic Chat error (${res.status}): ${errorText}`, + }, + ], + isError: true, + }; + } + + const data = (await res.json()) as { + choices?: Array<{ message?: { content?: string } }>; + usage?: { + prompt_tokens?: number; + completion_tokens?: number; + total_tokens?: number; + }; + }; + + const response = data.choices?.[0]?.message?.content ?? ''; + const elapsedSec = ((Date.now() - startedAt) / 1000).toFixed(1); + const completionTokens = data.usage?.completion_tokens; + + const meta = `\n\n[${args.model} | ${elapsedSec}s${ + completionTokens !== undefined ? ` | ${completionTokens} tokens` : '' + }]`; + + log( + `<<< Done: ${args.model} | ${elapsedSec}s | ${ + completionTokens ?? '?' + } tokens | ${response.length} chars`, + ); + writeStatus( + 'done', + `${args.model} | ${elapsedSec}s | ${completionTokens ?? '?'} tokens`, + ); + + return { content: [{ type: 'text' as const, text: response + meta }] }; + } catch (err) { + return { + content: [ + { + type: 'text' as const, + text: `Failed to call Atomic Chat: ${err instanceof Error ? err.message : String(err)}`, + }, + ], + isError: true, + }; + } + }, +); + +const transport = new StdioServerTransport(); +await server.connect(transport); diff --git a/.claude/skills/add-codex/SKILL.md b/.claude/skills/add-codex/SKILL.md new file mode 100644 index 0000000..17910b7 --- /dev/null +++ b/.claude/skills/add-codex/SKILL.md @@ -0,0 +1,161 @@ +--- +name: add-codex +description: Use Codex (CLI + AppServer) as the full agent provider — planning, tool orchestration, native compaction, MCP tools, session resume — in place of the Claude Agent SDK. ChatGPT subscription or OPENAI_API_KEY. Per-group via agent_provider. Distinct from using OpenAI as an MCP tool (where Claude remains the planner). +--- + +# Codex agent provider + +NanoClaw runs agents in a long-lived **poll loop** inside the container. The backend is selected with **`AGENT_PROVIDER`** (`claude` | `opencode` | `codex` | `mock`). + +Trunk ships with only the `claude` provider baked in. This skill copies the Codex provider files in from the `providers` branch, wires them into the host and container barrels, updates the Dockerfile to install the Codex CLI, and rebuilds the image. + +The Codex provider runs `codex app-server` as a child process and speaks JSON-RPC over stdio. That gives it native session resume, streaming events, MCP tool access, and `thread/compact/start` compaction — same feature bar as the Claude Agent SDK, without the Anthropic-only lock-in. + +## Install + +### Pre-flight + +If all of the following are already present, skip to **Configuration**: + +- `src/providers/codex.ts` +- `container/agent-runner/src/providers/codex.ts` +- `container/agent-runner/src/providers/codex-app-server.ts` +- `container/agent-runner/src/providers/codex.factory.test.ts` +- `import './codex.js';` line in `src/providers/index.ts` +- `import './codex.js';` line in `container/agent-runner/src/providers/index.ts` +- `ARG CODEX_VERSION` and `"@openai/codex@${CODEX_VERSION}"` in the pnpm global-install block in `container/Dockerfile` + +Missing pieces — continue below. All steps are idempotent; re-running is safe. + +### 1. Fetch the providers branch + +```bash +git fetch origin providers +``` + +### 2. Copy the Codex source files + +Wholesale copies (owned entirely by this skill — user edits to these files won't survive a re-run, as designed): + +```bash +git show origin/providers:src/providers/codex.ts > src/providers/codex.ts +git show origin/providers:container/agent-runner/src/providers/codex.ts > container/agent-runner/src/providers/codex.ts +git show origin/providers:container/agent-runner/src/providers/codex-app-server.ts > container/agent-runner/src/providers/codex-app-server.ts +git show origin/providers:container/agent-runner/src/providers/codex.factory.test.ts > container/agent-runner/src/providers/codex.factory.test.ts +``` + +### 3. Append the self-registration imports + +Each barrel gets one line — alphabetical placement keeps diffs small. + +`src/providers/index.ts`: + +```typescript +import './codex.js'; +``` + +`container/agent-runner/src/providers/index.ts`: + +```typescript +import './codex.js'; +``` + +### 4. Add the Codex CLI to the container Dockerfile + +Two edits to `container/Dockerfile`, both idempotent (skip if already present): + +**(a)** In the "Pin CLI versions" ARG block (around line 18), add after `ARG CLAUDE_CODE_VERSION=...`: + +```dockerfile +ARG CODEX_VERSION=0.121.0 +``` + +**(b)** Add a new standalone `RUN` block for the Codex CLI, after the existing per-CLI install blocks (around line 106, right after the `@anthropic-ai/claude-code` block). The Dockerfile splits each global CLI into its own layer for cache granularity — keep that pattern; do not collapse them into a single combined `pnpm install -g` call: + +```dockerfile +RUN --mount=type=cache,target=/root/.cache/pnpm \ + pnpm install -g "@openai/codex@${CODEX_VERSION}" +``` + +Note: **no agent-runner package dependency** — Codex is a CLI binary, not a library. Unlike OpenCode, there's nothing to add to `container/agent-runner/package.json`. + +### 5. Build + +```bash +pnpm run build # host +pnpm exec tsc -p container/agent-runner/tsconfig.json --noEmit # container typecheck +./container/build.sh # agent image +``` + +## Configuration + +Codex supports two primary auth paths and one experimental BYO-endpoint path. Pick the one that matches your setup. + +### Option A — ChatGPT subscription (recommended for individuals) + +On the host (not inside the container), run Codex's OAuth login: + +```bash +codex login +``` + +This writes `~/.codex/auth.json` with a subscription token. The host-side Codex provider ([src/providers/codex.ts](../../../src/providers/codex.ts)) copies `auth.json` into a per-session `~/.codex` directory mounted into the container — your host's own Codex CLI is never touched. + +No `.env` variables required for this mode. + +### Option B — API key (recommended for CI or API billing) + +```env +OPENAI_API_KEY=sk-... +CODEX_MODEL=gpt-5.4-mini +``` + +The host forwards both variables into the container. If both subscription (`auth.json`) and `OPENAI_API_KEY` are present, Codex prefers the subscription. + +### Option C — BYO OpenAI-compatible endpoint (experimental) + +Codex's built-in `openai` provider honors the `OPENAI_BASE_URL` env var directly. Point it at any OpenAI-compatible endpoint — Groq, Together, self-hosted vLLM, an OpenAI proxy, etc. + +```env +OPENAI_API_KEY=... +OPENAI_BASE_URL=https://api.groq.com/openai/v1 +CODEX_MODEL=llama-3.3-70b-versatile +``` + +Codex also ships first-class local-runner flags — `codex --oss --local-provider ollama` or `--local-provider lmstudio` — that auto-detect a local server. To use those inside NanoClaw, set `CODEX_MODEL` to a model your local runner serves and add the corresponding base URL; see the Codex CLI docs for the full `model_provider = oss` configuration. + +**Experimental caveat:** tool-calling quality depends on the model and endpoint. Not every OpenAI-compat provider implements the full function-calling spec, and smaller models (< 30B) often struggle with multi-step tool orchestration. Test before committing. + +### Per group / per session + +Schema: **`agent_groups.agent_provider`** and **`sessions.agent_provider`**. Set to `codex` for groups or sessions that should use Codex. The container receives `AGENT_PROVIDER` from the resolved value (session overrides group). + +`CODEX_MODEL` applies process-wide via `.env`; if you need different models for different groups, set them via `container_config.env` on the group. + +Extra MCP servers still come from **`NANOCLAW_MCP_SERVERS`** / `container_config.mcpServers` on the host. The runner merges them into the same `mcpServers` object passed to all providers. + +## Operational notes + +- **Spawn-per-query:** Codex's app-server is spawned fresh per query invocation, matching the OpenCode pattern. No long-lived daemon to keep healthy across sessions. +- **Per-session `~/.codex` isolation:** each group gets its own copy of the host's `auth.json`. The container can rewrite `config.toml` freely on every wake without touching the host's Codex config. +- **Native compaction:** kicks in automatically at 40K cumulative input tokens between turns, via `thread/compact/start`. If compaction fails, the provider logs and continues uncompacted — no fatal error. +- **Approvals:** auto-accepted inside the container (the container is the sandbox; same posture as Claude/OpenCode). +- **Mid-turn input:** Codex turns don't accept mid-turn messages. Follow-up `push()` calls queue and drain between turns, matching the OpenCode pattern. The poll-loop only pushes between turns anyway, so no messages are dropped. +- **Stale thread recovery:** `isSessionInvalid` matches on stale-thread-ID errors (`thread not found`, `unknown thread`, etc.) so a cold-started app-server can recover cleanly when it sees a stored continuation it no longer has. + +## Verify + +```bash +grep -q "./codex.js" container/agent-runner/src/providers/index.ts && echo "container barrel: OK" +grep -q "./codex.js" src/providers/index.ts && echo "host barrel: OK" +grep -q "@openai/codex@" container/Dockerfile && echo "Dockerfile install: OK" +cd container/agent-runner && bun test src/providers/codex.factory.test.ts && cd - +``` + +After the image rebuild, set `agent_provider = 'codex'` on a test group and send a message. Successful round-trip looks like: + +- `init` event with a stable thread ID as continuation +- One or more `activity` / `progress` events during the turn +- `result` event with the model's reply + +If the agent hangs or errors, check `~/.codex/auth.json` exists on the host (Option A) or that `OPENAI_API_KEY` is forwarding correctly (Option B) — `docker exec` into a running container and `env | grep -i openai` to confirm. diff --git a/package.json b/package.json index 09053c4..77920c4 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "nanoclaw", - "version": "2.0.5", + "version": "2.0.7", "description": "Personal Claude assistant. Lightweight, secure, customizable.", "type": "module", "packageManager": "pnpm@10.33.0", diff --git a/src/container-runner.ts b/src/container-runner.ts index 71e2064..fca88c4 100644 --- a/src/container-runner.ts +++ b/src/container-runner.ts @@ -36,7 +36,13 @@ import { type ProviderContainerContribution, type VolumeMount, } from './providers/provider-container-registry.js'; -import { markContainerRunning, markContainerStopped, sessionDir, writeSessionRouting } from './session-manager.js'; +import { + heartbeatPath, + markContainerRunning, + markContainerStopped, + sessionDir, + writeSessionRouting, +} from './session-manager.js'; import type { AgentGroup, Session } from './types.js'; const onecli = new OneCLI({ url: ONECLI_URL, apiKey: ONECLI_API_KEY }); @@ -131,6 +137,12 @@ async function spawnContainer(session: Session): Promise { log.info('Spawning container', { sessionId: session.id, agentGroup: agentGroup.name, containerName }); + // Clear any orphan heartbeat from a previous container instance — the + // sweep's ceiling check treats a missing file as "fresh spawn, give grace" + // (host-sweep.ts line 87). Without this, the stale mtime can trigger an + // immediate kill before the new container touches the file itself. + fs.rmSync(heartbeatPath(agentGroup.id, session.id), { force: true }); + const container = spawn(CONTAINER_RUNTIME_BIN, args, { stdio: ['ignore', 'pipe', 'pipe'] }); activeContainers.set(session.id, { process: container, containerName }); diff --git a/src/db/session-db.ts b/src/db/session-db.ts index aea255d..48e9297 100644 --- a/src/db/session-db.ts +++ b/src/db/session-db.ts @@ -139,10 +139,10 @@ export function getMessageForRetry( db: Database.Database, messageId: string, status: string, -): { id: string; tries: number } | undefined { - return db.prepare('SELECT id, tries FROM messages_in WHERE id = ? AND status = ?').get(messageId, status) as - | { id: string; tries: number } - | undefined; +): { id: string; tries: number; processAfter: string | null } | undefined { + return db + .prepare('SELECT id, tries, process_after as processAfter FROM messages_in WHERE id = ? AND status = ?') + .get(messageId, status) as { id: string; tries: number; processAfter: string | null } | undefined; } export function syncProcessingAcks(inDb: Database.Database, outDb: Database.Database): void { diff --git a/src/host-sweep.ts b/src/host-sweep.ts index 1a2901c..4dc2fb7 100644 --- a/src/host-sweep.ts +++ b/src/host-sweep.ts @@ -159,23 +159,31 @@ async function sweepSession(session: Session): Promise { syncProcessingAcks(inDb, outDb); } - const alive = isContainerRunning(session.id); - - // 2. Crashed-container cleanup: processing rows left behind get retried. - if (!alive && outDb) { - resetStuckProcessingRows(inDb, outDb, session, 'container not running'); + // 2. Wake a container if work is due and nothing is running. Ordered + // before the crashed-container cleanup so a fresh container gets a chance + // to clean its own orphan processing_ack rows on startup (see + // container/agent-runner/src/db/connection.ts). Otherwise the reset path + // would keep bumping process_after into the future, dueCount would stay 0, + // and the wake would never fire. + const dueCount = countDueMessages(inDb); + if (dueCount > 0 && !isContainerRunning(session.id)) { + log.info('Waking container for due messages', { sessionId: session.id, count: dueCount }); + await wakeContainer(session); } + const alive = isContainerRunning(session.id); + // 3. Running-container SLA: absolute ceiling + per-claim stuck rules. if (alive && outDb) { enforceRunningContainerSla(inDb, outDb, session, agentGroup.id); } - // 4. Wake a container if new work is due and nothing is running. - const dueCount = countDueMessages(inDb); - if (dueCount > 0 && !isContainerRunning(session.id)) { - log.info('Waking container for due messages', { sessionId: session.id, count: dueCount }); - await wakeContainer(session); + // 4. Crashed-container cleanup: processing rows left behind get retried. + // Only fires when wake in step 2 didn't pick up the work (no due messages, + // or wake failed). resetStuckProcessingRows itself is idempotent — it + // skips messages already scheduled for a future retry. + if (!alive && outDb) { + resetStuckProcessingRows(inDb, outDb, session, 'container not running'); } // 5. Recurrence fanout for completed recurring tasks. @@ -246,10 +254,16 @@ function resetStuckProcessingRows( reason: string, ): void { const claims = getProcessingClaims(outDb); + const now = Date.now(); for (const { message_id } of claims) { const msg = getMessageForRetry(inDb, message_id, 'pending'); if (!msg) continue; + // Already rescheduled for a future retry — don't bump tries again. The + // wake path (sweep step 2) will fire when process_after elapses and a + // fresh container will clean the orphan claim on startup. + if (msg.processAfter && Date.parse(msg.processAfter) > now) continue; + if (msg.tries >= MAX_TRIES) { markMessageFailed(inDb, msg.id); log.warn('Message marked as failed after max retries', {