Merge branch 'main' into fix/setup-v2-registered-groups

2026-04-23 22:20:31 +03:00
parent 539af750d4 e5a7a33084
commit f3524a33bb
7 changed files with 675 additions and 16 deletions
--- a/.claude/skills/add-atomic-chat-tool/SKILL.md
+++ b/.claude/skills/add-atomic-chat-tool/SKILL.md
@@ -0,0 +1,243 @@
+---
+name: add-atomic-chat-tool
+description: Add Atomic Chat MCP server so the container agent can call local models served by the Atomic Chat desktop app via its OpenAI-compatible API.
+---
+
+# Add Atomic Chat Integration
+
+This skill adds a stdio-based MCP server that exposes models running in the local [Atomic Chat](https://github.com/AtomicBot-ai/Atomic-Chat) desktop app as tools for the container agent. Claude remains the orchestrator but can offload work to local models served by Atomic Chat on `http://127.0.0.1:1337/v1` (OpenAI-compatible).
+
+Tools exposed:
+- `atomic_chat_list_models` — list models currently available in Atomic Chat (`GET /v1/models`)
+- `atomic_chat_generate` — send a prompt to a specified model and return the response (`POST /v1/chat/completions`)
+
+Model management (download, delete) is done through the **Atomic Chat desktop UI** — the app is a fork of Jan and manages its own model library.
+
+The skill ships the MCP server source in this folder and copies it into the agent-runner tree at install time, then wires it up with small edits to `index.ts`, `providers/claude.ts`, and `container-runner.ts`. No branch merge — all edits are additive and idempotent.
+
+## Phase 1: Pre-flight
+
+### Check if already applied
+
+Check if `container/agent-runner/src/atomic-chat-mcp-stdio.ts` exists. If it does, skip to Phase 3 (Configure).
+
+### Check prerequisites
+
+Verify Atomic Chat is installed and its local API server is running. On the host:
+
+```bash
+curl -s http://127.0.0.1:1337/v1/models | head
+```
+
+If the request fails:
+
+1. Install Atomic Chat from the [latest release](https://github.com/AtomicBot-ai/Atomic-Chat/releases) (macOS only for now — `atomic-chat.dmg`).
+2. Open the app.
+3. Open **Settings → Local API Server** and make sure it's enabled on port `1337`.
+4. Go to the **Hub** (or **Models**) tab and download at least one model (e.g. Llama 3.2 3B, Qwen 2.5 Coder 7B).
+5. Load the model once by sending any message in Atomic Chat's UI to warm it up.
+
+## Phase 2: Apply Code Changes
+
+### Copy the MCP server source
+
+```bash
+cp .claude/skills/add-atomic-chat-tool/atomic-chat-mcp-stdio.ts container/agent-runner/src/atomic-chat-mcp-stdio.ts
+```
+
+### Register the MCP server in the agent-runner
+
+Edit `container/agent-runner/src/index.ts`. Find the `mcpServers` object that currently looks like this:
+
+```ts
+  const mcpServers: Record<string, { command: string; args: string[]; env: Record<string, string> }> = {
+    nanoclaw: {
+      command: 'bun',
+      args: ['run', mcpServerPath],
+      env: {},
+    },
+  };
+```
+
+Add an `atomic_chat` entry alongside `nanoclaw`:
+
+```ts
+  const mcpServers: Record<string, { command: string; args: string[]; env: Record<string, string> }> = {
+    nanoclaw: {
+      command: 'bun',
+      args: ['run', mcpServerPath],
+      env: {},
+    },
+    atomic_chat: {
+      command: 'bun',
+      args: ['run', path.join(__dirname, 'atomic-chat-mcp-stdio.ts')],
+      env: {
+        ...(process.env.ATOMIC_CHAT_HOST ? { ATOMIC_CHAT_HOST: process.env.ATOMIC_CHAT_HOST } : {}),
+        ...(process.env.ATOMIC_CHAT_API_KEY ? { ATOMIC_CHAT_API_KEY: process.env.ATOMIC_CHAT_API_KEY } : {}),
+      },
+    },
+  };
+```
+
+### Add the tool glob to the allowlist
+
+Edit `container/agent-runner/src/providers/claude.ts`. Find `'mcp__nanoclaw__*',` in the `TOOL_ALLOWLIST` array and add `'mcp__atomic_chat__*',` on the following line:
+
+```ts
+  'mcp__nanoclaw__*',
+  'mcp__atomic_chat__*',
+];
+```
+
+### Forward host env vars into the container
+
+Edit `src/container-runner.ts` in `buildContainerArgs`. Find the `TZ` env line:
+
+```ts
+  args.push('-e', `TZ=${TIMEZONE}`);
+```
+
+Add ATOMIC_CHAT forwarding right after it:
+
+```ts
+  args.push('-e', `TZ=${TIMEZONE}`);
+
+  // Atomic Chat MCP tool: forward host overrides if set (default is host.docker.internal:1337).
+  if (process.env.ATOMIC_CHAT_HOST) {
+    args.push('-e', `ATOMIC_CHAT_HOST=${process.env.ATOMIC_CHAT_HOST}`);
+  }
+  if (process.env.ATOMIC_CHAT_API_KEY) {
+    args.push('-e', `ATOMIC_CHAT_API_KEY=${process.env.ATOMIC_CHAT_API_KEY}`);
+  }
+```
+
+### Surface `[ATOMIC]` log lines at info level
+
+In the same file, find the stderr logger:
+
+```ts
+  container.stderr?.on('data', (data) => {
+    for (const line of data.toString().trim().split('\n')) {
+      if (line) log.debug(line, { container: agentGroup.folder });
+    }
+  });
+```
+
+Replace it with:
+
+```ts
+  container.stderr?.on('data', (data) => {
+    for (const line of data.toString().trim().split('\n')) {
+      if (!line) continue;
+      if (line.includes('[ATOMIC]')) {
+        log.info(line, { container: agentGroup.folder });
+      } else {
+        log.debug(line, { container: agentGroup.folder });
+      }
+    }
+  });
+```
+
+### Add env-var stubs to `.env.example`
+
+Append to `.env.example`:
+
+```bash
+# Atomic Chat MCP tool (.claude/skills/add-atomic-chat-tool)
+# Override the host where Atomic Chat exposes its OpenAI-compatible API.
+# Default: http://host.docker.internal:1337 (with fallback to localhost)
+# ATOMIC_CHAT_HOST=http://host.docker.internal:1337
+
+# Optional API key. Leave unset for a local Atomic Chat install — it does not require auth.
+# ATOMIC_CHAT_API_KEY=
+```
+
+### Validate code changes
+
+```bash
+pnpm run build
+pnpm exec tsc -p container/agent-runner/tsconfig.json --noEmit
+./container/build.sh
+```
+
+All three must be clean before proceeding.
+
+## Phase 3: Configure
+
+### Set Atomic Chat host (optional)
+
+By default, the MCP server connects to `http://host.docker.internal:1337` (Docker Desktop) with a fallback to `localhost`. To use a custom host, add to `.env`:
+
+```bash
+ATOMIC_CHAT_HOST=http://your-atomic-chat-host:1337
+```
+
+### Set API key (optional)
+
+Atomic Chat does **not require authentication** when running locally — leave this unset. Only set it if you've put Atomic Chat behind a reverse proxy that enforces auth:
+
+```bash
+ATOMIC_CHAT_API_KEY=sk-...
+```
+
+### Restart the service
+
+```bash
+launchctl kickstart -k gui/$(id -u)/com.nanoclaw  # macOS
+# Linux: systemctl --user restart nanoclaw
+```
+
+## Phase 4: Verify
+
+### Test inference
+
+Tell the user:
+
+> Send a message like: "use atomic chat to tell me the capital of France"
+>
+> The agent should use `atomic_chat_list_models` to find available models, then `atomic_chat_generate` to get a response.
+
+### Check logs if needed
+
+```bash
+tail -f logs/nanoclaw.log | grep -i atomic
+```
+
+Look for:
+- `[ATOMIC] Listing models...` — list request started
+- `[ATOMIC] Found N models` — models discovered
+- `[ATOMIC] >>> Generating with <model>` — generation started
+- `[ATOMIC] <<< Done: <model> | Xs | N tokens | M chars` — generation completed
+
+## Troubleshooting
+
+### Agent says "Atomic Chat is not installed" or tries to run a CLI
+
+The agent is looking for a CLI that doesn't exist instead of using the MCP tools. This means:
+1. The MCP server wasn't copied — check `container/agent-runner/src/atomic-chat-mcp-stdio.ts` exists
+2. The MCP server wasn't registered — check `container/agent-runner/src/index.ts` has the `atomic_chat` entry in `mcpServers`
+3. The allowlist wasn't updated — check `container/agent-runner/src/providers/claude.ts` includes `mcp__atomic_chat__*` in `TOOL_ALLOWLIST`
+4. The container wasn't rebuilt — run `./container/build.sh`
+
+### "Failed to connect to Atomic Chat"
+
+1. Verify the host API is reachable: `curl http://127.0.0.1:1337/v1/models`
+2. Confirm the Local API Server is enabled in Atomic Chat's settings
+3. Check Docker can reach the host: `docker run --rm curlimages/curl curl -s http://host.docker.internal:1337/v1/models`
+4. If using a custom host, check `ATOMIC_CHAT_HOST` in `.env`
+
+### `model not found` / 404 on generate
+
+The model ID passed to `atomic_chat_generate` must exactly match one of the IDs returned by `atomic_chat_list_models`. Ask the agent to list models first, then pick one from that list.
+
+### Slow first response
+
+Atomic Chat lazy-loads models into memory on first use. The initial call may take longer while the model warms up. Subsequent calls against the same model are fast.
+
+### Agent doesn't use Atomic Chat tools
+
+The agent may not know about the tools. Try being explicit: "use the atomic_chat_generate tool with llama3.2-3b-instruct to answer: ..."
+
+### Context window or output size issues
+
+Atomic Chat respects each model's native context length. If you hit limits, pass `max_tokens` explicitly when calling `atomic_chat_generate`, or switch to a model with a larger context window in the Atomic Chat UI.
--- a/.claude/skills/add-atomic-chat-tool/atomic-chat-mcp-stdio.ts
+++ b/.claude/skills/add-atomic-chat-tool/atomic-chat-mcp-stdio.ts
@@ -0,0 +1,229 @@
+/**
+ * Atomic Chat MCP Server for NanoClaw
+ * Exposes local Atomic Chat models (OpenAI-compatible, /v1) as tools for the container agent.
+ * Uses host.docker.internal to reach the host's Atomic Chat desktop app from Docker.
+ */
+
+import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
+import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
+import { z } from 'zod';
+
+import fs from 'fs';
+import path from 'path';
+
+const ATOMIC_CHAT_HOST =
+  process.env.ATOMIC_CHAT_HOST || 'http://host.docker.internal:1337';
+const ATOMIC_CHAT_API_KEY = process.env.ATOMIC_CHAT_API_KEY || '';
+const ATOMIC_CHAT_STATUS_FILE = '/workspace/ipc/atomic_chat_status.json';
+
+function log(msg: string): void {
+  console.error(`[ATOMIC] ${msg}`);
+}
+
+function writeStatus(status: string, detail?: string): void {
+  try {
+    const data = { status, detail, timestamp: new Date().toISOString() };
+    const tmpPath = `${ATOMIC_CHAT_STATUS_FILE}.tmp`;
+    fs.mkdirSync(path.dirname(ATOMIC_CHAT_STATUS_FILE), { recursive: true });
+    fs.writeFileSync(tmpPath, JSON.stringify(data));
+    fs.renameSync(tmpPath, ATOMIC_CHAT_STATUS_FILE);
+  } catch {
+    /* best-effort */
+  }
+}
+
+async function atomicFetch(
+  apiPath: string,
+  options?: RequestInit,
+): Promise<Response> {
+  const url = `${ATOMIC_CHAT_HOST}${apiPath}`;
+  const headers: Record<string, string> = {
+    ...((options?.headers as Record<string, string>) || {}),
+  };
+  if (ATOMIC_CHAT_API_KEY) {
+    headers.Authorization = `Bearer ${ATOMIC_CHAT_API_KEY}`;
+  }
+  const finalOptions: RequestInit = { ...options, headers };
+  try {
+    return await fetch(url, finalOptions);
+  } catch (err) {
+    // Fallback to localhost if host.docker.internal fails
+    if (ATOMIC_CHAT_HOST.includes('host.docker.internal')) {
+      const fallbackUrl = url.replace('host.docker.internal', 'localhost');
+      return await fetch(fallbackUrl, finalOptions);
+    }
+    throw err;
+  }
+}
+
+const server = new McpServer({
+  name: 'atomic_chat',
+  version: '1.0.0',
+});
+
+server.tool(
+  'atomic_chat_list_models',
+  'List all models available in the local Atomic Chat desktop app. Use this to see which models are loaded before calling atomic_chat_generate.',
+  {},
+  async () => {
+    log('Listing models...');
+    writeStatus('listing', 'Listing available models');
+    try {
+      const res = await atomicFetch('/v1/models');
+      if (!res.ok) {
+        return {
+          content: [
+            {
+              type: 'text' as const,
+              text: `Atomic Chat API error: ${res.status} ${res.statusText}`,
+            },
+          ],
+          isError: true,
+        };
+      }
+
+      const data = (await res.json()) as {
+        data?: Array<{ id: string; owned_by?: string }>;
+      };
+      const models = data.data || [];
+
+      if (models.length === 0) {
+        return {
+          content: [
+            {
+              type: 'text' as const,
+              text: 'No models available. Open Atomic Chat on the host and download a model from the Hub.',
+            },
+          ],
+        };
+      }
+
+      const list = models
+        .map((m) => `- ${m.id}${m.owned_by ? ` (${m.owned_by})` : ''}`)
+        .join('\n');
+
+      log(`Found ${models.length} models`);
+      return {
+        content: [
+          { type: 'text' as const, text: `Available models:\n${list}` },
+        ],
+      };
+    } catch (err) {
+      return {
+        content: [
+          {
+            type: 'text' as const,
+            text: `Failed to connect to Atomic Chat at ${ATOMIC_CHAT_HOST}: ${err instanceof Error ? err.message : String(err)}`,
+          },
+        ],
+        isError: true,
+      };
+    }
+  },
+);
+
+server.tool(
+  'atomic_chat_generate',
+  'Send a prompt to a local Atomic Chat model and get a response. Good for cheaper/faster tasks like summarization, translation, or general queries. Use atomic_chat_list_models first to see available models.',
+  {
+    model: z
+      .string()
+      .describe(
+        'The model ID as returned by atomic_chat_list_models (e.g. "llama3.2-3b-instruct")',
+      ),
+    prompt: z.string().describe('The prompt to send to the model'),
+    system: z
+      .string()
+      .optional()
+      .describe('Optional system prompt to set model behavior'),
+    temperature: z
+      .number()
+      .optional()
+      .describe('Sampling temperature (0.0–2.0). Defaults to model default.'),
+    max_tokens: z
+      .number()
+      .optional()
+      .describe('Maximum number of tokens to generate in the response.'),
+  },
+  async (args) => {
+    log(`>>> Generating with ${args.model} (${args.prompt.length} chars)...`);
+    writeStatus('generating', `Generating with ${args.model}`);
+    try {
+      const messages: Array<{ role: string; content: string }> = [];
+      if (args.system) {
+        messages.push({ role: 'system', content: args.system });
+      }
+      messages.push({ role: 'user', content: args.prompt });
+
+      const body: Record<string, unknown> = {
+        model: args.model,
+        messages,
+        stream: false,
+      };
+      if (args.temperature !== undefined) body.temperature = args.temperature;
+      if (args.max_tokens !== undefined) body.max_tokens = args.max_tokens;
+
+      const startedAt = Date.now();
+      const res = await atomicFetch('/v1/chat/completions', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify(body),
+      });
+
+      if (!res.ok) {
+        const errorText = await res.text();
+        return {
+          content: [
+            {
+              type: 'text' as const,
+              text: `Atomic Chat error (${res.status}): ${errorText}`,
+            },
+          ],
+          isError: true,
+        };
+      }
+
+      const data = (await res.json()) as {
+        choices?: Array<{ message?: { content?: string } }>;
+        usage?: {
+          prompt_tokens?: number;
+          completion_tokens?: number;
+          total_tokens?: number;
+        };
+      };
+
+      const response = data.choices?.[0]?.message?.content ?? '';
+      const elapsedSec = ((Date.now() - startedAt) / 1000).toFixed(1);
+      const completionTokens = data.usage?.completion_tokens;
+
+      const meta = `\n\n[${args.model} | ${elapsedSec}s${
+        completionTokens !== undefined ? ` | ${completionTokens} tokens` : ''
+      }]`;
+
+      log(
+        `<<< Done: ${args.model} | ${elapsedSec}s | ${
+          completionTokens ?? '?'
+        } tokens | ${response.length} chars`,
+      );
+      writeStatus(
+        'done',
+        `${args.model} | ${elapsedSec}s | ${completionTokens ?? '?'} tokens`,
+      );
+
+      return { content: [{ type: 'text' as const, text: response + meta }] };
+    } catch (err) {
+      return {
+        content: [
+          {
+            type: 'text' as const,
+            text: `Failed to call Atomic Chat: ${err instanceof Error ? err.message : String(err)}`,
+          },
+        ],
+        isError: true,
+      };
+    }
+  },
+);
+
+const transport = new StdioServerTransport();
+await server.connect(transport);
--- a/.claude/skills/add-codex/SKILL.md
+++ b/.claude/skills/add-codex/SKILL.md
@@ -0,0 +1,161 @@
+---
+name: add-codex
+description: Use Codex (CLI + AppServer) as the full agent provider — planning, tool orchestration, native compaction, MCP tools, session resume — in place of the Claude Agent SDK. ChatGPT subscription or OPENAI_API_KEY. Per-group via agent_provider. Distinct from using OpenAI as an MCP tool (where Claude remains the planner).
+---
+
+# Codex agent provider
+
+NanoClaw runs agents in a long-lived **poll loop** inside the container. The backend is selected with **`AGENT_PROVIDER`** (`claude` | `opencode` | `codex` | `mock`).
+
+Trunk ships with only the `claude` provider baked in. This skill copies the Codex provider files in from the `providers` branch, wires them into the host and container barrels, updates the Dockerfile to install the Codex CLI, and rebuilds the image.
+
+The Codex provider runs `codex app-server` as a child process and speaks JSON-RPC over stdio. That gives it native session resume, streaming events, MCP tool access, and `thread/compact/start` compaction — same feature bar as the Claude Agent SDK, without the Anthropic-only lock-in.
+
+## Install
+
+### Pre-flight
+
+If all of the following are already present, skip to **Configuration**:
+
+- `src/providers/codex.ts`
+- `container/agent-runner/src/providers/codex.ts`
+- `container/agent-runner/src/providers/codex-app-server.ts`
+- `container/agent-runner/src/providers/codex.factory.test.ts`
+- `import './codex.js';` line in `src/providers/index.ts`
+- `import './codex.js';` line in `container/agent-runner/src/providers/index.ts`
+- `ARG CODEX_VERSION` and `"@openai/codex@${CODEX_VERSION}"` in the pnpm global-install block in `container/Dockerfile`
+
+Missing pieces — continue below. All steps are idempotent; re-running is safe.
+
+### 1. Fetch the providers branch
+
+```bash
+git fetch origin providers
+```
+
+### 2. Copy the Codex source files
+
+Wholesale copies (owned entirely by this skill — user edits to these files won't survive a re-run, as designed):
+
+```bash
+git show origin/providers:src/providers/codex.ts                                      > src/providers/codex.ts
+git show origin/providers:container/agent-runner/src/providers/codex.ts               > container/agent-runner/src/providers/codex.ts
+git show origin/providers:container/agent-runner/src/providers/codex-app-server.ts    > container/agent-runner/src/providers/codex-app-server.ts
+git show origin/providers:container/agent-runner/src/providers/codex.factory.test.ts  > container/agent-runner/src/providers/codex.factory.test.ts
+```
+
+### 3. Append the self-registration imports
+
+Each barrel gets one line — alphabetical placement keeps diffs small.
+
+`src/providers/index.ts`:
+
+```typescript
+import './codex.js';
+```
+
+`container/agent-runner/src/providers/index.ts`:
+
+```typescript
+import './codex.js';
+```
+
+### 4. Add the Codex CLI to the container Dockerfile
+
+Two edits to `container/Dockerfile`, both idempotent (skip if already present):
+
+**(a)** In the "Pin CLI versions" ARG block (around line 18), add after `ARG CLAUDE_CODE_VERSION=...`:
+
+```dockerfile
+ARG CODEX_VERSION=0.121.0
+```
+
+**(b)** Add a new standalone `RUN` block for the Codex CLI, after the existing per-CLI install blocks (around line 106, right after the `@anthropic-ai/claude-code` block). The Dockerfile splits each global CLI into its own layer for cache granularity — keep that pattern; do not collapse them into a single combined `pnpm install -g` call:
+
+```dockerfile
+RUN --mount=type=cache,target=/root/.cache/pnpm \
+    pnpm install -g "@openai/codex@${CODEX_VERSION}"
+```
+
+Note: **no agent-runner package dependency** — Codex is a CLI binary, not a library. Unlike OpenCode, there's nothing to add to `container/agent-runner/package.json`.
+
+### 5. Build
+
+```bash
+pnpm run build                                         # host
+pnpm exec tsc -p container/agent-runner/tsconfig.json --noEmit   # container typecheck
+./container/build.sh                                   # agent image
+```
+
+## Configuration
+
+Codex supports two primary auth paths and one experimental BYO-endpoint path. Pick the one that matches your setup.
+
+### Option A — ChatGPT subscription (recommended for individuals)
+
+On the host (not inside the container), run Codex's OAuth login:
+
+```bash
+codex login
+```
+
+This writes `~/.codex/auth.json` with a subscription token. The host-side Codex provider ([src/providers/codex.ts](../../../src/providers/codex.ts)) copies `auth.json` into a per-session `~/.codex` directory mounted into the container — your host's own Codex CLI is never touched.
+
+No `.env` variables required for this mode.
+
+### Option B — API key (recommended for CI or API billing)
+
+```env
+OPENAI_API_KEY=sk-...
+CODEX_MODEL=gpt-5.4-mini
+```
+
+The host forwards both variables into the container. If both subscription (`auth.json`) and `OPENAI_API_KEY` are present, Codex prefers the subscription.
+
+### Option C — BYO OpenAI-compatible endpoint (experimental)
+
+Codex's built-in `openai` provider honors the `OPENAI_BASE_URL` env var directly. Point it at any OpenAI-compatible endpoint — Groq, Together, self-hosted vLLM, an OpenAI proxy, etc.
+
+```env
+OPENAI_API_KEY=...
+OPENAI_BASE_URL=https://api.groq.com/openai/v1
+CODEX_MODEL=llama-3.3-70b-versatile
+```
+
+Codex also ships first-class local-runner flags — `codex --oss --local-provider ollama` or `--local-provider lmstudio` — that auto-detect a local server. To use those inside NanoClaw, set `CODEX_MODEL` to a model your local runner serves and add the corresponding base URL; see the Codex CLI docs for the full `model_provider = oss` configuration.
+
+**Experimental caveat:** tool-calling quality depends on the model and endpoint. Not every OpenAI-compat provider implements the full function-calling spec, and smaller models (< 30B) often struggle with multi-step tool orchestration. Test before committing.
+
+### Per group / per session
+
+Schema: **`agent_groups.agent_provider`** and **`sessions.agent_provider`**. Set to `codex` for groups or sessions that should use Codex. The container receives `AGENT_PROVIDER` from the resolved value (session overrides group).
+
+`CODEX_MODEL` applies process-wide via `.env`; if you need different models for different groups, set them via `container_config.env` on the group.
+
+Extra MCP servers still come from **`NANOCLAW_MCP_SERVERS`** / `container_config.mcpServers` on the host. The runner merges them into the same `mcpServers` object passed to all providers.
+
+## Operational notes
+
+- **Spawn-per-query:** Codex's app-server is spawned fresh per query invocation, matching the OpenCode pattern. No long-lived daemon to keep healthy across sessions.
+- **Per-session `~/.codex` isolation:** each group gets its own copy of the host's `auth.json`. The container can rewrite `config.toml` freely on every wake without touching the host's Codex config.
+- **Native compaction:** kicks in automatically at 40K cumulative input tokens between turns, via `thread/compact/start`. If compaction fails, the provider logs and continues uncompacted — no fatal error.
+- **Approvals:** auto-accepted inside the container (the container is the sandbox; same posture as Claude/OpenCode).
+- **Mid-turn input:** Codex turns don't accept mid-turn messages. Follow-up `push()` calls queue and drain between turns, matching the OpenCode pattern. The poll-loop only pushes between turns anyway, so no messages are dropped.
+- **Stale thread recovery:** `isSessionInvalid` matches on stale-thread-ID errors (`thread not found`, `unknown thread`, etc.) so a cold-started app-server can recover cleanly when it sees a stored continuation it no longer has.
+
+## Verify
+
+```bash
+grep -q "./codex.js" container/agent-runner/src/providers/index.ts && echo "container barrel: OK"
+grep -q "./codex.js" src/providers/index.ts && echo "host barrel: OK"
+grep -q "@openai/codex@" container/Dockerfile && echo "Dockerfile install: OK"
+cd container/agent-runner && bun test src/providers/codex.factory.test.ts && cd -
+```
+
+After the image rebuild, set `agent_provider = 'codex'` on a test group and send a message. Successful round-trip looks like:
+
+- `init` event with a stable thread ID as continuation
+- One or more `activity` / `progress` events during the turn
+- `result` event with the model's reply
+
+If the agent hangs or errors, check `~/.codex/auth.json` exists on the host (Option A) or that `OPENAI_API_KEY` is forwarding correctly (Option B) — `docker exec` into a running container and `env | grep -i openai` to confirm.
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "nanoclaw",
-  "version": "2.0.5",
+  "version": "2.0.7",
  "description": "Personal Claude assistant. Lightweight, secure, customizable.",
  "type": "module",
  "packageManager": "pnpm@10.33.0",
--- a/src/container-runner.ts
+++ b/src/container-runner.ts
@@ -36,7 +36,13 @@ import {
  type ProviderContainerContribution,
  type VolumeMount,
 } from './providers/provider-container-registry.js';
-import { markContainerRunning, markContainerStopped, sessionDir, writeSessionRouting } from './session-manager.js';
+import {
+  heartbeatPath,
+  markContainerRunning,
+  markContainerStopped,
+  sessionDir,
+  writeSessionRouting,
+} from './session-manager.js';
 import type { AgentGroup, Session } from './types.js';

 const onecli = new OneCLI({ url: ONECLI_URL, apiKey: ONECLI_API_KEY });
@@ -131,6 +137,12 @@ async function spawnContainer(session: Session): Promise<void> {

  log.info('Spawning container', { sessionId: session.id, agentGroup: agentGroup.name, containerName });

+  // Clear any orphan heartbeat from a previous container instance — the
+  // sweep's ceiling check treats a missing file as "fresh spawn, give grace"
+  // (host-sweep.ts line 87). Without this, the stale mtime can trigger an
+  // immediate kill before the new container touches the file itself.
+  fs.rmSync(heartbeatPath(agentGroup.id, session.id), { force: true });
+
  const container = spawn(CONTAINER_RUNTIME_BIN, args, { stdio: ['ignore', 'pipe', 'pipe'] });

  activeContainers.set(session.id, { process: container, containerName });
--- a/src/db/session-db.ts
+++ b/src/db/session-db.ts
@@ -139,10 +139,10 @@ export function getMessageForRetry(
  db: Database.Database,
  messageId: string,
  status: string,
-): { id: string; tries: number } | undefined {
-  return db.prepare('SELECT id, tries FROM messages_in WHERE id = ? AND status = ?').get(messageId, status) as
-    | { id: string; tries: number }
-    | undefined;
+): { id: string; tries: number; processAfter: string | null } | undefined {
+  return db
+    .prepare('SELECT id, tries, process_after as processAfter FROM messages_in WHERE id = ? AND status = ?')
+    .get(messageId, status) as { id: string; tries: number; processAfter: string | null } | undefined;
 }

 export function syncProcessingAcks(inDb: Database.Database, outDb: Database.Database): void {
--- a/src/host-sweep.ts
+++ b/src/host-sweep.ts
@@ -159,23 +159,31 @@ async function sweepSession(session: Session): Promise<void> {
      syncProcessingAcks(inDb, outDb);
    }

-    const alive = isContainerRunning(session.id);
-
-    // 2. Crashed-container cleanup: processing rows left behind get retried.
-    if (!alive && outDb) {
-      resetStuckProcessingRows(inDb, outDb, session, 'container not running');
+    // 2. Wake a container if work is due and nothing is running. Ordered
+    // before the crashed-container cleanup so a fresh container gets a chance
+    // to clean its own orphan processing_ack rows on startup (see
+    // container/agent-runner/src/db/connection.ts). Otherwise the reset path
+    // would keep bumping process_after into the future, dueCount would stay 0,
+    // and the wake would never fire.
+    const dueCount = countDueMessages(inDb);
+    if (dueCount > 0 && !isContainerRunning(session.id)) {
+      log.info('Waking container for due messages', { sessionId: session.id, count: dueCount });
+      await wakeContainer(session);
    }

+    const alive = isContainerRunning(session.id);
+
    // 3. Running-container SLA: absolute ceiling + per-claim stuck rules.
    if (alive && outDb) {
      enforceRunningContainerSla(inDb, outDb, session, agentGroup.id);
    }

-    // 4. Wake a container if new work is due and nothing is running.
-    const dueCount = countDueMessages(inDb);
-    if (dueCount > 0 && !isContainerRunning(session.id)) {
-      log.info('Waking container for due messages', { sessionId: session.id, count: dueCount });
-      await wakeContainer(session);
+    // 4. Crashed-container cleanup: processing rows left behind get retried.
+    // Only fires when wake in step 2 didn't pick up the work (no due messages,
+    // or wake failed). resetStuckProcessingRows itself is idempotent — it
+    // skips messages already scheduled for a future retry.
+    if (!alive && outDb) {
+      resetStuckProcessingRows(inDb, outDb, session, 'container not running');
    }

    // 5. Recurrence fanout for completed recurring tasks.
@@ -246,10 +254,16 @@ function resetStuckProcessingRows(
  reason: string,
 ): void {
  const claims = getProcessingClaims(outDb);
+  const now = Date.now();
  for (const { message_id } of claims) {
    const msg = getMessageForRetry(inDb, message_id, 'pending');
    if (!msg) continue;

+    // Already rescheduled for a future retry — don't bump tries again. The
+    // wake path (sweep step 2) will fire when process_after elapses and a
+    // fresh container will clean the orphan claim on startup.
+    if (msg.processAfter && Date.parse(msg.processAfter) > now) continue;
+
    if (msg.tries >= MAX_TRIES) {
      markMessageFailed(inDb, msg.id);
      log.warn('Message marked as failed after max retries', {