diff --git a/.claude/skills/add-codex/SKILL.md b/.claude/skills/add-codex/SKILL.md new file mode 100644 index 0000000..17910b7 --- /dev/null +++ b/.claude/skills/add-codex/SKILL.md @@ -0,0 +1,161 @@ +--- +name: add-codex +description: Use Codex (CLI + AppServer) as the full agent provider — planning, tool orchestration, native compaction, MCP tools, session resume — in place of the Claude Agent SDK. ChatGPT subscription or OPENAI_API_KEY. Per-group via agent_provider. Distinct from using OpenAI as an MCP tool (where Claude remains the planner). +--- + +# Codex agent provider + +NanoClaw runs agents in a long-lived **poll loop** inside the container. The backend is selected with **`AGENT_PROVIDER`** (`claude` | `opencode` | `codex` | `mock`). + +Trunk ships with only the `claude` provider baked in. This skill copies the Codex provider files in from the `providers` branch, wires them into the host and container barrels, updates the Dockerfile to install the Codex CLI, and rebuilds the image. + +The Codex provider runs `codex app-server` as a child process and speaks JSON-RPC over stdio. That gives it native session resume, streaming events, MCP tool access, and `thread/compact/start` compaction — same feature bar as the Claude Agent SDK, without the Anthropic-only lock-in. + +## Install + +### Pre-flight + +If all of the following are already present, skip to **Configuration**: + +- `src/providers/codex.ts` +- `container/agent-runner/src/providers/codex.ts` +- `container/agent-runner/src/providers/codex-app-server.ts` +- `container/agent-runner/src/providers/codex.factory.test.ts` +- `import './codex.js';` line in `src/providers/index.ts` +- `import './codex.js';` line in `container/agent-runner/src/providers/index.ts` +- `ARG CODEX_VERSION` and `"@openai/codex@${CODEX_VERSION}"` in the pnpm global-install block in `container/Dockerfile` + +Missing pieces — continue below. All steps are idempotent; re-running is safe. + +### 1. Fetch the providers branch + +```bash +git fetch origin providers +``` + +### 2. Copy the Codex source files + +Wholesale copies (owned entirely by this skill — user edits to these files won't survive a re-run, as designed): + +```bash +git show origin/providers:src/providers/codex.ts > src/providers/codex.ts +git show origin/providers:container/agent-runner/src/providers/codex.ts > container/agent-runner/src/providers/codex.ts +git show origin/providers:container/agent-runner/src/providers/codex-app-server.ts > container/agent-runner/src/providers/codex-app-server.ts +git show origin/providers:container/agent-runner/src/providers/codex.factory.test.ts > container/agent-runner/src/providers/codex.factory.test.ts +``` + +### 3. Append the self-registration imports + +Each barrel gets one line — alphabetical placement keeps diffs small. + +`src/providers/index.ts`: + +```typescript +import './codex.js'; +``` + +`container/agent-runner/src/providers/index.ts`: + +```typescript +import './codex.js'; +``` + +### 4. Add the Codex CLI to the container Dockerfile + +Two edits to `container/Dockerfile`, both idempotent (skip if already present): + +**(a)** In the "Pin CLI versions" ARG block (around line 18), add after `ARG CLAUDE_CODE_VERSION=...`: + +```dockerfile +ARG CODEX_VERSION=0.121.0 +``` + +**(b)** Add a new standalone `RUN` block for the Codex CLI, after the existing per-CLI install blocks (around line 106, right after the `@anthropic-ai/claude-code` block). The Dockerfile splits each global CLI into its own layer for cache granularity — keep that pattern; do not collapse them into a single combined `pnpm install -g` call: + +```dockerfile +RUN --mount=type=cache,target=/root/.cache/pnpm \ + pnpm install -g "@openai/codex@${CODEX_VERSION}" +``` + +Note: **no agent-runner package dependency** — Codex is a CLI binary, not a library. Unlike OpenCode, there's nothing to add to `container/agent-runner/package.json`. + +### 5. Build + +```bash +pnpm run build # host +pnpm exec tsc -p container/agent-runner/tsconfig.json --noEmit # container typecheck +./container/build.sh # agent image +``` + +## Configuration + +Codex supports two primary auth paths and one experimental BYO-endpoint path. Pick the one that matches your setup. + +### Option A — ChatGPT subscription (recommended for individuals) + +On the host (not inside the container), run Codex's OAuth login: + +```bash +codex login +``` + +This writes `~/.codex/auth.json` with a subscription token. The host-side Codex provider ([src/providers/codex.ts](../../../src/providers/codex.ts)) copies `auth.json` into a per-session `~/.codex` directory mounted into the container — your host's own Codex CLI is never touched. + +No `.env` variables required for this mode. + +### Option B — API key (recommended for CI or API billing) + +```env +OPENAI_API_KEY=sk-... +CODEX_MODEL=gpt-5.4-mini +``` + +The host forwards both variables into the container. If both subscription (`auth.json`) and `OPENAI_API_KEY` are present, Codex prefers the subscription. + +### Option C — BYO OpenAI-compatible endpoint (experimental) + +Codex's built-in `openai` provider honors the `OPENAI_BASE_URL` env var directly. Point it at any OpenAI-compatible endpoint — Groq, Together, self-hosted vLLM, an OpenAI proxy, etc. + +```env +OPENAI_API_KEY=... +OPENAI_BASE_URL=https://api.groq.com/openai/v1 +CODEX_MODEL=llama-3.3-70b-versatile +``` + +Codex also ships first-class local-runner flags — `codex --oss --local-provider ollama` or `--local-provider lmstudio` — that auto-detect a local server. To use those inside NanoClaw, set `CODEX_MODEL` to a model your local runner serves and add the corresponding base URL; see the Codex CLI docs for the full `model_provider = oss` configuration. + +**Experimental caveat:** tool-calling quality depends on the model and endpoint. Not every OpenAI-compat provider implements the full function-calling spec, and smaller models (< 30B) often struggle with multi-step tool orchestration. Test before committing. + +### Per group / per session + +Schema: **`agent_groups.agent_provider`** and **`sessions.agent_provider`**. Set to `codex` for groups or sessions that should use Codex. The container receives `AGENT_PROVIDER` from the resolved value (session overrides group). + +`CODEX_MODEL` applies process-wide via `.env`; if you need different models for different groups, set them via `container_config.env` on the group. + +Extra MCP servers still come from **`NANOCLAW_MCP_SERVERS`** / `container_config.mcpServers` on the host. The runner merges them into the same `mcpServers` object passed to all providers. + +## Operational notes + +- **Spawn-per-query:** Codex's app-server is spawned fresh per query invocation, matching the OpenCode pattern. No long-lived daemon to keep healthy across sessions. +- **Per-session `~/.codex` isolation:** each group gets its own copy of the host's `auth.json`. The container can rewrite `config.toml` freely on every wake without touching the host's Codex config. +- **Native compaction:** kicks in automatically at 40K cumulative input tokens between turns, via `thread/compact/start`. If compaction fails, the provider logs and continues uncompacted — no fatal error. +- **Approvals:** auto-accepted inside the container (the container is the sandbox; same posture as Claude/OpenCode). +- **Mid-turn input:** Codex turns don't accept mid-turn messages. Follow-up `push()` calls queue and drain between turns, matching the OpenCode pattern. The poll-loop only pushes between turns anyway, so no messages are dropped. +- **Stale thread recovery:** `isSessionInvalid` matches on stale-thread-ID errors (`thread not found`, `unknown thread`, etc.) so a cold-started app-server can recover cleanly when it sees a stored continuation it no longer has. + +## Verify + +```bash +grep -q "./codex.js" container/agent-runner/src/providers/index.ts && echo "container barrel: OK" +grep -q "./codex.js" src/providers/index.ts && echo "host barrel: OK" +grep -q "@openai/codex@" container/Dockerfile && echo "Dockerfile install: OK" +cd container/agent-runner && bun test src/providers/codex.factory.test.ts && cd - +``` + +After the image rebuild, set `agent_provider = 'codex'` on a test group and send a message. Successful round-trip looks like: + +- `init` event with a stable thread ID as continuation +- One or more `activity` / `progress` events during the turn +- `result` event with the model's reply + +If the agent hangs or errors, check `~/.codex/auth.json` exists on the host (Option A) or that `OPENAI_API_KEY` is forwarding correctly (Option B) — `docker exec` into a running container and `env | grep -i openai` to confirm. diff --git a/.claude/skills/add-signal/REMOVE.md b/.claude/skills/add-signal/REMOVE.md new file mode 100644 index 0000000..db37ade --- /dev/null +++ b/.claude/skills/add-signal/REMOVE.md @@ -0,0 +1,13 @@ +# Remove Signal + +1. Comment out `import './signal.js'` in `src/channels/index.ts` +2. Remove `SIGNAL_ACCOUNT` (and any other `SIGNAL_*` vars) from `.env` +3. Rebuild and restart + +If you also want to unlink the Signal account from `signal-cli`: + +```bash +signal-cli -a +1YOURNUMBER removeDevice --deviceId +``` + +(Find the device id with `signal-cli -a +1YOURNUMBER listDevices`.) diff --git a/.claude/skills/add-signal/SKILL.md b/.claude/skills/add-signal/SKILL.md new file mode 100644 index 0000000..e6d41aa --- /dev/null +++ b/.claude/skills/add-signal/SKILL.md @@ -0,0 +1,148 @@ +--- +name: add-signal +description: Add Signal channel integration via signal-cli TCP daemon. Native adapter — no Chat SDK bridge. +--- + +# Add Signal Channel + +Adds Signal messaging support via a native adapter that speaks JSON-RPC to a [signal-cli](https://github.com/AsamK/signal-cli) TCP daemon. No Chat SDK bridge, no npm deps — only Node.js builtins. + +## Prerequisites + +`signal-cli` installed and a Signal account linked: + +- macOS: `brew install signal-cli` +- Linux: download from [GitHub releases](https://github.com/AsamK/signal-cli/releases) +- Link your account: `signal-cli -a +1YOURNUMBER link` (follow the QR instructions) + +## Install + +NanoClaw doesn't ship channels in trunk. This skill copies the Signal adapter and its tests in from the `channels` branch. + +### Pre-flight (idempotent) + +Skip to **Credentials** if all of these are already in place: + +- `src/channels/signal.ts` and `src/channels/signal.test.ts` both exist +- `src/channels/index.ts` contains `import './signal.js';` + +Otherwise continue. Every step below is safe to re-run. + +### 1. Fetch the channels branch + +```bash +git fetch origin channels +``` + +### 2. Copy the adapter and tests + +```bash +git show origin/channels:src/channels/signal.ts > src/channels/signal.ts +git show origin/channels:src/channels/signal.test.ts > src/channels/signal.test.ts +``` + +### 3. Append the self-registration import + +Append to `src/channels/index.ts` (skip if the line is already present): + +```typescript +import './signal.js'; +``` + +### 4. Build + +```bash +pnpm run build +``` + +No npm packages to install — the adapter uses only Node.js builtins (`node:net`, `node:child_process`, `node:fs`). + +## Credentials + +Add to `.env`: + +```bash +SIGNAL_ACCOUNT=+1YOURNUMBER +``` + +### Optional settings + +```bash +# TCP daemon host and port (default: 127.0.0.1:7583) +SIGNAL_TCP_HOST=127.0.0.1 +SIGNAL_TCP_PORT=7583 + +# Path to the signal-cli binary (default: resolved on PATH) +SIGNAL_CLI_PATH=/usr/local/bin/signal-cli + +# Whether NanoClaw manages the daemon lifecycle (default: true). +# Set to false if you run signal-cli daemon externally. +SIGNAL_MANAGE_DAEMON=true + +# signal-cli data directory (default: ~/.local/share/signal-cli) +SIGNAL_DATA_DIR=~/.local/share/signal-cli +``` + +**Security note:** keep the TCP host on `127.0.0.1`. The daemon has no auth — binding it to a public interface would expose your full Signal account to the network. + +Sync to container: `mkdir -p data/env && cp .env data/env/env` + +### Restart + +```bash +# macOS +launchctl kickstart -k gui/$(id -u)/com.nanoclaw + +# Linux +systemctl --user restart nanoclaw +``` + +## Next Steps + +If you're in the middle of `/setup`, return to the setup flow now. + +Otherwise, run `/init-first-agent` to create an agent and wire it to your Signal DM, or `/manage-channels` to wire this channel to an existing agent group. Signal is direct-addressable — your phone number is the platform ID. + +## Channel Info + +- **type**: `signal` +- **terminology**: Signal has "chats" (1:1 DMs) and "groups." +- **how-to-find-id**: DMs use your phone number (e.g. `+15555550123`). Groups use `group:` — find group IDs via `signal-cli -a +1YOURNUMBER listGroups`. +- **supports-threads**: no +- **typical-use**: Personal assistant via Signal DMs or small group chats +- **default-isolation**: One agent per Signal account. Multiple chats with the same operator can share an agent group; groups with other people should typically be separate. + +### Features + +- Markdown formatting — `**bold**`, `*italic*` / `_italic_`, `` `code` ``, ` ```code fence``` `, `~~strike~~`, `||spoiler||` (converted to Signal's offset-based text styles) +- Quoted replies — `replyTo*` fields populated from Signal quotes +- Typing indicators — DMs only (Signal doesn't support group typing) +- Echo suppression — outbound messages are matched on `(platformId, text)` within a 10 s TTL to avoid syncMessage loops +- Note to Self — messages you send to your own account from another device route to the agent as inbound with `isFromMe: true` +- Voice attachments — detected but not transcribed by default; the agent receives `[Voice Message]` placeholder text. Run `/add-voice-transcription` for local transcription via parakeet-mlx + +Not supported yet: outbound file attachments (logged and dropped), edit/delete messages, reactions. + +## Troubleshooting + +### Daemon not reachable + +```bash +grep "Signal" logs/nanoclaw.log | tail +``` + +If you see `Signal daemon failed to start. Is signal-cli installed and your account linked?`: +- Confirm `signal-cli` is on PATH (or set `SIGNAL_CLI_PATH`) +- Confirm the account is linked: `signal-cli -a +1YOURNUMBER listIdentities` should succeed without prompting + +If you see `Signal daemon not reachable at 127.0.0.1:7583` and `SIGNAL_MANAGE_DAEMON=false`, start the daemon yourself: `signal-cli -a +1YOURNUMBER daemon --tcp 127.0.0.1:7583`. + +### Bot not responding + +1. Channel initialized: `grep "Signal channel connected" logs/nanoclaw.log | tail -1` +2. Channel wired: `sqlite3 data/v2.db "SELECT mg.platform_id, mg.name FROM messaging_groups mg JOIN messaging_group_agents mga ON mg.id = mga.messaging_group_id WHERE mg.channel_type='signal'"` +3. Service running: `launchctl print gui/$(id -u)/com.nanoclaw` (macOS) / `systemctl --user status nanoclaw` (Linux) + +### Lost connection mid-session + +If you see `Signal channel lost TCP connection to signal-cli daemon` in the logs, the daemon dropped us. There's no auto-reconnect yet — restart the service to re-establish. diff --git a/.claude/skills/add-signal/VERIFY.md b/.claude/skills/add-signal/VERIFY.md new file mode 100644 index 0000000..b1ae851 --- /dev/null +++ b/.claude/skills/add-signal/VERIFY.md @@ -0,0 +1,5 @@ +# Verify Signal + +Send a message to your own Signal number (Note to Self) from another device, or have someone send your linked number a DM. The bot should respond within a few seconds. + +If nothing happens, tail `logs/nanoclaw.log` for `Signal channel connected` and `Signal message received`. diff --git a/package.json b/package.json index 77920c4..098e01f 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "nanoclaw", - "version": "2.0.7", + "version": "2.0.9", "description": "Personal Claude assistant. Lightweight, secure, customizable.", "type": "module", "packageManager": "pnpm@10.33.0", diff --git a/repo-tokens/badge.svg b/repo-tokens/badge.svg index 3fc904e..fd25267 100644 --- a/repo-tokens/badge.svg +++ b/repo-tokens/badge.svg @@ -1,5 +1,5 @@ - - 128k tokens, 64% of context window + + 129k tokens, 64% of context window @@ -15,8 +15,8 @@ tokens - - 128k + + 129k diff --git a/scripts/init-first-agent.ts b/scripts/init-first-agent.ts index dcb99b5..fc61b9c 100644 --- a/scripts/init-first-agent.ts +++ b/scripts/init-first-agent.ts @@ -137,13 +137,29 @@ function namespacedUserId(channel: string, raw: string): string { return raw.includes(':') ? raw : `${channel}:${raw}`; } +/** + * Determine whether a platform ID needs a channel-type prefix. + * + * Chat SDK adapters (Telegram, Discord, Slack, Teams, etc.) namespace their + * platform IDs with a channel prefix: "telegram:123456", "discord:guild:chan". + * The router stores `channel_type` and `platform_id` in separate columns, but + * Chat SDK adapters send the prefixed form as the platform_id, so this script + * must match that format. + * + * Native adapters (Signal, WhatsApp) use their own ID formats and send them + * as-is — no channel prefix. Signal sends raw phone numbers (+15551234567) + * for DMs and "group:" for group chats. WhatsApp sends JIDs containing + * '@' (@s.whatsapp.net, @g.us). Prefixing these would cause + * a mismatch between what the adapter sends and what the DB stores, breaking + * message routing. + */ function namespacedPlatformId(channel: string, raw: string): string { if (raw.startsWith(`${channel}:`)) return raw; - // Adapters using native JID format (WhatsApp: @s.whatsapp.net, - // @g.us) store platform_id without a channel prefix. The '@' is - // the discriminator — telegram/discord platform_ids don't contain it - // except after a channel prefix, which is already handled above. + // Native WhatsApp JIDs contain '@' — no prefix needed. if (raw.includes('@')) return raw; + // Native Signal IDs: phone numbers (+...) and group IDs (group:...). + if (raw.startsWith('+') || raw.startsWith('group:')) return raw; + // Chat SDK adapters — add the channel prefix. return `${channel}:${raw}`; } diff --git a/setup/environment.test.ts b/setup/environment.test.ts index deda62f..7765693 100644 --- a/setup/environment.test.ts +++ b/setup/environment.test.ts @@ -1,5 +1,7 @@ -import { describe, it, expect, beforeEach } from 'vitest'; +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; import fs from 'fs'; +import os from 'os'; +import path from 'path'; import Database from 'better-sqlite3'; @@ -17,58 +19,63 @@ describe('environment detection', () => { }); }); -describe('registered groups DB query', () => { - let db: Database.Database; +describe('detectRegisteredGroups', () => { + let tempDir: string; beforeEach(() => { - db = new Database(':memory:'); - db.exec(`CREATE TABLE IF NOT EXISTS registered_groups ( - jid TEXT PRIMARY KEY, - name TEXT NOT NULL, - folder TEXT NOT NULL UNIQUE, - trigger_pattern TEXT NOT NULL, - added_at TEXT NOT NULL, - container_config TEXT, - requires_trigger INTEGER DEFAULT 1 - )`); + tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'nanoclaw-env-test-')); + fs.mkdirSync(path.join(tempDir, 'data'), { recursive: true }); }); - it('returns 0 for empty table', () => { - const row = db - .prepare('SELECT COUNT(*) as count FROM registered_groups') - .get() as { count: number }; - expect(row.count).toBe(0); + afterEach(() => { + fs.rmSync(tempDir, { recursive: true, force: true }); }); - it('returns correct count after inserts', () => { - db.prepare( - `INSERT INTO registered_groups (jid, name, folder, trigger_pattern, added_at, requires_trigger) - VALUES (?, ?, ?, ?, ?, ?)`, - ).run( - '123@g.us', - 'Group 1', - 'group-1', - '@Andy', - '2024-01-01T00:00:00.000Z', - 1, - ); + it('returns false when no registration state exists', async () => { + const { detectRegisteredGroups } = await import('./environment.js'); + expect(detectRegisteredGroups(tempDir)).toBe(false); + }); - db.prepare( - `INSERT INTO registered_groups (jid, name, folder, trigger_pattern, added_at, requires_trigger) - VALUES (?, ?, ?, ?, ?, ?)`, - ).run( - '456@g.us', - 'Group 2', - 'group-2', - '@Andy', - '2024-01-01T00:00:00.000Z', - 1, - ); + it('detects pre-migration registered_groups.json', async () => { + const { detectRegisteredGroups } = await import('./environment.js'); + fs.writeFileSync(path.join(tempDir, 'data', 'registered_groups.json'), '[]'); + expect(detectRegisteredGroups(tempDir)).toBe(true); + }); - const row = db - .prepare('SELECT COUNT(*) as count FROM registered_groups') - .get() as { count: number }; - expect(row.count).toBe(2); + it('returns false for an empty v2 central DB', async () => { + const { detectRegisteredGroups } = await import('./environment.js'); + const db = new Database(path.join(tempDir, 'data', 'v2.db')); + db.exec(` + CREATE TABLE agent_groups (id TEXT PRIMARY KEY); + CREATE TABLE messaging_group_agents ( + id TEXT PRIMARY KEY, + messaging_group_id TEXT NOT NULL, + agent_group_id TEXT NOT NULL + ); + `); + db.close(); + + expect(detectRegisteredGroups(tempDir)).toBe(false); + }); + + it('detects wired agent groups in the v2 central DB', async () => { + const { detectRegisteredGroups } = await import('./environment.js'); + const db = new Database(path.join(tempDir, 'data', 'v2.db')); + db.exec(` + CREATE TABLE agent_groups (id TEXT PRIMARY KEY); + CREATE TABLE messaging_group_agents ( + id TEXT PRIMARY KEY, + messaging_group_id TEXT NOT NULL, + agent_group_id TEXT NOT NULL + ); + `); + db.prepare('INSERT INTO agent_groups (id) VALUES (?)').run('ag-1'); + db.prepare( + 'INSERT INTO messaging_group_agents (id, messaging_group_id, agent_group_id) VALUES (?, ?, ?)', + ).run('mga-1', 'mg-1', 'ag-1'); + db.close(); + + expect(detectRegisteredGroups(tempDir)).toBe(true); }); }); diff --git a/setup/environment.ts b/setup/environment.ts index 4a83665..6986396 100644 --- a/setup/environment.ts +++ b/setup/environment.ts @@ -7,11 +7,35 @@ import path from 'path'; import Database from 'better-sqlite3'; -import { STORE_DIR } from '../src/config.js'; import { log } from '../src/log.js'; import { commandExists, getPlatform, isHeadless, isWSL } from './platform.js'; import { emitStatus } from './status.js'; +export function detectRegisteredGroups(projectRoot: string): boolean { + if (fs.existsSync(path.join(projectRoot, 'data', 'registered_groups.json'))) { + return true; + } + + const dbPath = path.join(projectRoot, 'data', 'v2.db'); + if (!fs.existsSync(dbPath)) return false; + + let db: Database.Database | null = null; + try { + db = new Database(dbPath, { readonly: true }); + const row = db + .prepare( + `SELECT COUNT(DISTINCT ag.id) as count FROM agent_groups ag + JOIN messaging_group_agents mga ON mga.agent_group_id = ag.id`, + ) + .get() as { count: number }; + return row.count > 0; + } catch { + return false; + } finally { + db?.close(); + } +} + export async function run(_args: string[]): Promise { const projectRoot = process.cwd(); @@ -39,26 +63,7 @@ export async function run(_args: string[]): Promise { const authDir = path.join(projectRoot, 'store', 'auth'); const hasAuth = fs.existsSync(authDir) && fs.readdirSync(authDir).length > 0; - let hasRegisteredGroups = false; - // Check JSON file first (pre-migration) - if (fs.existsSync(path.join(projectRoot, 'data', 'registered_groups.json'))) { - hasRegisteredGroups = true; - } else { - // Check SQLite directly using better-sqlite3 (no sqlite3 CLI needed) - const dbPath = path.join(STORE_DIR, 'messages.db'); - if (fs.existsSync(dbPath)) { - try { - const db = new Database(dbPath, { readonly: true }); - const row = db - .prepare('SELECT COUNT(*) as count FROM registered_groups') - .get() as { count: number }; - if (row.count > 0) hasRegisteredGroups = true; - db.close(); - } catch { - // Table might not exist yet - } - } - } + const hasRegisteredGroups = detectRegisteredGroups(projectRoot); // Check for existing OpenClaw installation const homedir = (await import('os')).homedir(); diff --git a/src/channels/chat-sdk-bridge.ts b/src/channels/chat-sdk-bridge.ts index 6c9f802..c8cf3cc 100644 --- a/src/channels/chat-sdk-bridge.ts +++ b/src/channels/chat-sdk-bridge.ts @@ -81,6 +81,26 @@ export interface ChatSdkBridgeConfig { * chunk boundary will render as two independent blocks on the receiving * platform, which is the same behavior as manually re-opening a fence. */ +/** + * Decode the actual option value from a button callback. Buttons are encoded + * with an integer index (to keep under Telegram's 64-byte callback_data cap), + * and the real value is looked up via `getAskQuestionRender(questionId)`. + * Falls back to treating the tail as a literal value so old in-flight cards + * (encoded before this shortening landed) still resolve. + */ +function resolveSelectedOption( + render: { options: NormalizedOption[] } | undefined, + eventValue: string | undefined, + tail: string | undefined, +): string { + const candidate = eventValue ?? tail ?? ''; + if (render && /^\d+$/.test(candidate)) { + const idx = Number(candidate); + if (render.options[idx]) return render.options[idx].value; + } + return candidate; +} + export function splitForLimit(text: string, limit: number): string[] { if (text.length <= limit) return [text]; const chunks: string[] = []; @@ -241,11 +261,15 @@ export function createChatSdkBridge(config: ChatSdkBridgeConfig): ChannelAdapter const parts = event.actionId.split(':'); if (parts.length < 3) return; const questionId = parts[1]; - const selectedOption = event.value || ''; + const tail = parts.slice(2).join(':'); const userId = event.user?.userId || ''; // Resolve render metadata BEFORE dispatching onAction (which deletes the row). const render = getAskQuestionRender(questionId); + // New format: button id/value is an integer index into options (kept + // short to fit Telegram's 64-byte callback_data cap). Old format: + // the full value is embedded in actionId/value directly. + const selectedOption = resolveSelectedOption(render, event.value, tail); const title = render?.title ?? '❓ Question'; const matched = render?.options.find((o) => o.value === selectedOption); const selectedLabel = matched?.selectedLabel ?? selectedOption ?? '(clicked)'; @@ -349,8 +373,13 @@ export function createChatSdkBridge(config: ChatSdkBridgeConfig): ChannelAdapter children: [ CardText(question), Actions( - options.map((opt) => - Button({ id: `ncq:${questionId}:${opt.value}`, label: opt.label, value: opt.value }), + // Encode button id/value with the option index rather than the + // full value. Telegram caps callback_data at 64 bytes, and + // long values (e.g. ISO datetimes, URLs) push the JSON payload + // well past that. The onAction handlers resolve the index back + // to the real value via getAskQuestionRender(questionId). + options.map((opt, idx) => + Button({ id: `ncq:${questionId}:${idx}`, label: opt.label, value: String(idx) }), ), ), ], @@ -511,12 +540,12 @@ async function handleForwardedEvent( // Parse the selected option from custom_id let questionId: string | undefined; - let selectedOption: string | undefined; + let tail: string | undefined; if (customId?.startsWith('ncq:')) { const colonIdx = customId.indexOf(':', 4); // after "ncq:" if (colonIdx !== -1) { questionId = customId.slice(4, colonIdx); - selectedOption = customId.slice(colonIdx + 1); + tail = customId.slice(colonIdx + 1); } } @@ -525,6 +554,9 @@ async function handleForwardedEvent( ((interaction.message as Record)?.embeds as Array>) || []; const originalDescription = (originalEmbeds[0]?.description as string) || ''; const render = questionId ? getAskQuestionRender(questionId) : undefined; + // Discord custom_id mirrors the new index-based encoding (see Button + // construction). Decode back to the real option value for downstream. + const selectedOption = resolveSelectedOption(render, tail, tail); const cardTitle = render?.title ?? ((originalEmbeds[0]?.title as string) || '❓ Question'); const matchedOpt = render?.options.find((o) => o.value === selectedOption); const selectedLabel = matchedOpt?.selectedLabel ?? selectedOption ?? customId; diff --git a/src/container-runner.ts b/src/container-runner.ts index 71e2064..fca88c4 100644 --- a/src/container-runner.ts +++ b/src/container-runner.ts @@ -36,7 +36,13 @@ import { type ProviderContainerContribution, type VolumeMount, } from './providers/provider-container-registry.js'; -import { markContainerRunning, markContainerStopped, sessionDir, writeSessionRouting } from './session-manager.js'; +import { + heartbeatPath, + markContainerRunning, + markContainerStopped, + sessionDir, + writeSessionRouting, +} from './session-manager.js'; import type { AgentGroup, Session } from './types.js'; const onecli = new OneCLI({ url: ONECLI_URL, apiKey: ONECLI_API_KEY }); @@ -131,6 +137,12 @@ async function spawnContainer(session: Session): Promise { log.info('Spawning container', { sessionId: session.id, agentGroup: agentGroup.name, containerName }); + // Clear any orphan heartbeat from a previous container instance — the + // sweep's ceiling check treats a missing file as "fresh spawn, give grace" + // (host-sweep.ts line 87). Without this, the stale mtime can trigger an + // immediate kill before the new container touches the file itself. + fs.rmSync(heartbeatPath(agentGroup.id, session.id), { force: true }); + const container = spawn(CONTAINER_RUNTIME_BIN, args, { stdio: ['ignore', 'pipe', 'pipe'] }); activeContainers.set(session.id, { process: container, containerName }); diff --git a/src/db/session-db.ts b/src/db/session-db.ts index aea255d..48e9297 100644 --- a/src/db/session-db.ts +++ b/src/db/session-db.ts @@ -139,10 +139,10 @@ export function getMessageForRetry( db: Database.Database, messageId: string, status: string, -): { id: string; tries: number } | undefined { - return db.prepare('SELECT id, tries FROM messages_in WHERE id = ? AND status = ?').get(messageId, status) as - | { id: string; tries: number } - | undefined; +): { id: string; tries: number; processAfter: string | null } | undefined { + return db + .prepare('SELECT id, tries, process_after as processAfter FROM messages_in WHERE id = ? AND status = ?') + .get(messageId, status) as { id: string; tries: number; processAfter: string | null } | undefined; } export function syncProcessingAcks(inDb: Database.Database, outDb: Database.Database): void { diff --git a/src/db/sessions.ts b/src/db/sessions.ts index 5c53ad5..504aa26 100644 --- a/src/db/sessions.ts +++ b/src/db/sessions.ts @@ -97,10 +97,16 @@ export function deleteSession(id: string): void { // ── Pending Questions ── -export function createPendingQuestion(pq: PendingQuestion): void { - getDb() +/** + * Insert a pending question row. Idempotent: when delivery fails and retries, + * the second attempt calls this with the same question_id — without `OR + * IGNORE` that would throw UNIQUE and prevent the retry from reaching the + * actual send step. Returns true if a new row was inserted. + */ +export function createPendingQuestion(pq: PendingQuestion): boolean { + const result = getDb() .prepare( - `INSERT INTO pending_questions (question_id, session_id, message_out_id, platform_id, channel_type, thread_id, title, options_json, created_at) + `INSERT OR IGNORE INTO pending_questions (question_id, session_id, message_out_id, platform_id, channel_type, thread_id, title, options_json, created_at) VALUES (@question_id, @session_id, @message_out_id, @platform_id, @channel_type, @thread_id, @title, @options_json, @created_at)`, ) .run({ @@ -114,6 +120,7 @@ export function createPendingQuestion(pq: PendingQuestion): void { options_json: JSON.stringify(pq.options), created_at: pq.created_at, }); + return result.changes > 0; } export function getPendingQuestion(questionId: string): PendingQuestion | undefined { @@ -131,16 +138,21 @@ export function deletePendingQuestion(questionId: string): void { // ── Pending Approvals ── +/** + * Insert a pending approval row. Idempotent for the same reason as + * createPendingQuestion: delivery retries with the same approval_id must not + * fail on UNIQUE before the send step gets a chance to succeed. + */ export function createPendingApproval( pa: Partial & Pick< PendingApproval, 'approval_id' | 'request_id' | 'action' | 'payload' | 'created_at' | 'title' | 'options_json' >, -): void { - getDb() +): boolean { + const result = getDb() .prepare( - `INSERT INTO pending_approvals + `INSERT OR IGNORE INTO pending_approvals (approval_id, session_id, request_id, action, payload, created_at, agent_group_id, channel_type, platform_id, platform_message_id, expires_at, status, title, options_json) @@ -159,6 +171,7 @@ export function createPendingApproval( status: 'pending', ...pa, }); + return result.changes > 0; } export function getPendingApproval(approvalId: string): PendingApproval | undefined { diff --git a/src/delivery.ts b/src/delivery.ts index 2e193d4..036153a 100644 --- a/src/delivery.ts +++ b/src/delivery.ts @@ -321,7 +321,7 @@ async function deliverMessage( questionId: content.questionId, }); } else { - createPendingQuestion({ + const inserted = createPendingQuestion({ question_id: content.questionId, session_id: session.id, message_out_id: msg.id, @@ -332,7 +332,9 @@ async function deliverMessage( options: normalizeOptions(rawOptions as never), created_at: new Date().toISOString(), }); - log.info('Pending question created', { questionId: content.questionId, sessionId: session.id }); + if (inserted) { + log.info('Pending question created', { questionId: content.questionId, sessionId: session.id }); + } } } diff --git a/src/host-sweep.ts b/src/host-sweep.ts index 1a2901c..4dc2fb7 100644 --- a/src/host-sweep.ts +++ b/src/host-sweep.ts @@ -159,23 +159,31 @@ async function sweepSession(session: Session): Promise { syncProcessingAcks(inDb, outDb); } - const alive = isContainerRunning(session.id); - - // 2. Crashed-container cleanup: processing rows left behind get retried. - if (!alive && outDb) { - resetStuckProcessingRows(inDb, outDb, session, 'container not running'); + // 2. Wake a container if work is due and nothing is running. Ordered + // before the crashed-container cleanup so a fresh container gets a chance + // to clean its own orphan processing_ack rows on startup (see + // container/agent-runner/src/db/connection.ts). Otherwise the reset path + // would keep bumping process_after into the future, dueCount would stay 0, + // and the wake would never fire. + const dueCount = countDueMessages(inDb); + if (dueCount > 0 && !isContainerRunning(session.id)) { + log.info('Waking container for due messages', { sessionId: session.id, count: dueCount }); + await wakeContainer(session); } + const alive = isContainerRunning(session.id); + // 3. Running-container SLA: absolute ceiling + per-claim stuck rules. if (alive && outDb) { enforceRunningContainerSla(inDb, outDb, session, agentGroup.id); } - // 4. Wake a container if new work is due and nothing is running. - const dueCount = countDueMessages(inDb); - if (dueCount > 0 && !isContainerRunning(session.id)) { - log.info('Waking container for due messages', { sessionId: session.id, count: dueCount }); - await wakeContainer(session); + // 4. Crashed-container cleanup: processing rows left behind get retried. + // Only fires when wake in step 2 didn't pick up the work (no due messages, + // or wake failed). resetStuckProcessingRows itself is idempotent — it + // skips messages already scheduled for a future retry. + if (!alive && outDb) { + resetStuckProcessingRows(inDb, outDb, session, 'container not running'); } // 5. Recurrence fanout for completed recurring tasks. @@ -246,10 +254,16 @@ function resetStuckProcessingRows( reason: string, ): void { const claims = getProcessingClaims(outDb); + const now = Date.now(); for (const { message_id } of claims) { const msg = getMessageForRetry(inDb, message_id, 'pending'); if (!msg) continue; + // Already rescheduled for a future retry — don't bump tries again. The + // wake path (sweep step 2) will fire when process_after elapses and a + // fresh container will clean the orphan claim on startup. + if (msg.processAfter && Date.parse(msg.processAfter) > now) continue; + if (msg.tries >= MAX_TRIES) { markMessageFailed(inDb, msg.id); log.warn('Message marked as failed after max retries', {