refactor: shared source — replace per-group agent-runner copies with single RO mount

Replace the per-group agent-runner-src copy model with a single shared read-only mount. Source and skills are now RO + shared; personality, config, working files, and Claude state stay RW + per-group. Key changes: - Mount container/agent-runner/src/ RO at /app/src (all groups share one copy) - Mount container/skills/ RO at /app/skills; per-group skill selection via symlinks in .claude-shared/skills/ based on container.json "skills" field - Mount container.json as nested RO bind on top of RW group dir - Move all NANOCLAW_* env vars to container.json (runner reads at startup) - New runner config.ts module replaces process.env reads - Move command gate (filtered/admin) from container to host router - Dockerfile: remove source COPY, split CLI installs (claude-code last), move agent-runner deps above CLIs for better layer caching - Add writeOutboundDirect for router denial responses - Design doc at docs/shared-src.md Not included (follow-up): DB migration to drop agent_provider columns, cleanup of orphaned agent-runner-src directories. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-21 12:05:19 +00:00
parent 596035be09
commit 8a12fa61ac
14 changed files with 715 additions and 249 deletions
--- a/container/Dockerfile
+++ b/container/Dockerfile
@@ -3,8 +3,12 @@
 # Runs Claude Agent SDK in isolated Linux VM with browser automation.
 #
 # Runtime split:
-#   - agent-runner (our TypeScript code): Bun
+#   - agent-runner (our TypeScript code): Bun, mounted RO at /app/src by host
 #   - globally-installed Node CLIs (claude-code, agent-browser, vercel): pnpm + Node
+#
+# Source is never baked in — /app/src is provided by a shared read-only
+# bind mount at runtime (see src/container-runner.ts). Source-only changes
+# never require an image rebuild.

 FROM node:22-slim

@@ -66,36 +70,39 @@ RUN curl -fsSL https://bun.sh/install | bash -s "bun-v${BUN_VERSION}" && \
    install -m 0755 /root/.bun/bin/bun /usr/local/bin/bun && \
    rm -rf /root/.bun

-# ---- pnpm + global Node CLIs -------------------------------------------------
-ENV PNPM_HOME="/pnpm"
-ENV PATH="$PNPM_HOME:$PATH"
-RUN corepack enable
-
-# agent-browser has a postinstall build script — pnpm skips these by default.
-# Allowlist it via .npmrc so the install doesn't silently produce a broken
-# package. Pinned versions so every rebuild is reproducible.
-RUN --mount=type=cache,target=/root/.cache/pnpm \
-    echo "only-built-dependencies[]=agent-browser" > /root/.npmrc && \
-    echo "only-built-dependencies[]=@anthropic-ai/claude-code" >> /root/.npmrc && \
-    pnpm install -g \
-        "@anthropic-ai/claude-code@${CLAUDE_CODE_VERSION}" \
-        "agent-browser@${AGENT_BROWSER_VERSION}" \
-        "vercel@${VERCEL_VERSION}"
-
-# ---- agent-runner ------------------------------------------------------------
+# ---- agent-runner deps -------------------------------------------------------
+# Deps are cached independently of CLI versions. Source is NOT baked in —
+# it's provided by the shared RO mount at runtime.
 WORKDIR /app

-# Copy manifest + lockfile first so the install layer caches independently of
-# source edits.
 COPY agent-runner/package.json agent-runner/bun.lock ./

 RUN --mount=type=cache,target=/root/.bun/install/cache \
    bun install --frozen-lockfile

-# Source. Bun runs TS directly — no tsc build step. The host remounts this
-# path at runtime via `src/container-runner.ts` so source edits on the host
-# take effect without rebuilding the image; the baked copy is the fallback.
-COPY agent-runner/ ./
+# ---- pnpm + global Node CLIs -------------------------------------------------
+# Most stable first, most frequently bumped last. Bumping claude-code
+# (the most common change) only invalidates one layer.
+#
+# only-built-dependencies gates pnpm's supply-chain policy:
+#   - agent-browser has a postinstall build step.
+#   - @anthropic-ai/claude-code's postinstall downloads the native Claude
+#     binary (linux-arm64 variant on our image). Without the allowlist
+#     the SDK fails at spawn time with "native binary not found".
+ENV PNPM_HOME="/pnpm"
+ENV PATH="$PNPM_HOME:$PATH"
+RUN corepack enable
+
+RUN --mount=type=cache,target=/root/.cache/pnpm \
+    echo "only-built-dependencies[]=agent-browser" > /root/.npmrc && \
+    echo "only-built-dependencies[]=@anthropic-ai/claude-code" >> /root/.npmrc && \
+    pnpm install -g "vercel@${VERCEL_VERSION}"
+
+RUN --mount=type=cache,target=/root/.cache/pnpm \
+    pnpm install -g "agent-browser@${AGENT_BROWSER_VERSION}"
+
+RUN --mount=type=cache,target=/root/.cache/pnpm \
+    pnpm install -g "@anthropic-ai/claude-code@${CLAUDE_CODE_VERSION}"

 # ---- Entrypoint --------------------------------------------------------------
 COPY entrypoint.sh /app/entrypoint.sh
--- a/container/agent-runner/src/config.ts
+++ b/container/agent-runner/src/config.ts
@@ -0,0 +1,55 @@
+/**
+ * Runner config — reads /workspace/agent/container.json at startup.
+ *
+ * This file is mounted read-only inside the container. The host writes it;
+ * the runner only reads. All NanoClaw-specific configuration lives here
+ * instead of environment variables.
+ */
+import fs from 'fs';
+
+const CONFIG_PATH = '/workspace/agent/container.json';
+
+export interface RunnerConfig {
+  provider: string;
+  assistantName: string;
+  groupName: string;
+  agentGroupId: string;
+  maxMessagesPerPrompt: number;
+  mcpServers: Record<string, { command: string; args: string[]; env: Record<string, string> }>;
+}
+
+const DEFAULT_MAX_MESSAGES = 10;
+
+let _config: RunnerConfig | null = null;
+
+/**
+ * Load config from container.json. Called once at startup.
+ * Falls back to sensible defaults for any missing field.
+ */
+export function loadConfig(): RunnerConfig {
+  if (_config) return _config;
+
+  let raw: Record<string, unknown> = {};
+  try {
+    raw = JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf8'));
+  } catch {
+    console.error(`[config] Failed to read ${CONFIG_PATH}, using defaults`);
+  }
+
+  _config = {
+    provider: (raw.provider as string) || 'claude',
+    assistantName: (raw.assistantName as string) || '',
+    groupName: (raw.groupName as string) || '',
+    agentGroupId: (raw.agentGroupId as string) || '',
+    maxMessagesPerPrompt: (raw.maxMessagesPerPrompt as number) || DEFAULT_MAX_MESSAGES,
+    mcpServers: (raw.mcpServers as RunnerConfig['mcpServers']) || {},
+  };
+
+  return _config;
+}
+
+/** Get the loaded config. Throws if loadConfig() hasn't been called. */
+export function getConfig(): RunnerConfig {
+  if (!_config) throw new Error('Config not loaded — call loadConfig() first');
+  return _config;
+}
--- a/container/agent-runner/src/db/connection.ts
+++ b/container/agent-runner/src/db/connection.ts
@@ -31,8 +31,7 @@ let _heartbeatPath: string = DEFAULT_HEARTBEAT_PATH;
 /** Inbound DB — container opens read-only (host is the sole writer). */
 export function getInboundDb(): Database {
  if (!_inbound) {
-    const dbPath = process.env.SESSION_INBOUND_DB_PATH || DEFAULT_INBOUND_PATH;
-    _inbound = new Database(dbPath, { readonly: true });
+    _inbound = new Database(DEFAULT_INBOUND_PATH, { readonly: true });
    _inbound.exec('PRAGMA busy_timeout = 5000');
  }
  return _inbound;
@@ -41,8 +40,7 @@ export function getInboundDb(): Database {
 /** Outbound DB — container owns this file (sole writer). */
 export function getOutboundDb(): Database {
  if (!_outbound) {
-    const dbPath = process.env.SESSION_OUTBOUND_DB_PATH || DEFAULT_OUTBOUND_PATH;
-    _outbound = new Database(dbPath);
+    _outbound = new Database(DEFAULT_OUTBOUND_PATH);
    _outbound.exec('PRAGMA journal_mode = DELETE');
    _outbound.exec('PRAGMA busy_timeout = 5000');
    _outbound.exec('PRAGMA foreign_keys = ON');
@@ -122,7 +120,7 @@ export function clearContainerToolInFlight(): void {
 * A file touch is cheaper and avoids cross-boundary DB write contention.
 */
 export function touchHeartbeat(): void {
-  const p = process.env.SESSION_HEARTBEAT_PATH || _heartbeatPath;
+  const p = _heartbeatPath;
  const now = new Date();
  try {
    fs.utimesSync(p, now, now);
--- a/container/agent-runner/src/db/messages-in.ts
+++ b/container/agent-runner/src/db/messages-in.ts
@@ -7,6 +7,7 @@
 * The container never writes to inbound.db — all status tracking goes through
 * processing_ack. The host reads processing_ack to sync message lifecycle.
 */
+import { getConfig } from '../config.js';
 import { getInboundDb, getOutboundDb } from './connection.js';

 export interface MessageInRow {
@@ -26,14 +27,16 @@ export interface MessageInRow {
  content: string;
 }

-// Cap on how many messages reach the agent in one prompt, including any
-// accumulated-but-not-triggered context. Host controls the cap via the
-// NANOCLAW_MAX_MESSAGES_PER_PROMPT env var; default mirrors the host's
-// config.ts default of 10.
-const MAX_MESSAGES_PER_PROMPT = Math.max(
-  1,
-  parseInt(process.env.NANOCLAW_MAX_MESSAGES_PER_PROMPT || '10', 10) || 10,
-);
+// Cap on how many messages reach the agent in one prompt. Read from
+// container.json; falls back to 10.
+function getMaxMessagesPerPrompt(): number {
+  try {
+    return getConfig().maxMessagesPerPrompt;
+  } catch {
+    // Config not loaded yet (e.g. test harness) — use default
+    return 10;
+  }
+}

 /**
 * Fetch pending messages that are due for processing.
@@ -58,7 +61,7 @@ export function getPendingMessages(): MessageInRow[] {
       ORDER BY seq DESC
       LIMIT ?`,
    )
-    .all(MAX_MESSAGES_PER_PROMPT) as MessageInRow[];
+    .all(getMaxMessagesPerPrompt()) as MessageInRow[];

  if (pending.length === 0) return [];

--- a/container/agent-runner/src/formatter.ts
+++ b/container/agent-runner/src/formatter.ts
@@ -55,6 +55,17 @@ export function categorizeMessage(msg: MessageInRow): CommandInfo {
  return { category: 'passthrough', command, text, senderId };
 }

+/**
+ * Narrow check for /clear — the only command the runner handles directly.
+ * All other command gating (filtered, admin) is done by the host router
+ * before messages reach the container.
+ */
+export function isClearCommand(msg: MessageInRow): boolean {
+  const content = parseContent(msg.content);
+  const text = (content.text || '').trim();
+  return text.toLowerCase().startsWith('/clear');
+}
+
 // eslint-disable-next-line @typescript-eslint/no-explicit-any
 function extractSenderId(msg: MessageInRow, content: any): string | null {
  const raw: string | null = content?.senderId || content?.author?.userId || null;
--- a/container/agent-runner/src/index.ts
+++ b/container/agent-runner/src/index.ts
@@ -4,14 +4,8 @@
 * Runs inside a container. All IO goes through the session DB.
 * No stdin, no stdout markers, no IPC files.
 *
- * Config:
- *   - SESSION_INBOUND_DB_PATH:  path to host-owned inbound DB (default: /workspace/inbound.db)
- *   - SESSION_OUTBOUND_DB_PATH: path to container-owned outbound DB (default: /workspace/outbound.db)
- *   - SESSION_HEARTBEAT_PATH:   heartbeat file path (default: /workspace/.heartbeat)
- *   - AGENT_PROVIDER: any registered provider name (default: claude). The
- *     set of registered providers is whatever `providers/index.ts` imports.
- *   - NANOCLAW_ASSISTANT_NAME: assistant name for transcript archiving
- *   - NANOCLAW_ADMIN_USER_IDS: comma-separated user IDs allowed to run admin commands
+ * Config is read from /workspace/agent/container.json (mounted RO).
+ * Only TZ and OneCLI networking vars come from env.
 *
 * Mount structure:
 *   /workspace/
@@ -19,14 +13,19 @@
 *     outbound.db       ← container-owned session DB
 *     .heartbeat        ← container touches for liveness detection
 *     outbox/           ← outbound files
- *     agent/            ← agent group folder (CLAUDE.md, skills, working files)
- *     .claude/          ← Claude SDK session data
+ *     agent/            ← agent group folder (CLAUDE.md, container.json, working files)
+ *       container.json  ← per-group config (RO nested mount)
+ *     global/           ← shared global memory (RO)
+ *   /app/src/           ← shared agent-runner source (RO)
+ *   /app/skills/        ← shared skills (RO)
+ *   /home/node/.claude/ ← Claude SDK state + skill symlinks (RW)
 */

 import fs from 'fs';
 import path from 'path';
 import { fileURLToPath } from 'url';

+import { loadConfig } from './config.js';
 import { buildSystemPromptAddendum } from './destinations.js';
 // Providers barrel — each enabled provider self-registers on import.
 // Provider skills append imports to providers/index.ts.
@@ -41,21 +40,11 @@ function log(msg: string): void {
 const CWD = '/workspace/agent';

 async function main(): Promise<void> {
-  const providerName = (process.env.AGENT_PROVIDER || 'claude').toLowerCase() as ProviderName;
-  const assistantName = process.env.NANOCLAW_ASSISTANT_NAME;
-  const adminUserIds = new Set(
-    (process.env.NANOCLAW_ADMIN_USER_IDS || '')
-      .split(',')
-      .map((s) => s.trim())
-      .filter(Boolean),
-  );
+  const config = loadConfig();
+  const providerName = config.provider.toLowerCase() as ProviderName;

  log(`Starting v2 agent-runner (provider: ${providerName})`);

-  // Destinations addendum is the only runtime-generated context we inject.
-  // Global CLAUDE.md is loaded by Claude Code from /workspace/agent/CLAUDE.md
-  // (which imports /workspace/global/CLAUDE.md via @-syntax) — no need to
-  // read it manually anymore.
  const instructions = buildSystemPromptAddendum();

  // Discover additional directories mounted at /workspace/extra/*
@@ -77,34 +66,22 @@ async function main(): Promise<void> {
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
  const mcpServerPath = path.join(__dirname, 'mcp-tools', 'index.ts');

-  // Build MCP servers config: nanoclaw built-in + any additional from host
+  // Build MCP servers config: nanoclaw built-in + any from container.json
  const mcpServers: Record<string, { command: string; args: string[]; env: Record<string, string> }> = {
    nanoclaw: {
      command: 'bun',
      args: ['run', mcpServerPath],
-      env: {
-        SESSION_INBOUND_DB_PATH: process.env.SESSION_INBOUND_DB_PATH || '/workspace/inbound.db',
-        SESSION_OUTBOUND_DB_PATH: process.env.SESSION_OUTBOUND_DB_PATH || '/workspace/outbound.db',
-        SESSION_HEARTBEAT_PATH: process.env.SESSION_HEARTBEAT_PATH || '/workspace/.heartbeat',
-      },
+      env: {},
    },
  };

-  // Merge additional MCP servers from host configuration
-  if (process.env.NANOCLAW_MCP_SERVERS) {
-    try {
-      const additional = JSON.parse(process.env.NANOCLAW_MCP_SERVERS) as Record<string, { command: string; args: string[]; env: Record<string, string> }>;
-      for (const [name, config] of Object.entries(additional)) {
-        mcpServers[name] = config;
-        log(`Additional MCP server: ${name} (${config.command})`);
-      }
-    } catch (e) {
-      log(`Failed to parse NANOCLAW_MCP_SERVERS: ${e}`);
-    }
+  for (const [name, serverConfig] of Object.entries(config.mcpServers)) {
+    mcpServers[name] = serverConfig;
+    log(`Additional MCP server: ${name} (${serverConfig.command})`);
  }

  const provider = createProvider(providerName, {
-    assistantName,
+    assistantName: config.assistantName || undefined,
    mcpServers,
    env: { ...process.env },
    additionalDirectories: additionalDirectories.length > 0 ? additionalDirectories : undefined,
@@ -114,7 +91,6 @@ async function main(): Promise<void> {
    provider,
    cwd: CWD,
    systemContext: { instructions },
-    adminUserIds,
  });
 }

--- a/container/agent-runner/src/poll-loop.ts
+++ b/container/agent-runner/src/poll-loop.ts
@@ -3,7 +3,7 @@ import { getPendingMessages, markProcessing, markCompleted, type MessageInRow }
 import { writeMessageOut } from './db/messages-out.js';
 import { touchHeartbeat, clearStaleProcessingAcks } from './db/connection.js';
 import { getStoredSessionId, setStoredSessionId, clearStoredSessionId } from './db/session-state.js';
-import { formatMessages, extractRouting, categorizeMessage, stripInternalTags, type RoutingContext } from './formatter.js';
+import { formatMessages, extractRouting, categorizeMessage, isClearCommand, stripInternalTags, type RoutingContext } from './formatter.js';
 import type { AgentProvider, AgentQuery, ProviderEvent } from './providers/types.js';

 const POLL_INTERVAL_MS = 1000;
@@ -23,12 +23,6 @@ export interface PollLoopConfig {
  systemContext?: {
    instructions?: string;
  };
-  /**
-   * Set of user IDs allowed to run admin commands (e.g. /clear) in this
-   * agent group. Host populates from owners + global admins + scoped admins
-   * at container wake time, so role changes take effect on next spawn.
-   */
-  adminUserIds?: Set<string>;
 }

 /**
@@ -90,74 +84,36 @@ export async function runPollLoop(config: PollLoopConfig): Promise<void> {

    const routing = extractRouting(messages);

-    // Handle commands: categorize chat messages
-    const adminUserIds = config.adminUserIds ?? new Set<string>();
-    const normalMessages = [];
+    // Command handling: the host router gates filtered and unauthorized
+    // admin commands before they reach the container. The only command
+    // the runner handles directly is /clear (session reset).
+    const normalMessages: MessageInRow[] = [];
    const commandIds: string[] = [];

    for (const msg of messages) {
-      if (msg.kind !== 'chat' && msg.kind !== 'chat-sdk') {
-        normalMessages.push(msg);
-        continue;
-      }
-
-      const cmdInfo = categorizeMessage(msg);
-
-      if (cmdInfo.category === 'filtered') {
-        // Silently drop — mark completed, don't process
-        log(`Filtered command: ${cmdInfo.command} (msg: ${msg.id})`);
+      if ((msg.kind === 'chat' || msg.kind === 'chat-sdk') && isClearCommand(msg)) {
+        log('Clearing session (resetting continuation)');
+        continuation = undefined;
+        clearStoredSessionId();
+        writeMessageOut({
+          id: generateId(),
+          kind: 'chat',
+          platform_id: routing.platformId,
+          channel_type: routing.channelType,
+          thread_id: routing.threadId,
+          content: JSON.stringify({ text: 'Session cleared.' }),
+        });
        commandIds.push(msg.id);
        continue;
      }
-
-      if (cmdInfo.category === 'admin') {
-        if (!cmdInfo.senderId || !adminUserIds.has(cmdInfo.senderId)) {
-          log(`Admin command denied: ${cmdInfo.command} from ${cmdInfo.senderId} (msg: ${msg.id})`);
-          writeMessageOut({
-            id: generateId(),
-            kind: 'chat',
-            platform_id: routing.platformId,
-            channel_type: routing.channelType,
-            thread_id: routing.threadId,
-            content: JSON.stringify({ text: `Permission denied: ${cmdInfo.command} requires admin access.` }),
-          });
-          commandIds.push(msg.id);
-          continue;
-        }
-        // Handle admin commands directly
-        if (cmdInfo.command === '/clear') {
-          log('Clearing session (resetting continuation)');
-          continuation = undefined;
-          clearStoredSessionId();
-          writeMessageOut({
-            id: generateId(),
-            kind: 'chat',
-            platform_id: routing.platformId,
-            channel_type: routing.channelType,
-            thread_id: routing.threadId,
-            content: JSON.stringify({ text: 'Session cleared.' }),
-          });
-          commandIds.push(msg.id);
-          continue;
-        }
-
-        // Other admin commands — pass through to agent
-        normalMessages.push(msg);
-        continue;
-      }
-
-      // passthrough or none
      normalMessages.push(msg);
    }

-    // Mark filtered/denied command messages as completed immediately
    if (commandIds.length > 0) {
      markCompleted(commandIds);
    }

-    // If all messages were filtered commands, skip processing
    if (normalMessages.length === 0) {
-      // Mark remaining processing IDs as completed
      const remainingIds = ids.filter((id) => !commandIds.includes(id));
      if (remainingIds.length > 0) markCompleted(remainingIds);
      log(`All ${messages.length} message(s) were commands, skipping query`);
@@ -289,17 +245,14 @@ async function processQuery(query: AgentQuery, routing: RoutingContext): Promise
  const pollHandle = setInterval(() => {
    if (done) return;

-    // Skip system messages (MCP tool responses) and admin commands (need fresh query).
+    // Skip system messages (MCP tool responses) and /clear (needs fresh query).
    // Also defer messages whose thread_id differs from the active turn's routing
    // — mixing threads into one streaming turn would send the reply to the wrong
    // thread because `routing` is captured at turn start. The next turn will pick
    // them up with fresh routing.
    const newMessages = getPendingMessages().filter((m) => {
      if (m.kind === 'system') return false;
-      if (m.kind === 'chat' || m.kind === 'chat-sdk') {
-        const cmd = categorizeMessage(m);
-        if (cmd.category === 'admin') return false;
-      }
+      if ((m.kind === 'chat' || m.kind === 'chat-sdk') && isClearCommand(m)) return false;
      if ((m.thread_id ?? null) !== (routing.threadId ?? null)) return false;
      return true;
    });