Files
nanoclaw/src/container-runner.ts
gavrielc 2e6dc21748 refactor(v2): per-group filesystem init, persistent across spawns
Each group's on-disk state (CLAUDE.md, .claude-shared/, agent-runner-src/)
is now initialized exactly once at group creation and owned by the group
forever after. Spawn does only mounts — no copies, no settings.json
overwrites, no skill clobbers, no source resyncs.

Global memory composition switches from "host reads /workspace/global/CLAUDE.md
at bootstrap and stuffs it into systemPrompt.append" to "group CLAUDE.md
imports it via @/workspace/global/CLAUDE.md at the top." Edits to global
propagate instantly through the existing read-only mount; no copy, no
restart.

- src/group-init.ts: new initGroupFilesystem(group, opts?) — idempotent,
  populates groups/<folder>/, .claude-shared/, agent-runner-src/ only when
  paths don't already exist.
- src/container-runner.ts: buildMounts() calls init defensively at the
  top (catches existing groups on first spawn after this change), drops
  the inline settings.json write, skills cpSync loop, and agent-runner-src
  rm-then-copy. Just mounts now.
- src/delivery.ts: create_agent flow uses initGroupFilesystem with
  optional instructions, replacing the inline mkdirSync + writeFileSync.
- container/agent-runner/src/index.ts: drops GLOBAL_CLAUDE_MD reading.
  systemContext.instructions is now only the runtime-generated
  destinations addendum.
- scripts/migrate-group-claude-md.ts: one-shot migration that prepends
  the @-import to existing groups' CLAUDE.md. Skips if global doesn't
  exist or if the @-import is already present (regex match on the @ form
  to avoid false positives from prose mentions of the path).
- groups/main/CLAUDE.md: prepended by the migration.

Existing groups need a one-time wipe of their agent-runner-src/ dir so
init re-populates from current host source — done locally before this
commit. Future host-side updates to container/skills/ or
container/agent-runner/src/ won't auto-propagate; that's the trade-off
for unconditional persistence and will be covered by host-mediated
refresh tools in a follow-up.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 14:17:50 +03:00

364 lines
14 KiB
TypeScript

/**
* Container Runner v2
* Spawns agent containers with session folder + agent group folder mounts.
* The container runs the v2 agent-runner which polls the session DB.
*/
import { ChildProcess, execSync, spawn } from 'child_process';
import fs from 'fs';
import path from 'path';
import { OneCLI } from '@onecli-sh/sdk';
import { CONTAINER_IMAGE, DATA_DIR, GROUPS_DIR, IDLE_TIMEOUT, ONECLI_URL, TIMEZONE } from './config.js';
import { CONTAINER_RUNTIME_BIN, hostGatewayArgs, readonlyMountArgs, stopContainer } from './container-runtime.js';
import { getAgentGroup } from './db/agent-groups.js';
import { getMessagingGroup } from './db/messaging-groups.js';
import { initGroupFilesystem } from './group-init.js';
import { log } from './log.js';
import { validateAdditionalMounts } from './mount-security.js';
import {
markContainerIdle,
markContainerRunning,
markContainerStopped,
sessionDir,
writeDestinations,
writeSessionRouting,
} from './session-manager.js';
import type { AgentGroup, Session } from './types.js';
/**
 * Module-wide OneCLI SDK client, pointed at the configured ONECLI_URL.
 * Used by buildContainerArgs to register agents (ensureAgent) and to apply
 * gateway configuration to the container arguments (applyContainerConfig).
 */
const onecli = new OneCLI({ url: ONECLI_URL });
/** One host-to-container mount, later rendered into runtime arguments by buildContainerArgs. */
interface VolumeMount {
/** Path on the host side of the mount. */
hostPath: string;
/** Path at which the mount appears inside the container. */
containerPath: string;
/** When true the mount is emitted via readonlyMountArgs(); otherwise as a plain `-v host:container`. */
readonly: boolean;
}
/**
 * Active containers tracked by session ID.
 *
 * spawnContainer registers an entry right after spawning the runtime process
 * and also attaches an optional `resetIdle` callback to that entry at runtime
 * (it is not part of the declared value type, so readers access it via a
 * cast). The entry is removed when the child process emits 'close' or 'error'.
 */
const activeContainers = new Map<string, { process: ChildProcess; containerName: string }>();
/**
 * In-flight wake promises, keyed by session id. Deduplicates concurrent
 * `wakeContainer` calls while the first spawn is still mid-setup (async
 * buildContainerArgs, OneCLI gateway apply, etc.) — otherwise a second
 * wake in that window passes the `activeContainers.has` check and spawns
 * a duplicate container against the same session directory, producing
 * racy double-replies.
 *
 * Entries are added by `wakeContainer` and removed in its `.finally`
 * handler once the spawn promise settles (fulfilled or rejected).
 */
const wakePromises = new Map<string, Promise<void>>();
/** Number of sessions that currently have a tracked container. */
export function getActiveContainerCount(): number {
  const { size } = activeContainers;
  return size;
}
/** Whether a container is currently tracked for the given session. */
export function isContainerRunning(sessionId: string): boolean {
  const tracked = activeContainers.has(sessionId);
  return tracked;
}
/**
 * Ensure a container is running for `session`.
 *
 * - Already tracked in `activeContainers`: resolves immediately.
 * - A spawn for the same session is still in progress: the pending promise
 *   is handed back, so concurrent callers share a single spawn.
 * - Otherwise: starts a new spawn and keeps its promise in `wakePromises`
 *   until it settles.
 *
 * The container runs the v2 agent-runner, which polls the session DB.
 */
export function wakeContainer(session: Session): Promise<void> {
  const sessionId = session.id;

  if (activeContainers.has(sessionId)) {
    log.debug('Container already running', { sessionId });
    return Promise.resolve();
  }

  const pending = wakePromises.get(sessionId);
  if (pending) {
    log.debug('Container wake already in-flight — joining existing promise', { sessionId });
    return pending;
  }

  // Drop the in-flight marker whether the spawn succeeds or fails.
  const forget = (): void => {
    wakePromises.delete(sessionId);
  };
  const wake = spawnContainer(session).finally(forget);
  wakePromises.set(sessionId, wake);
  return wake;
}
/**
 * Spawn the container process for a session and register it as active.
 *
 * Steps: look up the agent group (logs an error and returns if it is
 * missing), refresh the destination map and reply routing, build mounts and
 * runtime arguments, spawn the runtime binary, then wire up stderr logging,
 * the idle timer, and exit cleanup.
 *
 * Cleanup is shared by the 'close' and 'error' events and is guarded by entry
 * identity: Node emits both events for a failed spawn, and a late event from
 * this process must not evict (or mark as stopped) a newer container that has
 * since been registered for the same session.
 *
 * @param session Session whose container should be started.
 */
async function spawnContainer(session: Session): Promise<void> {
  const agentGroup = getAgentGroup(session.agent_group_id);
  if (!agentGroup) {
    log.error('Agent group not found', { agentGroupId: session.agent_group_id });
    return;
  }

  // Refresh the destination map and default reply routing so any admin
  // changes take effect on wake.
  writeDestinations(agentGroup.id, session.id);
  writeSessionRouting(agentGroup.id, session.id);

  const mounts = buildMounts(agentGroup, session);
  const containerName = `nanoclaw-v2-${agentGroup.folder}-${Date.now()}`;

  // OneCLI agent identifier is the agent group id. The admin group uses OneCLI's
  // default agent (undefined), so unscoped credentials apply. Non-admin groups
  // use their stable ag-xxx id, which is reversible via getAgentGroup() for
  // approval-request routing.
  const agentIdentifier = agentGroup.is_admin ? undefined : agentGroup.id;
  const args = await buildContainerArgs(mounts, containerName, session, agentGroup, agentIdentifier);

  log.info('Spawning container', { sessionId: session.id, agentGroup: agentGroup.name, containerName });
  const container = spawn(CONTAINER_RUNTIME_BIN, args, { stdio: ['ignore', 'pipe', 'pipe'] });

  // Idle timeout: kill the container after IDLE_TIMEOUT without a reset.
  const scheduleIdleKill = () => setTimeout(() => killContainer(session.id, 'idle timeout'), IDLE_TIMEOUT);
  let idleTimer = scheduleIdleKill();
  const resetIdle = (): void => {
    clearTimeout(idleTimer);
    idleTimer = scheduleIdleKill();
  };

  // `resetIdle` rides along on the entry so resetContainerIdleTimer (called
  // by delivery.ts when new messages_out are detected) can reach it.
  const entry = { process: container, containerName, resetIdle };
  activeContainers.set(session.id, entry);
  markContainerRunning(session.id);

  // Forward container stderr to the debug log, one log record per line.
  container.stderr?.on('data', (data) => {
    for (const line of data.toString().trim().split('\n')) {
      if (line) log.debug(line, { container: agentGroup.folder });
    }
  });

  // stdout is unused in v2 (all IO is via session DB); attach a no-op
  // listener so the pipe keeps draining.
  container.stdout?.on('data', () => {});

  // Shared teardown. Only releases the slot if it still belongs to this
  // process, which also makes a second invocation a no-op.
  const release = (): void => {
    clearTimeout(idleTimer);
    if (activeContainers.get(session.id) !== entry) return;
    activeContainers.delete(session.id);
    markContainerStopped(session.id);
  };

  container.on('close', (code) => {
    release();
    log.info('Container exited', { sessionId: session.id, code, containerName });
  });

  container.on('error', (err) => {
    release();
    log.error('Container spawn error', { sessionId: session.id, err });
  });
}
/**
 * Restart the idle countdown for a session's container (called when
 * messages_out are delivered). Does nothing when the session has no tracked
 * container or the entry carries no reset callback.
 */
export function resetContainerIdleTimer(sessionId: string): void {
  const tracked = activeContainers.get(sessionId) as { resetIdle?: () => void } | undefined;
  if (tracked === undefined) return;

  const { resetIdle } = tracked;
  if (resetIdle) {
    resetIdle();
  }
}
/**
 * Kill the container tracked for a session.
 *
 * Asks the container runtime to stop the container by name; if that throws,
 * the failure is logged and the spawned child process is sent SIGKILL as a
 * fallback. The `activeContainers` entry is not removed here — that happens
 * in the child's exit handlers.
 *
 * @param sessionId Session whose container should be stopped; no-op if none is tracked.
 * @param reason Human-readable reason, included in the log record.
 */
export function killContainer(sessionId: string, reason: string): void {
  const entry = activeContainers.get(sessionId);
  if (!entry) return;

  const { containerName } = entry;
  log.info('Killing container', { sessionId, reason, containerName });
  try {
    stopContainer(containerName);
  } catch (err) {
    // Previously swallowed silently; record why the graceful stop failed
    // before resorting to killing the runtime client process.
    log.warn('Container stop failed — falling back to SIGKILL', { sessionId, containerName, err });
    entry.process.kill('SIGKILL');
  }
}
/**
 * Assemble the list of volume mounts for a session's container.
 *
 * Mount layout (container path ← host source):
 * - `/workspace`            ← session directory (read-write)
 * - `/workspace/agent`      ← the agent group's folder (read-write)
 * - `/workspace/global`     ← `global` groups folder, only if it exists
 *                             (read-only unless the group is admin)
 * - `/home/node/.claude`    ← per-group `.claude-shared` (read-write)
 * - `/app/src`              ← per-group `agent-runner-src` (read-write)
 * - `/workspace/project`    ← project root, admin only (read-only), with
 *                             `.env` masked by `/dev/null` when present
 * - any validated `additionalMounts` from the group's container config
 *
 * All host paths built here go through `path.resolve` so the runtime always
 * receives absolute paths; a relative host path passed to `-v` would be
 * interpreted as a named volume rather than a bind mount.
 *
 * @throws SyntaxError if `agentGroup.container_config` is not valid JSON.
 */
function buildMounts(agentGroup: AgentGroup, session: Session): VolumeMount[] {
  // Per-group filesystem state lives forever after first creation. Init is
  // idempotent: it only writes paths that don't already exist, so this call
  // is a no-op for groups that have spawned before. Pulling in upstream
  // built-in skill or agent-runner source updates is an explicit operation
  // (host-mediated tools), not something the spawn path does silently.
  initGroupFilesystem(agentGroup);

  const mounts: VolumeMount[] = [];
  const projectRoot = process.cwd();
  const sessDir = sessionDir(agentGroup.id, session.id);
  const groupDir = path.resolve(GROUPS_DIR, agentGroup.folder);
  const groupStateDir = path.resolve(DATA_DIR, 'v2-sessions', agentGroup.id);

  // Session folder at /workspace (contains inbound.db, outbound.db, outbox/, .claude/)
  mounts.push({ hostPath: sessDir, containerPath: '/workspace', readonly: false });

  // Agent group folder at /workspace/agent
  mounts.push({ hostPath: groupDir, containerPath: '/workspace/agent', readonly: false });

  // Global memory directory — read-only for non-admin so the @import
  // in each group's CLAUDE.md can resolve it without risk of being
  // overwritten by an agent in some other group.
  const globalDir = path.resolve(GROUPS_DIR, 'global');
  if (fs.existsSync(globalDir)) {
    mounts.push({ hostPath: globalDir, containerPath: '/workspace/global', readonly: !agentGroup.is_admin });
  }

  // Per-group .claude-shared at /home/node/.claude (Claude state, settings,
  // skills — initialized once at group creation, persistent thereafter)
  const claudeDir = path.join(groupStateDir, '.claude-shared');
  mounts.push({ hostPath: claudeDir, containerPath: '/home/node/.claude', readonly: false });

  // Per-group agent-runner source at /app/src (initialized once at group
  // creation, persistent thereafter — agents can modify their runner)
  const groupRunnerDir = path.join(groupStateDir, 'agent-runner-src');
  mounts.push({ hostPath: groupRunnerDir, containerPath: '/app/src', readonly: false });

  // Admin: mount project root read-only
  if (agentGroup.is_admin) {
    mounts.push({ hostPath: projectRoot, containerPath: '/workspace/project', readonly: true });

    // Mask the project's .env inside the container by mounting /dev/null over
    // it (presumably to keep host secrets out of the agent's view — confirm).
    const envFile = path.join(projectRoot, '.env');
    if (fs.existsSync(envFile)) {
      mounts.push({ hostPath: '/dev/null', containerPath: '/workspace/project/.env', readonly: true });
    }
  }

  // Additional mounts from container config
  const containerConfig = agentGroup.container_config ? JSON.parse(agentGroup.container_config) : {};
  if (containerConfig.additionalMounts) {
    const validated = validateAdditionalMounts(
      containerConfig.additionalMounts,
      agentGroup.name,
      !!agentGroup.is_admin,
    );
    mounts.push(...validated);
  }

  return mounts;
}
/**
 * Build the full argument vector for `<runtime> run ...`.
 *
 * Argument order: `run --rm --name`, environment variables, OneCLI gateway
 * configuration (applied in place to `args`), host-gateway flags, optional
 * `--user` mapping, volume mounts, optional MCP server env, `--entrypoint
 * bash`, the image tag, and finally the `-c` bootstrap script.
 *
 * OneCLI failures are logged and tolerated: the container is still started,
 * just without credentials.
 *
 * @param mounts Mounts to render, as produced by buildMounts.
 * @param containerName Name passed to `--name`.
 * @param session Session the container serves.
 * @param agentGroup Agent group the session belongs to.
 * @param agentIdentifier OneCLI agent identifier; `undefined` selects OneCLI's default agent.
 * @returns The argument list to pass to the container runtime binary.
 * @throws SyntaxError if `agentGroup.container_config` is not valid JSON.
 */
async function buildContainerArgs(
  mounts: VolumeMount[],
  containerName: string,
  session: Session,
  agentGroup: AgentGroup,
  agentIdentifier?: string,
): Promise<string[]> {
  const args: string[] = ['run', '--rm', '--name', containerName];

  // Environment
  args.push('-e', `TZ=${TIMEZONE}`);
  args.push('-e', `AGENT_PROVIDER=${session.agent_provider || agentGroup.agent_provider || 'claude'}`);

  // Two-DB split: container reads inbound.db, writes outbound.db
  args.push('-e', 'SESSION_INBOUND_DB_PATH=/workspace/inbound.db');
  args.push('-e', 'SESSION_OUTBOUND_DB_PATH=/workspace/outbound.db');
  args.push('-e', 'SESSION_HEARTBEAT_PATH=/workspace/.heartbeat');

  // Pass admin user ID and assistant name from messaging group/agent group
  if (session.messaging_group_id) {
    const mg = getMessagingGroup(session.messaging_group_id);
    if (mg?.admin_user_id) {
      args.push('-e', `NANOCLAW_ADMIN_USER_ID=${mg.admin_user_id}`);
    }
  }
  if (agentGroup.name) {
    args.push('-e', `NANOCLAW_ASSISTANT_NAME=${agentGroup.name}`);
  }
  args.push('-e', `NANOCLAW_AGENT_GROUP_ID=${agentGroup.id}`);
  args.push('-e', `NANOCLAW_AGENT_GROUP_NAME=${agentGroup.name}`);
  args.push('-e', `NANOCLAW_IS_ADMIN=${agentGroup.is_admin ? '1' : '0'}`);

  // OneCLI gateway — injects HTTPS_PROXY + certs so container API calls
  // are routed through the agent vault for credential injection.
  // Must ensureAgent first for non-admin groups, otherwise applyContainerConfig
  // rejects the unknown agent identifier and returns false.
  try {
    if (agentIdentifier) {
      await onecli.ensureAgent({ name: agentGroup.name, identifier: agentIdentifier });
    }
    const onecliApplied = await onecli.applyContainerConfig(args, { addHostMapping: false, agent: agentIdentifier });
    if (onecliApplied) {
      log.info('OneCLI gateway applied', { containerName });
    } else {
      log.warn('OneCLI gateway not applied — container will have no credentials', { containerName });
    }
  } catch (err) {
    log.warn('OneCLI gateway error — container will have no credentials', { containerName, err });
  }

  // Host gateway
  args.push(...hostGatewayArgs());

  // User mapping: only when the host uid is known and is neither root (0)
  // nor 1000; HOME is pinned to /home/node in that case.
  const hostUid = process.getuid?.();
  const hostGid = process.getgid?.();
  if (hostUid != null && hostUid !== 0 && hostUid !== 1000) {
    args.push('--user', `${hostUid}:${hostGid}`);
    args.push('-e', 'HOME=/home/node');
  }

  // Volume mounts
  for (const mount of mounts) {
    if (mount.readonly) {
      args.push(...readonlyMountArgs(mount.hostPath, mount.containerPath));
    } else {
      args.push('-v', `${mount.hostPath}:${mount.containerPath}`);
    }
  }

  // Pass additional MCP servers from container config
  const containerConfig = agentGroup.container_config ? JSON.parse(agentGroup.container_config) : {};
  if (containerConfig.mcpServers && Object.keys(containerConfig.mcpServers).length > 0) {
    args.push('-e', `NANOCLAW_MCP_SERVERS=${JSON.stringify(containerConfig.mcpServers)}`);
  }

  // Override entrypoint: compile agent-runner source, run v2 entry point (no stdin)
  args.push('--entrypoint', 'bash');

  // Use per-agent-group image if one has been built, otherwise base image
  const imageTag = containerConfig.imageTag || CONTAINER_IMAGE;
  args.push(imageTag);

  // tsc writes diagnostics to stdout, which the host discards; only stderr is
  // logged. `1>&2` sends them to stderr. (The previous `2>&1 >&2` was applied
  // left-to-right by bash and left both streams on stdout, so compile errors
  // vanished and the container just exited non-zero.)
  args.push(
    '-c',
    'cd /app && npx tsc --outDir /tmp/dist 1>&2 && ln -sf /app/node_modules /tmp/dist/node_modules && node /tmp/dist/index.js',
  );

  return args;
}
/**
 * Conservative allowlist for one apt/npm package spec. Permits names with
 * optional scope, version, arch or release suffixes (e.g. `curl`,
 * `libfoo-dev=1.2-3`, `@scope/pkg@^1.0.0`) while rejecting whitespace, shell
 * metacharacters, and a leading `-` (which would be parsed as an option).
 */
const SAFE_PACKAGE_SPEC = /^[A-Za-z0-9@][A-Za-z0-9@\/._:=+~^-]*$/;

/** Throw if any spec could alter the generated Dockerfile `RUN` command line. */
function assertSafePackageSpecs(kind: 'apt' | 'npm', specs: readonly unknown[]): void {
  for (const spec of specs) {
    if (typeof spec !== 'string' || !SAFE_PACKAGE_SPEC.test(spec)) {
      throw new Error(`Refusing to build image: invalid ${kind} package name ${JSON.stringify(spec)}`);
    }
  }
}

/**
 * Build a per-agent-group Docker image with custom packages.
 *
 * Reads `packages.apt` / `packages.npm` from the group's container config,
 * generates a Dockerfile on top of CONTAINER_IMAGE that installs them as
 * root and then switches back to the `node` user, builds it with the
 * container runtime, and stores the resulting tag as `imageTag` in the
 * group's container config.
 *
 * Package names are interpolated into a shell `RUN` line executed as root
 * during the build, so each one is validated against an allowlist first.
 *
 * @param agentGroupId Id of the agent group to build an image for.
 * @throws Error if the group does not exist, no packages are configured, or a
 *   package name fails validation; also propagates build failures from
 *   `execSync` and JSON parse errors from a malformed container config.
 */
export async function buildAgentGroupImage(agentGroupId: string): Promise<void> {
  const agentGroup = getAgentGroup(agentGroupId);
  if (!agentGroup) throw new Error('Agent group not found');

  const containerConfig = agentGroup.container_config ? JSON.parse(agentGroup.container_config) : {};
  const packages = containerConfig.packages || { apt: [], npm: [] };
  const aptPackages = (packages.apt || []) as string[];
  const npmPackages = (packages.npm || []) as string[];

  if (aptPackages.length === 0 && npmPackages.length === 0) {
    throw new Error('No packages to install. Use install_packages first.');
  }

  // Validate before anything is written to disk or handed to a shell.
  assertSafePackageSpecs('apt', aptPackages);
  assertSafePackageSpecs('npm', npmPackages);

  let dockerfile = `FROM ${CONTAINER_IMAGE}\nUSER root\n`;
  if (aptPackages.length > 0) {
    dockerfile += `RUN apt-get update && apt-get install -y ${aptPackages.join(' ')} && rm -rf /var/lib/apt/lists/*\n`;
  }
  if (npmPackages.length > 0) {
    dockerfile += `RUN npm install -g ${npmPackages.join(' ')}\n`;
  }
  dockerfile += 'USER node\n';

  const imageTag = `nanoclaw-agent:${agentGroupId}`;
  log.info('Building per-agent-group image', { agentGroupId, imageTag, apt: aptPackages, npm: npmPackages });

  // Write Dockerfile to temp file and build; the temp file is removed
  // whether or not the build succeeds.
  const tmpDockerfile = path.join(DATA_DIR, `Dockerfile.${agentGroupId}`);
  fs.writeFileSync(tmpDockerfile, dockerfile);
  try {
    execSync(`${CONTAINER_RUNTIME_BIN} build -t ${imageTag} -f ${tmpDockerfile} .`, {
      cwd: DATA_DIR,
      stdio: 'pipe',
      timeout: 300_000,
    });
  } finally {
    fs.unlinkSync(tmpDockerfile);
  }

  // Store the image tag in container_config
  containerConfig.imageTag = imageTag;
  const { updateAgentGroup } = await import('./db/agent-groups.js');
  updateAgentGroup(agentGroupId, { container_config: JSON.stringify(containerConfig) });

  log.info('Per-agent-group image built', { agentGroupId, imageTag });
}