fix(container): scope orphan reaper by install label so peers don't kill each other

Two installs on the same host could trash each other's containers: the
reaper used `docker ps --filter name=nanoclaw-`, a substring match that
picked up every install's containers. A crash-looping peer (e.g. a legacy
v1 plist respawning ~6k times) would call cleanupOrphans on every boot and
kill the healthy install's session containers within seconds of spawn.

- Stamp `--label nanoclaw-install=<slug>` onto every spawned container.
- cleanupOrphans filters by that label; healthy peers are left alone.
- Setup preflight enumerates `com.nanoclaw*` launchd plists and nanoclaw
  user systemd units, probes each one's state and run count, and unloads
  any that are crash-looping (state != running AND runs > 10) before
  installing this install's service.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Lazer Cohen
2026-04-23 12:12:30 +03:00
parent 990d243dbd
commit 2383bde80f
6 changed files with 234 additions and 7 deletions

View File

@@ -2,7 +2,7 @@ import os from 'os';
import path from 'path';
import { readEnvFile } from './env.js';
import { getContainerImageBase, getDefaultContainerImage } from './install-slug.js';
import { getContainerImageBase, getDefaultContainerImage, getInstallSlug } from './install-slug.js';
import { isValidTimezone } from './timezone.js';
// Read config values from .env (falls back to process.env).
@@ -27,6 +27,10 @@ export const DATA_DIR = path.resolve(PROJECT_ROOT, 'data');
// `nanoclaw-agent:latest` and clobber each other on rebuild.
export const CONTAINER_IMAGE_BASE = process.env.CONTAINER_IMAGE_BASE || getContainerImageBase(PROJECT_ROOT);
export const CONTAINER_IMAGE = process.env.CONTAINER_IMAGE || getDefaultContainerImage(PROJECT_ROOT);
// Install slug — derived from PROJECT_ROOT by getInstallSlug. It is stamped
// onto every spawned container via --label so cleanupOrphans only reaps
// containers from this install, not peers.
export const INSTALL_SLUG = getInstallSlug(PROJECT_ROOT);
// Full `key=value` label string. The same string is passed to `--label` at
// spawn time and to `ps --filter label=` when reaping, so the two always agree.
export const CONTAINER_INSTALL_LABEL = `nanoclaw-install=${INSTALL_SLUG}`;
export const CONTAINER_TIMEOUT = parseInt(process.env.CONTAINER_TIMEOUT || '1800000', 10);
export const CONTAINER_MAX_OUTPUT_SIZE = parseInt(process.env.CONTAINER_MAX_OUTPUT_SIZE || '10485760', 10); // 10MB default
export const ONECLI_URL = process.env.ONECLI_URL || envConfig.ONECLI_URL;

View File

@@ -12,6 +12,7 @@ import { OneCLI } from '@onecli-sh/sdk';
import {
CONTAINER_IMAGE,
CONTAINER_IMAGE_BASE,
CONTAINER_INSTALL_LABEL,
DATA_DIR,
GROUPS_DIR,
ONECLI_API_KEY,
@@ -389,7 +390,7 @@ async function buildContainerArgs(
providerContribution: ProviderContainerContribution,
agentIdentifier?: string,
): Promise<string[]> {
const args: string[] = ['run', '--rm', '--name', containerName];
const args: string[] = ['run', '--rm', '--name', containerName, '--label', CONTAINER_INSTALL_LABEL];
// Environment — only vars read by code we don't own.
// Everything NanoClaw-specific is in container.json (read by runner at startup).

View File

@@ -24,6 +24,7 @@ import {
ensureContainerRuntimeRunning,
cleanupOrphans,
} from './container-runtime.js';
import { CONTAINER_INSTALL_LABEL } from './config.js';
import { log } from './log.js';
beforeEach(() => {
@@ -84,6 +85,17 @@ describe('ensureContainerRuntimeRunning', () => {
// --- cleanupOrphans ---
describe('cleanupOrphans', () => {
// Regression guard for peer-install isolation: the container listing must be
// filtered by this install's label rather than by container name.
it('filters ps by the install label so peers are not reaped', () => {
// The mocked call returns an empty listing, so no containers are stopped and
// only the ps invocation itself is under test.
mockExecSync.mockReturnValueOnce('');
cleanupOrphans();
expect(mockExecSync).toHaveBeenCalledWith(
`${CONTAINER_RUNTIME_BIN} ps --filter label=${CONTAINER_INSTALL_LABEL} --format '{{.Names}}'`,
// Exec options are not pinned by this test; any object is accepted.
expect.any(Object),
);
});
it('stops orphaned nanoclaw containers', () => {
// docker ps returns container names, one per line
mockExecSync.mockReturnValueOnce('nanoclaw-group1-111\nnanoclaw-group2-222\n');

View File

@@ -5,6 +5,7 @@
import { execSync } from 'child_process';
import os from 'os';
import { CONTAINER_INSTALL_LABEL } from './config.js';
import { log } from './log.js';
/** The container runtime binary name. */
@@ -56,13 +57,22 @@ export function ensureContainerRuntimeRunning(): void {
}
}
/** Kill orphaned NanoClaw containers from previous runs. */
/**
* Kill orphaned NanoClaw containers from THIS install's previous runs.
*
* Scoped by label `nanoclaw-install=<slug>` so a crash-looping peer install
* cannot reap our containers, and we cannot reap theirs. The label is
* stamped onto every container at spawn time — see container-runner.ts.
*/
export function cleanupOrphans(): void {
try {
const output = execSync(`${CONTAINER_RUNTIME_BIN} ps --filter name=nanoclaw- --format '{{.Names}}'`, {
stdio: ['pipe', 'pipe', 'pipe'],
encoding: 'utf-8',
});
const output = execSync(
`${CONTAINER_RUNTIME_BIN} ps --filter label=${CONTAINER_INSTALL_LABEL} --format '{{.Names}}'`,
{
stdio: ['pipe', 'pipe', 'pipe'],
encoding: 'utf-8',
},
);
const orphans = output.trim().split('\n').filter(Boolean);
for (const name of orphans) {
try {