fix(container): scope orphan reaper by install label so peers don't kill each other
Two installs on the same host could trash each other's containers: the reaper used `docker ps --filter name=nanoclaw-`, a substring match that picked up every install's containers. A crash-looping peer (e.g. a legacy v1 plist respawning ~6k times) would call cleanupOrphans on every boot and kill the healthy install's session containers within seconds of spawn.

- Stamp `--label nanoclaw-install=<slug>` onto every spawned container.
- cleanupOrphans filters by that label; healthy peers are left alone.
- Setup preflight enumerates `com.nanoclaw*` launchd plists / nanoclaw user systemd units, probes state/runs, and unloads any that are crash-looping (state != running AND runs > 10) before installing this install's service.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
186
setup/peer-cleanup.ts
Normal file
186
setup/peer-cleanup.ts
Normal file
@@ -0,0 +1,186 @@
|
|||||||
|
/**
|
||||||
|
* Detect and clean up unhealthy NanoClaw peer services.
|
||||||
|
*
|
||||||
|
* Runs as a setup preflight before we install our own service. A crash-looping
|
||||||
|
* peer install (typically the legacy v1 `com.nanoclaw` plist) silently trashes
|
||||||
|
* this install's containers on every respawn because its `cleanupOrphans()`
|
||||||
|
* reaps anything matching `nanoclaw-`. We scope our reaper by label now, but
|
||||||
|
* we still need to stop the peer from killing us on its way down.
|
||||||
|
*
|
||||||
|
* A peer is "unhealthy" when:
|
||||||
|
* - launchd: `state != running` AND `runs > UNHEALTHY_RUNS_THRESHOLD`
|
||||||
|
* - systemd: unit is in `failed` state, OR not `active` with more than UNHEALTHY_RUNS_THRESHOLD restarts
|
||||||
|
*
|
||||||
|
* Healthy peers are left alone — multiple installs can coexist fine now that
|
||||||
|
* container-reaper is label-scoped.
|
||||||
|
*/
|
||||||
|
import { execFileSync } from 'child_process';
|
||||||
|
import fs from 'fs';
|
||||||
|
import os from 'os';
|
||||||
|
import path from 'path';
|
||||||
|
|
||||||
|
import { getLaunchdLabel, getSystemdUnit } from '../src/install-slug.js';
|
||||||
|
import { log } from '../src/log.js';
|
||||||
|
|
||||||
|
// Run/restart count a peer must exceed, while not in the running/active state,
// before the probes below classify it as crash-looping.
const UNHEALTHY_RUNS_THRESHOLD = 10;
|
||||||
|
|
||||||
|
/** Result of probing a single peer service once. */
export interface PeerStatus {
  /** launchd label (plist basename) or systemd unit name without `.service`. */
  label: string;
  /** Path of the plist or unit file the peer was discovered from. */
  configPath: string;
  /** launchd `state` or systemd `ActiveState`; `'unknown'` if it was not reported. */
  state: string;
  /** launchd `runs` or systemd `NRestarts`; 0 if it was not reported. */
  runs: number;
  /** True when the probe judged the peer to be crash-looping or failed. */
  unhealthy: boolean;
}
|
||||||
|
|
||||||
|
/** Summary of one peer-cleanup pass. */
export interface PeerCleanupResult {
  /** Every peer that was successfully probed, healthy or not. */
  checked: PeerStatus[];
  /** Unhealthy peers whose service was successfully unloaded/disabled. */
  unloaded: PeerStatus[];
  /** Unhealthy peers that could not be unloaded, with the error message. */
  failures: { label: string; err: string }[];
}
|
||||||
|
|
||||||
|
/**
 * Peer preflight entry point: run the service-manager-specific scan for the
 * current OS and hand back its report.
 *
 * @param projectRoot Root of this install, forwarded to the platform scan so
 *   it can recognise and skip our own service. Defaults to the current
 *   working directory.
 * @returns The peers probed, the peers unloaded, and any unload failures.
 *   All three lists are empty on platforms other than macOS and Linux.
 */
export function cleanupUnhealthyPeers(projectRoot: string = process.cwd()): PeerCleanupResult {
  switch (os.platform()) {
    case 'darwin':
      return cleanupLaunchdPeers(projectRoot);
    case 'linux':
      return cleanupSystemdPeers(projectRoot);
    default:
      // No supported service manager: nothing to scan.
      return { checked: [], unloaded: [], failures: [] };
  }
}
|
||||||
|
|
||||||
|
// ---- launchd (macOS) --------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * macOS scan: probe every `com.nanoclaw*.plist` in ~/Library/LaunchAgents
 * except our own and `launchctl unload` those the probe flags as unhealthy.
 *
 * @param projectRoot Root of this install; used to derive our own launchd label.
 * @returns Report of probed peers, unloaded peers, and unload failures.
 */
function cleanupLaunchdPeers(projectRoot: string): PeerCleanupResult {
  const ownLabel = getLaunchdLabel(projectRoot);
  const launchAgentsDir = path.join(os.homedir(), 'Library', 'LaunchAgents');
  const report: PeerCleanupResult = { checked: [], unloaded: [], failures: [] };

  let entries: string[];
  try {
    entries = fs.readdirSync(launchAgentsDir);
  } catch {
    // Directory missing or unreadable: there is nothing to scan.
    return report;
  }

  const uid = process.getuid?.() ?? 0;

  for (const fileName of entries) {
    if (!/^com\.nanoclaw.*\.plist$/.test(fileName)) continue;

    const plistPath = path.join(launchAgentsDir, fileName);
    const label = path.basename(plistPath, '.plist');
    // Never touch the service belonging to this install.
    if (label === ownLabel) continue;

    const peer = probeLaunchdPeer(label, plistPath, uid);
    if (peer === null) continue;

    report.checked.push(peer);
    if (!peer.unhealthy) continue;

    try {
      execFileSync('launchctl', ['unload', plistPath], { stdio: 'pipe' });
      log.info('Unloaded unhealthy peer launchd service', {
        label,
        state: peer.state,
        runs: peer.runs,
        plistPath,
      });
      report.unloaded.push(peer);
    } catch (err) {
      // Best effort: record the failure and keep going with the other peers.
      const message = err instanceof Error ? err.message : String(err);
      log.warn('Failed to unload peer launchd service', { label, err: message });
      report.failures.push({ label, err: message });
    }
  }

  return report;
}
|
||||||
|
|
||||||
|
/**
 * Query launchd for one peer service and classify it.
 *
 * @param label launchd label of the peer.
 * @param plistPath Path of the peer's plist; reported back as `configPath`.
 * @param uid User id used to build the `gui/<uid>/<label>` service target.
 * @returns The peer's status, or `null` when `launchctl print` fails (treated
 *   as "service not loaded", i.e. not currently a threat).
 */
function probeLaunchdPeer(label: string, plistPath: string, uid: number): PeerStatus | null {
  const serviceTarget = `gui/${uid}/${label}`;

  let printed: string;
  try {
    printed = execFileSync('launchctl', ['print', serviceTarget], {
      stdio: ['ignore', 'pipe', 'pipe'],
      encoding: 'utf-8',
    });
  } catch {
    // Not loaded → not currently a threat. Skip silently.
    return null;
  }

  // First `state = …` / `runs = …` line in the dump wins.
  const stateMatch = /^\s*state\s*=\s*(.+?)\s*$/m.exec(printed);
  const runsMatch = /^\s*runs\s*=\s*(\d+)/m.exec(printed);

  const state = stateMatch?.[1] ?? 'unknown';
  const runs = parseInt(runsMatch?.[1] ?? '0', 10);

  const isRunning = state === 'running';
  const exceededThreshold = runs > UNHEALTHY_RUNS_THRESHOLD;

  return {
    label,
    configPath: plistPath,
    state,
    runs,
    unhealthy: !isRunning && exceededThreshold,
  };
}
|
||||||
|
|
||||||
|
// ---- systemd (Linux) --------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Linux scan: probe every `nanoclaw*.service` user unit in
 * ~/.config/systemd/user except our own and `systemctl --user disable --now`
 * those the probe flags as unhealthy.
 *
 * @param projectRoot Root of this install; used to derive our own unit name.
 * @returns Report of probed peers, disabled peers, and disable failures.
 */
function cleanupSystemdPeers(projectRoot: string): PeerCleanupResult {
  const ownUnit = getSystemdUnit(projectRoot);
  const userUnitDir = path.join(os.homedir(), '.config', 'systemd', 'user');
  const report: PeerCleanupResult = { checked: [], unloaded: [], failures: [] };

  let entries: string[];
  try {
    entries = fs.readdirSync(userUnitDir);
  } catch {
    // Directory missing or unreadable: there is nothing to scan.
    return report;
  }

  for (const fileName of entries) {
    if (!/^nanoclaw.*\.service$/.test(fileName)) continue;

    // Unit names are compared and reported without the `.service` suffix.
    const unit = fileName.replace(/\.service$/, '');
    if (unit === ownUnit) continue;

    const peer = probeSystemdPeer(unit);
    if (peer === null) continue;

    report.checked.push(peer);
    if (!peer.unhealthy) continue;

    try {
      execFileSync('systemctl', ['--user', 'disable', '--now', `${unit}.service`], { stdio: 'pipe' });
      log.info('Disabled unhealthy peer systemd unit', {
        unit,
        state: peer.state,
        runs: peer.runs,
      });
      report.unloaded.push(peer);
    } catch (err) {
      // Best effort: record the failure and keep going with the other peers.
      const message = err instanceof Error ? err.message : String(err);
      log.warn('Failed to disable peer systemd unit', { unit, err: message });
      report.failures.push({ label: unit, err: message });
    }
  }

  return report;
}
|
||||||
|
|
||||||
|
/**
 * Query systemd for one peer user unit and classify it.
 *
 * A unit is unhealthy when its ActiveState is `failed`, or when it is in any
 * state other than `active` and NRestarts exceeds UNHEALTHY_RUNS_THRESHOLD.
 *
 * @param unit Unit name without the `.service` suffix.
 * @returns The peer's status, or `null` when `systemctl show` fails.
 */
function probeSystemdPeer(unit: string): PeerStatus | null {
  const configPath = path.join(os.homedir(), '.config', 'systemd', 'user', `${unit}.service`);

  let shown: string;
  try {
    shown = execFileSync(
      'systemctl',
      ['--user', 'show', '--property=ActiveState,NRestarts', `${unit}.service`],
      { stdio: ['ignore', 'pipe', 'pipe'], encoding: 'utf-8' },
    );
  } catch {
    return null;
  }

  // Output is `Key=Value` lines; missing keys fall back to 'unknown' / 0.
  const state = /^ActiveState=(.+)$/m.exec(shown)?.[1]?.trim() ?? 'unknown';
  const runs = parseInt(/^NRestarts=(\d+)/m.exec(shown)?.[1] ?? '0', 10);

  let unhealthy: boolean;
  if (state === 'failed') {
    unhealthy = true;
  } else {
    unhealthy = state !== 'active' && runs > UNHEALTHY_RUNS_THRESHOLD;
  }

  return { label: unit, configPath, state, runs, unhealthy };
}
|
||||||
@@ -11,6 +11,7 @@ import path from 'path';
|
|||||||
|
|
||||||
import { log } from '../src/log.js';
|
import { log } from '../src/log.js';
|
||||||
import { getLaunchdLabel, getSystemdUnit } from '../src/install-slug.js';
|
import { getLaunchdLabel, getSystemdUnit } from '../src/install-slug.js';
|
||||||
|
import { cleanupUnhealthyPeers } from './peer-cleanup.js';
|
||||||
import {
|
import {
|
||||||
commandExists,
|
commandExists,
|
||||||
getPlatform,
|
getPlatform,
|
||||||
@@ -53,6 +54,19 @@ export async function run(_args: string[]): Promise<void> {
|
|||||||
|
|
||||||
fs.mkdirSync(path.join(projectRoot, 'logs'), { recursive: true });
|
fs.mkdirSync(path.join(projectRoot, 'logs'), { recursive: true });
|
||||||
|
|
||||||
|
// Peer preflight — a crash-looping peer install (most often the legacy v1
|
||||||
|
// `com.nanoclaw` plist) will keep trashing this install's containers on
|
||||||
|
// every respawn via its own cleanupOrphans. Detect and unload any peer
|
||||||
|
// that's unhealthy before we install our service. Healthy peers are left
|
||||||
|
// alone now that container reaping is install-label-scoped.
|
||||||
|
const peerReport = cleanupUnhealthyPeers(projectRoot);
|
||||||
|
if (peerReport.unloaded.length > 0) {
|
||||||
|
log.warn('Unloaded unhealthy peer NanoClaw services', {
|
||||||
|
count: peerReport.unloaded.length,
|
||||||
|
labels: peerReport.unloaded.map((p) => p.label),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
if (platform === 'macos') {
|
if (platform === 'macos') {
|
||||||
setupLaunchd(projectRoot, nodePath, homeDir);
|
setupLaunchd(projectRoot, nodePath, homeDir);
|
||||||
} else if (platform === 'linux') {
|
} else if (platform === 'linux') {
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import os from 'os';
|
|||||||
import path from 'path';
|
import path from 'path';
|
||||||
|
|
||||||
import { readEnvFile } from './env.js';
|
import { readEnvFile } from './env.js';
|
||||||
import { getContainerImageBase, getDefaultContainerImage } from './install-slug.js';
|
import { getContainerImageBase, getDefaultContainerImage, getInstallSlug } from './install-slug.js';
|
||||||
import { isValidTimezone } from './timezone.js';
|
import { isValidTimezone } from './timezone.js';
|
||||||
|
|
||||||
// Read config values from .env (falls back to process.env).
|
// Read config values from .env (falls back to process.env).
|
||||||
@@ -27,6 +27,10 @@ export const DATA_DIR = path.resolve(PROJECT_ROOT, 'data');
|
|||||||
// `nanoclaw-agent:latest` and clobber each other on rebuild.
|
// `nanoclaw-agent:latest` and clobber each other on rebuild.
|
||||||
export const CONTAINER_IMAGE_BASE = process.env.CONTAINER_IMAGE_BASE || getContainerImageBase(PROJECT_ROOT);
|
export const CONTAINER_IMAGE_BASE = process.env.CONTAINER_IMAGE_BASE || getContainerImageBase(PROJECT_ROOT);
|
||||||
export const CONTAINER_IMAGE = process.env.CONTAINER_IMAGE || getDefaultContainerImage(PROJECT_ROOT);
|
export const CONTAINER_IMAGE = process.env.CONTAINER_IMAGE || getDefaultContainerImage(PROJECT_ROOT);
|
||||||
|
// Install slug — stamped onto every spawned container via --label so
|
||||||
|
// cleanupOrphans only reaps containers from this install, not peers.
|
||||||
|
export const INSTALL_SLUG = getInstallSlug(PROJECT_ROOT);
|
||||||
|
export const CONTAINER_INSTALL_LABEL = `nanoclaw-install=${INSTALL_SLUG}`;
|
||||||
export const CONTAINER_TIMEOUT = parseInt(process.env.CONTAINER_TIMEOUT || '1800000', 10);
|
export const CONTAINER_TIMEOUT = parseInt(process.env.CONTAINER_TIMEOUT || '1800000', 10);
|
||||||
export const CONTAINER_MAX_OUTPUT_SIZE = parseInt(process.env.CONTAINER_MAX_OUTPUT_SIZE || '10485760', 10); // 10MB default
|
export const CONTAINER_MAX_OUTPUT_SIZE = parseInt(process.env.CONTAINER_MAX_OUTPUT_SIZE || '10485760', 10); // 10MB default
|
||||||
export const ONECLI_URL = process.env.ONECLI_URL || envConfig.ONECLI_URL;
|
export const ONECLI_URL = process.env.ONECLI_URL || envConfig.ONECLI_URL;
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import { OneCLI } from '@onecli-sh/sdk';
|
|||||||
import {
|
import {
|
||||||
CONTAINER_IMAGE,
|
CONTAINER_IMAGE,
|
||||||
CONTAINER_IMAGE_BASE,
|
CONTAINER_IMAGE_BASE,
|
||||||
|
CONTAINER_INSTALL_LABEL,
|
||||||
DATA_DIR,
|
DATA_DIR,
|
||||||
GROUPS_DIR,
|
GROUPS_DIR,
|
||||||
ONECLI_API_KEY,
|
ONECLI_API_KEY,
|
||||||
@@ -389,7 +390,7 @@ async function buildContainerArgs(
|
|||||||
providerContribution: ProviderContainerContribution,
|
providerContribution: ProviderContainerContribution,
|
||||||
agentIdentifier?: string,
|
agentIdentifier?: string,
|
||||||
): Promise<string[]> {
|
): Promise<string[]> {
|
||||||
const args: string[] = ['run', '--rm', '--name', containerName];
|
const args: string[] = ['run', '--rm', '--name', containerName, '--label', CONTAINER_INSTALL_LABEL];
|
||||||
|
|
||||||
// Environment — only vars read by code we don't own.
|
// Environment — only vars read by code we don't own.
|
||||||
// Everything NanoClaw-specific is in container.json (read by runner at startup).
|
// Everything NanoClaw-specific is in container.json (read by runner at startup).
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ import {
|
|||||||
ensureContainerRuntimeRunning,
|
ensureContainerRuntimeRunning,
|
||||||
cleanupOrphans,
|
cleanupOrphans,
|
||||||
} from './container-runtime.js';
|
} from './container-runtime.js';
|
||||||
|
import { CONTAINER_INSTALL_LABEL } from './config.js';
|
||||||
import { log } from './log.js';
|
import { log } from './log.js';
|
||||||
|
|
||||||
beforeEach(() => {
|
beforeEach(() => {
|
||||||
@@ -84,6 +85,17 @@ describe('ensureContainerRuntimeRunning', () => {
|
|||||||
// --- cleanupOrphans ---
|
// --- cleanupOrphans ---
|
||||||
|
|
||||||
describe('cleanupOrphans', () => {
|
describe('cleanupOrphans', () => {
|
||||||
|
it('filters ps by the install label so peers are not reaped', () => {
|
||||||
|
mockExecSync.mockReturnValueOnce('');
|
||||||
|
|
||||||
|
cleanupOrphans();
|
||||||
|
|
||||||
|
expect(mockExecSync).toHaveBeenCalledWith(
|
||||||
|
`${CONTAINER_RUNTIME_BIN} ps --filter label=${CONTAINER_INSTALL_LABEL} --format '{{.Names}}'`,
|
||||||
|
expect.any(Object),
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
it('stops orphaned nanoclaw containers', () => {
|
it('stops orphaned nanoclaw containers', () => {
|
||||||
// docker ps returns container names, one per line
|
// docker ps returns container names, one per line
|
||||||
mockExecSync.mockReturnValueOnce('nanoclaw-group1-111\nnanoclaw-group2-222\n');
|
mockExecSync.mockReturnValueOnce('nanoclaw-group1-111\nnanoclaw-group2-222\n');
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
import { execSync } from 'child_process';
|
import { execSync } from 'child_process';
|
||||||
import os from 'os';
|
import os from 'os';
|
||||||
|
|
||||||
|
import { CONTAINER_INSTALL_LABEL } from './config.js';
|
||||||
import { log } from './log.js';
|
import { log } from './log.js';
|
||||||
|
|
||||||
/** The container runtime binary name. */
|
/** The container runtime binary name. */
|
||||||
@@ -56,13 +57,22 @@ export function ensureContainerRuntimeRunning(): void {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Kill orphaned NanoClaw containers from previous runs. */
|
/**
|
||||||
|
* Kill orphaned NanoClaw containers from THIS install's previous runs.
|
||||||
|
*
|
||||||
|
* Scoped by label `nanoclaw-install=<slug>` so a crash-looping peer install
|
||||||
|
* cannot reap our containers, and we cannot reap theirs. The label is
|
||||||
|
* stamped onto every container at spawn time — see container-runner.ts.
|
||||||
|
*/
|
||||||
export function cleanupOrphans(): void {
|
export function cleanupOrphans(): void {
|
||||||
try {
|
try {
|
||||||
const output = execSync(`${CONTAINER_RUNTIME_BIN} ps --filter name=nanoclaw- --format '{{.Names}}'`, {
|
const output = execSync(
|
||||||
stdio: ['pipe', 'pipe', 'pipe'],
|
`${CONTAINER_RUNTIME_BIN} ps --filter label=${CONTAINER_INSTALL_LABEL} --format '{{.Names}}'`,
|
||||||
encoding: 'utf-8',
|
{
|
||||||
});
|
stdio: ['pipe', 'pipe', 'pipe'],
|
||||||
|
encoding: 'utf-8',
|
||||||
|
},
|
||||||
|
);
|
||||||
const orphans = output.trim().split('\n').filter(Boolean);
|
const orphans = output.trim().split('\n').filter(Boolean);
|
||||||
for (const name of orphans) {
|
for (const name of orphans) {
|
||||||
try {
|
try {
|
||||||
|
|||||||
Reference in New Issue
Block a user