fix(setup): auto-recover from stale docker group mid-session
- container: install Docker via setup/install-docker.sh when missing, distinguish socket EACCES from daemon-down so we bail fast instead of polling for 60s, and re-exec the step under `sg docker` when usermod hasn't reached the current shell.
- auto: after the container step, re-exec the whole driver under `sg docker` (with a NANOCLAW_REEXEC_SG guard) so onecli/service/verify also get docker-group access without a re-login. Surface the new docker_group_not_active error from the container step.
- service: when the systemd user manager has a stale group list, auto-apply `sudo setfacl -m u:$USER:rw /var/run/docker.sock` so the service can start without waiting for the next login.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -71,6 +71,33 @@ function runStep(name: string, extra: string[] = []): Promise<StepResult> {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * After installing Docker, this process's supplementary groups are still
 * frozen from login — subsequent steps that talk to /var/run/docker.sock
 * (onecli install, service start, …) fail with EACCES even though the
 * daemon is up. Detect that and re-exec the whole driver under `sg docker`
 * so the rest of the run inherits the docker group without a re-login.
 *
 * Returns normally (doing nothing) when no re-exec is needed or possible:
 * already re-exec'd, not on Linux, `docker info` succeeds, the failure is
 * not a permission error, or `sg` is not on PATH. Otherwise it never
 * returns: the current process exits with the re-exec'd child's exit code
 * (or 1 when no exit code is available).
 */
function maybeReexecUnderSg(): void {
  // The re-exec'd child carries this marker so it cannot recurse into
  // another `sg docker` re-exec.
  if (process.env.NANOCLAW_REEXEC_SG === '1') return;
  if (process.platform !== 'linux') return;

  const info = spawnSync('docker', ['info'], { encoding: 'utf-8' });
  if (info.status === 0) return;

  // Inspect both streams; they are null when the command could not be spawned.
  const err = `${info.stderr ?? ''}\n${info.stdout ?? ''}`;
  if (!/permission denied/i.test(err)) return;
  if (spawnSync('which', ['sg'], { stdio: 'ignore' }).status !== 0) return;

  console.log(
    '\n[setup:auto] Docker socket not accessible in current group — ' +
      're-executing under `sg docker` to pick up new group membership.',
  );
  const res = spawnSync('sg', ['docker', '-c', 'pnpm run setup:auto'], {
    stdio: 'inherit',
    env: { ...process.env, NANOCLAW_REEXEC_SG: '1' },
  });

  // Without this, a failed launch or a signal-killed child would surface
  // only as a bare exit code 1 with no explanation.
  if (res.error) {
    console.error(`[setup:auto] Failed to run \`sg docker\`: ${res.error.message}`);
  } else if (res.status === null && res.signal) {
    console.error(`[setup:auto] Re-executed setup was terminated by ${res.signal}.`);
  }
  process.exit(res.status ?? 1);
}
|
||||||
|
|
||||||
function anthropicSecretExists(): boolean {
|
function anthropicSecretExists(): boolean {
|
||||||
try {
|
try {
|
||||||
const res = spawnSync('onecli', ['secrets', 'list'], {
|
const res = spawnSync('onecli', ['secrets', 'list'], {
|
||||||
@@ -132,11 +159,18 @@ async function main(): Promise<void> {
|
|||||||
'Install Docker Desktop or start it manually, then retry.',
|
'Install Docker Desktop or start it manually, then retry.',
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
if (res.fields.ERROR === 'docker_group_not_active') {
|
||||||
|
fail(
|
||||||
|
'Docker was just installed but your shell is not yet in the `docker` group.',
|
||||||
|
'Log out and back in (or run `newgrp docker` in a new shell), then retry `pnpm run setup:auto`.',
|
||||||
|
);
|
||||||
|
}
|
||||||
fail(
|
fail(
|
||||||
'container build/test failed',
|
'container build/test failed',
|
||||||
'For stale build cache: `docker builder prune -f`, then retry `pnpm run setup:auto`.',
|
'For stale build cache: `docker builder prune -f`, then retry `pnpm run setup:auto`.',
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
maybeReexecUnderSg();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!skip.has('onecli')) {
|
if (!skip.has('onecli')) {
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
* Step: container — Build container image and verify with test run.
|
* Step: container — Build container image and verify with test run.
|
||||||
* Replaces 03-setup-container.sh
|
* Replaces 03-setup-container.sh
|
||||||
*/
|
*/
|
||||||
import { execSync } from 'child_process';
|
import { execSync, spawnSync } from 'child_process';
|
||||||
import path from 'path';
|
import path from 'path';
|
||||||
import { setTimeout as sleep } from 'timers/promises';
|
import { setTimeout as sleep } from 'timers/promises';
|
||||||
|
|
||||||
@@ -10,20 +10,28 @@ import { log } from '../src/log.js';
|
|||||||
import { commandExists, getPlatform } from './platform.js';
|
import { commandExists, getPlatform } from './platform.js';
|
||||||
import { emitStatus } from './status.js';
|
import { emitStatus } from './status.js';
|
||||||
|
|
||||||
|
/**
 * Result of probing Docker with `docker info`:
 * - 'ok'            — the command exited 0.
 * - 'no-permission' — output mentions "permission denied" (daemon is up but
 *                     the socket is not accessible to this process).
 * - 'no-daemon'     — output indicates the daemon cannot be reached.
 * - 'other'         — any other failure.
 */
type DockerStatus = 'ok' | 'no-permission' | 'no-daemon' | 'other';

/**
 * Classify a finished `docker info` run. Pure function, kept separate from
 * the process spawn so the matching rules can be reasoned about on their own.
 *
 * @param exitCode exit status of `docker info` (null when no exit code exists)
 * @param output   combined stderr/stdout text of the run
 */
function classifyDockerInfo(exitCode: number | null, output: string): DockerStatus {
  if (exitCode === 0) return 'ok';
  // The permission check comes first, so it wins if both kinds of text appear.
  if (/permission denied/i.test(output)) return 'no-permission';
  if (/cannot connect|is the docker daemon running|no such file/i.test(output)) return 'no-daemon';
  return 'other';
}

/**
 * Run `docker info` and report why it succeeded or failed.
 *
 * @returns the DockerStatus derived from the exit code and the command's
 *          stderr/stdout; a run that produced no output and no zero exit
 *          code is reported as 'other'.
 */
function dockerStatus(): DockerStatus {
  const res = spawnSync('docker', ['info'], { encoding: 'utf-8' });
  // stderr/stdout are null when the command could not be spawned.
  return classifyDockerInfo(res.status, `${res.stderr ?? ''}\n${res.stdout ?? ''}`);
}
|
||||||
|
|
||||||
function dockerRunning(): boolean {
|
function dockerRunning(): boolean {
|
||||||
try {
|
return dockerStatus() === 'ok';
|
||||||
execSync('docker info', { stdio: 'ignore' });
|
|
||||||
return true;
|
|
||||||
} catch {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Try to start Docker if it's installed but idle. Poll for up to 60s.
|
* Try to start Docker if it's installed but idle. Poll up to 60s for the
|
||||||
* Returns true once `docker info` succeeds, false if we gave up.
|
* daemon to come up — but bail immediately if the socket is reachable and
|
||||||
|
* only blocked by a group-permission error, since that won't resolve by
|
||||||
|
* waiting (the caller handles the sg re-exec for that case).
|
||||||
*/
|
*/
|
||||||
async function tryStartDocker(): Promise<boolean> {
|
async function tryStartDocker(): Promise<DockerStatus> {
|
||||||
const platform = getPlatform();
|
const platform = getPlatform();
|
||||||
log.info('Docker not running — attempting to start', { platform });
|
log.info('Docker not running — attempting to start', { platform });
|
||||||
|
|
||||||
@@ -34,22 +42,27 @@ async function tryStartDocker(): Promise<boolean> {
|
|||||||
// Inherit stdio so sudo can prompt for a password if needed.
|
// Inherit stdio so sudo can prompt for a password if needed.
|
||||||
execSync('sudo systemctl start docker', { stdio: 'inherit' });
|
execSync('sudo systemctl start docker', { stdio: 'inherit' });
|
||||||
} else {
|
} else {
|
||||||
return false;
|
return 'other';
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
log.warn('Start command failed', { err });
|
log.warn('Start command failed', { err });
|
||||||
return false;
|
return 'other';
|
||||||
}
|
}
|
||||||
|
|
||||||
for (let i = 0; i < 30; i++) {
|
for (let i = 0; i < 30; i++) {
|
||||||
await sleep(2000);
|
await sleep(2000);
|
||||||
if (dockerRunning()) {
|
const s = dockerStatus();
|
||||||
|
if (s === 'ok') {
|
||||||
log.info('Docker is up');
|
log.info('Docker is up');
|
||||||
return true;
|
return 'ok';
|
||||||
|
}
|
||||||
|
if (s === 'no-permission') {
|
||||||
|
log.info('Docker daemon is up but socket is not accessible (group membership)');
|
||||||
|
return 'no-permission';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
log.warn('Docker did not become ready within 60s');
|
log.warn('Docker did not become ready within 60s');
|
||||||
return false;
|
return 'no-daemon';
|
||||||
}
|
}
|
||||||
|
|
||||||
function parseArgs(args: string[]): { runtime: string } {
|
function parseArgs(args: string[]): { runtime: string } {
|
||||||
@@ -84,6 +97,15 @@ export async function run(args: string[]): Promise<void> {
|
|||||||
process.exit(4);
|
process.exit(4);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!commandExists('docker')) {
|
||||||
|
log.info('Docker not found — running setup/install-docker.sh');
|
||||||
|
try {
|
||||||
|
execSync('bash setup/install-docker.sh', { cwd: projectRoot, stdio: 'inherit' });
|
||||||
|
} catch (err) {
|
||||||
|
log.warn('install-docker.sh failed', { err });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!commandExists('docker')) {
|
if (!commandExists('docker')) {
|
||||||
emitStatus('SETUP_CONTAINER', {
|
emitStatus('SETUP_CONTAINER', {
|
||||||
RUNTIME: runtime,
|
RUNTIME: runtime,
|
||||||
@@ -97,16 +119,37 @@ export async function run(args: string[]): Promise<void> {
|
|||||||
process.exit(2);
|
process.exit(2);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!dockerRunning()) {
|
{
|
||||||
const started = await tryStartDocker();
|
let status = dockerStatus();
|
||||||
if (!started) {
|
if (status !== 'ok') {
|
||||||
|
status = await tryStartDocker();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Socket is unreachable due to group perms — current shell's supplementary
|
||||||
|
// groups are fixed at login, so `usermod -aG docker` (via install-docker.sh
|
||||||
|
// or a prior install) doesn't affect us until next login. Re-exec this
|
||||||
|
// step under `sg docker` so the child picks up docker as its primary
|
||||||
|
// group and can talk to /var/run/docker.sock without a logout.
|
||||||
|
if (status === 'no-permission' && getPlatform() === 'linux' && commandExists('sg')) {
|
||||||
|
log.info('Re-executing container step under `sg docker`');
|
||||||
|
const res = spawnSync(
|
||||||
|
'sg',
|
||||||
|
['docker', '-c', 'pnpm exec tsx setup/index.ts --step container'],
|
||||||
|
{ cwd: projectRoot, stdio: 'inherit' },
|
||||||
|
);
|
||||||
|
process.exit(res.status ?? 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (status !== 'ok') {
|
||||||
|
const error =
|
||||||
|
status === 'no-permission' ? 'docker_group_not_active' : 'runtime_not_available';
|
||||||
emitStatus('SETUP_CONTAINER', {
|
emitStatus('SETUP_CONTAINER', {
|
||||||
RUNTIME: runtime,
|
RUNTIME: runtime,
|
||||||
IMAGE: image,
|
IMAGE: image,
|
||||||
BUILD_OK: false,
|
BUILD_OK: false,
|
||||||
TEST_OK: false,
|
TEST_OK: false,
|
||||||
STATUS: 'failed',
|
STATUS: 'failed',
|
||||||
ERROR: 'runtime_not_available',
|
ERROR: error,
|
||||||
LOG: 'logs/setup.log',
|
LOG: 'logs/setup.log',
|
||||||
});
|
});
|
||||||
process.exit(2);
|
process.exit(2);
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import path from 'path';
|
|||||||
|
|
||||||
import { log } from '../src/log.js';
|
import { log } from '../src/log.js';
|
||||||
import {
|
import {
|
||||||
|
commandExists,
|
||||||
getPlatform,
|
getPlatform,
|
||||||
getNodePath,
|
getNodePath,
|
||||||
getServiceManager,
|
getServiceManager,
|
||||||
@@ -255,12 +256,34 @@ WantedBy=${runningAsRoot ? 'multi-user.target' : 'default.target'}`;
|
|||||||
fs.writeFileSync(unitPath, unit);
|
fs.writeFileSync(unitPath, unit);
|
||||||
log.info('Wrote systemd unit', { unitPath });
|
log.info('Wrote systemd unit', { unitPath });
|
||||||
|
|
||||||
// Detect stale docker group before starting (user systemd only)
|
// Detect stale docker group before starting (user systemd only). The user
|
||||||
const dockerGroupStale = !runningAsRoot && checkDockerGroupStale();
|
// systemd manager is a long-running process whose group list is frozen at
|
||||||
|
// login, so `usermod -aG docker` mid-session doesn't reach it. Rather than
|
||||||
|
// require the user to log out + back in, punch a POSIX ACL onto the socket
|
||||||
|
// that grants the current user rw directly. This is temporary — the socket
|
||||||
|
// is recreated by dockerd on restart (and by then the user has relogged, so
|
||||||
|
// normal group perms apply again).
|
||||||
|
let dockerGroupStale = !runningAsRoot && checkDockerGroupStale();
|
||||||
if (dockerGroupStale) {
|
if (dockerGroupStale) {
|
||||||
log.warn(
|
log.warn(
|
||||||
'Docker group not active in systemd session — user was likely added to docker group mid-session',
|
'Docker group not active in systemd session — user was likely added to docker group mid-session',
|
||||||
);
|
);
|
||||||
|
if (commandExists('setfacl')) {
|
||||||
|
const user = execSync('whoami', { encoding: 'utf-8' }).trim();
|
||||||
|
try {
|
||||||
|
execSync(`sudo setfacl -m u:${user}:rw /var/run/docker.sock`, {
|
||||||
|
stdio: 'inherit',
|
||||||
|
});
|
||||||
|
log.info(
|
||||||
|
'Applied temporary ACL to /var/run/docker.sock (resets on docker restart or reboot)',
|
||||||
|
);
|
||||||
|
dockerGroupStale = false;
|
||||||
|
} catch (err) {
|
||||||
|
log.warn('Failed to apply setfacl workaround', { err });
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
log.warn('setfacl not installed — cannot apply automatic workaround');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Kill orphaned nanoclaw processes to avoid channel connection conflicts
|
// Kill orphaned nanoclaw processes to avoid channel connection conflicts
|
||||||
|
|||||||
Reference in New Issue
Block a user