fix: auto-recover from stale Claude Code session on exit code 1

When Claude Code exits with code 1 during a session resume because the session transcript file no longer exists (ENOENT on .jsonl), clear the stale session from SQLite and retry once with a fresh session. Detection is targeted: it triggers only on an ENOENT error referencing a .jsonl file or on an explicit "session not found" error. Transient failures (network, API) fall through to the normal backoff retry path. Also removes unrelated ollama files that were mixed in during a rebase.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-30 23:03:44 +11:00
parent 3098f28b74
commit 38009be263
5 changed files with 14 additions and 338 deletions
--- a/container/agent-runner/src/index.ts
+++ b/container/agent-runner/src/index.ts
@@ -409,8 +409,7 @@ async function runQuery(
        'TeamCreate', 'TeamDelete', 'SendMessage',
        'TodoWrite', 'ToolSearch', 'Skill',
        'NotebookEdit',
-        'mcp__nanoclaw__*',
-        'mcp__ollama__*'
+        'mcp__nanoclaw__*'
      ],
      env: sdkEnv,
      permissionMode: 'bypassPermissions',
@@ -426,10 +425,6 @@ async function runQuery(
            NANOCLAW_IS_MAIN: containerInput.isMain ? '1' : '0',
          },
        },
-        ollama: {
-          command: 'node',
-          args: [path.join(path.dirname(mcpServerPath), 'ollama-mcp-stdio.js')],
-        },
      },
      hooks: {
        PreCompact: [{ hooks: [createPreCompactHook(containerInput.assistantName)] }],
--- a/container/agent-runner/src/ollama-mcp-stdio.ts
+++ b/container/agent-runner/src/ollama-mcp-stdio.ts
@@ -1,281 +0,0 @@
-/**
- * Ollama MCP Server for NanoClaw
- * Exposes local Ollama models as tools for the container agent.
- * Uses host.docker.internal to reach the host's Ollama instance from Docker.
- */
-
-import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
-import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
-import { z } from 'zod';
-
-import fs from 'fs';
-import path from 'path';
-
-const OLLAMA_HOST = process.env.OLLAMA_HOST || 'http://host.docker.internal:11434';
-const OLLAMA_STATUS_FILE = '/workspace/ipc/ollama_status.json';
-
-function log(msg: string): void {
-  console.error(`[OLLAMA] ${msg}`);
-}
-
-function writeStatus(status: string, detail?: string): void {
-  try {
-    const data = { status, detail, timestamp: new Date().toISOString() };
-    const tmpPath = `${OLLAMA_STATUS_FILE}.tmp`;
-    fs.mkdirSync(path.dirname(OLLAMA_STATUS_FILE), { recursive: true });
-    fs.writeFileSync(tmpPath, JSON.stringify(data));
-    fs.renameSync(tmpPath, OLLAMA_STATUS_FILE);
-  } catch { /* best-effort */ }
-}
-
-async function ollamaFetch(path: string, options?: RequestInit): Promise<Response> {
-  const url = `${OLLAMA_HOST}${path}`;
-  try {
-    return await fetch(url, options);
-  } catch (err) {
-    // Fallback to localhost if host.docker.internal fails
-    if (OLLAMA_HOST.includes('host.docker.internal')) {
-      const fallbackUrl = url.replace('host.docker.internal', 'localhost');
-      return await fetch(fallbackUrl, options);
-    }
-    throw err;
-  }
-}
-
-const server = new McpServer({
-  name: 'ollama',
-  version: '1.0.0',
-});
-
-server.tool(
-  'ollama_list_models',
-  'List all locally installed Ollama models. Use this to see which models are available before calling ollama_generate.',
-  {},
-  async () => {
-    log('Listing models...');
-    writeStatus('listing', 'Listing available models');
-    try {
-      const res = await ollamaFetch('/api/tags');
-      if (!res.ok) {
-        return {
-          content: [{ type: 'text' as const, text: `Ollama API error: ${res.status} ${res.statusText}` }],
-          isError: true,
-        };
-      }
-
-      const data = await res.json() as { models?: Array<{ name: string; size: number; modified_at: string }> };
-      const models = data.models || [];
-
-      if (models.length === 0) {
-        return { content: [{ type: 'text' as const, text: 'No models installed. Run `ollama pull <model>` on the host to install one.' }] };
-      }
-
-      const list = models
-        .map(m => `- ${m.name} (${(m.size / 1e9).toFixed(1)}GB)`)
-        .join('\n');
-
-      log(`Found ${models.length} models`);
-      return { content: [{ type: 'text' as const, text: `Installed models:\n${list}` }] };
-    } catch (err) {
-      return {
-        content: [{ type: 'text' as const, text: `Failed to connect to Ollama at ${OLLAMA_HOST}: ${err instanceof Error ? err.message : String(err)}` }],
-        isError: true,
-      };
-    }
-  },
-);
-
-server.tool(
-  'ollama_generate',
-  'Send a prompt to a local Ollama model and get a response. Good for cheaper/faster tasks like summarization, translation, or general queries. Use ollama_list_models first to see available models.',
-  {
-    model: z.string().describe('The model name (e.g., "llama3.2", "mistral", "gemma2")'),
-    prompt: z.string().describe('The prompt to send to the model'),
-    system: z.string().optional().describe('Optional system prompt to set model behavior'),
-  },
-  async (args) => {
-    log(`>>> Generating with ${args.model} (${args.prompt.length} chars)...`);
-    writeStatus('generating', `Generating with ${args.model}`);
-    try {
-      const body: Record<string, unknown> = {
-        model: args.model,
-        prompt: args.prompt,
-        stream: false,
-      };
-      if (args.system) {
-        body.system = args.system;
-      }
-
-      const res = await ollamaFetch('/api/generate', {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify(body),
-      });
-
-      if (!res.ok) {
-        const errorText = await res.text();
-        return {
-          content: [{ type: 'text' as const, text: `Ollama error (${res.status}): ${errorText}` }],
-          isError: true,
-        };
-      }
-
-      const data = await res.json() as { response: string; total_duration?: number; eval_count?: number };
-
-      let meta = '';
-      if (data.total_duration) {
-        const secs = (data.total_duration / 1e9).toFixed(1);
-        meta = `\n\n[${args.model} | ${secs}s${data.eval_count ? ` | ${data.eval_count} tokens` : ''}]`;
-        log(`<<< Done: ${args.model} | ${secs}s | ${data.eval_count || '?'} tokens | ${data.response.length} chars`);
-        writeStatus('done', `${args.model} | ${secs}s | ${data.eval_count || '?'} tokens`);
-      } else {
-        log(`<<< Done: ${args.model} | ${data.response.length} chars`);
-        writeStatus('done', `${args.model} | ${data.response.length} chars`);
-      }
-
-      return { content: [{ type: 'text' as const, text: data.response + meta }] };
-    } catch (err) {
-      return {
-        content: [{ type: 'text' as const, text: `Failed to call Ollama: ${err instanceof Error ? err.message : String(err)}` }],
-        isError: true,
-      };
-    }
-  },
-);
-
-server.tool(
-  'ollama_pull_model',
-  'Pull (download) a model from the Ollama registry by name. Returns the final status once the pull is complete. Use model names like "llama3.2", "mistral", "gemma2:9b".',
-  {
-    model: z.string().describe('Model name to pull, e.g. "llama3.2", "mistral", "gemma2:9b"'),
-  },
-  async (args) => {
-    log(`Pulling model: ${args.model}...`);
-    writeStatus('pulling', `Pulling ${args.model}`);
-    try {
-      const res = await ollamaFetch('/api/pull', {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({ model: args.model, stream: false }),
-      });
-      if (!res.ok) {
-        const errorText = await res.text();
-        return {
-          content: [{ type: 'text' as const, text: `Ollama error (${res.status}): ${errorText}` }],
-          isError: true,
-        };
-      }
-      const data = await res.json() as { status: string };
-      log(`Pull complete: ${args.model} — ${data.status}`);
-      writeStatus('done', `Pulled ${args.model}`);
-      return { content: [{ type: 'text' as const, text: `Pull complete: ${args.model} — ${data.status}` }] };
-    } catch (err) {
-      return {
-        content: [{ type: 'text' as const, text: `Failed to pull model: ${err instanceof Error ? err.message : String(err)}` }],
-        isError: true,
-      };
-    }
-  },
-);
-
-server.tool(
-  'ollama_delete_model',
-  'Delete a locally installed Ollama model to free up disk space.',
-  {
-    model: z.string().describe('Model name to delete, e.g. "llama3.2", "mistral:latest"'),
-  },
-  async (args) => {
-    log(`Deleting model: ${args.model}...`);
-    writeStatus('deleting', `Deleting ${args.model}`);
-    try {
-      const res = await ollamaFetch('/api/delete', {
-        method: 'DELETE',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({ model: args.model }),
-      });
-      if (!res.ok) {
-        const errorText = await res.text();
-        return {
-          content: [{ type: 'text' as const, text: `Ollama error (${res.status}): ${errorText}` }],
-          isError: true,
-        };
-      }
-      log(`Deleted: ${args.model}`);
-      writeStatus('done', `Deleted ${args.model}`);
-      return { content: [{ type: 'text' as const, text: `Deleted model: ${args.model}` }] };
-    } catch (err) {
-      return {
-        content: [{ type: 'text' as const, text: `Failed to delete model: ${err instanceof Error ? err.message : String(err)}` }],
-        isError: true,
-      };
-    }
-  },
-);
-
-server.tool(
-  'ollama_show_model',
-  'Show details for a locally installed Ollama model: modelfile, parameters, template, system prompt, and architecture info (context length, parameter count, etc.).',
-  {
-    model: z.string().describe('Model name to inspect, e.g. "llama3.2", "mistral:latest"'),
-  },
-  async (args) => {
-    log(`Showing model info: ${args.model}...`);
-    try {
-      const res = await ollamaFetch('/api/show', {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({ model: args.model }),
-      });
-      if (!res.ok) {
-        const errorText = await res.text();
-        return {
-          content: [{ type: 'text' as const, text: `Ollama error (${res.status}): ${errorText}` }],
-          isError: true,
-        };
-      }
-      const data = await res.json();
-      return { content: [{ type: 'text' as const, text: JSON.stringify(data, null, 2) }] };
-    } catch (err) {
-      return {
-        content: [{ type: 'text' as const, text: `Failed to show model info: ${err instanceof Error ? err.message : String(err)}` }],
-        isError: true,
-      };
-    }
-  },
-);
-
-server.tool(
-  'ollama_list_running',
-  'List Ollama models currently loaded in memory with their memory usage, processor type (CPU/GPU), and time until they are unloaded.',
-  {},
-  async () => {
-    log('Listing running models...');
-    try {
-      const res = await ollamaFetch('/api/ps');
-      if (!res.ok) {
-        return {
-          content: [{ type: 'text' as const, text: `Ollama API error: ${res.status} ${res.statusText}` }],
-          isError: true,
-        };
-      }
-      const data = await res.json() as { models?: Array<{ name: string; size_vram: number; processor: string; expires_at: string }> };
-      const models = data.models || [];
-      if (models.length === 0) {
-        return { content: [{ type: 'text' as const, text: 'No models currently loaded in memory.' }] };
-      }
-      const list = models
-        .map(m => `- ${m.name} (${(m.size_vram / 1e9).toFixed(1)}GB ${m.processor}, unloads at ${m.expires_at})`)
-        .join('\n');
-      log(`${models.length} model(s) running`);
-      return { content: [{ type: 'text' as const, text: `Models loaded in memory:\n${list}` }] };
-    } catch (err) {
-      return {
-        content: [{ type: 'text' as const, text: `Failed to list running models: ${err instanceof Error ? err.message : String(err)}` }],
-        isError: true,
-      };
-    }
-  },
-);
-
-const transport = new StdioServerTransport();
-await server.connect(transport);
--- a/scripts/ollama-watch.sh
+++ b/scripts/ollama-watch.sh
@@ -1,41 +0,0 @@
-#!/bin/bash
-# Watch NanoClaw IPC for Ollama activity and show macOS notifications
-# Usage: ./scripts/ollama-watch.sh
-
-cd "$(dirname "$0")/.." || exit 1
-
-echo "Watching for Ollama activity..."
-echo "Press Ctrl+C to stop"
-echo ""
-
-LAST_TIMESTAMP=""
-
-while true; do
-  # Check all group IPC dirs for ollama_status.json
-  for status_file in data/ipc/*/ollama_status.json; do
-    [ -f "$status_file" ] || continue
-
-    TIMESTAMP=$(python3 -c "import json; print(json.load(open('$status_file'))['timestamp'])" 2>/dev/null)
-    [ -z "$TIMESTAMP" ] && continue
-    [ "$TIMESTAMP" = "$LAST_TIMESTAMP" ] && continue
-
-    LAST_TIMESTAMP="$TIMESTAMP"
-    STATUS=$(python3 -c "import json; d=json.load(open('$status_file')); print(d['status'])" 2>/dev/null)
-    DETAIL=$(python3 -c "import json; d=json.load(open('$status_file')); print(d.get('detail',''))" 2>/dev/null)
-
-    case "$STATUS" in
-      generating)
-        osascript -e "display notification \"$DETAIL\" with title \"NanoClaw → Ollama\" sound name \"Submarine\"" 2>/dev/null
-        echo "$(date +%H:%M:%S) 🔄 $DETAIL"
-        ;;
-      done)
-        osascript -e "display notification \"$DETAIL\" with title \"NanoClaw ← Ollama ✓\" sound name \"Glass\"" 2>/dev/null
-        echo "$(date +%H:%M:%S) ✅ $DETAIL"
-        ;;
-      listing)
-        echo "$(date +%H:%M:%S) 📋 Listing models..."
-        ;;
-    esac
-  done
-  sleep 0.5
-done
--- a/src/container-runner.ts
+++ b/src/container-runner.ts
@@ -400,12 +400,7 @@ export async function runContainerAgent(
      const chunk = data.toString();
      const lines = chunk.trim().split('\n');
      for (const line of lines) {
-        if (!line) continue;
-        if (line.includes('[OLLAMA]')) {
-          logger.info({ container: group.folder }, line);
-        } else {
-          logger.debug({ container: group.folder }, line);
-        }
+        if (line) logger.debug({ container: group.folder }, line);
      }
      // Don't reset timeout on stderr — SDK writes debug logs continuously.
      // Timeout only resets on actual output (OUTPUT_MARKER in stdout).
--- a/src/index.ts
+++ b/src/index.ts
@@ -403,12 +403,20 @@ async function runAgent(
    }

    if (output.status === 'error') {
-      // Detect stale/corrupt session: container failed while resuming an existing session.
-      // Clear the session and retry once with a fresh session to avoid infinite retry loops.
-      if (sessionId) {
+      // Detect stale/corrupt session: the SDK throws ENOENT when the session
+      // transcript file (.jsonl) doesn't exist inside the container. This
+      // happens after container restarts since the filesystem is ephemeral.
+      // Only clear + retry for this specific signal — transient errors
+      // (network, API) should fall through to the normal backoff path.
+      const isStaleSession =
+        sessionId &&
+        output.error &&
+        /ENOENT.*\.jsonl|session.*not found/i.test(output.error);
+
+      if (isStaleSession) {
        logger.warn(
          { group: group.name, staleSessionId: sessionId, error: output.error },
-          'Container failed with existing session — clearing stale session and retrying with fresh session',
+          'Stale session detected (ENOENT on session transcript) — clearing and retrying with fresh session',
        );
        delete sessions[group.folder];
        deleteSession(group.folder);