fix(agent-runner): remove thread_id filter and fix processing ack on empty result

The concurrent poll in processQuery filtered out messages with
mismatched thread_ids, causing a deadlock when the initial batch
(e.g. a host-generated welcome trigger with null thread_id) completed
but follow-ups arrived with a different thread_id (e.g. a Discord DM).
The query stayed open waiting for matching-thread pushes that never
came, blocking the poll loop indefinitely.

Thread routing is the router's concern — per-thread sessions already
isolate threads into separate containers; shared sessions intentionally
merge everything. Removed the filter.

Also fixed processing_ack: a result event (with or without text) means
the turn is done, but markCompleted only ran when event.text was truthy.
When the agent responded via MCP send_message (empty result text), the
initial batch stayed in 'processing' for the query's lifetime, creating
false stuck signals in the host sweep. Now marks completed on any result
event.

Belt-and-suspenders: init-first-agent welcome trigger now sets threadId
to the DM platform_id instead of null.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
exe.dev user
2026-04-22 10:42:56 +00:00
parent c8fc1da719
commit 7da24b166d
2 changed files with 26 additions and 10 deletions

View File

@@ -160,7 +160,7 @@ export async function runPollLoop(config: PollLoopConfig): Promise<void> {
const skippedSet = new Set(skipped);
const processingIds = ids.filter((id) => !commandIds.includes(id) && !skippedSet.has(id));
try {
const result = await processQuery(query, routing);
const result = await processQuery(query, routing, processingIds);
if (result.continuation && result.continuation !== continuation) {
continuation = result.continuation;
setStoredSessionId(continuation);
@@ -189,6 +189,8 @@ export async function runPollLoop(config: PollLoopConfig): Promise<void> {
});
}
// Ensure completed even if processQuery ended without a result event
// (e.g. stream closed unexpectedly).
markCompleted(processingIds);
log(`Completed ${ids.length} message(s)`);
}
@@ -232,7 +234,11 @@ interface QueryResult {
continuation?: string;
}
async function processQuery(query: AgentQuery, routing: RoutingContext): Promise<QueryResult> {
async function processQuery(
query: AgentQuery,
routing: RoutingContext,
initialBatchIds: string[],
): Promise<QueryResult> {
let queryContinuation: string | undefined;
let done = false;
@@ -246,14 +252,15 @@ async function processQuery(query: AgentQuery, routing: RoutingContext): Promise
if (done) return;
// Skip system messages (MCP tool responses) and /clear (needs fresh query).
// Also defer messages whose thread_id differs from the active turn's routing
// — mixing threads into one streaming turn would send the reply to the wrong
// thread because `routing` is captured at turn start. The next turn will pick
// them up with fresh routing.
// Thread routing is the router's concern — if a message landed in this
// session, the agent should see it. Per-thread sessions already isolate
// threads into separate containers; shared sessions intentionally merge
// everything. Filtering on thread_id here caused deadlocks when the
// initial batch and follow-ups had mismatched thread_ids (e.g. a
// host-generated welcome trigger with null thread vs a Discord DM reply).
const newMessages = getPendingMessages().filter((m) => {
if (m.kind === 'system') return false;
if ((m.kind === 'chat' || m.kind === 'chat-sdk') && isClearCommand(m)) return false;
if ((m.thread_id ?? null) !== (routing.threadId ?? null)) return false;
return true;
});
if (newMessages.length > 0) {
@@ -282,8 +289,17 @@ async function processQuery(query: AgentQuery, routing: RoutingContext): Promise
// effectively orphaned and the next message started a blank
// Claude session with no prior context.
setStoredSessionId(event.continuation);
} else if (event.type === 'result' && event.text) {
dispatchResultText(event.text, routing);
} else if (event.type === 'result') {
// A result — with or without text — means the turn is done. Mark
// the initial batch completed now so the host sweep doesn't see
// stale 'processing' claims while the query stays open for
// follow-up pushes. The agent may have responded via MCP
// (send_message) mid-turn, or the message may not need a response
// at all — either way the turn is finished.
markCompleted(initialBatchIds);
if (event.text) {
dispatchResultText(event.text, routing);
}
}
}
} finally {