feat: per-group queue, SQLite state, graceful shutdown (#111)

* fix: wire up queue processMessagesFn before recovery to prevent silent message loss

recoverPendingMessages() was called after startMessageLoop(), which meant:
1. Recovery could race with the message loop's first iteration.
2. processMessagesFn was only set inside startMessageLoop, so messages
   enqueued by recovery would trigger runForGroup while processMessagesFn
   was still null, and message processing was silently skipped.

Move setProcessMessagesFn and recoverPendingMessages before startMessageLoop
so the queue is fully wired before any messages are enqueued.

https://claude.ai/code/session_01PCY8zNjDa2N29jvBAV5vfL

* feat: structured agent output to fix infinite retry on silent responses (#113)

Use Agent SDK's outputFormat with json_schema to get typed responses
from the agent. The agent now returns { status: 'responded' | 'silent',
userMessage?, internalLog? } instead of a plain string. This fixes a
critical bug where a null/empty agent response caused infinite 5-second
retry loops by conflating "nothing to say" with "error".

- Agent runner: add AGENT_RESPONSE_SCHEMA and parse structured_output
- Host: advance lastAgentTimestamp on both responded AND silent status
- GroupQueue: add exponential backoff (5s-80s) with max 5 retries for
  actual errors, replacing unbounded fixed-interval retries

https://claude.ai/code/session_014SLc8MxP9BYhEhDCLox9U8

Co-authored-by: Claude <noreply@anthropic.com>

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
gavrielc
2026-02-06 18:54:26 +02:00
committed by GitHub
parent 03df69e9b5
commit ae177156ec
5 changed files with 115 additions and 36 deletions

View File

@@ -40,9 +40,15 @@ export interface ContainerInput {
isMain: boolean;
}
/**
 * Structured reply produced by the agent, replacing the earlier plain-string
 * result. It lets the host tell "nothing to say" apart from a failure.
 */
export interface AgentResponse {
// 'responded' when the agent has a reply for the chat, 'silent' when it chose not to reply.
status: 'responded' | 'silent';
// Text for the chat; the host sends it only when status is 'responded' and this is non-empty.
userMessage?: string;
// Optional note that the host writes to its own log rather than to the chat.
internalLog?: string;
}
export interface ContainerOutput {
status: 'success' | 'error';
result: string | null;
result: AgentResponse | null;
newSessionId?: string;
error?: string;
}

View File

@@ -9,12 +9,16 @@ interface QueuedTask {
fn: () => Promise<void>;
}
// Number of retries scheduleRetry will schedule for a group before giving up.
const MAX_RETRIES = 5;
// Delay before the first retry, in milliseconds; scheduleRetry doubles it on each further attempt.
const BASE_RETRY_MS = 5000;
/** Per-group bookkeeping that GroupQueue stores in its `groups` map, keyed by group JID. */
interface GroupState {
// Cleared to false when a processing run finishes; presumably true while one is in progress — confirm.
active: boolean;
// NOTE(review): looks like a flag for messages that arrived while the group was busy — confirm.
pendingMessages: boolean;
// Queued task callbacks waiting for this group.
pendingTasks: QueuedTask[];
// Child process associated with the group; reset to null when a run finishes.
process: ChildProcess | null;
// Name of the container associated with the group, if any.
containerName: string | null;
// Failed attempts in the current backoff sequence; reset to 0 on success or after giving up.
retryCount: number;
}
export class GroupQueue {
@@ -34,6 +38,7 @@ export class GroupQueue {
pendingTasks: [],
process: null,
containerName: null,
retryCount: 0,
};
this.groups.set(groupJid, state);
}
@@ -126,22 +131,15 @@ export class GroupQueue {
try {
if (this.processMessagesFn) {
const success = await this.processMessagesFn(groupJid);
if (!success) {
logger.info({ groupJid }, 'Processing failed, scheduling retry');
setTimeout(() => {
if (!this.shuttingDown) {
this.enqueueMessageCheck(groupJid);
}
}, 5000);
if (success) {
state.retryCount = 0;
} else {
this.scheduleRetry(groupJid, state);
}
}
} catch (err) {
logger.error({ groupJid, err }, 'Error processing messages for group');
setTimeout(() => {
if (!this.shuttingDown) {
this.enqueueMessageCheck(groupJid);
}
}, 5000);
this.scheduleRetry(groupJid, state);
} finally {
state.active = false;
state.process = null;
@@ -174,6 +172,29 @@ export class GroupQueue {
}
}
/**
 * Schedules another message check for a group after a failed processing
 * attempt, using exponential backoff.
 *
 * Each call increments `state.retryCount`. Once the count exceeds
 * MAX_RETRIES, an error is logged, the counter is reset and no timer is
 * scheduled. Otherwise the delay is BASE_RETRY_MS * 2^(retryCount - 1),
 * i.e. 5s, 10s, 20s, 40s, 80s with the current constants.
 *
 * @param groupJid - Group whose message check should be re-enqueued.
 * @param state - That group's queue state; its `retryCount` is mutated.
 */
private scheduleRetry(groupJid: string, state: GroupState): void {
state.retryCount++;
if (state.retryCount > MAX_RETRIES) {
logger.error(
{ groupJid, retryCount: state.retryCount },
'Max retries exceeded, dropping messages (will retry on next incoming message)',
);
// Reset so that a later failure starts a fresh backoff sequence.
state.retryCount = 0;
return;
}
// retryCount is at least 1 here, so the first retry waits exactly BASE_RETRY_MS.
const delayMs = BASE_RETRY_MS * Math.pow(2, state.retryCount - 1);
logger.info(
{ groupJid, retryCount: state.retryCount, delayMs },
'Scheduling retry with backoff',
);
setTimeout(() => {
// Skip the re-enqueue if shutdown began while the timer was pending.
if (!this.shuttingDown) {
this.enqueueMessageCheck(groupJid);
}
}, delayMs);
}
private drainGroup(groupJid: string): void {
if (this.shuttingDown) return;

View File

@@ -21,6 +21,7 @@ import {
TRIGGER_PATTERN,
} from './config.js';
import {
AgentResponse,
AvailableGroup,
runContainerAgent,
writeGroupsSnapshot,
@@ -236,22 +237,35 @@ async function processGroupMessages(chatJid: string): Promise<boolean> {
const response = await runAgent(group, prompt, chatJid);
await setTyping(chatJid, false);
if (response) {
// Fix batching bug: advance to latest message in batch, not just the trigger
lastAgentTimestamp[chatJid] =
missedMessages[missedMessages.length - 1].timestamp;
saveState();
await sendMessage(chatJid, `${ASSISTANT_NAME}: ${response}`);
return true;
if (response === 'error') {
// Container or agent error — signal failure so queue can retry with backoff
return false;
}
return false;
// Agent processed messages successfully (whether it responded or stayed silent)
lastAgentTimestamp[chatJid] =
missedMessages[missedMessages.length - 1].timestamp;
saveState();
if (response.status === 'responded' && response.userMessage) {
await sendMessage(chatJid, `${ASSISTANT_NAME}: ${response.userMessage}`);
}
if (response.internalLog) {
logger.info(
{ group: group.name, agentStatus: response.status },
`Agent: ${response.internalLog}`,
);
}
return true;
}
async function runAgent(
group: RegisteredGroup,
prompt: string,
chatJid: string,
): Promise<string | null> {
): Promise<AgentResponse | 'error'> {
const isMain = group.folder === MAIN_GROUP_FOLDER;
const sessionId = sessions[group.folder];
@@ -303,13 +317,13 @@ async function runAgent(
{ group: group.name, error: output.error },
'Container agent error',
);
return null;
return 'error';
}
return output.result;
return output.result ?? { status: 'silent' };
} catch (err) {
logger.error({ group: group.name, err }, 'Agent error');
return null;
return 'error';
}
}
@@ -740,8 +754,9 @@ async function connectWhatsApp(): Promise<void> {
onProcess: (groupJid, proc, containerName) => queue.registerProcess(groupJid, proc, containerName),
});
startIpcWatcher();
startMessageLoop();
queue.setProcessMessagesFn(processGroupMessages);
recoverPendingMessages();
startMessageLoop();
}
});
@@ -783,9 +798,6 @@ async function startMessageLoop(): Promise<void> {
}
messageLoopRunning = true;
// Wire up the queue's message processing function
queue.setProcessMessagesFn(processGroupMessages);
logger.info(`NanoClaw running (trigger: @${ASSISTANT_NAME})`);
while (true) {

View File

@@ -103,8 +103,8 @@ async function runTask(
if (output.status === 'error') {
error = output.error || 'Unknown error';
} else {
result = output.result;
} else if (output.result) {
result = output.result.userMessage || output.result.internalLog || null;
}
logger.info(