fix: manage container lifecycle properly to prevent stopped containers from accumulating

- Name containers (nanoclaw-{group}-{timestamp}) for trackability
- Replace the SIGKILL-on-timeout with a graceful `container stop` so `--rm` fires
- Add startup sweep to clean up stopped nanoclaw containers from previous runs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
gavrielc
2026-02-06 07:10:26 +02:00
parent 3a4d340f80
commit db216a459e
2 changed files with 62 additions and 11 deletions

View File

@@ -2,7 +2,7 @@
* Container Runner for NanoClaw * Container Runner for NanoClaw
* Spawns agent execution in Apple Container and handles IPC * Spawns agent execution in Apple Container and handles IPC
*/ */
import { spawn } from 'child_process'; import { exec, spawn } from 'child_process';
import fs from 'fs'; import fs from 'fs';
import os from 'os'; import os from 'os';
import path from 'path'; import path from 'path';
@@ -162,8 +162,8 @@ function buildVolumeMounts(
return mounts; return mounts;
} }
function buildContainerArgs(mounts: VolumeMount[]): string[] { function buildContainerArgs(mounts: VolumeMount[], containerName: string): string[] {
const args: string[] = ['run', '-i', '--rm']; const args: string[] = ['run', '-i', '--rm', '--name', containerName];
// Apple Container: --mount for readonly, -v for read-write // Apple Container: --mount for readonly, -v for read-write
for (const mount of mounts) { for (const mount of mounts) {
@@ -192,11 +192,14 @@ export async function runContainerAgent(
fs.mkdirSync(groupDir, { recursive: true }); fs.mkdirSync(groupDir, { recursive: true });
const mounts = buildVolumeMounts(group, input.isMain); const mounts = buildVolumeMounts(group, input.isMain);
const containerArgs = buildContainerArgs(mounts); const safeName = group.folder.replace(/[^a-zA-Z0-9-]/g, '-');
const containerName = `nanoclaw-${safeName}-${Date.now()}`;
const containerArgs = buildContainerArgs(mounts, containerName);
logger.debug( logger.debug(
{ {
group: group.name, group: group.name,
containerName,
mounts: mounts.map( mounts: mounts.map(
(m) => (m) =>
`${m.hostPath} -> ${m.containerPath}${m.readonly ? ' (ro)' : ''}`, `${m.hostPath} -> ${m.containerPath}${m.readonly ? ' (ro)' : ''}`,
@@ -209,6 +212,7 @@ export async function runContainerAgent(
logger.info( logger.info(
{ {
group: group.name, group: group.name,
containerName,
mountCount: mounts.length, mountCount: mounts.length,
isMain: input.isMain, isMain: input.isMain,
}, },
@@ -267,13 +271,17 @@ export async function runContainerAgent(
} }
}); });
let timedOut = false;
const timeout = setTimeout(() => { const timeout = setTimeout(() => {
logger.error({ group: group.name }, 'Container timeout, killing'); timedOut = true;
container.kill('SIGKILL'); logger.error({ group: group.name, containerName }, 'Container timeout, stopping gracefully');
resolve({ // Graceful stop: sends SIGTERM, waits, then SIGKILL — lets --rm fire
status: 'error', exec(`container stop ${containerName}`, { timeout: 15000 }, (err) => {
result: null, if (err) {
error: `Container timed out after ${CONTAINER_TIMEOUT}ms`, logger.warn({ group: group.name, containerName, err }, 'Graceful stop failed, force killing');
container.kill('SIGKILL');
}
}); });
}, group.containerConfig?.timeout || CONTAINER_TIMEOUT); }, group.containerConfig?.timeout || CONTAINER_TIMEOUT);
@@ -281,6 +289,31 @@ export async function runContainerAgent(
clearTimeout(timeout); clearTimeout(timeout);
const duration = Date.now() - startTime; const duration = Date.now() - startTime;
if (timedOut) {
const ts = new Date().toISOString().replace(/[:.]/g, '-');
const timeoutLog = path.join(logsDir, `container-${ts}.log`);
fs.writeFileSync(timeoutLog, [
`=== Container Run Log (TIMEOUT) ===`,
`Timestamp: ${new Date().toISOString()}`,
`Group: ${group.name}`,
`Container: ${containerName}`,
`Duration: ${duration}ms`,
`Exit Code: ${code}`,
].join('\n'));
logger.error(
{ group: group.name, containerName, duration, code },
'Container timed out',
);
resolve({
status: 'error',
result: null,
error: `Container timed out after ${group.containerConfig?.timeout || CONTAINER_TIMEOUT}ms`,
});
return;
}
const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
const logFile = path.join(logsDir, `container-${timestamp}.log`); const logFile = path.join(logsDir, `container-${timestamp}.log`);
const isVerbose = const isVerbose =
@@ -414,7 +447,7 @@ export async function runContainerAgent(
container.on('error', (err) => { container.on('error', (err) => {
clearTimeout(timeout); clearTimeout(timeout);
logger.error({ group: group.name, error: err }, 'Container spawn error'); logger.error({ group: group.name, containerName, error: err }, 'Container spawn error');
resolve({ resolve({
status: 'error', status: 'error',
result: null, result: null,

View File

@@ -832,6 +832,24 @@ function ensureContainerSystemRunning(): void {
throw new Error('Apple Container system is required but failed to start'); throw new Error('Apple Container system is required but failed to start');
} }
} }
// Clean up stopped NanoClaw containers from previous runs
try {
const output = execSync('container ls -a --format {{.Names}}', {
stdio: ['pipe', 'pipe', 'pipe'],
encoding: 'utf-8',
});
const stale = output
.split('\n')
.map((n) => n.trim())
.filter((n) => n.startsWith('nanoclaw-'));
if (stale.length > 0) {
execSync(`container rm ${stale.join(' ')}`, { stdio: 'pipe' });
logger.info({ count: stale.length }, 'Cleaned up stopped containers');
}
} catch {
// No stopped containers or ls/rm not supported
}
} }
async function main(): Promise<void> { async function main(): Promise<void> {