fix: proper container lifecycle management to prevent stopped container accumulation
- Name containers (nanoclaw-{group}-{timestamp}) for trackability
- Replace SIGKILL timeout with graceful `container stop` so --rm fires
- Add startup sweep to clean up stopped nanoclaw containers from previous runs
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
* Container Runner for NanoClaw
|
||||
* Spawns agent execution in Apple Container and handles IPC
|
||||
*/
|
||||
import { spawn } from 'child_process';
|
||||
import { exec, spawn } from 'child_process';
|
||||
import fs from 'fs';
|
||||
import os from 'os';
|
||||
import path from 'path';
|
||||
@@ -162,8 +162,8 @@ function buildVolumeMounts(
|
||||
return mounts;
|
||||
}
|
||||
|
||||
function buildContainerArgs(mounts: VolumeMount[]): string[] {
|
||||
const args: string[] = ['run', '-i', '--rm'];
|
||||
function buildContainerArgs(mounts: VolumeMount[], containerName: string): string[] {
|
||||
const args: string[] = ['run', '-i', '--rm', '--name', containerName];
|
||||
|
||||
// Apple Container: --mount for readonly, -v for read-write
|
||||
for (const mount of mounts) {
|
||||
@@ -192,11 +192,14 @@ export async function runContainerAgent(
|
||||
fs.mkdirSync(groupDir, { recursive: true });
|
||||
|
||||
const mounts = buildVolumeMounts(group, input.isMain);
|
||||
const containerArgs = buildContainerArgs(mounts);
|
||||
const safeName = group.folder.replace(/[^a-zA-Z0-9-]/g, '-');
|
||||
const containerName = `nanoclaw-${safeName}-${Date.now()}`;
|
||||
const containerArgs = buildContainerArgs(mounts, containerName);
|
||||
|
||||
logger.debug(
|
||||
{
|
||||
group: group.name,
|
||||
containerName,
|
||||
mounts: mounts.map(
|
||||
(m) =>
|
||||
`${m.hostPath} -> ${m.containerPath}${m.readonly ? ' (ro)' : ''}`,
|
||||
@@ -209,6 +212,7 @@ export async function runContainerAgent(
|
||||
logger.info(
|
||||
{
|
||||
group: group.name,
|
||||
containerName,
|
||||
mountCount: mounts.length,
|
||||
isMain: input.isMain,
|
||||
},
|
||||
@@ -267,13 +271,17 @@ export async function runContainerAgent(
|
||||
}
|
||||
});
|
||||
|
||||
let timedOut = false;
|
||||
|
||||
const timeout = setTimeout(() => {
|
||||
logger.error({ group: group.name }, 'Container timeout, killing');
|
||||
timedOut = true;
|
||||
logger.error({ group: group.name, containerName }, 'Container timeout, stopping gracefully');
|
||||
// Graceful stop: sends SIGTERM, waits, then SIGKILL — lets --rm fire
|
||||
exec(`container stop ${containerName}`, { timeout: 15000 }, (err) => {
|
||||
if (err) {
|
||||
logger.warn({ group: group.name, containerName, err }, 'Graceful stop failed, force killing');
|
||||
container.kill('SIGKILL');
|
||||
resolve({
|
||||
status: 'error',
|
||||
result: null,
|
||||
error: `Container timed out after ${CONTAINER_TIMEOUT}ms`,
|
||||
}
|
||||
});
|
||||
}, group.containerConfig?.timeout || CONTAINER_TIMEOUT);
|
||||
|
||||
@@ -281,6 +289,31 @@ export async function runContainerAgent(
|
||||
clearTimeout(timeout);
|
||||
const duration = Date.now() - startTime;
|
||||
|
||||
if (timedOut) {
|
||||
const ts = new Date().toISOString().replace(/[:.]/g, '-');
|
||||
const timeoutLog = path.join(logsDir, `container-${ts}.log`);
|
||||
fs.writeFileSync(timeoutLog, [
|
||||
`=== Container Run Log (TIMEOUT) ===`,
|
||||
`Timestamp: ${new Date().toISOString()}`,
|
||||
`Group: ${group.name}`,
|
||||
`Container: ${containerName}`,
|
||||
`Duration: ${duration}ms`,
|
||||
`Exit Code: ${code}`,
|
||||
].join('\n'));
|
||||
|
||||
logger.error(
|
||||
{ group: group.name, containerName, duration, code },
|
||||
'Container timed out',
|
||||
);
|
||||
|
||||
resolve({
|
||||
status: 'error',
|
||||
result: null,
|
||||
error: `Container timed out after ${group.containerConfig?.timeout || CONTAINER_TIMEOUT}ms`,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
||||
const logFile = path.join(logsDir, `container-${timestamp}.log`);
|
||||
const isVerbose =
|
||||
@@ -414,7 +447,7 @@ export async function runContainerAgent(
|
||||
|
||||
container.on('error', (err) => {
|
||||
clearTimeout(timeout);
|
||||
logger.error({ group: group.name, error: err }, 'Container spawn error');
|
||||
logger.error({ group: group.name, containerName, error: err }, 'Container spawn error');
|
||||
resolve({
|
||||
status: 'error',
|
||||
result: null,
|
||||
|
||||
18
src/index.ts
18
src/index.ts
@@ -832,6 +832,24 @@ function ensureContainerSystemRunning(): void {
|
||||
throw new Error('Apple Container system is required but failed to start');
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up stopped NanoClaw containers from previous runs
|
||||
try {
|
||||
const output = execSync('container ls -a --format {{.Names}}', {
|
||||
stdio: ['pipe', 'pipe', 'pipe'],
|
||||
encoding: 'utf-8',
|
||||
});
|
||||
const stale = output
|
||||
.split('\n')
|
||||
.map((n) => n.trim())
|
||||
.filter((n) => n.startsWith('nanoclaw-'));
|
||||
if (stale.length > 0) {
|
||||
execSync(`container rm ${stale.join(' ')}`, { stdio: 'pipe' });
|
||||
logger.info({ count: stale.length }, 'Cleaned up stopped containers');
|
||||
}
|
||||
} catch {
|
||||
// No stopped containers or ls/rm not supported
|
||||
}
|
||||
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
|
||||
Reference in New Issue
Block a user