diff --git a/src/agents/subagent-orphan-recovery.ts b/src/agents/subagent-orphan-recovery.ts index 320d96f3727..ed2eac6d8f3 100644 --- a/src/agents/subagent-orphan-recovery.ts +++ b/src/agents/subagent-orphan-recovery.ts @@ -100,10 +100,16 @@ async function resumeOrphanedSession(params: { }, timeoutMs: 10_000, }); - replaceSubagentRunAfterSteer({ + const remapped = replaceSubagentRunAfterSteer({ previousRunId: params.originalRunId, nextRunId: result.runId, }); + if (!remapped) { + log.warn( + `resumed orphaned session ${params.sessionKey} but remap failed (old run already removed); treating as failure`, + ); + return false; + } log.info(`resumed orphaned session: ${params.sessionKey}`); return true; } catch (err) { @@ -125,9 +131,11 @@ async function resumeOrphanedSession(params: { */ export async function recoverOrphanedSubagentSessions(params: { getActiveRuns: () => Map; + /** Persisted across retries so already-resumed sessions are not resumed again. */ + resumedSessionKeys?: Set; }): Promise<{ recovered: number; failed: number; skipped: number }> { const result = { recovered: 0, failed: 0, skipped: 0 }; - const resumedSessionKeys = new Set(); + const resumedSessionKeys = params.resumedSessionKeys ?? new Set(); const configChangePattern = /openclaw\.json|openclaw gateway restart|config\.patch/i; try { @@ -236,6 +244,10 @@ export async function recoverOrphanedSubagentSessions(params: { } } catch (err) { log.warn(`orphan recovery scan failed: ${String(err)}`); + // Ensure retry logic fires for scan-level exceptions. + if (result.failed === 0) { + result.failed = 1; + } } if (result.recovered > 0 || result.failed > 0) { @@ -265,9 +277,11 @@ export function scheduleOrphanRecovery(params: { const initialDelay = params.delayMs ?? DEFAULT_RECOVERY_DELAY_MS; const maxRetries = params.maxRetries ?? MAX_RECOVERY_RETRIES; + const resumedSessionKeys = new Set(); + const attemptRecovery = (attempt: number, delay: number) => { setTimeout(() => { - void recoverOrphanedSubagentSessions(params) + void recoverOrphanedSubagentSessions({ ...params, resumedSessionKeys }) .then((result) => { if (result.failed > 0 && attempt < maxRetries) { const nextDelay = delay * RETRY_BACKOFF_MULTIPLIER; diff --git a/src/cli/daemon-cli/lifecycle.ts b/src/cli/daemon-cli/lifecycle.ts index 76099fe956c..d3e01f66412 100644 --- a/src/cli/daemon-cli/lifecycle.ts +++ b/src/cli/daemon-cli/lifecycle.ts @@ -51,8 +51,7 @@ function resolveGatewayPortFallback(): Promise { async function assertUnmanagedGatewayRestartEnabled(port: number): Promise { const cfg = await readBestEffortConfig().catch(() => undefined); - const tlsEnabled = !!(cfg as { gateway?: { tls?: { enabled?: unknown } } } | undefined)?.gateway - ?.tls?.enabled; + const tlsEnabled = !!cfg?.gateway?.tls?.enabled; const scheme = tlsEnabled ? "wss" : "ws"; const probe = await probeGateway({ url: `${scheme}://127.0.0.1:${port}`, diff --git a/src/infra/restart.ts b/src/infra/restart.ts index 1238b4954d0..f671df382e2 100644 --- a/src/infra/restart.ts +++ b/src/infra/restart.ts @@ -1,6 +1,7 @@ import { spawnSync } from "node:child_process"; import os from "node:os"; import path from "node:path"; +import { loadConfig } from "../config/config.js"; import { resolveGatewayLaunchAgentLabel, resolveGatewaySystemdServiceName, @@ -476,7 +477,11 @@ export function scheduleGatewaySigusr1Restart(opts?: { emitGatewayRestart(); return; } - deferGatewayRestartUntilIdle({ getPendingCount: pendingCheck }); + const cfg = loadConfig(); + deferGatewayRestartUntilIdle({ + getPendingCount: pendingCheck, + maxWaitMs: cfg.gateway?.reload?.deferralTimeoutMs, + }); }, Math.max(0, requestedDueAt - nowMs), );