Compare commits

...

7 Commits

Author SHA1 Message Date
Peter Steinberger
67e22fa07f fix: add changelog for config restart guard (#38699) (thanks @lml2468) 2026-03-09 05:53:42 +00:00
merlin
ff764ee690 fix: release gateway lock on restart failure + reply to Codex reviews
- Release gateway lock when in-process restart fails, so daemon
  restart/stop can still manage the process (Codex P2)
- P1 (env mismatch) already addressed: best-effort by design, documented
  in JSDoc
2026-03-09 05:39:42 +00:00
merlin
5f85b654f0 fix: move config pre-flight before onNotLoaded in runServiceRestart (Codex P2)
The config check was positioned after onNotLoaded, which could send
SIGUSR1 to an unmanaged process before config was validated.
2026-03-09 05:39:42 +00:00
merlin
954928a90d fix: address bot review feedback on #35862
- Remove dead 'return false' in runServiceStart (Greptile)
- Include stack trace in run-loop crash guard error log (Greptile)
- Only catch startup errors on subsequent restarts, not initial start (Codex P1)
- Add JSDoc note about env var false positive edge case (Codex P1)
2026-03-09 05:39:42 +00:00
merlin
ac27fa485a test: add runServiceStart config pre-flight tests (#35862)
Address Greptile review: add test coverage for runServiceStart path.
The error message copy-paste issue was already fixed in the DRY refactor
(uses params.serviceNoun instead of hardcoded 'restart').
2026-03-09 05:39:42 +00:00
merlin
b11933d8a9 fix(gateway): catch startup failure in run loop to prevent process exit (#35862)
When an in-process restart (SIGUSR1) triggers a config-triggered restart
and the new config is invalid, params.start() throws and the while loop
exits, killing the process. On macOS this loses TCC permissions.

Wrap params.start() in try/catch: on failure, set server=null, log the
error, and wait for the next SIGUSR1 instead of crashing.
2026-03-09 05:39:41 +00:00
merlin
da4e43d52c fix(gateway): validate config before restart to prevent crash + macOS permission loss (#35862)
When 'openclaw gateway restart' is run with an invalid config, the new
process crashes on startup due to config validation failure. On macOS,
this causes Full Disk Access (TCC) permissions to be lost because the
respawned process has a different PID.

Add getConfigValidationError() helper and pre-flight config validation
in both runServiceRestart() and runServiceStart(). If config is invalid,
abort with a clear error message instead of crashing.

The config watcher's hot-reload path already had this guard
(handleInvalidSnapshot), but the CLI restart/start commands did not.

AI-assisted (OpenClaw agent, fully tested)
2026-03-09 05:39:41 +00:00
4 changed files with 281 additions and 2 deletions

View File

@ -58,6 +58,7 @@ Docs: https://docs.openclaw.ai
- Matrix/DM routing: add safer fallback detection for broken `m.direct` homeservers, honor explicit room bindings over DM classification, and preserve room-bound agent selection for Matrix DM rooms. (#19736) Thanks @derbronko.
- Cron/Telegram announce delivery: route text-only announce jobs through the real outbound adapters after finalizing descendant output so plain Telegram targets no longer report `delivered: true` when no message actually reached Telegram. (#40575) thanks @obviyus.
- Gateway/restart timeout recovery: exit non-zero when restart-triggered shutdown drains time out so launchd/systemd restart the gateway instead of treating the failed restart as a clean stop. Landed from contributor PR #40380 by @dsantoreis. Thanks @dsantoreis.
- Gateway/config restart guard: validate config before service start/restart and keep post-SIGUSR1 startup failures from crashing the gateway process, reducing invalid-config restart loops and macOS permission loss. Landed from contributor PR #38699 by @lml2468. Thanks @lml2468.
## 2026.3.7

View File

@ -0,0 +1,206 @@
import { beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
const readConfigFileSnapshotMock = vi.fn();
const loadConfig = vi.fn(() => ({}));
const runtimeLogs: string[] = [];
const defaultRuntime = {
log: (message: string) => runtimeLogs.push(message),
error: vi.fn(),
exit: (code: number) => {
throw new Error(`__exit__:${code}`);
},
};
const service = {
label: "TestService",
loadedText: "loaded",
notLoadedText: "not loaded",
install: vi.fn(),
uninstall: vi.fn(),
stop: vi.fn(),
isLoaded: vi.fn(),
readCommand: vi.fn(),
readRuntime: vi.fn(),
restart: vi.fn(),
};
vi.mock("../../config/config.js", () => ({
loadConfig: () => loadConfig(),
readConfigFileSnapshot: () => readConfigFileSnapshotMock(),
}));
vi.mock("../../config/issue-format.js", () => ({
formatConfigIssueLines: (
issues: Array<{ path: string; message: string }>,
_prefix: string,
_opts?: unknown,
) => issues.map((i) => `${i.path}: ${i.message}`),
}));
vi.mock("../../runtime.js", () => ({
defaultRuntime,
}));
describe("runServiceRestart config pre-flight (#35862)", () => {
let runServiceRestart: typeof import("./lifecycle-core.js").runServiceRestart;
beforeAll(async () => {
({ runServiceRestart } = await import("./lifecycle-core.js"));
});
beforeEach(() => {
runtimeLogs.length = 0;
readConfigFileSnapshotMock.mockReset();
readConfigFileSnapshotMock.mockResolvedValue({
exists: true,
valid: true,
config: {},
issues: [],
});
loadConfig.mockReset();
loadConfig.mockReturnValue({});
service.isLoaded.mockClear();
service.readCommand.mockClear();
service.restart.mockClear();
service.isLoaded.mockResolvedValue(true);
service.readCommand.mockResolvedValue({ environment: {} });
service.restart.mockResolvedValue(undefined);
vi.unstubAllEnvs();
vi.stubEnv("OPENCLAW_GATEWAY_TOKEN", "");
vi.stubEnv("CLAWDBOT_GATEWAY_TOKEN", "");
});
it("aborts restart when config is invalid", async () => {
readConfigFileSnapshotMock.mockResolvedValue({
exists: true,
valid: false,
config: {},
issues: [{ path: "agents.defaults.pdfModel", message: "Unrecognized key" }],
});
await expect(
runServiceRestart({
serviceNoun: "Gateway",
service,
renderStartHints: () => [],
opts: { json: true },
}),
).rejects.toThrow("__exit__:1");
expect(service.restart).not.toHaveBeenCalled();
});
it("proceeds with restart when config is valid", async () => {
readConfigFileSnapshotMock.mockResolvedValue({
exists: true,
valid: true,
config: {},
issues: [],
});
const result = await runServiceRestart({
serviceNoun: "Gateway",
service,
renderStartHints: () => [],
opts: { json: true },
});
expect(result).toBe(true);
expect(service.restart).toHaveBeenCalledTimes(1);
});
it("proceeds with restart when config file does not exist", async () => {
readConfigFileSnapshotMock.mockResolvedValue({
exists: false,
valid: true,
config: {},
issues: [],
});
const result = await runServiceRestart({
serviceNoun: "Gateway",
service,
renderStartHints: () => [],
opts: { json: true },
});
expect(result).toBe(true);
expect(service.restart).toHaveBeenCalledTimes(1);
});
it("proceeds with restart when snapshot read throws", async () => {
readConfigFileSnapshotMock.mockRejectedValue(new Error("read failed"));
const result = await runServiceRestart({
serviceNoun: "Gateway",
service,
renderStartHints: () => [],
opts: { json: true },
});
expect(result).toBe(true);
expect(service.restart).toHaveBeenCalledTimes(1);
});
});
describe("runServiceStart config pre-flight (#35862)", () => {
let runServiceStart: typeof import("./lifecycle-core.js").runServiceStart;
beforeAll(async () => {
({ runServiceStart } = await import("./lifecycle-core.js"));
});
beforeEach(() => {
runtimeLogs.length = 0;
readConfigFileSnapshotMock.mockReset();
readConfigFileSnapshotMock.mockResolvedValue({
exists: true,
valid: true,
config: {},
issues: [],
});
service.isLoaded.mockClear();
service.restart.mockClear();
service.isLoaded.mockResolvedValue(true);
service.restart.mockResolvedValue(undefined);
});
it("aborts start when config is invalid", async () => {
readConfigFileSnapshotMock.mockResolvedValue({
exists: true,
valid: false,
config: {},
issues: [{ path: "agents.defaults.pdfModel", message: "Unrecognized key" }],
});
await expect(
runServiceStart({
serviceNoun: "Gateway",
service,
renderStartHints: () => [],
opts: { json: true },
}),
).rejects.toThrow("__exit__:1");
expect(service.restart).not.toHaveBeenCalled();
});
it("proceeds with start when config is valid", async () => {
readConfigFileSnapshotMock.mockResolvedValue({
exists: true,
valid: true,
config: {},
issues: [],
});
await runServiceStart({
serviceNoun: "Gateway",
service,
renderStartHints: () => [],
opts: { json: true },
});
expect(service.restart).toHaveBeenCalledTimes(1);
});
});

View File

@ -1,5 +1,6 @@
import type { Writable } from "node:stream";
import { readBestEffortConfig } from "../../config/config.js";
import { readBestEffortConfig, readConfigFileSnapshot } from "../../config/config.js";
import { formatConfigIssueLines } from "../../config/issue-format.js";
import { resolveIsNixMode } from "../../config/paths.js";
import { checkTokenDrift } from "../../daemon/service-audit.js";
import type { GatewayService } from "../../daemon/service.js";
@ -107,6 +108,29 @@ async function resolveServiceLoadedOrFail(params: {
}
}
/**
* Best-effort config validation. Returns a string describing the issues if
* config exists and is invalid, or null if config is valid/missing/unreadable.
*
* Note: This reads the config file snapshot in the current CLI environment.
* Configs using env vars only available in the service context (launchd/systemd)
* may produce false positives, but the check is intentionally best-effort
* a false positive here is safer than a crash on startup. (#35862)
*/
async function getConfigValidationError(): Promise<string | null> {
try {
const snapshot = await readConfigFileSnapshot();
if (!snapshot.exists || snapshot.valid) {
return null;
}
return snapshot.issues.length > 0
? formatConfigIssueLines(snapshot.issues, "", { normalizeRoot: true }).join("\n")
: "Unknown validation issue.";
} catch {
return null;
}
}
export async function runServiceUninstall(params: {
serviceNoun: string;
service: GatewayService;
@ -187,6 +211,17 @@ export async function runServiceStart(params: {
});
return;
}
// Pre-flight config validation (#35862)
{
const configError = await getConfigValidationError();
if (configError) {
fail(
`${params.serviceNoun} aborted: config is invalid.\n${configError}\nFix the config and retry, or run "openclaw doctor" to repair.`,
);
return;
}
}
try {
await params.service.restart({ env: process.env, stdout });
} catch (err) {
@ -298,6 +333,19 @@ export async function runServiceRestart(params: {
if (loaded === null) {
return false;
}
// Pre-flight config validation: check before any restart action (including
// onNotLoaded which may send SIGUSR1 to an unmanaged process). (#35862)
{
const configError = await getConfigValidationError();
if (configError) {
fail(
`${params.serviceNoun} aborted: config is invalid.\n${configError}\nFix the config and retry, or run "openclaw doctor" to repair.`,
);
return false;
}
}
if (!loaded) {
try {
handledNotLoaded = (await params.onNotLoaded?.({ json, stdout, fail })) ?? null;

View File

@ -190,10 +190,34 @@ export async function runGatewayLoop(params: {
// Keep process alive; SIGUSR1 triggers an in-process restart (no supervisor required).
// SIGTERM/SIGINT still exit after a graceful shutdown.
let isFirstStart = true;
// eslint-disable-next-line no-constant-condition
while (true) {
onIteration();
server = await params.start();
try {
server = await params.start();
isFirstStart = false;
} catch (err) {
// On initial startup, let the error propagate so the outer handler
// can report "Gateway failed to start" and exit non-zero. Only
// swallow errors on subsequent in-process restarts to keep the
// process alive (a crash would lose macOS TCC permissions). (#35862)
if (isFirstStart) {
throw err;
}
server = null;
// Release the gateway lock so that `daemon restart/stop` (which
// discovers PIDs via the gateway port) can still manage the process.
// Without this, the process holds the lock but is not listening,
// forcing manual cleanup. (#35862)
await releaseLockIfHeld();
const errMsg = err instanceof Error ? err.message : String(err);
const errStack = err instanceof Error && err.stack ? `\n${err.stack}` : "";
gatewayLog.error(
`gateway startup failed: ${errMsg}. ` +
`Process will stay alive; fix the issue and restart.${errStack}`,
);
}
await new Promise<void>((resolve) => {
restartResolver = resolve;
});