refactor: focus crash-loop rollback PR
This commit is contained in:
parent
eecd8a9711
commit
46cd41bde0
@ -1,9 +1,6 @@
|
|||||||
import fs from "node:fs/promises";
|
import fs from "node:fs/promises";
|
||||||
import os from "node:os";
|
import os from "node:os";
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
import { stopGatewayViaLock } from "../infra/gateway-lock.js";
|
|
||||||
import type { GatewayServiceRuntime } from "./service-runtime.js";
|
|
||||||
import { colorize, isRich, theme } from "../terminal/theme.js";
|
|
||||||
import { parseStrictInteger, parseStrictPositiveInteger } from "../infra/parse-finite-number.js";
|
import { parseStrictInteger, parseStrictPositiveInteger } from "../infra/parse-finite-number.js";
|
||||||
import { splitArgsPreservingQuotes } from "./arg-split.js";
|
import { splitArgsPreservingQuotes } from "./arg-split.js";
|
||||||
import {
|
import {
|
||||||
@ -544,30 +541,6 @@ export async function uninstallSystemdService({
|
|||||||
|
|
||||||
async function runSystemdServiceAction(params: {
|
async function runSystemdServiceAction(params: {
|
||||||
stdout: NodeJS.WritableStream;
|
stdout: NodeJS.WritableStream;
|
||||||
env?: Record<string, string | undefined>;
|
|
||||||
}): Promise<void> {
|
|
||||||
try {
|
|
||||||
await assertSystemdAvailable();
|
|
||||||
const serviceName = resolveSystemdServiceName(env ?? {});
|
|
||||||
const unitName = `${serviceName}.service`;
|
|
||||||
const res = await execSystemctl(["--user", "stop", unitName]);
|
|
||||||
if (res.code !== 0) {
|
|
||||||
throw new Error(`systemctl stop failed: ${res.stderr || res.stdout}`.trim());
|
|
||||||
}
|
|
||||||
stdout.write(`${formatLine("Stopped systemd service", unitName)}\n`);
|
|
||||||
} catch (err) {
|
|
||||||
const detail = String(err);
|
|
||||||
if (detail.includes("systemctl not available") || detail.includes("systemctl --user unavailable")) {
|
|
||||||
const stopped = await stopGatewayViaLock({
|
|
||||||
env: (env ?? process.env) as NodeJS.ProcessEnv,
|
|
||||||
stdout,
|
|
||||||
});
|
|
||||||
if (stopped) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
throw err;
|
|
||||||
}
|
|
||||||
env?: GatewayServiceEnv;
|
env?: GatewayServiceEnv;
|
||||||
action: "stop" | "restart";
|
action: "stop" | "restart";
|
||||||
label: string;
|
label: string;
|
||||||
@ -598,33 +571,6 @@ export async function stopSystemdService({
|
|||||||
export async function restartSystemdService({
|
export async function restartSystemdService({
|
||||||
stdout,
|
stdout,
|
||||||
env,
|
env,
|
||||||
}: {
|
|
||||||
stdout: NodeJS.WritableStream;
|
|
||||||
env?: Record<string, string | undefined>;
|
|
||||||
}): Promise<void> {
|
|
||||||
try {
|
|
||||||
await assertSystemdAvailable();
|
|
||||||
const serviceName = resolveSystemdServiceName(env ?? {});
|
|
||||||
const unitName = `${serviceName}.service`;
|
|
||||||
const res = await execSystemctl(["--user", "restart", unitName]);
|
|
||||||
if (res.code !== 0) {
|
|
||||||
throw new Error(`systemctl restart failed: ${res.stderr || res.stdout}`.trim());
|
|
||||||
}
|
|
||||||
stdout.write(`${formatLine("Restarted systemd service", unitName)}\n`);
|
|
||||||
} catch (err) {
|
|
||||||
const detail = String(err);
|
|
||||||
if (detail.includes("systemctl not available") || detail.includes("systemctl --user unavailable")) {
|
|
||||||
const stopped = await stopGatewayViaLock({
|
|
||||||
env: (env ?? process.env) as NodeJS.ProcessEnv,
|
|
||||||
stdout,
|
|
||||||
});
|
|
||||||
if (stopped) {
|
|
||||||
stdout.write("Gateway process stopped via fallback; it will be restarted by the system (if using cron/watcher) or can be started manually.\n");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
throw err;
|
|
||||||
}
|
|
||||||
}: GatewayServiceControlArgs): Promise<GatewayServiceRestartResult> {
|
}: GatewayServiceControlArgs): Promise<GatewayServiceRestartResult> {
|
||||||
await runSystemdServiceAction({
|
await runSystemdServiceAction({
|
||||||
stdout,
|
stdout,
|
||||||
|
|||||||
@ -11,7 +11,6 @@ import { createDefaultDeps } from "../cli/deps.js";
|
|||||||
import { isRestartEnabled } from "../config/commands.js";
|
import { isRestartEnabled } from "../config/commands.js";
|
||||||
import {
|
import {
|
||||||
CONFIG_PATH,
|
CONFIG_PATH,
|
||||||
STATE_DIR,
|
|
||||||
type OpenClawConfig,
|
type OpenClawConfig,
|
||||||
isNixMode,
|
isNixMode,
|
||||||
loadConfig,
|
loadConfig,
|
||||||
@ -26,6 +25,7 @@ import {
|
|||||||
scheduleLastKnownGoodSave,
|
scheduleLastKnownGoodSave,
|
||||||
} from "../config/crash-tracker.js";
|
} from "../config/crash-tracker.js";
|
||||||
import { formatConfigIssueLines } from "../config/issue-format.js";
|
import { formatConfigIssueLines } from "../config/issue-format.js";
|
||||||
|
import { resolveStateDir } from "../config/paths.js";
|
||||||
import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js";
|
import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js";
|
||||||
import { resolveMainSessionKey } from "../config/sessions.js";
|
import { resolveMainSessionKey } from "../config/sessions.js";
|
||||||
import { clearAgentRunContext, onAgentEvent } from "../infra/agent-events.js";
|
import { clearAgentRunContext, onAgentEvent } from "../infra/agent-events.js";
|
||||||
@ -275,15 +275,17 @@ export async function startGatewayServer(
|
|||||||
port = 18789,
|
port = 18789,
|
||||||
opts: GatewayServerOptions = {},
|
opts: GatewayServerOptions = {},
|
||||||
): Promise<GatewayServer> {
|
): Promise<GatewayServer> {
|
||||||
const isCrashLoop = recordStartupAndCheckCrashLoop(STATE_DIR);
|
const stateDir = resolveStateDir();
|
||||||
|
const isCrashLoop = recordStartupAndCheckCrashLoop(stateDir);
|
||||||
if (isCrashLoop) {
|
if (isCrashLoop) {
|
||||||
if (hasLastKnownGood(CONFIG_PATH)) {
|
if (hasLastKnownGood(CONFIG_PATH)) {
|
||||||
log.warn("gateway: crash loop detected; reverting to last-known-good config");
|
log.warn("gateway: crash loop detected; reverting to last-known-good config");
|
||||||
revertToLastKnownGood(CONFIG_PATH, STATE_DIR);
|
revertToLastKnownGood(CONFIG_PATH, stateDir);
|
||||||
} else {
|
} else {
|
||||||
log.warn("gateway: crash loop detected but no last-known-good config found");
|
log.warn("gateway: crash loop detected but no last-known-good config found");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const minimalTestGateway =
|
const minimalTestGateway =
|
||||||
process.env.VITEST === "1" && process.env.OPENCLAW_TEST_MINIMAL_GATEWAY === "1";
|
process.env.VITEST === "1" && process.env.OPENCLAW_TEST_MINIMAL_GATEWAY === "1";
|
||||||
|
|
||||||
@ -1079,7 +1081,7 @@ export async function startGatewayServer(
|
|||||||
httpServers,
|
httpServers,
|
||||||
});
|
});
|
||||||
|
|
||||||
scheduleLastKnownGoodSave(CONFIG_PATH, STATE_DIR);
|
scheduleLastKnownGoodSave(CONFIG_PATH, stateDir);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
close: async (opts) => {
|
close: async (opts) => {
|
||||||
|
|||||||
@ -136,7 +136,7 @@ async function resolveGatewayOwnerStatus(
|
|||||||
return isGatewayArgv(args) ? "alive" : "dead";
|
return isGatewayArgv(args) ? "alive" : "dead";
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
|
async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
|
||||||
try {
|
try {
|
||||||
const raw = await fs.readFile(lockPath, "utf8");
|
const raw = await fs.readFile(lockPath, "utf8");
|
||||||
const parsed = JSON.parse(raw) as Partial<LockPayload>;
|
const parsed = JSON.parse(raw) as Partial<LockPayload>;
|
||||||
@ -161,7 +161,7 @@ export async function readLockPayload(lockPath: string): Promise<LockPayload | n
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export function resolveGatewayLockPath(env: NodeJS.ProcessEnv) {
|
function resolveGatewayLockPath(env: NodeJS.ProcessEnv) {
|
||||||
const stateDir = resolveStateDir(env);
|
const stateDir = resolveStateDir(env);
|
||||||
const configPath = resolveConfigPath(env, stateDir);
|
const configPath = resolveConfigPath(env, stateDir);
|
||||||
const hash = createHash("sha256").update(configPath).digest("hex").slice(0, 8);
|
const hash = createHash("sha256").update(configPath).digest("hex").slice(0, 8);
|
||||||
@ -170,54 +170,6 @@ export function resolveGatewayLockPath(env: NodeJS.ProcessEnv) {
|
|||||||
return { lockPath, configPath };
|
return { lockPath, configPath };
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Attempt to stop a running gateway by reading its PID from the lock file
|
|
||||||
* and sending SIGTERM.
|
|
||||||
*/
|
|
||||||
export async function stopGatewayViaLock(opts: {
|
|
||||||
env: NodeJS.ProcessEnv;
|
|
||||||
stdout: NodeJS.WritableStream;
|
|
||||||
timeoutMs?: number;
|
|
||||||
}): Promise<boolean> {
|
|
||||||
const { lockPath } = resolveGatewayLockPath(opts.env);
|
|
||||||
const payload = await readLockPayload(lockPath);
|
|
||||||
if (!payload || !payload.pid) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
const pid = payload.pid;
|
|
||||||
if (!isAlive(pid)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
opts.stdout.write(`Stopping gateway (pid ${pid})...\n`);
|
|
||||||
try {
|
|
||||||
process.kill(pid, "SIGTERM");
|
|
||||||
} catch (err) {
|
|
||||||
opts.stdout.write(`Failed to send SIGTERM to pid ${pid}: ${String(err)}\n`);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Wait for it to die
|
|
||||||
const timeoutMs = opts.timeoutMs ?? 5000;
|
|
||||||
const startedAt = Date.now();
|
|
||||||
while (Date.now() - startedAt < timeoutMs) {
|
|
||||||
if (!isAlive(pid)) {
|
|
||||||
opts.stdout.write(`Gateway (pid ${pid}) stopped.\n`);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
await new Promise((r) => setTimeout(r, 200));
|
|
||||||
}
|
|
||||||
|
|
||||||
opts.stdout.write(`Gateway (pid ${pid}) did not stop after ${timeoutMs}ms. Sending SIGKILL...\n`);
|
|
||||||
try {
|
|
||||||
process.kill(pid, "SIGKILL");
|
|
||||||
return true;
|
|
||||||
} catch {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
export async function acquireGatewayLock(
|
export async function acquireGatewayLock(
|
||||||
opts: GatewayLockOptions = {},
|
opts: GatewayLockOptions = {},
|
||||||
): Promise<GatewayLockHandle | null> {
|
): Promise<GatewayLockHandle | null> {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user