fix: harden update restart service convergence
This commit is contained in:
parent
59c78c105a
commit
e93ba6ce2a
@ -1,3 +1,4 @@
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import type { OpenClawConfig, ConfigFileSnapshot } from "../config/types.openclaw.js";
|
||||
@ -16,6 +17,10 @@ const serviceLoaded = vi.fn();
|
||||
const prepareRestartScript = vi.fn();
|
||||
const runRestartScript = vi.fn();
|
||||
const mockedRunDaemonInstall = vi.fn();
|
||||
const serviceReadRuntime = vi.fn();
|
||||
const inspectPortUsage = vi.fn();
|
||||
const classifyPortListener = vi.fn();
|
||||
const formatPortDiagnostics = vi.fn();
|
||||
|
||||
vi.mock("@clack/prompts", () => ({
|
||||
confirm,
|
||||
@ -35,6 +40,7 @@ vi.mock("../infra/openclaw-root.js", () => ({
|
||||
|
||||
vi.mock("../config/config.js", () => ({
|
||||
readConfigFileSnapshot: vi.fn(),
|
||||
resolveGatewayPort: vi.fn(() => 18789),
|
||||
writeConfigFile: vi.fn(),
|
||||
}));
|
||||
|
||||
@ -80,9 +86,16 @@ vi.mock("./update-cli/shared.js", async (importOriginal) => {
|
||||
vi.mock("../daemon/service.js", () => ({
|
||||
resolveGatewayService: vi.fn(() => ({
|
||||
isLoaded: (...args: unknown[]) => serviceLoaded(...args),
|
||||
readRuntime: (...args: unknown[]) => serviceReadRuntime(...args),
|
||||
})),
|
||||
}));
|
||||
|
||||
vi.mock("../infra/ports.js", () => ({
|
||||
inspectPortUsage: (...args: unknown[]) => inspectPortUsage(...args),
|
||||
classifyPortListener: (...args: unknown[]) => classifyPortListener(...args),
|
||||
formatPortDiagnostics: (...args: unknown[]) => formatPortDiagnostics(...args),
|
||||
}));
|
||||
|
||||
vi.mock("./update-cli/restart-helper.js", () => ({
|
||||
prepareRestartScript: (...args: unknown[]) => prepareRestartScript(...args),
|
||||
runRestartScript: (...args: unknown[]) => runRestartScript(...args),
|
||||
@ -230,8 +243,12 @@ describe("update-cli", () => {
|
||||
readPackageVersion.mockReset();
|
||||
resolveGlobalManager.mockReset();
|
||||
serviceLoaded.mockReset();
|
||||
serviceReadRuntime.mockReset();
|
||||
prepareRestartScript.mockReset();
|
||||
runRestartScript.mockReset();
|
||||
inspectPortUsage.mockReset();
|
||||
classifyPortListener.mockReset();
|
||||
formatPortDiagnostics.mockReset();
|
||||
vi.mocked(resolveOpenClawPackageRoot).mockResolvedValue(process.cwd());
|
||||
vi.mocked(readConfigFileSnapshot).mockResolvedValue(baseSnapshot);
|
||||
vi.mocked(fetchNpmTagVersion).mockResolvedValue({
|
||||
@ -279,8 +296,21 @@ describe("update-cli", () => {
|
||||
readPackageVersion.mockResolvedValue("1.0.0");
|
||||
resolveGlobalManager.mockResolvedValue("npm");
|
||||
serviceLoaded.mockResolvedValue(false);
|
||||
serviceReadRuntime.mockResolvedValue({
|
||||
status: "running",
|
||||
pid: 4242,
|
||||
state: "running",
|
||||
});
|
||||
prepareRestartScript.mockResolvedValue("/tmp/openclaw-restart-test.sh");
|
||||
runRestartScript.mockResolvedValue(undefined);
|
||||
inspectPortUsage.mockResolvedValue({
|
||||
port: 18789,
|
||||
status: "busy",
|
||||
listeners: [{ pid: 4242, command: "openclaw-gateway" }],
|
||||
hints: [],
|
||||
});
|
||||
classifyPortListener.mockReturnValue("gateway");
|
||||
formatPortDiagnostics.mockReturnValue(["Port 18789 is already in use."]);
|
||||
vi.mocked(runDaemonInstall).mockResolvedValue(undefined);
|
||||
setTty(false);
|
||||
setStdoutTty(false);
|
||||
@ -486,6 +516,36 @@ describe("update-cli", () => {
|
||||
expect(runDaemonRestart).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("updateCommand refreshes service env from updated install root when available", async () => {
|
||||
const root = createCaseDir("openclaw-updated-root");
|
||||
await fs.mkdir(path.join(root, "dist"), { recursive: true });
|
||||
await fs.writeFile(path.join(root, "dist", "entry.js"), "console.log('ok');\n", "utf8");
|
||||
|
||||
vi.mocked(runGatewayUpdate).mockResolvedValue({
|
||||
status: "ok",
|
||||
mode: "npm",
|
||||
root,
|
||||
steps: [],
|
||||
durationMs: 100,
|
||||
});
|
||||
serviceLoaded.mockResolvedValue(true);
|
||||
|
||||
await updateCommand({});
|
||||
|
||||
expect(runCommandWithTimeout).toHaveBeenCalledWith(
|
||||
[
|
||||
expect.stringMatching(/node/),
|
||||
path.join(root, "dist", "entry.js"),
|
||||
"gateway",
|
||||
"install",
|
||||
"--force",
|
||||
],
|
||||
expect.objectContaining({ timeoutMs: 60_000 }),
|
||||
);
|
||||
expect(runDaemonInstall).not.toHaveBeenCalled();
|
||||
expect(runRestartScript).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("updateCommand falls back to restart when env refresh install fails", async () => {
|
||||
const mockResult: UpdateRunResult = {
|
||||
status: "ok",
|
||||
|
||||
@ -5,8 +5,19 @@ import {
|
||||
ensureCompletionCacheExists,
|
||||
} from "../../commands/doctor-completion.js";
|
||||
import { doctorCommand } from "../../commands/doctor.js";
|
||||
import { readConfigFileSnapshot, writeConfigFile } from "../../config/config.js";
|
||||
import {
|
||||
readConfigFileSnapshot,
|
||||
resolveGatewayPort,
|
||||
writeConfigFile,
|
||||
} from "../../config/config.js";
|
||||
import type { GatewayServiceRuntime } from "../../daemon/service-runtime.js";
|
||||
import { resolveGatewayService } from "../../daemon/service.js";
|
||||
import {
|
||||
classifyPortListener,
|
||||
formatPortDiagnostics,
|
||||
inspectPortUsage,
|
||||
type PortUsage,
|
||||
} from "../../infra/ports.js";
|
||||
import {
|
||||
channelToNpmTag,
|
||||
DEFAULT_GIT_CHANNEL,
|
||||
@ -29,7 +40,7 @@ import { runCommandWithTimeout } from "../../process/exec.js";
|
||||
import { defaultRuntime } from "../../runtime.js";
|
||||
import { stylePromptMessage } from "../../terminal/prompt-style.js";
|
||||
import { theme } from "../../terminal/theme.js";
|
||||
import { pathExists } from "../../utils.js";
|
||||
import { pathExists, sleep } from "../../utils.js";
|
||||
import { replaceCliName, resolveCliName } from "../cli-name.js";
|
||||
import { formatCliCommand } from "../command-format.js";
|
||||
import { installCompletion } from "../completion-cli.js";
|
||||
@ -55,6 +66,9 @@ import {
|
||||
import { suppressDeprecations } from "./suppress-deprecations.js";
|
||||
|
||||
const CLI_NAME = resolveCliName();
|
||||
const SERVICE_REFRESH_TIMEOUT_MS = 60_000;
|
||||
const POST_RESTART_HEALTH_ATTEMPTS = 8;
|
||||
const POST_RESTART_HEALTH_DELAY_MS = 450;
|
||||
|
||||
const UPDATE_QUIPS = [
|
||||
"Leveled up! New skills unlocked. You're welcome.",
|
||||
@ -83,6 +97,180 @@ function pickUpdateQuip(): string {
|
||||
return UPDATE_QUIPS[Math.floor(Math.random() * UPDATE_QUIPS.length)] ?? "Update complete.";
|
||||
}
|
||||
|
||||
type GatewayRestartSnapshot = {
|
||||
runtime: GatewayServiceRuntime;
|
||||
portUsage: PortUsage;
|
||||
healthy: boolean;
|
||||
staleGatewayPids: number[];
|
||||
};
|
||||
|
||||
function resolveGatewayInstallEntrypointCandidates(root?: string): string[] {
|
||||
if (!root) {
|
||||
return [];
|
||||
}
|
||||
return [
|
||||
path.join(root, "dist", "entry.js"),
|
||||
path.join(root, "dist", "entry.mjs"),
|
||||
path.join(root, "dist", "index.js"),
|
||||
path.join(root, "dist", "index.mjs"),
|
||||
];
|
||||
}
|
||||
|
||||
function formatCommandFailure(stdout: string, stderr: string): string {
|
||||
const detail = (stderr || stdout).trim();
|
||||
if (!detail) {
|
||||
return "command returned a non-zero exit code";
|
||||
}
|
||||
return detail.split("\n").slice(-3).join("\n");
|
||||
}
|
||||
|
||||
async function refreshGatewayServiceEnv(params: {
|
||||
result: UpdateRunResult;
|
||||
jsonMode: boolean;
|
||||
}): Promise<void> {
|
||||
const args = ["gateway", "install", "--force"];
|
||||
if (params.jsonMode) {
|
||||
args.push("--json");
|
||||
}
|
||||
|
||||
for (const candidate of resolveGatewayInstallEntrypointCandidates(params.result.root)) {
|
||||
if (!(await pathExists(candidate))) {
|
||||
continue;
|
||||
}
|
||||
const res = await runCommandWithTimeout([resolveNodeRunner(), candidate, ...args], {
|
||||
timeoutMs: SERVICE_REFRESH_TIMEOUT_MS,
|
||||
});
|
||||
if (res.code === 0) {
|
||||
return;
|
||||
}
|
||||
throw new Error(
|
||||
`updated install refresh failed (${candidate}): ${formatCommandFailure(res.stdout, res.stderr)}`,
|
||||
);
|
||||
}
|
||||
|
||||
await runDaemonInstall({ force: true, json: params.jsonMode || undefined });
|
||||
}
|
||||
|
||||
async function inspectGatewayRestart(port: number): Promise<GatewayRestartSnapshot> {
|
||||
const service = resolveGatewayService();
|
||||
let runtime: GatewayServiceRuntime = { status: "unknown" };
|
||||
try {
|
||||
runtime = await service.readRuntime(process.env);
|
||||
} catch (err) {
|
||||
runtime = { status: "unknown", detail: String(err) };
|
||||
}
|
||||
|
||||
let portUsage: PortUsage;
|
||||
try {
|
||||
portUsage = await inspectPortUsage(port);
|
||||
} catch (err) {
|
||||
portUsage = {
|
||||
port,
|
||||
status: "unknown",
|
||||
listeners: [],
|
||||
hints: [],
|
||||
errors: [String(err)],
|
||||
};
|
||||
}
|
||||
|
||||
const gatewayListeners =
|
||||
portUsage.status === "busy"
|
||||
? portUsage.listeners.filter((listener) => classifyPortListener(listener, port) === "gateway")
|
||||
: [];
|
||||
const running = runtime.status === "running";
|
||||
const ownsPort =
|
||||
runtime.pid != null
|
||||
? portUsage.listeners.some((listener) => listener.pid === runtime.pid)
|
||||
: gatewayListeners.length > 0 ||
|
||||
(portUsage.status === "busy" && portUsage.listeners.length === 0);
|
||||
const healthy = running && ownsPort;
|
||||
const staleGatewayPids = Array.from(
|
||||
new Set(
|
||||
gatewayListeners
|
||||
.map((listener) => listener.pid)
|
||||
.filter((pid): pid is number => Number.isFinite(pid))
|
||||
.filter((pid) => runtime.pid == null || pid !== runtime.pid || !running),
|
||||
),
|
||||
);
|
||||
|
||||
return {
|
||||
runtime,
|
||||
portUsage,
|
||||
healthy,
|
||||
staleGatewayPids,
|
||||
};
|
||||
}
|
||||
|
||||
async function waitForGatewayHealthyRestart(port: number): Promise<GatewayRestartSnapshot> {
|
||||
let snapshot = await inspectGatewayRestart(port);
|
||||
for (let attempt = 0; attempt < POST_RESTART_HEALTH_ATTEMPTS; attempt += 1) {
|
||||
if (snapshot.healthy) {
|
||||
return snapshot;
|
||||
}
|
||||
if (snapshot.staleGatewayPids.length > 0 && snapshot.runtime.status !== "running") {
|
||||
return snapshot;
|
||||
}
|
||||
await sleep(POST_RESTART_HEALTH_DELAY_MS);
|
||||
snapshot = await inspectGatewayRestart(port);
|
||||
}
|
||||
return snapshot;
|
||||
}
|
||||
|
||||
function renderRestartDiagnostics(snapshot: GatewayRestartSnapshot): string[] {
|
||||
const lines: string[] = [];
|
||||
const runtimeSummary = [
|
||||
snapshot.runtime.status ? `status=${snapshot.runtime.status}` : null,
|
||||
snapshot.runtime.state ? `state=${snapshot.runtime.state}` : null,
|
||||
snapshot.runtime.pid != null ? `pid=${snapshot.runtime.pid}` : null,
|
||||
snapshot.runtime.lastExitStatus != null ? `lastExit=${snapshot.runtime.lastExitStatus}` : null,
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join(", ");
|
||||
if (runtimeSummary) {
|
||||
lines.push(`Service runtime: ${runtimeSummary}`);
|
||||
}
|
||||
if (snapshot.portUsage.status === "busy") {
|
||||
lines.push(...formatPortDiagnostics(snapshot.portUsage));
|
||||
} else {
|
||||
lines.push(`Gateway port ${snapshot.portUsage.port} status: ${snapshot.portUsage.status}.`);
|
||||
}
|
||||
if (snapshot.portUsage.errors?.length) {
|
||||
lines.push(`Port diagnostics errors: ${snapshot.portUsage.errors.join("; ")}`);
|
||||
}
|
||||
return lines;
|
||||
}
|
||||
|
||||
async function terminateStaleGatewayPids(pids: number[]): Promise<number[]> {
|
||||
const killed: number[] = [];
|
||||
for (const pid of pids) {
|
||||
try {
|
||||
process.kill(pid, "SIGTERM");
|
||||
killed.push(pid);
|
||||
} catch (err) {
|
||||
const code = (err as NodeJS.ErrnoException)?.code;
|
||||
if (code !== "ESRCH") {
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (killed.length === 0) {
|
||||
return killed;
|
||||
}
|
||||
await sleep(400);
|
||||
for (const pid of killed) {
|
||||
try {
|
||||
process.kill(pid, 0);
|
||||
process.kill(pid, "SIGKILL");
|
||||
} catch (err) {
|
||||
const code = (err as NodeJS.ErrnoException)?.code;
|
||||
if (code !== "ESRCH") {
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
}
|
||||
return killed;
|
||||
}
|
||||
|
||||
async function tryInstallShellCompletion(opts: {
|
||||
jsonMode: boolean;
|
||||
skipPrompt: boolean;
|
||||
@ -392,6 +580,7 @@ async function maybeRestartService(params: {
|
||||
result: UpdateRunResult;
|
||||
opts: UpdateCommandOptions;
|
||||
refreshServiceEnv: boolean;
|
||||
gatewayPort: number;
|
||||
restartScriptPath?: string | null;
|
||||
}): Promise<void> {
|
||||
if (params.shouldRestart) {
|
||||
@ -405,7 +594,10 @@ async function maybeRestartService(params: {
|
||||
let restartInitiated = false;
|
||||
if (params.refreshServiceEnv) {
|
||||
try {
|
||||
await runDaemonInstall({ force: true, json: params.opts.json });
|
||||
await refreshGatewayServiceEnv({
|
||||
result: params.result,
|
||||
jsonMode: Boolean(params.opts.json),
|
||||
});
|
||||
} catch (err) {
|
||||
if (!params.opts.json) {
|
||||
defaultRuntime.log(
|
||||
@ -441,12 +633,33 @@ async function maybeRestartService(params: {
|
||||
}
|
||||
|
||||
if (!params.opts.json && restartInitiated) {
|
||||
defaultRuntime.log(theme.success("Daemon restart initiated."));
|
||||
defaultRuntime.log(
|
||||
theme.muted(
|
||||
`Verify with \`${replaceCliName(formatCliCommand("openclaw gateway status"), CLI_NAME)}\` once the gateway is back.`,
|
||||
),
|
||||
);
|
||||
let health = await waitForGatewayHealthyRestart(params.gatewayPort);
|
||||
if (!health.healthy && health.staleGatewayPids.length > 0) {
|
||||
if (!params.opts.json) {
|
||||
defaultRuntime.log(
|
||||
theme.warn(
|
||||
`Found stale gateway process(es) after restart: ${health.staleGatewayPids.join(", ")}. Cleaning up...`,
|
||||
),
|
||||
);
|
||||
}
|
||||
await terminateStaleGatewayPids(health.staleGatewayPids);
|
||||
await runDaemonRestart();
|
||||
health = await waitForGatewayHealthyRestart(params.gatewayPort);
|
||||
}
|
||||
|
||||
if (health.healthy) {
|
||||
defaultRuntime.log(theme.success("Daemon restart completed."));
|
||||
} else {
|
||||
defaultRuntime.log(theme.warn("Gateway did not become healthy after restart."));
|
||||
for (const line of renderRestartDiagnostics(health)) {
|
||||
defaultRuntime.log(theme.muted(line));
|
||||
}
|
||||
defaultRuntime.log(
|
||||
theme.muted(
|
||||
`Run \`${replaceCliName(formatCliCommand("openclaw gateway status --probe --deep"), CLI_NAME)}\` for details.`,
|
||||
),
|
||||
);
|
||||
}
|
||||
defaultRuntime.log("");
|
||||
}
|
||||
} catch (err) {
|
||||
@ -686,6 +899,7 @@ export async function updateCommand(opts: UpdateCommandOptions): Promise<void> {
|
||||
result,
|
||||
opts,
|
||||
refreshServiceEnv: refreshGatewayServiceEnv,
|
||||
gatewayPort: resolveGatewayPort(configSnapshot.valid ? configSnapshot.config : undefined),
|
||||
restartScriptPath,
|
||||
});
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user