From 3e1bee931a87d7c87288aa004c98d9f9973533de Mon Sep 17 00:00:00 2001 From: Bryan Marty Date: Wed, 11 Mar 2026 04:38:00 +0000 Subject: [PATCH] feat: split restart sentinel into user-facing and internal context messages - Add formatRestartSentinelUserMessage for clean user-facing delivery (omits status prefix and doctorHint) - Add formatRestartSentinelInternalContext for agent system prompt injection with full diagnostic details - Update server-restart-sentinel to use userMessage for direct channel delivery - Add tests for new formatting functions --- docs/reference/templates/AGENTS.md | 31 ++++++ docs/zh-CN/reference/AGENTS.default.md | 23 +++++ src/agents/tools/gateway-tool.ts | 2 +- src/gateway/server-restart-sentinel.test.ts | 23 +++-- src/gateway/server-restart-sentinel.ts | 25 +++-- src/infra/restart-sentinel.test.ts | 109 +++++++++++++++++++- src/infra/restart-sentinel.ts | 41 ++++++++ 7 files changed, 238 insertions(+), 16 deletions(-) diff --git a/docs/reference/templates/AGENTS.md b/docs/reference/templates/AGENTS.md index 9375684b0dd..216a52cdbbe 100644 --- a/docs/reference/templates/AGENTS.md +++ b/docs/reference/templates/AGENTS.md @@ -214,6 +214,37 @@ Think of it like a human reviewing their journal and updating their mental model The goal: Be helpful without being annoying. Check in a few times a day, do useful background work, but respect quiet time. +## 🔄 Gateway Restarts — Do It Right! + +**Never use `openclaw gateway restart` (CLI/shell).** It bypasses the restart sentinel, so you won't auto-resume or notify the user after restart. You'll just sit there silently until someone pings you. + +**Always restart via the gateway tool** (action=restart) or via `config.patch`/`config.apply` — these write a sentinel file before restarting, which the new process consumes to wake you up and message the user automatically. 
+ +```bash +# ✅ Correct: restart via gateway tool (action=restart, sessionKey, note) +# ✅ Correct: config.patch with a key that requires restart (writes sentinel automatically) + +# ❌ Wrong: openclaw gateway restart — no sentinel, silent after restart +# ❌ Wrong: systemctl --user restart openclaw-gateway.service — same problem +``` + +### Which config keys trigger a real restart vs dynamic reload? + +**Full process restart** (sentinel written, agent wakes up): + +- `gateway.*`, `discovery.*`, `plugins.*`, `canvasHost.*` +- Any unrecognized/new config key + +**Hot reload** (no restart, no sentinel needed): + +- `hooks.*`, `cron.*`, `browser.*`, `models.*`, `agents.defaults.heartbeat` + +**Dynamic no-op** (read on next access, no process action): + +- `messages.*`, `agents.*`, `tools.*`, `routing.*`, `session.*`, `skills.*`, `secrets.*`, `meta.*`, `identity.*`, `logging.*`, `ui.*` + +**Rule of thumb:** If you want a test restart, patch `discovery.mdns.mode` to its current value — it's recognized as a restart-triggering key even if the value is unchanged. + ## Make It Yours This is a starting point. Add your own conventions, style, and rules as you figure out what works. diff --git a/docs/zh-CN/reference/AGENTS.default.md b/docs/zh-CN/reference/AGENTS.default.md index 84d4a01e21c..70e6688a374 100644 --- a/docs/zh-CN/reference/AGENTS.default.md +++ b/docs/zh-CN/reference/AGENTS.default.md @@ -120,6 +120,29 @@ git commit -m "Add Clawd workspace" - **bird** — X/Twitter CLI,无需浏览器即可发推、回复、阅读话题和搜索。 - **agent-tools** — 用于自动化和辅助脚本的实用工具包。 +## 🔄 网关重启 — 正确做法! + +**永远不要使用 `openclaw gateway restart`(CLI/shell)。** 这会绕过重启哨兵机制,导致重启后你无法自动恢复,也无法通知用户。你会静静地等待,直到有人 ping 你。 + +**始终通过 gateway 工具**(action=restart)或 `config.patch`/`config.apply` 触发重启——这些方式会在重启前写入哨兵文件,新进程启动后会消费该文件以唤醒你并自动通知用户。 + +### 哪些配置键会触发真正的重启? 
+ +**完整进程重启**(写入哨兵,代理唤醒): + +- `gateway.*`、`discovery.*`、`plugins.*`、`canvasHost.*` +- 任何无法识别的新配置键 + +**热重载**(无需重启,无需哨兵): + +- `hooks.*`、`cron.*`、`browser.*`、`models.*`、`agents.defaults.heartbeat` + +**动态无操作**(下次访问时读取,不触发任何进程操作): + +- `messages.*`、`agents.*`、`tools.*`、`routing.*`、`session.*`、`skills.*`、`secrets.*`、`meta.*`、`identity.*`、`logging.*`、`ui.*` + +**经验法则:** 如果需要测试重启,将 `discovery.mdns.mode` 修改为当前值——即使值未改变,它也会触发重启流程。 + ## 使用说明 - 脚本编写优先使用 `openclaw` CLI;mac 应用处理权限。 diff --git a/src/agents/tools/gateway-tool.ts b/src/agents/tools/gateway-tool.ts index ca125deb6a6..e49f73c57b5 100644 --- a/src/agents/tools/gateway-tool.ts +++ b/src/agents/tools/gateway-tool.ts @@ -80,7 +80,7 @@ export function createGatewayTool(opts?: { name: "gateway", ownerOnly: true, description: - "Restart, inspect a specific config schema path, apply config, or update the gateway in-place (SIGUSR1). Use config.schema.lookup with a targeted dot path before config edits. Use config.patch for safe partial config updates (merges with existing). Use config.apply only when replacing entire config. Both trigger restart after writing. Always pass a human-readable completion message via the `note` parameter so the system can deliver it to the user after restart.", + "Restart, inspect a specific config schema path, apply config, or update the gateway in-place (SIGUSR1). Use config.schema.lookup with a targeted dot path before config edits. Use config.patch for safe partial config updates (merges with existing). Use config.apply only when replacing entire config. Both trigger restart after writing. Always pass a human-readable completion message via the `note` parameter so the system can deliver it to the user after restart. IMPORTANT: Never use the `openclaw gateway restart` CLI command to restart — it bypasses the restart sentinel so the agent will not auto-resume or notify the user after restart. Always restart via this tool (action=restart) or via config.patch/config.apply, which write the sentinel before restarting. 
Config keys under gateway.*, discovery.*, plugins.*, and canvasHost.* trigger a real process restart; keys under messages.*, agents.*, tools.*, hooks.*, and most others apply dynamically without a restart.", parameters: GatewayToolSchema, execute: async (_toolCallId, args) => { const params = args as Record; diff --git a/src/gateway/server-restart-sentinel.test.ts b/src/gateway/server-restart-sentinel.test.ts index 008437bb037..87f3b8cee1f 100644 --- a/src/gateway/server-restart-sentinel.test.ts +++ b/src/gateway/server-restart-sentinel.test.ts @@ -13,6 +13,10 @@ const mocks = vi.hoisted(() => ({ }, })), formatRestartSentinelMessage: vi.fn(() => "restart message"), + formatRestartSentinelUserMessage: vi.fn(() => "Gateway restarted successfully."), + formatRestartSentinelInternalContext: vi.fn( + () => "[Gateway restart context — internal]\nkind: restart\nstatus: ok", + ), summarizeRestartSentinel: vi.fn(() => "restart summary"), resolveMainSessionKeyFromConfig: vi.fn(() => "agent:main:main"), parseSessionThreadInfo: vi.fn(() => ({ baseSessionKey: null, threadId: undefined })), @@ -39,6 +43,8 @@ vi.mock("../agents/agent-scope.js", () => ({ vi.mock("../infra/restart-sentinel.js", () => ({ consumeRestartSentinel: mocks.consumeRestartSentinel, formatRestartSentinelMessage: mocks.formatRestartSentinelMessage, + formatRestartSentinelUserMessage: mocks.formatRestartSentinelUserMessage, + formatRestartSentinelInternalContext: mocks.formatRestartSentinelInternalContext, summarizeRestartSentinel: mocks.summarizeRestartSentinel, })); @@ -105,21 +111,22 @@ describe("scheduleRestartSentinelWake – two-step delivery + resume", () => { it("delivers restart notice directly (model-independent) then resumes agent", async () => { await scheduleRestartSentinelWake({ deps: {} as never }); - // Step 1: deterministic delivery + // Step 1: deterministic delivery — uses human-friendly userMessage, not raw diagnostic expect(mocks.deliverOutboundPayloads).toHaveBeenCalledWith( 
expect.objectContaining({ channel: "whatsapp", to: "+15550002", accountId: "acct-2", - payloads: [{ text: "restart message" }], + payloads: [{ text: "Gateway restarted successfully." }], bestEffort: true, }), ); - // Step 2: agent resume + // Step 2: agent resume — userMessage as prompt, internalContext via extraSystemPrompt expect(mocks.agentCommand).toHaveBeenCalledWith( expect.objectContaining({ - message: "restart message", + message: "Gateway restarted successfully.", + extraSystemPrompt: expect.stringContaining("[Gateway restart context"), sessionKey: "agent:main:main", to: "+15550002", channel: "whatsapp", @@ -190,7 +197,7 @@ describe("scheduleRestartSentinelWake – fallback to enqueueSystemEvent", () => await scheduleRestartSentinelWake({ deps: {} as never }); expect(mocks.agentCommand).not.toHaveBeenCalled(); - expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("restart message", { + expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("Gateway restarted successfully.", { sessionKey: "agent:main:main", }); }); @@ -204,7 +211,7 @@ describe("scheduleRestartSentinelWake – fallback to enqueueSystemEvent", () => await scheduleRestartSentinelWake({ deps: {} as never }); expect(mocks.agentCommand).not.toHaveBeenCalled(); - expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("restart message", { + expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("Gateway restarted successfully.", { sessionKey: "agent:main:main", }); }); @@ -218,7 +225,7 @@ describe("scheduleRestartSentinelWake – fallback to enqueueSystemEvent", () => await scheduleRestartSentinelWake({ deps: {} as never }); expect(mocks.agentCommand).not.toHaveBeenCalled(); - expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("restart message", { + expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("Gateway restarted successfully.", { sessionKey: "agent:main:main", }); }); @@ -254,7 +261,7 @@ describe("scheduleRestartSentinelWake – fallback to enqueueSystemEvent", () => await 
scheduleRestartSentinelWake({ deps: {} as never }); - expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("restart message", { + expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("Gateway restarted successfully.", { sessionKey: "agent:main:main", }); // Resume step must still run after delivery failure diff --git a/src/gateway/server-restart-sentinel.ts b/src/gateway/server-restart-sentinel.ts index a8e503532c6..74f9268cdb4 100644 --- a/src/gateway/server-restart-sentinel.ts +++ b/src/gateway/server-restart-sentinel.ts @@ -9,7 +9,9 @@ import { buildOutboundSessionContext } from "../infra/outbound/session-context.j import { resolveOutboundTarget } from "../infra/outbound/targets.js"; import { consumeRestartSentinel, + formatRestartSentinelInternalContext, formatRestartSentinelMessage, + formatRestartSentinelUserMessage, summarizeRestartSentinel, } from "../infra/restart-sentinel.js"; import { enqueueSystemEvent } from "../infra/system-events.js"; @@ -24,7 +26,12 @@ export async function scheduleRestartSentinelWake(params: { deps: CliDeps }) { } const payload = sentinel.payload; const sessionKey = payload.sessionKey?.trim(); + // Raw diagnostic message; the channel-delivery paths and their enqueue fallbacks below use userMessage instead. const message = formatRestartSentinelMessage(payload); + // Human-friendly message for direct user delivery — omits status prefix and doctorHint. + const userMessage = formatRestartSentinelUserMessage(payload); + // Full technical context injected into the agent's system prompt. + const internalContext = formatRestartSentinelInternalContext(payload); const summary = summarizeRestartSentinel(payload); if (!sessionKey) { @@ -56,7 +63,7 @@ export async function scheduleRestartSentinelWake(params: { deps: CliDeps }) { const channel = channelRaw ? 
normalizeChannelId(channelRaw) : null; const to = origin?.to; if (!channel || !to) { - enqueueSystemEvent(message, { sessionKey }); + enqueueSystemEvent(userMessage, { sessionKey }); return; } @@ -68,7 +75,7 @@ export async function scheduleRestartSentinelWake(params: { deps: CliDeps }) { mode: "implicit", }); if (!resolved.ok) { - enqueueSystemEvent(message, { sessionKey }); + enqueueSystemEvent(userMessage, { sessionKey }); return; } @@ -78,7 +85,9 @@ export async function scheduleRestartSentinelWake(params: { deps: CliDeps }) { sessionThreadId ?? (origin?.threadId != null ? String(origin.threadId) : undefined); - // Step 1: deliver the restart notice deterministically — model-independent, guaranteed. + // Step 1: deliver a human-friendly restart notice deterministically — model-independent, + // guaranteed. Uses userMessage (omits raw diagnostic fields like status prefix and + // doctorHint) so the user sees a clean message even if the agent turn in Step 2 fails. // Slack uses replyToId (thread_ts) for threading; deliverOutboundPayloads does not do // this mapping automatically, so we convert here. See #17716. const isSlack = channel === "slack"; @@ -93,7 +102,7 @@ export async function scheduleRestartSentinelWake(params: { deps: CliDeps }) { accountId: origin?.accountId, replyToId, threadId: resolvedThreadId, - payloads: [{ text: message }], + payloads: [{ text: userMessage }], session: outboundSession, bestEffort: true, }); @@ -102,18 +111,22 @@ export async function scheduleRestartSentinelWake(params: { deps: CliDeps }) { // If it does throw (channel plugin/runtime error before best-effort handling is applied), // enqueue a system event so the user receives the restart notice even on delivery failure. // This preserves the prior behaviour where delivery errors in this path produced a fallback event. 
- enqueueSystemEvent(message, { sessionKey }); + enqueueSystemEvent(userMessage, { sessionKey }); } // Step 2: trigger an agent resume turn so the agent can continue autonomously // after restart. The model sees the restart context and can respond/take actions. + // internalContext is injected via extraSystemPrompt so the agent has full technical + // details (kind, status, note, doctorHint) without exposing raw diagnostics as a + // user-visible chat message. The agent's reply is what the user ultimately sees. // This is safe post-restart: scheduleRestartSentinelWake() runs in the new process // with zero in-flight replies, so the pre-restart race condition (ab4a08a82) does // not apply here. try { await agentCommand( { - message, + message: userMessage, + extraSystemPrompt: internalContext, sessionKey, to: resolved.to, channel, diff --git a/src/infra/restart-sentinel.test.ts b/src/infra/restart-sentinel.test.ts index c28504685bb..1f5df033341 100644 --- a/src/infra/restart-sentinel.test.ts +++ b/src/infra/restart-sentinel.test.ts @@ -5,8 +5,9 @@ import { afterEach, beforeEach, describe, expect, it } from "vitest"; import { captureEnv } from "../test-utils/env.js"; import { consumeRestartSentinel, - formatDoctorNonInteractiveHint, + formatRestartSentinelInternalContext, formatRestartSentinelMessage, + formatRestartSentinelUserMessage, readRestartSentinel, resolveRestartSentinelPath, summarizeRestartSentinel, @@ -160,6 +161,112 @@ describe("restart sentinel", () => { }); }); +describe("formatRestartSentinelUserMessage", () => { + it("returns note for successful restart with note", () => { + const payload = { + kind: "config-patch" as const, + status: "ok" as const, + ts: Date.now(), + message: "testing restart sentinel", + doctorHint: "Run: openclaw doctor --non-interactive", + }; + const result = formatRestartSentinelUserMessage(payload); + expect(result).toBe("testing restart sentinel"); + expect(result).not.toContain("Gateway restart"); + 
expect(result).not.toContain("config-patch"); + expect(result).not.toContain("doctor"); + }); + + it("returns generic success message when no note", () => { + const payload = { + kind: "update" as const, + status: "ok" as const, + ts: Date.now(), + }; + expect(formatRestartSentinelUserMessage(payload)).toBe("Gateway restarted successfully."); + }); + + it("returns failure message with note for error status", () => { + const payload = { + kind: "config-apply" as const, + status: "error" as const, + ts: Date.now(), + message: "disk full", + }; + const result = formatRestartSentinelUserMessage(payload); + expect(result).toBe("Gateway restart failed: disk full"); + }); + + it("returns generic failure message for error without note", () => { + const payload = { + kind: "restart" as const, + status: "error" as const, + ts: Date.now(), + }; + expect(formatRestartSentinelUserMessage(payload)).toBe("Gateway restart failed."); + }); + + it("never includes doctorHint", () => { + const payload = { + kind: "config-patch" as const, + status: "ok" as const, + ts: Date.now(), + message: "applied config", + doctorHint: "Run: openclaw doctor --non-interactive", + }; + expect(formatRestartSentinelUserMessage(payload)).not.toContain("doctor"); + expect(formatRestartSentinelUserMessage(payload)).not.toContain("openclaw"); + }); +}); + +describe("formatRestartSentinelInternalContext", () => { + it("includes kind, status, note, and doctorHint", () => { + const payload = { + kind: "config-patch" as const, + status: "ok" as const, + ts: Date.now(), + message: "testing restart sentinel", + doctorHint: "Run: openclaw doctor --non-interactive", + stats: { mode: "gateway.config-patch", reason: "discovery.mdns.mode changed" }, + }; + const result = formatRestartSentinelInternalContext(payload); + expect(result).toContain("kind: config-patch"); + expect(result).toContain("status: ok"); + expect(result).toContain("note: testing restart sentinel"); + expect(result).toContain("hint: Run: openclaw 
doctor");
+    expect(result).toContain("mode: gateway.config-patch");
+    expect(result).toContain("reason: discovery.mdns.mode changed");
+    expect(result).toContain("internal");
+  });
+
+  it("omits empty optional fields", () => {
+    const payload = {
+      kind: "restart" as const,
+      status: "ok" as const,
+      ts: Date.now(),
+    };
+    const result = formatRestartSentinelInternalContext(payload);
+    expect(result).not.toContain("note:");
+    expect(result).not.toContain("hint:");
+    expect(result).not.toContain("reason:");
+    expect(result).not.toContain("mode:");
+  });
+
+  it("omits reason when it duplicates note", () => {
+    const note = "Applying config changes";
+    const payload = {
+      kind: "config-apply" as const,
+      status: "ok" as const,
+      ts: Date.now(),
+      message: note,
+      stats: { reason: note },
+    };
+    const result = formatRestartSentinelInternalContext(payload);
+    const noteOccurrences = result.split(note).length - 1;
+    expect(noteOccurrences).toBe(1);
+  });
+});
+
 describe("restart sentinel message dedup", () => {
   it("omits duplicate Reason: line when stats.reason matches message", () => {
     const payload = {
diff --git a/src/infra/restart-sentinel.ts b/src/infra/restart-sentinel.ts
index baf8168047d..3e2d0e9ad2e 100644
--- a/src/infra/restart-sentinel.ts
+++ b/src/infra/restart-sentinel.ts
@@ -127,6 +127,47 @@ export function formatRestartSentinelMessage(payload: RestartSentinelPayload): s
   return lines.join("\n");
 }
 
+/**
+ * Human-friendly message for direct user delivery after a gateway restart.
+ * Omits raw diagnostic fields (status prefix, doctorHint). A missing or blank
+ * note yields a generic sentence ("||", not "??": trim() can return "").
+ */
+export function formatRestartSentinelUserMessage(payload: RestartSentinelPayload): string {
+  const note = payload.message?.trim();
+  if (payload.status === "error") {
+    return note ? `Gateway restart failed: ${note}` : "Gateway restart failed.";
+  }
+  return note || "Gateway restarted successfully.";
+}
+
+/**
+ * Full technical restart context injected into the agent's system prompt so
+ * it can reason about and respond to the restart without exposing raw
+ * diagnostic text directly in the user-facing chat message.
+ */
+export function formatRestartSentinelInternalContext(payload: RestartSentinelPayload): string {
+  const lines: string[] = [
+    "[Gateway restart context — internal, do not surface raw details to user]",
+    `kind: ${payload.kind}`,
+    `status: ${payload.status}`,
+  ];
+  const note = payload.message?.trim();
+  if (note) {
+    lines.push(`note: ${note}`);
+  }
+  const reason = payload.stats?.reason?.trim();
+  if (reason && reason !== note) {
+    lines.push(`reason: ${reason}`);
+  }
+  if (payload.stats?.mode?.trim()) {
+    lines.push(`mode: ${payload.stats.mode.trim()}`);
+  }
+  if (payload.doctorHint?.trim()) {
+    lines.push(`hint: ${payload.doctorHint.trim()}`);
+  }
+  return lines.join("\n");
+}
+
 export function summarizeRestartSentinel(payload: RestartSentinelPayload): string {
   const kind = payload.kind;
   const status = payload.status;