From 3e1bee931a87d7c87288aa004c98d9f9973533de Mon Sep 17 00:00:00 2001
From: Bryan Marty <bryanmarty@gmail.com>
Date: Wed, 11 Mar 2026 04:38:00 +0000
Subject: [PATCH] feat: split restart sentinel into user-facing and internal
 context messages

- Add formatRestartSentinelUserMessage for clean user-facing delivery
  (omits status prefix and doctorHint)
- Add formatRestartSentinelInternalContext for agent system prompt injection
  with full diagnostic details
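- Update server-restart-sentinel to use userMessage for direct channel delivery,
  the enqueue fallbacks, and the agent resume prompt (internalContext is passed
  via extraSystemPrompt)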
- Add tests for new formatting functions
---
 docs/reference/templates/AGENTS.md          |  31 ++++++
 docs/zh-CN/reference/AGENTS.default.md      |  23 +++++
 src/agents/tools/gateway-tool.ts            |   2 +-
 src/gateway/server-restart-sentinel.test.ts |  23 +++--
 src/gateway/server-restart-sentinel.ts      |  25 +++--
 src/infra/restart-sentinel.test.ts          | 109 +++++++++++++++++++-
 src/infra/restart-sentinel.ts               |  41 ++++++++
 7 files changed, 238 insertions(+), 16 deletions(-)

diff --git a/docs/reference/templates/AGENTS.md b/docs/reference/templates/AGENTS.md
index 9375684b0dd..216a52cdbbe 100644
--- a/docs/reference/templates/AGENTS.md
+++ b/docs/reference/templates/AGENTS.md
@@ -214,6 +214,37 @@ Think of it like a human reviewing their journal and updating their mental model
 
 The goal: Be helpful without being annoying. Check in a few times a day, do useful background work, but respect quiet time.
 
+## 🔄 Gateway Restarts — Do It Right!
+
+**Never use `openclaw gateway restart` (CLI/shell).** It bypasses the restart sentinel, so you won't auto-resume or notify the user after restart. You'll just sit there silently until someone pings you.
+
+**Always restart via the gateway tool** (action=restart) or via `config.patch`/`config.apply` — these write a sentinel file before restarting, which the new process consumes to wake you up and message the user automatically.
+
+```bash
+# ✅ Correct: restart via gateway tool (action=restart, sessionKey, note)
+# ✅ Correct: config.patch with a key that requires restart (writes sentinel automatically)
+
+# ❌ Wrong: openclaw gateway restart  — no sentinel, silent after restart
+# ❌ Wrong: systemctl --user restart openclaw-gateway.service  — same problem
+```
+
+### Which config keys trigger a real restart vs dynamic reload?
+
+**Full process restart** (sentinel written, agent wakes up):
+
+- `gateway.*`, `discovery.*`, `plugins.*`, `canvasHost.*`
+- Any unrecognized/new config key
+
+**Hot reload** (no restart, no sentinel needed):
+
+- `hooks.*`, `cron.*`, `browser.*`, `models.*`, `agents.defaults.heartbeat`
+
+**Dynamic no-op** (read on next access, no process action):
+
+- `messages.*`, `agents.*`, `tools.*`, `routing.*`, `session.*`, `skills.*`, `secrets.*`, `meta.*`, `identity.*`, `logging.*`, `ui.*`
+
+**Rule of thumb:** If you want a test restart, patch `discovery.mdns.mode` to its current value — it's recognized as a restart-triggering key even if the value is unchanged.
+
 ## Make It Yours
 
 This is a starting point. Add your own conventions, style, and rules as you figure out what works.
diff --git a/docs/zh-CN/reference/AGENTS.default.md b/docs/zh-CN/reference/AGENTS.default.md
index 84d4a01e21c..70e6688a374 100644
--- a/docs/zh-CN/reference/AGENTS.default.md
+++ b/docs/zh-CN/reference/AGENTS.default.md
@@ -120,6 +120,29 @@ git commit -m "Add Clawd workspace"
 - **bird** — X/Twitter CLI，无需浏览器即可发推、回复、阅读话题和搜索。
 - **agent-tools** — 用于自动化和辅助脚本的实用工具包。
 
+## 🔄 网关重启 — 正确做法！
+
+**永远不要使用 `openclaw gateway restart`（CLI/shell）。** 这会绕过重启哨兵机制，导致重启后你无法自动恢复，也无法通知用户。你会静静地等待，直到有人 ping 你。
+
+**始终通过 gateway 工具**（action=restart）或 `config.patch`/`config.apply` 触发重启——这些方式会在重启前写入哨兵文件，新进程启动后会消费该文件以唤醒你并自动通知用户。
+
+### 哪些配置键会触发真正的重启？
+
+**完整进程重启**（写入哨兵，代理唤醒）：
+
+- `gateway.*`、`discovery.*`、`plugins.*`、`canvasHost.*`
+- 任何无法识别的新配置键
+
+**热重载**（无需重启，无需哨兵）：
+
+- `hooks.*`、`cron.*`、`browser.*`、`models.*`、`agents.defaults.heartbeat`
+
+**动态无操作**（下次访问时读取，不触发任何进程操作）：
+
+- `messages.*`、`agents.*`、`tools.*`、`routing.*`、`session.*`、`skills.*`、`secrets.*`、`meta.*`、`identity.*`、`logging.*`、`ui.*`
+
+**经验法则：** 如果需要测试重启，将 `discovery.mdns.mode` 修改为当前值——即使值未改变，它也会触发重启流程。
+
 ## 使用说明
 
 - 脚本编写优先使用 `openclaw` CLI；mac 应用处理权限。
diff --git a/src/agents/tools/gateway-tool.ts b/src/agents/tools/gateway-tool.ts
index ca125deb6a6..e49f73c57b5 100644
--- a/src/agents/tools/gateway-tool.ts
+++ b/src/agents/tools/gateway-tool.ts
@@ -80,7 +80,7 @@ export function createGatewayTool(opts?: {
     name: "gateway",
     ownerOnly: true,
     description:
-      "Restart, inspect a specific config schema path, apply config, or update the gateway in-place (SIGUSR1). Use config.schema.lookup with a targeted dot path before config edits. Use config.patch for safe partial config updates (merges with existing). Use config.apply only when replacing entire config. Both trigger restart after writing. Always pass a human-readable completion message via the `note` parameter so the system can deliver it to the user after restart.",
+      "Restart, inspect a specific config schema path, apply config, or update the gateway in-place (SIGUSR1). Use config.schema.lookup with a targeted dot path before config edits. Use config.patch for safe partial config updates (merges with existing). Use config.apply only when replacing entire config. Both trigger restart after writing. Always pass a human-readable completion message via the `note` parameter so the system can deliver it to the user after restart. IMPORTANT: Never use the `openclaw gateway restart` CLI command to restart — it bypasses the restart sentinel so the agent will not auto-resume or notify the user after restart. Always restart via this tool (action=restart) or via config.patch/config.apply, which write the sentinel before restarting. Config keys under gateway.*, discovery.*, plugins.*, and canvasHost.* trigger a real process restart; keys under messages.*, agents.*, tools.*, hooks.*, and most others apply dynamically without a restart.",
     parameters: GatewayToolSchema,
     execute: async (_toolCallId, args) => {
       const params = args as Record<string, unknown>;
diff --git a/src/gateway/server-restart-sentinel.test.ts b/src/gateway/server-restart-sentinel.test.ts
index 008437bb037..87f3b8cee1f 100644
--- a/src/gateway/server-restart-sentinel.test.ts
+++ b/src/gateway/server-restart-sentinel.test.ts
@@ -13,6 +13,10 @@ const mocks = vi.hoisted(() => ({
     },
   })),
   formatRestartSentinelMessage: vi.fn(() => "restart message"),
+  formatRestartSentinelUserMessage: vi.fn(() => "Gateway restarted successfully."),
+  formatRestartSentinelInternalContext: vi.fn(
+    () => "[Gateway restart context — internal]\nkind: restart\nstatus: ok",
+  ),
   summarizeRestartSentinel: vi.fn(() => "restart summary"),
   resolveMainSessionKeyFromConfig: vi.fn(() => "agent:main:main"),
   parseSessionThreadInfo: vi.fn(() => ({ baseSessionKey: null, threadId: undefined })),
@@ -39,6 +43,8 @@ vi.mock("../agents/agent-scope.js", () => ({
 vi.mock("../infra/restart-sentinel.js", () => ({
   consumeRestartSentinel: mocks.consumeRestartSentinel,
   formatRestartSentinelMessage: mocks.formatRestartSentinelMessage,
+  formatRestartSentinelUserMessage: mocks.formatRestartSentinelUserMessage,
+  formatRestartSentinelInternalContext: mocks.formatRestartSentinelInternalContext,
   summarizeRestartSentinel: mocks.summarizeRestartSentinel,
 }));
 
@@ -105,21 +111,22 @@ describe("scheduleRestartSentinelWake – two-step delivery + resume", () => {
   it("delivers restart notice directly (model-independent) then resumes agent", async () => {
     await scheduleRestartSentinelWake({ deps: {} as never });
 
-    // Step 1: deterministic delivery
+    // Step 1: deterministic delivery — uses human-friendly userMessage, not raw diagnostic
     expect(mocks.deliverOutboundPayloads).toHaveBeenCalledWith(
       expect.objectContaining({
         channel: "whatsapp",
         to: "+15550002",
         accountId: "acct-2",
-        payloads: [{ text: "restart message" }],
+        payloads: [{ text: "Gateway restarted successfully." }],
         bestEffort: true,
       }),
     );
 
-    // Step 2: agent resume
+    // Step 2: agent resume — userMessage as prompt, internalContext via extraSystemPrompt
     expect(mocks.agentCommand).toHaveBeenCalledWith(
       expect.objectContaining({
-        message: "restart message",
+        message: "Gateway restarted successfully.",
+        extraSystemPrompt: expect.stringContaining("[Gateway restart context"),
         sessionKey: "agent:main:main",
         to: "+15550002",
         channel: "whatsapp",
@@ -190,7 +197,7 @@ describe("scheduleRestartSentinelWake – fallback to enqueueSystemEvent", () =>
     await scheduleRestartSentinelWake({ deps: {} as never });
 
     expect(mocks.agentCommand).not.toHaveBeenCalled();
-    expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("restart message", {
+    expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("Gateway restarted successfully.", {
       sessionKey: "agent:main:main",
     });
   });
@@ -204,7 +211,7 @@ describe("scheduleRestartSentinelWake – fallback to enqueueSystemEvent", () =>
     await scheduleRestartSentinelWake({ deps: {} as never });
 
     expect(mocks.agentCommand).not.toHaveBeenCalled();
-    expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("restart message", {
+    expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("Gateway restarted successfully.", {
       sessionKey: "agent:main:main",
     });
   });
@@ -218,7 +225,7 @@ describe("scheduleRestartSentinelWake – fallback to enqueueSystemEvent", () =>
     await scheduleRestartSentinelWake({ deps: {} as never });
 
     expect(mocks.agentCommand).not.toHaveBeenCalled();
-    expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("restart message", {
+    expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("Gateway restarted successfully.", {
       sessionKey: "agent:main:main",
     });
   });
@@ -254,7 +261,7 @@ describe("scheduleRestartSentinelWake – fallback to enqueueSystemEvent", () =>
 
     await scheduleRestartSentinelWake({ deps: {} as never });
 
-    expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("restart message", {
+    expect(mocks.enqueueSystemEvent).toHaveBeenCalledWith("Gateway restarted successfully.", {
       sessionKey: "agent:main:main",
     });
     // Resume step must still run after delivery failure
diff --git a/src/gateway/server-restart-sentinel.ts b/src/gateway/server-restart-sentinel.ts
index a8e503532c6..74f9268cdb4 100644
--- a/src/gateway/server-restart-sentinel.ts
+++ b/src/gateway/server-restart-sentinel.ts
@@ -9,7 +9,9 @@ import { buildOutboundSessionContext } from "../infra/outbound/session-context.j
 import { resolveOutboundTarget } from "../infra/outbound/targets.js";
 import {
   consumeRestartSentinel,
+  formatRestartSentinelInternalContext,
   formatRestartSentinelMessage,
+  formatRestartSentinelUserMessage,
   summarizeRestartSentinel,
 } from "../infra/restart-sentinel.js";
 import { enqueueSystemEvent } from "../infra/system-events.js";
@@ -24,7 +26,12 @@ export async function scheduleRestartSentinelWake(params: { deps: CliDeps }) {
   }
   const payload = sentinel.payload;
   const sessionKey = payload.sessionKey?.trim();
+  // Raw diagnostic message; delivery, resume and the { sessionKey } fallbacks use userMessage.
   const message = formatRestartSentinelMessage(payload);
+  // Human-friendly message for direct user delivery — omits status prefix and doctorHint.
+  const userMessage = formatRestartSentinelUserMessage(payload);
+  // Full technical context injected into the agent's system prompt.
+  const internalContext = formatRestartSentinelInternalContext(payload);
   const summary = summarizeRestartSentinel(payload);
 
   if (!sessionKey) {
@@ -56,7 +63,7 @@ export async function scheduleRestartSentinelWake(params: { deps: CliDeps }) {
   const channel = channelRaw ? normalizeChannelId(channelRaw) : null;
   const to = origin?.to;
   if (!channel || !to) {
-    enqueueSystemEvent(message, { sessionKey });
+    enqueueSystemEvent(userMessage, { sessionKey });
     return;
   }
 
@@ -68,7 +75,7 @@ export async function scheduleRestartSentinelWake(params: { deps: CliDeps }) {
     mode: "implicit",
   });
   if (!resolved.ok) {
-    enqueueSystemEvent(message, { sessionKey });
+    enqueueSystemEvent(userMessage, { sessionKey });
     return;
   }
 
@@ -78,7 +85,9 @@ export async function scheduleRestartSentinelWake(params: { deps: CliDeps }) {
     sessionThreadId ??
     (origin?.threadId != null ? String(origin.threadId) : undefined);
 
-  // Step 1: deliver the restart notice deterministically — model-independent, guaranteed.
+  // Step 1: deliver a human-friendly restart notice deterministically — model-independent,
+  // guaranteed. Uses userMessage (omits raw diagnostic fields like status prefix and
+  // doctorHint) so the user sees a clean message even if the agent turn in Step 2 fails.
   // Slack uses replyToId (thread_ts) for threading; deliverOutboundPayloads does not do
   // this mapping automatically, so we convert here. See #17716.
   const isSlack = channel === "slack";
@@ -93,7 +102,7 @@ export async function scheduleRestartSentinelWake(params: { deps: CliDeps }) {
       accountId: origin?.accountId,
       replyToId,
       threadId: resolvedThreadId,
-      payloads: [{ text: message }],
+      payloads: [{ text: userMessage }],
       session: outboundSession,
       bestEffort: true,
     });
@@ -102,18 +111,22 @@ export async function scheduleRestartSentinelWake(params: { deps: CliDeps }) {
     // If it does throw (channel plugin/runtime error before best-effort handling is applied),
     // enqueue a system event so the user receives the restart notice even on delivery failure.
     // This preserves the prior behaviour where delivery errors in this path produced a fallback event.
-    enqueueSystemEvent(message, { sessionKey });
+    enqueueSystemEvent(userMessage, { sessionKey });
   }
 
   // Step 2: trigger an agent resume turn so the agent can continue autonomously
   // after restart. The model sees the restart context and can respond/take actions.
+  // internalContext is injected via extraSystemPrompt so the agent has full technical
+  // details (kind, status, note, doctorHint) without exposing raw diagnostics as a
+  // user-visible chat message. The agent's reply is what the user ultimately sees.
   // This is safe post-restart: scheduleRestartSentinelWake() runs in the new process
   // with zero in-flight replies, so the pre-restart race condition (ab4a08a82) does
   // not apply here.
   try {
     await agentCommand(
       {
-        message,
+        message: userMessage,
+        extraSystemPrompt: internalContext,
         sessionKey,
         to: resolved.to,
         channel,
diff --git a/src/infra/restart-sentinel.test.ts b/src/infra/restart-sentinel.test.ts
index c28504685bb..1f5df033341 100644
--- a/src/infra/restart-sentinel.test.ts
+++ b/src/infra/restart-sentinel.test.ts
@@ -5,8 +5,9 @@ import { afterEach, beforeEach, describe, expect, it } from "vitest";
 import { captureEnv } from "../test-utils/env.js";
 import {
   consumeRestartSentinel,
-  formatDoctorNonInteractiveHint,
+  formatRestartSentinelInternalContext,
   formatRestartSentinelMessage,
+  formatRestartSentinelUserMessage,
   readRestartSentinel,
   resolveRestartSentinelPath,
   summarizeRestartSentinel,
@@ -160,6 +161,112 @@ describe("restart sentinel", () => {
   });
 });
 
+describe("formatRestartSentinelUserMessage", () => {
+  it("returns note for successful restart with note", () => {
+    const sentinel = {
+      ts: Date.now(),
+      kind: "config-patch" as const,
+      status: "ok" as const,
+      doctorHint: "Run: openclaw doctor --non-interactive",
+      message: "testing restart sentinel",
+    };
+    const text = formatRestartSentinelUserMessage(sentinel);
+    expect(text).toBe("testing restart sentinel");
+    expect(text).not.toContain("Gateway restart");
+    expect(text).not.toContain("config-patch");
+    expect(text).not.toContain("doctor");
+  });
+
+  it("returns generic success message when no note", () => {
+    const sentinel = {
+      ts: Date.now(),
+      kind: "update" as const,
+      status: "ok" as const,
+    };
+    expect(formatRestartSentinelUserMessage(sentinel)).toBe("Gateway restarted successfully.");
+  });
+
+  it("returns failure message with note for error status", () => {
+    const sentinel = {
+      ts: Date.now(),
+      kind: "config-apply" as const,
+      status: "error" as const,
+      message: "disk full",
+    };
+    // The error branch prefixes the note instead of returning it verbatim.
+    expect(formatRestartSentinelUserMessage(sentinel)).toBe("Gateway restart failed: disk full");
+  });
+
+  it("returns generic failure message for error without note", () => {
+    const sentinel = {
+      ts: Date.now(),
+      kind: "restart" as const,
+      status: "error" as const,
+    };
+    expect(formatRestartSentinelUserMessage(sentinel)).toBe("Gateway restart failed.");
+  });
+
+  it("never includes doctorHint", () => {
+    const sentinel = {
+      ts: Date.now(),
+      kind: "config-patch" as const,
+      status: "ok" as const,
+      message: "applied config",
+      doctorHint: "Run: openclaw doctor --non-interactive",
+    };
+    const text = formatRestartSentinelUserMessage(sentinel);
+    expect(text).not.toMatch(/doctor|openclaw/);
+  });
+});
+
+describe("formatRestartSentinelInternalContext", () => {
+  it("includes kind, status, note, and doctorHint", () => {
+    const sentinel = {
+      ts: Date.now(),
+      kind: "config-patch" as const,
+      status: "ok" as const,
+      message: "testing restart sentinel",
+      stats: { mode: "gateway.config-patch", reason: "discovery.mdns.mode changed" },
+      doctorHint: "Run: openclaw doctor --non-interactive",
+    };
+    const context = formatRestartSentinelInternalContext(sentinel);
+    expect(context).toContain("internal");
+    expect(context).toContain("kind: config-patch");
+    expect(context).toContain("status: ok");
+    expect(context).toContain("note: testing restart sentinel");
+    expect(context).toContain("reason: discovery.mdns.mode changed");
+    expect(context).toContain("mode: gateway.config-patch");
+    expect(context).toContain("hint: Run: openclaw doctor");
+  });
+
+  it("omits empty optional fields", () => {
+    const sentinel = {
+      ts: Date.now(),
+      kind: "restart" as const,
+      status: "ok" as const,
+    };
+    const context = formatRestartSentinelInternalContext(sentinel);
+    const optionalLabels = ["note:", "hint:", "reason:", "mode:"];
+    for (const label of optionalLabels) {
+      expect(context).not.toContain(label);
+    }
+  });
+
+  it("omits reason when it duplicates note", () => {
+    const note = "Applying config changes";
+    const sentinel = {
+      ts: Date.now(),
+      kind: "config-apply" as const,
+      status: "ok" as const,
+      message: note,
+      stats: { reason: note },
+    };
+    const context = formatRestartSentinelInternalContext(sentinel);
+    // Exactly one occurrence of the note splits the text into two pieces.
+    expect(context.split(note)).toHaveLength(2);
+  });
+});
+
 describe("restart sentinel message dedup", () => {
   it("omits duplicate Reason: line when stats.reason matches message", () => {
     const payload = {
diff --git a/src/infra/restart-sentinel.ts b/src/infra/restart-sentinel.ts
index baf8168047d..3e2d0e9ad2e 100644
--- a/src/infra/restart-sentinel.ts
+++ b/src/infra/restart-sentinel.ts
@@ -127,6 +127,47 @@ export function formatRestartSentinelMessage(payload: RestartSentinelPayload): s
   return lines.join("\n");
 }
 
+/**
+ * Human-friendly message for direct user delivery after a gateway restart.
+ * Omits the status prefix and doctorHint; a missing or blank note falls back
+ * to a generic line so the delivered text is never empty.
+ */
+export function formatRestartSentinelUserMessage(payload: RestartSentinelPayload): string {
+  const note = payload.message?.trim();
+  if (payload.status === "error") {
+    return note ? `Gateway restart failed: ${note}` : "Gateway restart failed.";
+  }
+  return note || "Gateway restarted successfully.";
+}
+
+/**
+ * Diagnostic block for the agent's system prompt: header, kind and status,
+ * then whichever of note, reason, mode and hint are non-blank (in that order).
+ */
+export function formatRestartSentinelInternalContext(payload: RestartSentinelPayload): string {
+  const note = payload.message?.trim();
+  const reason = payload.stats?.reason?.trim();
+  const mode = payload.stats?.mode?.trim();
+  const hint = payload.doctorHint?.trim();
+  const optional: Array<[string, string | undefined]> = [
+    ["note", note],
+    ["reason", reason !== note ? reason : undefined], // skip a reason that repeats the note
+    ["mode", mode],
+    ["hint", hint],
+  ];
+  const rows = [
+    "[Gateway restart context — internal, do not surface raw details to user]",
+    `kind: ${payload.kind}`,
+    `status: ${payload.status}`,
+  ];
+  for (const [label, value] of optional) {
+    if (value) {
+      rows.push(`${label}: ${value}`);
+    }
+  }
+  return rows.join("\n");
+}
+
 export function summarizeRestartSentinel(payload: RestartSentinelPayload): string {
   const kind = payload.kind;
   const status = payload.status;