// openclaw/src/infra/restart-stale-pids.test.ts

import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";

// This entire file tests lsof-based Unix port polling. The feature is a deliberate
// no-op on Windows (findGatewayPidsOnPortSync returns [] immediately). Running these
// tests on a Windows CI runner would require lsof which does not exist there, so we
// skip the suite entirely and rely on the Linux/macOS runners for coverage.
// Evaluated once at module load; used below to skip the whole suite on Windows.
const isWindows = process.platform === "win32";

// Shared mock handles. They are created through vi.hoisted so they can be
// referenced from inside the vi.mock factories below.
const mockSpawnSync = vi.hoisted(() => vi.fn());
const mockResolveGatewayPort = vi.hoisted(() => vi.fn(() => 18789));
const mockRestartWarn = vi.hoisted(() => vi.fn());

// Route every spawnSync call through mockSpawnSync so each test can script the
// lsof result it wants. execFileSync is stubbed with a bare mock.
vi.mock("node:child_process", () => ({
  spawnSync: (...args: unknown[]) => mockSpawnSync(...args),
  execFileSync: vi.fn(),
}));

// Gateway port lookup is delegated to mockResolveGatewayPort.
vi.mock("../config/paths.js", () => ({
  resolveGatewayPort: () => mockResolveGatewayPort(),
}));

// The lsof command resolves to the literal "lsof" unless a test overrides it.
vi.mock("./ports-lsof.js", () => ({
  resolveLsofCommandSync: vi.fn(() => "lsof"),
}));

// Every subsystem logger forwards warn() to mockRestartWarn so tests can assert
// on warnings; info() and error() are inert mocks.
vi.mock("../logging/subsystem.js", () => ({
  createSubsystemLogger: vi.fn(() => ({
    warn: (...args: unknown[]) => mockRestartWarn(...args),
    info: vi.fn(),
    error: vi.fn(),
  })),
}));

import { resolveLsofCommandSync } from "./ports-lsof.js";
// Bindings for the module under test. They are assigned in beforeEach via a
// dynamic import that runs after vi.resetModules().
let __testing: typeof import("./restart-stale-pids.js").__testing;
let cleanStaleGatewayProcessesSync: typeof import("./restart-stale-pids.js").cleanStaleGatewayProcessesSync;
let findGatewayPidsOnPortSync: typeof import("./restart-stale-pids.js").findGatewayPidsOnPortSync;

/**
 * Renders fake `lsof -Fpc` style stdout for the given entries.
 *
 * Each entry becomes a `p<pid>` line followed by a `c<cmd>` line; the result
 * always ends with a trailing newline (so an empty list yields "\n").
 */
function lsofOutput(entries: Array<{ pid: number; cmd: string }>): string {
  const lines: string[] = [];
  for (const entry of entries) {
    lines.push(`p${entry.pid}`, `c${entry.cmd}`);
  }
  return `${lines.join("\n")}\n`;
}

/** The fields of a spawnSync-style result that the tests in this file script. */
type MockLsofResult = {
  error: Error | null;
  status: number | null;
  stdout: string;
  stderr: string;
};

/**
 * Builds a mock lsof result.
 *
 * The baseline is a clean run (no error, status 0, empty stdout/stderr);
 * any field supplied in `overrides` replaces the baseline value.
 */
function createLsofResult(overrides: Partial<MockLsofResult> = {}): MockLsofResult {
  const cleanEmptyRun: MockLsofResult = { error: null, status: 0, stdout: "", stderr: "" };
  return { ...cleanEmptyRun, ...overrides };
}

/**
 * Builds a mock lsof result whose stdout lists a single "openclaw-gateway"
 * process with the given pid. Fields in `overrides` take precedence, including
 * over the generated stdout.
 */
function createOpenClawBusyResult(pid: number, overrides: Partial<MockLsofResult> = {}) {
  const stdout = lsofOutput([{ pid, cmd: "openclaw-gateway" }]);
  return createLsofResult({ stdout, ...overrides });
}

/**
 * Builds a mock lsof result representing a failed spawn: `error` is an Error
 * carrying the given errno-style `code`, and `status` is null.
 */
function createErrnoResult(code: string, message: string) {
  const error: NodeJS.ErrnoException = Object.assign(new Error(message), { code });
  return createLsofResult({ error, status: null });
}

/**
 * Scripts mockSpawnSync for a "stale gateway found, then poll" scenario.
 *
 * The first spawn reports `stalePid` as an openclaw-gateway listener. Every
 * later spawn is delegated to `resolvePoll`, which receives the 1-based spawn
 * number (so the first poll is call 2).
 *
 * @returns a getter for the total number of spawns observed so far.
 */
function installInitialBusyPoll(
  stalePid: number,
  resolvePoll: (call: number) => MockLsofResult,
): () => number {
  let spawnCount = 0;
  mockSpawnSync.mockImplementation(() => {
    spawnCount += 1;
    return spawnCount === 1 ? createOpenClawBusyResult(stalePid) : resolvePoll(spawnCount);
  });
  return () => spawnCount;
}

describe.skipIf(isWindows)("restart-stale-pids", () => {
  // Per-test setup: load a fresh copy of the module under test and return every
  // shared mock to a known baseline so nothing queued by one test leaks into the next.
  beforeEach(async () => {
    vi.resetModules();
    ({ __testing, cleanStaleGatewayProcessesSync, findGatewayPidsOnPortSync } =
      await import("./restart-stale-pids.js"));
    mockSpawnSync.mockReset();
    mockResolveGatewayPort.mockReset();
    mockRestartWarn.mockReset();
    mockResolveGatewayPort.mockReturnValue(18789);
    // One test queues one-shot implementations on this mock, including one that
    // throws. Reset it explicitly so an unconsumed entry cannot affect later tests,
    // then reinstall the default used by the vi.mock factory.
    vi.mocked(resolveLsofCommandSync).mockReset();
    vi.mocked(resolveLsofCommandSync).mockImplementation(() => "lsof");
    // Replace the sleep with a no-op so polling loops do not slow the run.
    __testing.setSleepSyncOverride(() => {});
  });

  // Per-test teardown: drop the testing overrides and undo any vi.spyOn patches.
  afterEach(() => {
    __testing.setSleepSyncOverride(null);
    __testing.setDateNowOverride(null);
    vi.restoreAllMocks();
  });

  // -------------------------------------------------------------------------
  // findGatewayPidsOnPortSync
  // -------------------------------------------------------------------------
  describe("findGatewayPidsOnPortSync", () => {
    it("returns [] when lsof exits with non-zero status", () => {
      // Status 1 with empty output: there is nothing to report.
      mockSpawnSync.mockReturnValue(createLsofResult({ status: 1 }));
      const pids = findGatewayPidsOnPortSync(18789);
      expect(pids).toEqual([]);
    });

    it("logs warning when initial lsof scan exits with status > 1", () => {
      mockSpawnSync.mockReturnValue(createLsofResult({ status: 2, stderr: "lsof error" }));
      const pids = findGatewayPidsOnPortSync(18789);
      expect(pids).toEqual([]);
      // The failure must be surfaced through the subsystem logger.
      const expectedMessage = expect.stringContaining("lsof exited with status 2");
      expect(mockRestartWarn).toHaveBeenCalledWith(expectedMessage);
    });

    it("returns [] when lsof returns an error object (e.g. ENOENT)", () => {
      // Spawn failure: an error object is set and there is no exit status.
      const spawnFailure = createLsofResult({ error: new Error("ENOENT"), status: null });
      mockSpawnSync.mockReturnValue(spawnFailure);
      const pids = findGatewayPidsOnPortSync(18789);
      expect(pids).toEqual([]);
      const expectedMessage = expect.stringContaining("lsof failed during initial stale-pid scan");
      expect(mockRestartWarn).toHaveBeenCalledWith(expectedMessage);
    });

    it("parses openclaw-gateway pids and excludes the current process", () => {
      const ownPid = process.pid;
      const stalePid = ownPid + 1;
      // Both the stale gateway and this very process appear as listeners.
      const stdout = lsofOutput([
        { pid: stalePid, cmd: "openclaw-gateway" },
        { pid: ownPid, cmd: "openclaw-gateway" },
      ]);
      mockSpawnSync.mockReturnValue(createLsofResult({ stdout }));
      const found = findGatewayPidsOnPortSync(18789);
      expect(found).toContain(stalePid);
      expect(found).not.toContain(ownPid);
    });

    it("excludes pids whose command does not include 'openclaw'", () => {
      // The only listener is an unrelated process.
      const unrelatedPid = process.pid + 2;
      const stdout = lsofOutput([{ pid: unrelatedPid, cmd: "nginx" }]);
      mockSpawnSync.mockReturnValue(createLsofResult({ stdout }));
      const found = findGatewayPidsOnPortSync(18789);
      expect(found).toEqual([]);
    });

    it("forwards the spawnTimeoutMs argument to spawnSync", () => {
      const spawnTimeoutMs = 400;
      mockSpawnSync.mockReturnValue(createLsofResult());
      findGatewayPidsOnPortSync(18789, spawnTimeoutMs);
      // Only the timeout option matters here; the argument list is not pinned.
      const expectedOptions = expect.objectContaining({ timeout: spawnTimeoutMs });
      expect(mockSpawnSync).toHaveBeenCalledWith("lsof", expect.any(Array), expectedOptions);
    });

    it("deduplicates pids from dual-stack listeners (IPv4+IPv6 emit same pid twice)", () => {
      // A dual-stack listener shows up twice in -Fpc output: once for its IPv4
      // socket and once for IPv6. If the duplicate survived, the same pid would
      // be signalled twice and reported twice in the killed list.
      const stalePid = process.pid + 600;
      const listener = { pid: stalePid, cmd: "openclaw-gateway" };
      const stdout = lsofOutput([listener, listener]);
      mockSpawnSync.mockReturnValue(createLsofResult({ stdout }));
      const found = findGatewayPidsOnPortSync(18789);
      // Exactly one occurrence — not [pid, pid].
      expect(found).toEqual([stalePid]);
    });

    it("returns [] and skips lsof on win32", () => {
      // The enclosing describe is skipped on real Windows runners (isWindows guard
      // at the top of the file), so this test only runs on Linux/macOS. It
      // temporarily redefines process.platform as "win32" and restores the
      // original property descriptor in the finally block, so the override cannot
      // leak into later tests even when an assertion fails.
      const origDescriptor = Object.getOwnPropertyDescriptor(process, "platform");
      Object.defineProperty(process, "platform", { value: "win32", configurable: true });
      try {
        expect(findGatewayPidsOnPortSync(18789)).toEqual([]);
        expect(mockSpawnSync).not.toHaveBeenCalled();
      } finally {
        if (origDescriptor) {
          Object.defineProperty(process, "platform", origDescriptor);
        }
      }
    });
  });

  // -------------------------------------------------------------------------
  // parsePidsFromLsofOutput — pure unit tests (no I/O, driven via spawnSync mock)
  // -------------------------------------------------------------------------
  describe("parsePidsFromLsofOutput (via findGatewayPidsOnPortSync stdout path)", () => {
    it("returns [] for empty lsof stdout (status 0, nothing listening)", () => {
      // The factory default is exactly this case: status 0 with empty stdout.
      mockSpawnSync.mockReturnValue(createLsofResult());
      const pids = findGatewayPidsOnPortSync(18789);
      expect(pids).toEqual([]);
    });

    it("parses multiple openclaw pids from a single lsof output block", () => {
      const firstPid = process.pid + 10;
      const secondPid = process.pid + 11;
      const stdout = lsofOutput([
        { pid: firstPid, cmd: "openclaw-gateway" },
        { pid: secondPid, cmd: "openclaw-gateway" },
      ]);
      mockSpawnSync.mockReturnValue(createLsofResult({ stdout }));
      const found = findGatewayPidsOnPortSync(18789);
      // Both listeners must be reported.
      expect(found).toContain(firstPid);
      expect(found).toContain(secondPid);
    });

    it("returns [] when status 0 but only non-openclaw pids present", () => {
      // The port can be held by something unrelated. Only openclaw processes
      // are tracked, so a foreign listener must be ignored.
      const foreignPid = process.pid + 50;
      const stdout = lsofOutput([{ pid: foreignPid, cmd: "caddy" }]);
      mockSpawnSync.mockReturnValue(createLsofResult({ stdout }));
      const found = findGatewayPidsOnPortSync(18789);
      expect(found).toEqual([]);
    });
  });

  // -------------------------------------------------------------------------
  // pollPortOnce (via cleanStaleGatewayProcessesSync) — Codex P1 regression
  // -------------------------------------------------------------------------
  describe("pollPortOnce — no second lsof spawn (Codex P1 regression)", () => {
    it("treats lsof exit status 1 as port-free (no listeners)", () => {
      // lsof exits with status 1 when no matching processes are found — this is
      // the canonical "port is free" signal, not an error.
      const stalePid = process.pid + 500;
      const getCallCount = installInitialBusyPoll(stalePid, () =>
        createLsofResult({ status: 1 }),
      );
      vi.spyOn(process, "kill").mockReturnValue(true);
      expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
      // not.toThrow alone would also pass if status 1 were treated as inconclusive
      // and the loop simply ran until its budget expired. A "free" verdict must end
      // polling right away: initial find (1) + one free poll (1) = 2 spawns.
      expect(getCallCount()).toBe(2);
    });

    it("treats lsof exit status >1 as inconclusive, not port-free — Codex P2 regression", () => {
      // Codex P2: an lsof exit other than 0 or 1 (permission denied, bad flag,
      // runtime error, ...) says nothing about the port. It must never be read as
      // "free"; polling has to carry on until the budget runs out.
      const stalePid = process.pid + 501;
      const events: string[] = ["initial-find"];
      installInitialBusyPoll(stalePid, (call) => {
        const isFirstPoll = call === 2;
        events.push(isFirstPoll ? "error-poll" : "free-poll");
        return isFirstPoll
          ? // Permission/runtime error — status 2 must NOT count as free
            createLsofResult({ status: 2, stderr: "lsof: permission denied" })
          : // Later polls: the port is finally free
            createLsofResult({ status: 1 });
      });
      vi.spyOn(process, "kill").mockReturnValue(true);
      cleanStaleGatewayProcessesSync();

      // Reaching "free-poll" proves the loop kept going after the status-2 result.
      expect(events).toContain("free-poll");
    });

    it("does not make a second lsof call when the first returns status 0", () => {
      // Background: pollPortOnce used to follow a status===0 lsof result with a
      // second probe through findGatewayPidsOnPortSync. That probe flattens any
      // error/timeout into [], which then reads as free:true — an inconclusive
      // result silently reported as "port is free".
      //
      // Now pollPortOnce parses res.stdout from its single spawnSync call, so
      // every poll cycle costs exactly ONE lsof invocation.
      const stalePid = process.pid + 400;
      const getCallCount = installInitialBusyPoll(stalePid, (call) =>
        call === 2
          ? // First waitForPortFreeSync poll: status 0 and still busy
            createOpenClawBusyResult(stalePid)
          : // Any later poll: status 0, nothing listening
            createLsofResult(),
      );

      vi.spyOn(process, "kill").mockReturnValue(true);
      cleanStaleGatewayProcessesSync();

      // A doubled probe would push the spawn count to 4 or more. With one spawn
      // per poll the total is initial(1) + busy-poll(1) + free-poll(1) = 3.
      expect(getCallCount()).toBe(3);
    });

    it("lsof status 1 with non-empty openclaw stdout is treated as busy, not free (Linux container edge case)", () => {
      // In Linux containers with a restricted /proc (AppArmor, seccomp, user
      // namespaces) lsof may exit 1 yet still print the processes it could read.
      // Status 1 together with an openclaw pid in stdout therefore means "busy".
      const stalePid = process.pid + 601;
      const getCallCount = installInitialBusyPoll(stalePid, (call) => {
        if (call !== 2) {
          // Later polls: the port is genuinely free
          return createLsofResult({ status: 1 });
        }
        // First poll: partial results from a container-restricted lsof
        return createOpenClawBusyResult(stalePid, {
          status: 1,
          stderr: "lsof: WARNING: can't stat() fuse",
        });
      });
      vi.spyOn(process, "kill").mockReturnValue(true);
      cleanStaleGatewayProcessesSync();
      // The first poll reported busy, so there must be at least one more poll.
      expect(getCallCount()).toBeGreaterThanOrEqual(3);
    });

    it("pollPortOnce outer catch returns { free: null, permanent: false } when resolveLsofCommandSync throws", () => {
      // If resolveLsofCommandSync throws (e.g. lsof resolution fails at runtime),
      // pollPortOnce must catch it and return the transient-inconclusive result
      // rather than propagating the exception.
      const stalePid = process.pid + 402;
      const mockedResolveLsof = vi.mocked(resolveLsofCommandSync);

      // Resolver script: call 1 (initial scan) succeeds, call 2 (first poll)
      // throws, every later call succeeds again.
      mockedResolveLsof
        .mockImplementationOnce(() => "lsof")
        .mockImplementationOnce(() => {
          throw new Error("lsof binary resolution failed");
        })
        .mockImplementation(() => "lsof");

      // Spawn script: the initial scan finds the stale pid; any later spawn
      // reports the port as free (status 1, no output).
      mockSpawnSync
        .mockImplementationOnce(() => createOpenClawBusyResult(stalePid))
        .mockImplementation(() => createLsofResult({ status: 1 }));

      vi.spyOn(process, "kill").mockReturnValue(true);
      // Must not throw — the catch path returns transient inconclusive, loop continues
      expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();

      // not.toThrow alone cannot tell "caught inside the poll" from "never reached
      // the throwing resolver". Require that the throw really happened...
      expect(mockedResolveLsof.mock.results.some((r) => r.type === "throw")).toBe(true);
      // ...and that polling went on to spawn lsof again after the initial scan.
      expect(mockSpawnSync.mock.calls.length).toBeGreaterThanOrEqual(2);
    });
  });

  // -------------------------------------------------------------------------
  // cleanStaleGatewayProcessesSync
  // -------------------------------------------------------------------------
  describe("cleanStaleGatewayProcessesSync", () => {
    it("returns [] and does not call process.kill when port has no listeners", () => {
      // Status 0 with empty stdout: nothing is bound to the port.
      mockSpawnSync.mockReturnValue(createLsofResult());
      const killSpy = vi.spyOn(process, "kill").mockReturnValue(true);
      const cleaned = cleanStaleGatewayProcessesSync();
      expect(cleaned).toEqual([]);
      expect(killSpy).not.toHaveBeenCalled();
    });

    it("sends SIGTERM to stale pids and returns them", () => {
      const stalePid = process.pid + 100;
      // Stale pid on the initial scan, then an empty status-0 result on every poll.
      const reportNothingListening = () => createLsofResult();
      installInitialBusyPoll(stalePid, reportNothingListening);

      const killSpy = vi.spyOn(process, "kill").mockReturnValue(true);
      const cleaned = cleanStaleGatewayProcessesSync();

      expect(cleaned).toContain(stalePid);
      expect(killSpy).toHaveBeenCalledWith(stalePid, "SIGTERM");
    });

    it("escalates to SIGKILL when process survives the SIGTERM window", () => {
      const stalePid = process.pid + 101;
      let spawnCount = 0;
      mockSpawnSync.mockImplementation(() => {
        spawnCount += 1;
        // The gateway keeps showing up for the first five scans, then disappears.
        return spawnCount <= 5 ? createOpenClawBusyResult(stalePid) : createLsofResult();
      });

      const killSpy = vi.spyOn(process, "kill").mockReturnValue(true);
      cleanStaleGatewayProcessesSync();

      // Both signals must have been sent to the stale pid.
      for (const signal of ["SIGTERM", "SIGKILL"]) {
        expect(killSpy).toHaveBeenCalledWith(stalePid, signal);
      }
    });

    it("polls until port is confirmed free before returning — regression for #33103", () => {
      // Core regression: cleanStaleGatewayProcessesSync may not return while the
      // port is still bound. It used to return after a fixed 500ms sleep no matter
      // what, so systemd's replacement process hit EADDRINUSE and fell into an
      // unbounded restart loop.
      const stalePid = process.pid + 200;
      const events: string[] = [];
      let call = 0;

      mockSpawnSync.mockImplementation(() => {
        call += 1;
        if (call > 4) {
          // Fifth spawn onwards: nothing is listening any more.
          events.push("port-free");
          return createLsofResult();
        }
        // Spawn 1 is the initial scan; spawns 2-4 are polls that still see the pid.
        events.push(call === 1 ? "initial-find" : `busy-poll-${call}`);
        return createOpenClawBusyResult(stalePid);
      });

      vi.spyOn(process, "kill").mockReturnValue(true);
      cleanStaleGatewayProcessesSync();

      const busyPolls = events.filter((e) => e.startsWith("busy-poll"));
      expect(events).toContain("port-free");
      expect(busyPolls.length).toBeGreaterThan(0);
    });

    it("bails immediately when lsof is permanently unavailable (ENOENT) — Greptile edge case", () => {
      // Edge case raised in PR review: an lsof error must not be read as "port
      // free". ENOENT means lsof is not installed, which will not change between
      // polls, so the loop should give up at once instead of burning the whole
      // 2-second budget.
      const stalePid = process.pid + 300;
      const events: string[] = ["initial-find"];
      installInitialBusyPoll(stalePid, (call) => {
        // Every poll fails the same way: the lsof binary is missing.
        events.push(`enoent-poll-${call}`);
        return createErrnoResult("ENOENT", "lsof not found");
      });

      vi.spyOn(process, "kill").mockReturnValue(true);
      expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();

      // One ENOENT poll is enough — retrying a missing binary is pointless.
      const enoentPollCount = events.filter((e) => e.startsWith("enoent-poll")).length;
      expect(enoentPollCount).toBe(1);
    });

    it("bails immediately when lsof is permanently unavailable (EPERM) — SELinux/AppArmor", () => {
      // EPERM: lsof is present but a MAC policy (SELinux/AppArmor) refuses to let
      // it run. As with ENOENT/EACCES the condition is permanent, so no retries.
      const stalePid = process.pid + 305;
      const failWithEperm = () => createErrnoResult("EPERM", "lsof eperm");
      const getCallCount = installInitialBusyPoll(stalePid, failWithEperm);
      vi.spyOn(process, "kill").mockReturnValue(true);
      expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
      // 1 initial find + exactly 1 EPERM poll
      expect(getCallCount()).toBe(2);
    });

    it("bails immediately when lsof is permanently unavailable (EACCES) — same as ENOENT", () => {
      // EACCES, like EPERM, is permanent: lsof exists but this process is not
      // allowed to run it, so retrying cannot help.
      const stalePid = process.pid + 302;
      const failWithEacces = () => createErrnoResult("EACCES", "lsof permission denied");
      const getCallCount = installInitialBusyPoll(stalePid, failWithEacces);
      vi.spyOn(process, "kill").mockReturnValue(true);
      expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
      // 1 initial find + exactly 1 EACCES poll
      expect(getCallCount()).toBe(2);
    });

    it("proceeds with warning when polling budget is exhausted — fake clock, no real 2s wait", () => {
      // Sub-agent audit HIGH finding: the original test relied on real wall-clock
      // time (Date.now() + 2000ms deadline), burning 2 full seconds of CI time
      // every run. Fix: expose dateNowOverride in __testing so the deadline can
      // be synthesised instantly, keeping the test under 10ms.
      const stalePid = process.pid + 303;
      let fakeNow = 0;
      __testing.setDateNowOverride(() => fakeNow);

      installInitialBusyPoll(stalePid, () => {
        // Advance clock by PORT_FREE_TIMEOUT_MS + 1ms on first poll to trip the deadline.
        fakeNow += 2001;
        return createOpenClawBusyResult(stalePid);
      });

      vi.spyOn(process, "kill").mockReturnValue(true);
      // Must return without throwing (proceeds with warning after budget expires)
      expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
      // The title promises a warning, so require one. Every subsystem logger in
      // this file routes warn() to mockRestartWarn, which beforeEach resets.
      // NOTE(review): assumes the budget-exhausted message goes through that
      // logger's warn() — confirm against the implementation.
      expect(mockRestartWarn).toHaveBeenCalled();
    });

    it("still polls for port-free when all stale pids were already dead at SIGTERM time", () => {
      // Sub-agent audit MEDIUM finding: every pid from the initial scan may already
      // be gone by the time SIGTERM is sent (race). terminateStaleProcessesSync then
      // returns killed=[], yet cleanStaleGatewayProcessesSync MUST still run
      // waitForPortFreeSync: a process that exited by itself can leave its socket
      // in TIME_WAIT / FIN_WAIT, and skipping the poll would quietly bring back
      // the EADDRINUSE race being fixed.
      const stalePid = process.pid + 304;
      const events: string[] = ["initial-find"];

      installInitialBusyPoll(stalePid, () => {
        // First poll already sees a free port — the pid died before SIGTERM
        events.push("poll-free");
        return createLsofResult({ status: 1 });
      });

      // Every signal attempt fails with ESRCH: the pid no longer exists.
      vi.spyOn(process, "kill").mockImplementation(() => {
        const gone = new Error("ESRCH") as NodeJS.ErrnoException;
        gone.code = "ESRCH";
        throw gone;
      });

      cleanStaleGatewayProcessesSync();

      // The port poll must have run even though nothing was killed.
      expect(events).toContain("poll-free");
    });

    it("continues polling on transient lsof errors (not ENOENT) — Codex P1 fix", () => {
      // A transient lsof failure (spawnSync timeout, status 2, ...) must NOT end
      // the polling loop. It has to keep retrying until the budget expires or a
      // definitive answer arrives; giving up on the first hiccup would bring back
      // the EADDRINUSE race this PR is meant to prevent.
      const stalePid = process.pid + 301;
      const events: string[] = ["initial-find"];
      installInitialBusyPoll(stalePid, (call) => {
        const isFirstPoll = call === 2;
        events.push(isFirstPoll ? "transient-error" : "port-free");
        return isFirstPoll
          ? // Transient: spawnSync timeout (error carries no ENOENT code)
            createLsofResult({ error: new Error("timeout"), status: null })
          : // Next poll: the port is free
            createLsofResult({ status: 1 });
      });

      vi.spyOn(process, "kill").mockReturnValue(true);
      cleanStaleGatewayProcessesSync();

      // The loop saw the transient error and still went on to reach port-free.
      expect(events).toContain("transient-error");
      expect(events).toContain("port-free");
    });

    it("returns gracefully when resolveGatewayPort throws", () => {
      // Only the first port lookup fails.
      const failPortLookup = () => {
        throw new Error("config read error");
      };
      mockResolveGatewayPort.mockImplementationOnce(failPortLookup);
      const cleaned = cleanStaleGatewayProcessesSync();
      expect(cleaned).toEqual([]);
    });

    it("returns gracefully when lsof is unavailable from the start", () => {
      // Every spawn fails with an error object and no exit status.
      const spawnFailure = createLsofResult({ error: new Error("ENOENT"), status: null });
      mockSpawnSync.mockReturnValue(spawnFailure);
      const killSpy = vi.spyOn(process, "kill").mockReturnValue(true);
      const cleaned = cleanStaleGatewayProcessesSync();
      expect(cleaned).toEqual([]);
      expect(killSpy).not.toHaveBeenCalled();
    });
  });

  // -------------------------------------------------------------------------
  // parsePidsFromLsofOutput — branch-coverage for mid-loop && short-circuits
  // -------------------------------------------------------------------------
  describe("parsePidsFromLsofOutput — branch coverage (lines 67-69)", () => {
    it("skips a mid-loop entry when the command does not include 'openclaw'", () => {
      // Covers the false side of currentCmd.toLowerCase().includes("openclaw") in
      // the mid-loop flush: a non-openclaw entry ahead of an openclaw entry must be
      // dropped while the openclaw entry that follows is still collected.
      const unrelatedPid = process.pid + 699;
      const stalePid = process.pid + 700;
      // Order matters: the non-openclaw entry comes first.
      const stdout = lsofOutput([
        { pid: unrelatedPid, cmd: "nginx" },
        { pid: stalePid, cmd: "openclaw-gateway" },
      ]);
      mockSpawnSync.mockReturnValue(createLsofResult({ stdout }));
      const found = findGatewayPidsOnPortSync(18789);
      expect(found).toContain(stalePid);
      expect(found).not.toContain(unrelatedPid);
    });

    it("skips a mid-loop entry when currentCmd is missing (two consecutive p-lines)", () => {
      // Exercises currentCmd falsy branch mid-loop: two 'p' lines in a row
      // (no 'c' line between them) — the first PID must be skipped, the second handled.
      const orphanPid = process.pid + 702;
      const stalePid = process.pid + 701;
      // Two consecutive p-lines: first has no c-line before the next p-line
      const stdout = `p${orphanPid}\np${stalePid}\ncopenclaw-gateway\n`;
      mockSpawnSync.mockReturnValue({ error: null, status: 0, stdout, stderr: "" });
      const result = findGatewayPidsOnPortSync(18789);
      expect(result).toContain(stalePid);
      // The pid with no command line must actually be dropped; without this
      // assertion the "must be skipped" half of the title was never verified.
      expect(result).not.toContain(orphanPid);
    });

    it("ignores a p-line with an invalid (non-positive) PID — ternary false branch", () => {
      // Covers the false side of
      // `Number.isFinite(parsed) && parsed > 0 ? parsed : undefined`: a malformed
      // 'p' line (such as 'p0' or 'pNaN') must neither disturb currentPid nor
      // appear in the returned pid list.
      const stalePid = process.pid + 703;
      // pid 0 fails the "> 0" check; the valid entry after it must still be found.
      const stdout = lsofOutput([
        { pid: 0, cmd: "openclaw-gateway" },
        { pid: stalePid, cmd: "openclaw-gateway" },
      ]);
      mockSpawnSync.mockReturnValue(createLsofResult({ stdout }));
      const found = findGatewayPidsOnPortSync(18789);
      expect(found).toContain(stalePid);
      expect(found).not.toContain(0);
    });

    it("silently skips lines that start with neither 'p' nor 'c' — else-if false branch", () => {
      // lsof -Fpc should emit only 'p' and 'c' lines, but unexpected output (for
      // instance an 'f' file-descriptor line from another lsof format) must not
      // throw or damage the pid list. Unknown lines are simply passed over.
      const stalePid = process.pid + 704;
      // An 'f' line sits between the pid line and its command line.
      const stdout = [`p${stalePid}`, "f8", "copenclaw-gateway", ""].join("\n");
      mockSpawnSync.mockReturnValue(createLsofResult({ stdout }));
      const found = findGatewayPidsOnPortSync(18789);
      // The 'c' line after the 'f' line still sets the command, so the pid is kept.
      expect(found).toContain(stalePid);
    });
  });

  // -------------------------------------------------------------------------
  // pollPortOnce branch — status 1 + non-empty stdout with zero openclaw pids
  // -------------------------------------------------------------------------
  describe("pollPortOnce — status 1 + non-empty non-openclaw stdout (line 145)", () => {
    it("treats status 1 + non-openclaw stdout as port-free (not an openclaw process)", () => {
      // Status 1 with output that contains no openclaw pid: something unrelated may
      // hold the port. Since only openclaw pids are ever killed, that counts as
      // free for our purposes.
      const stalePid = process.pid + 800;
      const foreignStdout = lsofOutput([{ pid: process.pid + 801, cmd: "caddy" }]);
      const getCallCount = installInitialBusyPoll(stalePid, () =>
        // Every poll: status 1 plus a non-openclaw listener
        createLsofResult({ status: 1, stdout: foreignStdout }),
      );
      vi.spyOn(process, "kill").mockReturnValue(true);
      // No openclaw pid in the status-1 output, so the run finishes cleanly...
      expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
      // ...after exactly 2 spawns (initial find + 1 free poll).
      expect(getCallCount()).toBe(2);
    });
  });

  // -------------------------------------------------------------------------
  // sleepSync — direct unit tests via __testing.callSleepSyncRaw
  // -------------------------------------------------------------------------
  describe("sleepSync — Atomics.wait paths", () => {
    it("returns immediately when called with 0ms (timeoutMs <= 0 early return)", () => {
      // Remove the no-op override installed in beforeEach so the real path runs.
      // sleepSync(0) is expected to short-circuit before reaching Atomics.wait;
      // this only checks that the call completes synchronously without throwing.
      __testing.setSleepSyncOverride(null);
      const sleepZero = () => __testing.callSleepSyncRaw(0);
      expect(sleepZero).not.toThrow();
    });

    it("returns immediately when called with a negative value (Math.max(0,...) clamp)", () => {
      // Run the real implementation, not the no-op override from beforeEach.
      __testing.setSleepSyncOverride(null);
      const sleepNegative = () => __testing.callSleepSyncRaw(-1);
      expect(sleepNegative).not.toThrow();
    });

    it("executes the Atomics.wait path successfully when called with a positive timeout", () => {
      // Run the real implementation with a positive timeout and check that it
      // completes without error. 1ms keeps the test fast.
      __testing.setSleepSyncOverride(null);
      const sleepOneMs = () => __testing.callSleepSyncRaw(1);
      expect(sleepOneMs).not.toThrow();
    });

    it("falls back to busy-wait when Atomics.wait throws (Worker / sandboxed env)", () => {
      // Atomics.wait throws in Worker threads and some sandboxed runtimes.
      // The catch branch must handle this without propagating the exception.
      //
      // vi.spyOn is used instead of assigning to Atomics.wait by hand: the
      // afterEach hook's vi.restoreAllMocks() puts the original back even if a
      // statement in this test throws before a manual restore could run.
      const waitSpy = vi.spyOn(Atomics, "wait").mockImplementation(() => {
        throw new Error("not on main thread");
      });
      __testing.setSleepSyncOverride(null); // bypass override so real path runs
      // 1ms is enough to exercise the busy-wait loop without slowing CI.
      expect(() => __testing.callSleepSyncRaw(1)).not.toThrow();
      // Prove the throwing Atomics.wait was really invoked, so the fallback path
      // (not some earlier short-circuit) is what kept the call from throwing.
      expect(waitSpy).toHaveBeenCalled();
    });
  });
});