fix(file-lock): bound post-reclaim free slot to prevent endless spin

The previous fix decremented attempt unconditionally when on the last
retry slot, which introduced an endless loop: if fs.rm silently fails
(EACCES/EPERM swallowed by .catch), isStaleLock continues to return
true on every iteration and the attempt -= 1 guard keeps looping the
last slot forever, hanging all callers (withFileLock, usage logging).

Fix: introduce reclaimSlotAvailable (one-shot boolean, reset to false
on first use).  The free post-reclaim attempt is given exactly once;
subsequent stale detections on the last slot are not granted extra
iterations and the loop naturally exhausts its retry budget and throws
file lock timeout as expected.

Adds regression test: mocks fs.rm as a no-op so the stale lock
persists unconditionally.  Without the one-shot bound the loop would
spin forever (vitest default timeout would terminate it); with the fix
it throws file lock timeout promptly.
This commit is contained in:
jiarung 2026-03-17 07:15:47 +00:00
parent 14303dac74
commit 8c2a7b4181
5 changed files with 58 additions and 4 deletions

View File

@ -1,7 +1,7 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { getProcessStartTime } from "../shared/pid-alive.js";
import { withFileLock } from "./file-lock.js";
@ -211,6 +211,40 @@ describe("withFileLock", () => {
expect(ran).toBe(true);
});
it("times out (does not spin forever) when a stale lock cannot be removed", async () => {
// Safety bound: if fs.rm silently fails (EACCES/EPERM — swallowed by
// .catch), subsequent iterations keep seeing isStale=true. The one-shot
// reclaimSlotAvailable guard must prevent the attempt -= 1 trick from
// repeating indefinitely; after the free slot is consumed, the loop must
// exhaust its retry budget and throw timeout rather than hanging forever.
//
// We simulate the condition by mocking fs.rm as a no-op so the .lock
// file is never removed — every subsequent open(O_EXCL) still sees EEXIST.
// Without the reclaimSlotAvailable bound, each stale detection on the last
// slot would decrement attempt and loop back, spinning forever.
const lockPath = `${targetFile}.lock`;
await fs.mkdir(path.dirname(targetFile), { recursive: true });
await fs.writeFile(lockPath, JSON.stringify({ pid: 0, createdAt: new Date(0).toISOString() }));
await ageFile(lockPath, LOCK_OPTIONS.stale + 1_000);
// Spy on fs.rm: resolve successfully but do nothing, so the lock file
// persists and isStaleLock() keeps returning true on every iteration.
const rmSpy = vi.spyOn(fs, "rm").mockResolvedValue(undefined);
try {
await expect(
withFileLock(
targetFile,
{ ...LOCK_OPTIONS, retries: { ...LOCK_OPTIONS.retries, retries: 2 } },
async () => {},
),
).rejects.toThrow("file lock timeout");
} finally {
rmSpy.mockRestore();
await fs.rm(lockPath, { force: true }).catch(() => {});
}
});
it("two concurrent waiters on a stale lock never overlap inside fn()", async () => {
// Plant a stale lock (dead PID, old timestamp) so both waiters will
// simultaneously enter the stale-reclaim branch. The inode guard must

View File

@ -155,6 +155,11 @@ export async function acquireFileLock(
}
const attempts = Math.max(1, options.retries.retries + 1);
// One-shot budget for the post-reclaim free slot (see stale-reclaim comment
// below). Consumed the first time we step attempt back; subsequent stale
// detections on the last slot are allowed to exhaust the loop and time out,
// preventing an endless spin when fs.rm silently fails (e.g. EACCES/EPERM).
let reclaimSlotAvailable = true;
for (let attempt = 0; attempt < attempts; attempt += 1) {
try {
const handle = await fs.open(lockPath, "wx");
@ -228,7 +233,15 @@ export async function acquireFileLock(
// upcoming += 1 nets to zero, guaranteeing at least one post-reclaim
// open(O_EXCL) attempt. No extra sleep budget is consumed — we only
// charge a retry for backoff sleeps, not reclaim-only work.
if (attempt >= attempts - 1) {
//
// Safety bound: reclaimSlotAvailable is consumed after the first use.
// If fs.rm silently fails (EACCES/EPERM swallowed by .catch above),
// subsequent iterations will still detect isStale=true; without the
// bound, decrementing attempt on every last-slot iteration would spin
// the loop forever. After the one-shot budget is gone, the loop is
// allowed to exhaust normally and throw timeout.
if (reclaimSlotAvailable && attempt >= attempts - 1) {
reclaimSlotAvailable = false;
attempt -= 1;
}
continue;

View File

@ -11,7 +11,7 @@ export function safeRealpathSync(targetPath: string, cache?: Map<string, string>
return cached;
}
try {
const resolved = fs.realpathSync(targetPath);
const resolved = fs.realpathSync.native(targetPath);
cache?.set(targetPath, resolved);
return resolved;
} catch {

View File

@ -11,7 +11,7 @@
"dependencies": {
"@lit-labs/signals": "^0.2.0",
"@lit/context": "^1.1.6",
"@noble/ed25519": "3.0.0",
"@noble/ed25519": "^3.0.0",
"dompurify": "^3.3.3",
"lit": "^3.3.2",
"marked": "^17.0.4",

View File

@ -91,6 +91,12 @@ function createChatHeaderState(
sessionKey: "main",
connected: true,
sessionsHideCron: true,
sessionsLoading: false,
sessionsError: null,
sessionsFilterActive: "0",
sessionsFilterLimit: "0",
sessionsIncludeGlobal: true,
sessionsIncludeUnknown: true,
sessionsResult: {
ts: 0,
path: "",
@ -111,6 +117,7 @@ function createChatHeaderState(
chatModelOverrides: {},
chatModelCatalog: catalog,
chatModelsLoading: false,
chatSending: false,
client: { request } as unknown as GatewayBrowserClient,
settings: {
gatewayUrl: "",