fix(file-lock): guard stale-lock reclaim with inode identity check
TOCTOU in the stale-lock branch: isStaleLock(lockPath) returning true
is evaluated under several awaits before unlink is called. If two
waiters (same process or different processes) both observe the same
stale file, waiter A can unlink, create a fresh lock, and start fn(),
then waiter B's delayed unlink removes A's fresh file. B then wins
open(O_EXCL) and both A and B execute fn() concurrently, breaking the
read-modify-write guarantee for token-usage.json.
Fix: snapshot the lock file's inode immediately after the EEXIST, then
re-stat right before the unlink. If the inode changed between the two
stats, a concurrent waiter already reclaimed the stale file and wrote a
fresh lock; leave the new file alone and continue to the next
open(O_EXCL) attempt. The three-outcome table:
staleIno == -1 (file gone by the time we stat)
→ skip unlink, continue: another waiter already handled it
staleIno == currentIno (same stale file still there)
→ safe to unlink; we and the other waiter(s) racing here all call
rm(force:true) — the first succeeds, the rest get silent ENOENT
staleIno != currentIno (inode changed — fresh lock in place)
→ do NOT unlink; continue and let isStaleLock reject the live lock
A note on the in-loop HELD_LOCKS re-check that was considered: joining
the existing holder inside the retry loop would allow two independent
concurrent callers to run fn() simultaneously, which breaks mutual
exclusion. HELD_LOCKS reentrant join is intentionally restricted to the
entry point of acquireFileLock (recursive/reentrant callers only).
Tests added:
- two concurrent waiters on a stale lock never overlap inside fn()
(maxInside assertion, not just result set)
- existing stale-reclaim tests continue to pass
This commit is contained in:
parent
c5c92e6be1
commit
8d636b8a61
@ -14,6 +14,18 @@ const LOCK_OPTIONS = {
|
||||
stale: 5_000,
|
||||
};
|
||||
|
||||
// More retries for tests where one waiter must survive while the other holds
|
||||
// the lock for a non-trivial duration.
|
||||
const RETRY_LOCK_OPTIONS = {
|
||||
retries: {
|
||||
retries: 10,
|
||||
factor: 1,
|
||||
minTimeout: 10,
|
||||
maxTimeout: 30,
|
||||
},
|
||||
stale: 5_000,
|
||||
};
|
||||
|
||||
describe("withFileLock", () => {
|
||||
let tmpDir: string;
|
||||
let targetFile: string;
|
||||
@ -88,4 +100,38 @@ describe("withFileLock", () => {
|
||||
).resolves.toBeUndefined();
|
||||
expect(ran).toBe(true);
|
||||
});
|
||||
|
||||
it("two concurrent waiters on a stale lock never overlap inside fn()", async () => {
|
||||
// Plant a stale lock (dead PID, old timestamp) so both waiters will
|
||||
// simultaneously enter the stale-reclaim branch. The inode guard must
|
||||
// prevent the slower waiter's unlink from deleting the faster waiter's
|
||||
// freshly-acquired lock, which would allow both fn() calls to run
|
||||
// concurrently and corrupt each other's read-modify-write sequences.
|
||||
const lockPath = `${targetFile}.lock`;
|
||||
await fs.mkdir(path.dirname(targetFile), { recursive: true });
|
||||
await fs.writeFile(lockPath, JSON.stringify({ pid: 0, createdAt: new Date(0).toISOString() }));
|
||||
|
||||
let inside = 0; // number of concurrent fn() executions
|
||||
let maxInside = 0;
|
||||
const results: number[] = [];
|
||||
|
||||
const run = async (id: number) => {
|
||||
// Use RETRY_LOCK_OPTIONS so the losing waiter has enough budget to
|
||||
// outlast the winning waiter's 20 ms hold without timing out.
|
||||
await withFileLock(targetFile, RETRY_LOCK_OPTIONS, async () => {
|
||||
inside += 1;
|
||||
maxInside = Math.max(maxInside, inside);
|
||||
await new Promise((r) => setTimeout(r, 20)); // hold the lock briefly
|
||||
results.push(id);
|
||||
inside -= 1;
|
||||
});
|
||||
};
|
||||
|
||||
// Launch both concurrently so they race on the stale lock.
|
||||
await Promise.all([run(1), run(2)]);
|
||||
|
||||
// Both callbacks must have run exactly once and never overlapped.
|
||||
expect(results.toSorted((a, b) => a - b)).toEqual([1, 2]);
|
||||
expect(maxInside).toBe(1);
|
||||
});
|
||||
});
|
||||
|
||||
@ -134,10 +134,41 @@ export async function acquireFileLock(
|
||||
if (code !== "EEXIST") {
|
||||
throw err;
|
||||
}
|
||||
if (await isStaleLock(lockPath, options.stale)) {
|
||||
await fs.rm(lockPath, { force: true }).catch(() => undefined);
|
||||
|
||||
// Snapshot the inode of the existing lock file *before* checking
|
||||
// staleness. We compare it again just before unlinking; if the inode
|
||||
// has changed in the interim, another waiter already reclaimed the
|
||||
// stale file and created a fresh lock — deleting it would silently
|
||||
// break that holder's mutual exclusion guarantee.
|
||||
const staleIno = await fs
|
||||
.stat(lockPath)
|
||||
.then((s) => s.ino)
|
||||
.catch(() => -1);
|
||||
|
||||
// staleIno === -1 means the file vanished between open(EEXIST) and
|
||||
// stat — another process already removed it. Skip straight to the
|
||||
// next open(O_EXCL) attempt.
|
||||
const isStale = staleIno === -1 || (await isStaleLock(lockPath, options.stale));
|
||||
|
||||
if (isStale) {
|
||||
if (staleIno !== -1) {
|
||||
// Re-verify the path still maps to the same inode we deemed stale.
|
||||
// If it changed, a concurrent waiter beat us to the reclaim and has
|
||||
// already written its own fresh lock; leave that file alone.
|
||||
const currentIno = await fs
|
||||
.stat(lockPath)
|
||||
.then((s) => s.ino)
|
||||
.catch(() => -1);
|
||||
if (currentIno === staleIno) {
|
||||
await fs.rm(lockPath, { force: true }).catch(() => undefined);
|
||||
}
|
||||
}
|
||||
// Retry open(O_EXCL) regardless: either we removed the stale lock or
|
||||
// a concurrent waiter already handled it; either way, the path is now
|
||||
// either free or holds a fresh lock that isStaleLock will reject.
|
||||
continue;
|
||||
}
|
||||
|
||||
if (attempt >= attempts - 1) {
|
||||
break;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user