From 14303dac74a3803eccc3e76403cf4c247ce61fd0 Mon Sep 17 00:00:00 2001 From: jiarung Date: Tue, 17 Mar 2026 06:57:09 +0000 Subject: [PATCH] fix(file-lock): guarantee post-reclaim open attempt on last retry slot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When acquireFileLock detects a stale lock on the final loop iteration (attempt === attempts - 1), the subsequent `continue` increments attempt to `attempts`, causing the loop to exit and throw 'file lock timeout' without ever calling open(O_EXCL) on the now-clear path. This is the crash-recovery bug reported in the P2 badge: an empty/partial .lock file left by a crash between open("wx") and writeFile becomes reclaimable on the last slot, but the call still throws timeout and the usage record is silently dropped. Fix: when on the last slot, step attempt back by one before continue so the upcoming += 1 nets to zero — guaranteeing at least one post-reclaim open(O_EXCL) attempt. No extra sleep budget is consumed; retries are only charged for backoff-sleep iterations, not reclaim-only work. Adds a regression test: retries=0 (attempts=1), stale empty lock file aged past staleMs — withFileLock must succeed, not throw timeout. --- src/plugin-sdk/file-lock.test.ts | 24 ++++++++++++++++++++++++ src/plugin-sdk/file-lock.ts | 16 ++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/src/plugin-sdk/file-lock.test.ts b/src/plugin-sdk/file-lock.test.ts index 012b574d7c1..f9e2211513f 100644 --- a/src/plugin-sdk/file-lock.test.ts +++ b/src/plugin-sdk/file-lock.test.ts @@ -187,6 +187,30 @@ describe("withFileLock", () => { }, ); + it("reclaims a stale empty lock file even when detected on the very last retry (retries=0)", async () => { + // Regression: when retries=0 (attempts=1), a stale lock detected on + // attempt 0 caused `continue` to increment attempt to 1, exiting the loop + // and throwing timeout without ever calling open(O_EXCL) on the now-clear + // path. This reproduces the token-usage.json.lock crash-recovery bug where + // an empty .lock file left by a crash is reclaimable only on the last slot. + const lockPath = `${targetFile}.lock`; + await fs.mkdir(path.dirname(targetFile), { recursive: true }); + await fs.writeFile(lockPath, ""); // empty — crash between open("wx") and writeFile + await ageFile(lockPath, LOCK_OPTIONS.stale + 1_000); // age past stale threshold + + let ran = false; + await expect( + withFileLock( + targetFile, + { ...LOCK_OPTIONS, retries: { ...LOCK_OPTIONS.retries, retries: 0 } }, + async () => { + ran = true; + }, + ), + ).resolves.toBeUndefined(); + expect(ran).toBe(true); + }); + it("two concurrent waiters on a stale lock never overlap inside fn()", async () => { // Plant a stale lock (dead PID, old timestamp) so both waiters will // simultaneously enter the stale-reclaim branch. The inode guard must diff --git a/src/plugin-sdk/file-lock.ts b/src/plugin-sdk/file-lock.ts index ee981355fec..5be1e56b1a4 100644 --- a/src/plugin-sdk/file-lock.ts +++ b/src/plugin-sdk/file-lock.ts @@ -215,6 +215,22 @@ export async function acquireFileLock( // Retry open(O_EXCL) regardless: either we removed the stale lock or // a concurrent waiter already handled it; either way, the path is now // either free or holds a fresh lock that isStaleLock will reject. + // + // Guard: the for-loop's `attempt += 1` runs after every `continue`, + // consuming a retry slot. If we are already on the last slot + // (attempt === attempts - 1), that increment exits the loop and we + // throw timeout without ever re-trying open(O_EXCL) on the now-clear + // path. This is the crash-recovery bug: an empty/partial .lock file + // (crash between open("wx") and writeFile) that becomes reclaimable + // on the last iteration causes the record to be silently dropped. + // + // Fix: when on the last slot, step attempt back by one so the + // upcoming += 1 nets to zero, guaranteeing at least one post-reclaim + // open(O_EXCL) attempt. No extra sleep budget is consumed — we only + // charge a retry for backoff sleeps, not reclaim-only work. + if (attempt >= attempts - 1) { + attempt -= 1; + } continue; }