diff --git a/src/plugin-sdk/file-lock.test.ts b/src/plugin-sdk/file-lock.test.ts index 012b574d7c1..f9e2211513f 100644 --- a/src/plugin-sdk/file-lock.test.ts +++ b/src/plugin-sdk/file-lock.test.ts @@ -187,6 +187,30 @@ describe("withFileLock", () => { }, ); + it("reclaims a stale empty lock file even when detected on the very last retry (retries=0)", async () => { + // Regression: when retries=0 (attempts=1), a stale lock detected on + // attempt 0 caused `continue` to increment attempt to 1, exiting the loop + // and throwing timeout without ever calling open(O_EXCL) on the now-clear + // path. This reproduces the token-usage.json.lock crash-recovery bug where + // an empty .lock file left by a crash is reclaimable only on the last slot. + const lockPath = `${targetFile}.lock`; + await fs.mkdir(path.dirname(targetFile), { recursive: true }); + await fs.writeFile(lockPath, ""); // empty — crash between open("wx") and writeFile + await ageFile(lockPath, LOCK_OPTIONS.stale + 1_000); // age past stale threshold + + let ran = false; + await expect( + withFileLock( + targetFile, + { ...LOCK_OPTIONS, retries: { ...LOCK_OPTIONS.retries, retries: 0 } }, + async () => { + ran = true; + }, + ), + ).resolves.toBeUndefined(); + expect(ran).toBe(true); + }); + it("two concurrent waiters on a stale lock never overlap inside fn()", async () => { // Plant a stale lock (dead PID, old timestamp) so both waiters will // simultaneously enter the stale-reclaim branch. The inode guard must diff --git a/src/plugin-sdk/file-lock.ts b/src/plugin-sdk/file-lock.ts index ee981355fec..5be1e56b1a4 100644 --- a/src/plugin-sdk/file-lock.ts +++ b/src/plugin-sdk/file-lock.ts @@ -215,6 +215,22 @@ export async function acquireFileLock( // Retry open(O_EXCL) regardless: either we removed the stale lock or // a concurrent waiter already handled it; either way, the path is now // either free or holds a fresh lock that isStaleLock will reject. + // + // Guard: the for-loop's `attempt += 1` runs after every `continue`, + // consuming a retry slot. If we are already on the last slot + // (attempt === attempts - 1), that increment exits the loop and we + // throw timeout without ever re-trying open(O_EXCL) on the now-clear + // path. This is the crash-recovery bug: an empty/partial .lock file + // (crash between open("wx") and writeFile) that becomes reclaimable + // on the last iteration causes the record to be silently dropped. + // + // Fix: when on the last slot, step attempt back by one so the + // upcoming += 1 nets to zero, guaranteeing at least one post-reclaim + // open(O_EXCL) attempt. No extra sleep budget is consumed — we only + // charge a retry for backoff sleeps, not reclaim-only work. + if (attempt >= attempts - 1) { + attempt -= 1; + } continue; }