From 13b0c1d0109ff6d5bf021c5f0037114106ebc3f1 Mon Sep 17 00:00:00 2001 From: jiarung Date: Fri, 13 Mar 2026 23:41:25 +0000 Subject: [PATCH] fix(usage-log): reacquire lock via O_EXCL after timeout instead of running unlocked After the retry loop timed out, withFileLock unconditionally deleted the lock file and called fn() without reacquiring the lock. If multiple waiters timed out concurrently they would all enter the critical section together, defeating the serialisation guarantee and allowing concurrent read-modify-write cycles to overwrite each other's records. Fix: after unlinking the stale lock, attempt one final O_EXCL open so that exactly one concurrent waiter wins the lock and the rest receive ERR_LOCK_TIMEOUT. The unlocked fast-path is removed entirely. --- src/agents/usage-log.ts | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index 2370ee9b24e..40ae3daef4e 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -92,10 +92,32 @@ async function withFileLock(lockPath: string, fn: () => Promise): Promise< } } - // Timeout: remove a potentially stale lock and make one final attempt. + // Timed out waiting for the lock. Remove a potentially stale lock file + // (left behind by a crashed process) and make one final attempt to acquire + // it through the normal O_EXCL path. This ensures the write is always + // serialised: if the stale file is gone another waiter that also timed out + // concurrently will race on O_EXCL and only one of them will win. await fs.unlink(lockPath).catch(() => {}); - const records = await fn(); - return records; + + // Re-enter the acquisition loop for a single attempt (deadline already + // passed so the while condition is false; open directly instead). + let fh: fs.FileHandle | undefined; + try { + fh = await fs.open(lockPath, "wx"); + await fh.close(); + fh = undefined; + try { + return await fn(); + } finally { + await fs.unlink(lockPath).catch(() => {}); + } + } catch (err) { + await fh?.close().catch(() => {}); + throw Object.assign( + new Error(`Could not acquire lock ${lockPath} within ${LOCK_TIMEOUT_MS}ms`), + { code: "ERR_LOCK_TIMEOUT", cause: err }, + ); + } } async function appendRecord(file: string, entry: TokenUsageRecord): Promise {