From d2b7b46604ee10a8257195c19829b55314fcdd3c Mon Sep 17 00:00:00 2001 From: jiarung Date: Mon, 16 Mar 2026 03:07:47 +0000 Subject: [PATCH] fix(usage-log): increase lock stale window to 30 s to prevent active-lock steals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit appendRecord rewrites the full token-usage.json on every write, so lock hold time grows with file size and disk speed. The previous stale: 5_000 was too tight: on large histories or slow disks a write can legitimately take longer than 5 s, allowing a concurrent waiter to treat the still-active lock as stale, reclaim it, and run an overlapping read-modify-write cycle that silently drops the earlier writer's entry. The risk is amplified by the attempt path where recordTokenUsage is fired without awaiting, so multiple concurrent runs can legitimately overlap. Fix: • Raise stale to 30_000 ms — gives ample headroom for large files on slow disks while still reclaiming crashed-process locks within 30 s. • Match the retry budget: 150 retries × 200 ms ≈ 30 s with jitter, so waiters exhaust retries only when the holder exceeds the stale window (i.e., is genuinely stuck or has crashed). --- src/agents/usage-log.ts | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index dc640f7410e..9ebb3ff580b 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -68,14 +68,25 @@ async function readJsonArray(file: string): Promise { // • uses exponential backoff with jitter capped at stale ms // --------------------------------------------------------------------------- const APPEND_LOCK_OPTIONS: FileLockOptions = { - // ~100 retries × 50 ms ≈ 5 s total — matches the previous LOCK_TIMEOUT_MS. + // appendRecord rewrites the full token-usage.json on every call, so hold + // time scales with file size and disk speed.
5 s was too tight: a slow + // write on a large history could exceed the stale window and allow a + // concurrent waiter to steal an active lock, causing overlapping + // read-modify-write cycles that silently drop entries. + // + // 30 s gives plenty of headroom for large files on slow disks while still + // reclaiming locks left by crashed processes within a reasonable window. + // The retry budget (~150 × 200 ms = 30 s) matches the stale window so + // waiters exhaust retries only if the holder holds longer than stale, + // i.e., is genuinely stuck or crashed. retries: { - retries: 100, + retries: 150, factor: 1, - minTimeout: 50, - maxTimeout: 50, + minTimeout: 200, + maxTimeout: 200, + randomize: true, }, - stale: 5_000, + stale: 30_000, }; async function appendRecord(file: string, entry: TokenUsageRecord): Promise {