fix(usage-log): recover stale lock left by abnormal process exit

A process killed or crashed after creating token-usage.json.lock but
before the finally-unlink runs leaves a permanent stale lock. All
subsequent recordTokenUsage calls for that workspace time out and drop
their entries.

Fix:
- Write the holder's PID into the lock file on acquisition (O_EXCL + writeFile).
- On each EEXIST retry, call isLockStale() which reads the PID and sends
  signal 0 (kill(pid, 0)) to check liveness without delivering a signal.
  ESRCH means the process is gone → lock is stale; any other result
  (alive, EPERM, unreadable file) is treated as live so we never break a
  legitimately held lock.
- If stale, unlink and continue to the next O_EXCL attempt; multiple
  concurrent waiters racing on the steal are safe because only one O_EXCL
  open succeeds.
- Recovery is immediate (no need to wait for LOCK_TIMEOUT_MS).

Add a test that spawns a subprocess, waits for it to exit, writes its
dead PID into the lock file, and asserts recordTokenUsage succeeds and
cleans up the lock.
This commit is contained in:
jiarung 2026-03-14 16:48:08 +00:00
parent 00e05fb4e5
commit 1a5489bf32
2 changed files with 82 additions and 9 deletions

View File

@ -1,3 +1,4 @@
import { spawn } from "child_process";
import fs from "fs/promises";
import os from "os";
import path from "path";
@ -177,6 +178,39 @@ describe("recordTokenUsage", () => {
expect(records).toHaveLength(N);
});
it("reclaims stale lock left by a crashed process", async () => {
// Spawn a subprocess that exits immediately, then use its (now-dead) PID
// to simulate a lock file left behind after an abnormal exit.
const deadPid = await new Promise<number>((resolve, reject) => {
const child = spawn(process.execPath, ["-e", "process.exit(0)"]);
const pid = child.pid!;
child.on("exit", () => resolve(pid));
child.on("error", reject);
});
const memoryDir = path.join(tmpDir, "memory");
await fs.mkdir(memoryDir, { recursive: true });
const lockPath = path.join(memoryDir, "token-usage.json.lock");
await fs.writeFile(lockPath, String(deadPid));
// recordTokenUsage must detect the stale lock, reclaim it, and succeed.
await recordTokenUsage({
workspaceDir: tmpDir,
label: "llm_output",
usage: { input: 100, output: 50, total: 150 },
});
const records = JSON.parse(await fs.readFile(usageFile, "utf-8"));
expect(records).toHaveLength(1);
expect(records[0].tokensUsed).toBe(150);
// Lock file must be cleaned up by the winner.
const lockExists = await fs
.access(lockPath)
.then(() => true)
.catch(() => false);
expect(lockExists).toBe(false);
});
it("serialises concurrent writes — no record is lost", async () => {
const N = 20;
await Promise.all(

View File

@ -60,21 +60,53 @@ async function readJsonArray(file: string): Promise<TokenUsageRecord[]> {
// writes. We guard against that with an advisory lock file (.lock) using
// O_EXCL (create-exclusive), which is atomic on POSIX filesystems.
//
// Lock acquisition retries with a fixed interval up to LOCK_TIMEOUT_MS.
// On timeout ERR_LOCK_TIMEOUT is thrown; the lock file is intentionally left
// untouched so that an active holder's mutual exclusion is never broken.
// The lock file stores the holder's PID so that waiters can detect a stale
// lock left by a crashed process. On each EEXIST the waiter reads the PID
// and calls kill(pid, 0): if the process no longer exists (ESRCH) the lock
// is stale and is reclaimed immediately via a fresh O_EXCL open, preserving
// mutual exclusion even when multiple waiters race for the steal. If the
// holder is alive the waiter backs off for LOCK_RETRY_MS and retries.
// After LOCK_TIMEOUT_MS without acquiring the lock ERR_LOCK_TIMEOUT is
// thrown; the lock file is left untouched to avoid breaking a live holder.
// ---------------------------------------------------------------------------
const LOCK_RETRY_MS = 50;
const LOCK_TIMEOUT_MS = 5_000;
/**
* Returns true only when the lock at `lockPath` was written by a process
* that no longer exists (ESRCH). Any other outcome (process alive, EPERM,
* unreadable file, non-numeric content) is treated as "not stale" so we
* never break a legitimately held lock.
*/
async function isLockStale(lockPath: string): Promise<boolean> {
try {
const raw = await fs.readFile(lockPath, "utf-8");
const pid = parseInt(raw.trim(), 10);
if (isNaN(pid) || pid <= 0) {
return false;
}
try {
process.kill(pid, 0); // signal 0 checks existence without delivering a signal
return false; // process is alive
} catch (e) {
return (e as NodeJS.ErrnoException).code === "ESRCH";
}
} catch {
return false; // can't read lock — treat as live
}
}
async function withFileLock<T>(lockPath: string, fn: () => Promise<T>): Promise<T> {
const deadline = Date.now() + LOCK_TIMEOUT_MS;
const myPid = String(process.pid);
while (Date.now() < deadline) {
let fh: fs.FileHandle | undefined;
try {
// wx = O_WRONLY | O_CREAT | O_EXCL — fails if the file already exists
// wx = O_WRONLY | O_CREAT | O_EXCL — fails if the file already exists.
// Write our PID so that a waiting process can verify we are still alive.
fh = await fs.open(lockPath, "wx");
await fh.writeFile(myPid);
await fh.close();
fh = undefined;
try {
@ -87,15 +119,22 @@ async function withFileLock<T>(lockPath: string, fn: () => Promise<T>): Promise<
if ((err as NodeJS.ErrnoException).code !== "EEXIST") {
throw err;
}
// Another process holds the lock — wait and retry.
// The lock file exists. Check immediately whether the holder crashed;
// if so, unlink the stale lock and loop back to race on O_EXCL.
// Multiple concurrent waiters may all detect the stale lock and attempt
// the unlink — that is fine because only one O_EXCL open will succeed.
if (await isLockStale(lockPath)) {
await fs.unlink(lockPath).catch(() => {});
continue;
}
// Holder is alive — back off and retry.
await new Promise<void>((r) => setTimeout(r, LOCK_RETRY_MS));
}
}
// Timed out without acquiring the lock. We deliberately do NOT delete the
// lock file here: the holder may still be active (slow disk, large file),
// and removing a live lock would break mutual exclusion and allow concurrent
// read-modify-write cycles to overwrite each other's entries.
// Timed out without acquiring the lock. The lock file is intentionally left
// untouched: the holder may still be active (slow disk, large file), and
// removing a live lock would break mutual exclusion.
throw Object.assign(new Error(`Could not acquire lock ${lockPath} within ${LOCK_TIMEOUT_MS}ms`), {
code: "ERR_LOCK_TIMEOUT",
});