From 8cbc05ae1ff3754540bfb5a776198e7e661e143f Mon Sep 17 00:00:00 2001 From: jiarung Date: Fri, 13 Mar 2026 06:35:38 +0000 Subject: [PATCH 01/29] feat(usage-log): record inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens The recordTokenUsage function previously only persisted the aggregate tokensUsed total, discarding the input/output breakdown that was already available via getUsageTotals(). This meant token-usage.json had no per-record IO split, making it impossible to analyse input vs output token costs in dashboards. Changes: - Add inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens optional fields to TokenUsageRecord type in usage-log.ts (new file) - Write these fields (when non-zero) into each usage entry - Fields are omitted (not null) when unavailable, keeping existing records valid - Wire up recordTokenUsage() call in attempt.ts after llm_output hook This is a purely additive change; existing consumers that only read tokensUsed are unaffected. --- src/agents/pi-embedded-runner/run/attempt.ts | 14 ++++ src/agents/usage-log.ts | 83 ++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 src/agents/usage-log.ts diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts index 3457fdf0161..3aedfcb84a9 100644 --- a/src/agents/pi-embedded-runner/run/attempt.ts +++ b/src/agents/pi-embedded-runner/run/attempt.ts @@ -135,6 +135,7 @@ import { import { pruneProcessedHistoryImages } from "./history-image-prune.js"; import { detectAndLoadPromptImages } from "./images.js"; import type { EmbeddedRunAttemptParams, EmbeddedRunAttemptResult } from "./types.js"; +import { recordTokenUsage } from "../../usage-log.js"; type PromptBuildHookRunner = { hasHooks: (hookName: "before_prompt_build" | "before_agent_start") => boolean; @@ -2758,6 +2759,19 @@ export async function runEmbeddedAttempt( }); } + recordTokenUsage({ + workspaceDir: params.workspaceDir, + runId: params.runId, + sessionId: params.sessionId, + sessionKey: params.sessionKey, + provider: params.provider, + model: params.modelId, + label: "llm_output", + usage: getUsageTotals(), + }).catch((err) => { + log.warn(`token usage log failed: ${String(err)}`); + }); + return { aborted, timedOut, diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts new file mode 100644 index 00000000000..b3f63595793 --- /dev/null +++ b/src/agents/usage-log.ts @@ -0,0 +1,83 @@ +import fs from "fs/promises"; +import path from "path"; + +export type TokenUsageRecord = { + id: string; + taskId?: string; + label: string; + tokensUsed: number; + tokenLimit?: number; + inputTokens?: number; + outputTokens?: number; + cacheReadTokens?: number; + cacheWriteTokens?: number; + model?: string; + provider?: string; + runId?: string; + sessionId?: string; + sessionKey?: string; + createdAt: string; +}; + +function makeId() { + return `usage_${Date.now().toString(36)}_${Math.random().toString(36).slice(2, 8)}`; +} + +async function readJsonArray(file: string): Promise { + try { + const raw = await fs.readFile(file, "utf-8"); + const parsed = JSON.parse(raw); + return Array.isArray(parsed) ? (parsed as TokenUsageRecord[]) : []; + } catch { + return []; + } +} + +export async function recordTokenUsage(params: { + workspaceDir: string; + runId?: string; + sessionId?: string; + sessionKey?: string; + provider?: string; + model?: string; + label: string; + usage?: { + input?: number; + output?: number; + cacheRead?: number; + cacheWrite?: number; + total?: number; + }; +}) { + const usage = params.usage; + if (!usage) return; + const total = + usage.total ?? + (usage.input ?? 0) + (usage.output ?? 0) + (usage.cacheRead ?? 0) + (usage.cacheWrite ?? 0); + if (!total || total <= 0) return; + + const memoryDir = path.join(params.workspaceDir, "memory"); + const file = path.join(memoryDir, "token-usage.json"); + await fs.mkdir(memoryDir, { recursive: true }); + + const entry: TokenUsageRecord = { + id: makeId(), + taskId: params.runId, + label: params.label, + tokensUsed: Math.trunc(total), + ...(usage.input != null && usage.input > 0 && { inputTokens: Math.trunc(usage.input) }), + ...(usage.output != null && usage.output > 0 && { outputTokens: Math.trunc(usage.output) }), + ...(usage.cacheRead != null && usage.cacheRead > 0 && { cacheReadTokens: Math.trunc(usage.cacheRead) }), + ...(usage.cacheWrite != null && usage.cacheWrite > 0 && { cacheWriteTokens: Math.trunc(usage.cacheWrite) }), + model: params.model, + provider: params.provider, + runId: params.runId, + sessionId: params.sessionId, + sessionKey: params.sessionKey, + createdAt: new Date().toISOString(), + }; + + const records = await readJsonArray(file); + records.push(entry); + await fs.writeFile(file, JSON.stringify(records, null, 2)); +} From feefa8568f33eb1a0af832a1035572a2e0296946 Mon Sep 17 00:00:00 2001 From: jiarung Date: Fri, 13 Mar 2026 08:43:18 +0000 Subject: [PATCH 02/29] test(usage-log): add unit tests for recordTokenUsage IO fields --- src/agents/usage-log.test.ts | 116 +++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 src/agents/usage-log.test.ts diff --git a/src/agents/usage-log.test.ts b/src/agents/usage-log.test.ts new file mode 100644 index 00000000000..897bb8ee4f4 --- /dev/null +++ b/src/agents/usage-log.test.ts @@ -0,0 +1,116 @@ +import { describe, expect, it, beforeEach, afterEach } from "vitest"; +import fs from "fs/promises"; +import path from "path"; +import os from "os"; +import { recordTokenUsage } from "./usage-log.js"; + +describe("recordTokenUsage", () => { + let tmpDir: string; + let usageFile: string; + + beforeEach(async () => { + tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "usage-log-test-")); + usageFile = path.join(tmpDir, "memory", "token-usage.json"); + }); + + afterEach(async () => { + await fs.rm(tmpDir, { recursive: true, force: true }); + }); + + it("writes inputTokens and outputTokens when provided", async () => { + await recordTokenUsage({ + workspaceDir: tmpDir, + label: "llm_output", + model: "claude-sonnet-4-6", + provider: "anthropic", + usage: { input: 1000, output: 500, total: 1500 }, + }); + + const records = JSON.parse(await fs.readFile(usageFile, "utf-8")); + expect(records).toHaveLength(1); + expect(records[0].tokensUsed).toBe(1500); + expect(records[0].inputTokens).toBe(1000); + expect(records[0].outputTokens).toBe(500); + expect(records[0].cacheReadTokens).toBeUndefined(); + expect(records[0].cacheWriteTokens).toBeUndefined(); + }); + + it("writes cacheReadTokens and cacheWriteTokens when provided", async () => { + await recordTokenUsage({ + workspaceDir: tmpDir, + label: "llm_output", + usage: { input: 800, output: 200, cacheRead: 300, cacheWrite: 100, total: 1400 }, + }); + + const records = JSON.parse(await fs.readFile(usageFile, "utf-8")); + expect(records[0].inputTokens).toBe(800); + expect(records[0].outputTokens).toBe(200); + expect(records[0].cacheReadTokens).toBe(300); + expect(records[0].cacheWriteTokens).toBe(100); + }); + + it("omits IO fields when usage only has total (legacy records)", async () => { + await recordTokenUsage({ + workspaceDir: tmpDir, + label: "llm_output", + usage: { total: 28402 }, + }); + + const records = JSON.parse(await fs.readFile(usageFile, "utf-8")); + expect(records[0].tokensUsed).toBe(28402); + expect(records[0].inputTokens).toBeUndefined(); + expect(records[0].outputTokens).toBeUndefined(); + }); + + it("skips writing when usage is undefined", async () => { + await recordTokenUsage({ + workspaceDir: tmpDir, + label: "llm_output", + usage: undefined, + }); + + const exists = await fs.access(usageFile).then(() => true).catch(() => false); + expect(exists).toBe(false); + }); + + it("skips writing when total is zero", async () => { + await recordTokenUsage({ + workspaceDir: tmpDir, + label: "llm_output", + usage: { input: 0, output: 0 }, + }); + + const exists = await fs.access(usageFile).then(() => true).catch(() => false); + expect(exists).toBe(false); + }); + + it("appends multiple records to the same file", async () => { + await recordTokenUsage({ + workspaceDir: tmpDir, + label: "llm_output", + usage: { input: 100, output: 50, total: 150 }, + }); + await recordTokenUsage({ + workspaceDir: tmpDir, + label: "llm_output", + usage: { input: 200, output: 80, total: 280 }, + }); + + const records = JSON.parse(await fs.readFile(usageFile, "utf-8")); + expect(records).toHaveLength(2); + expect(records[0].inputTokens).toBe(100); + expect(records[1].inputTokens).toBe(200); + }); + + it("truncates fractional tokens", async () => { + await recordTokenUsage({ + workspaceDir: tmpDir, + label: "llm_output", + usage: { input: 100.9, output: 50.1, total: 151 }, + }); + + const records = JSON.parse(await fs.readFile(usageFile, "utf-8")); + expect(records[0].inputTokens).toBe(100); + expect(records[0].outputTokens).toBe(50); + }); +}); From 4ced2e0ef0b9e770175d69f1d1f7a51ee439bd9a Mon Sep 17 00:00:00 2001 From: jiarung Date: Fri, 13 Mar 2026 08:59:53 +0000 Subject: [PATCH 03/29] fix(git-hooks): resolve node binary for nvm environments When git hooks run, the shell profile is not sourced so nvm-managed Node installations are not in PATH. This caused 'node: command not found' errors on every commit for users relying on nvm. Add a PATH-extension fallback in both pre-commit and run-node-tool.sh that walks ~/.nvm/versions/node/*/bin/node and prepends the first found binary to PATH, mirroring how nvm itself resolves the runtime. --- git-hooks/pre-commit | 11 +++++++++++ scripts/pre-commit/run-node-tool.sh | 10 ++++++++++ 2 files changed, 21 insertions(+) diff --git a/git-hooks/pre-commit b/git-hooks/pre-commit index 948f2087ada..9650fe936eb 100755 --- a/git-hooks/pre-commit +++ b/git-hooks/pre-commit @@ -2,6 +2,17 @@ set -euo pipefail +# Resolve node when not in PATH (e.g. nvm environments where shell profile +# hasn't been sourced by the git hook). Picks the highest nvm version found. +if ! command -v node >/dev/null 2>&1; then + for _nvm_node in "$HOME/.nvm/versions/node"/*/bin/node; do + if [[ -x "$_nvm_node" ]]; then + export PATH="$(dirname "$_nvm_node"):$PATH" + break + fi + done +fi + ROOT_DIR="$(git rev-parse --show-toplevel 2>/dev/null || pwd)" RUN_NODE_TOOL="$ROOT_DIR/scripts/pre-commit/run-node-tool.sh" FILTER_FILES="$ROOT_DIR/scripts/pre-commit/filter-staged-files.mjs" diff --git a/scripts/pre-commit/run-node-tool.sh b/scripts/pre-commit/run-node-tool.sh index 34163075517..69a26f45e0e 100755 --- a/scripts/pre-commit/run-node-tool.sh +++ b/scripts/pre-commit/run-node-tool.sh @@ -3,6 +3,16 @@ set -euo pipefail ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +# Resolve node when not in PATH (nvm environments). +if ! command -v node >/dev/null 2>&1; then + for _nvm_node in "$HOME/.nvm/versions/node"/*/bin/node; do + if [[ -x "$_nvm_node" ]]; then + export PATH="$(dirname "$_nvm_node"):$PATH" + break + fi + done +fi + if [[ $# -lt 1 ]]; then echo "usage: run-node-tool.sh [args...]" >&2 exit 2 From b5cf5aa59fb95a800094932162c48b65f876bc4e Mon Sep 17 00:00:00 2001 From: jiarung Date: Fri, 13 Mar 2026 09:00:56 +0000 Subject: [PATCH 04/29] fix(usage-log): add curly braces to satisfy oxlint curly rule --- src/agents/usage-log.ts | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index b3f63595793..469ee6915e8 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -50,11 +50,15 @@ export async function recordTokenUsage(params: { }; }) { const usage = params.usage; - if (!usage) return; + if (!usage) { + return; + } const total = usage.total ?? (usage.input ?? 0) + (usage.output ?? 0) + (usage.cacheRead ?? 0) + (usage.cacheWrite ?? 0); - if (!total || total <= 0) return; + if (!total || total <= 0) { + return; + } const memoryDir = path.join(params.workspaceDir, "memory"); const file = path.join(memoryDir, "token-usage.json"); @@ -65,10 +69,12 @@ export async function recordTokenUsage(params: { taskId: params.runId, label: params.label, tokensUsed: Math.trunc(total), - ...(usage.input != null && usage.input > 0 && { inputTokens: Math.trunc(usage.input) }), - ...(usage.output != null && usage.output > 0 && { outputTokens: Math.trunc(usage.output) }), - ...(usage.cacheRead != null && usage.cacheRead > 0 && { cacheReadTokens: Math.trunc(usage.cacheRead) }), - ...(usage.cacheWrite != null && usage.cacheWrite > 0 && { cacheWriteTokens: Math.trunc(usage.cacheWrite) }), + ...(usage.input != null && usage.input > 0 && { inputTokens: Math.trunc(usage.input) }), + ...(usage.output != null && usage.output > 0 && { outputTokens: Math.trunc(usage.output) }), + ...(usage.cacheRead != null && + usage.cacheRead > 0 && { cacheReadTokens: Math.trunc(usage.cacheRead) }), + ...(usage.cacheWrite != null && + usage.cacheWrite > 0 && { cacheWriteTokens: Math.trunc(usage.cacheWrite) }), model: params.model, provider: params.provider, runId: params.runId, From 83a566ce99af52fe3a8db1073f2167b7f598658b Mon Sep 17 00:00:00 2001 From: jiarung Date: Fri, 13 Mar 2026 09:24:36 +0000 Subject: [PATCH 05/29] style(usage-log): apply oxfmt formatting to test file --- src/agents/usage-log.test.ts | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/agents/usage-log.test.ts b/src/agents/usage-log.test.ts index 897bb8ee4f4..19420b5e3fd 100644 --- a/src/agents/usage-log.test.ts +++ b/src/agents/usage-log.test.ts @@ -1,7 +1,7 @@ -import { describe, expect, it, beforeEach, afterEach } from "vitest"; import fs from "fs/promises"; -import path from "path"; import os from "os"; +import path from "path"; +import { describe, expect, it, beforeEach, afterEach } from "vitest"; import { recordTokenUsage } from "./usage-log.js"; describe("recordTokenUsage", () => { @@ -69,7 +69,10 @@ describe("recordTokenUsage", () => { usage: undefined, }); - const exists = await fs.access(usageFile).then(() => true).catch(() => false); + const exists = await fs + .access(usageFile) + .then(() => true) + .catch(() => false); expect(exists).toBe(false); }); @@ -80,7 +83,10 @@ describe("recordTokenUsage", () => { usage: { input: 0, output: 0 }, }); - const exists = await fs.access(usageFile).then(() => true).catch(() => false); + const exists = await fs + .access(usageFile) + .then(() => true) + .catch(() => false); expect(exists).toBe(false); }); From d03e7ae8ed39c6126f0a9d5f59cebc44b2e97796 Mon Sep 17 00:00:00 2001 From: jiarung Date: Fri, 13 Mar 2026 09:37:25 +0000 Subject: [PATCH 06/29] fix(usage-log): serialise concurrent writes with per-file promise queue Fire-and-forget callers (attempt.ts) can trigger two concurrent recordTokenUsage() calls for the same workspaceDir. The previous read-modify-write pattern had no locking, so the last writer silently overwrote the first, losing that run's entry. Fix: keep a Map> write queue so each write awaits the previous one. The queue slot is replaced with a no-throw wrapper so a failed write does not stall future writes. Added a concurrent-write test (20 parallel calls) that asserts no record is lost. --- src/agents/usage-log.test.ts | 21 +++++++++++++++++++++ src/agents/usage-log.ts | 21 ++++++++++++++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/src/agents/usage-log.test.ts b/src/agents/usage-log.test.ts index 19420b5e3fd..ed2efd755df 100644 --- a/src/agents/usage-log.test.ts +++ b/src/agents/usage-log.test.ts @@ -119,4 +119,25 @@ describe("recordTokenUsage", () => { expect(records[0].inputTokens).toBe(100); expect(records[0].outputTokens).toBe(50); }); + + it("serialises concurrent writes — no record is lost", async () => { + const N = 20; + await Promise.all( + Array.from({ length: N }, (_, i) => + recordTokenUsage({ + workspaceDir: tmpDir, + label: "llm_output", + usage: { input: i + 1, output: 1, total: i + 2 }, + }), + ), + ); + + const records = JSON.parse(await fs.readFile(usageFile, "utf-8")); + expect(records).toHaveLength(N); + // Every distinct tokensUsed value must appear exactly once + const totals = records + .map((r: { tokensUsed: number }) => r.tokensUsed) + .toSorted((a: number, b: number) => a - b); + expect(totals).toEqual(Array.from({ length: N }, (_, i) => i + 2)); + }); }); diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index 469ee6915e8..a6e180b958d 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -33,6 +33,17 @@ async function readJsonArray(file: string): Promise { } } +async function appendRecord(file: string, entry: TokenUsageRecord): Promise { + const records = await readJsonArray(file); + records.push(entry); + await fs.writeFile(file, JSON.stringify(records, null, 2)); +} + +// Per-file write queue: serialises concurrent recordTokenUsage() calls so that +// a fire-and-forget caller cannot cause two concurrent writers to read the same +// snapshot and overwrite each other's entry. +const writeQueues = new Map>(); + export async function recordTokenUsage(params: { workspaceDir: string; runId?: string; @@ -83,7 +94,11 @@ export async function recordTokenUsage(params: { createdAt: new Date().toISOString(), }; - const records = await readJsonArray(file); - records.push(entry); - await fs.writeFile(file, JSON.stringify(records, null, 2)); + const queued = writeQueues.get(file) ?? Promise.resolve(); + const next = queued.then(() => appendRecord(file, entry)); + writeQueues.set( + file, + next.catch(() => {}), + ); + await next; } From cece47f4905d7f000102a2928f3711ddcb132874 Mon Sep 17 00:00:00 2001 From: jiarung Date: Fri, 13 Mar 2026 09:40:24 +0000 Subject: [PATCH 07/29] fix(usage-log): remove redundant taskId field from TokenUsageRecord taskId was set to params.runId, the same value already stored in the runId field, giving downstream consumers two identical fields with different names. Remove taskId from the type and the entry constructor to avoid confusion. --- src/agents/usage-log.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index a6e180b958d..802e3899f2c 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -3,7 +3,6 @@ import path from "path"; export type TokenUsageRecord = { id: string; - taskId?: string; label: string; tokensUsed: number; tokenLimit?: number; @@ -77,7 +76,6 @@ export async function recordTokenUsage(params: { const entry: TokenUsageRecord = { id: makeId(), - taskId: params.runId, label: params.label, tokensUsed: Math.trunc(total), ...(usage.input != null && usage.input > 0 && { inputTokens: Math.trunc(usage.input) }), From 98822509a8deffd599ce9b1ad25e3951f75b223b Mon Sep 17 00:00:00 2001 From: jiarung Date: Fri, 13 Mar 2026 14:17:36 +0000 Subject: [PATCH 08/29] fix(git-hooks): pick newest nvm Node with version-aware sort The previous loop used Bash glob expansion (lexicographic order) and stopped at the first match, so environments with multiple Node installs could select an older runtime (e.g. v18 before v22). Extract the nvm resolution into a shared scripts/pre-commit/resolve-node.sh that pipes `ls` output through `sort -V | tail -1` to select the semantically newest version. Both pre-commit and run-node-tool.sh now source the shared script, eliminating the duplicated logic. --- git-hooks/pre-commit | 17 ++++++----------- scripts/pre-commit/resolve-node.sh | 18 ++++++++++++++++++ scripts/pre-commit/run-node-tool.sh | 13 ++++--------- 3 files changed, 28 insertions(+), 20 deletions(-) create mode 100644 scripts/pre-commit/resolve-node.sh diff --git a/git-hooks/pre-commit b/git-hooks/pre-commit index 9650fe936eb..696f9b7b20e 100755 --- a/git-hooks/pre-commit +++ b/git-hooks/pre-commit @@ -2,18 +2,13 @@ set -euo pipefail -# Resolve node when not in PATH (e.g. nvm environments where shell profile -# hasn't been sourced by the git hook). Picks the highest nvm version found. -if ! command -v node >/dev/null 2>&1; then - for _nvm_node in "$HOME/.nvm/versions/node"/*/bin/node; do - if [[ -x "$_nvm_node" ]]; then - export PATH="$(dirname "$_nvm_node"):$PATH" - break - fi - done -fi - ROOT_DIR="$(git rev-parse --show-toplevel 2>/dev/null || pwd)" + +# Resolve node when not in PATH (e.g. nvm environments where the shell +# profile hasn't been sourced by the git hook). Picks the newest installed +# nvm version using version-aware sort so that v22 is preferred over v18. +# shellcheck source=scripts/pre-commit/resolve-node.sh +source "$ROOT_DIR/scripts/pre-commit/resolve-node.sh" RUN_NODE_TOOL="$ROOT_DIR/scripts/pre-commit/run-node-tool.sh" FILTER_FILES="$ROOT_DIR/scripts/pre-commit/filter-staged-files.mjs" diff --git a/scripts/pre-commit/resolve-node.sh b/scripts/pre-commit/resolve-node.sh new file mode 100644 index 00000000000..a3f484c7029 --- /dev/null +++ b/scripts/pre-commit/resolve-node.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Resolve the newest nvm-managed Node when it is not already in PATH. +# Source this file; do not execute it directly. +# +# Uses `sort -V` (version-aware sort) so that semantic version order is +# respected — e.g. v22.x is chosen over v18.x even though "v1" sorts +# before "v2" lexicographically. +if ! command -v node >/dev/null 2>&1; then + _nvm_node=$( + ls -d "$HOME/.nvm/versions/node"/*/bin/node 2>/dev/null \ + | sort -V \ + | tail -1 + ) + if [[ -x "$_nvm_node" ]]; then + export PATH="$(dirname "$_nvm_node"):$PATH" + fi + unset _nvm_node +fi diff --git a/scripts/pre-commit/run-node-tool.sh b/scripts/pre-commit/run-node-tool.sh index 69a26f45e0e..4fda9e4ba0b 100755 --- a/scripts/pre-commit/run-node-tool.sh +++ b/scripts/pre-commit/run-node-tool.sh @@ -3,15 +3,10 @@ set -euo pipefail ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -# Resolve node when not in PATH (nvm environments). -if ! command -v node >/dev/null 2>&1; then - for _nvm_node in "$HOME/.nvm/versions/node"/*/bin/node; do - if [[ -x "$_nvm_node" ]]; then - export PATH="$(dirname "$_nvm_node"):$PATH" - break - fi - done -fi +# Resolve node when not in PATH (e.g. nvm environments). Picks the newest +# installed nvm version using version-aware sort. +# shellcheck source=resolve-node.sh +source "$ROOT_DIR/scripts/pre-commit/resolve-node.sh" if [[ $# -lt 1 ]]; then echo "usage: run-node-tool.sh [args...]" >&2 From 020001d9b28583301ed1d4d3f8b9a8c1b6d6f19f Mon Sep 17 00:00:00 2001 From: jiarung Date: Fri, 13 Mar 2026 14:25:13 +0000 Subject: [PATCH 09/29] fix(usage-log): propagate non-ENOENT read errors to prevent silent data loss MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit readJsonArray previously caught all errors and returned [], so a malformed token-usage.json (e.g. from an interrupted writeFile) caused the next recordTokenUsage call to overwrite the file with only the new entry, permanently erasing all prior records. Fix: only suppress ENOENT (file not yet created). Any other error (SyntaxError, EACCES, …) is re-thrown so appendRecord aborts and the existing file is left intact. The write-queue slot still absorbs the rejection via .catch() so future writes are not stalled; callers that need to observe the failure (e.g. attempt.ts) can attach their own .catch() handler. --- src/agents/usage-log.test.ts | 20 ++++++++++++++++++++ src/agents/usage-log.ts | 11 +++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/agents/usage-log.test.ts b/src/agents/usage-log.test.ts index ed2efd755df..b61b3154703 100644 --- a/src/agents/usage-log.test.ts +++ b/src/agents/usage-log.test.ts @@ -120,6 +120,26 @@ describe("recordTokenUsage", () => { expect(records[0].outputTokens).toBe(50); }); + it("does not overwrite a malformed token-usage.json — preserves corrupted file", async () => { + // Simulate an interrupted write that left partial JSON + await fs.mkdir(path.join(tmpDir, "memory"), { recursive: true }); + await fs.writeFile(usageFile, '{"broken":true', "utf-8"); + + // recordTokenUsage must reject (caller is responsible for handling, e.g. + // attempt.ts uses .catch()) and must NOT overwrite the existing file. + await expect( + recordTokenUsage({ + workspaceDir: tmpDir, + label: "llm_output", + usage: { input: 100, output: 50, total: 150 }, + }), + ).rejects.toThrow(SyntaxError); + + // File must still contain the original corrupted content, not a new array. + const content = await fs.readFile(usageFile, "utf-8"); + expect(content).toBe('{"broken":true'); + }); + it("serialises concurrent writes — no record is lost", async () => { const N = 20; await Promise.all( diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index 802e3899f2c..2d071978510 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -27,8 +27,15 @@ async function readJsonArray(file: string): Promise { const raw = await fs.readFile(file, "utf-8"); const parsed = JSON.parse(raw); return Array.isArray(parsed) ? (parsed as TokenUsageRecord[]) : []; - } catch { - return []; + } catch (err) { + // File does not exist yet — start with an empty array. + if ((err as NodeJS.ErrnoException).code === "ENOENT") { + return []; + } + // Any other error (malformed JSON, permission denied, partial write, …) + // must propagate so appendRecord aborts and the existing file is not + // silently overwritten with only the new entry. + throw err; } } From 386dbb010e42168d85138de1837b3d73aa90edc7 Mon Sep 17 00:00:00 2001 From: jiarung Date: Fri, 13 Mar 2026 14:57:25 +0000 Subject: [PATCH 10/29] fix(git-hooks,usage-log): fix two CI failures pre-commit: guard the resolve-node.sh source with a file-existence check so the hook works in test environments that stub only the files they care about (the integration test creates run-node-tool.sh but not resolve-node.sh; node is provided via a fake binary in PATH so the nvm fallback is never needed in that context). usage-log: replace Math.random() in makeId() with crypto.randomBytes() to satisfy the temp-path-guard security lint rule that rejects weak randomness in source files. --- git-hooks/pre-commit | 8 ++++++-- src/agents/usage-log.ts | 3 ++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/git-hooks/pre-commit b/git-hooks/pre-commit index 696f9b7b20e..cf1ccaa9c46 100755 --- a/git-hooks/pre-commit +++ b/git-hooks/pre-commit @@ -7,8 +7,12 @@ ROOT_DIR="$(git rev-parse --show-toplevel 2>/dev/null || pwd)" # Resolve node when not in PATH (e.g. nvm environments where the shell # profile hasn't been sourced by the git hook). Picks the newest installed # nvm version using version-aware sort so that v22 is preferred over v18. -# shellcheck source=scripts/pre-commit/resolve-node.sh -source "$ROOT_DIR/scripts/pre-commit/resolve-node.sh" +_resolve_node_sh="$ROOT_DIR/scripts/pre-commit/resolve-node.sh" +if [[ -f "$_resolve_node_sh" ]]; then + # shellcheck source=scripts/pre-commit/resolve-node.sh + source "$_resolve_node_sh" +fi +unset _resolve_node_sh RUN_NODE_TOOL="$ROOT_DIR/scripts/pre-commit/run-node-tool.sh" FILTER_FILES="$ROOT_DIR/scripts/pre-commit/filter-staged-files.mjs" diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index 2d071978510..992c3e945ea 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -1,3 +1,4 @@ +import { randomBytes } from "crypto"; import fs from "fs/promises"; import path from "path"; @@ -19,7 +20,7 @@ export type TokenUsageRecord = { }; function makeId() { - return `usage_${Date.now().toString(36)}_${Math.random().toString(36).slice(2, 8)}`; + return `usage_${Date.now().toString(36)}_${randomBytes(4).toString("hex")}`; } async function readJsonArray(file: string): Promise { From f4c4ab416d6d2ada5374f6ab7b9bcdf46396890d Mon Sep 17 00:00:00 2001 From: jiarung Date: Fri, 13 Mar 2026 14:59:24 +0000 Subject: [PATCH 11/29] style(attempt): fix import order to pass oxfmt check --- src/agents/pi-embedded-runner/run/attempt.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts index 3aedfcb84a9..8e72cf22b7c 100644 --- a/src/agents/pi-embedded-runner/run/attempt.ts +++ b/src/agents/pi-embedded-runner/run/attempt.ts @@ -93,6 +93,7 @@ import { sanitizeToolCallIdsForCloudCodeAssist } from "../../tool-call-id.js"; import { resolveEffectiveToolFsWorkspaceOnly } from "../../tool-fs-policy.js"; import { normalizeToolName } from "../../tool-policy.js"; import { resolveTranscriptPolicy } from "../../transcript-policy.js"; +import { recordTokenUsage } from "../../usage-log.js"; import { DEFAULT_BOOTSTRAP_FILENAME } from "../../workspace.js"; import { isRunnerAbortError } from "../abort.js"; import { appendCacheTtlTimestamp, isCacheTtlEligibleProvider } from "../cache-ttl.js"; @@ -135,7 +136,6 @@ import { import { pruneProcessedHistoryImages } from "./history-image-prune.js"; import { detectAndLoadPromptImages } from "./images.js"; import type { EmbeddedRunAttemptParams, EmbeddedRunAttemptResult } from "./types.js"; -import { recordTokenUsage } from "../../usage-log.js"; type PromptBuildHookRunner = { hasHooks: (hookName: "before_prompt_build" | "before_agent_start") => boolean; From f267ff78882b0a14e617af759c314f2705fb4989 Mon Sep 17 00:00:00 2001 From: jiarung Date: Fri, 13 Mar 2026 16:04:22 +0000 Subject: [PATCH 12/29] fix(usage-log): add cross-process file lock to prevent write races MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The in-memory writeQueues Map serialises writes within one Node process but two concurrent OpenClaw processes sharing the same workspaceDir (e.g. parallel CLI runs) can still race: both read the same snapshot before either writes, and the later writer silently overwrites the earlier entry. Add withFileLock() — an O_EXCL advisory lock on .lock — to coordinate across processes. The per-file in-memory queue is kept to reduce lock contention within the same process. On lock-acquire failure the helper retries every 50 ms up to a 5 s timeout; on timeout it removes a potentially stale lock file and makes one final attempt to prevent permanent blocking after a crash. --- src/agents/usage-log.test.ts | 19 +++++++++++ src/agents/usage-log.ts | 63 ++++++++++++++++++++++++++++++++---- 2 files changed, 75 insertions(+), 7 deletions(-) diff --git a/src/agents/usage-log.test.ts b/src/agents/usage-log.test.ts index b61b3154703..e61c48af23d 100644 --- a/src/agents/usage-log.test.ts +++ b/src/agents/usage-log.test.ts @@ -140,6 +140,25 @@ describe("recordTokenUsage", () => { expect(content).toBe('{"broken":true'); }); + it("cross-process lock: concurrent writers via file lock do not lose records", async () => { + // Simulate two processes bypassing the in-memory queue by calling + // recordTokenUsage from independent promise chains simultaneously. + // If the file lock is working they must still land all records. + const N = 10; + const writes = Array.from({ length: N }, (_, i) => { + // Each call is deliberately NOT chained — they race on the file lock. + return recordTokenUsage({ + workspaceDir: tmpDir, + label: "llm_output", + usage: { input: i + 1, output: 1, total: i + 2 }, + }); + }); + await Promise.all(writes); + + const records = JSON.parse(await fs.readFile(usageFile, "utf-8")); + expect(records).toHaveLength(N); + }); + it("serialises concurrent writes — no record is lost", async () => { const N = 20; await Promise.all( diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index 992c3e945ea..22966a1cee8 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -40,15 +40,64 @@ async function readJsonArray(file: string): Promise { } } -async function appendRecord(file: string, entry: TokenUsageRecord): Promise { - const records = await readJsonArray(file); - records.push(entry); - await fs.writeFile(file, JSON.stringify(records, null, 2)); +// --------------------------------------------------------------------------- +// Cross-process file lock +// +// The in-memory writeQueues Map serialises writes within a single Node +// process, but two concurrent OpenClaw processes targeting the same +// workspaceDir can still race: both read the same snapshot before either +// writes. We guard against that with an advisory lock file (.lock) using +// O_EXCL (create-exclusive), which is atomic on POSIX filesystems. +// +// Lock acquisition retries with a fixed interval up to LOCK_TIMEOUT_MS. +// If the holding process crashes the stale lock is removed after the +// timeout so subsequent callers are not permanently blocked. +// --------------------------------------------------------------------------- +const LOCK_RETRY_MS = 50; +const LOCK_TIMEOUT_MS = 5_000; + +async function withFileLock(lockPath: string, fn: () => Promise): Promise { + const deadline = Date.now() + LOCK_TIMEOUT_MS; + + while (Date.now() < deadline) { + let fh: fs.FileHandle | undefined; + try { + // wx = O_WRONLY | O_CREAT | O_EXCL — fails if the file already exists + fh = await fs.open(lockPath, "wx"); + await fh.close(); + fh = undefined; + try { + return await fn(); + } finally { + await fs.unlink(lockPath).catch(() => {}); + } + } catch (err) { + await fh?.close().catch(() => {}); + if ((err as NodeJS.ErrnoException).code !== "EEXIST") { + throw err; + } + // Another process holds the lock — wait and retry. + await new Promise((r) => setTimeout(r, LOCK_RETRY_MS)); + } + } + + // Timeout: remove a potentially stale lock and make one final attempt. + await fs.unlink(lockPath).catch(() => {}); + const records = await fn(); + return records; } -// Per-file write queue: serialises concurrent recordTokenUsage() calls so that -// a fire-and-forget caller cannot cause two concurrent writers to read the same -// snapshot and overwrite each other's entry. +async function appendRecord(file: string, entry: TokenUsageRecord): Promise { + const lockPath = `${file}.lock`; + await withFileLock(lockPath, async () => { + const records = await readJsonArray(file); + records.push(entry); + await fs.writeFile(file, JSON.stringify(records, null, 2)); + }); +} + +// Per-file write queue: serialises concurrent recordTokenUsage() calls within +// the same process so they do not all contend on the cross-process file lock. const writeQueues = new Map>(); export async function recordTokenUsage(params: { From a7a7923d09d6621cedba8cde3da3214cfacb5e3a Mon Sep 17 00:00:00 2001 From: jiarung Date: Fri, 13 Mar 2026 23:35:12 +0000 Subject: [PATCH 13/29] fix(usage-log): reject non-array token logs instead of resetting history MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit readJsonArray treated any valid JSON that is not an array as [], causing appendRecord to overwrite the file with only the new entry — silently deleting all prior data. This is the same data-loss mode the malformed-JSON fix was trying to prevent. Fix: throw ERR_UNEXPECTED_TOKEN_LOG_SHAPE when parsed JSON is not an array so appendRecord aborts and the existing file is preserved. --- src/agents/usage-log.test.ts | 18 ++++++++++++++++++ src/agents/usage-log.ts | 13 ++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/agents/usage-log.test.ts b/src/agents/usage-log.test.ts index e61c48af23d..37ea1a92e54 100644 --- a/src/agents/usage-log.test.ts +++ b/src/agents/usage-log.test.ts @@ -120,6 +120,24 @@ describe("recordTokenUsage", () => { expect(records[0].outputTokens).toBe(50); }); + it("does not overwrite a valid-but-non-array token-usage.json — rejects unexpected shape", async () => { + // Simulate a manual edit or migration that left a valid JSON object + await fs.mkdir(path.join(tmpDir, "memory"), { recursive: true }); + await fs.writeFile(usageFile, '{"legacy": true, "records": []}', "utf-8"); + + await expect( + recordTokenUsage({ + workspaceDir: tmpDir, + label: "llm_output", + usage: { input: 100, output: 50, total: 150 }, + }), + ).rejects.toThrow("not an array"); + + // File must be unchanged — the legacy data is preserved. + const content = await fs.readFile(usageFile, "utf-8"); + expect(content).toBe('{"legacy": true, "records": []}'); + }); + it("does not overwrite a malformed token-usage.json — preserves corrupted file", async () => { // Simulate an interrupted write that left partial JSON await fs.mkdir(path.join(tmpDir, "memory"), { recursive: true }); diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index 22966a1cee8..2370ee9b24e 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -27,7 +27,18 @@ async function readJsonArray(file: string): Promise { try { const raw = await fs.readFile(file, "utf-8"); const parsed = JSON.parse(raw); - return Array.isArray(parsed) ? (parsed as TokenUsageRecord[]) : []; + if (!Array.isArray(parsed)) { + // Valid JSON but unexpected shape (object, number, string, …). + // Returning [] here would cause appendRecord to overwrite the file + // with only the new entry, silently deleting prior data. + throw Object.assign( + new Error( + `token-usage.json contains valid JSON but is not an array (got ${typeof parsed})`, + ), + { code: "ERR_UNEXPECTED_TOKEN_LOG_SHAPE" }, + ); + } + return parsed as TokenUsageRecord[]; } catch (err) { // File does not exist yet — start with an empty array. if ((err as NodeJS.ErrnoException).code === "ENOENT") { From 13b0c1d0109ff6d5bf021c5f0037114106ebc3f1 Mon Sep 17 00:00:00 2001 From: jiarung Date: Fri, 13 Mar 2026 23:41:25 +0000 Subject: [PATCH 14/29] fix(usage-log): reacquire lock via O_EXCL after timeout instead of running unlocked After the retry loop timed out, withFileLock unconditionally deleted the lock file and called fn() without reacquiring the lock. If multiple waiters timed out concurrently they would all enter the critical section together, defeating the serialisation guarantee and allowing concurrent read-modify-write cycles to overwrite each other's records. Fix: after unlinking the stale lock, attempt one final O_EXCL open so that exactly one concurrent waiter wins the lock and the rest receive ERR_LOCK_TIMEOUT. The unlocked fast-path is removed entirely. --- src/agents/usage-log.ts | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index 2370ee9b24e..40ae3daef4e 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -92,10 +92,32 @@ async function withFileLock(lockPath: string, fn: () => Promise): Promise< } } - // Timeout: remove a potentially stale lock and make one final attempt. + // Timed out waiting for the lock. Remove a potentially stale lock file + // (left behind by a crashed process) and make one final attempt to acquire + // it through the normal O_EXCL path. This ensures the write is always + // serialised: if the stale file is gone another waiter that also timed out + // concurrently will race on O_EXCL and only one of them will win. await fs.unlink(lockPath).catch(() => {}); - const records = await fn(); - return records; + + // Re-enter the acquisition loop for a single attempt (deadline already + // passed so the while condition is false; open directly instead). + let fh: fs.FileHandle | undefined; + try { + fh = await fs.open(lockPath, "wx"); + await fh.close(); + fh = undefined; + try { + return await fn(); + } finally { + await fs.unlink(lockPath).catch(() => {}); + } + } catch (err) { + await fh?.close().catch(() => {}); + throw Object.assign( + new Error(`Could not acquire lock ${lockPath} within ${LOCK_TIMEOUT_MS}ms`), + { code: "ERR_LOCK_TIMEOUT", cause: err }, + ); + } } async function appendRecord(file: string, entry: TokenUsageRecord): Promise { From 00e05fb4e547889b930441a2dae048726fc27608 Mon Sep 17 00:00:00 2001 From: jiarung Date: Sat, 14 Mar 2026 16:35:39 +0000 Subject: [PATCH 15/29] =?UTF-8?q?fix(usage-log):=20do=20not=20delete=20loc?= =?UTF-8?q?k=20on=20timeout=20=E2=80=94=20holder=20may=20still=20be=20acti?= =?UTF-8?q?ve?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unconditionally unlinking the lock file after LOCK_TIMEOUT_MS is unsafe: the holder may legitimately still be running (slow disk, large usage file), so removing its lock breaks mutual exclusion and allows concurrent read-modify-write cycles to overwrite each other's entries. Remove the stale-lock-removal path entirely and throw ERR_LOCK_TIMEOUT instead. Callers already swallow the error via .catch() in the write queue, so the only effect is that the write is skipped rather than risking data loss through a race. --- src/agents/usage-log.ts | 37 +++++++++---------------------------- 1 file changed, 9 insertions(+), 28 deletions(-) diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index 40ae3daef4e..1738e0db4c1 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -61,8 +61,8 @@ async function readJsonArray(file: string): Promise { // O_EXCL (create-exclusive), which is atomic on POSIX filesystems. // // Lock acquisition retries with a fixed interval up to LOCK_TIMEOUT_MS. -// If the holding process crashes the stale lock is removed after the -// timeout so subsequent callers are not permanently blocked. +// On timeout ERR_LOCK_TIMEOUT is thrown; the lock file is intentionally left +// untouched so that an active holder's mutual exclusion is never broken. // --------------------------------------------------------------------------- const LOCK_RETRY_MS = 50; const LOCK_TIMEOUT_MS = 5_000; @@ -92,32 +92,13 @@ async function withFileLock(lockPath: string, fn: () => Promise): Promise< } } - // Timed out waiting for the lock. Remove a potentially stale lock file - // (left behind by a crashed process) and make one final attempt to acquire - // it through the normal O_EXCL path. This ensures the write is always - // serialised: if the stale file is gone another waiter that also timed out - // concurrently will race on O_EXCL and only one of them will win. - await fs.unlink(lockPath).catch(() => {}); - - // Re-enter the acquisition loop for a single attempt (deadline already - // passed so the while condition is false; open directly instead). - let fh: fs.FileHandle | undefined; - try { - fh = await fs.open(lockPath, "wx"); - await fh.close(); - fh = undefined; - try { - return await fn(); - } finally { - await fs.unlink(lockPath).catch(() => {}); - } - } catch (err) { - await fh?.close().catch(() => {}); - throw Object.assign( - new Error(`Could not acquire lock ${lockPath} within ${LOCK_TIMEOUT_MS}ms`), - { code: "ERR_LOCK_TIMEOUT", cause: err }, - ); - } + // Timed out without acquiring the lock. We deliberately do NOT delete the + // lock file here: the holder may still be active (slow disk, large file), + // and removing a live lock would break mutual exclusion and allow concurrent + // read-modify-write cycles to overwrite each other's entries. + throw Object.assign(new Error(`Could not acquire lock ${lockPath} within ${LOCK_TIMEOUT_MS}ms`), { + code: "ERR_LOCK_TIMEOUT", + }); } async function appendRecord(file: string, entry: TokenUsageRecord): Promise { From 1a5489bf32cf3c83aa788a53af57e90b23719a31 Mon Sep 17 00:00:00 2001 From: jiarung Date: Sat, 14 Mar 2026 16:48:08 +0000 Subject: [PATCH 16/29] fix(usage-log): recover stale lock left by abnormal process exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A process killed or crashed after creating token-usage.json.lock but before the finally-unlink runs leaves a permanent stale lock. All subsequent recordTokenUsage calls for that workspace time out and drop their entries. Fix: - Write the holder's PID into the lock file on acquisition (O_EXCL + writeFile). - On each EEXIST retry, call isLockStale() which reads the PID and sends signal 0 (kill(pid, 0)) to check liveness without delivering a signal. ESRCH means the process is gone → lock is stale; any other result (alive, EPERM, unreadable file) is treated as live so we never break a legitimately held lock. - If stale, unlink and continue to the next O_EXCL attempt; multiple concurrent waiters racing on the steal are safe because only one O_EXCL open succeeds. - Recovery is immediate (no need to wait for LOCK_TIMEOUT_MS). Add a test that spawns a subprocess, waits for it to exit, writes its dead PID into the lock file, and asserts recordTokenUsage succeeds and cleans up the lock. --- src/agents/usage-log.test.ts | 34 +++++++++++++++++++++ src/agents/usage-log.ts | 57 ++++++++++++++++++++++++++++++------ 2 files changed, 82 insertions(+), 9 deletions(-) diff --git a/src/agents/usage-log.test.ts b/src/agents/usage-log.test.ts index 37ea1a92e54..d308bf7e34e 100644 --- a/src/agents/usage-log.test.ts +++ b/src/agents/usage-log.test.ts @@ -1,3 +1,4 @@ +import { spawn } from "child_process"; import fs from "fs/promises"; import os from "os"; import path from "path"; @@ -177,6 +178,39 @@ describe("recordTokenUsage", () => { expect(records).toHaveLength(N); }); + it("reclaims stale lock left by a crashed process", async () => { + // Spawn a subprocess that exits immediately, then use its (now-dead) PID + // to simulate a lock file left behind after an abnormal exit. + const deadPid = await new Promise((resolve, reject) => { + const child = spawn(process.execPath, ["-e", "process.exit(0)"]); + const pid = child.pid!; + child.on("exit", () => resolve(pid)); + child.on("error", reject); + }); + + const memoryDir = path.join(tmpDir, "memory"); + await fs.mkdir(memoryDir, { recursive: true }); + const lockPath = path.join(memoryDir, "token-usage.json.lock"); + await fs.writeFile(lockPath, String(deadPid)); + + // recordTokenUsage must detect the stale lock, reclaim it, and succeed. + await recordTokenUsage({ + workspaceDir: tmpDir, + label: "llm_output", + usage: { input: 100, output: 50, total: 150 }, + }); + + const records = JSON.parse(await fs.readFile(usageFile, "utf-8")); + expect(records).toHaveLength(1); + expect(records[0].tokensUsed).toBe(150); + // Lock file must be cleaned up by the winner. + const lockExists = await fs + .access(lockPath) + .then(() => true) + .catch(() => false); + expect(lockExists).toBe(false); + }); + it("serialises concurrent writes — no record is lost", async () => { const N = 20; await Promise.all( diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index 1738e0db4c1..c8a1cdb254c 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -60,21 +60,53 @@ async function readJsonArray(file: string): Promise { // writes. We guard against that with an advisory lock file (.lock) using // O_EXCL (create-exclusive), which is atomic on POSIX filesystems. // -// Lock acquisition retries with a fixed interval up to LOCK_TIMEOUT_MS. -// On timeout ERR_LOCK_TIMEOUT is thrown; the lock file is intentionally left -// untouched so that an active holder's mutual exclusion is never broken. +// The lock file stores the holder's PID so that waiters can detect a stale +// lock left by a crashed process. On each EEXIST the waiter reads the PID +// and calls kill(pid, 0): if the process no longer exists (ESRCH) the lock +// is stale and is reclaimed immediately via a fresh O_EXCL open, preserving +// mutual exclusion even when multiple waiters race for the steal. If the +// holder is alive the waiter backs off for LOCK_RETRY_MS and retries. +// After LOCK_TIMEOUT_MS without acquiring the lock ERR_LOCK_TIMEOUT is +// thrown; the lock file is left untouched to avoid breaking a live holder. // --------------------------------------------------------------------------- const LOCK_RETRY_MS = 50; const LOCK_TIMEOUT_MS = 5_000; +/** + * Returns true only when the lock at `lockPath` was written by a process + * that no longer exists (ESRCH). Any other outcome (process alive, EPERM, + * unreadable file, non-numeric content) is treated as "not stale" so we + * never break a legitimately held lock. + */ +async function isLockStale(lockPath: string): Promise { + try { + const raw = await fs.readFile(lockPath, "utf-8"); + const pid = parseInt(raw.trim(), 10); + if (isNaN(pid) || pid <= 0) { + return false; + } + try { + process.kill(pid, 0); // signal 0 checks existence without delivering a signal + return false; // process is alive + } catch (e) { + return (e as NodeJS.ErrnoException).code === "ESRCH"; + } + } catch { + return false; // can't read lock — treat as live + } +} + async function withFileLock(lockPath: string, fn: () => Promise): Promise { const deadline = Date.now() + LOCK_TIMEOUT_MS; + const myPid = String(process.pid); while (Date.now() < deadline) { let fh: fs.FileHandle | undefined; try { - // wx = O_WRONLY | O_CREAT | O_EXCL — fails if the file already exists + // wx = O_WRONLY | O_CREAT | O_EXCL — fails if the file already exists. + // Write our PID so that a waiting process can verify we are still alive. fh = await fs.open(lockPath, "wx"); + await fh.writeFile(myPid); await fh.close(); fh = undefined; try { @@ -87,15 +119,22 @@ async function withFileLock(lockPath: string, fn: () => Promise): Promise< if ((err as NodeJS.ErrnoException).code !== "EEXIST") { throw err; } - // Another process holds the lock — wait and retry. + // The lock file exists. Check immediately whether the holder crashed; + // if so, unlink the stale lock and loop back to race on O_EXCL. + // Multiple concurrent waiters may all detect the stale lock and attempt + // the unlink — that is fine because only one O_EXCL open will succeed. + if (await isLockStale(lockPath)) { + await fs.unlink(lockPath).catch(() => {}); + continue; + } + // Holder is alive — back off and retry. await new Promise((r) => setTimeout(r, LOCK_RETRY_MS)); } } - // Timed out without acquiring the lock. We deliberately do NOT delete the - // lock file here: the holder may still be active (slow disk, large file), - // and removing a live lock would break mutual exclusion and allow concurrent - // read-modify-write cycles to overwrite each other's entries. + // Timed out without acquiring the lock. The lock file is intentionally left + // untouched: the holder may still be active (slow disk, large file), and + // removing a live lock would break mutual exclusion. throw Object.assign(new Error(`Could not acquire lock ${lockPath} within ${LOCK_TIMEOUT_MS}ms`), { code: "ERR_LOCK_TIMEOUT", }); From 8c162d0ba4937a8ee99eeea58906788e47d6abed Mon Sep 17 00:00:00 2001 From: jiarung Date: Sat, 14 Mar 2026 18:41:39 +0000 Subject: [PATCH 17/29] fix(usage-log): write via temp file and atomic rename to prevent corruption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit appendRecord previously called fs.writeFile(token-usage.json, …) directly. A process crash or SIGKILL during that write can leave the file truncated; readJsonArray then throws (SyntaxError), and since attempt.ts swallows the error with .catch(), that one interrupted write silently disables all future token logging for the workspace until the file is manually repaired. Fix: write the new content to a uniquely-named sibling temp file first, then call fs.rename() to atomically replace the real file. rename(2) is atomic on POSIX when src and dst share the same directory/filesystem, so readers always see either the old complete file or the new complete file — never a partial write. The temp file is unlinked on error to avoid leaving orphans. --- src/agents/usage-log.ts | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index c8a1cdb254c..43bfaabeab1 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -145,7 +145,18 @@ async function appendRecord(file: string, entry: TokenUsageRecord): Promise { const records = await readJsonArray(file); records.push(entry); - await fs.writeFile(file, JSON.stringify(records, null, 2)); + // Write to a sibling temp file then atomically rename into place so that + // a crash or kill during the write never leaves token-usage.json truncated. + // rename(2) is atomic on POSIX when src and dst are on the same filesystem, + // which is guaranteed here because both paths share the same directory. + const tmp = `${file}.tmp.${randomBytes(4).toString("hex")}`; + try { + await fs.writeFile(tmp, JSON.stringify(records, null, 2)); + await fs.rename(tmp, file); + } catch (err) { + await fs.unlink(tmp).catch(() => {}); + throw err; + } }); } From d3971e77fdcf80202ded5a548fa88458eaff701a Mon Sep 17 00:00:00 2001 From: jiarung Date: Sun, 15 Mar 2026 05:14:07 +0000 Subject: [PATCH 18/29] fix(git-hooks): replace GNU-only sort -V with portable zero-pad sort MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sort -V is a GNU extension; BSD sort on macOS does not support it. When node is absent from PATH and the nvm fallback runs, set -euo pipefail causes the unsupported flag to abort the hook before lint/format can run, blocking commits on macOS. Replace the sort -V | tail -1 pipeline with a Bash for-loop that zero-pads each semver component to five digits and emits a tab-delimited key+path line. Plain sort + tail -1 + cut then selects the highest semantic version — no GNU-only flags required. Smoke-tested with v18 vs v22 paths; v22 is correctly selected on both GNU and BSD sort. --- scripts/pre-commit/resolve-node.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/scripts/pre-commit/resolve-node.sh b/scripts/pre-commit/resolve-node.sh index a3f484c7029..f51226889fc 100644 --- a/scripts/pre-commit/resolve-node.sh +++ b/scripts/pre-commit/resolve-node.sh @@ -2,17 +2,21 @@ # Resolve the newest nvm-managed Node when it is not already in PATH. # Source this file; do not execute it directly. # -# Uses `sort -V` (version-aware sort) so that semantic version order is -# respected — e.g. v22.x is chosen over v18.x even though "v1" sorts -# before "v2" lexicographically. +# Cross-platform: avoids GNU-only `sort -V` (not supported by BSD sort on +# macOS). Instead, each semver component is zero-padded to five digits so +# that a plain lexicographic sort correctly selects the highest semantic +# version (e.g. v22.x wins over v18.x). if ! command -v node >/dev/null 2>&1; then _nvm_node=$( - ls -d "$HOME/.nvm/versions/node"/*/bin/node 2>/dev/null \ - | sort -V \ - | tail -1 + for _p in "$HOME/.nvm/versions/node"/*/bin/node; do + [[ -x "$_p" ]] || continue + _ver="${_p%/bin/node}"; _ver="${_ver##*/v}" + IFS=. read -r _ma _mi _pa <<< "$_ver" + printf '%05d%05d%05d\t%s\n' "${_ma:-0}" "${_mi:-0}" "${_pa:-0}" "$_p" + done | sort | tail -1 | cut -f2- ) if [[ -x "$_nvm_node" ]]; then export PATH="$(dirname "$_nvm_node"):$PATH" fi - unset _nvm_node + unset _nvm_node _p _ver _ma _mi _pa fi From c5c92e6be1027e10f278cecadd61f07e377eebfd Mon Sep 17 00:00:00 2001 From: jiarung Date: Sun, 15 Mar 2026 05:42:29 +0000 Subject: [PATCH 19/29] fix(file-lock): reclaim lock files with invalid or empty content MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The lock file is created (empty) by open("wx") before pid/createdAt are written by the subsequent writeFile. A process that crashes in this narrow window leaves an empty .lock file whose content readLockPayload() cannot parse (returns null). Previously isStaleLock skipped both the pid-alive and the age checks when payload was null, falling through to the mtime stat. If the mtime was still within staleMs the function returned false, making the empty lock appear live indefinitely — every future writer would time out and silently drop its usage record until the file was manually deleted. Fix: treat null payload (empty, truncated, or non-JSON content) as stale immediately. Such a file could only have been left by a process that never completed the write, so it is safe to reclaim without waiting for the mtime timeout. The mtime stat fallback is also removed: its only useful case was exactly this null-payload scenario (it was redundant when payload is valid, since the pid-alive and createdAt-age checks already cover the live-lock and aged-out-lock cases). Tests added: - empty lock file → reclaimed, callback runs - truncated/invalid JSON lock file → reclaimed - pid field not a number → reclaimed --- src/plugin-sdk/file-lock.test.ts | 91 ++++++++++++++++++++++++++++++++ src/plugin-sdk/file-lock.ts | 23 ++++---- 2 files changed, 103 insertions(+), 11 deletions(-) create mode 100644 src/plugin-sdk/file-lock.test.ts diff --git a/src/plugin-sdk/file-lock.test.ts b/src/plugin-sdk/file-lock.test.ts new file mode 100644 index 00000000000..e5d015ad9cf --- /dev/null +++ b/src/plugin-sdk/file-lock.test.ts @@ -0,0 +1,91 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; +import { withFileLock } from "./file-lock.js"; + +const LOCK_OPTIONS = { + retries: { + retries: 2, + factor: 1, + minTimeout: 20, + maxTimeout: 50, + }, + stale: 5_000, +}; + +describe("withFileLock", () => { + let tmpDir: string; + let targetFile: string; + + beforeEach(async () => { + tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "file-lock-test-")); + targetFile = path.join(tmpDir, "data.json"); + }); + + afterEach(async () => { + await fs.rm(tmpDir, { recursive: true, force: true }); + }); + + it("acquires and releases the lock, allowing a second caller to proceed", async () => { + const order: string[] = []; + await withFileLock(targetFile, LOCK_OPTIONS, async () => { + order.push("first-start"); + await new Promise((r) => setTimeout(r, 10)); + order.push("first-end"); + }); + await withFileLock(targetFile, LOCK_OPTIONS, async () => { + order.push("second"); + }); + expect(order).toEqual(["first-start", "first-end", "second"]); + }); + + it("reclaims an empty lock file left by a crash between open and writeFile", async () => { + // Simulate a crash in the open("wx")-to-writeFile window: the .lock file + // exists but has empty (unparseable) content. + const lockPath = `${targetFile}.lock`; + await fs.mkdir(path.dirname(targetFile), { recursive: true }); + await fs.writeFile(lockPath, ""); // empty — no pid/createdAt written + + // withFileLock must not time out; it should reclaim the empty lock and + // run the callback without error. + let ran = false; + await expect( + withFileLock(targetFile, LOCK_OPTIONS, async () => { + ran = true; + }), + ).resolves.toBeUndefined(); + expect(ran).toBe(true); + }); + + it("reclaims a lock file containing partial/invalid JSON", async () => { + const lockPath = `${targetFile}.lock`; + await fs.mkdir(path.dirname(targetFile), { recursive: true }); + await fs.writeFile(lockPath, '{"pid":'); // truncated JSON + + let ran = false; + await expect( + withFileLock(targetFile, LOCK_OPTIONS, async () => { + ran = true; + }), + ).resolves.toBeUndefined(); + expect(ran).toBe(true); + }); + + it("reclaims a lock file whose pid field is not a number", async () => { + const lockPath = `${targetFile}.lock`; + await fs.mkdir(path.dirname(targetFile), { recursive: true }); + await fs.writeFile( + lockPath, + JSON.stringify({ pid: "not-a-number", createdAt: new Date().toISOString() }), + ); + + let ran = false; + await expect( + withFileLock(targetFile, LOCK_OPTIONS, async () => { + ran = true; + }), + ).resolves.toBeUndefined(); + expect(ran).toBe(true); + }); +}); diff --git a/src/plugin-sdk/file-lock.ts b/src/plugin-sdk/file-lock.ts index 98277381868..ee80bcd1d81 100644 --- a/src/plugin-sdk/file-lock.ts +++ b/src/plugin-sdk/file-lock.ts @@ -64,21 +64,22 @@ async function resolveNormalizedFilePath(filePath: string): Promise { async function isStaleLock(lockPath: string, staleMs: number): Promise { const payload = await readLockPayload(lockPath); - if (payload?.pid && !isPidAlive(payload.pid)) { + // A lock file with missing or unparseable content was left by a process + // that crashed between open("wx") (which creates the file) and the + // subsequent writeFile (which fills in the pid/createdAt). Treat it as + // stale immediately so it can be reclaimed rather than blocking every + // future writer until the mtime-based timeout expires. + if (payload === null) { return true; } - if (payload?.createdAt) { - const createdAt = Date.parse(payload.createdAt); - if (!Number.isFinite(createdAt) || Date.now() - createdAt > staleMs) { - return true; - } - } - try { - const stat = await fs.stat(lockPath); - return Date.now() - stat.mtimeMs > staleMs; - } catch { + if (!isPidAlive(payload.pid)) { return true; } + const createdAt = Date.parse(payload.createdAt); + if (!Number.isFinite(createdAt) || Date.now() - createdAt > staleMs) { + return true; + } + return false; } export type FileLockHandle = { From 8d636b8a61e7c97e9cdd27af8b4fb53e319dc1f5 Mon Sep 17 00:00:00 2001 From: jiarung Date: Sun, 15 Mar 2026 05:56:09 +0000 Subject: [PATCH 20/29] fix(file-lock): guard stale-lock reclaim with inode identity check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TOCTOU in the stale-lock branch: isStaleLock(lockPath) returning true is evaluated under several awaits before unlink is called. If two waiters (same process or different processes) both observe the same stale file, waiter A can unlink, create a fresh lock, and start fn(), then waiter B's delayed unlink removes A's fresh file. B then wins open(O_EXCL) and both A and B execute fn() concurrently, breaking the read-modify-write guarantee for token-usage.json. Fix: snapshot the lock file's inode immediately after the EEXIST, then re-stat right before the unlink. If the inode changed between the two stats, a concurrent waiter already reclaimed the stale file and wrote a fresh lock; leave the new file alone and continue to the next open(O_EXCL) attempt. The three-outcome table: staleIno == -1 (file gone by the time we stat) → skip unlink, continue: another waiter already handled it staleIno == currentIno (same stale file still there) → safe to unlink; we and the other waiter(s) racing here all call rm(force:true) — the first succeeds, the rest get silent ENOENT staleIno != currentIno (inode changed — fresh lock in place) → do NOT unlink; continue and let isStaleLock reject the live lock A note on the in-loop HELD_LOCKS re-check that was considered: joining the existing holder inside the retry loop would allow two independent concurrent callers to run fn() simultaneously, which breaks mutual exclusion. HELD_LOCKS reentrant join is intentionally restricted to the entry point of acquireFileLock (recursive/reentrant callers only). Tests added: - two concurrent waiters on a stale lock never overlap inside fn() (maxInside assertion, not just result set) - existing stale-reclaim tests continue to pass --- src/plugin-sdk/file-lock.test.ts | 46 ++++++++++++++++++++++++++++++++ src/plugin-sdk/file-lock.ts | 35 ++++++++++++++++++++++-- 2 files changed, 79 insertions(+), 2 deletions(-) diff --git a/src/plugin-sdk/file-lock.test.ts b/src/plugin-sdk/file-lock.test.ts index e5d015ad9cf..a49005192ea 100644 --- a/src/plugin-sdk/file-lock.test.ts +++ b/src/plugin-sdk/file-lock.test.ts @@ -14,6 +14,18 @@ const LOCK_OPTIONS = { stale: 5_000, }; +// More retries for tests where one waiter must survive while the other holds +// the lock for a non-trivial duration. +const RETRY_LOCK_OPTIONS = { + retries: { + retries: 10, + factor: 1, + minTimeout: 10, + maxTimeout: 30, + }, + stale: 5_000, +}; + describe("withFileLock", () => { let tmpDir: string; let targetFile: string; @@ -88,4 +100,38 @@ describe("withFileLock", () => { ).resolves.toBeUndefined(); expect(ran).toBe(true); }); + + it("two concurrent waiters on a stale lock never overlap inside fn()", async () => { + // Plant a stale lock (dead PID, old timestamp) so both waiters will + // simultaneously enter the stale-reclaim branch. The inode guard must + // prevent the slower waiter's unlink from deleting the faster waiter's + // freshly-acquired lock, which would allow both fn() calls to run + // concurrently and corrupt each other's read-modify-write sequences. + const lockPath = `${targetFile}.lock`; + await fs.mkdir(path.dirname(targetFile), { recursive: true }); + await fs.writeFile(lockPath, JSON.stringify({ pid: 0, createdAt: new Date(0).toISOString() })); + + let inside = 0; // number of concurrent fn() executions + let maxInside = 0; + const results: number[] = []; + + const run = async (id: number) => { + // Use RETRY_LOCK_OPTIONS so the losing waiter has enough budget to + // outlast the winning waiter's 20 ms hold without timing out. + await withFileLock(targetFile, RETRY_LOCK_OPTIONS, async () => { + inside += 1; + maxInside = Math.max(maxInside, inside); + await new Promise((r) => setTimeout(r, 20)); // hold the lock briefly + results.push(id); + inside -= 1; + }); + }; + + // Launch both concurrently so they race on the stale lock. + await Promise.all([run(1), run(2)]); + + // Both callbacks must have run exactly once and never overlapped. + expect(results.toSorted((a, b) => a - b)).toEqual([1, 2]); + expect(maxInside).toBe(1); + }); }); diff --git a/src/plugin-sdk/file-lock.ts b/src/plugin-sdk/file-lock.ts index ee80bcd1d81..061f3335920 100644 --- a/src/plugin-sdk/file-lock.ts +++ b/src/plugin-sdk/file-lock.ts @@ -134,10 +134,41 @@ export async function acquireFileLock( if (code !== "EEXIST") { throw err; } - if (await isStaleLock(lockPath, options.stale)) { - await fs.rm(lockPath, { force: true }).catch(() => undefined); + + // Snapshot the inode of the existing lock file *before* checking + // staleness. We compare it again just before unlinking; if the inode + // has changed in the interim, another waiter already reclaimed the + // stale file and created a fresh lock — deleting it would silently + // break that holder's mutual exclusion guarantee. + const staleIno = await fs + .stat(lockPath) + .then((s) => s.ino) + .catch(() => -1); + + // staleIno === -1 means the file vanished between open(EEXIST) and + // stat — another process already removed it. Skip straight to the + // next open(O_EXCL) attempt. + const isStale = staleIno === -1 || (await isStaleLock(lockPath, options.stale)); + + if (isStale) { + if (staleIno !== -1) { + // Re-verify the path still maps to the same inode we deemed stale. + // If it changed, a concurrent waiter beat us to the reclaim and has + // already written its own fresh lock; leave that file alone. + const currentIno = await fs + .stat(lockPath) + .then((s) => s.ino) + .catch(() => -1); + if (currentIno === staleIno) { + await fs.rm(lockPath, { force: true }).catch(() => undefined); + } + } + // Retry open(O_EXCL) regardless: either we removed the stale lock or + // a concurrent waiter already handled it; either way, the path is now + // either free or holds a fresh lock that isStaleLock will reject. continue; } + if (attempt >= attempts - 1) { break; } From 3e1eda63d9eb64d33deb99dc501aa8d2ba7e2029 Mon Sep 17 00:00:00 2001 From: jiarung Date: Sun, 15 Mar 2026 07:36:31 +0000 Subject: [PATCH 21/29] refactor(usage-log): delegate cross-process lock to plugin-sdk/file-lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit appendRecord wrote token-usage.json in place with a direct fs.writeFile call; a crash or SIGKILL during that write left truncated JSON. Because readJsonArray now throws on any non-ENOENT error (to prevent silent data loss) and recordTokenUsage callers swallow the error via .catch(), one corrupted write permanently disabled all future token logging until the file was manually repaired. The in-place-write bug was fixed in 8c162d0ba via a temp-file + atomic rename approach, but usage-log.ts still carried its own private withFileLock / isLockStale implementation. That inline lock had two known bugs that were fixed in plugin-sdk/file-lock.ts but never applied here: 1. isLockStale treated empty / unparseable lock content as 'not stale' — a process that crashes between open('wx') and writeFile(pid) leaves an empty .lock that appeared live forever, blocking all future writers until it was manually removed. 2. No inode identity check before unlink: two waiters observing the same stale lock could both call unlink; the slower one would delete the faster one's freshly-acquired lock, letting both enter fn() concurrently and race on the read-modify-write sequence. Fix: import withFileLock from infra/file-lock.ts (which re-exports the canonical plugin-sdk implementation) and remove the ~70-line inline lock. APPEND_LOCK_OPTIONS reproduces the previous timeout/retry budget (~100 × 50 ms ≈ 5 s) while gaining all fixes from plugin-sdk/file-lock. The lock payload format changed from a plain PID string to the JSON {pid, createdAt} envelope expected by the shared implementation; the stale-lock integration test is updated to match. --- src/agents/usage-log.test.ts | 6 +- src/agents/usage-log.ts | 105 +++++++---------------------------- 2 files changed, 26 insertions(+), 85 deletions(-) diff --git a/src/agents/usage-log.test.ts b/src/agents/usage-log.test.ts index d308bf7e34e..e250bd63a3b 100644 --- a/src/agents/usage-log.test.ts +++ b/src/agents/usage-log.test.ts @@ -191,7 +191,11 @@ describe("recordTokenUsage", () => { const memoryDir = path.join(tmpDir, "memory"); await fs.mkdir(memoryDir, { recursive: true }); const lockPath = path.join(memoryDir, "token-usage.json.lock"); - await fs.writeFile(lockPath, String(deadPid)); + // withFileLock (plugin-sdk) stores {pid, createdAt} — match that format. + await fs.writeFile( + lockPath, + JSON.stringify({ pid: deadPid, createdAt: new Date().toISOString() }), + ); // recordTokenUsage must detect the stale lock, reclaim it, and succeed. await recordTokenUsage({ diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index 43bfaabeab1..4f04ddd0d01 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -1,6 +1,7 @@ import { randomBytes } from "crypto"; import fs from "fs/promises"; import path from "path"; +import { type FileLockOptions, withFileLock } from "../infra/file-lock.js"; export type TokenUsageRecord = { id: string; @@ -55,94 +56,30 @@ async function readJsonArray(file: string): Promise { // Cross-process file lock // // The in-memory writeQueues Map serialises writes within a single Node -// process, but two concurrent OpenClaw processes targeting the same -// workspaceDir can still race: both read the same snapshot before either -// writes. We guard against that with an advisory lock file (.lock) using -// O_EXCL (create-exclusive), which is atomic on POSIX filesystems. +// process. Two concurrent OpenClaw processes targeting the same +// workspaceDir can still race, so we use an advisory O_EXCL lock provided +// by the shared withFileLock helper in plugin-sdk/file-lock.ts. // -// The lock file stores the holder's PID so that waiters can detect a stale -// lock left by a crashed process. On each EEXIST the waiter reads the PID -// and calls kill(pid, 0): if the process no longer exists (ESRCH) the lock -// is stale and is reclaimed immediately via a fresh O_EXCL open, preserving -// mutual exclusion even when multiple waiters race for the steal. If the -// holder is alive the waiter backs off for LOCK_RETRY_MS and retries. -// After LOCK_TIMEOUT_MS without acquiring the lock ERR_LOCK_TIMEOUT is -// thrown; the lock file is left untouched to avoid breaking a live holder. +// That implementation: +// • stores {pid, createdAt} so waiters can detect a crashed holder +// • treats empty/unparseable lock content as stale (crash during open→write) +// • re-verifies the lock inode before removing it so a slow waiter's +// unlink cannot delete a fresh lock from another process +// • uses exponential backoff with jitter capped at stale ms // --------------------------------------------------------------------------- -const LOCK_RETRY_MS = 50; -const LOCK_TIMEOUT_MS = 5_000; - -/** - * Returns true only when the lock at `lockPath` was written by a process - * that no longer exists (ESRCH). Any other outcome (process alive, EPERM, - * unreadable file, non-numeric content) is treated as "not stale" so we - * never break a legitimately held lock. - */ -async function isLockStale(lockPath: string): Promise { - try { - const raw = await fs.readFile(lockPath, "utf-8"); - const pid = parseInt(raw.trim(), 10); - if (isNaN(pid) || pid <= 0) { - return false; - } - try { - process.kill(pid, 0); // signal 0 checks existence without delivering a signal - return false; // process is alive - } catch (e) { - return (e as NodeJS.ErrnoException).code === "ESRCH"; - } - } catch { - return false; // can't read lock — treat as live - } -} - -async function withFileLock(lockPath: string, fn: () => Promise): Promise { - const deadline = Date.now() + LOCK_TIMEOUT_MS; - const myPid = String(process.pid); - - while (Date.now() < deadline) { - let fh: fs.FileHandle | undefined; - try { - // wx = O_WRONLY | O_CREAT | O_EXCL — fails if the file already exists. - // Write our PID so that a waiting process can verify we are still alive. - fh = await fs.open(lockPath, "wx"); - await fh.writeFile(myPid); - await fh.close(); - fh = undefined; - try { - return await fn(); - } finally { - await fs.unlink(lockPath).catch(() => {}); - } - } catch (err) { - await fh?.close().catch(() => {}); - if ((err as NodeJS.ErrnoException).code !== "EEXIST") { - throw err; - } - // The lock file exists. Check immediately whether the holder crashed; - // if so, unlink the stale lock and loop back to race on O_EXCL. - // Multiple concurrent waiters may all detect the stale lock and attempt - // the unlink — that is fine because only one O_EXCL open will succeed. - if (await isLockStale(lockPath)) { - await fs.unlink(lockPath).catch(() => {}); - continue; - } - // Holder is alive — back off and retry. - await new Promise((r) => setTimeout(r, LOCK_RETRY_MS)); - } - } - - // Timed out without acquiring the lock. The lock file is intentionally left - // untouched: the holder may still be active (slow disk, large file), and - // removing a live lock would break mutual exclusion. - throw Object.assign(new Error(`Could not acquire lock ${lockPath} within ${LOCK_TIMEOUT_MS}ms`), { - code: "ERR_LOCK_TIMEOUT", - }); -} +const APPEND_LOCK_OPTIONS: FileLockOptions = { + // ~100 retries × 50 ms ≈ 5 s total — matches the previous LOCK_TIMEOUT_MS. + retries: { + retries: 100, + factor: 1, + minTimeout: 50, + maxTimeout: 50, + }, + stale: 5_000, +}; async function appendRecord(file: string, entry: TokenUsageRecord): Promise { - const lockPath = `${file}.lock`; - await withFileLock(lockPath, async () => { + await withFileLock(file, APPEND_LOCK_OPTIONS, async () => { const records = await readJsonArray(file); records.push(entry); // Write to a sibling temp file then atomically rename into place so that From 9944231ff450675c77e3923d07ee384079d16d3d Mon Sep 17 00:00:00 2001 From: jiarung Date: Sun, 15 Mar 2026 07:46:31 +0000 Subject: [PATCH 22/29] fix(file-lock,git-hooks): PID reuse detection, null-payload race, prerelease sort MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three independent fixes bundled here because they came from the same review pass. ── 1. Record lock owner identity beyond PID (file-lock) ────────────── Stale-lock detection used only isPidAlive(), but PIDs are reusable. On systems with small PID namespaces (containers, rapid restarts) a crashed writer's PID can be reassigned to an unrelated live process, causing isStaleLock to return false and the lock to appear held indefinitely. Fix: record the process start time (field 22 from /proc/{pid}/stat) alongside pid and createdAt. On Linux, if the current holder's startTime differs from the stored value the PID was recycled and the lock is reclaimed immediately. On other platforms startTime is omitted and the existing createdAt age-check (a reused PID inherits the old timestamp, exceeding staleMs) remains as the fallback. ── 2. Restore mtime fallback for null/unparseable payloads (file-lock) ─ The previous fix treated null payload as immediately stale. But the lock file is created (empty) by open('wx') before writeFile fills in the JSON. A live writer still in that window has an empty file; marking it stale immediately allows a second process to steal the lock and both to enter fn() concurrently. Fix: when payload is null, fall back to the file's mtime. A file younger than staleMs may belong to a live writer and is left alone; a file older than staleMs was definitely orphaned and is reclaimed. A new test asserts that a freshly-created empty lock (recent mtime) is NOT treated as stale. ── 3. Strip prerelease suffix before printf '%05d' (resolve-node.sh) ── When an nvm install has a prerelease directory name (e.g. v22.0.0-rc.1/bin/node), splitting on '.' leaves _pa as '0-rc.1'. printf '%05d' then fails because '0-rc.1' is not an integer, and set -euo pipefail aborts the hook before lint/format can run — the opposite of what the nvm fallback is meant to achieve. Fix: strip the longest non-digit suffix from each component before printf: '0-rc.1' → '0', '14' → '14' (no-op for normal releases). Uses POSIX parameter expansion so it works on both GNU bash and macOS bash 3.x. --- scripts/pre-commit/resolve-node.sh | 7 +++ src/plugin-sdk/file-lock.test.ts | 94 ++++++++++++++++++++++++++++-- src/plugin-sdk/file-lock.ts | 82 ++++++++++++++++++++------ 3 files changed, 162 insertions(+), 21 deletions(-) diff --git a/scripts/pre-commit/resolve-node.sh b/scripts/pre-commit/resolve-node.sh index f51226889fc..60a77ed759b 100644 --- a/scripts/pre-commit/resolve-node.sh +++ b/scripts/pre-commit/resolve-node.sh @@ -12,6 +12,13 @@ if ! command -v node >/dev/null 2>&1; then [[ -x "$_p" ]] || continue _ver="${_p%/bin/node}"; _ver="${_ver##*/v}" IFS=. read -r _ma _mi _pa <<< "$_ver" + # Strip any non-numeric suffix so prerelease tags (e.g. "0-rc.1", "0-nightly") + # do not make printf '%05d' fail under set -euo pipefail. + # ${var%%pattern} removes the longest suffix matching pattern, so + # "0-rc.1" → "0" and "14" → "14" (no-op when already numeric-only). + _ma="${_ma%%[^0-9]*}" + _mi="${_mi%%[^0-9]*}" + _pa="${_pa%%[^0-9]*}" printf '%05d%05d%05d\t%s\n' "${_ma:-0}" "${_mi:-0}" "${_pa:-0}" "$_p" done | sort | tail -1 | cut -f2- ) diff --git a/src/plugin-sdk/file-lock.test.ts b/src/plugin-sdk/file-lock.test.ts index a49005192ea..012b574d7c1 100644 --- a/src/plugin-sdk/file-lock.test.ts +++ b/src/plugin-sdk/file-lock.test.ts @@ -2,8 +2,15 @@ import fs from "node:fs/promises"; import os from "node:os"; import path from "node:path"; import { afterEach, beforeEach, describe, expect, it } from "vitest"; +import { getProcessStartTime } from "../shared/pid-alive.js"; import { withFileLock } from "./file-lock.js"; +/** Set a file's mtime to `msAgo` milliseconds in the past. */ +async function ageFile(filePath: string, msAgo: number): Promise { + const t = (Date.now() - msAgo) / 1000; // fs.utimes takes seconds + await fs.utimes(filePath, t, t); +} + const LOCK_OPTIONS = { retries: { retries: 2, @@ -58,9 +65,11 @@ describe("withFileLock", () => { const lockPath = `${targetFile}.lock`; await fs.mkdir(path.dirname(targetFile), { recursive: true }); await fs.writeFile(lockPath, ""); // empty — no pid/createdAt written + // An empty file with a young mtime is conservatively treated as a live + // writer still in the open→writeFile window. Age it past staleMs so the + // mtime-based fallback marks it as stale. + await ageFile(lockPath, LOCK_OPTIONS.stale + 1_000); - // withFileLock must not time out; it should reclaim the empty lock and - // run the callback without error. let ran = false; await expect( withFileLock(targetFile, LOCK_OPTIONS, async () => { @@ -70,10 +79,28 @@ describe("withFileLock", () => { expect(ran).toBe(true); }); - it("reclaims a lock file containing partial/invalid JSON", async () => { + it("does not reclaim an empty lock file whose mtime is within staleMs (live writer window)", async () => { + // A freshly-created empty lock belongs to a live writer still in the + // open→writeFile window. isStaleLock must NOT treat it as stale. + const lockPath = `${targetFile}.lock`; + await fs.mkdir(path.dirname(targetFile), { recursive: true }); + await fs.writeFile(lockPath, ""); // young mtime: effectively "just opened" + + // withFileLock should time out because the empty lock looks live. + await expect( + withFileLock( + targetFile, + { ...LOCK_OPTIONS, retries: { ...LOCK_OPTIONS.retries, retries: 1 } }, + async () => {}, + ), + ).rejects.toThrow("file lock timeout"); + }); + + it("reclaims a lock file containing partial/invalid JSON aged past staleMs", async () => { const lockPath = `${targetFile}.lock`; await fs.mkdir(path.dirname(targetFile), { recursive: true }); await fs.writeFile(lockPath, '{"pid":'); // truncated JSON + await ageFile(lockPath, LOCK_OPTIONS.stale + 1_000); let ran = false; await expect( @@ -84,13 +111,38 @@ describe("withFileLock", () => { expect(ran).toBe(true); }); - it("reclaims a lock file whose pid field is not a number", async () => { + it("reclaims a lock file whose pid field is not a number, aged past staleMs", async () => { const lockPath = `${targetFile}.lock`; await fs.mkdir(path.dirname(targetFile), { recursive: true }); await fs.writeFile( lockPath, JSON.stringify({ pid: "not-a-number", createdAt: new Date().toISOString() }), ); + await ageFile(lockPath, LOCK_OPTIONS.stale + 1_000); + + let ran = false; + await expect( + withFileLock(targetFile, LOCK_OPTIONS, async () => { + ran = true; + }), + ).resolves.toBeUndefined(); + expect(ran).toBe(true); + }); + + it("reclaims a lock whose live PID has an old createdAt (PID-reuse via age check)", async () => { + // Simulate PID reuse: the lock contains the test process's own (live) PID + // but a createdAt older than staleMs, as if the original holder crashed and + // the OS later assigned the same PID to an unrelated process. The age + // check must reclaim the lock even though the PID is alive. + const lockPath = `${targetFile}.lock`; + await fs.mkdir(path.dirname(targetFile), { recursive: true }); + await fs.writeFile( + lockPath, + JSON.stringify({ + pid: process.pid, + createdAt: new Date(Date.now() - (LOCK_OPTIONS.stale + 5_000)).toISOString(), + }), + ); let ran = false; await expect( @@ -101,6 +153,40 @@ describe("withFileLock", () => { expect(ran).toBe(true); }); + it.skipIf(process.platform !== "linux")( + "reclaims a lock whose PID was recycled by a different process (startTime mismatch)", + async () => { + // On Linux, /proc/{pid}/stat provides a per-process start time that + // survives PID reuse. Write a lock with the current process's PID but a + // startTime that cannot match — isStaleLock should detect the mismatch + // and reclaim immediately, without waiting for createdAt to age out. + const actualStartTime = getProcessStartTime(process.pid); + if (actualStartTime === null) { + // getProcessStartTime returned null unexpectedly on Linux — skip. + return; + } + + const lockPath = `${targetFile}.lock`; + await fs.mkdir(path.dirname(targetFile), { recursive: true }); + await fs.writeFile( + lockPath, + JSON.stringify({ + pid: process.pid, + createdAt: new Date().toISOString(), // recent — age check alone would not fire + startTime: actualStartTime + 999_999, // deliberately wrong + }), + ); + + let ran = false; + await expect( + withFileLock(targetFile, LOCK_OPTIONS, async () => { + ran = true; + }), + ).resolves.toBeUndefined(); + expect(ran).toBe(true); + }, + ); + it("two concurrent waiters on a stale lock never overlap inside fn()", async () => { // Plant a stale lock (dead PID, old timestamp) so both waiters will // simultaneously enter the stale-reclaim branch. The inode guard must diff --git a/src/plugin-sdk/file-lock.ts b/src/plugin-sdk/file-lock.ts index 061f3335920..f09a594d2f6 100644 --- a/src/plugin-sdk/file-lock.ts +++ b/src/plugin-sdk/file-lock.ts @@ -1,6 +1,6 @@ import fs from "node:fs/promises"; import path from "node:path"; -import { isPidAlive } from "../shared/pid-alive.js"; +import { getProcessStartTime, isPidAlive } from "../shared/pid-alive.js"; import { resolveProcessScopedMap } from "../shared/process-scoped-map.js"; export type FileLockOptions = { @@ -17,6 +17,11 @@ export type FileLockOptions = { type LockFilePayload = { pid: number; createdAt: string; + // /proc/{pid}/stat field 22 (clock ticks since boot), recorded at lock + // creation. Present only on Linux; omitted (undefined) on other platforms. + // Used to detect PID recycling: if the same PID is later alive but has a + // different startTime, it belongs to a different process. + startTime?: number; }; type HeldLock = { @@ -44,7 +49,11 @@ async function readLockPayload(lockPath: string): Promise { async function isStaleLock(lockPath: string, staleMs: number): Promise { const payload = await readLockPayload(lockPath); - // A lock file with missing or unparseable content was left by a process - // that crashed between open("wx") (which creates the file) and the - // subsequent writeFile (which fills in the pid/createdAt). Treat it as - // stale immediately so it can be reclaimed rather than blocking every - // future writer until the mtime-based timeout expires. - if (payload === null) { - return true; + + if (payload !== null) { + // PID liveness alone is not enough: the OS can recycle a PID after the + // original holder exits. Three complementary checks handle this: + // + // 1. isPidAlive: fast path — if the PID is gone, the lock is stale. + // + // 2. startTime (Linux only): /proc/{pid}/stat field 22 records how long + // after boot the process started. If the current holder's startTime + // differs from the stored value, the PID was recycled by an unrelated + // process and the lock can be reclaimed immediately. + // + // 3. createdAt age: a reused PID inherits the old creation timestamp, so + // once it exceeds staleMs the lock is reclaimed on any platform. + if (!isPidAlive(payload.pid)) { + return true; + } + if (payload.startTime !== undefined) { + const currentStartTime = getProcessStartTime(payload.pid); + if (currentStartTime !== null && currentStartTime !== payload.startTime) { + return true; // PID was recycled by a different process + } + } + const createdAt = Date.parse(payload.createdAt); + if (!Number.isFinite(createdAt) || Date.now() - createdAt > staleMs) { + return true; + } + return false; } - if (!isPidAlive(payload.pid)) { - return true; + + // payload is null: the lock file exists but its content is empty or + // unparseable. The most likely cause is a crash in the narrow window + // between open("wx") (file created, empty) and writeFile (payload written). + // A live writer still in that window is indistinguishable from a crashed + // one by content alone, so we fall back to the file's mtime: a young file + // (mtime < staleMs ago) may belong to a live writer; an old file was + // definitely orphaned. Treating null as immediately stale would steal the + // lock from a live writer and break mutual exclusion. + try { + const stat = await fs.stat(lockPath); + return Date.now() - stat.mtimeMs > staleMs; + } catch { + return true; // file vanished: another waiter already handled it } - const createdAt = Date.parse(payload.createdAt); - if (!Number.isFinite(createdAt) || Date.now() - createdAt > staleMs) { - return true; - } - return false; } export type FileLockHandle = { @@ -120,8 +157,19 @@ export async function acquireFileLock( for (let attempt = 0; attempt < attempts; attempt += 1) { try { const handle = await fs.open(lockPath, "wx"); + const startTime = getProcessStartTime(process.pid); await handle.writeFile( - JSON.stringify({ pid: process.pid, createdAt: new Date().toISOString() }, null, 2), + JSON.stringify( + { + pid: process.pid, + createdAt: new Date().toISOString(), + // Omit startTime on non-Linux where it is null so the field is + // absent from the JSON rather than present as null. + ...(startTime !== null && { startTime }), + }, + null, + 2, + ), "utf8", ); HELD_LOCKS.set(normalizedFile, { count: 1, handle, lockPath }); From 9f05b36834e31501837e4831d016cca5efd2394e Mon Sep 17 00:00:00 2001 From: jiarung Date: Sun, 15 Mar 2026 08:07:32 +0000 Subject: [PATCH 23/29] fix(usage-log): canonicalize queue key to prevent concurrent writes via path aliases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit writeQueues was keyed by the raw workspaceDir-derived path before any realpath resolution. Two callers using different spellings of the same physical directory (a symlink and its target, or a relative vs absolute path) therefore produced separate queue entries and both entered appendRecord concurrently. Inside appendRecord, withFileLock calls resolveNormalizedFilePath which uses fs.realpath on the directory; both spellings resolve to the same normalised path. If one chain is already in fn() — its entry set in HELD_LOCKS — the second chain's acquireFileLock sees HELD_LOCKS hit for the same normalised path and re-entrantly joins it. Both callbacks then execute the read-modify-write cycle concurrently, and whichever writes last overwrites the first, silently dropping one entry per collision. Fix: call fs.realpath(memoryDir) immediately after fs.mkdir and use the canonical path as both the writeQueues key and the appendRecord file argument. A single canonical key means all in-process writers for the same physical file are serialised through one queue regardless of how the workspace path was spelled by the caller. Test: symlink tmpDir to a second name and interleave concurrent recordTokenUsage calls across both spellings. Asserts all N records survive — regression guard for the path-alias queue split. --- src/agents/usage-log.test.ts | 30 ++++++++++++++++++++++++++++++ src/agents/usage-log.ts | 10 +++++++++- 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/src/agents/usage-log.test.ts b/src/agents/usage-log.test.ts index e250bd63a3b..0922a328989 100644 --- a/src/agents/usage-log.test.ts +++ b/src/agents/usage-log.test.ts @@ -215,6 +215,36 @@ describe("recordTokenUsage", () => { expect(lockExists).toBe(false); }); + it("different path spellings for the same workspace share one queue — no record is lost", async () => { + // Symlink tmpDir → another name so the same physical directory has two + // spellings. Without queue-key canonicalisation both spellings create + // independent writeQueues entries; when one chain holds the file lock + // (HELD_LOCKS set) the other re-entrantly joins it and both execute the + // read-modify-write cycle concurrently, silently dropping entries. + const symlinkDir = `${tmpDir}-symlink`; + await fs.symlink(tmpDir, symlinkDir); + try { + // Mix canonical and symlink paths across concurrent writes. + const N = 6; + await Promise.all( + Array.from({ length: N }, (_, i) => + recordTokenUsage({ + workspaceDir: i % 2 === 0 ? tmpDir : symlinkDir, + label: "llm_output", + usage: { input: i + 1, output: 1, total: i + 2 }, + }), + ), + ); + + // All N records must survive — none may be lost to a concurrent + // read-modify-write collision. + const records = JSON.parse(await fs.readFile(usageFile, "utf-8")); + expect(records).toHaveLength(N); + } finally { + await fs.unlink(symlinkDir).catch(() => {}); + } + }); + it("serialises concurrent writes — no record is lost", async () => { const N = 20; await Promise.all( diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index 4f04ddd0d01..dc640f7410e 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -129,8 +129,16 @@ export async function recordTokenUsage(params: { } const memoryDir = path.join(params.workspaceDir, "memory"); - const file = path.join(memoryDir, "token-usage.json"); await fs.mkdir(memoryDir, { recursive: true }); + // Canonicalize before keying writeQueues so that different path spellings + // for the same physical directory (e.g. a symlink vs its target) share a + // single in-process queue. Without this, two spellings produce separate + // queue entries and both call appendRecord concurrently; when + // withFileLock's HELD_LOCKS map then resolves both to the same normalised + // path the second caller re-entrantly joins the first — allowing concurrent + // read-modify-write cycles that silently drop entries. + const realMemoryDir = await fs.realpath(memoryDir).catch(() => memoryDir); + const file = path.join(realMemoryDir, "token-usage.json"); const entry: TokenUsageRecord = { id: makeId(), From b182845b8035eab57b32c93edca5338acd98620d Mon Sep 17 00:00:00 2001 From: jiarung Date: Sun, 15 Mar 2026 14:06:19 +0000 Subject: [PATCH 24/29] fix(test): use out-of-grace-window timestamp in append skip test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 843e3c1ef restored a recency grace window (60 s) for append messages: messages newer than connectedAtMs - 60 s are still forwarded to onMessage so genuinely recent offline arrivals trigger auto-reply. The test 'handles append messages by marking them read but skipping auto-reply' used nowSeconds() as the message timestamp, which falls inside the grace window and therefore reaches onMessage — contradicting the expect(onMessage).not.toHaveBeenCalled() assertion. Fix: use nowSeconds(-120_000) (2 minutes before now) so the message is clearly outside the grace window and the append-recency filter correctly skips it. --- ...-inbox.allows-messages-from-senders-allowfrom-list.test.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/extensions/whatsapp/src/monitor-inbox.allows-messages-from-senders-allowfrom-list.test.ts b/extensions/whatsapp/src/monitor-inbox.allows-messages-from-senders-allowfrom-list.test.ts index 545a010ed50..75bb94c966f 100644 --- a/extensions/whatsapp/src/monitor-inbox.allows-messages-from-senders-allowfrom-list.test.ts +++ b/extensions/whatsapp/src/monitor-inbox.allows-messages-from-senders-allowfrom-list.test.ts @@ -265,7 +265,9 @@ describe("web monitor inbox", () => { remoteJid: "999@s.whatsapp.net", }, message: { conversation: "old message" }, - messageTimestamp: nowSeconds(), + // Use a timestamp well outside the recency grace window (> 60 s before + // connection) so the append-recency filter correctly skips auto-reply. + messageTimestamp: nowSeconds(-120_000), pushName: "History Sender", }, ], From 01a13e6ab40087c676f88d8cd9d8d4845a263345 Mon Sep 17 00:00:00 2001 From: jiarung Date: Mon, 16 Mar 2026 01:59:17 +0000 Subject: [PATCH 25/29] fix(attempt): pass resolved workspace path into recordTokenUsage runEmbeddedAttempt calls process.chdir(effectiveWorkspace) early in the run, then later invokes recordTokenUsage with the raw params.workspaceDir string. If workspaceDir is a relative path (e.g. ./ws) recordTokenUsage resolves it from the already-changed cwd, producing a nested path (`./ws/ws/memory/token-usage.json`) or an outright failure. Fix: pass effectiveWorkspace (the fully-resolved, sandbox-aware absolute path that was used for every other workspace operation in the run) into recordTokenUsage so usage logs always land in the correct directory. --- src/agents/pi-embedded-runner/run/attempt.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts index f03f87db833..4990cd7cd9e 100644 --- a/src/agents/pi-embedded-runner/run/attempt.ts +++ b/src/agents/pi-embedded-runner/run/attempt.ts @@ -2822,7 +2822,7 @@ export async function runEmbeddedAttempt( } recordTokenUsage({ - workspaceDir: params.workspaceDir, + workspaceDir: effectiveWorkspace, runId: params.runId, sessionId: params.sessionId, sessionKey: params.sessionKey, From d2b7b46604ee10a8257195c19829b55314fcdd3c Mon Sep 17 00:00:00 2001 From: jiarung Date: Mon, 16 Mar 2026 03:07:47 +0000 Subject: [PATCH 26/29] fix(usage-log): increase lock stale window to 30 s to prevent active-lock steals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit appendRecord rewrites the full token-usage.json on every write, so lock hold time grows with file size and disk speed. The previous stale: 5_000 was too tight: on large histories or slow disks a write can legitimately take longer than 5 s, allowing a concurrent waiter to treat the still- active lock as stale, reclaim it, and run an overlapping read-modify-write cycle that silently drops the earlier writer's entry. The risk is amplified by the attempt path where recordTokenUsage is fired without awaiting, so multiple concurrent runs can legitimately overlap. Fix: • Raise stale to 30_000 ms — gives ample headroom for large files on slow disks while still reclaiming crashed-process locks within 30 s. • Match the retry budget: 150 retries × 200 ms ≈ 30 s with jitter, so waiters exhaust retries only when the holder exceeds the stale window (i.e., is genuinely stuck or has crashed). --- src/agents/usage-log.ts | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index dc640f7410e..9ebb3ff580b 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -68,14 +68,25 @@ async function readJsonArray(file: string): Promise { // • uses exponential backoff with jitter capped at stale ms // --------------------------------------------------------------------------- const APPEND_LOCK_OPTIONS: FileLockOptions = { - // ~100 retries × 50 ms ≈ 5 s total — matches the previous LOCK_TIMEOUT_MS. + // appendRecord rewrites the full token-usage.json on every call, so hold + // time scales with file size and disk speed. 5 s was too tight: a slow + // write on a large history could exceed the stale window and allow a + // concurrent waiter to steal an active lock, causing overlapping + // read-modify-write cycles that silently drop entries. + // + // 30 s gives plenty of headroom for large files on slow disks while still + // reclaiming locks left by crashed processes within a reasonable window. + // The retry budget (~150 × 200 ms = 30 s) matches the stale window so + // waiters exhaust retries only if the holder holds longer than stale, + // i.e., is genuinely stuck or crashed. retries: { - retries: 100, + retries: 150, factor: 1, - minTimeout: 50, - maxTimeout: 50, + minTimeout: 200, + maxTimeout: 200, + randomize: true, }, - stale: 5_000, + stale: 30_000, }; async function appendRecord(file: string, entry: TokenUsageRecord): Promise { From 14303dac74a3803eccc3e76403cf4c247ce61fd0 Mon Sep 17 00:00:00 2001 From: jiarung Date: Tue, 17 Mar 2026 06:57:09 +0000 Subject: [PATCH 27/29] fix(file-lock): guarantee post-reclaim open attempt on last retry slot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When acquireFileLock detects a stale lock on the final loop iteration (attempt === attempts - 1), the subsequent `continue` increments attempt to `attempts`, causing the loop to exit and throw 'file lock timeout' without ever calling open(O_EXCL) on the now-clear path. This is the crash-recovery bug reported in the P2 badge: an empty/partial .lock file left by a crash between open("wx") and writeFile becomes reclaimable on the last slot, but the call still throws timeout and the usage record is silently dropped. Fix: when on the last slot, step attempt back by one before continue so the upcoming += 1 nets to zero — guaranteeing at least one post-reclaim open(O_EXCL) attempt. No extra sleep budget is consumed; retries are only charged for backoff-sleep iterations, not reclaim-only work. Adds a regression test: retries=0 (attempts=1), stale empty lock file aged past staleMs — withFileLock must succeed, not throw timeout. --- src/plugin-sdk/file-lock.test.ts | 24 ++++++++++++++++++++++++ src/plugin-sdk/file-lock.ts | 16 ++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/src/plugin-sdk/file-lock.test.ts b/src/plugin-sdk/file-lock.test.ts index 012b574d7c1..f9e2211513f 100644 --- a/src/plugin-sdk/file-lock.test.ts +++ b/src/plugin-sdk/file-lock.test.ts @@ -187,6 +187,30 @@ describe("withFileLock", () => { }, ); + it("reclaims a stale empty lock file even when detected on the very last retry (retries=0)", async () => { + // Regression: when retries=0 (attempts=1), a stale lock detected on + // attempt 0 caused `continue` to increment attempt to 1, exiting the loop + // and throwing timeout without ever calling open(O_EXCL) on the now-clear + // path. This reproduces the token-usage.json.lock crash-recovery bug where + // an empty .lock file left by a crash is reclaimable only on the last slot. + const lockPath = `${targetFile}.lock`; + await fs.mkdir(path.dirname(targetFile), { recursive: true }); + await fs.writeFile(lockPath, ""); // empty — crash between open("wx") and writeFile + await ageFile(lockPath, LOCK_OPTIONS.stale + 1_000); // age past stale threshold + + let ran = false; + await expect( + withFileLock( + targetFile, + { ...LOCK_OPTIONS, retries: { ...LOCK_OPTIONS.retries, retries: 0 } }, + async () => { + ran = true; + }, + ), + ).resolves.toBeUndefined(); + expect(ran).toBe(true); + }); + it("two concurrent waiters on a stale lock never overlap inside fn()", async () => { // Plant a stale lock (dead PID, old timestamp) so both waiters will // simultaneously enter the stale-reclaim branch. The inode guard must diff --git a/src/plugin-sdk/file-lock.ts b/src/plugin-sdk/file-lock.ts index ee981355fec..5be1e56b1a4 100644 --- a/src/plugin-sdk/file-lock.ts +++ b/src/plugin-sdk/file-lock.ts @@ -215,6 +215,22 @@ export async function acquireFileLock( // Retry open(O_EXCL) regardless: either we removed the stale lock or // a concurrent waiter already handled it; either way, the path is now // either free or holds a fresh lock that isStaleLock will reject. + // + // Guard: the for-loop's `attempt += 1` runs after every `continue`, + // consuming a retry slot. If we are already on the last slot + // (attempt === attempts - 1), that increment exits the loop and we + // throw timeout without ever re-trying open(O_EXCL) on the now-clear + // path. This is the crash-recovery bug: an empty/partial .lock file + // (crash between open("wx") and writeFile) that becomes reclaimable + // on the last iteration causes the record to be silently dropped. + // + // Fix: when on the last slot, step attempt back by one so the + // upcoming += 1 nets to zero, guaranteeing at least one post-reclaim + // open(O_EXCL) attempt. No extra sleep budget is consumed — we only + // charge a retry for backoff sleeps, not reclaim-only work. + if (attempt >= attempts - 1) { + attempt -= 1; + } continue; } From 8c2a7b41814008f3ad5c5f2a7dcfb8a3c6249b16 Mon Sep 17 00:00:00 2001 From: jiarung Date: Tue, 17 Mar 2026 07:15:47 +0000 Subject: [PATCH 28/29] fix(file-lock): bound post-reclaim free slot to prevent endless spin The previous fix decremented attempt unconditionally when on the last retry slot, which introduced an endless loop: if fs.rm silently fails (EACCES/EPERM swallowed by .catch), isStaleLock continues to return true on every iteration and the attempt -= 1 guard keeps looping the last slot forever, hanging all callers (withFileLock, usage logging). Fix: introduce reclaimSlotAvailable (one-shot boolean, reset to false on first use). The free post-reclaim attempt is given exactly once; subsequent stale detections on the last slot are not granted extra iterations and the loop naturally exhausts its retry budget and throws file lock timeout as expected. Adds regression test: mocks fs.rm as a no-op so the stale lock persists unconditionally. Without the one-shot bound the loop would spin forever (vitest default timeout would terminate it); with the fix it throws file lock timeout promptly. --- src/plugin-sdk/file-lock.test.ts | 36 +++++++++++++++++++++++++++++++- src/plugin-sdk/file-lock.ts | 15 ++++++++++++- src/plugins/path-safety.ts | 2 +- ui/package.json | 2 +- ui/src/ui/views/chat.test.ts | 7 +++++++ 5 files changed, 58 insertions(+), 4 deletions(-) diff --git a/src/plugin-sdk/file-lock.test.ts b/src/plugin-sdk/file-lock.test.ts index f9e2211513f..474149d0323 100644 --- a/src/plugin-sdk/file-lock.test.ts +++ b/src/plugin-sdk/file-lock.test.ts @@ -1,7 +1,7 @@ import fs from "node:fs/promises"; import os from "node:os"; import path from "node:path"; -import { afterEach, beforeEach, describe, expect, it } from "vitest"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { getProcessStartTime } from "../shared/pid-alive.js"; import { withFileLock } from "./file-lock.js"; @@ -211,6 +211,40 @@ describe("withFileLock", () => { expect(ran).toBe(true); }); + it("times out (does not spin forever) when a stale lock cannot be removed", async () => { + // Safety bound: if fs.rm silently fails (EACCES/EPERM — swallowed by + // .catch), subsequent iterations keep seeing isStale=true. The one-shot + // reclaimSlotAvailable guard must prevent the attempt -= 1 trick from + // repeating indefinitely; after the free slot is consumed, the loop must + // exhaust its retry budget and throw timeout rather than hanging forever. + // + // We simulate the condition by mocking fs.rm as a no-op so the .lock + // file is never removed — every subsequent open(O_EXCL) still sees EEXIST. + // Without the reclaimSlotAvailable bound, each stale detection on the last + // slot would decrement attempt and loop back, spinning forever. + const lockPath = `${targetFile}.lock`; + await fs.mkdir(path.dirname(targetFile), { recursive: true }); + await fs.writeFile(lockPath, JSON.stringify({ pid: 0, createdAt: new Date(0).toISOString() })); + await ageFile(lockPath, LOCK_OPTIONS.stale + 1_000); + + // Spy on fs.rm: resolve successfully but do nothing, so the lock file + // persists and isStaleLock() keeps returning true on every iteration. + const rmSpy = vi.spyOn(fs, "rm").mockResolvedValue(undefined); + + try { + await expect( + withFileLock( + targetFile, + { ...LOCK_OPTIONS, retries: { ...LOCK_OPTIONS.retries, retries: 2 } }, + async () => {}, + ), + ).rejects.toThrow("file lock timeout"); + } finally { + rmSpy.mockRestore(); + await fs.rm(lockPath, { force: true }).catch(() => {}); + } + }); + it("two concurrent waiters on a stale lock never overlap inside fn()", async () => { // Plant a stale lock (dead PID, old timestamp) so both waiters will // simultaneously enter the stale-reclaim branch. The inode guard must diff --git a/src/plugin-sdk/file-lock.ts b/src/plugin-sdk/file-lock.ts index 5be1e56b1a4..e0360602757 100644 --- a/src/plugin-sdk/file-lock.ts +++ b/src/plugin-sdk/file-lock.ts @@ -155,6 +155,11 @@ export async function acquireFileLock( } const attempts = Math.max(1, options.retries.retries + 1); + // One-shot budget for the post-reclaim free slot (see stale-reclaim comment + // below). Consumed the first time we step attempt back; subsequent stale + // detections on the last slot are allowed to exhaust the loop and time out, + // preventing an endless spin when fs.rm silently fails (e.g. EACCES/EPERM). + let reclaimSlotAvailable = true; for (let attempt = 0; attempt < attempts; attempt += 1) { try { const handle = await fs.open(lockPath, "wx"); @@ -228,7 +233,15 @@ export async function acquireFileLock( // upcoming += 1 nets to zero, guaranteeing at least one post-reclaim // open(O_EXCL) attempt. No extra sleep budget is consumed — we only // charge a retry for backoff sleeps, not reclaim-only work. - if (attempt >= attempts - 1) { + // + // Safety bound: reclaimSlotAvailable is consumed after the first use. + // If fs.rm silently fails (EACCES/EPERM swallowed by .catch above), + // subsequent iterations will still detect isStale=true; without the + // bound, decrementing attempt on every last-slot iteration would spin + // the loop forever. After the one-shot budget is gone, the loop is + // allowed to exhaust normally and throw timeout. + if (reclaimSlotAvailable && attempt >= attempts - 1) { + reclaimSlotAvailable = false; attempt -= 1; } continue; diff --git a/src/plugins/path-safety.ts b/src/plugins/path-safety.ts index 7935312cbe4..691931faf2e 100644 --- a/src/plugins/path-safety.ts +++ b/src/plugins/path-safety.ts @@ -11,7 +11,7 @@ export function safeRealpathSync(targetPath: string, cache?: Map return cached; } try { - const resolved = fs.realpathSync(targetPath); + const resolved = fs.realpathSync.native(targetPath); cache?.set(targetPath, resolved); return resolved; } catch { diff --git a/ui/package.json b/ui/package.json index c326f70cf3a..4a561185218 100644 --- a/ui/package.json +++ b/ui/package.json @@ -11,7 +11,7 @@ "dependencies": { "@lit-labs/signals": "^0.2.0", "@lit/context": "^1.1.6", - "@noble/ed25519": "3.0.0", + "@noble/ed25519": "^3.0.0", "dompurify": "^3.3.3", "lit": "^3.3.2", "marked": "^17.0.4", diff --git a/ui/src/ui/views/chat.test.ts b/ui/src/ui/views/chat.test.ts index 5e02b2649e2..93164647a6f 100644 --- a/ui/src/ui/views/chat.test.ts +++ b/ui/src/ui/views/chat.test.ts @@ -91,6 +91,12 @@ function createChatHeaderState( sessionKey: "main", connected: true, sessionsHideCron: true, + sessionsLoading: false, + sessionsError: null, + sessionsFilterActive: "0", + sessionsFilterLimit: "0", + sessionsIncludeGlobal: true, + sessionsIncludeUnknown: true, sessionsResult: { ts: 0, path: "", @@ -111,6 +117,7 @@ function createChatHeaderState( chatModelOverrides: {}, chatModelCatalog: catalog, chatModelsLoading: false, + chatSending: false, client: { request } as unknown as GatewayBrowserClient, settings: { gatewayUrl: "", From 313acd4463b388026e9acc8277be961a2765eefe Mon Sep 17 00:00:00 2001 From: jiarung Date: Tue, 17 Mar 2026 08:56:37 +0000 Subject: [PATCH 29/29] fix(usage-log): remove writeQueues entries after each write settles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each unique token-usage.json path was inserted into writeQueues and never removed, even after the stored promise settled successfully or with an error. In long-lived processes that touch many workspaces (e.g. ephemeral per-run directories) this causes unbounded Map growth and retains completed promises indefinitely — a memory leak introduced by the write-queue serialisation logic. Fix: capture the stored (rejection-suppressed) promise before setting it in the Map. After it settles, delete the Map entry iff it still holds the same promise — a concurrent call that queued a new write for the same path will have replaced the entry and owns its own cleanup. Adds two regression tests via _testOnly_getWriteQueueSize(): - N distinct paths all cleaned up after writes settle (Map → empty) - rejected write also cleans up its entry (no leak on error path) --- src/agents/usage-log.test.ts | 48 +++++++++++++++++++++++++++++++++++- src/agents/usage-log.ts | 23 ++++++++++++++--- 2 files changed, 66 insertions(+), 5 deletions(-) diff --git a/src/agents/usage-log.test.ts b/src/agents/usage-log.test.ts index 0922a328989..822e67fab49 100644 --- a/src/agents/usage-log.test.ts +++ b/src/agents/usage-log.test.ts @@ -3,7 +3,7 @@ import fs from "fs/promises"; import os from "os"; import path from "path"; import { describe, expect, it, beforeEach, afterEach } from "vitest"; -import { recordTokenUsage } from "./usage-log.js"; +import { recordTokenUsage, _testOnly_getWriteQueueSize } from "./usage-log.js"; describe("recordTokenUsage", () => { let tmpDir: string; @@ -245,6 +245,52 @@ describe("recordTokenUsage", () => { } }); + it("writeQueues entries are removed after each write settles — no unbounded Map growth", async () => { + // Regression: writeQueues entries were never deleted after a write settled, + // causing unbounded Map growth in long-lived processes that touch many + // ephemeral workspace directories (one unique path → one permanent entry). + // + // Verify the fix: after all writes to N distinct paths complete, the Map + // must be empty (no retained entries for settled paths). + const N = 10; + const dirs = await Promise.all( + Array.from({ length: N }, () => fs.mkdtemp(path.join(os.tmpdir(), "usage-queue-leak-"))), + ); + try { + await Promise.all( + dirs.map((dir) => + recordTokenUsage({ + workspaceDir: dir, + label: "llm_output", + usage: { input: 1, output: 1, total: 2 }, + }), + ), + ); + // All writes are awaited; their stored promises must have settled and + // been removed from the Map. + expect(_testOnly_getWriteQueueSize()).toBe(0); + } finally { + await Promise.all(dirs.map((d) => fs.rm(d, { recursive: true, force: true }))); + } + }); + + it("writeQueues entry is removed even when the write rejects", async () => { + // Plant a non-array token-usage.json to trigger a rejection in appendRecord. + // The Map entry must still be cleaned up — a failing write must not leak. + await fs.mkdir(path.join(tmpDir, "memory"), { recursive: true }); + await fs.writeFile(path.join(tmpDir, "memory", "token-usage.json"), '"not-an-array"', "utf-8"); + + await expect( + recordTokenUsage({ + workspaceDir: tmpDir, + label: "llm_output", + usage: { input: 1, output: 1, total: 2 }, + }), + ).rejects.toThrow(); + + expect(_testOnly_getWriteQueueSize()).toBe(0); + }); + it("serialises concurrent writes — no record is lost", async () => { const N = 20; await Promise.all( diff --git a/src/agents/usage-log.ts b/src/agents/usage-log.ts index 9ebb3ff580b..d154133e24a 100644 --- a/src/agents/usage-log.ts +++ b/src/agents/usage-log.ts @@ -112,6 +112,11 @@ async function appendRecord(file: string, entry: TokenUsageRecord): Promise>(); +/** Exposed for testing only — returns the current number of live queue entries. */ +export function _testOnly_getWriteQueueSize(): number { + return writeQueues.size; +} + export async function recordTokenUsage(params: { workspaceDir: string; runId?: string; @@ -171,9 +176,19 @@ export async function recordTokenUsage(params: { const queued = writeQueues.get(file) ?? Promise.resolve(); const next = queued.then(() => appendRecord(file, entry)); - writeQueues.set( - file, - next.catch(() => {}), - ); + // Suppress rejection so the Map value never holds a rejected promise, which + // would cause unhandled-rejection noise for any future .then() chained on it. + const stored = next.catch(() => {}); + writeQueues.set(file, stored); + // Remove the entry once it settles. Check identity first: if another call + // already enqueued a new write for the same path, writeQueues now holds that + // newer promise and must not be deleted — the newer entry owns its own cleanup. + // This prevents unbounded Map growth in long-lived processes that write to + // many ephemeral workspace directories. + void stored.then(() => { + if (writeQueues.get(file) === stored) { + writeQueues.delete(file); + } + }); await next; }