openclaw/src/agents/usage-log.ts

180 lines
6.5 KiB
TypeScript
Raw Normal View History

import { randomBytes } from "crypto";
import fs from "fs/promises";
import path from "path";
refactor(usage-log): delegate cross-process lock to plugin-sdk/file-lock appendRecord wrote token-usage.json in place with a direct fs.writeFile call; a crash or SIGKILL during that write left truncated JSON. Because readJsonArray now throws on any non-ENOENT error (to prevent silent data loss) and recordTokenUsage callers swallow the error via .catch(), one corrupted write permanently disabled all future token logging until the file was manually repaired. The in-place-write bug was fixed in 8c162d0ba via a temp-file + atomic rename approach, but usage-log.ts still carried its own private withFileLock / isLockStale implementation. That inline lock had two known bugs that were fixed in plugin-sdk/file-lock.ts but never applied here: 1. isLockStale treated empty / unparseable lock content as 'not stale' — a process that crashes between open('wx') and writeFile(pid) leaves an empty .lock that appeared live forever, blocking all future writers until it was manually removed. 2. No inode identity check before unlink: two waiters observing the same stale lock could both call unlink; the slower one would delete the faster one's freshly-acquired lock, letting both enter fn() concurrently and race on the read-modify-write sequence. Fix: import withFileLock from infra/file-lock.ts (which re-exports the canonical plugin-sdk implementation) and remove the ~70-line inline lock. APPEND_LOCK_OPTIONS reproduces the previous timeout/retry budget (~100 × 50 ms ≈ 5 s) while gaining all fixes from plugin-sdk/file-lock. The lock payload format changed from a plain PID string to the JSON {pid, createdAt} envelope expected by the shared implementation; the stale-lock integration test is updated to match.
2026-03-15 07:36:31 +00:00
import { type FileLockOptions, withFileLock } from "../infra/file-lock.js";
/**
 * One entry of the JSON array stored in token-usage.json.
 * Records are built by recordTokenUsage and appended by appendRecord.
 */
export type TokenUsageRecord = {
/** Unique record id produced by makeId(). */
id: string;
/** Caller-supplied label, copied from the `label` parameter of recordTokenUsage. */
label: string;
/**
 * Total tokens for the call, truncated to an integer: `usage.total` when
 * given, otherwise the sum of the input/output/cacheRead/cacheWrite counts.
 */
tokensUsed: number;
/** Optional limit; recordTokenUsage in this file does not populate it. */
tokenLimit?: number;
/** Truncated `usage.input`; only written when that count is greater than zero. */
inputTokens?: number;
/** Truncated `usage.output`; only written when that count is greater than zero. */
outputTokens?: number;
/** Truncated `usage.cacheRead`; only written when that count is greater than zero. */
cacheReadTokens?: number;
/** Truncated `usage.cacheWrite`; only written when that count is greater than zero. */
cacheWriteTokens?: number;
/** Copied verbatim from the `model` parameter of recordTokenUsage. */
model?: string;
/** Copied verbatim from the `provider` parameter of recordTokenUsage. */
provider?: string;
/** Copied verbatim from the `runId` parameter of recordTokenUsage. */
runId?: string;
/** Copied verbatim from the `sessionId` parameter of recordTokenUsage. */
sessionId?: string;
/** Copied verbatim from the `sessionKey` parameter of recordTokenUsage. */
sessionKey?: string;
/** Creation time as an ISO-8601 string (`new Date().toISOString()`). */
createdAt: string;
};
/**
 * Builds a record id of the form `usage_<base36 millis>_<8 hex chars>`.
 * The timestamp part comes from Date.now(); the suffix is 4 random bytes.
 */
function makeId() {
  const stamp = Date.now().toString(36);
  const suffix = randomBytes(4).toString("hex");
  return ["usage", stamp, suffix].join("_");
}
/**
 * Loads the record array stored in `file`.
 *
 * @param file Path of the JSON file to read.
 * @returns The parsed array, or an empty array when the file does not exist.
 * @throws Any read error other than ENOENT, any JSON parse error, and an
 *   Error with `code: "ERR_UNEXPECTED_TOKEN_LOG_SHAPE"` when the file holds
 *   valid JSON that is not an array.
 */
async function readJsonArray(file: string): Promise<TokenUsageRecord[]> {
  let text: string;
  try {
    text = await fs.readFile(file, "utf-8");
  } catch (cause) {
    // A missing file simply means nothing has been logged yet.
    if ((cause as NodeJS.ErrnoException).code === "ENOENT") {
      return [];
    }
    // Permission problems and other I/O failures must reach the caller so
    // the existing file is never replaced by a single-entry array.
    throw cause;
  }

  // Malformed or truncated JSON throws here and propagates for the same
  // reason: treating it as "empty" would silently discard prior history.
  const data: unknown = JSON.parse(text);
  if (Array.isArray(data)) {
    return data as TokenUsageRecord[];
  }

  // Well-formed JSON of the wrong shape (object, number, string, …) is
  // rejected as well rather than being overwritten.
  const shapeError = new Error(
    `token-usage.json contains valid JSON but is not an array (got ${typeof data})`,
  );
  throw Object.assign(shapeError, { code: "ERR_UNEXPECTED_TOKEN_LOG_SHAPE" });
}
// ---------------------------------------------------------------------------
// Cross-process file lock
//
// The in-memory writeQueues Map serialises writes within a single Node
// process. Two concurrent OpenClaw processes targeting the same
// workspaceDir can still race, so we use an advisory O_EXCL lock provided
// by the shared withFileLock helper in plugin-sdk/file-lock.ts (imported
// here through ../infra/file-lock.js).
//
// That implementation:
// • stores {pid, createdAt} so waiters can detect a crashed holder
// • treats empty/unparseable lock content as stale (crash during open→write)
// • re-verifies the lock inode before removing it so a slow waiter's
//   unlink cannot delete a fresh lock from another process
// • uses exponential backoff with jitter capped at stale ms
// ---------------------------------------------------------------------------
/**
 * Lock tuning handed to withFileLock by appendRecord: a fixed ~200 ms retry
 * interval for up to 150 attempts, and a 30 s stale window.
 */
const APPEND_LOCK_OPTIONS: FileLockOptions = {
  // appendRecord rewrites the full token-usage.json on every call, so hold
  // time scales with file size and disk speed. 5 s was too tight: a slow
  // write on a large history could exceed the stale window and allow a
  // concurrent waiter to steal an active lock, causing overlapping
  // read-modify-write cycles that silently drop entries.
  //
  // 30 s gives plenty of headroom for large files on slow disks while still
  // reclaiming locks left by crashed processes within a reasonable window.
  // The retry budget (~150 × 200 ms = 30 s) matches the stale window so
  // waiters exhaust retries only if the holder holds longer than stale,
  // i.e., is genuinely stuck or crashed.
  //
  // NOTE(review): `randomize: true` may stretch individual delays beyond
  // 200 ms depending on how withFileLock applies jitter — confirm against
  // the shared implementation if the exact budget matters.
  retries: {
    retries: 150,
    factor: 1,
    minTimeout: 200,
    maxTimeout: 200,
    randomize: true,
  },
  stale: 30_000,
};
/**
 * Appends `entry` to the JSON array stored at `file`, creating the file when
 * it does not exist yet.
 *
 * The whole read-modify-write cycle runs inside withFileLock (using
 * APPEND_LOCK_OPTIONS) so other processes writing the same file are
 * excluded while it runs.
 *
 * @param file  Path of token-usage.json.
 * @param entry Record to append.
 * @throws Whatever readJsonArray, the temp-file write, the rename, or
 *   withFileLock itself throws; on a write/rename failure the temp file is
 *   removed on a best-effort basis before the error is rethrown.
 */
async function appendRecord(file: string, entry: TokenUsageRecord): Promise<void> {
  await withFileLock(file, APPEND_LOCK_OPTIONS, async () => {
    const records = await readJsonArray(file);
    records.push(entry);
    // Write to a sibling temp file then atomically rename into place so that
    // a crash or kill during the write never leaves token-usage.json truncated.
    // rename(2) is atomic on POSIX when src and dst are on the same filesystem,
    // which is guaranteed here because both paths share the same directory.
    const tmp = `${file}.tmp.${randomBytes(4).toString("hex")}`;
    try {
      await fs.writeFile(tmp, JSON.stringify(records, null, 2));
      await fs.rename(tmp, file);
    } catch (err) {
      // Best-effort cleanup: the temp file may not exist if writeFile failed
      // early, so an unlink failure is deliberately ignored.
      await fs.unlink(tmp).catch(() => {});
      throw err;
    }
  });
}
// Per-file write queue: serialises concurrent recordTokenUsage() calls within
// the same process so they do not all contend on the cross-process file lock.
// Keys are the token-usage.json path computed by recordTokenUsage; each value
// is the tail of that file's promise chain with rejections already swallowed,
// so one failed append does not reject the writes queued behind it.
const writeQueues = new Map<string, Promise<void>>();
/**
 * Converts an optional raw token count into a value that is safe to persist:
 * the truncated integer for finite, strictly positive inputs, and `undefined`
 * for everything else (missing, zero, negative, NaN, ±Infinity).
 */
function positiveTokenCount(value: number | undefined): number | undefined {
  if (value == null || !Number.isFinite(value) || value <= 0) {
    return undefined;
  }
  return Math.trunc(value);
}

/**
 * Appends one token-usage record to `<workspaceDir>/memory/token-usage.json`.
 *
 * The call is a no-op when `usage` is missing or when the total (either
 * `usage.total` or, if absent, the sum of the four component counts) is not
 * a finite number greater than zero. Non-finite values are rejected because
 * JSON.stringify would write them as `null`, producing records that violate
 * the numeric fields of TokenUsageRecord.
 *
 * Writes for the same file are chained through writeQueues so they run one
 * at a time within this process; appendRecord handles cross-process locking.
 *
 * @param params.workspaceDir Directory under which `memory/` is created.
 * @param params.label        Label stored on the record.
 * @param params.usage        Raw token counts; component counts are stored
 *                            only when finite and greater than zero.
 * @returns Resolves once this call's record has been appended.
 * @throws Errors from creating the memory directory or from appendRecord
 *   for this call's own write. Failures of earlier queued writes are not
 *   rethrown here.
 */
export async function recordTokenUsage(params: {
  workspaceDir: string;
  runId?: string;
  sessionId?: string;
  sessionKey?: string;
  provider?: string;
  model?: string;
  label: string;
  usage?: {
    input?: number;
    output?: number;
    cacheRead?: number;
    cacheWrite?: number;
    total?: number;
  };
}): Promise<void> {
  const usage = params.usage;
  if (!usage) {
    return;
  }
  const total =
    usage.total ??
    (usage.input ?? 0) + (usage.output ?? 0) + (usage.cacheRead ?? 0) + (usage.cacheWrite ?? 0);
  // Rejects NaN and ±Infinity as well as zero and negative totals.
  if (!Number.isFinite(total) || total <= 0) {
    return;
  }

  const memoryDir = path.join(params.workspaceDir, "memory");
  await fs.mkdir(memoryDir, { recursive: true });
  // Canonicalize before keying writeQueues so that different path spellings
  // for the same physical directory (e.g. a symlink vs its target) share a
  // single in-process queue. Without this, two spellings produce separate
  // queue entries and both call appendRecord concurrently; when
  // withFileLock's HELD_LOCKS map then resolves both to the same normalised
  // path the second caller re-entrantly joins the first — allowing concurrent
  // read-modify-write cycles that silently drop entries.
  const realMemoryDir = await fs.realpath(memoryDir).catch(() => memoryDir);
  const file = path.join(realMemoryDir, "token-usage.json");

  const inputTokens = positiveTokenCount(usage.input);
  const outputTokens = positiveTokenCount(usage.output);
  const cacheReadTokens = positiveTokenCount(usage.cacheRead);
  const cacheWriteTokens = positiveTokenCount(usage.cacheWrite);

  const entry: TokenUsageRecord = {
    id: makeId(),
    label: params.label,
    tokensUsed: Math.trunc(total),
    ...(inputTokens !== undefined && { inputTokens }),
    ...(outputTokens !== undefined && { outputTokens }),
    ...(cacheReadTokens !== undefined && { cacheReadTokens }),
    ...(cacheWriteTokens !== undefined && { cacheWriteTokens }),
    model: params.model,
    provider: params.provider,
    runId: params.runId,
    sessionId: params.sessionId,
    sessionKey: params.sessionKey,
    createdAt: new Date().toISOString(),
  };

  // Chain behind whatever write is currently pending for this file.
  const previous = writeQueues.get(file) ?? Promise.resolve();
  const next = previous.then(() => appendRecord(file, entry));
  // The stored tail never rejects, so a failed append does not poison the
  // writes queued after it; this call still observes its own failure below.
  const tail = next.catch(() => {});
  writeQueues.set(file, tail);
  // Drop the map entry once the queue has drained so the map does not retain
  // an entry for every file ever written. The identity check keeps the entry
  // when a later call has already chained a newer tail.
  void tail.then(() => {
    if (writeQueues.get(file) === tail) {
      writeQueues.delete(file);
    }
  });
  await next;
}