openclaw/test/evals/run-evals.ts

82 lines
2.5 KiB
TypeScript

import { chatAgentEvalDataset } from "./dataset.js";
import { gradeOutput, gradeTraceIntegrity, gradeTrajectory } from "./graders.js";
import type { CaseEvaluation } from "./types.js";
/**
 * Runs every grader against each case in `chatAgentEvalDataset`.
 *
 * For each case the output, trajectory and trace-integrity graders are invoked
 * (in that order) and their results are bundled with the case itself.
 *
 * @returns One evaluation per dataset case; `passed` is true only when no
 * grader result for that case reports a failure.
 */
function evaluateDataset(): CaseEvaluation[] {
  const evaluations: CaseEvaluation[] = [];
  for (const testCase of chatAgentEvalDataset) {
    const output = gradeOutput(testCase);
    const trajectory = gradeTrajectory(testCase);
    const traceIntegrity = gradeTraceIntegrity(testCase);
    const results = [output, trajectory, traceIntegrity];
    // A case passes when there is no grader result that failed.
    const passed = !results.some((outcome) => !outcome.passed);
    evaluations.push({ testCase, results, passed });
  }
  return evaluations;
}
/**
 * Writes a plain-text report of the evaluation run via `console.log`.
 *
 * The report starts with the number of cases and the passed/total grader
 * counts, followed by one section per case listing its status, id,
 * criticality, description, each grader's outcome and that grader's details.
 *
 * @param evaluations Evaluated cases to report on.
 */
function printHumanSummary(evaluations: CaseEvaluation[]) {
  // Tally grader outcomes across all cases before printing anything.
  let graderCount = 0;
  let graderPasses = 0;
  for (const { results } of evaluations) {
    for (const outcome of results) {
      graderCount++;
      graderPasses += outcome.passed ? 1 : 0;
    }
  }

  // Keep output friendly for CI logs.
  console.log("LLM eval summary");
  console.log(`- cases: ${evaluations.length}`);
  console.log(`- graders passed: ${graderPasses}/${graderCount}`);

  for (const { testCase, results, passed } of evaluations) {
    const severity = testCase.critical ? "critical" : "non-critical";
    console.log(`\n[${passed ? "PASS" : "FAIL"}] ${testCase.id} (${severity})`);
    console.log(` ${testCase.description}`);

    for (const outcome of results) {
      console.log(` - ${outcome.passed ? "ok" : "x"} ${outcome.grader}`);
      for (const note of outcome.details) {
        console.log(` · ${note}`);
      }
    }
  }
}
/**
 * Writes the evaluation run as a single pretty-printed (2-space) JSON
 * document via `console.log`.
 *
 * The document carries the generation timestamp (ISO 8601), the number of
 * cases, and per case its id, description, criticality coerced to a boolean,
 * overall pass flag and the raw grader results.
 *
 * @param evaluations Evaluated cases to serialize.
 */
function printJsonSummary(evaluations: CaseEvaluation[]) {
  // Stamp the report before building the per-case entries.
  const generatedAt = new Date().toISOString();

  const serializedCases: unknown[] = [];
  for (const { testCase, passed, results } of evaluations) {
    serializedCases.push({
      id: testCase.id,
      description: testCase.description,
      // A missing `critical` flag is reported as an explicit `false`.
      critical: !!testCase.critical,
      passed,
      results,
    });
  }

  const report = {
    generatedAt,
    totalCases: evaluations.length,
    evaluations: serializedCases,
  };
  console.log(JSON.stringify(report, null, 2));
}
// Script entry point: evaluate the dataset, print a report, and optionally
// fail the run when a critical case did not pass.
const evaluations = evaluateDataset();

// A critical failure is any case flagged `critical` that did not pass.
const hasCriticalFailure = evaluations.some(
  (evaluation) => Boolean(evaluation.testCase.critical) && !evaluation.passed,
);

// Enforcement is opt-in: only the exact value "1" turns it on.
const enforce = process.env.EVALS_ENFORCE === "1";
// `--json` anywhere on the command line switches to machine-readable output.
const emitJson = process.argv.includes("--json");

if (emitJson) {
  printJsonSummary(evaluations);
} else {
  printHumanSummary(evaluations);
}

if (enforce && hasCriticalFailure) {
  console.error("\nCritical eval checks failed with EVALS_ENFORCE=1.");
  // Set the exit code instead of calling process.exit(): a forced exit can
  // end the process before pending stdout/stderr writes (e.g. a large --json
  // report sent through a pipe) have been flushed. Nothing else is scheduled
  // after this point, so the process still terminates with status 1.
  process.exitCode = 1;
}