feat(evals): add eval runner, types, and vitest config

This commit is contained in:
kumarabhirup 2026-03-02 18:37:07 -08:00
parent ba01abdcd9
commit a9520572be
No known key found for this signature in database
GPG Key ID: DB7CA2289CAB0167
3 changed files with 136 additions and 0 deletions

81
test/evals/run-evals.ts Normal file
View File

@@ -0,0 +1,81 @@
import { chatAgentEvalDataset } from "./dataset.js";
import { gradeOutput, gradeTraceIntegrity, gradeTrajectory } from "./graders.js";
import type { CaseEvaluation } from "./types.js";
/**
 * Grades every case in the eval dataset.
 *
 * Each case is scored by the output, trajectory, and trace-integrity graders
 * (in that order); a case passes only when all three graders pass.
 */
function evaluateDataset(): CaseEvaluation[] {
  const evaluations: CaseEvaluation[] = [];
  for (const testCase of chatAgentEvalDataset) {
    // Apply each grader to the case, preserving grader order in the report.
    const results = [gradeOutput, gradeTrajectory, gradeTraceIntegrity].map(
      (grade) => grade(testCase),
    );
    evaluations.push({
      testCase,
      results,
      // Equivalent to results.every(r => r.passed).
      passed: !results.some((result) => !result.passed),
    });
  }
  return evaluations;
}
/**
 * Writes a plain-text report of the eval run to stdout: an aggregate header
 * (case count, grader pass ratio) followed by a per-case section listing
 * each grader's verdict and its detail lines.
 */
function printHumanSummary(evaluations: CaseEvaluation[]) {
  const allResults = evaluations.flatMap((evaluation) => evaluation.results);
  const passedGraders = allResults.filter((result) => result.passed).length;
  // Keep output friendly for CI logs.
  console.log("LLM eval summary");
  console.log(`- cases: ${evaluations.length}`);
  console.log(`- graders passed: ${passedGraders}/${allResults.length}`);
  for (const evaluation of evaluations) {
    const status = evaluation.passed ? "PASS" : "FAIL";
    const criticality = evaluation.testCase.critical ? "critical" : "non-critical";
    console.log(`\n[${status}] ${evaluation.testCase.id} (${criticality})`);
    console.log(` ${evaluation.testCase.description}`);
    for (const result of evaluation.results) {
      const marker = result.passed ? "ok" : "x";
      console.log(` - ${marker} ${result.grader}`);
      for (const detail of result.details) {
        console.log(` · ${detail}`);
      }
    }
  }
}
/**
 * Emits the full eval run as one pretty-printed JSON document on stdout,
 * stamped with the generation time and total case count.
 */
function printJsonSummary(evaluations: CaseEvaluation[]) {
  // Key order here fixes the serialized field order per case.
  const serialized = evaluations.map(({ testCase, results, passed }) => ({
    id: testCase.id,
    description: testCase.description,
    critical: Boolean(testCase.critical),
    passed,
    results,
  }));
  console.log(
    JSON.stringify(
      {
        generatedAt: new Date().toISOString(),
        totalCases: evaluations.length,
        evaluations: serialized,
      },
      null,
      2,
    ),
  );
}
// Entry point: grade the dataset, report, and optionally gate CI.
const evaluations = evaluateDataset();

// A run has a critical failure when any case marked critical did not pass.
const hasCriticalFailure = evaluations
  .filter((evaluation) => Boolean(evaluation.testCase.critical))
  .some((evaluation) => !evaluation.passed);

// --json selects machine-readable output; EVALS_ENFORCE=1 makes critical
// failures fatal for the process.
const enforce = process.env.EVALS_ENFORCE === "1";
const emitJson = process.argv.includes("--json");

if (emitJson) {
  printJsonSummary(evaluations);
} else {
  printHumanSummary(evaluations);
}

if (enforce && hasCriticalFailure) {
  console.error("\nCritical eval checks failed with EVALS_ENFORCE=1.");
  process.exit(1);
}

46
test/evals/types.ts Normal file
View File

@@ -0,0 +1,46 @@
/**
 * One recorded event from an agent run, as consumed by the graders.
 *
 * Every field except `type` is optional because different event types carry
 * different payloads (tool calls, streamed deltas, errors, session info).
 */
export type EvalEvent = {
  // Event name used to classify the event; exact vocabulary lives in the
  // dataset/graders — NOTE(review): confirm against those modules.
  type: string;
  // Global ordering counter; EvalCase.trace can require it to be monotonic.
  globalSeq?: number;
  // Identifier correlating a tool call with its result events.
  toolCallId?: string;
  // Name of the tool invoked, when the event is tool-related.
  toolName?: string;
  // Incremental streamed text chunk.
  delta?: string;
  // Complete text payload.
  text?: string;
  // Structured tool output; shape is tool-specific.
  output?: Record<string, unknown>;
  // Error description when the event represents a failure.
  errorText?: string;
  // Session the event belongs to.
  sessionKey?: string;
};
// How an expected tool list is matched against the observed tool calls:
// exact sequence, subset, superset, or order-insensitive.
export type TrajectoryMatchMode = "strict" | "subset" | "superset" | "unordered";
/**
 * A single eval scenario: a recorded event stream plus the expectations the
 * graders check against it.
 */
export type EvalCase = {
  // Stable identifier used in reports.
  id: string;
  // Human-readable summary printed in the report.
  description: string;
  // When true, a failure of this case can fail the whole run (under enforce).
  critical?: boolean;
  // Recorded agent events to grade.
  events: EvalEvent[];
  expectations: {
    // Checks on the final text output.
    output?: {
      // Substrings that must appear.
      mustContain?: string[];
      // Substrings that must not appear.
      mustNotContain?: string[];
    };
    // Checks on which tools were called, and how strictly to match.
    trajectory?: {
      tools: string[];
      mode: TrajectoryMatchMode;
    };
    // Structural checks on the event trace itself.
    trace?: {
      // Event types that must be present.
      requiredTypes?: string[];
      // Require globalSeq to increase monotonically across events.
      requireMonotonicGlobalSeq?: boolean;
    };
  };
};
/** Verdict from one grader, with human-readable detail lines for the report. */
export type GradeResult = {
  // Which grader produced this result.
  grader: "output" | "trajectory" | "trace";
  passed: boolean;
  details: string[];
};
/** All grader results for one case; `passed` is true only if every grader passed. */
export type CaseEvaluation = {
  testCase: EvalCase;
  results: GradeResult[];
  passed: boolean;
};

9
vitest.config.ts Normal file
View File

@@ -0,0 +1,9 @@
import { defineConfig } from "vitest/config";

/**
 * Vitest configuration: run all *.test.ts files in a Node environment with a
 * 30-second per-test timeout (generous to accommodate slower eval-style tests).
 */
export default defineConfig({
  test: {
    environment: "node",
    include: ["**/*.test.ts"],
    testTimeout: 30_000,
  },
});