feat(evals): add eval runner, types, and vitest config
This commit is contained in:
parent
ba01abdcd9
commit
a9520572be
81
test/evals/run-evals.ts
Normal file
81
test/evals/run-evals.ts
Normal file
@@ -0,0 +1,81 @@
|
||||
import { chatAgentEvalDataset } from "./dataset.js";
|
||||
import { gradeOutput, gradeTraceIntegrity, gradeTrajectory } from "./graders.js";
|
||||
import type { CaseEvaluation } from "./types.js";
|
||||
|
||||
function evaluateDataset(): CaseEvaluation[] {
|
||||
return chatAgentEvalDataset.map((testCase) => {
|
||||
const results = [
|
||||
gradeOutput(testCase),
|
||||
gradeTrajectory(testCase),
|
||||
gradeTraceIntegrity(testCase),
|
||||
];
|
||||
const passed = results.every((result) => result.passed);
|
||||
return { testCase, results, passed };
|
||||
});
|
||||
}
|
||||
|
||||
function printHumanSummary(evaluations: CaseEvaluation[]) {
|
||||
let totalGraders = 0;
|
||||
let passedGraders = 0;
|
||||
|
||||
for (const evaluation of evaluations) {
|
||||
for (const result of evaluation.results) {
|
||||
totalGraders += 1;
|
||||
if (result.passed) {
|
||||
passedGraders += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Keep output friendly for CI logs.
|
||||
console.log("LLM eval summary");
|
||||
console.log(`- cases: ${evaluations.length}`);
|
||||
console.log(`- graders passed: ${passedGraders}/${totalGraders}`);
|
||||
|
||||
for (const evaluation of evaluations) {
|
||||
const status = evaluation.passed ? "PASS" : "FAIL";
|
||||
const critical = evaluation.testCase.critical ? "critical" : "non-critical";
|
||||
console.log(`\n[${status}] ${evaluation.testCase.id} (${critical})`);
|
||||
console.log(` ${evaluation.testCase.description}`);
|
||||
for (const result of evaluation.results) {
|
||||
const marker = result.passed ? "ok" : "x";
|
||||
console.log(` - ${marker} ${result.grader}`);
|
||||
for (const detail of result.details) {
|
||||
console.log(` · ${detail}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function printJsonSummary(evaluations: CaseEvaluation[]) {
|
||||
const payload = {
|
||||
generatedAt: new Date().toISOString(),
|
||||
totalCases: evaluations.length,
|
||||
evaluations: evaluations.map((evaluation) => ({
|
||||
id: evaluation.testCase.id,
|
||||
description: evaluation.testCase.description,
|
||||
critical: Boolean(evaluation.testCase.critical),
|
||||
passed: evaluation.passed,
|
||||
results: evaluation.results,
|
||||
})),
|
||||
};
|
||||
console.log(JSON.stringify(payload, null, 2));
|
||||
}
|
||||
|
||||
const evaluations = evaluateDataset();
|
||||
const hasCriticalFailure = evaluations.some(
|
||||
(evaluation) => Boolean(evaluation.testCase.critical) && !evaluation.passed,
|
||||
);
|
||||
const enforce = process.env.EVALS_ENFORCE === "1";
|
||||
const emitJson = process.argv.includes("--json");
|
||||
|
||||
if (emitJson) {
|
||||
printJsonSummary(evaluations);
|
||||
} else {
|
||||
printHumanSummary(evaluations);
|
||||
}
|
||||
|
||||
if (enforce && hasCriticalFailure) {
|
||||
console.error("\nCritical eval checks failed with EVALS_ENFORCE=1.");
|
||||
process.exit(1);
|
||||
}
|
||||
46
test/evals/types.ts
Normal file
46
test/evals/types.ts
Normal file
@@ -0,0 +1,46 @@
|
||||
export type EvalEvent = {
|
||||
type: string;
|
||||
globalSeq?: number;
|
||||
toolCallId?: string;
|
||||
toolName?: string;
|
||||
delta?: string;
|
||||
text?: string;
|
||||
output?: Record<string, unknown>;
|
||||
errorText?: string;
|
||||
sessionKey?: string;
|
||||
};
|
||||
|
||||
export type TrajectoryMatchMode = "strict" | "subset" | "superset" | "unordered";
|
||||
|
||||
export type EvalCase = {
|
||||
id: string;
|
||||
description: string;
|
||||
critical?: boolean;
|
||||
events: EvalEvent[];
|
||||
expectations: {
|
||||
output?: {
|
||||
mustContain?: string[];
|
||||
mustNotContain?: string[];
|
||||
};
|
||||
trajectory?: {
|
||||
tools: string[];
|
||||
mode: TrajectoryMatchMode;
|
||||
};
|
||||
trace?: {
|
||||
requiredTypes?: string[];
|
||||
requireMonotonicGlobalSeq?: boolean;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
export type GradeResult = {
|
||||
grader: "output" | "trajectory" | "trace";
|
||||
passed: boolean;
|
||||
details: string[];
|
||||
};
|
||||
|
||||
export type CaseEvaluation = {
|
||||
testCase: EvalCase;
|
||||
results: GradeResult[];
|
||||
passed: boolean;
|
||||
};
|
||||
9
test/evals/vitest.config.ts
Normal file
9
test/evals/vitest.config.ts
Normal file
@@ -0,0 +1,9 @@
|
||||
import { defineConfig } from "vitest/config";
|
||||
|
||||
export default defineConfig({
|
||||
test: {
|
||||
environment: "node",
|
||||
include: ["**/*.test.ts"],
|
||||
testTimeout: 30_000,
|
||||
},
|
||||
});
|
||||
Loading…
x
Reference in New Issue
Block a user