From a9520572be8407187585a72cc372a9e07df8c41e Mon Sep 17 00:00:00 2001
From: kumarabhirup
Date: Mon, 2 Mar 2026 18:37:07 -0800
Subject: [PATCH] feat(evals): add eval runner, types, and vitest config

---
 test/evals/run-evals.ts     | 81 +++++++++++++++++++++++++++++++++++++
 test/evals/types.ts         | 46 +++++++++++++++++++++
 test/evals/vitest.config.ts |  9 +++++
 3 files changed, 136 insertions(+)
 create mode 100644 test/evals/run-evals.ts
 create mode 100644 test/evals/types.ts
 create mode 100644 test/evals/vitest.config.ts

diff --git a/test/evals/run-evals.ts b/test/evals/run-evals.ts
new file mode 100644
index 00000000000..fd94d90b595
--- /dev/null
+++ b/test/evals/run-evals.ts
@@ -0,0 +1,81 @@
+import { chatAgentEvalDataset } from "./dataset.js";
+import { gradeOutput, gradeTraceIntegrity, gradeTrajectory } from "./graders.js";
+import type { CaseEvaluation } from "./types.js";
+
+function evaluateDataset(): CaseEvaluation[] {
+  return chatAgentEvalDataset.map((testCase) => {
+    const results = [
+      gradeOutput(testCase),
+      gradeTrajectory(testCase),
+      gradeTraceIntegrity(testCase),
+    ];
+    const passed = results.every((result) => result.passed);
+    return { testCase, results, passed };
+  });
+}
+
+function printHumanSummary(evaluations: CaseEvaluation[]) {
+  let totalGraders = 0;
+  let passedGraders = 0;
+
+  for (const evaluation of evaluations) {
+    for (const result of evaluation.results) {
+      totalGraders += 1;
+      if (result.passed) {
+        passedGraders += 1;
+      }
+    }
+  }
+
+  // Keep output friendly for CI logs.
+  console.log("LLM eval summary");
+  console.log(`- cases: ${evaluations.length}`);
+  console.log(`- graders passed: ${passedGraders}/${totalGraders}`);
+
+  for (const evaluation of evaluations) {
+    const status = evaluation.passed ? "PASS" : "FAIL";
+    const critical = evaluation.testCase.critical ? "critical" : "non-critical";
+    console.log(`\n[${status}] ${evaluation.testCase.id} (${critical})`);
+    console.log(`  ${evaluation.testCase.description}`);
+    for (const result of evaluation.results) {
+      const marker = result.passed ? "ok" : "x";
+      console.log(`  - ${marker} ${result.grader}`);
+      for (const detail of result.details) {
+        console.log(`    · ${detail}`); // "·" is U+00B7 (restored from mojibake "ยท")
+      }
+    }
+  }
+}
+
+function printJsonSummary(evaluations: CaseEvaluation[]) {
+  const payload = {
+    generatedAt: new Date().toISOString(),
+    totalCases: evaluations.length,
+    evaluations: evaluations.map((evaluation) => ({
+      id: evaluation.testCase.id,
+      description: evaluation.testCase.description,
+      critical: Boolean(evaluation.testCase.critical),
+      passed: evaluation.passed,
+      results: evaluation.results,
+    })),
+  };
+  console.log(JSON.stringify(payload, null, 2));
+}
+
+const evaluations = evaluateDataset();
+const hasCriticalFailure = evaluations.some(
+  (evaluation) => Boolean(evaluation.testCase.critical) && !evaluation.passed,
+);
+const enforce = process.env.EVALS_ENFORCE === "1";
+const emitJson = process.argv.includes("--json");
+
+if (emitJson) {
+  printJsonSummary(evaluations);
+} else {
+  printHumanSummary(evaluations);
+}
+
+if (enforce && hasCriticalFailure) {
+  console.error("\nCritical eval checks failed with EVALS_ENFORCE=1.");
+  process.exit(1);
+}
diff --git a/test/evals/types.ts b/test/evals/types.ts
new file mode 100644
index 00000000000..99764d3d010
--- /dev/null
+++ b/test/evals/types.ts
@@ -0,0 +1,46 @@
+export type EvalEvent = {
+  type: string;
+  globalSeq?: number;
+  toolCallId?: string;
+  toolName?: string;
+  delta?: string;
+  text?: string;
+  output?: Record<string, unknown>; // NOTE(review): bare `Record` doesn't compile; type args restored — confirm intended payload type
+  errorText?: string;
+  sessionKey?: string;
+};
+
+export type TrajectoryMatchMode = "strict" | "subset" | "superset" | "unordered";
+
+export type EvalCase = {
+  id: string;
+  description: string;
+  critical?: boolean;
+  events: EvalEvent[];
+  expectations: {
+    output?: {
+      mustContain?: string[];
+      mustNotContain?: string[];
+    };
+    trajectory?: {
+      tools: string[];
+      mode: TrajectoryMatchMode;
+    };
+    trace?: {
+      requiredTypes?: string[];
+      requireMonotonicGlobalSeq?: boolean;
+    };
+  };
+};
+
+export type GradeResult = {
+  grader: "output" | "trajectory" | "trace";
+  passed: boolean;
+  details: string[];
+};
+
+export type CaseEvaluation = {
+  testCase: EvalCase;
+  results: GradeResult[];
+  passed: boolean;
+};
diff --git a/test/evals/vitest.config.ts b/test/evals/vitest.config.ts
new file mode 100644
index 00000000000..aaf1bf4fa89
--- /dev/null
+++ b/test/evals/vitest.config.ts
@@ -0,0 +1,9 @@
+import { defineConfig } from "vitest/config";
+
+export default defineConfig({
+  test: {
+    environment: "node",
+    include: ["**/*.test.ts"],
+    testTimeout: 30_000,
+  },
+});