From 1deeb5878e28f2ab0d6aabf7d674cf24952181ea Mon Sep 17 00:00:00 2001 From: kumarabhirup Date: Mon, 2 Mar 2026 18:36:40 -0800 Subject: [PATCH] feat(evals): add eval framework with dataset, graders, and regression report --- test/evals/README.md | 26 +++ test/evals/REGRESSION_VERIFICATION_REPORT.md | 72 +++++++ test/evals/dataset.ts | 140 +++++++++++++ test/evals/graders.test.ts | 43 ++++ test/evals/graders.ts | 203 +++++++++++++++++++ 5 files changed, 484 insertions(+) create mode 100644 test/evals/README.md create mode 100644 test/evals/REGRESSION_VERIFICATION_REPORT.md create mode 100644 test/evals/dataset.ts create mode 100644 test/evals/graders.test.ts create mode 100644 test/evals/graders.ts diff --git a/test/evals/README.md b/test/evals/README.md new file mode 100644 index 00000000000..75d71784ca8 --- /dev/null +++ b/test/evals/README.md @@ -0,0 +1,26 @@ +# Chat-Agent Evals + +This folder contains a dataset-driven eval harness for web chat-agent behavior. + +## What is graded + +- `output`: final assistant text quality checks (`mustContain`, `mustNotContain`) +- `trajectory`: tool-call ordering checks (`strict`, `subset`, `superset`, `unordered`) +- `trace`: event-stream integrity checks (required event types, monotonic `globalSeq`, + and tool-call lifecycle completeness) + +## Run + +```bash +pnpm test:evals +``` + +This mode is informational and prints a full summary. + +## Enforce critical checks + +```bash +pnpm test:evals:enforce +``` + +When `EVALS_ENFORCE=1`, the runner exits non-zero if any **critical** eval case fails. 
diff --git a/test/evals/REGRESSION_VERIFICATION_REPORT.md b/test/evals/REGRESSION_VERIFICATION_REPORT.md new file mode 100644 index 00000000000..7d00a44c1fe --- /dev/null +++ b/test/evals/REGRESSION_VERIFICATION_REPORT.md @@ -0,0 +1,72 @@ +# Regression Verification Report + +Date: 2026-03-02 + +## Stability Loops + +### Web interaction regressions + +Command: + +```bash +for i in {1..20}; do + pnpm vitest run app/workspace/object-view-active-view.test.ts app/components/workspace/object-filter-bar.test.tsx --silent +done +``` + +Result: **20/20 passes** (no flaky failures observed). + +### CLI edge-case regressions + +Command: + +```bash +for i in {1..20}; do + ../../node_modules/.bin/vitest run --config vitest.config.ts argv.test.ts run-main.test.ts windows-argv.test.ts --silent +done +``` + +Result: **20/20 passes** (no flaky failures observed). + +## Mutation Probes (Manual) + +The following intentional mutations were applied, tested, and reverted: + +1. `apps/web/app/workspace/object-view-active-view.ts` + - Mutation: `shouldApply` reduced to `nameMismatch` only. + - Expected break: initial-load active view bug returns. + - Caught by: `app/workspace/object-view-active-view.test.ts` (`applies active view on initial load ...`). + - Outcome: **failed as expected**, then reverted. + +2. `apps/web/app/components/chat-panel.tsx` + - Mutation: tool error state changed from `"error"` to `"output-available"`. + - Expected break: tool terminal state classification regression. + - Caught by: `app/components/chat-panel.stream-parser.test.ts` (`accumulates tool input/output ...`). + - Outcome: **failed as expected**, then reverted. + +3. `src/cli/run-main.ts` + - Mutation: disable-delegation branch changed to incorrectly return `true`. + - Expected break: delegation safety bypass. + - Caught by: `src/cli/run-main.test.ts` (`disables delegation when explicit env disable flag is set`). + - Outcome: **failed as expected**, then reverted. + +4. 
`test/evals/graders.ts` + - Mutation: duplicate `globalSeq` detection removed. + - Expected break: trace-integrity duplicate-sequence protection. + - Caught by: `test/evals/graders.test.ts` (`fails trace grader for duplicate globalSeq ...`). + - Outcome: **failed as expected**, then reverted. + +## Final Verification Runs + +- Web interaction suite: pass (`34` tests) +- CLI edge-case suite: pass (`42` tests) +- Eval grader unit suite: pass (`3` tests) +- Eval harness: + - `pnpm test:evals`: pass (`9/9` graders across `3` cases) + - `pnpm test:evals:enforce`: pass (`critical` cases all passing) + +## Residual Risks + +- Some large end-to-end UI flows (`workspace/page.tsx` URL orchestration and multi-panel state transitions) still rely on unit/component-level coverage rather than browser E2E. +- The eval dataset currently includes representative high-risk cases, but not production trace replay at scale yet. +- Additional CRM mutation permutations (large view/filter combinations) can be added to the eval dataset over time. 
/**
 * Hand-written eval cases for the chat-agent graders.
 *
 * Each case pairs a recorded event stream (`events`) with the expectations the
 * output, trajectory and trace graders check against it. Per the README in
 * this folder, cases flagged `critical` are the ones that make the runner exit
 * non-zero when `EVALS_ENFORCE=1`.
 */
export const chatAgentEvalDataset: EvalCase[] = [
  // Case 1: one tool call (start -> input -> output) followed by a text answer.
  {
    id: "tool-call-success",
    description: "single-tool answer includes expected retrieval summary",
    critical: true,
    events: [
      { type: "user-message", id: "u-1", text: "Find docs for workspace lock", globalSeq: 1 },
      { type: "tool-input-start", toolCallId: "tool-1", toolName: "searchDocs", globalSeq: 2 },
      {
        type: "tool-input-available",
        toolCallId: "tool-1",
        globalSeq: 3,
        // NOTE(review): this *input* event carries its payload under `output`.
        // None of the graders in this patch read the field, so it is harmless
        // today — confirm against EvalEvent whether `input` was intended.
        output: { query: "workspace lock" },
      },
      {
        type: "tool-output-available",
        toolCallId: "tool-1",
        globalSeq: 4,
        output: { hits: 3 },
      },
      { type: "text-start", globalSeq: 5 },
      {
        type: "text-delta",
        delta: "Found 3 matching documents about workspace lock.",
        globalSeq: 6,
      },
      { type: "text-end", globalSeq: 7 },
    ],
    expectations: {
      output: {
        mustContain: ["Found 3 matching documents"],
        mustNotContain: ["NO_REPLY"],
      },
      trajectory: {
        tools: ["searchDocs"],
        mode: "strict",
      },
      trace: {
        requiredTypes: ["tool-input-start", "tool-output-available", "text-delta"],
        requireMonotonicGlobalSeq: true,
      },
    },
  },
  // Case 2: two sequential tool calls (spawn a subagent, then collect its
  // result) before the final text answer.
  {
    id: "subagent-roundtrip",
    description:
      "subagent spawn followed by completion announcement is preserved in final response",
    critical: true,
    events: [
      { type: "user-message", id: "u-2", text: "Research customer timeline", globalSeq: 1 },
      { type: "tool-input-start", toolCallId: "tool-2", toolName: "spawnSubagent", globalSeq: 2 },
      {
        type: "tool-output-available",
        toolCallId: "tool-2",
        globalSeq: 3,
        output: { childSessionKey: "sub:abc", task: "timeline research" },
      },
      {
        type: "tool-input-start",
        toolCallId: "tool-3",
        toolName: "collectSubagentResult",
        globalSeq: 4,
      },
      {
        type: "tool-output-available",
        toolCallId: "tool-3",
        globalSeq: 5,
        output: { summary: "Key milestones collected." },
      },
      { type: "text-start", globalSeq: 6 },
      {
        type: "text-delta",
        delta: "Subagent finished and reported back with key milestones.",
        globalSeq: 7,
      },
      { type: "text-end", globalSeq: 8 },
    ],
    expectations: {
      output: {
        mustContain: ["Subagent finished and reported back"],
      },
      trajectory: {
        tools: ["spawnSubagent", "collectSubagentResult"],
        mode: "strict",
      },
      trace: {
        requiredTypes: ["tool-input-start", "tool-output-available", "text-delta"],
        requireMonotonicGlobalSeq: true,
      },
    },
  },
  // Case 3: non-critical CRM flow. Unlike the two cases above, this stream has
  // no `text-start` / `text-end` events around its single `text-delta`; the
  // trace expectation below does not require them.
  {
    id: "crm-view-mutation",
    description:
      "CRM mutation path applies object view metadata and confirms result in assistant output",
    critical: false,
    events: [
      { type: "user-message", id: "u-3", text: "Set Important as active leads view", globalSeq: 1 },
      { type: "tool-input-start", toolCallId: "tool-4", toolName: "saveObjectViews", globalSeq: 2 },
      {
        type: "tool-output-available",
        toolCallId: "tool-4",
        globalSeq: 3,
        output: { object: "leads", activeView: "Important" },
      },
      {
        type: "tool-input-start",
        toolCallId: "tool-5",
        toolName: "fetchObjectEntries",
        globalSeq: 4,
      },
      {
        type: "tool-output-available",
        toolCallId: "tool-5",
        globalSeq: 5,
        output: { filteredRows: 12 },
      },
      {
        type: "text-delta",
        delta: "Active view 'Important' is now applied to leads.",
        globalSeq: 6,
      },
    ],
    expectations: {
      output: {
        mustContain: ["Active view 'Important' is now applied"],
      },
      trajectory: {
        tools: ["saveObjectViews", "fetchObjectEntries"],
        mode: "strict",
      },
      trace: {
        requiredTypes: ["tool-input-start", "tool-output-available", "text-delta"],
        requireMonotonicGlobalSeq: true,
      },
    },
  },
];
// Unit tests for the three graders, driven by the first dataset case
// ("tool-call-success") and two deliberately broken variants of it.
describe("eval graders", () => {
  it("passes output/trajectory/trace graders for baseline success case", () => {
    const successCase = chatAgentEvalDataset[0];
    const outputVerdict = gradeOutput(successCase);
    expect(outputVerdict.passed).toBe(true);
    const trajectoryVerdict = gradeTrajectory(successCase);
    expect(trajectoryVerdict.passed).toBe(true);
    const traceVerdict = gradeTraceIntegrity(successCase);
    expect(traceVerdict.passed).toBe(true);
  });

  it("fails trajectory grader when tool order does not match strict expectation", () => {
    // The baseline expects exactly ["searchDocs"]; this stream calls a different tool.
    const wrongToolEvents = [
      {
        type: "tool-input-start",
        toolCallId: "tool-other",
        toolName: "saveFile",
        globalSeq: 1,
      },
    ];
    const wrongToolCase = { ...chatAgentEvalDataset[0], events: wrongToolEvents };

    const verdict = gradeTrajectory(wrongToolCase);

    expect(verdict.passed).toBe(false);
    expect(verdict.details.join(" ")).toContain("trajectory mismatch");
  });

  it("fails trace grader for duplicate globalSeq and missing tool terminal events", () => {
    // Two events share globalSeq 4, and tool-1 never receives an output event.
    const corruptedEvents = [
      { type: "tool-input-start", toolCallId: "tool-1", toolName: "searchDocs", globalSeq: 4 },
      { type: "text-delta", delta: "partial", globalSeq: 4 },
    ];
    const corruptedCase = { ...chatAgentEvalDataset[0], events: corruptedEvents };

    const verdict = gradeTraceIntegrity(corruptedCase);
    const mentions = (fragment: string): boolean =>
      verdict.details.some((line) => line.includes(fragment));

    expect(verdict.passed).toBe(false);
    expect(mentions("duplicate globalSeq")).toBe(true);
    expect(mentions("without terminal output")).toBe(true);
  });
});
/**
 * Joins the assistant's visible text from an event stream.
 *
 * Collects the `delta` of every `text-delta` event and the `text` of every
 * `text` event (string payloads only), concatenates them in stream order and
 * trims surrounding whitespace.
 */
function collectAssistantText(events: EvalEvent[]): string {
  const chunks: string[] = [];
  for (const event of events) {
    if (event.type === "text-delta" && typeof event.delta === "string") {
      chunks.push(event.delta);
    } else if (event.type === "text" && typeof event.text === "string") {
      chunks.push(event.text);
    }
  }
  return chunks.join("").trim();
}

/**
 * Returns the tool names of all `tool-input-start` events, in stream order.
 * Events with a missing or empty `toolName` are skipped.
 */
function collectToolTrajectory(events: EvalEvent[]): string[] {
  const tools: string[] = [];
  for (const event of events) {
    if (event.type === "tool-input-start" && event.toolName) {
      tools.push(event.toolName);
    }
  }
  return tools;
}

/** True when both arrays have the same length and identical items at every index. */
function arraysEqual(a: string[], b: string[]): boolean {
  return a.length === b.length && a.every((item, index) => item === b[index]);
}

/**
 * True when every item of `needle` appears in `haystack` in the same relative
 * order (not necessarily contiguously). An empty `needle` always matches.
 */
function includesSubsequence(haystack: string[], needle: string[]): boolean {
  if (needle.length === 0) {
    return true;
  }
  let matched = 0;
  for (let i = 0; i < haystack.length && matched < needle.length; i += 1) {
    if (haystack[i] === needle[matched]) {
      matched += 1;
    }
  }
  return matched === needle.length;
}

/**
 * Compares the expected tool sequence with the observed one.
 *
 * - `strict`: same tools, same order, same length.
 * - `subset`: `actual` is an in-order subsequence of `expected`.
 * - `superset`: `expected` is an in-order subsequence of `actual`.
 * - `unordered`: same tools with the same multiplicities, order ignored.
 *
 * Any other mode yields `false`. Neither input array is mutated.
 */
function matchTrajectory(expected: string[], actual: string[], mode: TrajectoryMatchMode): boolean {
  switch (mode) {
    case "strict":
      return arraysEqual(expected, actual);
    case "subset":
      return includesSubsequence(expected, actual);
    case "superset":
      return includesSubsequence(actual, expected);
    case "unordered":
      // toSorted() already returns a fresh array, so no defensive spread is needed.
      return arraysEqual(expected.toSorted(), actual.toSorted());
    default:
      return false;
  }
}

/**
 * Grades the assistant's final text against the case's `output` expectation.
 *
 * Fails when the collected text is empty, when any `mustContain` fragment is
 * absent, or when any `mustNotContain` fragment is present. Passes trivially
 * when the case configures no output expectation.
 *
 * @param testCase - Eval case supplying `events` and `expectations.output`.
 * @returns A result for the "output" grader; `details` lists every failure, or
 *   a single confirmation line on success.
 */
export function gradeOutput(testCase: EvalCase): GradeResult {
  const details: string[] = [];
  const expectation = testCase.expectations.output;
  if (!expectation) {
    return { grader: "output", passed: true, details: ["no output expectation configured"] };
  }

  const text = collectAssistantText(testCase.events);
  if (!text) {
    details.push("assistant text was empty");
  }

  for (const required of expectation.mustContain ?? []) {
    if (!text.includes(required)) {
      details.push(`missing required output fragment: "${required}"`);
    }
  }

  for (const forbidden of expectation.mustNotContain ?? []) {
    if (text.includes(forbidden)) {
      details.push(`found forbidden output fragment: "${forbidden}"`);
    }
  }

  return {
    grader: "output",
    passed: details.length === 0,
    details: details.length > 0 ? details : ["output matched expectations"],
  };
}
/**
 * Grades the order of tool calls in a case's event stream against its
 * `trajectory` expectation.
 *
 * The observed tool sequence is read from the event stream and compared with
 * the expected tools under the configured match mode. A case without a
 * trajectory expectation passes trivially.
 *
 * @param testCase - Eval case supplying `events` and `expectations.trajectory`.
 * @returns A result for the "trajectory" grader; on mismatch `details` holds a
 *   single line showing the mode plus the expected and actual sequences.
 */
export function gradeTrajectory(testCase: EvalCase): GradeResult {
  const { trajectory } = testCase.expectations;
  if (!trajectory) {
    return {
      grader: "trajectory",
      passed: true,
      details: ["no trajectory expectation configured"],
    };
  }

  const observedTools = collectToolTrajectory(testCase.events);
  const isMatch = matchTrajectory(trajectory.tools, observedTools, trajectory.mode);

  if (isMatch) {
    return {
      grader: "trajectory",
      passed: isMatch,
      details: ["trajectory matched expectation"],
    };
  }

  const mismatch = `trajectory mismatch for mode=${trajectory.mode}: expected=${JSON.stringify(trajectory.tools)} actual=${JSON.stringify(observedTools)}`;
  return {
    grader: "trajectory",
    passed: isMatch,
    details: [mismatch],
  };
}
/**
 * Grades the structural integrity of a case's event stream against its
 * `trace` expectation.
 *
 * Checks, in this order:
 * 1. every type listed in `requiredTypes` occurs at least once;
 * 2. when `requireMonotonicGlobalSeq` is set, numeric `globalSeq` values are
 *    unique and strictly increasing (events without a numeric `globalSeq` are
 *    ignored);
 * 3. tool-call lifecycle: each `tool-input-start` is followed by exactly one
 *    terminal event (`tool-output-available` or `tool-output-error`) for the
 *    same `toolCallId`, and no terminal event arrives before its start.
 *
 * A case without a trace expectation passes trivially. The lifecycle check
 * runs whenever a trace expectation exists, regardless of its flags.
 *
 * @param testCase - Eval case supplying `events` and `expectations.trace`.
 * @returns A result for the "trace" grader; `details` lists every violation,
 *   or a single confirmation line on success.
 */
export function gradeTraceIntegrity(testCase: EvalCase): GradeResult {
  const details: string[] = [];
  const expectation = testCase.expectations.trace;

  if (!expectation) {
    return { grader: "trace", passed: true, details: ["no trace expectation configured"] };
  }

  const requiredTypes = expectation.requiredTypes ?? [];
  const eventTypes = new Set(testCase.events.map((event) => event.type));
  for (const requiredType of requiredTypes) {
    if (!eventTypes.has(requiredType)) {
      details.push(`missing required event type: ${requiredType}`);
    }
  }

  if (expectation.requireMonotonicGlobalSeq) {
    // `undefined` means "no numbered event seen yet". A numeric sentinel such
    // as -1 would flag a stream whose first globalSeq is <= -1 as a regression
    // even though there is no previous event to regress from.
    let lastSeq: number | undefined;
    const seen = new Set<number>();
    for (const event of testCase.events) {
      const seq = event.globalSeq;
      if (typeof seq !== "number") {
        continue;
      }
      if (seen.has(seq)) {
        details.push(`duplicate globalSeq detected: ${seq}`);
      }
      seen.add(seq);
      if (lastSeq !== undefined && seq <= lastSeq) {
        details.push(`non-monotonic globalSeq transition: ${lastSeq} -> ${seq}`);
      }
      lastSeq = seq;
    }
  }

  // toolCallId -> number of terminal events seen. An id is present in the map
  // only once its `tool-input-start` has been observed.
  const terminalCounts = new Map<string, number>();
  for (const event of testCase.events) {
    const toolCallId = event.toolCallId;
    if (!toolCallId) {
      continue;
    }

    if (event.type === "tool-input-start") {
      // A repeated start for the same id keeps the existing count.
      if (!terminalCounts.has(toolCallId)) {
        terminalCounts.set(toolCallId, 0);
      }
      continue;
    }

    if (event.type === "tool-output-available" || event.type === "tool-output-error") {
      const count = terminalCounts.get(toolCallId);
      if (count === undefined) {
        details.push(`terminal tool event without start for ${toolCallId}`);
        continue;
      }
      terminalCounts.set(toolCallId, count + 1);
    }
  }

  for (const [toolCallId, count] of terminalCounts) {
    if (count === 0) {
      details.push(`tool call without terminal output: ${toolCallId}`);
    }
    if (count > 1) {
      details.push(`tool call produced multiple terminal outputs: ${toolCallId}`);
    }
  }

  return {
    grader: "trace",
    passed: details.length === 0,
    details: details.length > 0 ? details : ["trace integrity checks passed"],
  };
}