feat(evals): add eval framework with dataset, graders, and regression report

This commit is contained in:
kumarabhirup 2026-03-02 18:36:40 -08:00
parent ebde96c11b
commit 1deeb5878e
No known key found for this signature in database
GPG Key ID: DB7CA2289CAB0167
5 changed files with 484 additions and 0 deletions

26
test/evals/README.md Normal file
View File

@ -0,0 +1,26 @@
# Chat-Agent Evals
This folder contains a dataset-driven eval harness for web chat-agent behavior.
## What is graded
- `output`: final assistant text quality checks (`mustContain`, `mustNotContain`)
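- `trajectory`: tool-call sequence checks against the expected tool list (`strict`: exact match; `subset`: the actual calls form an in-order subsequence of the expected list; `superset`: the expected list is an in-order subsequence of the actual calls; `unordered`: the same tools in any order)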
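- `trace`: event-stream integrity checks (required event types, strictly increasing `globalSeq` with no duplicates,
and tool-call lifecycle completeness: every started tool call ends with exactly one terminal output or error event)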
## Run
```bash
pnpm test:evals
```
This mode is informational and prints a full summary.
## Enforce critical checks
```bash
pnpm test:evals:enforce
```
When `EVALS_ENFORCE=1`, the runner exits non-zero if any **critical** eval case fails.

View File

@ -0,0 +1,72 @@
# Regression Verification Report
Date: 2026-03-02
## Stability Loops
### Web interaction regressions
Command:
```bash
for i in {1..20}; do
pnpm vitest run app/workspace/object-view-active-view.test.ts app/components/workspace/object-filter-bar.test.tsx --silent
done
```
Result: **20/20 passes** (no flaky failures observed).
### CLI edge-case regressions
Command:
```bash
for i in {1..20}; do
../../node_modules/.bin/vitest run --config vitest.config.ts argv.test.ts run-main.test.ts windows-argv.test.ts --silent
done
```
Result: **20/20 passes** (no flaky failures observed).
## Mutation Probes (Manual)
The following intentional mutations were applied, tested, and reverted:
1. `apps/web/app/workspace/object-view-active-view.ts`
- Mutation: `shouldApply` reduced to `nameMismatch` only.
- Expected break: initial-load active view bug returns.
- Caught by: `app/workspace/object-view-active-view.test.ts` (`applies active view on initial load ...`).
- Outcome: **failed as expected**, then reverted.
2. `apps/web/app/components/chat-panel.tsx`
- Mutation: tool error state changed from `"error"` to `"output-available"`.
- Expected break: tool terminal state classification regression.
- Caught by: `app/components/chat-panel.stream-parser.test.ts` (`accumulates tool input/output ...`).
- Outcome: **failed as expected**, then reverted.
3. `src/cli/run-main.ts`
- Mutation: disable-delegation branch changed to incorrectly return `true`.
- Expected break: delegation safety bypass.
- Caught by: `src/cli/run-main.test.ts` (`disables delegation when explicit env disable flag is set`).
- Outcome: **failed as expected**, then reverted.
4. `test/evals/graders.ts`
- Mutation: duplicate `globalSeq` detection removed.
- Expected break: trace-integrity duplicate-sequence protection.
- Caught by: `test/evals/graders.test.ts` (`fails trace grader for duplicate globalSeq ...`).
- Outcome: **failed as expected**, then reverted.
## Final Verification Runs
- Web interaction suite: pass (`34` tests)
- CLI edge-case suite: pass (`42` tests)
- Eval grader unit suite: pass (`3` tests)
- Eval harness:
  - `pnpm test:evals`: pass (`9/9` grader checks across `3` cases)
  - `pnpm test:evals:enforce`: pass (all `critical` cases passing)
## Residual Risks
- Some large end-to-end UI flows (`workspace/page.tsx` URL orchestration and multi-panel state transitions) still rely on unit/component-level coverage rather than browser E2E.
- The eval dataset currently includes representative high-risk cases, but it does not yet include production trace replay at scale.
- Additional CRM mutation permutations (large view/filter combinations) can be added to the eval dataset over time.

140
test/evals/dataset.ts Normal file
View File

@ -0,0 +1,140 @@
import type { EvalCase } from "./types.js";
/**
 * Hand-written eval cases for the chat agent.
 *
 * Each case pairs a recorded event stream (`events`) with the expectations
 * that the output, trajectory, and trace graders check against it. Every
 * event carries a `globalSeq` that increases by one within its case.
 */
export const chatAgentEvalDataset: EvalCase[] = [
// Case 1: one tool call (`searchDocs`) followed by a framed text answer.
{
id: "tool-call-success",
description: "single-tool answer includes expected retrieval summary",
critical: true,
events: [
{ type: "user-message", id: "u-1", text: "Find docs for workspace lock", globalSeq: 1 },
{ type: "tool-input-start", toolCallId: "tool-1", toolName: "searchDocs", globalSeq: 2 },
{
type: "tool-input-available",
toolCallId: "tool-1",
globalSeq: 3,
// NOTE(review): this is an *input* event but its payload is stored under
// `output` — confirm against the EvalEvent type whether a dedicated input
// field was intended.
output: { query: "workspace lock" },
},
{
type: "tool-output-available",
toolCallId: "tool-1",
globalSeq: 4,
output: { hits: 3 },
},
{ type: "text-start", globalSeq: 5 },
{
type: "text-delta",
delta: "Found 3 matching documents about workspace lock.",
globalSeq: 6,
},
{ type: "text-end", globalSeq: 7 },
],
expectations: {
output: {
mustContain: ["Found 3 matching documents"],
mustNotContain: ["NO_REPLY"],
},
trajectory: {
tools: ["searchDocs"],
mode: "strict",
},
trace: {
requiredTypes: ["tool-input-start", "tool-output-available", "text-delta"],
requireMonotonicGlobalSeq: true,
},
},
},
// Case 2: two sequential tool calls (`spawnSubagent`, then
// `collectSubagentResult`) followed by a framed text answer.
{
id: "subagent-roundtrip",
description:
"subagent spawn followed by completion announcement is preserved in final response",
critical: true,
events: [
{ type: "user-message", id: "u-2", text: "Research customer timeline", globalSeq: 1 },
{ type: "tool-input-start", toolCallId: "tool-2", toolName: "spawnSubagent", globalSeq: 2 },
{
type: "tool-output-available",
toolCallId: "tool-2",
globalSeq: 3,
output: { childSessionKey: "sub:abc", task: "timeline research" },
},
{
type: "tool-input-start",
toolCallId: "tool-3",
toolName: "collectSubagentResult",
globalSeq: 4,
},
{
type: "tool-output-available",
toolCallId: "tool-3",
globalSeq: 5,
output: { summary: "Key milestones collected." },
},
{ type: "text-start", globalSeq: 6 },
{
type: "text-delta",
delta: "Subagent finished and reported back with key milestones.",
globalSeq: 7,
},
{ type: "text-end", globalSeq: 8 },
],
expectations: {
output: {
mustContain: ["Subagent finished and reported back"],
},
trajectory: {
tools: ["spawnSubagent", "collectSubagentResult"],
mode: "strict",
},
trace: {
requiredTypes: ["tool-input-start", "tool-output-available", "text-delta"],
requireMonotonicGlobalSeq: true,
},
},
},
// Case 3 (non-critical): two sequential tool calls (`saveObjectViews`, then
// `fetchObjectEntries`) followed by a single `text-delta`. Unlike the two
// cases above, this stream has no `text-start`/`text-end` events.
{
id: "crm-view-mutation",
description:
"CRM mutation path applies object view metadata and confirms result in assistant output",
critical: false,
events: [
{ type: "user-message", id: "u-3", text: "Set Important as active leads view", globalSeq: 1 },
{ type: "tool-input-start", toolCallId: "tool-4", toolName: "saveObjectViews", globalSeq: 2 },
{
type: "tool-output-available",
toolCallId: "tool-4",
globalSeq: 3,
output: { object: "leads", activeView: "Important" },
},
{
type: "tool-input-start",
toolCallId: "tool-5",
toolName: "fetchObjectEntries",
globalSeq: 4,
},
{
type: "tool-output-available",
toolCallId: "tool-5",
globalSeq: 5,
output: { filteredRows: 12 },
},
{
type: "text-delta",
delta: "Active view 'Important' is now applied to leads.",
globalSeq: 6,
},
],
expectations: {
output: {
mustContain: ["Active view 'Important' is now applied"],
},
trajectory: {
tools: ["saveObjectViews", "fetchObjectEntries"],
mode: "strict",
},
trace: {
requiredTypes: ["tool-input-start", "tool-output-available", "text-delta"],
requireMonotonicGlobalSeq: true,
},
},
},
];

View File

@ -0,0 +1,43 @@
import { describe, expect, it } from "vitest";
import { chatAgentEvalDataset } from "./dataset.js";
import { gradeOutput, gradeTraceIntegrity, gradeTrajectory } from "./graders.js";
// Unit tests for the three graders. Each test starts from the first dataset
// case ("tool-call-success") and, where needed, swaps in a synthetic event
// list while keeping that case's expectations via object spread.
describe("eval graders", () => {
it("passes output/trajectory/trace graders for baseline success case", () => {
// The unmodified baseline case must satisfy all three graders.
const baseline = chatAgentEvalDataset[0];
expect(gradeOutput(baseline).passed).toBe(true);
expect(gradeTrajectory(baseline).passed).toBe(true);
expect(gradeTraceIntegrity(baseline).passed).toBe(true);
});
it("fails trajectory grader when tool order does not match strict expectation", () => {
// The baseline expects exactly ["searchDocs"] in strict mode; a lone
// `saveFile` call therefore has to be reported as a mismatch.
const mismatched = {
...chatAgentEvalDataset[0],
events: [
{
type: "tool-input-start",
toolCallId: "tool-other",
toolName: "saveFile",
globalSeq: 1,
},
],
};
const result = gradeTrajectory(mismatched);
expect(result.passed).toBe(false);
expect(result.details.join(" ")).toContain("trajectory mismatch");
});
it("fails trace grader for duplicate globalSeq and missing tool terminal events", () => {
// Two defects in one stream: both events share globalSeq 4, and tool-1 is
// started but never receives a terminal output/error event.
const brokenTrace = {
...chatAgentEvalDataset[0],
events: [
{ type: "tool-input-start", toolCallId: "tool-1", toolName: "searchDocs", globalSeq: 4 },
{ type: "text-delta", delta: "partial", globalSeq: 4 },
],
};
const result = gradeTraceIntegrity(brokenTrace);
expect(result.passed).toBe(false);
// Each defect must surface as its own detail message.
expect(result.details.some((detail) => detail.includes("duplicate globalSeq"))).toBe(true);
expect(result.details.some((detail) => detail.includes("without terminal output"))).toBe(true);
});
});

203
test/evals/graders.ts Normal file
View File

@ -0,0 +1,203 @@
import type { EvalCase, EvalEvent, GradeResult, TrajectoryMatchMode } from "./types.js";
/**
 * Concatenates the assistant text carried by an event stream.
 *
 * Only `text-delta` events with a string `delta` and `text` events with a
 * string `text` contribute. Fragments are joined in event order without a
 * separator, and the joined result is trimmed.
 *
 * @param events - Events to scan, in stream order.
 * @returns The joined text with leading and trailing whitespace removed.
 */
function collectAssistantText(events: EvalEvent[]): string {
  const fragments = events.flatMap((event): string[] => {
    switch (event.type) {
      case "text-delta":
        return typeof event.delta === "string" ? [event.delta] : [];
      case "text":
        return typeof event.text === "string" ? [event.text] : [];
      default:
        return [];
    }
  });
  return fragments.join("").trim();
}
/**
 * Extracts the ordered list of tool names invoked in an event stream.
 *
 * A tool counts as invoked for every `tool-input-start` event that has a
 * truthy `toolName`; all other events are ignored.
 *
 * @param events - Events to scan, in stream order.
 * @returns Tool names in the order their start events appear.
 */
function collectToolTrajectory(events: EvalEvent[]): string[] {
  return events.flatMap((event) =>
    event.type === "tool-input-start" && event.toolName ? [event.toolName] : [],
  );
}
/**
 * Reports whether two string arrays hold the same values in the same order.
 *
 * @param a - First array.
 * @param b - Second array.
 * @returns `true` when lengths match and every position compares `===`.
 */
function arraysEqual(a: string[], b: string[]): boolean {
  if (a.length !== b.length) {
    return false;
  }
  for (const [position, value] of a.entries()) {
    if (value !== b[position]) {
      return false;
    }
  }
  return true;
}
/**
 * Reports whether `needle` occurs inside `haystack` as an in-order (not
 * necessarily contiguous) subsequence.
 *
 * An empty `needle` is always considered present.
 *
 * @param haystack - Sequence to search.
 * @param needle - Values that must appear in `haystack` in this order.
 * @returns `true` when every needle element was found in order.
 */
function includesSubsequence(haystack: string[], needle: string[]): boolean {
  // Index of the next needle element still waiting for a match.
  let cursor = 0;
  for (const candidate of haystack) {
    if (cursor === needle.length) {
      break;
    }
    if (candidate === needle[cursor]) {
      cursor += 1;
    }
  }
  return cursor === needle.length;
}
/**
 * Compares the actual tool-call sequence with the expected one under the
 * given match mode.
 *
 * - `strict`: both sequences are identical.
 * - `subset`: `actual` is an in-order subsequence of `expected`.
 * - `superset`: `expected` is an in-order subsequence of `actual`.
 * - `unordered`: both sequences hold the same names once sorted.
 *
 * Any other mode value yields `false`.
 *
 * @param expected - Tool names the eval case expects.
 * @param actual - Tool names observed in the event stream.
 * @param mode - Matching strategy to apply.
 * @returns Whether `actual` satisfies `expected` under `mode`.
 */
function matchTrajectory(expected: string[], actual: string[], mode: TrajectoryMatchMode): boolean {
  if (mode === "strict") {
    return arraysEqual(expected, actual);
  }
  if (mode === "subset") {
    return includesSubsequence(expected, actual);
  }
  if (mode === "superset") {
    return includesSubsequence(actual, expected);
  }
  if (mode === "unordered") {
    // Sort copies so neither caller-owned array is reordered.
    return arraysEqual([...expected].sort(), [...actual].sort());
  }
  return false;
}
/**
 * Grades the assistant's final text against the case's output expectation.
 *
 * The case passes when the collected text is non-empty, contains every
 * `mustContain` fragment, and contains no `mustNotContain` fragment. A case
 * with no output expectation passes trivially.
 *
 * @param testCase - Eval case supplying the events and expectations.
 * @returns The grade, with one detail line per failed check or a single
 *          success message.
 */
export function gradeOutput(testCase: EvalCase): GradeResult {
  const expectation = testCase.expectations.output;
  if (!expectation) {
    return { grader: "output", passed: true, details: ["no output expectation configured"] };
  }

  const text = collectAssistantText(testCase.events);

  const missingFragments = (expectation.mustContain ?? [])
    .filter((fragment) => !text.includes(fragment))
    .map((fragment) => `missing required output fragment: "${fragment}"`);
  const forbiddenFragments = (expectation.mustNotContain ?? [])
    .filter((fragment) => text.includes(fragment))
    .map((fragment) => `found forbidden output fragment: "${fragment}"`);

  // Order matters to readers of the report: emptiness, then missing, then forbidden.
  const failures: string[] = [
    ...(text ? [] : ["assistant text was empty"]),
    ...missingFragments,
    ...forbiddenFragments,
  ];

  const passed = failures.length === 0;
  return {
    grader: "output",
    passed,
    details: passed ? ["output matched expectations"] : failures,
  };
}
/**
 * Grades the observed tool-call sequence against the case's trajectory
 * expectation.
 *
 * A case with no trajectory expectation passes trivially. On a mismatch the
 * single detail line names the mode and shows both sequences as JSON.
 *
 * @param testCase - Eval case supplying the events and expectations.
 * @returns The grade for the trajectory check.
 */
export function gradeTrajectory(testCase: EvalCase): GradeResult {
  const expectation = testCase.expectations.trajectory;
  if (!expectation) {
    return {
      grader: "trajectory",
      passed: true,
      details: ["no trajectory expectation configured"],
    };
  }

  const { tools, mode } = expectation;
  const actual = collectToolTrajectory(testCase.events);

  if (matchTrajectory(tools, actual, mode)) {
    return {
      grader: "trajectory",
      passed: true,
      details: ["trajectory matched expectation"],
    };
  }

  return {
    grader: "trajectory",
    passed: false,
    details: [
      `trajectory mismatch for mode=${mode}: expected=${JSON.stringify(tools)} actual=${JSON.stringify(actual)}`,
    ],
  };
}
/**
 * Grades the structural integrity of a case's event stream.
 *
 * Three independent checks run, and every violation adds one detail line:
 *
 * 1. Each type in `requiredTypes` appears at least once in the stream.
 * 2. When `requireMonotonicGlobalSeq` is set, numeric `globalSeq` values are
 *    strictly increasing and never repeat. Events without a numeric
 *    `globalSeq` are skipped by this check.
 * 3. Every tool call that has a `tool-input-start` event ends with exactly
 *    one terminal event (`tool-output-available` or `tool-output-error`),
 *    and no terminal event appears for a call that has not started. This
 *    check always runs once a trace expectation exists.
 *
 * A case with no trace expectation passes trivially.
 *
 * @param testCase - Eval case supplying the events and expectations.
 * @returns The grade, with one detail line per violation or a single
 *          success message.
 */
export function gradeTraceIntegrity(testCase: EvalCase): GradeResult {
  const details: string[] = [];
  const expectation = testCase.expectations.trace;
  if (!expectation) {
    return { grader: "trace", passed: true, details: ["no trace expectation configured"] };
  }

  const eventTypes = new Set(testCase.events.map((event) => event.type));
  for (const requiredType of expectation.requiredTypes ?? []) {
    if (!eventTypes.has(requiredType)) {
      details.push(`missing required event type: ${requiredType}`);
    }
  }

  if (expectation.requireMonotonicGlobalSeq) {
    // `undefined` means "no previous value yet". A numeric sentinel such as -1
    // would wrongly flag streams whose first globalSeq is <= -1.
    let lastSeq: number | undefined;
    const seen = new Set<number>();
    for (const event of testCase.events) {
      const seq = event.globalSeq;
      if (typeof seq !== "number") {
        continue;
      }
      if (seen.has(seq)) {
        details.push(`duplicate globalSeq detected: ${seq}`);
      }
      seen.add(seq);
      if (lastSeq !== undefined && seq <= lastSeq) {
        details.push(`non-monotonic globalSeq transition: ${lastSeq} -> ${seq}`);
      }
      lastSeq = seq;
    }
  }

  // Terminal-event count per started tool call. A key exists only once the
  // call's `tool-input-start` has been seen, so presence means "started".
  const terminalCounts = new Map<string, number>();
  for (const event of testCase.events) {
    const toolCallId = event.toolCallId;
    if (!toolCallId) {
      continue;
    }
    if (event.type === "tool-input-start") {
      if (!terminalCounts.has(toolCallId)) {
        terminalCounts.set(toolCallId, 0);
      }
      continue;
    }
    if (event.type === "tool-output-available" || event.type === "tool-output-error") {
      const count = terminalCounts.get(toolCallId);
      if (count === undefined) {
        details.push(`terminal tool event without start for ${toolCallId}`);
        continue;
      }
      terminalCounts.set(toolCallId, count + 1);
    }
  }

  for (const [toolCallId, count] of terminalCounts) {
    if (count === 0) {
      details.push(`tool call without terminal output: ${toolCallId}`);
    }
    if (count > 1) {
      details.push(`tool call produced multiple terminal outputs: ${toolCallId}`);
    }
  }

  return {
    grader: "trace",
    passed: details.length === 0,
    details: details.length > 0 ? details : ["trace integrity checks passed"],
  };
}