feat(evals): add eval framework with dataset, graders, and regression report
This commit is contained in:
parent
ebde96c11b
commit
1deeb5878e
26
test/evals/README.md
Normal file
26
test/evals/README.md
Normal file
@ -0,0 +1,26 @@
|
||||
# Chat-Agent Evals
|
||||
|
||||
This folder contains a dataset-driven eval harness for web chat-agent behavior.
|
||||
|
||||
## What is graded
|
||||
|
||||
- `output`: final assistant text quality checks (`mustContain`, `mustNotContain`)
|
||||
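- `trajectory`: tool-call sequence checks against the expected tool list (`strict`: exact match; `subset`: the actual calls form an ordered subsequence of the expected list; `superset`: the expected list forms an ordered subsequence of the actual calls; `unordered`: same tools regardless of order)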
|
||||
- `trace`: event-stream integrity checks (required event types, monotonic `globalSeq`,
|
||||
and tool-call lifecycle completeness)
|
||||
|
||||
## Run
|
||||
|
||||
```bash
|
||||
pnpm test:evals
|
||||
```
|
||||
|
||||
This mode is informational and prints a full summary.
|
||||
|
||||
## Enforce critical checks
|
||||
|
||||
```bash
|
||||
pnpm test:evals:enforce
|
||||
```
|
||||
|
||||
When `EVALS_ENFORCE=1`, the runner exits non-zero if any **critical** eval case fails.
|
||||
72
test/evals/REGRESSION_VERIFICATION_REPORT.md
Normal file
72
test/evals/REGRESSION_VERIFICATION_REPORT.md
Normal file
@ -0,0 +1,72 @@
|
||||
# Regression Verification Report
|
||||
|
||||
Date: 2026-03-02
|
||||
|
||||
## Stability Loops
|
||||
|
||||
### Web interaction regressions
|
||||
|
||||
Command:
|
||||
|
||||
```bash
|
||||
for i in {1..20}; do
|
||||
pnpm vitest run app/workspace/object-view-active-view.test.ts app/components/workspace/object-filter-bar.test.tsx --silent
|
||||
done
|
||||
```
|
||||
|
||||
Result: **20/20 passes** (no flaky failures observed).
|
||||
|
||||
### CLI edge-case regressions
|
||||
|
||||
Command:
|
||||
|
||||
```bash
|
||||
for i in {1..20}; do
|
||||
../../node_modules/.bin/vitest run --config vitest.config.ts argv.test.ts run-main.test.ts windows-argv.test.ts --silent
|
||||
done
|
||||
```
|
||||
|
||||
Result: **20/20 passes** (no flaky failures observed).
|
||||
|
||||
## Mutation Probes (Manual)
|
||||
|
||||
The following intentional mutations were applied, tested, and reverted:
|
||||
|
||||
1. `apps/web/app/workspace/object-view-active-view.ts`
|
||||
- Mutation: `shouldApply` reduced to `nameMismatch` only.
|
||||
- Expected break: initial-load active view bug returns.
|
||||
- Caught by: `app/workspace/object-view-active-view.test.ts` (`applies active view on initial load ...`).
|
||||
- Outcome: **failed as expected**, then reverted.
|
||||
|
||||
2. `apps/web/app/components/chat-panel.tsx`
|
||||
- Mutation: tool error state changed from `"error"` to `"output-available"`.
|
||||
- Expected break: tool terminal state classification regression.
|
||||
- Caught by: `app/components/chat-panel.stream-parser.test.ts` (`accumulates tool input/output ...`).
|
||||
- Outcome: **failed as expected**, then reverted.
|
||||
|
||||
3. `src/cli/run-main.ts`
|
||||
- Mutation: disable-delegation branch changed to incorrectly return `true`.
|
||||
- Expected break: delegation safety bypass.
|
||||
- Caught by: `src/cli/run-main.test.ts` (`disables delegation when explicit env disable flag is set`).
|
||||
- Outcome: **failed as expected**, then reverted.
|
||||
|
||||
4. `test/evals/graders.ts`
|
||||
- Mutation: duplicate `globalSeq` detection removed.
|
||||
- Expected break: trace-integrity duplicate-sequence protection.
|
||||
- Caught by: `test/evals/graders.test.ts` (`fails trace grader for duplicate globalSeq ...`).
|
||||
- Outcome: **failed as expected**, then reverted.
|
||||
|
||||
## Final Verification Runs
|
||||
|
||||
- Web interaction suite: pass (`34` tests)
|
||||
- CLI edge-case suite: pass (`42` tests)
|
||||
- Eval grader unit suite: pass (`3` tests)
|
||||
- Eval harness:
|
||||
- `pnpm test:evals`: pass (`9/9` graders across `3` cases)
|
||||
- `pnpm test:evals:enforce`: pass (`critical` cases all passing)
|
||||
|
||||
## Residual Risks
|
||||
|
||||
- Some large end-to-end UI flows (`workspace/page.tsx` URL orchestration and multi-panel state transitions) still rely on unit/component-level coverage rather than browser E2E.
|
||||
- The eval dataset currently covers representative high-risk cases but does not yet include production trace replay at scale.
|
||||
- Additional CRM mutation permutations (large view/filter combinations) can be added to the eval dataset over time.
|
||||
140
test/evals/dataset.ts
Normal file
140
test/evals/dataset.ts
Normal file
@ -0,0 +1,140 @@
|
||||
import type { EvalCase } from "./types.js";
|
||||
|
||||
/**
 * Hand-written eval cases for the web chat agent.
 *
 * Each case carries a recorded event stream plus the expectations that the
 * output, trajectory, and trace graders check against that stream.
 */
export const chatAgentEvalDataset: EvalCase[] = [
  // Case 1: one tool call followed by a streamed text answer.
  {
    id: "tool-call-success",
    description: "single-tool answer includes expected retrieval summary",
    critical: true,
    events: [
      { type: "user-message", id: "u-1", text: "Find docs for workspace lock", globalSeq: 1 },
      { type: "tool-input-start", toolCallId: "tool-1", toolName: "searchDocs", globalSeq: 2 },
      // NOTE(review): the tool arguments are carried under `output` here; confirm
      // against the EvalEvent type whether an `input` field was intended.
      { type: "tool-input-available", toolCallId: "tool-1", globalSeq: 3, output: { query: "workspace lock" } },
      { type: "tool-output-available", toolCallId: "tool-1", globalSeq: 4, output: { hits: 3 } },
      { type: "text-start", globalSeq: 5 },
      { type: "text-delta", delta: "Found 3 matching documents about workspace lock.", globalSeq: 6 },
      { type: "text-end", globalSeq: 7 },
    ],
    expectations: {
      output: { mustContain: ["Found 3 matching documents"], mustNotContain: ["NO_REPLY"] },
      trajectory: { tools: ["searchDocs"], mode: "strict" },
      trace: {
        requiredTypes: ["tool-input-start", "tool-output-available", "text-delta"],
        requireMonotonicGlobalSeq: true,
      },
    },
  },

  // Case 2: subagent spawn, result collection, then a streamed text answer.
  {
    id: "subagent-roundtrip",
    description: "subagent spawn followed by completion announcement is preserved in final response",
    critical: true,
    events: [
      { type: "user-message", id: "u-2", text: "Research customer timeline", globalSeq: 1 },
      { type: "tool-input-start", toolCallId: "tool-2", toolName: "spawnSubagent", globalSeq: 2 },
      {
        type: "tool-output-available",
        toolCallId: "tool-2",
        globalSeq: 3,
        output: { childSessionKey: "sub:abc", task: "timeline research" },
      },
      { type: "tool-input-start", toolCallId: "tool-3", toolName: "collectSubagentResult", globalSeq: 4 },
      {
        type: "tool-output-available",
        toolCallId: "tool-3",
        globalSeq: 5,
        output: { summary: "Key milestones collected." },
      },
      { type: "text-start", globalSeq: 6 },
      { type: "text-delta", delta: "Subagent finished and reported back with key milestones.", globalSeq: 7 },
      { type: "text-end", globalSeq: 8 },
    ],
    expectations: {
      output: { mustContain: ["Subagent finished and reported back"] },
      trajectory: { tools: ["spawnSubagent", "collectSubagentResult"], mode: "strict" },
      trace: {
        requiredTypes: ["tool-input-start", "tool-output-available", "text-delta"],
        requireMonotonicGlobalSeq: true,
      },
    },
  },

  // Case 3 (non-critical): two CRM tool calls, then a single text delta with
  // no surrounding text-start/text-end events.
  {
    id: "crm-view-mutation",
    description: "CRM mutation path applies object view metadata and confirms result in assistant output",
    critical: false,
    events: [
      { type: "user-message", id: "u-3", text: "Set Important as active leads view", globalSeq: 1 },
      { type: "tool-input-start", toolCallId: "tool-4", toolName: "saveObjectViews", globalSeq: 2 },
      {
        type: "tool-output-available",
        toolCallId: "tool-4",
        globalSeq: 3,
        output: { object: "leads", activeView: "Important" },
      },
      { type: "tool-input-start", toolCallId: "tool-5", toolName: "fetchObjectEntries", globalSeq: 4 },
      { type: "tool-output-available", toolCallId: "tool-5", globalSeq: 5, output: { filteredRows: 12 } },
      { type: "text-delta", delta: "Active view 'Important' is now applied to leads.", globalSeq: 6 },
    ],
    expectations: {
      output: { mustContain: ["Active view 'Important' is now applied"] },
      trajectory: { tools: ["saveObjectViews", "fetchObjectEntries"], mode: "strict" },
      trace: {
        requiredTypes: ["tool-input-start", "tool-output-available", "text-delta"],
        requireMonotonicGlobalSeq: true,
      },
    },
  },
];
|
||||
43
test/evals/graders.test.ts
Normal file
43
test/evals/graders.test.ts
Normal file
@ -0,0 +1,43 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { chatAgentEvalDataset } from "./dataset.js";
|
||||
import { gradeOutput, gradeTraceIntegrity, gradeTrajectory } from "./graders.js";
|
||||
|
||||
// Unit coverage for the three eval graders, using the first dataset case as
// the known-good baseline and hand-built broken variants of it.
describe("eval graders", () => {
  it("passes output/trajectory/trace graders for baseline success case", () => {
    const baseline = chatAgentEvalDataset[0];
    const outputResult = gradeOutput(baseline);
    const trajectoryResult = gradeTrajectory(baseline);
    const traceResult = gradeTraceIntegrity(baseline);

    expect(outputResult.passed).toBe(true);
    expect(trajectoryResult.passed).toBe(true);
    expect(traceResult.passed).toBe(true);
  });

  it("fails trajectory grader when tool order does not match strict expectation", () => {
    // Baseline expects ["searchDocs"]; the stream below only calls saveFile.
    const unexpectedToolCall = {
      type: "tool-input-start",
      toolCallId: "tool-other",
      toolName: "saveFile",
      globalSeq: 1,
    };
    const mismatched = { ...chatAgentEvalDataset[0], events: [unexpectedToolCall] };

    const { passed, details } = gradeTrajectory(mismatched);

    expect(passed).toBe(false);
    expect(details.join(" ")).toContain("trajectory mismatch");
  });

  it("fails trace grader for duplicate globalSeq and missing tool terminal events", () => {
    // Both events share globalSeq 4, and tool-1 never receives a terminal event.
    const brokenTrace = {
      ...chatAgentEvalDataset[0],
      events: [
        { type: "tool-input-start", toolCallId: "tool-1", toolName: "searchDocs", globalSeq: 4 },
        { type: "text-delta", delta: "partial", globalSeq: 4 },
      ],
    };

    const { passed, details } = gradeTraceIntegrity(brokenTrace);
    const mentions = (fragment: string): boolean =>
      details.some((detail) => detail.includes(fragment));

    expect(passed).toBe(false);
    expect(mentions("duplicate globalSeq")).toBe(true);
    expect(mentions("without terminal output")).toBe(true);
  });
});
|
||||
203
test/evals/graders.ts
Normal file
203
test/evals/graders.ts
Normal file
@ -0,0 +1,203 @@
|
||||
import type { EvalCase, EvalEvent, GradeResult, TrajectoryMatchMode } from "./types.js";
|
||||
|
||||
/**
 * Builds the assistant's text from an event stream.
 *
 * Appends the `delta` of each `text-delta` event and the `text` of each
 * `text` event (string values only) in stream order with no separator, then
 * trims surrounding whitespace. All other events are ignored.
 */
function collectAssistantText(events: EvalEvent[]): string {
  let combined = "";
  for (const evt of events) {
    switch (evt.type) {
      case "text-delta":
        if (typeof evt.delta === "string") {
          combined += evt.delta;
        }
        break;
      case "text":
        if (typeof evt.text === "string") {
          combined += evt.text;
        }
        break;
      default:
        break;
    }
  }
  return combined.trim();
}
|
||||
|
||||
/**
 * Lists the tool names in the order their calls started.
 *
 * Only `tool-input-start` events with a truthy `toolName` contribute; every
 * other event is skipped.
 */
function collectToolTrajectory(events: EvalEvent[]): string[] {
  return events.flatMap((evt) =>
    evt.type === "tool-input-start" && evt.toolName ? [evt.toolName] : [],
  );
}
|
||||
|
||||
/**
 * Returns true when both arrays have the same length and strictly equal
 * elements at every index.
 */
function arraysEqual(a: string[], b: string[]): boolean {
  if (a.length !== b.length) {
    return false;
  }
  // entries() visits every index, so positions are compared pairwise.
  for (const [idx, item] of a.entries()) {
    if (item !== b[idx]) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
/**
 * Returns true when every element of `needle` appears in `haystack` in the
 * same relative order (not necessarily contiguously). An empty `needle`
 * always matches.
 */
function includesSubsequence(haystack: string[], needle: string[]): boolean {
  // Index of the next needle element still waiting for a match.
  let cursor = 0;
  for (const candidate of haystack) {
    if (cursor === needle.length) {
      break;
    }
    if (candidate === needle[cursor]) {
      cursor += 1;
    }
  }
  return cursor === needle.length;
}
|
||||
|
||||
/**
 * Compares the expected tool list with the tools actually called.
 *
 * - `strict`: both lists are identical, element by element.
 * - `subset`: `actual` is an ordered subsequence of `expected`.
 * - `superset`: `expected` is an ordered subsequence of `actual`.
 * - `unordered`: both lists hold the same names once sorted.
 *
 * Any other mode value yields `false`.
 */
function matchTrajectory(expected: string[], actual: string[], mode: TrajectoryMatchMode): boolean {
  if (mode === "strict") {
    return arraysEqual(expected, actual);
  }
  if (mode === "subset") {
    return includesSubsequence(expected, actual);
  }
  if (mode === "superset") {
    return includesSubsequence(actual, expected);
  }
  if (mode === "unordered") {
    // toSorted() returns sorted copies, so neither input array is mutated.
    return arraysEqual(expected.toSorted(), actual.toSorted());
  }
  return false;
}
|
||||
|
||||
/**
 * Grades the assistant's final text against the case's `output` expectation.
 *
 * Passes trivially when no output expectation is configured. Otherwise the
 * result fails if the collected text is empty, if any `mustContain` fragment
 * is absent, or if any `mustNotContain` fragment is present; each problem is
 * reported as one entry in `details`, in that order.
 */
export function gradeOutput(testCase: EvalCase): GradeResult {
  const expectation = testCase.expectations.output;
  if (!expectation) {
    return { grader: "output", passed: true, details: ["no output expectation configured"] };
  }

  const text = collectAssistantText(testCase.events);

  const emptyProblems = text ? [] : ["assistant text was empty"];
  const missingProblems = (expectation.mustContain ?? [])
    .filter((required) => !text.includes(required))
    .map((required) => `missing required output fragment: "${required}"`);
  const forbiddenProblems = (expectation.mustNotContain ?? [])
    .filter((forbidden) => text.includes(forbidden))
    .map((forbidden) => `found forbidden output fragment: "${forbidden}"`);

  const problems = [...emptyProblems, ...missingProblems, ...forbiddenProblems];
  const passed = problems.length === 0;

  return {
    grader: "output",
    passed,
    details: passed ? ["output matched expectations"] : problems,
  };
}
|
||||
|
||||
/**
 * Grades the sequence of tool calls against the case's `trajectory`
 * expectation.
 *
 * Passes trivially when no trajectory expectation is configured. On a
 * mismatch, `details` holds a single message with the match mode and the
 * JSON-encoded expected and actual tool lists.
 */
export function gradeTrajectory(testCase: EvalCase): GradeResult {
  const expectation = testCase.expectations.trajectory;
  if (!expectation) {
    return {
      grader: "trajectory",
      passed: true,
      details: ["no trajectory expectation configured"],
    };
  }

  const observedTools = collectToolTrajectory(testCase.events);

  if (matchTrajectory(expectation.tools, observedTools, expectation.mode)) {
    return {
      grader: "trajectory",
      passed: true,
      details: ["trajectory matched expectation"],
    };
  }

  return {
    grader: "trajectory",
    passed: false,
    details: [
      `trajectory mismatch for mode=${expectation.mode}: expected=${JSON.stringify(expectation.tools)} actual=${JSON.stringify(observedTools)}`,
    ],
  };
}
|
||||
|
||||
/**
 * Grades the structural integrity of a case's event stream against its
 * `trace` expectation.
 *
 * Passes trivially when no trace expectation is configured. Otherwise it
 * reports, in this order:
 * 1. every `requiredTypes` entry that never occurs in the stream;
 * 2. when `requireMonotonicGlobalSeq` is set, every repeated `globalSeq` and
 *    every transition to a value that is not strictly greater than the
 *    previous numeric `globalSeq` (events without a numeric `globalSeq` are
 *    skipped);
 * 3. tool-call lifecycle problems per `toolCallId`: a terminal event
 *    (`tool-output-available` / `tool-output-error`) with no preceding
 *    `tool-input-start`, a started call with no terminal event, and a call
 *    with more than one terminal event.
 *
 * @param testCase - Eval case whose `events` and `expectations.trace` are checked.
 * @returns A `trace` grade; `details` lists each problem, or a single success note.
 */
export function gradeTraceIntegrity(testCase: EvalCase): GradeResult {
  const details: string[] = [];
  const expectation = testCase.expectations.trace;

  if (!expectation) {
    return { grader: "trace", passed: true, details: ["no trace expectation configured"] };
  }

  const eventTypes = new Set(testCase.events.map((event) => event.type));
  for (const requiredType of expectation.requiredTypes ?? []) {
    if (!eventTypes.has(requiredType)) {
      details.push(`missing required event type: ${requiredType}`);
    }
  }

  if (expectation.requireMonotonicGlobalSeq) {
    // `undefined` means no numbered event has been seen yet. A numeric
    // sentinel such as -1 would wrongly flag a first event whose sequence is
    // <= -1 as a transition from an event that does not exist.
    let lastSeq: number | undefined;
    const seen = new Set<number>();
    for (const event of testCase.events) {
      const seq = event.globalSeq;
      if (typeof seq !== "number") {
        continue;
      }
      if (seen.has(seq)) {
        details.push(`duplicate globalSeq detected: ${seq}`);
      }
      seen.add(seq);
      if (lastSeq !== undefined && seq <= lastSeq) {
        details.push(`non-monotonic globalSeq transition: ${lastSeq} -> ${seq}`);
      }
      lastSeq = seq;
    }
  }

  // Per-call lifecycle bookkeeping, keyed by toolCallId in first-start order.
  const toolState = new Map<string, { started: boolean; terminalCount: number }>();
  for (const event of testCase.events) {
    if (!event.toolCallId) {
      continue;
    }

    if (event.type === "tool-input-start") {
      const current = toolState.get(event.toolCallId);
      if (current) {
        current.started = true;
      } else {
        toolState.set(event.toolCallId, { started: true, terminalCount: 0 });
      }
      continue;
    }

    if (event.type === "tool-output-available" || event.type === "tool-output-error") {
      const current = toolState.get(event.toolCallId);
      if (!current?.started) {
        details.push(`terminal tool event without start for ${event.toolCallId}`);
        continue;
      }
      // The map stores the object by reference, so mutating it is enough.
      current.terminalCount += 1;
    }
  }

  for (const [toolCallId, state] of toolState.entries()) {
    if (state.started && state.terminalCount === 0) {
      details.push(`tool call without terminal output: ${toolCallId}`);
    }
    if (state.terminalCount > 1) {
      details.push(`tool call produced multiple terminal outputs: ${toolCallId}`);
    }
  }

  return {
    grader: "trace",
    passed: details.length === 0,
    details: details.length > 0 ? details : ["trace integrity checks passed"],
  };
}
|
||||
Loading…
x
Reference in New Issue
Block a user