From 1deeb5878e28f2ab0d6aabf7d674cf24952181ea Mon Sep 17 00:00:00 2001
From: kumarabhirup <hey@kumareth.com>
Date: Mon, 2 Mar 2026 18:36:40 -0800
Subject: [PATCH] feat(evals): add eval framework with dataset, graders, and
 regression report

---
 test/evals/README.md                         |  26 +++
 test/evals/REGRESSION_VERIFICATION_REPORT.md |  72 +++++++
 test/evals/dataset.ts                        | 140 +++++++++++++
 test/evals/graders.test.ts                   |  43 ++++
 test/evals/graders.ts                        | 203 +++++++++++++++++++
 5 files changed, 484 insertions(+)
 create mode 100644 test/evals/README.md
 create mode 100644 test/evals/REGRESSION_VERIFICATION_REPORT.md
 create mode 100644 test/evals/dataset.ts
 create mode 100644 test/evals/graders.test.ts
 create mode 100644 test/evals/graders.ts

diff --git a/test/evals/README.md b/test/evals/README.md
new file mode 100644
index 00000000000..75d71784ca8
--- /dev/null
+++ b/test/evals/README.md
@@ -0,0 +1,26 @@
+# Chat-Agent Evals
+
+This folder contains a dataset-driven eval harness for web chat-agent behavior.
+
+## What is graded
+
+- `output`: final assistant text quality checks (`mustContain`, `mustNotContain`)
+- `trajectory`: tool-call sequence checks under a match mode (`strict`, `subset`, `superset`, `unordered`)
+- `trace`: event-stream integrity checks (required event types, monotonic `globalSeq`,
+  and tool-call lifecycle completeness)
+
+## Run
+
+```bash
+pnpm test:evals
+```
+
+This mode is informational and prints a full summary.
+
+## Enforce critical checks
+
+```bash
+pnpm test:evals:enforce
+```
+
+When `EVALS_ENFORCE=1`, the runner exits non-zero if any **critical** eval case fails.
diff --git a/test/evals/REGRESSION_VERIFICATION_REPORT.md b/test/evals/REGRESSION_VERIFICATION_REPORT.md
new file mode 100644
index 00000000000..7d00a44c1fe
--- /dev/null
+++ b/test/evals/REGRESSION_VERIFICATION_REPORT.md
@@ -0,0 +1,72 @@
+# Regression Verification Report
+
+Date: 2026-03-02
+
+## Stability Loops
+
+### Web interaction regressions
+
+Command:
+
+```bash
+for i in {1..20}; do
+  pnpm vitest run app/workspace/object-view-active-view.test.ts app/components/workspace/object-filter-bar.test.tsx --silent
+done
+```
+
+Result: **20/20 passes** (no flaky failures observed).
+
+### CLI edge-case regressions
+
+Command:
+
+```bash
+for i in {1..20}; do
+  ../../node_modules/.bin/vitest run --config vitest.config.ts argv.test.ts run-main.test.ts windows-argv.test.ts --silent
+done
+```
+
+Result: **20/20 passes** (no flaky failures observed).
+
+## Mutation Probes (Manual)
+
+The following intentional mutations were applied, tested, and reverted:
+
+1. `apps/web/app/workspace/object-view-active-view.ts`
+   - Mutation: `shouldApply` reduced to `nameMismatch` only.
+   - Expected break: initial-load active view bug returns.
+   - Caught by: `app/workspace/object-view-active-view.test.ts` (`applies active view on initial load ...`).
+   - Outcome: **failed as expected**, then reverted.
+
+2. `apps/web/app/components/chat-panel.tsx`
+   - Mutation: tool error state changed from `"error"` to `"output-available"`.
+   - Expected break: tool terminal state classification regression.
+   - Caught by: `app/components/chat-panel.stream-parser.test.ts` (`accumulates tool input/output ...`).
+   - Outcome: **failed as expected**, then reverted.
+
+3. `src/cli/run-main.ts`
+   - Mutation: disable-delegation branch changed to incorrectly return `true`.
+   - Expected break: delegation safety bypass.
+   - Caught by: `src/cli/run-main.test.ts` (`disables delegation when explicit env disable flag is set`).
+   - Outcome: **failed as expected**, then reverted.
+
+4. `test/evals/graders.ts`
+   - Mutation: duplicate `globalSeq` detection removed.
+   - Expected break: trace-integrity duplicate-sequence protection.
+   - Caught by: `test/evals/graders.test.ts` (`fails trace grader for duplicate globalSeq ...`).
+   - Outcome: **failed as expected**, then reverted.
+
+## Final Verification Runs
+
+- Web interaction suite: pass (`34` tests)
+- CLI edge-case suite: pass (`42` tests)
+- Eval grader unit suite: pass (`3` tests)
+- Eval harness:
+  - `pnpm test:evals`: pass (`9/9` graders across `3` cases)
+  - `pnpm test:evals:enforce`: pass (`critical` cases all passing)
+
+## Residual Risks
+
+- Some large end-to-end UI flows (`workspace/page.tsx` URL orchestration and multi-panel state transitions) still rely on unit/component-level coverage rather than browser E2E.
+- The eval dataset currently covers representative high-risk cases but does not yet include production trace replay at scale.
+- Additional CRM mutation permutations (large view/filter combinations) can be added to the eval dataset over time.
diff --git a/test/evals/dataset.ts b/test/evals/dataset.ts
new file mode 100644
index 00000000000..0eb80d86d85
--- /dev/null
+++ b/test/evals/dataset.ts
@@ -0,0 +1,140 @@
+import type { EvalCase } from "./types.js";
+
+export const chatAgentEvalDataset: EvalCase[] = [ // scripted traces + expectations; graders.test.ts uses element 0 as its baseline
+  {
+    id: "tool-call-success",
+    description: "single-tool answer includes expected retrieval summary",
+    critical: true,
+    events: [
+      { type: "user-message", id: "u-1", text: "Find docs for workspace lock", globalSeq: 1 },
+      { type: "tool-input-start", toolCallId: "tool-1", toolName: "searchDocs", globalSeq: 2 },
+      {
+        type: "tool-input-available",
+        toolCallId: "tool-1",
+        globalSeq: 3,
+        output: { query: "workspace lock" }, // NOTE(review): tool *input* payload is stored under `output` — confirm against EvalEvent
+      },
+      {
+        type: "tool-output-available",
+        toolCallId: "tool-1",
+        globalSeq: 4,
+        output: { hits: 3 },
+      },
+      { type: "text-start", globalSeq: 5 },
+      {
+        type: "text-delta",
+        delta: "Found 3 matching documents about workspace lock.",
+        globalSeq: 6,
+      },
+      { type: "text-end", globalSeq: 7 },
+    ],
+    expectations: {
+      output: {
+        mustContain: ["Found 3 matching documents"],
+        mustNotContain: ["NO_REPLY"],
+      },
+      trajectory: {
+        tools: ["searchDocs"],
+        mode: "strict",
+      },
+      trace: {
+        requiredTypes: ["tool-input-start", "tool-output-available", "text-delta"],
+        requireMonotonicGlobalSeq: true,
+      },
+    },
+  },
+  {
+    id: "subagent-roundtrip",
+    description:
+      "subagent spawn followed by completion announcement is preserved in final response",
+    critical: true,
+    events: [
+      { type: "user-message", id: "u-2", text: "Research customer timeline", globalSeq: 1 },
+      { type: "tool-input-start", toolCallId: "tool-2", toolName: "spawnSubagent", globalSeq: 2 },
+      {
+        type: "tool-output-available",
+        toolCallId: "tool-2",
+        globalSeq: 3,
+        output: { childSessionKey: "sub:abc", task: "timeline research" },
+      },
+      {
+        type: "tool-input-start",
+        toolCallId: "tool-3",
+        toolName: "collectSubagentResult",
+        globalSeq: 4,
+      },
+      {
+        type: "tool-output-available",
+        toolCallId: "tool-3",
+        globalSeq: 5,
+        output: { summary: "Key milestones collected." },
+      },
+      { type: "text-start", globalSeq: 6 },
+      {
+        type: "text-delta",
+        delta: "Subagent finished and reported back with key milestones.",
+        globalSeq: 7,
+      },
+      { type: "text-end", globalSeq: 8 },
+    ],
+    expectations: {
+      output: {
+        mustContain: ["Subagent finished and reported back"],
+      },
+      trajectory: {
+        tools: ["spawnSubagent", "collectSubagentResult"],
+        mode: "strict",
+      },
+      trace: {
+        requiredTypes: ["tool-input-start", "tool-output-available", "text-delta"],
+        requireMonotonicGlobalSeq: true,
+      },
+    },
+  },
+  {
+    id: "crm-view-mutation",
+    description:
+      "CRM mutation path applies object view metadata and confirms result in assistant output",
+    critical: false, // README: only critical cases fail the run under EVALS_ENFORCE=1
+    events: [
+      { type: "user-message", id: "u-3", text: "Set Important as active leads view", globalSeq: 1 },
+      { type: "tool-input-start", toolCallId: "tool-4", toolName: "saveObjectViews", globalSeq: 2 },
+      {
+        type: "tool-output-available",
+        toolCallId: "tool-4",
+        globalSeq: 3,
+        output: { object: "leads", activeView: "Important" },
+      },
+      {
+        type: "tool-input-start",
+        toolCallId: "tool-5",
+        toolName: "fetchObjectEntries",
+        globalSeq: 4,
+      },
+      {
+        type: "tool-output-available",
+        toolCallId: "tool-5",
+        globalSeq: 5,
+        output: { filteredRows: 12 },
+      },
+      {
+        type: "text-delta", // NOTE(review): no text-start/text-end around this delta, unlike the first two cases
+        delta: "Active view 'Important' is now applied to leads.",
+        globalSeq: 6,
+      },
+    ],
+    expectations: {
+      output: {
+        mustContain: ["Active view 'Important' is now applied"],
+      },
+      trajectory: {
+        tools: ["saveObjectViews", "fetchObjectEntries"],
+        mode: "strict",
+      },
+      trace: {
+        requiredTypes: ["tool-input-start", "tool-output-available", "text-delta"],
+        requireMonotonicGlobalSeq: true,
+      },
+    },
+  },
+];
diff --git a/test/evals/graders.test.ts b/test/evals/graders.test.ts
new file mode 100644
index 00000000000..dcab12da2a6
--- /dev/null
+++ b/test/evals/graders.test.ts
@@ -0,0 +1,43 @@
+import { describe, expect, it } from "vitest";
+import { chatAgentEvalDataset } from "./dataset.js";
+import { gradeOutput, gradeTraceIntegrity, gradeTrajectory } from "./graders.js";
+
+describe("eval graders", () => { // direct unit tests of the three exported graders
+  it("passes output/trajectory/trace graders for baseline success case", () => {
+    const baseline = chatAgentEvalDataset[0]; // the "tool-call-success" case
+    expect(gradeOutput(baseline).passed).toBe(true);
+    expect(gradeTrajectory(baseline).passed).toBe(true);
+    expect(gradeTraceIntegrity(baseline).passed).toBe(true);
+  });
+
+  it("fails trajectory grader when tool order does not match strict expectation", () => {
+    const mismatched = {
+      ...chatAgentEvalDataset[0],
+      events: [ // sole tool is saveFile; the inherited strict expectation is ["searchDocs"]
+        {
+          type: "tool-input-start",
+          toolCallId: "tool-other",
+          toolName: "saveFile",
+          globalSeq: 1,
+        },
+      ],
+    };
+    const result = gradeTrajectory(mismatched);
+    expect(result.passed).toBe(false);
+    expect(result.details.join(" ")).toContain("trajectory mismatch");
+  });
+
+  it("fails trace grader for duplicate globalSeq and missing tool terminal events", () => {
+    const brokenTrace = {
+      ...chatAgentEvalDataset[0],
+      events: [
+        { type: "tool-input-start", toolCallId: "tool-1", toolName: "searchDocs", globalSeq: 4 },
+        { type: "text-delta", delta: "partial", globalSeq: 4 }, // repeats globalSeq 4; tool-1 gets no terminal event
+      ],
+    };
+    const result = gradeTraceIntegrity(brokenTrace);
+    expect(result.passed).toBe(false);
+    expect(result.details.some((detail) => detail.includes("duplicate globalSeq"))).toBe(true);
+    expect(result.details.some((detail) => detail.includes("without terminal output"))).toBe(true);
+  });
+});
diff --git a/test/evals/graders.ts b/test/evals/graders.ts
new file mode 100644
index 00000000000..cfb40b397d2
--- /dev/null
+++ b/test/evals/graders.ts
@@ -0,0 +1,203 @@
+import type { EvalCase, EvalEvent, GradeResult, TrajectoryMatchMode } from "./types.js";
+
+/**
+ * Concatenates `delta` of `text-delta` events and `text` of `text` events in order, trimmed.
+ */
+function collectAssistantText(events: EvalEvent[]): string {
+  let assembled = "";
+  for (const entry of events) {
+    if (entry.type === "text-delta" && typeof entry.delta === "string") {
+      assembled += entry.delta;
+    } else if (entry.type === "text" && typeof entry.text === "string") {
+      assembled += entry.text;
+    }
+  }
+  return assembled.trim();
+}
+
+/**
+ * Returns, in event order, the `toolName` of every `tool-input-start` event whose name is
+ * truthy; events of other types and unnamed starts are ignored.
+ */
+function collectToolTrajectory(events: EvalEvent[]): string[] {
+  return events.flatMap((entry) =>
+    entry.type === "tool-input-start" && entry.toolName ? [entry.toolName] : [],
+  );
+}
+
+/**
+ * Reports whether two string arrays have the same length and strictly equal (`===`)
+ * elements at every index.
+ *
+ * @param a - First array.
+ * @param b - Second array.
+ * @returns `true` only when both arrays match position by position.
+ */
+function arraysEqual(a: string[], b: string[]): boolean {
+  return a.length === b.length && a.every((value, index) => value === b[index]);
+}
+
+/**
+ * True when `needle` occurs in `haystack` in order (gaps allowed); an empty `needle` matches.
+ */
+function includesSubsequence(haystack: string[], needle: string[]): boolean {
+  let matched = 0;
+  for (const item of haystack) {
+    if (matched < needle.length && item === needle[matched]) {
+      matched += 1;
+    }
+  }
+  return matched === needle.length;
+}
+
+/**
+ * Decides whether the observed tool sequence (`actual`) satisfies `expected` under `mode`.
+ */
+function matchTrajectory(expected: string[], actual: string[], mode: TrajectoryMatchMode): boolean {
+  switch (mode) {
+    case "strict": // identical sequences
+      return arraysEqual(expected, actual);
+    case "subset": // `actual` is an in-order subsequence of `expected`
+      return includesSubsequence(expected, actual);
+    case "superset": // `expected` is an in-order subsequence of `actual`
+      return includesSubsequence(actual, expected);
+    case "unordered": // same names with the same counts; toSorted() copies, so no spread is needed
+      return arraysEqual(expected.toSorted(), actual.toSorted());
+    default: // an unrecognized mode never matches
+      return false;
+  }
+}
+
+/**
+ * Grades the assistant's final text against the case's `output` expectation.
+ *
+ * Passes trivially when no output expectation is configured. Otherwise the collected text
+ * must be non-empty, contain every `mustContain` fragment, and contain no `mustNotContain`
+ * fragment; each violation becomes one entry in `details`.
+ *
+ * @returns An `output` grade listing the violations, or a single success note.
+ */
+export function gradeOutput(testCase: EvalCase): GradeResult {
+  const rules = testCase.expectations.output;
+  if (!rules) {
+    return { grader: "output", passed: true, details: ["no output expectation configured"] };
+  }
+
+  const reply = collectAssistantText(testCase.events);
+  const missing = (rules.mustContain ?? []).filter((fragment) => !reply.includes(fragment));
+  const unwanted = (rules.mustNotContain ?? []).filter((fragment) => reply.includes(fragment));
+  const issues = [
+    ...(reply ? [] : ["assistant text was empty"]),
+    ...missing.map((fragment) => `missing required output fragment: "${fragment}"`),
+    ...unwanted.map((fragment) => `found forbidden output fragment: "${fragment}"`),
+  ];
+
+  return {
+    grader: "output",
+    passed: issues.length === 0,
+    details: issues.length > 0 ? issues : ["output matched expectations"],
+  };
+}
+
+/**
+ * Grades the order of tool calls against the case's `trajectory` expectation.
+ * Passes trivially when none is configured; on a mismatch the single detail line records
+ * the match mode plus the expected and observed tool names as JSON.
+ */
+export function gradeTrajectory(testCase: EvalCase): GradeResult {
+  const spec = testCase.expectations.trajectory;
+  if (!spec) {
+    return {
+      grader: "trajectory",
+      passed: true,
+      details: ["no trajectory expectation configured"],
+    };
+  }
+
+  const observed = collectToolTrajectory(testCase.events);
+  if (matchTrajectory(spec.tools, observed, spec.mode)) {
+    return { grader: "trajectory", passed: true, details: ["trajectory matched expectation"] };
+  }
+
+  const wanted = JSON.stringify(spec.tools);
+  const got = JSON.stringify(observed);
+  const message = `trajectory mismatch for mode=${spec.mode}: expected=${wanted} actual=${got}`;
+  return { grader: "trajectory", passed: false, details: [message] };
+}
+
+/**
+ * Grades event-stream integrity against the case's `trace` expectation (trivial pass if unset).
+ *
+ * Violations are reported in this order: `requiredTypes` entries that no event has as its
+ * `type`; when `requireMonotonicGlobalSeq` is set, repeated or non-increasing numeric
+ * `globalSeq` values (events lacking a numeric `globalSeq` are skipped); terminal tool
+ * events (`tool-output-available` / `tool-output-error`) whose `toolCallId` never started;
+ * and started tool calls that end with zero, or more than one, terminal event.
+ * @returns A `trace` grade whose `details` hold the violations or one success note.
+ */
+export function gradeTraceIntegrity(testCase: EvalCase): GradeResult {
+  const spec = testCase.expectations.trace;
+  if (!spec) {
+    return { grader: "trace", passed: true, details: ["no trace expectation configured"] };
+  }
+
+  const issues: string[] = [];
+  // Required event types must each appear at least once in the stream.
+  const presentTypes = new Set(testCase.events.map((entry) => entry.type));
+  for (const wanted of spec.requiredTypes ?? []) {
+    if (!presentTypes.has(wanted)) {
+      issues.push(`missing required event type: ${wanted}`);
+    }
+  }
+
+  // Optional sequence check over the events that carry a numeric globalSeq.
+  if (spec.requireMonotonicGlobalSeq) {
+    const seenSeqs = new Set<number>();
+    let previous = -1; // starting sentinel: a first value <= -1 is reported as non-monotonic
+    for (const entry of testCase.events) {
+      const seq = entry.globalSeq;
+      if (typeof seq !== "number") {
+        continue;
+      }
+      if (seenSeqs.has(seq)) {
+        issues.push(`duplicate globalSeq detected: ${seq}`);
+      }
+      seenSeqs.add(seq);
+      if (seq <= previous) {
+        issues.push(`non-monotonic globalSeq transition: ${previous} -> ${seq}`);
+      }
+      previous = seq;
+    }
+  }
+
+  // Tool lifecycle: map each started toolCallId to its number of terminal events.
+  const terminalCounts = new Map<string, number>();
+  for (const entry of testCase.events) {
+    const callId = entry.toolCallId;
+    if (!callId) {
+      continue;
+    }
+    if (entry.type === "tool-input-start") {
+      if (!terminalCounts.has(callId)) {
+        terminalCounts.set(callId, 0);
+      }
+    } else if (entry.type === "tool-output-available" || entry.type === "tool-output-error") {
+      const soFar = terminalCounts.get(callId);
+      if (soFar === undefined) {
+        issues.push(`terminal tool event without start for ${callId}`);
+      } else {
+        terminalCounts.set(callId, soFar + 1);
+      }
+    }
+  }
+  for (const [callId, count] of terminalCounts) {
+    if (count === 0) {
+      issues.push(`tool call without terminal output: ${callId}`);
+    } else if (count > 1) {
+      issues.push(`tool call produced multiple terminal outputs: ${callId}`);
+    }
+  }
+
+  const passed = issues.length === 0;
+  return { grader: "trace", passed, details: passed ? ["trace integrity checks passed"] : issues };
+}