204 lines
5.8 KiB
TypeScript
204 lines
5.8 KiB
TypeScript
import type { EvalCase, EvalEvent, GradeResult, TrajectoryMatchMode } from "./types.js";
|
|
|
|
/**
 * Reassembles the assistant's textual output from an event stream.
 *
 * Two kinds of events contribute, in the order they appear:
 * - `text-delta` events whose `delta` is a string, and
 * - `text` events whose `text` is a string.
 * Every other event (or a matching event with a non-string payload) is ignored.
 *
 * @param events - Events to scan, in order.
 * @returns The concatenated text with surrounding whitespace trimmed.
 */
function collectAssistantText(events: EvalEvent[]): string {
  let assembled = "";
  for (const ev of events) {
    if (ev.type === "text-delta") {
      if (typeof ev.delta === "string") {
        assembled += ev.delta;
      }
    } else if (ev.type === "text") {
      if (typeof ev.text === "string") {
        assembled += ev.text;
      }
    }
  }
  return assembled.trim();
}
|
|
|
|
/**
 * Extracts the ordered list of tool names invoked during a run.
 *
 * Only `tool-input-start` events with a truthy `toolName` are counted; each
 * such event contributes one entry, so repeated calls to the same tool appear
 * multiple times.
 *
 * @param events - Events to scan, in order.
 * @returns Tool names in the order their `tool-input-start` events occurred.
 */
function collectToolTrajectory(events: EvalEvent[]): string[] {
  const trajectory: string[] = [];
  for (const ev of events) {
    // Skip anything that is not a named tool invocation start.
    if (ev.type !== "tool-input-start" || !ev.toolName) {
      continue;
    }
    trajectory.push(ev.toolName);
  }
  return trajectory;
}
|
|
|
|
/**
 * Reports whether two string arrays have the same length and identical
 * elements at every position (compared with `===`).
 *
 * @param a - First array.
 * @param b - Second array.
 * @returns `true` when the arrays match element-for-element, otherwise `false`.
 */
function arraysEqual(a: string[], b: string[]): boolean {
  if (a.length !== b.length) {
    return false;
  }
  // Equal when there is no index at which the two arrays disagree.
  const firstMismatch = a.findIndex((item, idx) => item !== b[idx]);
  return firstMismatch === -1;
}
|
|
|
|
/**
 * Checks whether `needle` occurs within `haystack` as an in-order (not
 * necessarily contiguous) subsequence.
 *
 * An empty `needle` is always considered present.
 *
 * @param haystack - Sequence to search in.
 * @param needle - Sequence whose elements must all appear, in order.
 * @returns `true` when every element of `needle` is matched in order.
 */
function includesSubsequence(haystack: string[], needle: string[]): boolean {
  // Number of leading `needle` elements matched so far.
  let matched = 0;
  for (const candidate of haystack) {
    if (matched === needle.length) {
      break;
    }
    if (candidate === needle[matched]) {
      matched += 1;
    }
  }
  return matched === needle.length;
}
|
|
|
|
/**
 * Compares an observed tool trajectory against the expected one.
 *
 * Supported modes:
 * - `"strict"`: both sequences must be identical, element by element.
 * - `"subset"`: `actual` must be an in-order subsequence of `expected`.
 * - `"superset"`: `expected` must be an in-order subsequence of `actual`.
 * - `"unordered"`: both must contain the same names the same number of times,
 *   in any order (compared after sorting copies of each).
 *
 * Any other mode value yields `false`.
 *
 * @param expected - Tool names the test case expects.
 * @param actual - Tool names observed in the run.
 * @param mode - Matching strategy.
 * @returns Whether `actual` satisfies `expected` under `mode`.
 */
function matchTrajectory(expected: string[], actual: string[], mode: TrajectoryMatchMode): boolean {
  if (mode === "strict") {
    return arraysEqual(expected, actual);
  }
  if (mode === "subset") {
    return includesSubsequence(expected, actual);
  }
  if (mode === "superset") {
    return includesSubsequence(actual, expected);
  }
  if (mode === "unordered") {
    // Sort copies so neither caller-owned array is reordered.
    const sortedExpected = [...expected].sort();
    const sortedActual = [...actual].sort();
    return arraysEqual(sortedExpected, sortedActual);
  }
  return false;
}
|
|
|
|
/**
 * Grades the assistant's text output against the case's output expectation.
 *
 * When no output expectation is configured the grader passes trivially.
 * Otherwise the case fails if the collected assistant text is empty, if any
 * `mustContain` fragment is absent, or if any `mustNotContain` fragment is
 * present. Fragment checks are plain substring matches.
 *
 * @param testCase - Case holding the recorded events and expectations.
 * @returns A result tagged `"output"`; `details` lists every failure found, or
 *   a single confirmation message when the case passes.
 */
export function gradeOutput(testCase: EvalCase): GradeResult {
  const outputSpec = testCase.expectations.output;
  if (!outputSpec) {
    return { grader: "output", passed: true, details: ["no output expectation configured"] };
  }

  const assistantText = collectAssistantText(testCase.events);
  const problems: string[] = [];

  if (assistantText.length === 0) {
    problems.push("assistant text was empty");
  }

  for (const fragment of outputSpec.mustContain ?? []) {
    const present = assistantText.includes(fragment);
    if (!present) {
      problems.push(`missing required output fragment: "${fragment}"`);
    }
  }

  for (const fragment of outputSpec.mustNotContain ?? []) {
    const present = assistantText.includes(fragment);
    if (present) {
      problems.push(`found forbidden output fragment: "${fragment}"`);
    }
  }

  if (problems.length === 0) {
    return { grader: "output", passed: true, details: ["output matched expectations"] };
  }
  return { grader: "output", passed: false, details: problems };
}
|
|
|
|
/**
 * Grades the sequence of tools the run invoked against the case's trajectory
 * expectation.
 *
 * When no trajectory expectation is configured the grader passes trivially.
 * Otherwise the observed tool names are compared with `expectation.tools`
 * using `expectation.mode`.
 *
 * @param testCase - Case holding the recorded events and expectations.
 * @returns A result tagged `"trajectory"`; on mismatch `details` carries one
 *   message showing the mode plus the expected and actual sequences as JSON.
 */
export function gradeTrajectory(testCase: EvalCase): GradeResult {
  const trajectorySpec = testCase.expectations.trajectory;
  if (!trajectorySpec) {
    return {
      grader: "trajectory",
      passed: true,
      details: ["no trajectory expectation configured"],
    };
  }

  const observed = collectToolTrajectory(testCase.events);
  if (matchTrajectory(trajectorySpec.tools, observed, trajectorySpec.mode)) {
    return { grader: "trajectory", passed: true, details: ["trajectory matched expectation"] };
  }

  const mismatch = `trajectory mismatch for mode=${trajectorySpec.mode}: expected=${JSON.stringify(trajectorySpec.tools)} actual=${JSON.stringify(observed)}`;
  return { grader: "trajectory", passed: false, details: [mismatch] };
}
|
|
|
|
/**
 * Grades structural integrity of the recorded event trace.
 *
 * When no trace expectation is configured the grader passes trivially.
 * Otherwise it runs these checks and reports every violation, in this order:
 * 1. Each entry of `requiredTypes` must appear as the `type` of some event.
 * 2. If `requireMonotonicGlobalSeq` is set, numeric `globalSeq` values must be
 *    unique and strictly increasing in event order (events without a numeric
 *    `globalSeq` are skipped).
 * 3. For events carrying a `toolCallId`: a terminal event
 *    (`tool-output-available` / `tool-output-error`) must be preceded by a
 *    `tool-input-start` for the same id.
 * 4. Every started tool call must have exactly one terminal event.
 *
 * @param testCase - Case holding the recorded events and expectations.
 * @returns A result tagged `"trace"`; `details` lists every violation, or a
 *   single confirmation message when all checks pass.
 */
export function gradeTraceIntegrity(testCase: EvalCase): GradeResult {
  const traceSpec = testCase.expectations.trace;
  if (!traceSpec) {
    return { grader: "trace", passed: true, details: ["no trace expectation configured"] };
  }

  const violations: string[] = [];

  // Check 1: required event types are present.
  const presentTypes = new Set(testCase.events.map((event) => event.type));
  for (const wanted of traceSpec.requiredTypes ?? []) {
    if (presentTypes.has(wanted)) {
      continue;
    }
    violations.push(`missing required event type: ${wanted}`);
  }

  // Check 2: globalSeq uniqueness and strict monotonicity.
  if (traceSpec.requireMonotonicGlobalSeq) {
    const seenSeqs = new Set<number>();
    let previousSeq = -1;
    for (const ev of testCase.events) {
      const seq = ev.globalSeq;
      if (typeof seq !== "number") {
        continue;
      }
      if (seenSeqs.has(seq)) {
        violations.push(`duplicate globalSeq detected: ${seq}`);
      }
      seenSeqs.add(seq);
      if (seq <= previousSeq) {
        violations.push(`non-monotonic globalSeq transition: ${previousSeq} -> ${seq}`);
      }
      previousSeq = seq;
    }
  }

  // Check 3: replay the tool-call lifecycle per toolCallId.
  const lifecycles = new Map<string, { started: boolean; terminalCount: number }>();
  for (const ev of testCase.events) {
    const callId = ev.toolCallId;
    if (!callId) {
      continue;
    }

    switch (ev.type) {
      case "tool-input-start": {
        const entry = lifecycles.get(callId);
        if (entry) {
          entry.started = true;
        } else {
          lifecycles.set(callId, { started: true, terminalCount: 0 });
        }
        break;
      }
      case "tool-output-available":
      case "tool-output-error": {
        const entry = lifecycles.get(callId);
        if (entry?.started) {
          entry.terminalCount += 1;
        } else {
          violations.push(`terminal tool event without start for ${callId}`);
        }
        break;
      }
      default:
        break;
    }
  }

  // Check 4: every started call ends exactly once.
  for (const [callId, lifecycle] of lifecycles) {
    if (lifecycle.started && lifecycle.terminalCount === 0) {
      violations.push(`tool call without terminal output: ${callId}`);
    }
    if (lifecycle.terminalCount > 1) {
      violations.push(`tool call produced multiple terminal outputs: ${callId}`);
    }
  }

  const passed = violations.length === 0;
  return {
    grader: "trace",
    passed,
    details: passed ? ["trace integrity checks passed"] : violations,
  };
}
|