openclaw/apps/web/app/api/cron/runs/search-transcript/route.ts

import { readFileSync, readdirSync, existsSync, statSync } from "node:fs";
import { join } from "node:path";
import { resolveOpenClawStateDir } from "@/lib/workspace";

// Route segment config export. NOTE(review): presumably the Next.js
// "force-dynamic" option, which makes the handler run on every request
// instead of being cached — confirm against the framework docs.
export const dynamic = "force-dynamic";

/**
 * Directory scanned by this route: `<state dir>/agents`. Each subdirectory is
 * treated as one agent and is expected to contain a `sessions/` folder.
 */
const AGENTS_DIR = join(resolveOpenClawStateDir(), "agents");

/** One renderable fragment of a transcript message, as returned to the client. */
type MessagePart =
  | { type: "text"; text: string }
  | { type: "thinking"; thinking: string }
  // A tool invocation; `output` is filled in when a matching tool result is parsed.
  | { type: "tool-call"; toolName: string; toolCallId: string; args?: unknown; output?: string };

/** A transcript message normalised from one JSONL entry of type "message". */
type ParsedMessage = {
  /** The entry's own id, or a positional `msg-<n>` fallback when it has none. */
  id: string;
  role: "user" | "assistant" | "system";
  /** Non-empty list of parts; entries that yield no parts are not emitted. */
  parts: MessagePart[];
  /** The entry's `timestamp`, or an ISO string derived from its `ts` / the current time. */
  timestamp: string;
};

/**
 * Search for the actual agent transcript for a cron run.
 *
 * For main-target cron runs, the agent response lives in the main session
 * transcript files. This endpoint searches session files for the cron payload
 * text near the run timestamp and returns the matching conversation
 * (user message + assistant response).
 */

/**
 * Look through every agent's `sessions/sessions.json` for an entry whose key
 * refers to the given cron job (contains `:cron:<jobId>` but not `:run:`) and
 * return its session id.
 *
 * Only ids whose `<sessionId>.jsonl` transcript exists next to that
 * sessions.json are returned. Unreadable or malformed stores are skipped.
 *
 * NOTE(review): the key test is a substring match, so a job id that is a
 * prefix of another job id would match that other job's key as well.
 *
 * @param jobId Cron job identifier to look up.
 * @returns The first matching session id, or `null` when none is found.
 */
function findCronSessionId(jobId: string): string | null {
  if (!existsSync(AGENTS_DIR)) {
    return null;
  }

  const cronKeyFragment = `:cron:${jobId}`;

  try {
    const agents = readdirSync(AGENTS_DIR, { withFileTypes: true }).filter((dirent) =>
      dirent.isDirectory(),
    );

    for (const agent of agents) {
      const sessionsRoot = join(AGENTS_DIR, agent.name, "sessions");
      const indexPath = join(sessionsRoot, "sessions.json");
      if (!existsSync(indexPath)) {
        continue;
      }

      try {
        const index = JSON.parse(readFileSync(indexPath, "utf-8"));

        for (const key of Object.keys(index)) {
          // Per-run keys (":run:") are not the job-level session we want.
          const isJobLevelKey = key.includes(cronKeyFragment) && !key.includes(":run:");
          if (!isJobLevelKey) {
            continue;
          }

          const candidate = (index[key] as { sessionId?: string })?.sessionId;
          if (typeof candidate !== "string" || !candidate.trim()) {
            continue;
          }

          // Only report sessions whose transcript is actually on disk.
          if (existsSync(join(sessionsRoot, `${candidate}.jsonl`))) {
            return candidate;
          }
        }
      } catch {
        // Malformed or unreadable sessions.json: move on to the next agent.
      }
    }
  } catch {
    // Agents directory could not be listed: treat as "not found".
  }

  return null;
}

/**
 * Collect `.jsonl` session transcripts, across all agents, that were last
 * modified within two hours (either side) of the run time.
 *
 * NOTE(review): the filter uses the file's mtime, i.e. its last write. A
 * session that kept being appended to for more than two hours after the run
 * falls outside the window even though it may contain the run.
 *
 * @param runAtMs Run time in epoch milliseconds.
 * @returns Up to 10 file paths, ordered by how close their mtime is to `runAtMs`.
 */
function findCandidateSessionFiles(runAtMs: number): string[] {
  if (!existsSync(AGENTS_DIR)) {
    return [];
  }

  const windowMs = 2 * 60 * 60 * 1000;
  const distanceFromRun = (mtimeMs: number): number => Math.abs(mtimeMs - runAtMs);
  const nearby: Array<{ path: string; mtimeMs: number }> = [];

  try {
    for (const agent of readdirSync(AGENTS_DIR, { withFileTypes: true })) {
      if (!agent.isDirectory()) {
        continue;
      }

      const sessionsRoot = join(AGENTS_DIR, agent.name, "sessions");
      if (!existsSync(sessionsRoot)) {
        continue;
      }

      try {
        const transcripts = readdirSync(sessionsRoot).filter((name) => name.endsWith(".jsonl"));
        for (const name of transcripts) {
          const fullPath = join(sessionsRoot, name);
          try {
            const { mtimeMs } = statSync(fullPath);
            if (distanceFromRun(mtimeMs) < windowMs) {
              nearby.push({ path: fullPath, mtimeMs });
            }
          } catch {
            // File vanished or is not stat-able: ignore it.
          }
        }
      } catch {
        // Sessions directory not listable: ignore this agent.
      }
    }
  } catch {
    // Agents directory not listable: fall through with whatever was collected.
  }

  // Closest mtime first, capped at the 10 most likely files.
  return nearby
    .sort((a, b) => distanceFromRun(a.mtimeMs) - distanceFromRun(b.mtimeMs))
    .slice(0, 10)
    .map((candidate) => candidate.path);
}

/** Maximum number of characters of tool output kept per tool call. */
const MAX_TOOL_OUTPUT_CHARS = 5000;

/**
 * Flatten the payload of a tool-result part into plain text.
 * Order of preference: string `content`, text items of an array `content`,
 * string `output`, then a JSON dump of whatever is present.
 */
function stringifyToolResult(part: { content?: unknown; output?: unknown }): string {
  const { content, output } = part;
  if (typeof content === "string") {
    return content;
  }
  if (Array.isArray(content)) {
    return content
      .filter((item) => item != null && typeof item === "object" && item.type === "text")
      .map((item) => item.text)
      .join("\n");
  }
  if (typeof output === "string") {
    return output;
  }
  return JSON.stringify(output ?? content ?? "");
}

/**
 * Resolve the timestamp reported for an entry: its string `timestamp` as-is,
 * otherwise an ISO string built from a usable `timestamp`/`ts` value,
 * otherwise the current time. Never throws on invalid dates.
 */
function resolveEntryTimestamp(entry: { timestamp?: unknown; ts?: unknown }): string {
  if (typeof entry.timestamp === "string") {
    return entry.timestamp;
  }
  for (const raw of [entry.timestamp, entry.ts]) {
    if (typeof raw === "number" || typeof raw === "string") {
      const date = new Date(raw);
      if (!Number.isNaN(date.getTime())) {
        return date.toISOString();
      }
    }
  }
  return new Date().toISOString();
}

/**
 * Parse message entries from a JSONL transcript, optionally filtered by time range.
 *
 * Each non-blank line is parsed as JSON; only entries with `type === "message"`
 * and a `message` payload are considered. Malformed lines are skipped.
 *
 * Tool results are attached as `output` (truncated to 5000 characters) to the
 * tool-call part they answer, wherever that part was emitted: in the same
 * message or in any earlier one. A result with no known call becomes a
 * stand-alone tool-call part named "tool". Tool calls without an id receive a
 * unique `tool-<n>` id.
 *
 * @param content Raw JSONL text of a session transcript.
 * @param opts Optional inclusive bounds in epoch milliseconds. Entries whose
 *   timestamp cannot be parsed are not filtered out.
 * @returns Messages that produced at least one part, in file order.
 */
function parseMessagesInRange(
  content: string,
  opts?: { afterMs?: number; beforeMs?: number },
): ParsedMessage[] {
  type ToolCallPart = {
    type: "tool-call";
    toolName: string;
    toolCallId: string;
    args?: unknown;
    output?: string;
  };

  const afterMs = opts?.afterMs;
  const beforeMs = opts?.beforeMs;
  const hasRange = afterMs !== undefined || beforeMs !== undefined;

  const messages: ParsedMessage[] = [];
  // Every tool-call part emitted so far, by id, so results can be attached in place.
  const toolCallsById = new Map<string, ToolCallPart>();
  let anonymousToolCalls = 0;

  for (const line of content.split("\n")) {
    if (!line.trim()) {
      continue;
    }

    try {
      const entry = JSON.parse(line);
      if (entry == null || entry.type !== "message" || !entry.message) {
        continue;
      }

      if (hasRange) {
        const ts = entry.timestamp ? new Date(entry.timestamp).getTime() : (entry.ts ?? 0);
        if (afterMs !== undefined && ts < afterMs) {
          continue;
        }
        if (beforeMs !== undefined && ts > beforeMs) {
          continue;
        }
      }

      const msg = entry.message;
      const parts: MessagePart[] = [];

      if (Array.isArray(msg.content)) {
        for (const part of msg.content) {
          // Tolerate stray null/primitive items instead of dropping the whole message.
          if (part == null || typeof part !== "object") {
            continue;
          }

          switch (part.type) {
            case "text":
              if (typeof part.text === "string" && part.text.trim()) {
                parts.push({ type: "text", text: part.text });
              }
              break;

            case "thinking":
              if (typeof part.thinking === "string" && part.thinking.trim()) {
                parts.push({ type: "thinking", thinking: part.thinking });
              }
              break;

            case "tool_use":
            case "tool-call": {
              const toolName = part.name ?? part.toolName ?? "unknown";
              // A counter keeps fallback ids unique and deterministic.
              const toolCallId = part.id ?? part.toolCallId ?? `tool-${++anonymousToolCalls}`;
              const call: ToolCallPart = {
                type: "tool-call",
                toolName,
                toolCallId,
                args: part.input ?? part.args,
              };
              toolCallsById.set(toolCallId, call);
              parts.push(call);
              break;
            }

            case "tool_result":
            case "tool-result": {
              const toolCallId = part.tool_use_id ?? part.toolCallId ?? "";
              const output = stringifyToolResult(part).slice(0, MAX_TOOL_OUTPUT_CHARS);
              const call = toolCallsById.get(toolCallId);
              if (call) {
                // Attach to the originating call rather than emitting a duplicate part.
                call.output = output;
              } else {
                parts.push({ type: "tool-call", toolName: "tool", toolCallId, output });
              }
              break;
            }

            default:
              break;
          }
        }
      } else if (typeof msg.content === "string" && msg.content.trim()) {
        parts.push({ type: "text", text: msg.content });
      }

      if (parts.length === 0) {
        continue;
      }

      messages.push({
        id: entry.id ?? `msg-${messages.length}`,
        role: msg.role as "user" | "assistant" | "system",
        parts,
        timestamp: resolveEntryTimestamp(entry),
      });
    } catch {
      // skip malformed lines
    }
  }

  return messages;
}

/**
 * Concatenate the text of every "text" part of a message, one part per line.
 * Thinking and tool-call parts are ignored.
 */
function getMessageText(msg: ParsedMessage): string {
  const texts: string[] = [];
  for (const part of msg.parts) {
    if (part.type === "text") {
      texts.push(part.text);
    }
  }
  return texts.join("\n");
}

/**
 * Search session files for the cron run's conversation.
 *
 * A file matches when it contains a user message that (a) is timestamped
 * between 5 seconds before and 10 minutes after `runAtMs` and (b) includes
 * the first 40 characters of `summary`. The result is that user message plus
 * every following message up to, but not including, the next user message.
 *
 * Files are tried in the order given; the first match wins. Unreadable files
 * are skipped.
 *
 * @param sessionFiles Paths of JSONL transcripts to inspect, most likely first.
 * @param summary Cron payload text; only its first 40 characters are matched.
 * @param runAtMs Run time in epoch milliseconds.
 * @returns The matched conversation and the file it came from, or `null`.
 */
function searchForRunTranscript(
  sessionFiles: string[],
  summary: string,
  runAtMs: number,
): { messages: ParsedMessage[]; sessionFile: string } | null {
  // Match on a short, distinctive prefix of the summary.
  const needle = summary.slice(0, 40);
  // In the raw JSONL the text is JSON-encoded, so quotes, backslashes and
  // newlines in the needle appear escaped there. Strip the surrounding quotes
  // from the encoded form to get the needle as it appears inside a JSON string.
  const escapedNeedle = JSON.stringify(needle).slice(1, -1);

  // Search window: from 5s before run to 10 minutes after (heartbeat delay)
  const afterMs = runAtMs - 5_000;
  const beforeMs = runAtMs + 10 * 60_000;

  for (const filePath of sessionFiles) {
    try {
      const content = readFileSync(filePath, "utf-8");

      // Quick pre-check: skip files that contain the text in neither form,
      // before paying for a full parse.
      if (!content.includes(needle) && !content.includes(escapedNeedle)) {
        continue;
      }

      const allMessages = parseMessagesInRange(content);

      const start = allMessages.findIndex((msg) => {
        if (msg.role !== "user") {
          return false;
        }
        const msgTs = new Date(msg.timestamp).getTime();
        // A message whose time is unknown cannot be placed near the run.
        if (!Number.isFinite(msgTs) || msgTs < afterMs || msgTs > beforeMs) {
          return false;
        }
        return getMessageText(msg).includes(needle);
      });
      if (start === -1) {
        continue;
      }

      // The agent turn ends where the next user message begins.
      const nextUser = allMessages.findIndex((msg, idx) => idx > start && msg.role === "user");
      const end = nextUser === -1 ? allMessages.length : nextUser;

      return { messages: allMessages.slice(start, end), sessionFile: filePath };
    } catch {
      // skip unreadable files
    }
  }

  return null;
}

/**
 * GET /api/cron/runs/search-transcript?jobId=X&runAtMs=Y&summary=Z
 *
 * Locate the agent transcript of a cron run that has no direct sessionId.
 * Two strategies are tried in order:
 * 1. A cron-specific session recorded in an agent's sessions.json; its whole
 *    transcript is returned with `source: "cron-session"`.
 * 2. If `summary` is given, a text search of session files modified near the
 *    run time; the matching turn is returned with
 *    `source: "main-session-search"`.
 *
 * Responds 400 when `jobId`/`runAtMs` are missing or `runAtMs` is not a finite
 * number, and 404 when neither strategy finds a transcript.
 */
export async function GET(request: Request) {
  const { searchParams } = new URL(request.url);
  const jobId = searchParams.get("jobId");
  const rawRunAtMs = searchParams.get("runAtMs");
  const summary = searchParams.get("summary");

  if (!jobId || !rawRunAtMs) {
    return Response.json({ error: "jobId and runAtMs are required" }, { status: 400 });
  }

  const runAtMs = Number(rawRunAtMs);
  if (!Number.isFinite(runAtMs)) {
    return Response.json({ error: "Invalid runAtMs" }, { status: 400 });
  }

  // Strategy 1: a session dedicated to this cron job.
  const cronSessionId = findCronSessionId(jobId);
  if (cronSessionId) {
    try {
      for (const agent of readdirSync(AGENTS_DIR, { withFileTypes: true })) {
        if (!agent.isDirectory()) {
          continue;
        }

        const transcriptPath = join(AGENTS_DIR, agent.name, "sessions", `${cronSessionId}.jsonl`);
        if (!existsSync(transcriptPath)) {
          continue;
        }

        const messages = parseMessagesInRange(readFileSync(transcriptPath, "utf-8"));
        if (messages.length === 0) {
          // Empty transcript: keep looking in the remaining agents.
          continue;
        }

        return Response.json({
          sessionId: cronSessionId,
          messages,
          source: "cron-session",
        });
      }
    } catch {
      // Could not read the cron session: fall back to the text search below.
    }
  }

  // Strategy 2: text search in sessions touched around the run time.
  const match = summary
    ? searchForRunTranscript(findCandidateSessionFiles(runAtMs), summary, runAtMs)
    : null;
  if (match) {
    return Response.json({
      messages: match.messages,
      source: "main-session-search",
    });
  }

  return Response.json({ error: "Transcript not found" }, { status: 404 });
}