lakehouse/scripts/mode_compare.ts
root 7c47734287
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
v1/mode: parameterized runner + 5 enrichment-experiment modes
J's directive (2026-04-26): "Create different modes so we can really
dial in the architecture before it gets further along — pinpoint the
failures and strengths equally so I know what direction to go in.
Loop theater happens when we don't pinpoint the most accurate path."

Refactored execute() to switch on mode name → EnrichmentFlags preset.
Five native modes designed as deliberate experiments — each isolates
one architectural axis so the comparison matrix reads off what's
doing work vs what's adding latency for nothing:

  codereview_lakehouse     — all enrichment on (ceiling)
  codereview_null          — raw file + generic prompt (baseline)
  codereview_isolation     — file + pathway only (no matrix)
  codereview_matrix_only   — file + matrix only (no pathway)
  codereview_playbook_only — pathway only, NO file content (lossy ceiling)

Each call appends a row to data/_kb/mode_experiments.jsonl with full
sources + response. LH_MODE_LOG_OFF=1 to suppress.

scripts/mode_experiment.ts — sweeps files × modes serially, prints
live progress with per-call enrichment stats. Defaults to OpenRouter
free model so cloud quota doesn't gate experiments.

scripts/mode_compare.ts — reads the JSONL, outputs per-file matrix
+ per-mode aggregate + mode-vs-baseline win/loss with avg finding
delta. Heuristic finding-count from markdown table rows; pathway
citation count from preamble references.

scrum_master_pipeline.ts gets a mode-runner fast path gated by
LH_USE_MODE_RUNNER=1: try /v1/mode/execute first, fall through to
the existing ladder if response < LH_MODE_MIN_CHARS (default 2000)
or anything errors. Off by default until A/B-validated.

First experiment results (2 files × 5 modes via gpt-oss-120b:free):
  - codereview_null produces 12.6KB response with ZERO findings
    (proves adversarial framing is load-bearing)
  - codereview_playbook_only produces MORE findings than lakehouse
on average (12 vs 9) at 73% of the latency — pathway memory is
    the dominant signal driver
  - codereview_matrix_only underperforms isolation by ~0.5 findings
    while costing the same latency — matrix corpus likely
    underperforming for scrum_review task class

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 01:36:42 -05:00

187 lines
6.9 KiB
TypeScript

#!/usr/bin/env bun
/**
* Mode comparison aggregator — reads data/_kb/mode_experiments.jsonl
* (written per-call by /v1/mode/execute) and surfaces the cross-mode
* comparison matrix that lets us see what each enrichment dimension
* is actually doing.
*
* Per file, per mode, computes:
* - response_chars
* - finding_count (rows in markdown tables — heuristic, regex)
* - pathway_citations (mentions of "Pathway memory" or "📚")
* - latency_ms
* - matrix_chunks_kept / dropped
*
* Then surfaces:
* - per file, what each mode produced (rows next to each other)
* - per mode, average response_chars + latency
* - which modes ALWAYS underperform vs codereview_lakehouse
* - which signals (bug fingerprints, matrix) correlate with output size
*
* Usage: bun run scripts/mode_compare.ts [--jsonl path] [--since 2026-04-26]
*/
import { readFileSync, existsSync } from "node:fs";
/**
 * One logged /v1/mode/execute call, as appended per-call to
 * data/_kb/mode_experiments.jsonl (one JSON object per line).
 */
interface Row {
  /** ISO timestamp of the call; lexicographically comparable, used by --since. */
  ts: string;
  /** Mode preset name, e.g. "codereview_lakehouse". */
  mode: string;
  model: string;
  task_class: string;
  file_path: string;
  enriched_prompt_chars: number;
  response_chars: number;
  latency_ms: number;
  /** Enrichment provenance; all optional since each mode toggles dimensions off. */
  sources: {
    focus_file_bytes?: number;
    bug_fingerprints_count?: number;
    matrix_chunks_kept?: number;
    matrix_chunks_dropped?: number;
    relevance_filter_used?: boolean;
    // `unknown` instead of `any`: the flags-preset shape isn't pinned down
    // here, so consumers must narrow before use rather than silently
    // inheriting an unchecked type.
    flags?: unknown;
  };
  /** Full model response (markdown); findings/citations are parsed out of it. */
  response: string;
}
/**
 * Parse `--name value` CLI flags from Bun.argv.
 *
 * Returns the JSONL log path (defaulting to the standard experiment log)
 * and an optional ISO-date lower bound for row filtering.
 *
 * Fix: the previous version did `out[flag] = args[++i]`, which consumed the
 * NEXT token unconditionally — so `--jsonl --since 2026-04-26` would swallow
 * `--since` as the value of `--jsonl`. Now a following token is only taken
 * as a value when it is not itself a flag.
 */
function parseArgs(): { jsonl: string; since: string | null } {
  const args = Bun.argv.slice(2);
  const out: Record<string, string> = {};
  for (let i = 0; i < args.length; i++) {
    const a = args[i];
    if (!a.startsWith("--")) continue;
    const next = args[i + 1];
    if (next !== undefined && !next.startsWith("--")) {
      out[a.slice(2)] = next;
      i++; // value consumed
    } else {
      out[a.slice(2)] = ""; // flag given with no value
    }
  }
  return {
    jsonl: out.jsonl ?? "data/_kb/mode_experiments.jsonl",
    // `||` (not `??`) is deliberate: an empty --since means "no filter".
    since: out.since || null,
  };
}
/**
 * Read the experiment log at `path`, one JSON Row per line.
 *
 * Exits the process (code 1) when the file does not exist. Lines that fail
 * to parse are silently dropped (best-effort ingest), as are rows whose
 * timestamp sorts before `since` when a filter is given.
 */
function loadRows(path: string, since: string | null): Row[] {
  if (!existsSync(path)) {
    console.error(`[compare] no log file at ${path}`);
    process.exit(1);
  }
  const kept: Row[] = [];
  const raw = readFileSync(path, "utf8");
  for (const line of raw.split("\n")) {
    if (!line) continue; // blank / trailing newline
    let row: Row;
    try {
      row = JSON.parse(line);
    } catch {
      continue; // skip malformed
    }
    if (since && row.ts < since) continue;
    kept.push(row);
  }
  return kept;
}
/**
 * Heuristic finding count: markdown table rows whose first cell is a
 * (possibly bold) number, e.g. `| 3 | ...` or `| **3** | ...`.
 * Adversarial framing produces ranked tables, so this approximates findings.
 */
function countFindings(md: string): number {
  const numberedRow = /^\|\s*\*?\*?\d+\*?\*?\s*\|/gm;
  return (md.match(numberedRow) ?? []).length;
}
/**
 * Count references to the pathway-memory preamble: case-insensitive
 * "pathway memory" mentions plus the 📚 marker emoji.
 */
function countPathwayCitations(md: string): number {
  const hits = md.match(/pathway\s*memory|📚/gi);
  return hits === null ? 0 : hits.length;
}
/**
 * Fit `s` into exactly `n` characters: truncate when too long, otherwise
 * space-pad — on the left when `right` is true (right-align, used for
 * numeric columns), on the right otherwise (left-align).
 */
function pad(s: string | number, n: number, right = false): string {
  const text = `${s}`;
  if (text.length >= n) return text.slice(0, n);
  const fill = " ".repeat(n - text.length);
  return right ? fill + text : text + fill;
}
/**
 * Entry point: load logged rows, then print three report sections —
 * a per-file × per-mode matrix, per-mode averages, and a win/loss
 * comparison of every mode against the codereview_lakehouse baseline.
 * All output goes to stdout as fixed-width text tables.
 */
function main() {
const { jsonl, since } = parseArgs();
const rows = loadRows(jsonl, since);
if (rows.length === 0) {
console.error("[compare] no rows after filter");
process.exit(1);
}
// Group by file → mode
// Re-running an experiment overwrites the earlier row for that file/mode
// pair, so the per-file matrix and the baseline comparison always reflect
// the latest run.
const byFile: Record<string, Record<string, Row>> = {};
const allModes = new Set<string>();
for (const r of rows) {
byFile[r.file_path] ??= {};
byFile[r.file_path][r.mode] = r; // last-write-wins per mode per file
allModes.add(r.mode);
}
const modesSorted = [...allModes].sort();
// Per-file matrix
// One fixed-width table per file: response size, heuristic finding count,
// pathway citations, latency, matrix chunks kept/total, bug fingerprints.
console.log("\n═══ PER-FILE COMPARISON ═══\n");
for (const file of Object.keys(byFile).sort()) {
console.log(`📄 ${file}`);
console.log(
` ${pad("mode", 28)} ${pad("resp", 6, true)} ${pad("findings", 8, true)} ${pad("path_cit", 8, true)} ${pad("ms", 7, true)} ${pad("mtx k/d", 9, true)} ${pad("bug_fp", 6, true)}`
);
console.log(` ${"─".repeat(28)} ${"─".repeat(6)} ${"─".repeat(8)} ${"─".repeat(8)} ${"─".repeat(7)} ${"─".repeat(9)} ${"─".repeat(6)}`);
for (const mode of modesSorted) {
const r = byFile[file][mode];
if (!r) {
// Mode never ran against this file: print a placeholder row.
console.log(` ${pad(mode, 28)} ${pad("—", 6, true)}`);
continue;
}
const findings = countFindings(r.response);
const cits = countPathwayCitations(r.response);
const mk = r.sources.matrix_chunks_kept ?? 0;
const md = r.sources.matrix_chunks_dropped ?? 0;
const bf = r.sources.bug_fingerprints_count ?? 0;
console.log(
` ${pad(mode, 28)} ${pad(r.response_chars, 6, true)} ${pad(findings, 8, true)} ${pad(cits, 8, true)} ${pad(r.latency_ms, 7, true)} ${pad(`${mk}/${mk + md}`, 9, true)} ${pad(bf, 6, true)}`
);
}
console.log("");
}
// Per-mode averages
// NOTE(review): averages here run over ALL loaded rows — including
// earlier runs that the last-write-wins byFile grouping superseded —
// so denominators can differ from the per-file matrix above; confirm
// whether duplicates should be excluded.
console.log("═══ PER-MODE AGGREGATE ═══\n");
console.log(` ${pad("mode", 28)} ${pad("n", 4, true)} ${pad("avg resp", 9, true)} ${pad("avg find", 9, true)} ${pad("avg cit", 8, true)} ${pad("avg ms", 8, true)}`);
console.log(` ${"─".repeat(28)} ${"─".repeat(4)} ${"─".repeat(9)} ${"─".repeat(9)} ${"─".repeat(8)} ${"─".repeat(8)}`);
for (const mode of modesSorted) {
const modeRows = rows.filter(r => r.mode === mode);
if (modeRows.length === 0) continue;
const n = modeRows.length;
const avgResp = Math.round(modeRows.reduce((s, r) => s + r.response_chars, 0) / n);
// Round-to-one-decimal via the ×10 / ÷10 trick.
const avgFind = Math.round(10 * modeRows.reduce((s, r) => s + countFindings(r.response), 0) / n) / 10;
const avgCit = Math.round(10 * modeRows.reduce((s, r) => s + countPathwayCitations(r.response), 0) / n) / 10;
const avgMs = Math.round(modeRows.reduce((s, r) => s + r.latency_ms, 0) / n);
console.log(
` ${pad(mode, 28)} ${pad(n, 4, true)} ${pad(avgResp, 9, true)} ${pad(avgFind, 9, true)} ${pad(avgCit, 8, true)} ${pad(avgMs, 8, true)}`
);
}
// Mode-relative: how often does each mode produce MORE findings than lakehouse?
// Win = strictly more findings than the baseline on the same file; files
// missing either the baseline or the challenger row are skipped entirely.
console.log("\n═══ MODE vs codereview_lakehouse (per file) ═══\n");
console.log(` ${pad("mode", 28)} ${pad("wins", 5, true)} ${pad("losses", 7, true)} ${pad("ties", 5, true)} ${pad("Δ avg findings", 16, true)}`);
console.log(` ${"─".repeat(28)} ${"─".repeat(5)} ${"─".repeat(7)} ${"─".repeat(5)} ${"─".repeat(16)}`);
for (const mode of modesSorted) {
if (mode === "codereview_lakehouse") continue;
let wins = 0, losses = 0, ties = 0, totalDelta = 0, n = 0;
for (const file of Object.keys(byFile)) {
const baseline = byFile[file]["codereview_lakehouse"];
const challenger = byFile[file][mode];
if (!baseline || !challenger) continue;
const bf = countFindings(baseline.response);
const cf = countFindings(challenger.response);
if (cf > bf) wins++;
else if (cf < bf) losses++;
else ties++;
totalDelta += cf - bf;
n++;
}
// No comparable file pairs for this mode → omit the row.
if (n === 0) continue;
const avgDelta = (totalDelta / n).toFixed(1);
console.log(
` ${pad(mode, 28)} ${pad(wins, 5, true)} ${pad(losses, 7, true)} ${pad(ties, 5, true)} ${pad(avgDelta, 16, true)}`
);
}
console.log("\n[compare] done\n");
}
main();