lakehouse/scripts/mode_compare.ts

#!/usr/bin/env bun
/**
 * Mode comparison aggregator — reads data/_kb/mode_experiments.jsonl
 * (written per-call by /v1/mode/execute) and surfaces the cross-mode
 * comparison matrix that lets us see what each enrichment dimension
 * is actually doing.
 *
 * Per file, per mode, computes:
 *   - response_chars
 *   - finding_count (rows in markdown tables — heuristic, regex)
 *   - pathway_citations (mentions of "Pathway memory" or "📚")
 *   - latency_ms
 *   - matrix_chunks_kept / dropped
 *
 * Then surfaces:
 *   - per file, what each mode produced (rows next to each other)
 *   - per mode, average response_chars + latency
 *   - which modes ALWAYS underperform vs codereview_lakehouse
 *   - which signals (bug fingerprints, matrix) correlate with output size
 *
 * Usage: bun run scripts/mode_compare.ts [--jsonl path] [--since 2026-04-26]
 */

import { readFileSync, existsSync } from "node:fs";

interface Row {
  ts: string;
  mode: string;
  model: string;
  task_class: string;
  file_path: string;
  enriched_prompt_chars: number;
  response_chars: number;
  latency_ms: number;
  sources: {
    focus_file_bytes?: number;
    bug_fingerprints_count?: number;
    matrix_chunks_kept?: number;
    matrix_chunks_dropped?: number;
    relevance_filter_used?: boolean;
    flags?: any;
  };
  response: string;
}

function parseArgs(): { jsonl: string; since: string | null } {
  const args = Bun.argv.slice(2);
  const out: Record<string, string> = {};
  for (let i = 0; i < args.length; i++) {
    const a = args[i];
    if (a.startsWith("--")) out[a.slice(2)] = args[++i] ?? "";
  }
  return {
    jsonl: out.jsonl ?? "data/_kb/mode_experiments.jsonl",
    since: out.since || null,
  };
}

function loadRows(path: string, since: string | null): Row[] {
  if (!existsSync(path)) {
    console.error(`[compare] no log file at ${path}`);
    process.exit(1);
  }
  const lines = readFileSync(path, "utf8").split("\n").filter(Boolean);
  const rows: Row[] = [];
  for (const line of lines) {
    try {
      const r: Row = JSON.parse(line);
      if (since && r.ts < since) continue;
      rows.push(r);
    } catch {
      // skip malformed
    }
  }
  return rows;
}

function countFindings(md: string): number {
  // Adversarial framing produces a "Ranked Findings" table early in
  // the output. The original regex `^\|\s*\*?\*?\d+\*?\*?\s*\|` matched
  // ANY numbered table row — including the patch table that follows
  // the findings table, double-counting every finding.
  //
  // Fix: only count rows under a "Ranked Findings" / "Findings" header
  // until we hit the next ## heading or a "Patch" / "Suggestion" header.
  // Falls back to the loose count if no findings header is detected
  // (some modes use different framing).
  const findingsSectionRe = /(?:^|\n)#{1,3}[^\na-zA-Z]*(?:Ranked\s+)?Findings?[^\n]*\n/i;
  const m = md.match(findingsSectionRe);
  if (m && m.index !== undefined) {
    const after = md.slice(m.index + m[0].length);
    // Stop at the next ## heading or Patch/Suggestion header.
    // Allow non-letter chars (emoji/space) between # and the keyword
    // so headers like `## 🛠️ Concrete Patch Suggestions` get caught.
    const stopRe = /\n#{1,3}[^\na-zA-Z]*(?:Patch|Suggestion|Reference|Summary|Concrete)/i;
    const stop = after.search(stopRe);
    const section = stop >= 0 ? after.slice(0, stop) : after;
    const rows = section.match(/^\|\s*\*?\*?\d+\*?\*?\s*\|/gm);
    return rows ? rows.length : 0;
  }
  // Fallback for outputs without a labeled findings section.
  const all = md.match(/^\|\s*\*?\*?\d+\*?\*?\s*\|/gm);
  return all ? all.length : 0;
}

function countPathwayCitations(md: string): number {
  // How many times the model referenced the pathway memory preamble.
  const re = /pathway\s*memory|📚/gi;
  return (md.match(re) ?? []).length;
}

// ─── Grounding check ───
// A finding is "grounded" if the symbols it cites actually exist in
// the focus file AND any cited line numbers fall within the file's
// real line count. Anti-pollution measure surfaced 2026-04-26 after
// codereview_playbook_only produced 8 findings citing lines 378-945
// in a 332-line file (all hallucinated from pathway-memory preamble
// since the mode doesn't pass file content to the model).

interface GroundingResult {
  total: number;
  grounded: number;
  partial: number;
  hallucinated: number;
  out_of_bounds_lines: number; // findings citing lines past EOF
  details: { row: string; verdict: string }[];
}

function extractFindings(md: string): { full: string; symbols: string[]; lines: number[] }[] {
  // Pull each finding row from the Findings section (uses the same
  // emoji-tolerant section-detection logic as countFindings).
  const findingsSectionRe = /(?:^|\n)#{1,3}[^\na-zA-Z]*(?:Ranked\s+)?Findings?[^\n]*\n/i;
  const m = md.match(findingsSectionRe);
  let section = md;
  if (m && m.index !== undefined) {
    const after = md.slice(m.index + m[0].length);
    const stopRe = /\n#{1,3}[^\na-zA-Z]*(?:Patch|Suggestion|Reference|Summary|Concrete)/i;
    const stop = after.search(stopRe);
    section = stop >= 0 ? after.slice(0, stop) : after;
  }
  const rows = section.split("\n").filter(l => /^\|\s*\*?\*?\d+\*?\*?\s*\|/.test(l));
  return rows.map(row => {
    // Symbols: backtick-quoted identifiers, also bare snake_case_words
    const sym = new Set<string>();
    for (const t of row.matchAll(/`([A-Za-z_][A-Za-z0-9_:]*)`/g)) sym.add(t[1]);
    for (const t of row.matchAll(/\b([a-z][a-z0-9_]{4,})\b/g)) {
      const w = t[1];
      // Filter common words that aren't symbols
      if (!["score", "level", "match", "table", "value", "where", "found", "would", "after", "before", "calls", "needs", "patch", "should", "missing", "violation", "evidence", "checks", "either", "audit", "later", "field", "rules", "stage", "early", "always", "later", "could", "leaks", "memory"].includes(w)) {
        sym.add(w);
      }
    }
    // Line numbers from `path:NNN` or `:NNN-NNN` patterns
    const lineNums: number[] = [];
    for (const t of row.matchAll(/[:‑\-](\d{2,5})(?:[‑\-](\d{2,5}))?/g)) {
      lineNums.push(parseInt(t[1]));
      if (t[2]) lineNums.push(parseInt(t[2]));
    }
    return { full: row, symbols: [...sym], lines: lineNums };
  });
}

function checkGrounding(md: string, fileContent: string | null): GroundingResult {
  const findings = extractFindings(md);
  if (!fileContent) {
    return { total: findings.length, grounded: 0, partial: 0, hallucinated: 0, out_of_bounds_lines: 0, details: [] };
  }
  const fileLines = fileContent.split("\n").length;
  const result: GroundingResult = {
    total: findings.length, grounded: 0, partial: 0, hallucinated: 0, out_of_bounds_lines: 0, details: [],
  };
  for (const f of findings) {
    const sym_hits = f.symbols.filter(s => fileContent.includes(s));
    const symbol_grounded = f.symbols.length === 0 ? false : sym_hits.length > 0;
    const line_oob = f.lines.length > 0 && f.lines.some(l => l > fileLines);
    if (line_oob) result.out_of_bounds_lines++;
    let verdict: string;
    if (sym_hits.length > 0 && !line_oob) {
      result.grounded++;
      verdict = `grounded (${sym_hits.length}/${f.symbols.length} syms hit)`;
    } else if (sym_hits.length > 0 && line_oob) {
      result.partial++;
      verdict = `partial (real syms but lines >${fileLines} EOF)`;
    } else if (symbol_grounded) {
      result.partial++;
      verdict = "partial";
    } else {
      result.hallucinated++;
      verdict = `hallucinated (0/${f.symbols.length} syms hit${line_oob ? `, lines>${fileLines}` : ''})`;
    }
    result.details.push({ row: f.full.slice(0, 80), verdict });
  }
  return result;
}

function readFileSafe(path: string): string | null {
  try {
    return readFileSync(path, "utf8");
  } catch {
    return null;
  }
}

function pad(s: string | number, n: number, right = false): string {
  const str = String(s);
  if (str.length >= n) return str.slice(0, n);
  return right ? " ".repeat(n - str.length) + str : str + " ".repeat(n - str.length);
}

// Modes intentionally lossy or designed to expose architectural axes —
// their numerical wins should NOT be read as recommendations. Tagged
// in the output so a glance at the matrix doesn't mislead.
const CONTROL_MODES = new Set([
  "codereview_null",          // baseline — no enrichment, generic framing
  "codereview_playbook_only", // lossy — pathway only, NO file content
]);

function modeLabel(mode: string): string {
  return CONTROL_MODES.has(mode) ? `${mode} ⚗`  : mode;
}

function main() {
  const { jsonl, since } = parseArgs();
  const rows = loadRows(jsonl, since);
  if (rows.length === 0) {
    console.error("[compare] no rows after filter");
    process.exit(1);
  }

  // Group by file → mode
  const byFile: Record<string, Record<string, Row>> = {};
  const allModes = new Set<string>();
  for (const r of rows) {
    byFile[r.file_path] ??= {};
    byFile[r.file_path][r.mode] = r; // last-write-wins per mode per file
    allModes.add(r.mode);
  }
  const modesSorted = [...allModes].sort();

  // Per-file matrix
  console.log("\n═══ PER-FILE COMPARISON ═══");
  console.log("(⚗ = control/lossy mode — wins should not be read as recommendations)\n");
  for (const file of Object.keys(byFile).sort()) {
    console.log(`📄 ${file}`);
    const fileContent = readFileSafe(file);
    const fileLines = fileContent ? fileContent.split("\n").length : 0;
    console.log(`   (file: ${fileLines} lines${fileContent === null ? ", NOT READABLE — grounding skipped" : ""})`);
    console.log(
      `  ${pad("mode", 30)} ${pad("resp", 6, true)} ${pad("find", 5, true)} ${pad("ground", 9, true)} ${pad("hallu", 6, true)} ${pad("OOB", 4, true)} ${pad("path", 5, true)} ${pad("ms", 7, true)} ${pad("bug_fp", 6, true)}`
    );
    console.log(`  ${"─".repeat(30)} ${"─".repeat(6)} ${"─".repeat(5)} ${"─".repeat(9)} ${"─".repeat(6)} ${"─".repeat(4)} ${"─".repeat(5)} ${"─".repeat(7)} ${"─".repeat(6)}`);
    for (const mode of modesSorted) {
      const r = byFile[file][mode];
      if (!r) {
        console.log(`  ${pad(modeLabel(mode), 30)} ${pad("—", 6, true)}`);
        continue;
      }
      const findings = countFindings(r.response);
      const cits = countPathwayCitations(r.response);
      const bf = r.sources.bug_fingerprints_count ?? 0;
      const grounding = checkGrounding(r.response, fileContent);
      const groundedStr = grounding.total === 0 ? "—" : `${grounding.grounded}/${grounding.total}`;
      console.log(
        `  ${pad(modeLabel(mode), 30)} ${pad(r.response_chars, 6, true)} ${pad(findings, 5, true)} ${pad(groundedStr, 9, true)} ${pad(grounding.hallucinated, 6, true)} ${pad(grounding.out_of_bounds_lines, 4, true)} ${pad(cits, 5, true)} ${pad(r.latency_ms, 7, true)} ${pad(bf, 6, true)}`
      );
    }
    console.log("");
  }

  // Per-mode averages — grounded findings is now the primary metric.
  // avg_groundedness is the rate at which findings cite real symbols
  // within file bounds. Modes with low groundedness are confabulating.
  console.log("═══ PER-MODE AGGREGATE ═══\n");
  console.log(`  ${pad("mode", 30)} ${pad("n", 3, true)} ${pad("avg find", 9, true)} ${pad("avg grnd", 9, true)} ${pad("grnd %", 7, true)} ${pad("avg hallu", 10, true)} ${pad("avg ms", 7, true)}`);
  console.log(`  ${"─".repeat(30)} ${"─".repeat(3)} ${"─".repeat(9)} ${"─".repeat(9)} ${"─".repeat(7)} ${"─".repeat(10)} ${"─".repeat(7)}`);
  const fileCache: Record<string, string | null> = {};
  for (const mode of modesSorted) {
    const modeRows = rows.filter(r => r.mode === mode);
    if (modeRows.length === 0) continue;
    const n = modeRows.length;
    let totFind = 0, totGround = 0, totHallu = 0;
    for (const r of modeRows) {
      const fc = fileCache[r.file_path] ??= readFileSafe(r.file_path);
      const g = checkGrounding(r.response, fc);
      totFind += g.total;
      totGround += g.grounded;
      totHallu += g.hallucinated;
    }
    const avgFind = (totFind / n).toFixed(1);
    const avgGround = (totGround / n).toFixed(1);
    const grndPct = totFind > 0 ? `${Math.round(100 * totGround / totFind)}%` : "—";
    const avgHallu = (totHallu / n).toFixed(1);
    const avgMs = Math.round(modeRows.reduce((s, r) => s + r.latency_ms, 0) / n);
    console.log(
      `  ${pad(modeLabel(mode), 30)} ${pad(n, 3, true)} ${pad(avgFind, 9, true)} ${pad(avgGround, 9, true)} ${pad(grndPct, 7, true)} ${pad(avgHallu, 10, true)} ${pad(avgMs, 7, true)}`
    );
  }

  // Mode-relative: GROUNDED findings vs lakehouse. The earlier raw
  // finding-count comparison rewarded confabulation (more rows = more
  // wins). Comparing grounded findings instead corrects for modes
  // that produce convincing-but-fake output.
  console.log("\n═══ MODE vs codereview_lakehouse (grounded findings, per file) ═══\n");
  console.log(`  ${pad("mode", 30)} ${pad("wins", 5, true)} ${pad("losses", 7, true)} ${pad("ties", 5, true)} ${pad("Δ avg grounded", 16, true)}`);
  console.log(`  ${"─".repeat(30)} ${"─".repeat(5)} ${"─".repeat(7)} ${"─".repeat(5)} ${"─".repeat(16)}`);
  for (const mode of modesSorted) {
    if (mode === "codereview_lakehouse") continue;
    let wins = 0, losses = 0, ties = 0, totalDelta = 0, n = 0;
    for (const file of Object.keys(byFile)) {
      const baseline = byFile[file]["codereview_lakehouse"];
      const challenger = byFile[file][mode];
      if (!baseline || !challenger) continue;
      const fc = fileCache[file] ??= readFileSafe(file);
      const bg = checkGrounding(baseline.response, fc).grounded;
      const cg = checkGrounding(challenger.response, fc).grounded;
      if (cg > bg) wins++;
      else if (cg < bg) losses++;
      else ties++;
      totalDelta += cg - bg;
      n++;
    }
    if (n === 0) continue;
    const avgDelta = (totalDelta / n).toFixed(1);
    console.log(
      `  ${pad(modeLabel(mode), 30)} ${pad(wins, 5, true)} ${pad(losses, 7, true)} ${pad(ties, 5, true)} ${pad(avgDelta, 16, true)}`
    );
  }
  console.log("\n[compare] done — ⚗ marks lossy/control modes, exclude from recommendations\n");
}

main();