#!/usr/bin/env bun
/**
 * Mode comparison aggregator — reads data/_kb/mode_experiments.jsonl
 * (written per-call by /v1/mode/execute) and surfaces the cross-mode
 * comparison matrix that lets us see what each enrichment dimension
 * is actually doing.
 *
 * Per file, per mode, prints:
 *   - response_chars
 *   - finding count (numbered rows in markdown tables — heuristic, regex)
 *   - grounded / hallucinated / out-of-bounds findings (see checkGrounding)
 *   - pathway citations (mentions of "Pathway memory" or "📚")
 *   - latency_ms
 *   - bug_fingerprints_count
 *
 * Then prints:
 *   - per mode, average findings, grounded findings, hallucinations, latency
 *   - per mode, wins/losses/ties in grounded findings vs codereview_lakehouse
 *
 * Usage: bun run scripts/mode_compare.ts [--jsonl path] [--since 2026-04-26]
 */
import { readFileSync, existsSync } from "node:fs";

/** One line of the experiments log, as far as this script reads it. */
interface Row {
  ts: string;
  mode: string;
  model: string;
  task_class: string;
  file_path: string;
  enriched_prompt_chars: number;
  response_chars: number;
  latency_ms: number;
  // Optional because the corpus lookup has always tolerated rows without it.
  sources?: {
    focus_file_bytes?: number;
    bug_fingerprints_count?: number;
    matrix_chunks_kept?: number;
    matrix_chunks_dropped?: number;
    relevance_filter_used?: boolean;
    // A list on the wire (multi-corpus support); legacy rows carry a string or null.
    matrix_corpus?: string | string[] | null;
    flags?: unknown;
  };
  response: string;
}

const DEFAULT_JSONL = "data/_kb/mode_experiments.jsonl";

/**
 * Parses `--key value` pairs from the command line.
 *
 * @returns the log path (default when `--jsonl` is absent or has no value)
 *          and the `--since` lower bound, or null when not given.
 */
function parseArgs(): { jsonl: string; since: string | null } {
  const args = Bun.argv.slice(2);
  const out = new Map<string, string>();
  for (let i = 0; i < args.length; i++) {
    const a = args[i];
    // Every --flag consumes the following token as its value.
    if (a?.startsWith("--")) out.set(a.slice(2), args[++i] ?? "");
  }
  return {
    // `||` rather than `??`: a bare `--jsonl` with no value should fall back
    // to the default path, the same way a bare `--since` falls back to null.
    jsonl: out.get("jsonl") || DEFAULT_JSONL,
    since: out.get("since") || null,
  };
}

/**
 * Minimal shape check for a parsed log line: only the fields the report
 * dereferences unconditionally. Everything else is read defensively.
 */
function isRow(value: unknown): value is Row {
  if (typeof value !== "object" || value === null) return false;
  const v = value as Record<string, unknown>;
  return (
    typeof v.mode === "string" &&
    typeof v.file_path === "string" &&
    typeof v.response === "string"
  );
}

/**
 * Loads the JSONL log, dropping rows whose `ts` sorts before `since`.
 * Exits the process with status 1 when the file does not exist.
 * Unparseable or structurally unusable lines are skipped (best effort) and
 * their count is reported on stderr.
 */
function loadRows(path: string, since: string | null): Row[] {
  if (!existsSync(path)) {
    console.error(`[compare] no log file at ${path}`);
    process.exit(1);
  }
  const rows: Row[] = [];
  let skipped = 0;
  for (const line of readFileSync(path, "utf8").split("\n")) {
    if (!line.trim()) continue;
    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      skipped++;
      continue;
    }
    if (!isRow(parsed)) {
      skipped++;
      continue;
    }
    // Plain string comparison: relies on `ts` and `--since` sharing a
    // lexicographically sortable format.
    if (since && parsed.ts < since) continue;
    rows.push(parsed);
  }
  if (skipped > 0) {
    console.error(`[compare] skipped ${skipped} malformed line(s) in ${path}`);
  }
  return rows;
}

// Header that opens the findings table. Non-letter chars (emoji/space) are
// allowed between the #'s and the keyword.
const FINDINGS_HEADER_RE = /(?:^|\n)#{1,3}[^\na-zA-Z]*(?:Ranked\s+)?Findings?[^\n]*\n/i;
// Header that closes the findings section, e.g. `## 🛠️ Concrete Patch
// Suggestions`. `^` is accepted as well as `\n` because the findings-header
// match consumes its own trailing newline: a stop header on the very next
// line would otherwise go unnoticed.
const SECTION_STOP_RE = /(?:^|\n)#{1,3}[^\na-zA-Z]*(?:Patch|Suggestion|Reference|Summary|Concrete)/i;
// A numbered table row such as `| 1 |` or `| **2** |`.
const NUMBERED_ROW_RE = /^\|\s*\*?\*?\d+\*?\*?\s*\|/;
const NUMBERED_ROWS_RE = /^\|\s*\*?\*?\d+\*?\*?\s*\|/gm;

/**
 * Returns the text in which finding rows should be looked for: the part
 * between a "Findings" / "Ranked Findings" header and the next
 * Patch/Suggestion/Reference/Summary/Concrete header. When no findings
 * header exists (some modes use different framing) the whole response is
 * returned.
 */
function findingsScope(md: string): string {
  const header = FINDINGS_HEADER_RE.exec(md);
  if (!header) return md;
  const after = md.slice(header.index + header[0].length);
  const stop = after.search(SECTION_STOP_RE);
  return stop >= 0 ? after.slice(0, stop) : after;
}

/**
 * Counts numbered table rows in the findings section.
 *
 * Adversarial framing produces a "Ranked Findings" table early in the
 * output. Counting every numbered row in the response would also count the
 * patch table that follows it, double-counting every finding — hence the
 * restriction to the findings section.
 */
function countFindings(md: string): number {
  const rows = findingsScope(md).match(NUMBERED_ROWS_RE);
  return rows ? rows.length : 0;
}

/** How many times the model referenced the pathway memory preamble. */
function countPathwayCitations(md: string): number {
  return (md.match(/pathway\s*memory|📚/gi) ?? []).length;
}

// ─── Grounding check ───
// A finding is "grounded" if the symbols it cites actually exist in
// the focus file AND any cited line numbers fall within the file's
// real line count. Anti-pollution measure surfaced 2026-04-26 after
// codereview_playbook_only produced 8 findings citing lines 378-945
// in a 332-line file (all hallucinated from pathway-memory preamble
// since the mode doesn't pass file content to the model).
interface GroundingResult {
  total: number;
  grounded: number;
  partial: number;
  hallucinated: number;
  out_of_bounds_lines: number; // findings citing lines past EOF
  details: { row: string; verdict: string }[];
}

// Common lowercase words of 5+ letters that are not symbols.
const NON_SYMBOL_WORDS = new Set<string>([
  "score", "level", "match", "table", "value", "where", "found", "would",
  "after", "before", "calls", "needs", "patch", "should", "missing",
  "violation", "evidence", "checks", "either", "audit", "later", "field",
  "rules", "stage", "early", "always", "could", "leaks", "memory",
]);

/**
 * Pulls each numbered row out of the findings section and extracts what it
 * cites.
 *
 * @returns per row: the raw row text, candidate symbols (backtick-quoted
 *          identifiers plus bare lowercase words of 5+ chars that are not in
 *          the stop list), and cited line numbers.
 */
function extractFindings(md: string): { full: string; symbols: string[]; lines: number[] }[] {
  const rows = findingsScope(md)
    .split("\n")
    .filter((l) => NUMBERED_ROW_RE.test(l));

  return rows.map((row) => {
    const symbols = new Set<string>();
    for (const t of row.matchAll(/`([A-Za-z_][A-Za-z0-9_:]*)`/g)) {
      if (t[1]) symbols.add(t[1]);
    }
    for (const t of row.matchAll(/\b([a-z][a-z0-9_]{4,})\b/g)) {
      const w = t[1];
      if (w && !NON_SYMBOL_WORDS.has(w)) symbols.add(w);
    }

    // Line numbers from `path:NNN` or `:NNN-NNN` patterns. The separator
    // class covers ASCII hyphen, U+2010 hyphen, U+2011 non-breaking hyphen
    // and U+2013 en dash. Only 2-5 digit numbers are picked up.
    const lines: number[] = [];
    for (const t of row.matchAll(/[:\u2010\u2011\u2013\-](\d{2,5})(?:[\u2010\u2011\u2013\-](\d{2,5}))?/g)) {
      if (t[1]) lines.push(parseInt(t[1], 10));
      if (t[2]) lines.push(parseInt(t[2], 10));
    }
    return { full: row, symbols: [...symbols], lines };
  });
}

/** Number of lines in `text`; a trailing newline does not start a new line. */
function countLines(text: string): number {
  if (text === "") return 0;
  const parts = text.split("\n").length;
  return text.endsWith("\n") ? parts - 1 : parts;
}

/**
 * Classifies every finding in `md` against the focus file.
 *
 * - grounded: at least one cited symbol occurs in the file (substring match)
 *   and no cited line number is past EOF
 * - partial: a symbol occurs in the file but a cited line is past EOF
 * - hallucinated: no cited symbol occurs in the file
 *
 * @param fileContent focus file text. When null or empty the check is
 *        skipped: `total` is still reported but every other counter stays 0,
 *        so callers must not read those zeros as "nothing grounded".
 */
function checkGrounding(md: string, fileContent: string | null): GroundingResult {
  const findings = extractFindings(md);
  const result: GroundingResult = {
    total: findings.length,
    grounded: 0,
    partial: 0,
    hallucinated: 0,
    out_of_bounds_lines: 0,
    details: [],
  };
  if (!fileContent) return result;

  const fileLines = countLines(fileContent);
  for (const f of findings) {
    const symHits = f.symbols.filter((s) => fileContent.includes(s));
    const lineOob = f.lines.some((l) => l > fileLines);
    if (lineOob) result.out_of_bounds_lines++;

    let verdict: string;
    if (symHits.length > 0 && !lineOob) {
      result.grounded++;
      verdict = `grounded (${symHits.length}/${f.symbols.length} syms hit)`;
    } else if (symHits.length > 0) {
      result.partial++;
      verdict = `partial (real syms but lines >${fileLines} EOF)`;
    } else {
      result.hallucinated++;
      verdict = `hallucinated (0/${f.symbols.length} syms hit${lineOob ? `, lines>${fileLines}` : ""})`;
    }
    result.details.push({ row: f.full.slice(0, 80), verdict });
  }
  return result;
}

/** Reads a file as UTF-8, returning null instead of throwing on any error. */
function readFileSafe(path: string): string | null {
  try {
    return readFileSync(path, "utf8");
  } catch {
    return null;
  }
}

/**
 * Fits `s` to exactly `n` characters: truncates when too long, otherwise
 * pads with spaces on the right (default) or on the left (`right = true`,
 * i.e. right-aligned).
 */
function pad(s: string | number, n: number, right = false): string {
  const str = String(s);
  if (str.length >= n) return str.slice(0, n);
  const fill = " ".repeat(n - str.length);
  return right ? fill + str : str + fill;
}

// Modes intentionally lossy or designed to expose architectural axes —
// their numerical wins should NOT be read as recommendations. Tagged
// in the output so a glance at the matrix doesn't mislead.
const CONTROL_MODES = new Set<string>([
  "codereview_null", // baseline — no enrichment, generic framing
  "codereview_playbook_only", // lossy — pathway only, NO file content
]);

/** Mode name for display, with a ⚗ marker appended for control modes. */
function modeLabel(mode: string): string {
  return CONTROL_MODES.has(mode) ? `${mode} ⚗` : mode;
}

/**
 * Coerces `sources.matrix_corpus` to a stable key: "" when absent, the
 * string itself for legacy rows, and the sorted "+"-joined list for arrays
 * (so element order doesn't matter).
 */
function matrixCorpus(r: Row): string {
  const c = r.sources?.matrix_corpus;
  if (!c) return "";
  if (typeof c === "string") return c;
  if (Array.isArray(c)) return [...c].sort().join("+");
  return "";
}

/** Renders one table line: first column left-aligned, the rest right-aligned. */
function renderRow(values: readonly (string | number)[], widths: readonly number[]): string {
  return " " + values.map((v, i) => pad(v, widths[i] ?? 0, i > 0)).join(" ");
}

/** Renders the ─── rule under a table header. */
function renderRule(widths: readonly number[]): string {
  return " " + widths.map((w) => "─".repeat(w)).join(" ");
}

/** Everything the three report sections share. */
interface Report {
  rows: Row[];
  /** file path → mode key → last row seen for that (file, mode key). */
  byFile: Map<string, Map<string, Row>>;
  files: string[];
  modesSorted: string[];
  keyOf: (r: Row) => string;
  /** Cached focus-file content; null when the file cannot be read. */
  contentOf: (path: string) => string | null;
}

/** Section 1: for each file, one line per mode with the raw per-run metrics. */
function printPerFile(rep: Report): void {
  const widths = [56, 6, 5, 9, 6, 4, 5, 7, 6];
  console.log("\n═══ PER-FILE COMPARISON ═══");
  console.log("(⚗ = control/lossy mode — wins should not be read as recommendations)\n");
  for (const file of rep.files) {
    console.log(`📄 ${file}`);
    const fileContent = rep.contentOf(file);
    const fileLines = fileContent ? countLines(fileContent) : 0;
    console.log(` (file: ${fileLines} lines${fileContent === null ? ", NOT READABLE — grounding skipped" : ""})`);
    console.log(renderRow(["mode", "resp", "find", "ground", "hallu", "OOB", "path", "ms", "bug_fp"], widths));
    console.log(renderRule(widths));
    for (const mode of rep.modesSorted) {
      const r = rep.byFile.get(file)?.get(mode);
      if (!r) {
        console.log(renderRow([modeLabel(mode), "—"], widths));
        continue;
      }
      const grounding = checkGrounding(r.response, fileContent);
      // Without file content the grounding counters are not measurements;
      // show a dash rather than a misleading 0.
      const checked = Boolean(fileContent);
      const groundedStr = checked && grounding.total > 0 ? `${grounding.grounded}/${grounding.total}` : "—";
      console.log(
        renderRow(
          [
            modeLabel(mode),
            r.response_chars,
            countFindings(r.response),
            groundedStr,
            checked ? grounding.hallucinated : "—",
            checked ? grounding.out_of_bounds_lines : "—",
            countPathwayCitations(r.response),
            r.latency_ms,
            r.sources?.bug_fingerprints_count ?? 0,
          ],
          widths,
        ),
      );
    }
    console.log("");
  }
}

/**
 * Section 2: per-mode averages over ALL rows of that mode (not only the
 * last one per file). Grounded findings is the primary metric; "grnd %" is
 * the rate at which findings cite real symbols within file bounds. Modes
 * with low groundedness are confabulating. Grounded/hallucinated figures
 * only cover rows whose focus file could be read.
 */
function printAggregate(rep: Report): void {
  const widths = [56, 3, 9, 9, 7, 10, 7];
  console.log("═══ PER-MODE AGGREGATE ═══\n");
  console.log(renderRow(["mode", "n", "avg find", "avg grnd", "grnd %", "avg hallu", "avg ms"], widths));
  console.log(renderRule(widths));
  for (const mode of rep.modesSorted) {
    const modeRows = rep.rows.filter((r) => rep.keyOf(r) === mode);
    const n = modeRows.length;
    if (n === 0) continue;

    let totFind = 0;
    let totGround = 0;
    let totHallu = 0;
    let checkedRows = 0;
    let checkedFind = 0;
    let totMs = 0;
    for (const r of modeRows) {
      const fc = rep.contentOf(r.file_path);
      const g = checkGrounding(r.response, fc);
      totFind += g.total;
      totMs += r.latency_ms;
      if (fc) {
        checkedRows++;
        checkedFind += g.total;
        totGround += g.grounded;
        totHallu += g.hallucinated;
      }
    }
    console.log(
      renderRow(
        [
          modeLabel(mode),
          n,
          (totFind / n).toFixed(1),
          checkedRows > 0 ? (totGround / checkedRows).toFixed(1) : "—",
          checkedFind > 0 ? `${Math.round((100 * totGround) / checkedFind)}%` : "—",
          checkedRows > 0 ? (totHallu / checkedRows).toFixed(1) : "—",
          Math.round(totMs / n),
        ],
        widths,
      ),
    );
  }
}

/**
 * Section 3: GROUNDED findings of each mode vs the lakehouse baseline, per
 * file. A raw finding-count comparison rewards confabulation (more rows =
 * more wins); comparing grounded findings corrects for modes that produce
 * convincing-but-fake output. Files whose content cannot be read are left
 * out, since grounding cannot be judged there.
 */
function printVsBaseline(rep: Report): void {
  const widths = [56, 5, 7, 5, 16];
  console.log("\n═══ MODE vs codereview_lakehouse (grounded findings, per file) ═══\n");
  console.log(renderRow(["mode", "wins", "losses", "ties", "Δ avg grounded"], widths));
  console.log(renderRule(widths));

  // Pick whichever codereview_lakehouse key covers the most files as the
  // baseline (handles corpus-suffixed keys when several corpora are in play).
  const coverage = (key: string): number =>
    rep.files.filter((f) => rep.byFile.get(f)?.has(key)).length;
  const baselineKey =
    rep.modesSorted
      .filter((k) => k.startsWith("codereview_lakehouse"))
      .sort((a, b) => coverage(b) - coverage(a))[0] ?? "codereview_lakehouse";

  for (const mode of rep.modesSorted) {
    if (mode === baselineKey) continue;
    let wins = 0;
    let losses = 0;
    let ties = 0;
    let totalDelta = 0;
    let n = 0;
    for (const file of rep.files) {
      const perMode = rep.byFile.get(file);
      const baseline = perMode?.get(baselineKey);
      const challenger = perMode?.get(mode);
      if (!baseline || !challenger) continue;
      const fc = rep.contentOf(file);
      if (!fc) continue;
      const bg = checkGrounding(baseline.response, fc).grounded;
      const cg = checkGrounding(challenger.response, fc).grounded;
      if (cg > bg) wins++;
      else if (cg < bg) losses++;
      else ties++;
      totalDelta += cg - bg;
      n++;
    }
    if (n === 0) continue;
    console.log(renderRow([modeLabel(mode), wins, losses, ties, (totalDelta / n).toFixed(1)], widths));
  }
}

/** Entry point: load the log, group it, and print the three report sections. */
function main(): void {
  const { jsonl, since } = parseArgs();
  const rows = loadRows(jsonl, since);
  if (rows.length === 0) {
    console.error("[compare] no rows after filter");
    process.exit(1);
  }

  // Group by file → mode (with corpus appended when matrix-bearing modes
  // were swept across multiple corpora — otherwise lakehouse_arch_v1
  // would clobber scrum_findings_v1 etc).
  const corporaInPlay = new Set(rows.map(matrixCorpus).filter(Boolean));
  const showCorpus = corporaInPlay.size > 1;
  const keyOf = (r: Row): string => {
    const c = matrixCorpus(r);
    return showCorpus && c ? `${r.mode}|${c}` : r.mode;
  };

  const byFile = new Map<string, Map<string, Row>>();
  const allModes = new Set<string>();
  for (const r of rows) {
    const k = keyOf(r);
    let perMode = byFile.get(r.file_path);
    if (!perMode) {
      perMode = new Map<string, Row>();
      byFile.set(r.file_path, perMode);
    }
    perMode.set(k, r); // last-write-wins per (mode,corpus) per file
    allModes.add(k);
  }

  // `has` rather than `??=`: an unreadable file caches as null, and a
  // nullish-assignment cache would re-read it on every lookup.
  const fileCache = new Map<string, string | null>();
  const contentOf = (path: string): string | null => {
    if (!fileCache.has(path)) fileCache.set(path, readFileSafe(path));
    return fileCache.get(path) ?? null;
  };

  const report: Report = {
    rows,
    byFile,
    files: [...byFile.keys()].sort(),
    modesSorted: [...allModes].sort(),
    keyOf,
    contentOf,
  };

  printPerFile(report);
  printAggregate(report);
  printVsBaseline(report);

  console.log("\n[compare] done — ⚗ marks lossy/control modes, exclude from recommendations\n");
}

main();