lakehouse/scripts/mode_compare.ts
root 2dbc8dbc83
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
v1/mode: model-aware enrichment downgrade + 3 corpora + variance harness
Pass 5 (5 reps × 4 conditions × 1 file on grok-4.1-fast) showed composing
matrix corpora is anti-additive on strong models — composed lakehouse_arch
+ symbols LOST 5/5 head-to-head vs codereview_isolation (Δ −1.8 grounded
findings, p=0.031). Default flips to isolation; matrix path now auto-
downgrades when the resolved model is strong.

Mode runner:
- matrix_corpus is Vec<String> (string OR array via deserialize_string_or_vec)
- top_k=6 from each corpus, merge by score, take top 8 globally
- chunk tag prefers doc_id over source so reviewer sees [adr:009] vs [lakehouse_arch]
- is_weak_model() gate auto-downgrades codereview_lakehouse → codereview_isolation
  for strong models (default-strong; weak = :free suffix or local last-resort)
- LH_FORCE_FULL_ENRICHMENT=1 bypasses for diagnostic runs
- EnrichmentSources.downgraded_from records when the gate fires

Three corpora indexed via /vectors/index (5849 chunks total):
- lakehouse_arch_v1 — ADRs + phases + PRD + scrum spec (93 docs, 2119 chunks)
- scrum_findings_v1 — past scrum_reviews.jsonl (168 docs, 1260 chunks; EXCLUDED
  from defaults — 24% out-of-bounds line citations from cross-file drift)
- lakehouse_symbols_v1 — regex-extracted pub items + /// docs (656 docs, 2470 chunks)

Experiment infra:
- scripts/build_*_corpus.ts — re-runnable when source content changes
- scripts/mode_pass5_variance_paid.ts — N reps × M conditions on one file
- scripts/mode_pass5_summarize.ts — mean ± σ + head-to-head, parser handles
  numbered + path-with-line + path-with-symbol finding tables
- scripts/mode_compare.ts — groups by mode|corpus when sweeps span corpora
- scripts/mode_experiment.ts — default model bumped to x-ai/grok-4.1-fast,
  --corpus flag for per-call override

Decisions + open follow-ups: docs/MODE_RUNNER_TUNING_PLAN.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 17:29:17 -05:00

363 lines
15 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bun
/**
* Mode comparison aggregator — reads data/_kb/mode_experiments.jsonl
* (written per-call by /v1/mode/execute) and surfaces the cross-mode
* comparison matrix that lets us see what each enrichment dimension
* is actually doing.
*
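* Per file, per mode, reports:
* - response_chars
* - finding_count (numbered rows in markdown tables — heuristic, regex)
* - grounding (findings whose cited symbols appear in the focus file,
*   hallucinated findings, findings citing lines past EOF)
* - pathway_citations (mentions of "Pathway memory" or "📚")
* - latency_ms
* - bug_fingerprints_count
* - matrix_chunks_kept / dropped (carried on each row; not printed yet)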
*
* Then surfaces:
* - per file, what each mode produced (rows next to each other)
* - per mode, average finding count, grounded count, grounded %,
*   hallucinated count, and latency
* - per mode, wins / losses / ties against codereview_lakehouse on
*   grounded findings per file
* - NOTE: correlating signals (bug fingerprints, matrix) with output
*   size is not computed by this script yet
*
* Usage: bun run scripts/mode_compare.ts [--jsonl path] [--since 2026-04-26]
*/
import { readFileSync, existsSync } from "node:fs";
/**
 * One record of the mode-experiment log (one JSON object per line).
 *
 * Only the fields this script reads are load-bearing here:
 * `ts`, `mode`, `file_path`, `response`, `response_chars`, `latency_ms`
 * and a few members of `sources`. The remaining fields are carried
 * along but not read by this script.
 */
interface Row {
  /** Timestamp string; compared lexicographically against `--since`. */
  ts: string;
  /** Mode name, used (optionally with a corpus suffix) as the grouping key. */
  mode: string;
  model: string;
  task_class: string;
  /** Path of the reviewed file; read from disk for the grounding check. */
  file_path: string;
  enriched_prompt_chars: number;
  response_chars: number;
  latency_ms: number;
  sources: {
    focus_file_bytes?: number;
    bug_fingerprints_count?: number;
    matrix_chunks_kept?: number;
    matrix_chunks_dropped?: number;
    relevance_filter_used?: boolean;
    /**
     * Corpus (or corpora) the matrix chunks came from. The grouping
     * code accepts a single string, an array of strings, or a
     * missing/null value, so all three shapes are typed here.
     */
    matrix_corpus?: string | string[] | null;
    /** Opaque to this script — narrow before use. */
    flags?: unknown;
  };
  /** Raw markdown response; parsed for finding rows and citations. */
  response: string;
}
/**
 * Parse the command-line flags this script understands.
 *
 * Accepted forms: `--jsonl path`, `--jsonl=path`, `--since 2026-04-26`,
 * `--since=2026-04-26`. Unknown `--flags` are collected but ignored;
 * positional arguments are skipped.
 *
 * @returns `jsonl` — log path, defaulting to
 *          `data/_kb/mode_experiments.jsonl` when absent or empty;
 *          `since` — lower bound for row timestamps, or `null` when
 *          absent or empty.
 */
function parseArgs(): { jsonl: string; since: string | null } {
  // process.argv is [runtime, script, ...args]; Bun exposes the same
  // vector, so this keeps working under the `bun` shebang.
  const args = process.argv.slice(2);
  const out: Record<string, string> = {};
  for (let i = 0; i < args.length; i++) {
    const a = args[i];
    if (a === undefined || !a.startsWith("--")) continue;
    const eq = a.indexOf("=");
    if (eq > 2) {
      // --key=value form
      out[a.slice(2, eq)] = a.slice(eq + 1);
      continue;
    }
    const next = args[i + 1];
    if (next !== undefined && !next.startsWith("--")) {
      out[a.slice(2)] = next;
      i++;
    } else {
      // Flag without a value: do not swallow the following flag.
      out[a.slice(2)] = "";
    }
  }
  return {
    // `||` on purpose: an empty value falls back to the default path
    // instead of producing an unusable "" path.
    jsonl: out.jsonl || "data/_kb/mode_experiments.jsonl",
    since: out.since || null,
  };
}
/**
 * Read the JSONL experiment log and return its usable rows.
 *
 * Exits the process with status 1 when `path` does not exist.
 * Blank lines are ignored. A line is skipped (and counted in a single
 * stderr warning) when it is not valid JSON, is not a JSON object, or
 * lacks a string `mode`, `file_path` or `response` — the fields the
 * report cannot work without. A missing `sources` object is replaced
 * by `{}` so later optional reads do not throw.
 *
 * @param path  Location of the JSONL log.
 * @param since When non-null, rows whose string `ts` sorts before this
 *              value are dropped (plain string comparison).
 * @returns Rows in file order.
 */
function loadRows(path: string, since: string | null): Row[] {
  if (!existsSync(path)) {
    console.error(`[compare] no log file at ${path}`);
    process.exit(1);
  }
  const rows: Row[] = [];
  let skipped = 0;
  for (const rawLine of readFileSync(path, "utf8").split("\n")) {
    const line = rawLine.trim();
    if (line === "") continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      skipped++;
      continue;
    }
    // `null`, numbers, strings and arrays are valid JSON but not rows.
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      skipped++;
      continue;
    }
    const rec = parsed as Record<string, unknown>;
    if (
      typeof rec.mode !== "string" ||
      typeof rec.file_path !== "string" ||
      typeof rec.response !== "string"
    ) {
      skipped++;
      continue;
    }
    if (typeof rec.sources !== "object" || rec.sources === null) rec.sources = {};
    // Rows without a string timestamp are kept, as before.
    if (since && typeof rec.ts === "string" && rec.ts < since) continue;
    rows.push(parsed as Row);
  }
  if (skipped > 0) {
    console.error(`[compare] skipped ${skipped} malformed line(s) in ${path}`);
  }
  return rows;
}
/**
 * Count numbered table rows that represent findings in a markdown
 * response.
 *
 * A numbered row is a line starting with `|`, then an integer
 * (optionally wrapped in `*`/`**`), then `|`.
 *
 * When a heading (1-3 `#`, optional non-letter prefix such as emoji)
 * reading "Findings" / "Ranked Findings" exists, only the text after
 * it is scanned, up to the first later heading whose first word is
 * Patch, Suggestion, Reference, Summary or Concrete. This keeps rows
 * of a follow-up patch table out of the count. Other headings do not
 * end the scan. Without such a heading the whole document is scanned.
 *
 * @param md Markdown response text.
 * @returns Number of matching rows (0 when there are none).
 */
function countFindings(md: string): number {
  const numberedRow = /^\|\s*\*?\*?\d+\*?\*?\s*\|/gm;
  const header = /(?:^|\n)#{1,3}[^\na-zA-Z]*(?:Ranked\s+)?Findings?[^\n]*\n/i.exec(md);

  // Default scope is the whole document; narrow it when a findings
  // heading is present.
  let scope = md;
  if (header !== null) {
    const tail = md.slice(header.index + header[0].length);
    const cut = tail.search(/\n#{1,3}[^\na-zA-Z]*(?:Patch|Suggestion|Reference|Summary|Concrete)/i);
    scope = cut < 0 ? tail : tail.slice(0, cut);
  }
  return scope.match(numberedRow)?.length ?? 0;
}
/**
 * Count references to the pathway-memory preamble in a response:
 * case-insensitive "pathway memory" (any amount of whitespace,
 * including none, between the words) or the 📚 emoji.
 *
 * @param md Markdown response text.
 * @returns Number of occurrences.
 */
function countPathwayCitations(md: string): number {
  const mentions = Array.from(md.matchAll(/pathway\s*memory|📚/gi));
  return mentions.length;
}
// ─── Grounding check ───
// A finding is "grounded" if the symbols it cites actually exist in
// the focus file AND any cited line numbers fall within the file's
// real line count. Anti-pollution measure surfaced 2026-04-26 after
// codereview_playbook_only produced 8 findings citing lines 378-945
// in a 332-line file (all hallucinated from pathway-memory preamble
// since the mode doesn't pass file content to the model).
/**
 * Tally returned by checkGrounding for one response.
 * All counters stay 0 and `details` stays empty when no file content
 * was supplied; only `total` is filled in that case.
 */
interface GroundingResult {
// Number of finding rows extracted from the response.
total: number;
// Findings with at least one cited symbol found in the file and no
// cited line number beyond the file's line count.
grounded: number;
// Findings with a symbol hit but at least one out-of-bounds line.
partial: number;
// Findings with no cited symbol found in the file.
hallucinated: number;
out_of_bounds_lines: number; // findings citing lines past EOF
// One entry per finding: the row text cut to 80 chars plus its verdict.
details: { row: string; verdict: string }[];
}
/**
 * Extract every numbered finding row from a markdown response together
 * with the symbols and line numbers it cites.
 *
 * Row selection mirrors countFindings: when a "Findings" /
 * "Ranked Findings" heading exists, only the text between it and the
 * first later Patch / Suggestion / Reference / Summary / Concrete
 * heading is considered; otherwise the whole document is.
 *
 * For each row:
 * - `symbols` — backtick-quoted identifiers, plus every bare lowercase
 *   word of 5+ characters that is not in the common-word list, in
 *   first-seen order without duplicates;
 * - `lines` — 2-5 digit numbers that directly follow `:` or `-`
 *   (for example `path:120-130` yields 120 and 130).
 *
 * @param md Markdown response text.
 * @returns One entry per finding row, in document order.
 */
function extractFindings(md: string): { full: string; symbols: string[]; lines: number[] }[] {
  // Ordinary words that would otherwise be mistaken for symbols.
  const commonWords = new Set<string>(["score", "level", "match", "table", "value", "where", "found", "would", "after", "before", "calls", "needs", "patch", "should", "missing", "violation", "evidence", "checks", "either", "audit", "later", "field", "rules", "stage", "early", "always", "later", "could", "leaks", "memory"]);

  // Narrow to the findings section when its heading is present.
  let scope = md;
  const header = /(?:^|\n)#{1,3}[^\na-zA-Z]*(?:Ranked\s+)?Findings?[^\n]*\n/i.exec(md);
  if (header !== null) {
    const tail = md.slice(header.index + header[0].length);
    const cut = tail.search(/\n#{1,3}[^\na-zA-Z]*(?:Patch|Suggestion|Reference|Summary|Concrete)/i);
    scope = cut < 0 ? tail : tail.slice(0, cut);
  }

  const extracted: { full: string; symbols: string[]; lines: number[] }[] = [];
  for (const row of scope.split("\n")) {
    if (!/^\|\s*\*?\*?\d+\*?\*?\s*\|/.test(row)) continue;

    // First-seen order, no duplicates.
    const symbols: string[] = [];
    const remember = (name: string): void => {
      if (!symbols.includes(name)) symbols.push(name);
    };
    for (const hit of row.matchAll(/`([A-Za-z_][A-Za-z0-9_:]*)`/g)) {
      remember(hit[1]);
    }
    for (const hit of row.matchAll(/\b([a-z][a-z0-9_]{4,})\b/g)) {
      if (!commonWords.has(hit[1])) remember(hit[1]);
    }

    // `path:NNN` and `:NNN-NNN` style citations.
    const lines: number[] = [];
    for (const hit of row.matchAll(/[:\-](\d{2,5})(?:[\-](\d{2,5}))?/g)) {
      lines.push(parseInt(hit[1], 10));
      if (hit[2]) lines.push(parseInt(hit[2], 10));
    }

    extracted.push({ full: row, symbols, lines });
  }
  return extracted;
}
/**
 * Classify each finding of a response as grounded, partial or
 * hallucinated against the reviewed file's content.
 *
 * - grounded: at least one cited symbol occurs in the file (substring
 *   match) and no cited line number exceeds the file's line count;
 * - partial: a symbol occurs but a cited line is past EOF;
 * - hallucinated: no cited symbol occurs in the file.
 *
 * `out_of_bounds_lines` counts findings with any cited line past EOF,
 * whatever their verdict.
 *
 * @param md          Markdown response text.
 * @param fileContent Content of the reviewed file. When null or empty
 *                    no classification is done: only `total` is set.
 * @returns The tally plus one `{ row, verdict }` entry per finding
 *          (row text cut to 80 characters).
 */
function checkGrounding(md: string, fileContent: string | null): GroundingResult {
  const findings = extractFindings(md);
  const result: GroundingResult = {
    total: findings.length, grounded: 0, partial: 0, hallucinated: 0, out_of_bounds_lines: 0, details: [],
  };
  if (!fileContent) return result;
  const content: string = fileContent;

  // A trailing newline terminates the last line rather than starting a
  // new one; counting split() pieces would report one line too many
  // and let a citation of line N+1 slip through.
  const lineBreaks = content.split("\n").length - 1;
  const fileLines = content.endsWith("\n") ? lineBreaks : lineBreaks + 1;

  for (const f of findings) {
    const symHits = f.symbols.filter(s => content.includes(s));
    const lineOob = f.lines.some(l => l > fileLines);
    if (lineOob) result.out_of_bounds_lines++;

    let verdict: string;
    if (symHits.length === 0) {
      result.hallucinated++;
      verdict = `hallucinated (0/${f.symbols.length} syms hit${lineOob ? `, lines>${fileLines}` : ''})`;
    } else if (lineOob) {
      result.partial++;
      verdict = `partial (real syms but lines >${fileLines} EOF)`;
    } else {
      result.grounded++;
      verdict = `grounded (${symHits.length}/${f.symbols.length} syms hit)`;
    }
    result.details.push({ row: f.full.slice(0, 80), verdict });
  }
  return result;
}
/**
 * Read a UTF-8 text file, treating any read failure as "no content".
 *
 * @param path File to read.
 * @returns The file's text, or null when reading throws for any reason.
 */
function readFileSafe(path: string): string | null {
  let text: string | null = null;
  try {
    text = readFileSync(path, "utf8");
  } catch {
    text = null;
  }
  return text;
}
/**
 * Fit a value into a fixed-width column.
 *
 * @param s     Value to render (stringified).
 * @param n     Column width in UTF-16 code units.
 * @param right When true the text is right-aligned (padded on the
 *              left); otherwise left-aligned.
 * @returns The text cut to `n` characters when too long, else padded
 *          with spaces to exactly `n`.
 */
function pad(s: string | number, n: number, right = false): string {
  const text = `${s}`;
  if (text.length >= n) {
    return text.slice(0, n);
  }
  return right ? text.padStart(n, " ") : text.padEnd(n, " ");
}
// Modes intentionally lossy or designed to expose architectural axes —
// their numerical wins should NOT be read as recommendations. Tagged
// in the output so a glance at the matrix doesn't mislead.
const CONTROL_MODES = new Set([
  "codereview_null", // baseline — no enrichment, generic framing
  "codereview_playbook_only", // lossy — pathway only, NO file content
]);

/**
 * Display label for a mode key.
 *
 * Control modes are prefixed with "⚗ ", the marker the report legends
 * refer to. Keys may carry a `|corpus` suffix when a sweep spans
 * several corpora; the part before the first `|` decides whether the
 * key is a control mode, and the full key is kept in the label.
 *
 * @param mode Mode key, with or without a `|corpus` suffix.
 * @returns The key, prefixed with "⚗ " for control modes.
 */
function modeLabel(mode: string): string {
  const base = mode.split("|")[0] ?? mode;
  return CONTROL_MODES.has(base) ? `⚗ ${mode}` : mode;
}
/**
 * Entry point: load the experiment log and print three reports.
 *
 * 1. Per-file comparison — for each reviewed file, one line per mode
 *    key using the LAST row logged for that (file, key) pair.
 * 2. Per-mode aggregate — averages over ALL rows of each mode key.
 * 3. Head-to-head against the most widely covered
 *    `codereview_lakehouse*` key, comparing grounded-finding counts
 *    per file (again using the last row per file and key).
 *
 * Mode keys are `mode`, or `mode|corpus` when the loaded rows span
 * more than one matrix corpus.
 *
 * Exits with status 1 when no rows remain after filtering.
 */
function main() {
  const { jsonl, since } = parseArgs();
  const rows = loadRows(jsonl, since);
  if (rows.length === 0) {
    console.error("[compare] no rows after filter");
    process.exit(1);
  }

  // Group by file → mode (with corpus appended when matrix-bearing modes
  // were swept across multiple corpora — otherwise one corpus would
  // clobber another). matrix_corpus may be an array, a string, or
  // missing/null; coerce to a stable key.
  const matrixCorpus = (r: Row): string => {
    const c = (r.sources as { matrix_corpus?: unknown } | undefined)?.matrix_corpus;
    if (!c) return "";
    if (typeof c === "string") return c;
    if (Array.isArray(c)) {
      if (c.length === 0) return "";
      if (c.length === 1) return c[0];
      // Stable join: sort then "+"-separate so order doesn't matter.
      return [...c].sort().join("+");
    }
    return "";
  };
  const corporaInPlay = new Set(rows.map(matrixCorpus).filter(c => c));
  const showCorpus = corporaInPlay.size > 1;
  const keyOf = (r: Row): string => {
    const c = matrixCorpus(r);
    return showCorpus && c ? `${r.mode}|${c}` : r.mode;
  };

  const byFile: Record<string, Record<string, Row>> = {};
  const allModes = new Set<string>();
  for (const r of rows) {
    const perMode = (byFile[r.file_path] ??= {});
    const k = keyOf(r);
    perMode[k] = r; // last-write-wins per (mode,corpus) per file
    allModes.add(k);
  }
  const modesSorted = [...allModes].sort();

  // Each reviewed file is read at most once, including files that
  // cannot be read (a cached null is a valid answer).
  const fileCache = new Map<string, string | null>();
  const contentOf = (path: string): string | null => {
    if (!fileCache.has(path)) fileCache.set(path, readFileSafe(path));
    return fileCache.get(path) ?? null;
  };

  // Grounding is computed once per row and reused by all three reports.
  const groundingCache = new Map<Row, GroundingResult>();
  const groundingOf = (r: Row): GroundingResult => {
    let g = groundingCache.get(r);
    if (g === undefined) {
      g = checkGrounding(r.response, contentOf(r.file_path));
      groundingCache.set(r, g);
    }
    return g;
  };

  // Line count for display; a trailing newline does not start a new line.
  const countLines = (text: string): number => {
    if (text === "") return 0;
    const breaks = text.split("\n").length - 1;
    return text.endsWith("\n") ? breaks : breaks + 1;
  };

  // Separator line matching a header built from the same column widths.
  const rule = (...widths: number[]): string =>
    ` ${widths.map(w => "─".repeat(w)).join(" ")}`;

  // Per-file matrix
  console.log("\n═══ PER-FILE COMPARISON ═══");
  console.log("(⚗ = control/lossy mode — wins should not be read as recommendations)\n");
  for (const file of Object.keys(byFile).sort()) {
    const perMode = byFile[file] ?? {};
    console.log(`📄 ${file}`);
    const fileContent = contentOf(file);
    const fileLines = fileContent === null ? 0 : countLines(fileContent);
    console.log(` (file: ${fileLines} lines${fileContent === null ? ", NOT READABLE — grounding skipped" : ""})`);
    console.log(
      ` ${pad("mode", 56)} ${pad("resp", 6, true)} ${pad("find", 5, true)} ${pad("ground", 9, true)} ${pad("hallu", 6, true)} ${pad("OOB", 4, true)} ${pad("path", 5, true)} ${pad("ms", 7, true)} ${pad("bug_fp", 6, true)}`
    );
    console.log(rule(56, 6, 5, 9, 6, 4, 5, 7, 6));
    for (const mode of modesSorted) {
      const r = perMode[mode];
      if (!r) {
        console.log(` ${pad(modeLabel(mode), 56)} ${pad("—", 6, true)}`);
        continue;
      }
      const findings = countFindings(r.response);
      const cits = countPathwayCitations(r.response);
      // Rows without a `sources` object count as zero fingerprints.
      const bf = r.sources?.bug_fingerprints_count ?? 0;
      const grounding = groundingOf(r);
      const groundedStr = grounding.total === 0 ? "—" : `${grounding.grounded}/${grounding.total}`;
      console.log(
        ` ${pad(modeLabel(mode), 56)} ${pad(r.response_chars, 6, true)} ${pad(findings, 5, true)} ${pad(groundedStr, 9, true)} ${pad(grounding.hallucinated, 6, true)} ${pad(grounding.out_of_bounds_lines, 4, true)} ${pad(cits, 5, true)} ${pad(r.latency_ms, 7, true)} ${pad(bf, 6, true)}`
      );
    }
    console.log("");
  }

  // Per-mode averages — grounded findings is the primary metric.
  // "grnd %" is the share of findings that cite real symbols within
  // file bounds. Modes with low groundedness are confabulating.
  console.log("═══ PER-MODE AGGREGATE ═══\n");
  console.log(` ${pad("mode", 56)} ${pad("n", 3, true)} ${pad("avg find", 9, true)} ${pad("avg grnd", 9, true)} ${pad("grnd %", 7, true)} ${pad("avg hallu", 10, true)} ${pad("avg ms", 7, true)}`);
  console.log(rule(56, 3, 9, 9, 7, 10, 7));
  for (const mode of modesSorted) {
    const modeRows = rows.filter(r => keyOf(r) === mode);
    if (modeRows.length === 0) continue;
    const n = modeRows.length;
    let totFind = 0, totGround = 0, totHallu = 0, totMs = 0;
    for (const r of modeRows) {
      const g = groundingOf(r);
      totFind += g.total;
      totGround += g.grounded;
      totHallu += g.hallucinated;
      totMs += r.latency_ms;
    }
    const avgFind = (totFind / n).toFixed(1);
    const avgGround = (totGround / n).toFixed(1);
    const grndPct = totFind > 0 ? `${Math.round(100 * totGround / totFind)}%` : "—";
    const avgHallu = (totHallu / n).toFixed(1);
    const avgMs = Math.round(totMs / n);
    console.log(
      ` ${pad(modeLabel(mode), 56)} ${pad(n, 3, true)} ${pad(avgFind, 9, true)} ${pad(avgGround, 9, true)} ${pad(grndPct, 7, true)} ${pad(avgHallu, 10, true)} ${pad(avgMs, 7, true)}`
    );
  }

  // Mode-relative: GROUNDED findings vs lakehouse. A raw finding-count
  // comparison rewards confabulation (more rows = more wins); comparing
  // grounded findings corrects for convincing-but-fake output.
  console.log("\n═══ MODE vs codereview_lakehouse (grounded findings, per file) ═══\n");
  console.log(` ${pad("mode", 56)} ${pad("wins", 5, true)} ${pad("losses", 7, true)} ${pad("ties", 5, true)} ${pad("Δ avg grounded", 16, true)}`);
  console.log(rule(56, 5, 7, 5, 16));

  // Baseline = the codereview_lakehouse key covering the most files
  // (handles corpus-suffixed keys when showCorpus=true). The sort is
  // stable, so ties keep alphabetical order.
  const perFileRows = Object.values(byFile);
  const coverage = (key: string): number => perFileRows.filter(f => f[key]).length;
  const baselineKey = modesSorted
    .filter(k => k.startsWith("codereview_lakehouse"))
    .sort((a, b) => coverage(b) - coverage(a))[0] ?? "codereview_lakehouse";

  for (const mode of modesSorted) {
    if (mode === baselineKey) continue;
    let wins = 0, losses = 0, ties = 0, totalDelta = 0, n = 0;
    for (const perMode of perFileRows) {
      const baseline = perMode[baselineKey];
      const challenger = perMode[mode];
      if (!baseline || !challenger) continue;
      const bg = groundingOf(baseline).grounded;
      const cg = groundingOf(challenger).grounded;
      if (cg > bg) wins++;
      else if (cg < bg) losses++;
      else ties++;
      totalDelta += cg - bg;
      n++;
    }
    if (n === 0) continue;
    const avgDelta = (totalDelta / n).toFixed(1);
    console.log(
      ` ${pad(modeLabel(mode), 56)} ${pad(wins, 5, true)} ${pad(losses, 7, true)} ${pad(ties, 5, true)} ${pad(avgDelta, 16, true)}`
    );
  }
  console.log("\n[compare] done — ⚗ marks lossy/control modes, exclude from recommendations\n");
}
main();