lakehouse/scripts/mode_compare.ts
root 52bb216c2d
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
mode_compare: grounding check + control flag + emoji-tolerant section detection
Three fixes after the playbook_only confabulation surfaced in the
2026-04-26 experiment (8 'findings' on a 333-line file, all citing
lines 378-945 — fully fabricated from pathway-memory pattern names).

(1) Aggregator regex bug — section detection failed on emoji-prefixed
markdown headers like `## 🔎 Ranked Findings`. The original regex
required word chars right after #{1,3}\s+, so the patches table
header `## 🛠️ Concrete Patch Suggestions` was never recognized as
a stop boundary, double-counting every finding. Fix: allow
non-letter chars (emoji/space) between # and the keyword.

(2) Grounding check — for each finding row in the response, extract
backtick-quoted symbols + cited line numbers; verify symbols exist
in the actual focus file and lines fall within EOF. Computes
grounded/total ratio per mode. Surfaces 'OOB' (out-of-bounds) count
explicitly so confabulation is visible at a glance. Confirms what
hand-grading found: codereview_playbook_only's 8 findings on
service.rs were 1/8 grounded with 7 OOB.

(3) Control mode tagging — codereview_null and codereview_playbook_only
are designed as falsifiers (baseline / lossy ceiling) and their
numerical wins should never be read as recommendations. Output
marks them with ⚗ glyph + warning footer.

Per-mode aggregate is now sorted by groundedness, not raw count.
Per-mode-vs-lakehouse comparison uses grounded findings, not raw —
so confabulation can no longer score a "win".

Updated SCRUM_MASTER_SPEC.md with refactor timeline pointing at
the 2026-04-25/26 commits (observer fix, relevance filter, retire
wire, mode router, enrichment runner, parameterized experiment).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 01:44:21 -05:00

332 lines
14 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bun
/**
* Mode comparison aggregator — reads data/_kb/mode_experiments.jsonl
* (written per-call by /v1/mode/execute) and surfaces the cross-mode
* comparison matrix that lets us see what each enrichment dimension
* is actually doing.
*
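* Per file, per mode, computes:
* - response_chars
* - finding_count (rows in markdown tables — heuristic, regex)
* - grounded / hallucinated / out-of-bounds finding counts (cited symbols
*   and line numbers checked against the focus file on disk)
* - pathway_citations (mentions of "Pathway memory" or "📚")
* - latency_ms
* - matrix_chunks_kept / dropped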
*
* Then surfaces:
* - per file, what each mode produced (rows next to each other)
* - per mode, average response_chars + latency
* - which modes ALWAYS underperform vs codereview_lakehouse
* - which signals (bug fingerprints, matrix) correlate with output size
*
* Usage: bun run scripts/mode_compare.ts [--jsonl path] [--since 2026-04-26]
*/
import { readFileSync, existsSync } from "node:fs";
/**
 * One record of the mode-experiment log (one JSON object per line).
 * Fields without a comment are not read by this script.
 */
interface Row {
  /** Timestamp string; compared with string `<` against `--since`. */
  ts: string;
  /** Mode name — the grouping key for every report table. */
  mode: string;
  model: string;
  task_class: string;
  /** Path of the reviewed file; also re-read from disk for the grounding check. */
  file_path: string;
  enriched_prompt_chars: number;
  /** Printed in the per-file `resp` column. */
  response_chars: number;
  /** Printed per row and averaged per mode. */
  latency_ms: number;
  sources: {
    focus_file_bytes?: number;
    /** Printed as `bug_fp`; treated as 0 when absent. */
    bug_fingerprints_count?: number;
    matrix_chunks_kept?: number;
    matrix_chunks_dropped?: number;
    relevance_filter_used?: boolean;
    /** Never read here, so typed `unknown` instead of `any` to keep it from leaking unchecked. */
    flags?: unknown;
  };
  /** Raw model output (markdown) that finding rows are parsed from. */
  response: string;
}
/**
 * Parse `--key value` pairs from the command line (everything after the
 * first two entries of `Bun.argv`).
 *
 * Recognized keys:
 *  - `--jsonl <path>`  log file to read; defaults to
 *                      `data/_kb/mode_experiments.jsonl` when absent or empty
 *  - `--since <ts>`    lower bound for the row filter; `null` when absent or empty
 *
 * Other `--keys` are collected but unused; tokens that do not start with
 * `--` and are not consumed as a value are ignored.
 */
function parseArgs(): { jsonl: string; since: string | null } {
  const args = Bun.argv.slice(2);
  const out: Record<string, string> = {};
  for (let i = 0; i < args.length; i++) {
    const a = args[i];
    if (!a.startsWith("--")) continue;
    const next = args[i + 1];
    // Only consume the following token as this flag's value when it is not
    // itself a flag — otherwise `--since --jsonl x` would set since="--jsonl"
    // and silently drop the --jsonl option.
    if (next !== undefined && !next.startsWith("--")) {
      out[a.slice(2)] = next;
      i++;
    } else {
      out[a.slice(2)] = "";
    }
  }
  return {
    // `||` (not `??`): a value-less `--jsonl` yields "", which must fall
    // back to the default path just like an absent flag does.
    jsonl: out.jsonl || "data/_kb/mode_experiments.jsonl",
    since: out.since || null,
  };
}
/**
 * Structural minimum a parsed log line must meet to be processed: the
 * string fields used for grouping/parsing and a `sources` object. Other
 * `Row` fields are not verified here.
 */
function isUsableRow(value: unknown): value is Row {
  if (typeof value !== "object" || value === null) return false;
  const rec = value as Record<string, unknown>;
  return (
    typeof rec.mode === "string" &&
    typeof rec.file_path === "string" &&
    typeof rec.response === "string" &&
    typeof rec.sources === "object" &&
    rec.sources !== null
  );
}

/**
 * Read the JSONL experiment log and return its usable rows in file order.
 *
 * @param path  Log file location. If it does not exist the process exits
 *              with status 1 after printing an error.
 * @param since When non-null, rows whose string `ts` sorts before this
 *              value are dropped. Rows without a string `ts` are kept.
 * @returns Rows that parsed as JSON and passed `isUsableRow`. Lines that
 *          fail either check are skipped and their count is reported on
 *          stderr, so a truncated or foreign line cannot crash the
 *          report later on.
 */
function loadRows(path: string, since: string | null): Row[] {
  if (!existsSync(path)) {
    console.error(`[compare] no log file at ${path}`);
    process.exit(1);
  }
  const rows: Row[] = [];
  let skipped = 0;
  for (const line of readFileSync(path, "utf8").split("\n")) {
    if (line.trim() === "") continue;
    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      skipped++;
      continue;
    }
    if (!isUsableRow(parsed)) {
      skipped++;
      continue;
    }
    if (since && typeof parsed.ts === "string" && parsed.ts < since) continue;
    rows.push(parsed);
  }
  if (skipped > 0) {
    console.error(`[compare] skipped ${skipped} malformed or incomplete line(s) in ${path}`);
  }
  return rows;
}
/**
 * Count numbered table rows (`| 1 | …`, `| **2** | …`) that belong to the
 * findings table of a markdown response.
 *
 * When a heading of the form `#`–`###` + optional non-letters (emoji,
 * spaces) + `Findings` / `Ranked Findings` is present, only the text
 * after that heading is scanned, and scanning ends at the first later
 * heading whose first word begins with Patch, Suggestion, Reference,
 * Summary or Concrete. Headings with any other first word do not end the
 * section. This keeps the patch table that usually follows the findings
 * table from being counted a second time.
 *
 * When no findings heading exists, every numbered row in the whole
 * response is counted.
 */
function countFindings(md: string): number {
  const countNumberedRows = (text: string): number =>
    Array.from(text.matchAll(/^\|\s*\*?\*?\d+\*?\*?\s*\|/gm)).length;

  const heading = /(?:^|\n)#{1,3}[^\na-zA-Z]*(?:Ranked\s+)?Findings?[^\n]*\n/i.exec(md);
  if (heading === null) {
    // No labeled findings section — fall back to the loose, whole-document count.
    return countNumberedRows(md);
  }

  const body = md.slice(heading.index + heading[0].length);
  // Non-letter characters are allowed between the #'s and the keyword so
  // that headings like `## 🛠️ Concrete Patch Suggestions` act as a boundary.
  const boundary = body.search(/\n#{1,3}[^\na-zA-Z]*(?:Patch|Suggestion|Reference|Summary|Concrete)/i);
  return countNumberedRows(boundary < 0 ? body : body.slice(0, boundary));
}
/**
 * Number of references to the pathway-memory preamble in a response:
 * case-insensitive occurrences of "pathway memory" (any amount of
 * whitespace, including none, between the words) plus occurrences of 📚.
 */
function countPathwayCitations(md: string): number {
  const mentions = md.match(/pathway\s*memory|📚/gi);
  return mentions === null ? 0 : mentions.length;
}
// ─── Grounding check ───
// A finding is "grounded" if at least one symbol it cites actually
// exists in the focus file AND none of its cited line numbers fall past
// the file's real line count. Anti-pollution measure surfaced 2026-04-26
// after codereview_playbook_only produced 8 findings citing lines 378-945
// in a 332-line file (all hallucinated from pathway-memory preamble
// since the mode doesn't pass file content to the model).

/** Tally produced by `checkGrounding` for one response. */
interface GroundingResult {
  /** Finding rows extracted from the response. */
  total: number;
  /** Findings with at least one symbol present in the file and no cited line past EOF. */
  grounded: number;
  /** Findings with at least one symbol present in the file but a cited line past EOF. */
  partial: number;
  /** Findings none of whose symbols occur in the file. */
  hallucinated: number;
  out_of_bounds_lines: number; // findings citing lines past EOF (independent of the verdict)
  /** Per finding: the first 80 characters of the row and its verdict text. */
  details: { row: string; verdict: string }[];
}

// Same emoji-tolerant section detection that countFindings uses.
const GROUNDING_FINDINGS_HEADER_RE = /(?:^|\n)#{1,3}[^\na-zA-Z]*(?:Ranked\s+)?Findings?[^\n]*\n/i;
const GROUNDING_SECTION_STOP_RE = /\n#{1,3}[^\na-zA-Z]*(?:Patch|Suggestion|Reference|Summary|Concrete)/i;
const GROUNDING_NUMBERED_ROW_RE = /^\|\s*\*?\*?\d+\*?\*?\s*\|/;

// Common prose words that the bare-word symbol heuristic must not treat
// as code symbols.
const GROUNDING_STOP_WORDS: ReadonlySet<string> = new Set([
  "score", "level", "match", "table", "value", "where", "found", "would",
  "after", "before", "calls", "needs", "patch", "should", "missing",
  "violation", "evidence", "checks", "either", "audit", "later", "field",
  "rules", "stage", "early", "always", "could", "leaks", "memory",
]);

/** Number of lines in `content`; a trailing newline does not start an extra line. */
function countFileLines(content: string): number {
  if (content === "") return 0;
  const segments = content.split("\n").length;
  return content.endsWith("\n") ? segments - 1 : segments;
}

/**
 * Pull each numbered finding row out of the findings section of a
 * response (or out of the whole response when no findings heading is
 * present) and extract what it cites.
 *
 * @returns One entry per row, in document order:
 *  - `full`    the raw table row
 *  - `symbols` backtick-quoted identifiers first, then bare lowercase
 *              words of 5+ characters that are not in the stop-word list
 *              (de-duplicated, first occurrence wins)
 *  - `lines`   numbers of 2–5 digits that follow `:` or `-`, plus the
 *              second number of a `NNN-NNN` range
 */
function extractFindings(md: string): { full: string; symbols: string[]; lines: number[] }[] {
  let section = md;
  const header = GROUNDING_FINDINGS_HEADER_RE.exec(md);
  if (header !== null) {
    const after = md.slice(header.index + header[0].length);
    const stop = after.search(GROUNDING_SECTION_STOP_RE);
    section = stop >= 0 ? after.slice(0, stop) : after;
  }

  const rows = section.split("\n").filter(l => GROUNDING_NUMBERED_ROW_RE.test(l));
  return rows.map(row => {
    const sym = new Set<string>();
    for (const t of row.matchAll(/`([A-Za-z_][A-Za-z0-9_:]*)`/g)) sym.add(t[1]);
    for (const t of row.matchAll(/\b([a-z][a-z0-9_]{4,})\b/g)) {
      if (!GROUNDING_STOP_WORDS.has(t[1])) sym.add(t[1]);
    }

    // Line numbers from `path:NNN` or `:NNN-NNN` patterns.
    const lineNums: number[] = [];
    for (const t of row.matchAll(/[:\-](\d{2,5})(?:[\-](\d{2,5}))?/g)) {
      lineNums.push(parseInt(t[1], 10));
      if (t[2]) lineNums.push(parseInt(t[2], 10));
    }
    return { full: row, symbols: [...sym], lines: lineNums };
  });
}

/**
 * Classify every finding in `md` against the focus file.
 *
 * @param md          Model response (markdown).
 * @param fileContent Text of the focus file, or `null` when it could not
 *                    be read. With `null`, only `total` is filled in and
 *                    every other counter stays 0. An empty string is a
 *                    real (empty) file, so every finding is checked
 *                    against it.
 * @returns Counts per verdict plus a per-finding detail list.
 */
function checkGrounding(md: string, fileContent: string | null): GroundingResult {
  const findings = extractFindings(md);
  const result: GroundingResult = {
    total: findings.length, grounded: 0, partial: 0, hallucinated: 0, out_of_bounds_lines: 0, details: [],
  };
  if (fileContent === null) return result;

  const fileLines = countFileLines(fileContent);
  for (const f of findings) {
    const symHits = f.symbols.filter(s => fileContent.includes(s));
    const lineOob = f.lines.some(l => l > fileLines);
    if (lineOob) result.out_of_bounds_lines++;

    let verdict: string;
    if (symHits.length === 0) {
      result.hallucinated++;
      verdict = `hallucinated (0/${f.symbols.length} syms hit${lineOob ? `, lines>${fileLines}` : ''})`;
    } else if (lineOob) {
      result.partial++;
      verdict = `partial (real syms but lines >${fileLines} EOF)`;
    } else {
      result.grounded++;
      verdict = `grounded (${symHits.length}/${f.symbols.length} syms hit)`;
    }
    result.details.push({ row: f.full.slice(0, 80), verdict });
  }
  return result;
}
/**
 * Read a file as UTF-8 text without throwing.
 *
 * @returns The file's text, or `null` when reading fails for any reason.
 */
function readFileSafe(path: string): string | null {
  let text: string | null = null;
  try {
    text = readFileSync(path, "utf8");
  } catch {
    // Deliberate best-effort: callers treat null as "file not available".
  }
  return text;
}
/**
 * Fit a value into a fixed-width column.
 *
 * @param s     Value to render (stringified with `String`).
 * @param n     Column width in UTF-16 code units.
 * @param right When true the text is right-aligned (padding on the
 *              left); otherwise left-aligned.
 * @returns The text cut to `n` units when it is too long, else padded
 *          with spaces up to `n`.
 */
function pad(s: string | number, n: number, right = false): string {
  const text = String(s);
  if (text.length >= n) {
    return text.slice(0, n);
  }
  if (right) {
    return text.padStart(n, " ");
  }
  return text.padEnd(n, " ");
}
// Modes intentionally lossy or designed to expose architectural axes —
// their numerical wins should NOT be read as recommendations. Tagged
// in the output so a glance at the matrix doesn't mislead.
const CONTROL_MODES = new Set([
  "codereview_null", // baseline — no enrichment, generic framing
  "codereview_playbook_only", // lossy — pathway only, NO file content
]);

/**
 * Display name for a mode. Control modes are prefixed with the ⚗ glyph
 * that the report legend and footer refer to; every other mode is
 * returned unchanged. The glyph goes first so that column truncation
 * can never cut it off.
 */
function modeLabel(mode: string): string {
  return CONTROL_MODES.has(mode) ? `⚗ ${mode}` : mode;
}
/**
 * Entry point. Loads the experiment log and prints three reports to
 * stdout:
 *  1. per file — one row per mode (latest run of that mode on that file)
 *  2. per mode — averages over every logged run of the mode
 *  3. per mode vs `codereview_lakehouse` — wins/losses/ties on grounded
 *     findings, per file
 *
 * Focus files that cannot be read are shown as such and are left out of
 * every grounding figure (grounded, hallucinated, OOB, grnd %, wins /
 * losses / ties) instead of being scored as zero.
 *
 * Exits with status 1 when no rows remain after filtering.
 */
function main(): void {
  const { jsonl, since } = parseArgs();
  const rows = loadRows(jsonl, since);
  if (rows.length === 0) {
    console.error("[compare] no rows after filter");
    process.exit(1);
  }

  // Each focus file is read at most once. A Map distinguishes "not read
  // yet" (undefined) from "read and unreadable" (null), so failures are
  // memoized too.
  const fileCache = new Map<string, string | null>();
  const readCached = (path: string): string | null => {
    const cached = fileCache.get(path);
    if (cached !== undefined) return cached;
    const content = readFileSafe(path);
    fileCache.set(path, content);
    return content;
  };

  // A trailing newline terminates the last line; it does not start another.
  const lineCount = (content: string): number =>
    content === "" ? 0 : content.split("\n").length - (content.endsWith("\n") ? 1 : 0);

  // Group by file → mode
  const byFile = new Map<string, Map<string, Row>>();
  const allModes = new Set<string>();
  for (const r of rows) {
    let perMode = byFile.get(r.file_path);
    if (perMode === undefined) {
      perMode = new Map<string, Row>();
      byFile.set(r.file_path, perMode);
    }
    perMode.set(r.mode, r); // last-write-wins per mode per file
    allModes.add(r.mode);
  }
  const modesSorted = [...allModes].sort();
  const filesSorted = [...byFile.keys()].sort();

  // Per-file matrix
  console.log("\n═══ PER-FILE COMPARISON ═══");
  console.log("(⚗ = control/lossy mode — wins should not be read as recommendations)\n");
  for (const file of filesSorted) {
    const perMode = byFile.get(file);
    if (perMode === undefined) continue;
    console.log(`📄 ${file}`);
    const fileContent = readCached(file);
    const fileLines = fileContent === null ? 0 : lineCount(fileContent);
    console.log(` (file: ${fileLines} lines${fileContent === null ? ", NOT READABLE — grounding skipped" : ""})`);
    console.log(
      ` ${pad("mode", 30)} ${pad("resp", 6, true)} ${pad("find", 5, true)} ${pad("ground", 9, true)} ${pad("hallu", 6, true)} ${pad("OOB", 4, true)} ${pad("path", 5, true)} ${pad("ms", 7, true)} ${pad("bug_fp", 6, true)}`
    );
    console.log(` ${"─".repeat(30)} ${"─".repeat(6)} ${"─".repeat(5)} ${"─".repeat(9)} ${"─".repeat(6)} ${"─".repeat(4)} ${"─".repeat(5)} ${"─".repeat(7)} ${"─".repeat(6)}`);
    for (const mode of modesSorted) {
      const r = perMode.get(mode);
      if (!r) {
        console.log(` ${pad(modeLabel(mode), 30)} ${pad("—", 6, true)}`);
        continue;
      }
      const findings = countFindings(r.response);
      const cits = countPathwayCitations(r.response);
      const bf = r.sources.bug_fingerprints_count ?? 0;
      const grounding = checkGrounding(r.response, fileContent);
      // With no file to check against, the grounding columns are unknown, not zero.
      const checked = fileContent !== null;
      const groundedStr = !checked || grounding.total === 0 ? "—" : `${grounding.grounded}/${grounding.total}`;
      const halluStr = checked ? grounding.hallucinated : "—";
      const oobStr = checked ? grounding.out_of_bounds_lines : "—";
      console.log(
        ` ${pad(modeLabel(mode), 30)} ${pad(r.response_chars, 6, true)} ${pad(findings, 5, true)} ${pad(groundedStr, 9, true)} ${pad(halluStr, 6, true)} ${pad(oobStr, 4, true)} ${pad(cits, 5, true)} ${pad(r.latency_ms, 7, true)} ${pad(bf, 6, true)}`
      );
    }
    console.log("");
  }

  // Per-mode averages — grounded findings is the primary metric.
  // grnd % is the rate at which findings cite real symbols within file
  // bounds. Modes with low groundedness are confabulating.
  console.log("═══ PER-MODE AGGREGATE ═══\n");
  console.log(` ${pad("mode", 30)} ${pad("n", 3, true)} ${pad("avg find", 9, true)} ${pad("avg grnd", 9, true)} ${pad("grnd %", 7, true)} ${pad("avg hallu", 10, true)} ${pad("avg ms", 7, true)}`);
  console.log(` ${"─".repeat(30)} ${"─".repeat(3)} ${"─".repeat(9)} ${"─".repeat(9)} ${"─".repeat(7)} ${"─".repeat(10)} ${"─".repeat(7)}`);
  for (const mode of modesSorted) {
    const modeRows = rows.filter(r => r.mode === mode);
    if (modeRows.length === 0) continue;
    const n = modeRows.length;
    let totFind = 0, totMs = 0;
    // Grounding totals only cover runs whose focus file could be read.
    let checkedRuns = 0, checkedFind = 0, totGround = 0, totHallu = 0;
    for (const r of modeRows) {
      const fc = readCached(r.file_path);
      const g = checkGrounding(r.response, fc);
      totFind += g.total;
      totMs += r.latency_ms;
      if (fc === null) continue;
      checkedRuns++;
      checkedFind += g.total;
      totGround += g.grounded;
      totHallu += g.hallucinated;
    }
    const avgFind = (totFind / n).toFixed(1);
    const avgGround = checkedRuns > 0 ? (totGround / checkedRuns).toFixed(1) : "—";
    const grndPct = checkedFind > 0 ? `${Math.round(100 * totGround / checkedFind)}%` : "—";
    const avgHallu = checkedRuns > 0 ? (totHallu / checkedRuns).toFixed(1) : "—";
    const avgMs = Math.round(totMs / n);
    console.log(
      ` ${pad(modeLabel(mode), 30)} ${pad(n, 3, true)} ${pad(avgFind, 9, true)} ${pad(avgGround, 9, true)} ${pad(grndPct, 7, true)} ${pad(avgHallu, 10, true)} ${pad(avgMs, 7, true)}`
    );
  }

  // Mode-relative: GROUNDED findings vs lakehouse. The earlier raw
  // finding-count comparison rewarded confabulation (more rows = more
  // wins). Comparing grounded findings instead corrects for modes
  // that produce convincing-but-fake output.
  console.log("\n═══ MODE vs codereview_lakehouse (grounded findings, per file) ═══\n");
  console.log(` ${pad("mode", 30)} ${pad("wins", 5, true)} ${pad("losses", 7, true)} ${pad("ties", 5, true)} ${pad("Δ avg grounded", 16, true)}`);
  console.log(` ${"─".repeat(30)} ${"─".repeat(5)} ${"─".repeat(7)} ${"─".repeat(5)} ${"─".repeat(16)}`);
  for (const mode of modesSorted) {
    if (mode === "codereview_lakehouse") continue;
    let wins = 0, losses = 0, ties = 0, totalDelta = 0, n = 0;
    for (const [file, perMode] of byFile) {
      const baseline = perMode.get("codereview_lakehouse");
      const challenger = perMode.get(mode);
      if (!baseline || !challenger) continue;
      const fc = readCached(file);
      // Without the file both sides would score 0 grounded — a fake tie.
      if (fc === null) continue;
      const bg = checkGrounding(baseline.response, fc).grounded;
      const cg = checkGrounding(challenger.response, fc).grounded;
      if (cg > bg) wins++;
      else if (cg < bg) losses++;
      else ties++;
      totalDelta += cg - bg;
      n++;
    }
    if (n === 0) continue;
    const avgDelta = (totalDelta / n).toFixed(1);
    console.log(
      ` ${pad(modeLabel(mode), 30)} ${pad(wins, 5, true)} ${pad(losses, 7, true)} ${pad(ties, 5, true)} ${pad(avgDelta, 16, true)}`
    );
  }
  console.log("\n[compare] done — ⚗ marks lossy/control modes, exclude from recommendations\n");
}

main();