lakehouse/scripts/mode_pass5_summarize.ts
root 2dbc8dbc83
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
v1/mode: model-aware enrichment downgrade + 3 corpora + variance harness
Pass 5 (5 reps × 4 conditions × 1 file on grok-4.1-fast) showed composing
matrix corpora is anti-additive on strong models — composed lakehouse_arch
+ symbols LOST 5/5 head-to-head vs codereview_isolation (Δ −1.8 grounded
findings, p=0.031). Default flips to isolation; matrix path now auto-
downgrades when the resolved model is strong.

Mode runner:
- matrix_corpus is Vec<String> (string OR array via deserialize_string_or_vec)
- top_k=6 from each corpus, merge by score, take top 8 globally
- chunk tag prefers doc_id over source so reviewer sees [adr:009] vs [lakehouse_arch]
- is_weak_model() gate auto-downgrades codereview_lakehouse → codereview_isolation
  for strong models (default-strong; weak = :free suffix or local last-resort)
- LH_FORCE_FULL_ENRICHMENT=1 bypasses for diagnostic runs
- EnrichmentSources.downgraded_from records when the gate fires

Three corpora indexed via /vectors/index (5849 chunks total):
- lakehouse_arch_v1 — ADRs + phases + PRD + scrum spec (93 docs, 2119 chunks)
- scrum_findings_v1 — past scrum_reviews.jsonl (168 docs, 1260 chunks; EXCLUDED
  from defaults — 24% out-of-bounds line citations from cross-file drift)
- lakehouse_symbols_v1 — regex-extracted pub items + /// docs (656 docs, 2470 chunks)

Experiment infra:
- scripts/build_*_corpus.ts — re-runnable when source content changes
- scripts/mode_pass5_variance_paid.ts — N reps × M conditions on one file
- scripts/mode_pass5_summarize.ts — mean ± σ + head-to-head, parser handles
  numbered + path-with-line + path-with-symbol finding tables
- scripts/mode_compare.ts — groups by mode|corpus when sweeps span corpora
- scripts/mode_experiment.ts — default model bumped to x-ai/grok-4.1-fast,
  --corpus flag for per-call override

Decisions + open follow-ups: docs/MODE_RUNNER_TUNING_PLAN.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 17:29:17 -05:00

170 lines
7.1 KiB
TypeScript

#!/usr/bin/env bun
/**
* Pass 5 variance summarizer. Reads data/_kb/mode_experiments.jsonl
* since a timestamp, groups by (mode|corpus), reports mean ± stddev
* of grounded finding count, plus a head-to-head wins/losses table
* vs the isolation baseline.
*
* Usage:
* bun run scripts/mode_pass5_summarize.ts # default 2h
* bun run scripts/mode_pass5_summarize.ts --since 2026-04-26T22 # explicit
*/
import { readFileSync, existsSync } from "node:fs";
// Lower bound for row timestamps, taken from `--since <value>` or defaulting
// to two hours before now (as an ISO string).
const argSince = (() => {
  const flagAt = Bun.argv.indexOf("--since");
  if (flagAt < 0) {
    return new Date(Date.now() - 2 * 60 * 60 * 1000).toISOString();
  }
  const value = Bun.argv[flagAt + 1];
  // A trailing `--since` with nothing after it used to produce `undefined`;
  // any `<` comparison against `undefined` is false, so the time filter was
  // silently switched off. Treat it as a usage error instead.
  if (value === undefined || value === "") {
    console.error("--since requires a timestamp value, e.g. --since 2026-04-26T22");
    process.exit(1);
  }
  return value;
})();
// Experiment log read by this script; bail out early when it is absent.
const JSONL = "data/_kb/mode_experiments.jsonl";
if (!existsSync(JSONL)) { console.error(`no ${JSONL}`); process.exit(1); }
/** One parsed line of the experiments JSONL, limited to the fields this script reads. */
interface Row {
  /** Timestamp string; compared as a string against the --since bound. */
  ts: string;
  /** Mode name; first component of the condition key. */
  mode: string;
  /** Path of the reviewed file; its contents are read to check findings. */
  file_path: string;
  /** Review text that is scanned for a findings table. */
  response: string;
  /** Enrichment sources; only the corpus selection is used here. */
  sources: {
    matrix_corpus?: string | string[] | null;
  };
  /** Latency in milliseconds; averaged and printed in seconds. */
  latency_ms: number;
}
/**
 * Normalise a corpus selection into a stable string key.
 *
 * @param c a corpus name, an array of corpus names, or anything else
 * @returns the string itself, the array's entries sorted and joined with "+",
 *          or "" for empty arrays and every other value
 */
function corpusKey(c: any): string {
  if (typeof c === "string") return c;
  if (!Array.isArray(c) || c.length === 0) return "";
  // Sort a copy so the caller's array is left untouched.
  const names = Array.from(c);
  names.sort();
  return names.join("+");
}
// Condition key for a row: "mode|corpus" when a corpus is set, else just the mode.
const condKey = (r: Row) => {
  const corpus = corpusKey(r.sources?.matrix_corpus);
  if (corpus === "") return r.mode;
  return `${r.mode}|${corpus}`;
};
// Reuse the same grounding logic as mode_compare — symbols cited in
// findings rows must appear in the focus file, and any line numbers
// must fall within EOF.
/**
 * Extract finding rows from a markdown review and list what each row cites.
 *
 * If a "Findings" heading (levels 1-3) is present, only the text between it
 * and the next Patch/Suggestion/Reference/Summary/Concrete heading is
 * scanned; otherwise the whole document is scanned.
 *
 * Table rows are recognised in two ways: rows whose first cell is a number
 * (optionally bold), and rows whose first cell starts with a .rs/.ts/.py
 * path. Whichever kind is more numerous is used; ties go to numbered rows.
 *
 * @param md markdown text of one review
 * @returns one entry per selected row: the unique symbols (backticked
 *          identifiers first, then lowercase words of 5+ characters) and the
 *          2-5 digit numbers that follow a ":" or "-"
 */
function extractFindings(md: string): { symbols: string[]; lines: number[] }[] {
  const headingRe = /(?:^|\n)#{1,3}[^\na-zA-Z]*(?:Ranked\s+)?Findings?[^\n]*\n/i;
  const nextSectionRe = /\n#{1,3}[^\na-zA-Z]*(?:Patch|Suggestion|Reference|Summary|Concrete)/i;

  let scope = md;
  const heading = headingRe.exec(md);
  if (heading !== null) {
    const tail = md.slice(heading.index + heading[0].length);
    const cut = tail.search(nextSectionRe);
    scope = cut < 0 ? tail : tail.slice(0, cut);
  }

  // Classify every line once; a line may qualify for both buckets.
  const numberedRows: string[] = [];
  const pathRows: string[] = [];
  for (const line of scope.split("\n")) {
    if (/^\|\s*\*?\*?\d+\*?\*?\s*\|/.test(line)) numberedRows.push(line);
    if (/^\|\s*[a-z_/\.][a-z_/\.0-9]*\.(rs|ts|py)\b/i.test(line)) pathRows.push(line);
  }
  const chosen = pathRows.length > numberedRows.length ? pathRows : numberedRows;

  const findings: { symbols: string[]; lines: number[] }[] = [];
  for (const row of chosen) {
    const ticked = Array.from(row.matchAll(/`([A-Za-z_][A-Za-z0-9_:]*)`/g), hit => hit[1]);
    const words = Array.from(row.matchAll(/\b([a-z][a-z0-9_]{4,})\b/g), hit => hit[1]);
    const cited = Array.from(row.matchAll(/[:\-](\d{2,5})/g), hit => parseInt(hit[1]));
    // The Set removes duplicates while keeping first-seen order.
    findings.push({ symbols: [...new Set([...ticked, ...words])], lines: cited });
  }
  return findings;
}
/**
 * Score the findings of one review against the file it reviewed.
 *
 * A finding is out of bounds when any number it cites exceeds the count of
 * "\n"-separated segments in the file. A finding is grounded when it is not
 * out of bounds and at least one of its symbols occurs as a substring of
 * the file contents.
 *
 * @param md   markdown text of the review
 * @param file path of the reviewed file, read as UTF-8
 * @returns total findings, grounded findings, and out-of-bounds findings
 */
function grounded(md: string, file: string): { total: number; grounded: number; oob: number } {
  const text = readFileSync(file, "utf8");
  const lineCount = text.split("\n").length;
  const findings = extractFindings(md);

  let groundedCount = 0;
  let oobCount = 0;
  for (const { symbols, lines } of findings) {
    if (lines.some(n => n > lineCount)) {
      // An out-of-bounds citation disqualifies the finding from being grounded.
      oobCount += 1;
      continue;
    }
    if (symbols.some(sym => text.includes(sym))) groundedCount += 1;
  }
  return { total: findings.length, grounded: groundedCount, oob: oobCount };
}
/** Runtime check that a parsed JSONL value has the string fields this script dereferences. */
function isUsableRow(v: unknown): v is Row {
  if (typeof v !== "object" || v === null) return false;
  const o = v as Record<string, unknown>;
  return typeof o.ts === "string"
    && typeof o.mode === "string"
    && typeof o.file_path === "string"
    && typeof o.response === "string";
}
// Load the experiment log and keep the rows at or after the --since bound.
const lines = readFileSync(JSONL, "utf8").split("\n").filter(Boolean);
const rows: Row[] = [];
let skipped = 0;
for (const l of lines) {
  if (l.trim() === "") continue;
  let parsed: unknown;
  try {
    parsed = JSON.parse(l);
  } catch {
    // Stay best-effort on bad lines, but count them so the loss is visible.
    skipped++;
    continue;
  }
  // Without this check a record lacking `ts` would slip past the time filter
  // and one lacking `response` would crash later during findings extraction.
  if (!isUsableRow(parsed)) { skipped++; continue; }
  if (parsed.ts < argSince) continue;
  rows.push(parsed);
}
if (skipped > 0) console.error(`skipped ${skipped} malformed line(s) in ${JSONL}`);
if (rows.length === 0) { console.error(`no rows since ${argSince}`); process.exit(1); }
// Bucket per-row scores as condition -> file -> parallel arrays, one entry
// per row in the order the rows were read.
type CellArr = { grnd: number[]; total: number[]; oob: number[]; ms: number[] };
const byCond: Record<string, Record<string, CellArr>> = {};
for (const r of rows) {
  const perFile = (byCond[condKey(r)] ??= {});
  const cell = (perFile[r.file_path] ??= { grnd: [], total: [], oob: [], ms: [] });
  const { grounded: hits, total, oob } = grounded(r.response, r.file_path);
  cell.grnd.push(hits);
  cell.total.push(total);
  cell.oob.push(oob);
  cell.ms.push(r.latency_ms);
}
/**
 * Summary statistics for a list of numbers.
 *
 * @param xs the samples
 * @returns count, mean, sample standard deviation (n - 1 denominator; 0 for a
 *          single sample), minimum and maximum. All fields are 0 for an
 *          empty list.
 */
function stats(xs: number[]): { n: number; mean: number; sd: number; min: number; max: number } {
  const n = xs.length;
  if (n === 0) {
    return { n: 0, mean: 0, sd: 0, min: 0, max: 0 };
  }

  let total = 0;
  xs.forEach(x => { total += x; });
  const mean = total / n;

  let variance = 0;
  if (n !== 1) {
    let squaredDev = 0;
    xs.forEach(x => { squaredDev += (x - mean) ** 2; });
    variance = squaredDev / (n - 1);
  }

  const min = Math.min.apply(null, xs);
  const max = Math.max.apply(null, xs);
  return { n, mean, sd: Math.sqrt(variance), min, max };
}
// Per-file report: one table per file, one line per condition that has data
// for that file. Conditions and files are both listed in sorted order.
const conditions = Object.keys(byCond).sort();
const files = Array.from(new Set(rows.map(row => row.file_path))).sort();
console.log(`\n═══ Pass 5 variance — since ${argSince} ═══\n`);
console.log(` ${rows.length} rows · ${conditions.length} conditions · ${files.length} files\n`);
files.forEach(file => {
  console.log(`📄 ${file}`);
  console.log(` ${"condition".padEnd(56)} n ${"grounded mean ± sd".padStart(20)} ${"range".padStart(8)} ${"oob".padStart(4)} ${"avg ms".padStart(7)}`);
  console.log(` ${"─".repeat(56)} ─── ${"─".repeat(20)} ${"─".repeat(8)} ${"─".repeat(4)} ${"─".repeat(7)}`);
  conditions.forEach(cond => {
    const cell = byCond[cond]?.[file];
    if (cell == null || cell.grnd.length === 0) return;
    const summary = stats(cell.grnd);
    let oobSum = 0;
    for (const count of cell.oob) oobSum += count;
    let latencyTotal = 0;
    for (const ms of cell.ms) latencyTotal += ms;
    const msMean = latencyTotal / cell.ms.length;
    const meanSd = `${summary.mean.toFixed(1)} ± ${summary.sd.toFixed(1)}`;
    const range = `[${summary.min}-${summary.max}]`;
    // The last column shows the mean latency rounded to whole seconds.
    console.log(` ${cond.padEnd(56)} ${String(summary.n).padStart(3)} ${meanSd.padStart(20)} ${range.padStart(8)} ${String(oobSum).padStart(4)} ${Math.round(msMean / 1000).toString().padStart(5)}s`);
  });
  console.log("");
});
// Head-to-head: compare every other condition with the isolation baseline,
// pairing reps by index within each file. When the two sides have different
// rep counts for a file, only the first min(len) pairs are compared.
console.log(`═══ Head-to-head: each condition vs isolation, rep-by-rep ═══\n`);
const isoKey = conditions.find(c => c.startsWith("codereview_isolation"));
if (isoKey === undefined) {
  console.log(" no isolation rows in window");
} else {
  console.log(` baseline: ${isoKey}\n`);
  console.log(` ${"challenger".padEnd(56)} wins losses ties Δ mean grnd`);
  console.log(` ${"─".repeat(56)} ${"─".repeat(4)} ${"─".repeat(6)} ${"─".repeat(4)} ${"─".repeat(12)}`);
  for (const challenger of conditions) {
    if (challenger === isoKey) continue;
    const tally = { wins: 0, losses: 0, ties: 0 };
    let deltaSum = 0;
    let pairs = 0;
    for (const file of files) {
      const baseline = byCond[isoKey]?.[file]?.grnd ?? [];
      const candidate = byCond[challenger]?.[file]?.grnd ?? [];
      const reps = Math.min(baseline.length, candidate.length);
      for (let rep = 0; rep < reps; rep++) {
        const diff = candidate[rep] - baseline[rep];
        if (diff > 0) tally.wins += 1;
        else if (diff < 0) tally.losses += 1;
        else tally.ties += 1;
        deltaSum += diff;
        pairs += 1;
      }
    }
    // With no comparable pairs there is no mean to report.
    const dMean = pairs > 0 ? (deltaSum / pairs).toFixed(2) : "—";
    console.log(` ${challenger.padEnd(56)} ${String(tally.wins).padStart(4)} ${String(tally.losses).padStart(6)} ${String(tally.ties).padStart(4)} ${dMean.padStart(12)}`);
  }
}