Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
Pass 5 (5 reps × 4 conditions × 1 file on grok-4.1-fast) showed composing matrix corpora is anti-additive on strong models — composed lakehouse_arch + symbols LOST 5/5 head-to-head vs codereview_isolation (Δ −1.8 grounded findings, p=0.031). Default flips to isolation; matrix path now auto- downgrades when the resolved model is strong. Mode runner: - matrix_corpus is Vec<String> (string OR array via deserialize_string_or_vec) - top_k=6 from each corpus, merge by score, take top 8 globally - chunk tag prefers doc_id over source so reviewer sees [adr:009] vs [lakehouse_arch] - is_weak_model() gate auto-downgrades codereview_lakehouse → codereview_isolation for strong models (default-strong; weak = :free suffix or local last-resort) - LH_FORCE_FULL_ENRICHMENT=1 bypasses for diagnostic runs - EnrichmentSources.downgraded_from records when the gate fires Three corpora indexed via /vectors/index (5849 chunks total): - lakehouse_arch_v1 — ADRs + phases + PRD + scrum spec (93 docs, 2119 chunks) - scrum_findings_v1 — past scrum_reviews.jsonl (168 docs, 1260 chunks; EXCLUDED from defaults — 24% out-of-bounds line citations from cross-file drift) - lakehouse_symbols_v1 — regex-extracted pub items + /// docs (656 docs, 2470 chunks) Experiment infra: - scripts/build_*_corpus.ts — re-runnable when source content changes - scripts/mode_pass5_variance_paid.ts — N reps × M conditions on one file - scripts/mode_pass5_summarize.ts — mean ± σ + head-to-head, parser handles numbered + path-with-line + path-with-symbol finding tables - scripts/mode_compare.ts — groups by mode|corpus when sweeps span corpora - scripts/mode_experiment.ts — default model bumped to x-ai/grok-4.1-fast, --corpus flag for per-call override Decisions + open follow-ups: docs/MODE_RUNNER_TUNING_PLAN.md Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
170 lines
7.1 KiB
TypeScript
170 lines
7.1 KiB
TypeScript
#!/usr/bin/env bun
|
|
/**
|
|
* Pass 5 variance summarizer. Reads data/_kb/mode_experiments.jsonl
|
|
* since a timestamp, groups by (mode|corpus), reports mean ± stddev
|
|
* of grounded finding count, plus a head-to-head wins/losses table
|
|
* vs the isolation baseline.
|
|
*
|
|
* Usage:
|
|
* bun run scripts/mode_pass5_summarize.ts # default 2h
|
|
* bun run scripts/mode_pass5_summarize.ts --since 2026-04-26T22 # explicit
|
|
*/
|
|
|
|
import { readFileSync, existsSync } from "node:fs";
|
|
|
|
/**
 * Inclusive lower bound for row timestamps. It is compared as a plain string
 * against each row's `ts`, so a prefix such as `2026-04-26T22` works.
 *
 * Source: the argument that follows `--since` on the command line. When the
 * flag is absent, defaults to the ISO-8601 timestamp of two hours ago.
 */
const argSince = (() => {
  const flagIdx = Bun.argv.indexOf("--since");
  if (flagIdx < 0) {
    return new Date(Date.now() - 2 * 60 * 60 * 1000).toISOString();
  }
  const value = Bun.argv[flagIdx + 1];
  // A trailing `--since` with no value used to yield `undefined`; every
  // `ts < undefined` comparison is false, so the filter was silently disabled
  // and the banner printed "since undefined". Fail fast instead.
  if (value === undefined) {
    console.error("--since requires a timestamp value (e.g. --since 2026-04-26T22)");
    process.exit(1);
  }
  return value;
})();
|
|
|
|
// Experiment log this script summarizes. Without it there is nothing to
// report, so print the missing path and exit non-zero.
const JSONL = "data/_kb/mode_experiments.jsonl";
if (existsSync(JSONL) === false) {
  console.error(`no ${JSONL}`);
  process.exit(1);
}
|
|
|
|
/** One parsed line of the experiment log, limited to the fields this script reads. */
interface Row {
  /** Timestamp string; compared lexicographically against the `--since` cutoff. */
  ts: string;
  /** Mode name; forms the first part of the condition key. */
  mode: string;
  /** Path of the reviewed file; read from disk to ground the findings. */
  file_path: string;
  /** Markdown response whose findings table is parsed. */
  response: string;
  /**
   * Enrichment metadata. Optional because the condition key is derived with
   * `r.sources?.matrix_corpus`, i.e. rows without `sources` are tolerated.
   */
  sources?: { matrix_corpus?: string | string[] | null };
  /** Request latency in milliseconds (reported as seconds). */
  latency_ms: number;
}
|
|
|
|
/**
 * Normalizes a `matrix_corpus` value into a stable string key.
 *
 * - a string is returned unchanged (the empty string stays empty);
 * - a non-empty array is copied, sorted, and joined with `+`, so element
 *   order in the input does not affect the key and the input is not mutated;
 * - anything else (null, undefined, empty array, other types) yields `""`.
 */
function corpusKey(c: any): string {
  if (typeof c === "string") return c;
  if (!Array.isArray(c) || c.length === 0) return "";
  const ordered = Array.from(c);
  ordered.sort();
  return ordered.join("+");
}
|
|
/**
 * Condition key for a row: `mode|corpus` when the row carries a non-empty
 * matrix corpus, otherwise just the mode.
 */
const condKey = (r: Row) => {
  const corpus = corpusKey(r.sources?.matrix_corpus);
  if (corpus === "") return r.mode;
  return `${r.mode}|${corpus}`;
};
|
|
|
|
// Reuse the same grounding logic as mode_compare — symbols cited in
|
|
// findings rows must appear in the focus file, and any line numbers
|
|
// must fall within EOF.
|
|
/**
 * Pulls the findings table rows out of a markdown response.
 *
 * The search is limited to the text after the first `#`–`###` heading that
 * mentions "Finding(s)" (optionally "Ranked Findings"), up to the next
 * heading that starts with Patch / Suggestion / Reference / Summary /
 * Concrete. If no such heading exists the whole document is scanned.
 *
 * Recognized row shapes:
 *   1) numbered:       `| 1 | ... |`
 *   2) path-with-line: `| service.rs:106 | ... |`
 *   3) path-with-sym:  `| crates/vectord/src/pathway_memory.rs:load_fn (≈L220) | ... |`
 * Shapes 2 and 3 are matched by one path pattern. Whichever group (numbered
 * vs path) has more rows is used; a tie goes to the numbered rows.
 *
 * @param md markdown text of a review response
 * @returns one entry per row: the de-duplicated symbol candidates (backticked
 *   identifiers plus lowercase words of five or more characters) and every
 *   2–5 digit number that directly follows `:` or `-`.
 */
function extractFindings(md: string): { symbols: string[]; lines: number[] }[] {
  const headingRe = /(?:^|\n)#{1,3}[^\na-zA-Z]*(?:Ranked\s+)?Findings?[^\n]*\n/i;
  const stopRe = /\n#{1,3}[^\na-zA-Z]*(?:Patch|Suggestion|Reference|Summary|Concrete)/i;
  const numberedRe = /^\|\s*\*?\*?\d+\*?\*?\s*\|/;
  const pathRe = /^\|\s*[a-z_/\.][a-z_/\.0-9]*\.(rs|ts|py)\b/i;

  // Narrow to the findings section when a heading is present.
  let section = md;
  const heading = md.match(headingRe);
  if (heading && heading.index !== undefined) {
    const body = md.slice(heading.index + heading[0].length);
    const cut = body.search(stopRe);
    section = cut < 0 ? body : body.slice(0, cut);
  }

  // Classify each line in a single pass over the section.
  const numbered: string[] = [];
  const pathRows: string[] = [];
  for (const line of section.split("\n")) {
    if (numberedRe.test(line)) numbered.push(line);
    if (pathRe.test(line)) pathRows.push(line);
  }
  const rows = pathRows.length > numbered.length ? pathRows : numbered;

  return rows.map(row => {
    const symbols = new Set<string>();
    // Backticked identifiers first, then bare lowercase words; the Set keeps
    // first-seen order and drops duplicates.
    for (const re of [/`([A-Za-z_][A-Za-z0-9_:]*)`/g, /\b([a-z][a-z0-9_]{4,})\b/g]) {
      for (const hit of row.matchAll(re)) symbols.add(hit[1]);
    }
    const lines = Array.from(row.matchAll(/[:\-](\d{2,5})/g), hit => parseInt(hit[1]));
    return { symbols: [...symbols], lines };
  });
}
|
|
|
|
/**
 * Scores the findings of one response against the file it reviewed.
 *
 * A finding is out-of-bounds when any extracted line number exceeds `eof`,
 * and grounded when at least one extracted symbol occurs as a substring of
 * the file and the finding is not out-of-bounds.
 *
 * Note: `eof` is the number of pieces produced by splitting on "\n", so a
 * file that ends with a newline counts one extra (empty) line.
 *
 * @param md   markdown response containing the findings table
 * @param file path of the reviewed file; read synchronously (throws if unreadable)
 * @returns total findings, grounded findings, and out-of-bounds findings
 */
function grounded(md: string, file: string): { total: number; grounded: number; oob: number } {
  const content = readFileSync(file, "utf8");
  const eof = content.split("\n").length;
  const findings = extractFindings(md);

  let groundedCount = 0;
  let oobCount = 0;
  findings.forEach(({ symbols, lines }) => {
    // An out-of-bounds citation disqualifies the finding regardless of symbols.
    if (lines.some(n => n > eof)) {
      oobCount += 1;
      return;
    }
    if (symbols.some(s => content.includes(s))) groundedCount += 1;
  });

  return { total: findings.length, grounded: groundedCount, oob: oobCount };
}
|
|
|
|
// Load every non-empty line of the log and keep the rows whose `ts` is not
// earlier than the cutoff.
const lines = readFileSync(JSONL, "utf8").split("\n").filter(Boolean);
const rows: Row[] = [];
lines.forEach(raw => {
  try {
    const parsed: Row = JSON.parse(raw);
    if (!(parsed.ts < argSince)) rows.push(parsed);
  } catch {
    // Best effort: a line that fails to parse (or whose parsed value cannot
    // be read, e.g. `null`) is skipped.
  }
});

// Nothing in the window: report the cutoff and exit non-zero.
if (rows.length < 1) {
  console.error(`no rows since ${argSince}`);
  process.exit(1);
}
|
|
|
|
// Nested grouping: condition key → file path → parallel per-row series
// (grounded count, total findings, out-of-bounds count, latency in ms).
// Series are appended in log order.
type CellArr = { grnd: number[]; total: number[]; oob: number[]; ms: number[] };
const byCond: Record<string, Record<string, CellArr>> = {};
for (const row of rows) {
  const perFile = (byCond[condKey(row)] ??= {});
  const cell = (perFile[row.file_path] ??= { grnd: [], total: [], oob: [], ms: [] });
  const score = grounded(row.response, row.file_path);
  cell.grnd.push(score.grounded);
  cell.total.push(score.total);
  cell.oob.push(score.oob);
  cell.ms.push(row.latency_ms);
}
|
|
|
|
/**
 * Descriptive statistics for a list of numbers.
 *
 * @param xs values to summarize
 * @returns count, arithmetic mean, sample standard deviation (n − 1
 *   denominator; 0 for a single value), minimum and maximum. An empty input
 *   yields all zeros.
 */
function stats(xs: number[]): { n: number; mean: number; sd: number; min: number; max: number } {
  const n = xs.length;
  if (n === 0) return { n: 0, mean: 0, sd: 0, min: 0, max: 0 };

  let sum = 0;
  let min = Infinity;
  let max = -Infinity;
  for (const x of xs) {
    sum += x;
    min = Math.min(min, x);
    max = Math.max(max, x);
  }
  const mean = sum / n;

  let variance = 0;
  if (n > 1) {
    let squared = 0;
    for (const x of xs) squared += (x - mean) ** 2;
    variance = squared / (n - 1);
  }

  return { n, mean, sd: Math.sqrt(variance), min, max };
}
|
|
|
|
// Sorted axes of the report: every condition key and every distinct file.
const conditions = Object.keys(byCond).sort();
const files = Array.from(new Set(rows.map(r => r.file_path))).sort();

console.log(`\n═══ Pass 5 variance — since ${argSince} ═══\n`);
console.log(` ${rows.length} rows · ${conditions.length} conditions · ${files.length} files\n`);

// One table per file, one line per condition that has data for that file.
for (const file of files) {
  console.log(`📄 ${file}`);
  console.log(` ${"condition".padEnd(56)} n ${"grounded mean ± sd".padStart(20)} ${"range".padStart(8)} ${"oob".padStart(4)} ${"avg ms".padStart(7)}`);
  console.log(` ${"─".repeat(56)} ─── ${"─".repeat(20)} ${"─".repeat(8)} ${"─".repeat(4)} ${"─".repeat(7)}`);

  for (const cond of conditions) {
    const cell = byCond[cond]?.[file];
    if (!cell || cell.grnd.length === 0) continue;

    const { n, mean, sd, min, max } = stats(cell.grnd);

    let oobSum = 0;
    for (const v of cell.oob) oobSum += v;
    let msTotal = 0;
    for (const v of cell.ms) msTotal += v;
    const msMean = msTotal / cell.ms.length;

    const meanSd = `${mean.toFixed(1)} ± ${sd.toFixed(1)}`;
    const range = `[${min}-${max}]`;
    // Latency is averaged in ms and shown as whole seconds.
    const seconds = Math.round(msMean / 1000).toString();

    const columns = [
      cond.padEnd(56),
      String(n).padStart(3),
      meanSd.padStart(20),
      range.padStart(8),
      String(oobSum).padStart(4),
      `${seconds.padStart(5)}s`,
    ];
    console.log(` ${columns.join(" ")}`);
  }
  console.log("");
}
|
|
|
|
// Head-to-head: compare every other condition against the isolation baseline
// (the first sorted condition key starting with "codereview_isolation").
// Within each file the i-th challenger value is paired with the i-th baseline
// value; when the two series differ in length only the first
// min(length) pairs are compared.
console.log(`═══ Head-to-head: each condition vs isolation, rep-by-rep ═══\n`);
const isoKey = conditions.find(c => c.startsWith("codereview_isolation"));
if (isoKey) {
  console.log(` baseline: ${isoKey}\n`);
  console.log(` ${"challenger".padEnd(56)} wins losses ties Δ mean grnd`);
  console.log(` ${"─".repeat(56)} ${"─".repeat(4)} ${"─".repeat(6)} ${"─".repeat(4)} ${"─".repeat(12)}`);

  for (const challenger of conditions.filter(c => c !== isoKey)) {
    let wins = 0;
    let losses = 0;
    let ties = 0;
    let deltaSum = 0;
    let pairs = 0;

    for (const file of files) {
      const baseline = byCond[isoKey]?.[file]?.grnd ?? [];
      const theirs = byCond[challenger]?.[file]?.grnd ?? [];
      const reps = Math.min(baseline.length, theirs.length);
      for (let i = 0; i < reps; i++) {
        const delta = theirs[i] - baseline[i];
        if (delta > 0) wins += 1;
        else if (delta < 0) losses += 1;
        else ties += 1;
        deltaSum += delta;
        pairs += 1;
      }
    }

    // Mean grounded-count difference (challenger − baseline) over all pairs.
    const dMean = pairs > 0 ? (deltaSum / pairs).toFixed(2) : "—";
    console.log(` ${challenger.padEnd(56)} ${String(wins).padStart(4)} ${String(losses).padStart(6)} ${String(ties).padStart(4)} ${dMean.padStart(12)}`);
  }
} else {
  console.log(" no isolation rows in window");
}
|