#!/usr/bin/env bun
/**
 * Pass 3: variance test.
 *
 * Runs codereview_lakehouse on the SAME file N times at each of M
 * temperatures. This script records response size and latency per run
 * and prints a mean ± stddev table of response size per file × temperature
 * as a quick proxy for run-to-run stability. Grounded finding count is
 * NOT computed here — inspect it with scripts/mode_compare.ts. Anything
 * <100% groundedness is a leak; track which symbols got hallucinated.
 *
 * NOTE(review): this script writes nothing to disk itself. Output is
 * expected to append to data/_kb/mode_experiments.jsonl — presumably the
 * gateway does that on each /v1/mode/execute call; confirm. The aggregator
 * can group by ts and identify variance buckets.
 *
 * Environment overrides:
 *   LH_GATEWAY  gateway base URL
 *   LH_MODEL    model id forced on every run
 *   LH_FILES    comma-separated file paths to review
 *   LH_TEMPS    comma-separated temperatures
 *   LH_REPS     repetitions per file × temperature (positive integer)
 *
 * Usage: bun run scripts/mode_pass3_variance.ts
 */

const DEFAULT_FILES: readonly string[] = [
  "crates/queryd/src/delta.rs",
  "crates/vectord/src/pathway_memory.rs",
  "crates/gateway/src/v1/mode.rs",
];
const DEFAULT_TEMPS = "0.0,0.1,0.3";
const DEFAULT_REPS = "5";

/** Splits a comma-separated list, trimming entries and dropping empty ones. */
function splitList(raw: string): string[] {
  return raw
    .split(",")
    .map((entry) => entry.trim())
    .filter((entry) => entry.length > 0);
}

/**
 * Parses LH_FILES.
 *
 * @param raw - the env value, or undefined to use the built-in defaults
 * @throws Error when the variable is set but contains no paths
 */
function parseFiles(raw: string | undefined): string[] {
  if (raw === undefined) return [...DEFAULT_FILES];
  const files = splitList(raw);
  if (files.length === 0) {
    throw new Error("LH_FILES is set but contains no file paths");
  }
  return files;
}

/**
 * Parses LH_TEMPS into finite numbers.
 *
 * @throws Error on an empty list or any entry that is not a finite number.
 *   Failing fast matters: Number("") is 0 and JSON.stringify(NaN) is "null",
 *   so a typo would otherwise silently run at the wrong temperature.
 */
function parseTemps(raw: string | undefined): number[] {
  const entries = splitList(raw ?? DEFAULT_TEMPS);
  if (entries.length === 0) {
    throw new Error("LH_TEMPS is set but contains no temperatures");
  }
  return entries.map((entry) => {
    const temp = Number(entry);
    if (!Number.isFinite(temp)) {
      throw new Error(`LH_TEMPS: "${entry}" is not a finite number`);
    }
    return temp;
  });
}

/**
 * Parses LH_REPS.
 *
 * @throws Error unless the value is a positive integer; NaN or 0 would run
 *   nothing, and a fractional value would make the progress total wrong.
 */
function parseReps(raw: string | undefined): number {
  const text = (raw ?? DEFAULT_REPS).trim();
  const reps = Number(text);
  if (text.length === 0 || !Number.isInteger(reps) || reps < 1) {
    throw new Error(`LH_REPS: "${text}" is not a positive integer`);
  }
  return reps;
}

const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const MODEL = process.env.LH_MODEL ?? "openai/gpt-oss-120b:free";
const FILES = parseFiles(process.env.LH_FILES);
const TEMPS = parseTemps(process.env.LH_TEMPS);
const REPS = parseReps(process.env.LH_REPS);

/** Outcome of a single gateway call; the optional fields depend on `ok`. */
interface Result {
  file: string;
  temp: number;
  rep: number;
  ok: boolean;
  /** Length of the `response` string in the gateway reply (successful runs). */
  response_chars?: number;
  /** `latency_ms` as reported in the gateway reply, when it is a number. */
  latency_ms?: number;
  /** Failure description (failed runs). */
  error?: string;
}

/**
 * Executes one forced-mode review through the gateway.
 *
 * Never rejects: HTTP errors, network failures, timeouts, and malformed JSON
 * are all folded into a `Result` with `ok: false`.
 *
 * @param file - path sent as `file_path`
 * @param temp - value sent as `force_temperature`
 * @param rep - 1-based repetition index, echoed into the result
 * @returns the run outcome
 */
async function runOne(file: string, temp: number, rep: number): Promise<Result> {
  try {
    const res = await fetch(`${GATEWAY}/v1/mode/execute`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        task_class: "scrum_review",
        file_path: file,
        force_mode: "codereview_lakehouse",
        force_model: MODEL,
        force_temperature: temp,
      }),
      // Abort the request after three minutes.
      signal: AbortSignal.timeout(180_000),
    });

    if (!res.ok) {
      // The error body is best-effort context only; cap it to keep logs readable.
      const body = await res.text().catch(() => "");
      return { file, temp, rep, ok: false, error: `HTTP ${res.status}: ${body.slice(0, 150)}` };
    }

    const payload: unknown = await res.json();
    // Narrow the untyped reply instead of trusting its shape.
    const fields: Record<string, unknown> =
      typeof payload === "object" && payload !== null ? (payload as Record<string, unknown>) : {};
    const { response, latency_ms: latency } = fields;

    return {
      file,
      temp,
      rep,
      ok: true,
      response_chars: typeof response === "string" ? response.length : 0,
      latency_ms: typeof latency === "number" ? latency : undefined,
    };
  } catch (e: unknown) {
    return { file, temp, rep, ok: false, error: e instanceof Error ? e.message : String(e) };
  }
}

/**
 * Mean and population standard deviation (divides by n, not n - 1).
 *
 * @param xs - non-empty sample list
 */
function meanAndStddev(xs: readonly number[]): { mean: number; sd: number } {
  const mean = xs.reduce((sum, x) => sum + x, 0) / xs.length;
  const variance = xs.reduce((sum, x) => sum + (x - mean) ** 2, 0) / xs.length;
  return { mean, sd: Math.sqrt(variance) };
}

/**
 * Runs every file × temperature × repetition sequentially, logging progress,
 * then prints the response-size variance table.
 */
async function main(): Promise<void> {
  const total = FILES.length * TEMPS.length * REPS;
  console.log(`[pass3] files=${FILES.length} × temps=${TEMPS.length} × reps=${REPS} = ${total} runs`);
  console.log(`[pass3] model=${MODEL}\n`);

  let i = 0;
  const results: Result[] = [];
  // Sequential on purpose: each run awaits before the next one starts.
  for (const file of FILES) {
    for (const temp of TEMPS) {
      for (let rep = 1; rep <= REPS; rep++) {
        i++;
        process.stdout.write(` [${i}/${total}] temp=${temp.toFixed(1)} rep=${rep}/${REPS} ${file.slice(-32).padStart(32)} ... `);
        const r = await runOne(file, temp, rep);
        results.push(r);
        if (r.ok) {
          console.log(`✓ resp=${r.response_chars} ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`);
        } else {
          console.log(`✗ ${r.error}`);
        }
      }
    }
  }

  console.log(`\n[pass3] complete · ${results.filter((r) => r.ok).length}/${results.length} succeeded`);

  // Per-file × temp variance summary (response_chars stddev as a quick
  // proxy for output instability).
  console.log(`\n[pass3] response_chars variance (mean ± stddev) by file × temp:`);
  console.log(` ${"file".padEnd(40)} ${TEMPS.map((t) => `temp=${t.toFixed(1)}`.padStart(20)).join(" ")}`);
  for (const file of FILES) {
    const cells = TEMPS.map((t) => {
      const xs = results
        .filter((r) => r.ok && r.file === file && r.temp === t)
        .map((r) => r.response_chars ?? 0);
      // Pad the placeholder to the column width so the table stays aligned.
      if (xs.length === 0) return " — ".padStart(20);
      const { mean, sd } = meanAndStddev(xs);
      return `${Math.round(mean).toString().padStart(7)} ± ${Math.round(sd).toString().padEnd(6)}`.padStart(20);
    }).join(" ");
    console.log(` ${file.slice(0, 40).padEnd(40)} ${cells}`);
  }

  console.log(`\n[pass3] grounding variance via: bun run scripts/mode_compare.ts (look for grounded-N column drift)`);
}

main().catch((e) => {
  console.error(e);
  process.exit(1);
});