Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
Setup for the corpus-tightening experiment sweep (J 2026-04-26 — "now
is the only cheap window before the corpus gets large and refactoring
costs go up").
Override params on /v1/mode/execute (additive — old callers unaffected):
force_matrix_corpus — Pass 2: try alternate corpora per call
force_relevance_threshold — Pass 2: sweep filter strictness
force_temperature — Pass 3: variance test
New native mode `staffing_inference_lakehouse` (Pass 4):
- Same composer architecture as codereview_lakehouse
- Staffing framing: coordinator producing fillable|contingent|
unfillable verdict + ranked candidate list with playbook citations
- matrix_corpus = workers_500k_v8
- Validates that modes-as-prompt-molders generalizes beyond code
- Framing explicitly says "do NOT fabricate workers" — the staffing
analog of the lakehouse mode's symbol-grounding requirement
Three sweep harnesses:
scripts/mode_pass2_corpus_sweep.ts — 4 corpora × 4 thresholds × 5 files
scripts/mode_pass3_variance.ts — 3 files × 3 temps × 5 reps
scripts/mode_pass4_staffing.ts — 5 fill requests through staffing mode
Each appends per-call rows to data/_kb/mode_experiments.jsonl which
mode_compare.ts already aggregates with grounding column.
Pass 1 (10 files × 5 modes broad sweep) currently running via the
existing scripts/mode_experiment.ts — gateway restart deferred until
it completes so the new override knobs aren't enabled mid-experiment.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
110 lines
3.9 KiB
TypeScript
110 lines
3.9 KiB
TypeScript
#!/usr/bin/env bun
|
||
/**
|
||
* Pass 3: variance test.
|
||
*
|
||
* Runs codereview_lakehouse on the SAME file N times at each of M
|
||
* temperatures. Measures run-to-run stability of grounded finding
|
||
* count, response size, and latency. Anything <100% groundedness
|
||
* is a leak; track which symbols got hallucinated.
|
||
*
|
||
* Output appends to data/_kb/mode_experiments.jsonl. The aggregator
|
||
* can group by ts and identify variance buckets.
|
||
*
|
||
* Usage: bun run scripts/mode_pass3_variance.ts
|
||
*/
|
||
|
||
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
|
||
const MODEL = process.env.LH_MODEL ?? "openai/gpt-oss-120b:free";
|
||
|
||
const FILES = (process.env.LH_FILES ?? [
|
||
"crates/queryd/src/delta.rs",
|
||
"crates/vectord/src/pathway_memory.rs",
|
||
"crates/gateway/src/v1/mode.rs",
|
||
].join(",")).split(",");
|
||
|
||
const TEMPS = (process.env.LH_TEMPS ?? "0.0,0.1,0.3").split(",").map(Number);
|
||
const REPS = Number(process.env.LH_REPS ?? 5);
|
||
|
||
/** One row of the sweep: the outcome of a single (file, temp, rep) call. */
interface Result {
  /** Repo-relative path of the file that was reviewed. */
  file: string;
  /** Sampling temperature used for this call. */
  temp: number;
  /** 1-based repetition index within the (file, temp) cell. */
  rep: number;
  /** True when the gateway answered HTTP 2xx with a parseable body. */
  ok: boolean;
  /** Length of the model response in characters (set only when ok). */
  response_chars?: number;
  /** Latency as reported by the gateway, in milliseconds (set only when ok). */
  latency_ms?: number;
  /** Failure description — "HTTP <status>: <body>" or the thrown message (set only when !ok). */
  error?: string;
}
|
||
async function runOne(file: string, temp: number, rep: number): Promise<Result> {
|
||
try {
|
||
const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
|
||
method: "POST",
|
||
headers: { "content-type": "application/json" },
|
||
body: JSON.stringify({
|
||
task_class: "scrum_review",
|
||
file_path: file,
|
||
force_mode: "codereview_lakehouse",
|
||
force_model: MODEL,
|
||
force_temperature: temp,
|
||
}),
|
||
signal: AbortSignal.timeout(180_000),
|
||
});
|
||
if (!r.ok) {
|
||
const body = await r.text().catch(() => "");
|
||
return { file, temp, rep, ok: false, error: `HTTP ${r.status}: ${body.slice(0, 150)}` };
|
||
}
|
||
const j: any = await r.json();
|
||
return {
|
||
file, temp, rep, ok: true,
|
||
response_chars: (j.response ?? "").length,
|
||
latency_ms: j.latency_ms,
|
||
};
|
||
} catch (e: any) {
|
||
return { file, temp, rep, ok: false, error: e.message };
|
||
}
|
||
}
|
||
|
||
async function main() {
|
||
const total = FILES.length * TEMPS.length * REPS;
|
||
console.log(`[pass3] files=${FILES.length} × temps=${TEMPS.length} × reps=${REPS} = ${total} runs`);
|
||
console.log(`[pass3] model=${MODEL}\n`);
|
||
let i = 0;
|
||
const results: Result[] = [];
|
||
for (const file of FILES) {
|
||
for (const temp of TEMPS) {
|
||
for (let rep = 1; rep <= REPS; rep++) {
|
||
i++;
|
||
process.stdout.write(` [${i}/${total}] temp=${temp.toFixed(1)} rep=${rep}/${REPS} ${file.slice(-32).padStart(32)} ... `);
|
||
const r = await runOne(file, temp, rep);
|
||
results.push(r);
|
||
if (r.ok) {
|
||
console.log(`✓ resp=${r.response_chars} ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`);
|
||
} else {
|
||
console.log(`✗ ${r.error}`);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
console.log(`\n[pass3] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded`);
|
||
|
||
// Per-file × temp variance summary (response_chars stddev as a quick
|
||
// proxy for output instability).
|
||
console.log(`\n[pass3] response_chars variance (mean ± stddev) by file × temp:`);
|
||
console.log(` ${"file".padEnd(40)} ${TEMPS.map(t => `temp=${t.toFixed(1)}`.padStart(20)).join(" ")}`);
|
||
for (const file of FILES) {
|
||
const cells = TEMPS.map(t => {
|
||
const xs = results.filter(r => r.ok && r.file === file && r.temp === t).map(r => r.response_chars ?? 0);
|
||
if (xs.length === 0) return " — ";
|
||
const mean = xs.reduce((s, x) => s + x, 0) / xs.length;
|
||
const sd = Math.sqrt(xs.reduce((s, x) => s + Math.pow(x - mean, 2), 0) / xs.length);
|
||
return `${Math.round(mean).toString().padStart(7)} ± ${Math.round(sd).toString().padEnd(6)}`.padStart(20);
|
||
}).join(" ");
|
||
console.log(` ${file.slice(0, 40).padEnd(40)} ${cells}`);
|
||
}
|
||
|
||
console.log(`\n[pass3] grounding variance via: bun run scripts/mode_compare.ts (look for grounded-N column drift)`);
|
||
}
|
||
|
||
// Entry point: surface any unhandled failure and exit non-zero so callers/CI notice.
main().catch(e => { console.error(e); process.exit(1); });
|