Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
Setup for the corpus-tightening experiment sweep (J 2026-04-26 — "now
is the only cheap window before the corpus gets large and refactoring
costs go up").
Override params on /v1/mode/execute (additive — old callers unaffected):
force_matrix_corpus — Pass 2: try alternate corpora per call
force_relevance_threshold — Pass 2: sweep filter strictness
force_temperature — Pass 3: variance test
New native mode `staffing_inference_lakehouse` (Pass 4):
- Same composer architecture as codereview_lakehouse
- Staffing framing: coordinator producing fillable|contingent|
unfillable verdict + ranked candidate list with playbook citations
- matrix_corpus = workers_500k_v8
- Validates that modes-as-prompt-molders generalizes beyond code
- Framing explicitly says "do NOT fabricate workers" — the staffing
analog of the lakehouse mode's symbol-grounding requirement
Three sweep harnesses:
scripts/mode_pass2_corpus_sweep.ts — 4 corpora × 4 thresholds × 5 files
scripts/mode_pass3_variance.ts — 3 files × 3 temps × 5 reps
scripts/mode_pass4_staffing.ts — 5 fill requests through staffing mode
Each appends per-call rows to data/_kb/mode_experiments.jsonl which
mode_compare.ts already aggregates with grounding column.
Pass 1 (10 files × 5 modes broad sweep) currently running via the
existing scripts/mode_experiment.ts — gateway restart deferred until
it completes so the new override knobs aren't enabled mid-experiment.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
110 lines
3.9 KiB
TypeScript
110 lines
3.9 KiB
TypeScript
#!/usr/bin/env bun
|
||
/**
|
||
* Pass 3: variance test.
|
||
*
|
||
* Runs codereview_lakehouse on the SAME file N times at each of M
|
||
* temperatures. Measures run-to-run stability of grounded finding
|
||
* count, response size, and latency. Anything <100% groundedness
|
||
* is a leak; track which symbols got hallucinated.
|
||
*
|
||
* Output appends to data/_kb/mode_experiments.jsonl. The aggregator
|
||
* can group by ts and identify variance buckets.
|
||
*
|
||
* Usage: bun run scripts/mode_pass3_variance.ts
|
||
*/
|
||
|
||
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
|
||
const MODEL = process.env.LH_MODEL ?? "openai/gpt-oss-120b:free";
|
||
|
||
const FILES = (process.env.LH_FILES ?? [
|
||
"crates/queryd/src/delta.rs",
|
||
"crates/vectord/src/pathway_memory.rs",
|
||
"crates/gateway/src/v1/mode.rs",
|
||
].join(",")).split(",");
|
||
|
||
const TEMPS = (process.env.LH_TEMPS ?? "0.0,0.1,0.3").split(",").map(Number);
|
||
const REPS = Number(process.env.LH_REPS ?? 5);
|
||
|
||
/** One row of the sweep: the outcome of a single (file, temp, rep) call. */
interface Result {
  /** Repo-relative path of the file that was reviewed. */
  file: string;
  /** Sampling temperature used for this call. */
  temp: number;
  /** 1-based repetition index within the (file, temp) cell. */
  rep: number;
  /** True when the gateway answered HTTP 2xx with a parseable body. */
  ok: boolean;
  /** Length of the model response in characters (set only when ok). */
  response_chars?: number;
  /** Latency as reported by the gateway, in milliseconds (set only when ok). */
  latency_ms?: number;
  /** Failure description — "HTTP <status>: <body>" or the thrown message (set only when !ok). */
  error?: string;
}
|
||
async function runOne(file: string, temp: number, rep: number): Promise<Result> {
|
||
try {
|
||
const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
|
||
method: "POST",
|
||
headers: { "content-type": "application/json" },
|
||
body: JSON.stringify({
|
||
task_class: "scrum_review",
|
||
file_path: file,
|
||
force_mode: "codereview_lakehouse",
|
||
force_model: MODEL,
|
||
force_temperature: temp,
|
||
}),
|
||
signal: AbortSignal.timeout(180_000),
|
||
});
|
||
if (!r.ok) {
|
||
const body = await r.text().catch(() => "");
|
||
return { file, temp, rep, ok: false, error: `HTTP ${r.status}: ${body.slice(0, 150)}` };
|
||
}
|
||
const j: any = await r.json();
|
||
return {
|
||
file, temp, rep, ok: true,
|
||
response_chars: (j.response ?? "").length,
|
||
latency_ms: j.latency_ms,
|
||
};
|
||
} catch (e: any) {
|
||
return { file, temp, rep, ok: false, error: e.message };
|
||
}
|
||
}
|
||
|
||
async function main() {
|
||
const total = FILES.length * TEMPS.length * REPS;
|
||
console.log(`[pass3] files=${FILES.length} × temps=${TEMPS.length} × reps=${REPS} = ${total} runs`);
|
||
console.log(`[pass3] model=${MODEL}\n`);
|
||
let i = 0;
|
||
const results: Result[] = [];
|
||
for (const file of FILES) {
|
||
for (const temp of TEMPS) {
|
||
for (let rep = 1; rep <= REPS; rep++) {
|
||
i++;
|
||
process.stdout.write(` [${i}/${total}] temp=${temp.toFixed(1)} rep=${rep}/${REPS} ${file.slice(-32).padStart(32)} ... `);
|
||
const r = await runOne(file, temp, rep);
|
||
results.push(r);
|
||
if (r.ok) {
|
||
console.log(`✓ resp=${r.response_chars} ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`);
|
||
} else {
|
||
console.log(`✗ ${r.error}`);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
console.log(`\n[pass3] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded`);
|
||
|
||
// Per-file × temp variance summary (response_chars stddev as a quick
|
||
// proxy for output instability).
|
||
console.log(`\n[pass3] response_chars variance (mean ± stddev) by file × temp:`);
|
||
console.log(` ${"file".padEnd(40)} ${TEMPS.map(t => `temp=${t.toFixed(1)}`.padStart(20)).join(" ")}`);
|
||
for (const file of FILES) {
|
||
const cells = TEMPS.map(t => {
|
||
const xs = results.filter(r => r.ok && r.file === file && r.temp === t).map(r => r.response_chars ?? 0);
|
||
if (xs.length === 0) return " — ";
|
||
const mean = xs.reduce((s, x) => s + x, 0) / xs.length;
|
||
const sd = Math.sqrt(xs.reduce((s, x) => s + Math.pow(x - mean, 2), 0) / xs.length);
|
||
return `${Math.round(mean).toString().padStart(7)} ± ${Math.round(sd).toString().padEnd(6)}`.padStart(20);
|
||
}).join(" ");
|
||
console.log(` ${file.slice(0, 40).padEnd(40)} ${cells}`);
|
||
}
|
||
|
||
console.log(`\n[pass3] grounding variance via: bun run scripts/mode_compare.ts (look for grounded-N column drift)`);
|
||
}
|
||
|
||
// Entry point: surface any unhandled failure and exit non-zero so callers/CI notice.
main().catch(e => { console.error(e); process.exit(1); });
|