Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
Pass 5 (5 reps × 4 conditions × 1 file on grok-4.1-fast) showed composing matrix corpora is anti-additive on strong models — composed lakehouse_arch + symbols LOST 5/5 head-to-head vs codereview_isolation (Δ −1.8 grounded findings, p=0.031). Default flips to isolation; matrix path now auto- downgrades when the resolved model is strong. Mode runner: - matrix_corpus is Vec<String> (string OR array via deserialize_string_or_vec) - top_k=6 from each corpus, merge by score, take top 8 globally - chunk tag prefers doc_id over source so reviewer sees [adr:009] vs [lakehouse_arch] - is_weak_model() gate auto-downgrades codereview_lakehouse → codereview_isolation for strong models (default-strong; weak = :free suffix or local last-resort) - LH_FORCE_FULL_ENRICHMENT=1 bypasses for diagnostic runs - EnrichmentSources.downgraded_from records when the gate fires Three corpora indexed via /vectors/index (5849 chunks total): - lakehouse_arch_v1 — ADRs + phases + PRD + scrum spec (93 docs, 2119 chunks) - scrum_findings_v1 — past scrum_reviews.jsonl (168 docs, 1260 chunks; EXCLUDED from defaults — 24% out-of-bounds line citations from cross-file drift) - lakehouse_symbols_v1 — regex-extracted pub items + /// docs (656 docs, 2470 chunks) Experiment infra: - scripts/build_*_corpus.ts — re-runnable when source content changes - scripts/mode_pass5_variance_paid.ts — N reps × M conditions on one file - scripts/mode_pass5_summarize.ts — mean ± σ + head-to-head, parser handles numbered + path-with-line + path-with-symbol finding tables - scripts/mode_compare.ts — groups by mode|corpus when sweeps span corpora - scripts/mode_experiment.ts — default model bumped to x-ai/grok-4.1-fast, --corpus flag for per-call override Decisions + open follow-ups: docs/MODE_RUNNER_TUNING_PLAN.md Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
97 lines
3.5 KiB
TypeScript
#!/usr/bin/env bun
/**
 * Pass 5: variance test for the 2026-04-26 paid-model bake-off.
 *
 * The pass-4 single-rep sweep showed isolation beating every matrix
 * condition by 1.0-1.4 grounded findings/file on grok-4.1-fast. This
 * harness runs N reps × M conditions on the file where the effect was
 * sharpest (pathway_memory.rs, 1355 lines) so we can decide whether
 * the deltas are real signal or run-to-run noise.
 *
 * Conditions:
 *   1. codereview_isolation — no matrix
 *   2. codereview_lakehouse + corpus=lakehouse_arch_v1 — A only
 *   3. codereview_lakehouse + corpus=lakehouse_symbols_v1 — C only
 *   4. codereview_lakehouse (modes.toml default) — A+C composed
 *
 * Output appends per-call to data/_kb/mode_experiments.jsonl. Aggregate
 * with `bun run scripts/mode_compare.ts --since <ts>` and read the
 * grounded column with multiple rows per (mode|corpus) key.
 *
 * Usage:
 *   bun run scripts/mode_pass5_variance_paid.ts
 *   LH_REPS=3 LH_FILE=crates/queryd/src/delta.rs bun run scripts/mode_pass5_variance_paid.ts
 */
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
|
||
const MODEL = process.env.LH_MODEL ?? "x-ai/grok-4.1-fast";
|
||
const FILE = process.env.LH_FILE ?? "crates/vectord/src/pathway_memory.rs";
|
||
const REPS = Number(process.env.LH_REPS ?? 5);
|
||
|
||
interface Condition {
|
||
label: string;
|
||
mode: string;
|
||
corpus?: string | string[];
|
||
}
|
||
|
||
const CONDITIONS: Condition[] = [
|
||
{ label: "isolation ", mode: "codereview_isolation" },
|
||
{ label: "arch_only ", mode: "codereview_lakehouse", corpus: "lakehouse_arch_v1" },
|
||
{ label: "symbols_only ", mode: "codereview_lakehouse", corpus: "lakehouse_symbols_v1" },
|
||
{ label: "composed (A+C) ", mode: "codereview_lakehouse" /* uses modes.toml default */ },
|
||
];
|
||
|
||
async function runOne(c: Condition, rep: number): Promise<{ ok: boolean; latency_ms?: number; resp_chars?: number; error?: string }> {
|
||
const body: any = {
|
||
task_class: "scrum_review",
|
||
file_path: FILE,
|
||
force_mode: c.mode,
|
||
force_model: MODEL,
|
||
};
|
||
if (c.corpus !== undefined) body.force_matrix_corpus = c.corpus;
|
||
|
||
try {
|
||
const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
|
||
method: "POST",
|
||
headers: { "content-type": "application/json" },
|
||
body: JSON.stringify(body),
|
||
signal: AbortSignal.timeout(240_000),
|
||
});
|
||
if (!r.ok) {
|
||
const txt = await r.text().catch(() => "");
|
||
return { ok: false, error: `HTTP ${r.status}: ${txt.slice(0, 160)}` };
|
||
}
|
||
const j: any = await r.json();
|
||
return { ok: true, latency_ms: j.latency_ms, resp_chars: (j.response ?? "").length };
|
||
} catch (e: any) {
|
||
return { ok: false, error: e.message };
|
||
}
|
||
}
|
||
|
||
async function main() {
|
||
const total = CONDITIONS.length * REPS;
|
||
console.log(`[pass5] file=${FILE}`);
|
||
console.log(`[pass5] model=${MODEL} · ${CONDITIONS.length} conditions × ${REPS} reps = ${total} runs`);
|
||
console.log("");
|
||
|
||
let i = 0;
|
||
const startTs = new Date().toISOString();
|
||
for (let rep = 1; rep <= REPS; rep++) {
|
||
for (const c of CONDITIONS) {
|
||
i++;
|
||
process.stdout.write(` [${i}/${total}] rep=${rep} ${c.label}... `);
|
||
const r = await runOne(c, rep);
|
||
if (r.ok) {
|
||
console.log(`✓ ${r.resp_chars} chars · ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`);
|
||
} else {
|
||
console.log(`✗ ${r.error}`);
|
||
}
|
||
}
|
||
}
|
||
|
||
console.log(`\n[pass5] complete · started ${startTs}`);
|
||
console.log(`[pass5] aggregate: bun run scripts/mode_compare.ts --since ${startTs}`);
|
||
}
|
||
|
||
main().catch(e => { console.error(e); process.exit(1); });
|