lakehouse/scripts/mode_experiment.ts
root 2dbc8dbc83
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
v1/mode: model-aware enrichment downgrade + 3 corpora + variance harness
Pass 5 (5 reps × 4 conditions × 1 file on grok-4.1-fast) showed that composing
matrix corpora is anti-additive on strong models — composed lakehouse_arch
+ symbols LOST 5/5 head-to-head vs codereview_isolation (Δ −1.8 grounded
findings, p=0.031). Default flips to isolation; matrix path now auto-
downgrades when the resolved model is strong.

Mode runner:
- matrix_corpus is Vec<String> (string OR array via deserialize_string_or_vec)
- top_k=6 from each corpus, merge by score, take top 8 globally
- chunk tag prefers doc_id over source so reviewer sees [adr:009] vs [lakehouse_arch]
- is_weak_model() gate auto-downgrades codereview_lakehouse → codereview_isolation
  for strong models (default-strong; weak = :free suffix or local last-resort)
- LH_FORCE_FULL_ENRICHMENT=1 bypasses for diagnostic runs
- EnrichmentSources.downgraded_from records when the gate fires

Three corpora indexed via /vectors/index (5849 chunks total):
- lakehouse_arch_v1 — ADRs + phases + PRD + scrum spec (93 docs, 2119 chunks)
- scrum_findings_v1 — past scrum_reviews.jsonl (168 docs, 1260 chunks; EXCLUDED
  from defaults — 24% out-of-bounds line citations from cross-file drift)
- lakehouse_symbols_v1 — regex-extracted pub items + /// docs (656 docs, 2470 chunks)

Experiment infra:
- scripts/build_*_corpus.ts — re-runnable when source content changes
- scripts/mode_pass5_variance_paid.ts — N reps × M conditions on one file
- scripts/mode_pass5_summarize.ts — mean ± σ + head-to-head, parser handles
  numbered + path-with-line + path-with-symbol finding tables
- scripts/mode_compare.ts — groups by mode|corpus when sweeps span corpora
- scripts/mode_experiment.ts — default model bumped to x-ai/grok-4.1-fast,
  --corpus flag for per-call override

Decisions + open follow-ups: docs/MODE_RUNNER_TUNING_PLAN.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 17:29:17 -05:00

136 lines
5.0 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bun
/**
* Mode experiment harness — sweeps a set of files through every native
* mode, calling /v1/mode/execute serially. Results land in the
* mode_experiments.jsonl that the gateway already writes (the runner
* appends per-call). This script just orchestrates the calls.
*
* Usage:
* bun run scripts/mode_experiment.ts \
* --files crates/queryd/src/delta.rs,crates/queryd/src/service.rs \
* --modes codereview_lakehouse,codereview_null,codereview_isolation,codereview_matrix_only \
* --model openai/gpt-oss-120b:free
*
* Defaults: 5 modes × $LH_EXPERIMENT_FILES files (or 2 default targets) ×
* one model. Cloud-quota-resilient — uses OpenRouter free model unless
* --model overrides.
*/
// Base URL of the mode gateway (POST /v1/mode/execute); override with LH_GATEWAY.
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
// task_class sent on every call; override with LH_EXPERIMENT_TASK.
const TASK_CLASS = process.env.LH_EXPERIMENT_TASK ?? "scrum_review";
// Every native mode the sweep covers when --modes is not passed.
const ALL_MODES = [
"codereview_lakehouse",
"codereview_null",
"codereview_isolation",
"codereview_matrix_only",
"codereview_playbook_only",
];
// Fallback target files used when --files is not passed.
// NOTE(review): the header comment also promises $LH_EXPERIMENT_FILES as a
// source, but parseArgs never reads it — confirm intended behavior.
const DEFAULT_FILES = [
"crates/queryd/src/delta.rs",
"crates/queryd/src/service.rs",
];
/**
 * Parse `--files`, `--modes`, `--model`, `--corpus` from Bun.argv.
 *
 * All list-valued flags are comma-separated; blanks are trimmed and dropped.
 * Fixes vs. the previous version:
 *  - honors $LH_EXPERIMENT_FILES as the file source when --files is absent,
 *    matching the contract documented in the header comment;
 *  - a flag immediately followed by another flag no longer swallows it as
 *    its value (`--files --modes x` previously set files to "--modes").
 *
 * @returns resolved files, modes, model id, and optional corpus override.
 */
function parseArgs(): { files: string[]; modes: string[]; model: string; corpus: string[] } {
  const args = Bun.argv.slice(2);
  const out: Record<string, string> = {};
  for (let i = 0; i < args.length; i++) {
    const a = args[i];
    if (!a.startsWith("--")) continue;
    const next = args[i + 1];
    if (next !== undefined && !next.startsWith("--")) {
      out[a.slice(2)] = next;
      i++; // value consumed
    } else {
      out[a.slice(2)] = ""; // bare flag, or flag followed by another flag
    }
  }
  // Comma-separated list → trimmed, non-empty entries.
  const splitCsv = (s: string) => s.split(",").map(t => t.trim()).filter(Boolean);
  const files = splitCsv(out.files ?? process.env.LH_EXPERIMENT_FILES ?? DEFAULT_FILES.join(","));
  const modes = splitCsv(out.modes ?? ALL_MODES.join(","));
  // Default to the paid OpenRouter primary (matches scrum_master_pipeline
  // ladder rung 1). Pass `--model openai/gpt-oss-120b:free` if you want
  // the old free-tier baseline. See SCRUM_MASTER_SPEC.md for the ladder.
  const model = out.model ?? "x-ai/grok-4.1-fast";
  const corpus = splitCsv(out.corpus ?? "");
  return { files, modes, model, corpus };
}
// Per-call summary of one /v1/mode/execute invocation, printed by main()
// (the gateway itself persists the full per-call record — see header comment).
interface RunResult {
file: string;                // target file path sent as file_path
mode: string;                // mode sent as force_mode
ok: boolean;                 // true iff the HTTP call succeeded
latency_ms?: number;         // gateway-reported latency on success
response_chars?: number;     // length of the model response text
enriched_chars?: number;     // gateway-reported enriched prompt size
bug_fingerprints?: number;   // sources.bug_fingerprints_count, if present
matrix_kept?: number;        // sources.matrix_chunks_kept, if present
matrix_dropped?: number;     // sources.matrix_chunks_dropped, if present
error?: string;              // populated only when ok === false
}
/**
 * Execute one (file, mode) pair against the gateway and summarize the result.
 *
 * Fixes vs. the previous version: the HTTP-error path now reports
 * `latency_ms` like the exception path already did; the inner response-text
 * variable no longer shadows the request `body`; the catch clause narrows
 * `unknown` instead of assuming `any`.
 *
 * @param file   repo-relative path sent as file_path
 * @param mode   mode name sent as force_mode
 * @param model  model id sent as force_model
 * @param corpus 0, 1, or N corpus names; forwarded as string or array
 * @returns a RunResult; never throws (failures land in `error`).
 */
async function runOne(file: string, mode: string, model: string, corpus: string[]): Promise<RunResult> {
  const t0 = Date.now();
  try {
    const body: Record<string, unknown> = {
      task_class: TASK_CLASS,
      file_path: file,
      force_mode: mode,
      force_model: model,
    };
    // Gateway accepts either a single corpus string or an array.
    if (corpus.length === 1) body.force_matrix_corpus = corpus[0];
    else if (corpus.length > 1) body.force_matrix_corpus = corpus;
    const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify(body),
      signal: AbortSignal.timeout(240_000), // 4-minute hard cap per call
    });
    if (!r.ok) {
      const errText = await r.text().catch(() => "");
      return {
        file, mode, ok: false,
        error: `HTTP ${r.status}: ${errText.slice(0, 200)}`,
        latency_ms: Date.now() - t0,
      };
    }
    // Minimal shape of the gateway response fields we actually read.
    const j = (await r.json()) as {
      latency_ms?: number;
      response?: string;
      enriched_prompt_chars?: number;
      sources?: {
        bug_fingerprints_count?: number;
        matrix_chunks_kept?: number;
        matrix_chunks_dropped?: number;
      };
    };
    return {
      file, mode, ok: true,
      latency_ms: j.latency_ms,
      response_chars: (j.response ?? "").length,
      enriched_chars: j.enriched_prompt_chars,
      bug_fingerprints: j.sources?.bug_fingerprints_count,
      matrix_kept: j.sources?.matrix_chunks_kept,
      matrix_dropped: j.sources?.matrix_chunks_dropped,
    };
  } catch (e: unknown) {
    const msg = e instanceof Error ? e.message : String(e);
    return { file, mode, ok: false, error: msg, latency_ms: Date.now() - t0 };
  }
}
/**
 * Entry point: sweep every (file, mode) pair serially through runOne,
 * printing a one-line summary per call and a final success tally.
 * Per-call detail is persisted by the gateway itself (see header comment).
 */
async function main() {
  const { files, modes, model, corpus } = parseArgs();
  const total = files.length * modes.length;

  console.log(`[experiment] files=${files.length} × modes=${modes.length} = ${total} runs`);
  console.log(`[experiment] model=${model} task=${TASK_CLASS} gateway=${GATEWAY}`);
  if (corpus.length > 0) console.log(`[experiment] corpus override: ${corpus.join(" + ")}`);
  console.log("");

  const results: RunResult[] = [];
  let done = 0;
  for (const file of files) {
    for (const mode of modes) {
      done += 1;
      process.stdout.write(` [${done}/${total}] ${mode.padEnd(28)} ${file} ... `);
      const res = await runOne(file, mode, model, corpus);
      results.push(res);
      if (!res.ok) {
        console.log(`${res.error}`);
        continue;
      }
      const kept = res.matrix_kept ?? 0;
      const seen = kept + (res.matrix_dropped ?? 0);
      const summary = [
        `${(res.response_chars ?? 0).toString().padStart(5)} chars`,
        `prompt ${(res.enriched_chars ?? 0).toString().padStart(5)} chars`,
        `${((res.latency_ms ?? 0) / 1000).toFixed(1).padStart(5)}s`,
        `bug=${res.bug_fingerprints ?? "-"} mtx=${kept}/${seen}`,
      ];
      console.log(summary.join(" | "));
    }
  }

  const okCount = results.filter(r => r.ok).length;
  console.log("");
  console.log(`[experiment] complete · ${okCount}/${results.length} succeeded`);
  console.log(`[experiment] full per-call detail in data/_kb/mode_experiments.jsonl`);
  console.log(`[experiment] aggregate with: bun run scripts/mode_compare.ts`);
}

main().catch(e => { console.error(e); process.exit(1); });