Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
Pass 5 (5 reps × 4 conditions × 1 file on grok-4.1-fast) showed composing matrix corpora is anti-additive on strong models — composed lakehouse_arch + symbols LOST 5/5 head-to-head vs codereview_isolation (Δ −1.8 grounded findings, p=0.031). Default flips to isolation; matrix path now auto-downgrades when the resolved model is strong. Mode runner: - matrix_corpus is Vec<String> (string OR array via deserialize_string_or_vec) - top_k=6 from each corpus, merge by score, take top 8 globally - chunk tag prefers doc_id over source so reviewer sees [adr:009] vs [lakehouse_arch] - is_weak_model() gate auto-downgrades codereview_lakehouse → codereview_isolation for strong models (default-strong; weak = :free suffix or local last-resort) - LH_FORCE_FULL_ENRICHMENT=1 bypasses for diagnostic runs - EnrichmentSources.downgraded_from records when the gate fires Three corpora indexed via /vectors/index (5849 chunks total): - lakehouse_arch_v1 — ADRs + phases + PRD + scrum spec (93 docs, 2119 chunks) - scrum_findings_v1 — past scrum_reviews.jsonl (168 docs, 1260 chunks; EXCLUDED from defaults — 24% out-of-bounds line citations from cross-file drift) - lakehouse_symbols_v1 — regex-extracted pub items + /// docs (656 docs, 2470 chunks) Experiment infra: - scripts/build_*_corpus.ts — re-runnable when source content changes - scripts/mode_pass5_variance_paid.ts — N reps × M conditions on one file - scripts/mode_pass5_summarize.ts — mean ± σ + head-to-head, parser handles numbered + path-with-line + path-with-symbol finding tables - scripts/mode_compare.ts — groups by mode|corpus when sweeps span corpora - scripts/mode_experiment.ts — default model bumped to x-ai/grok-4.1-fast, --corpus flag for per-call override Decisions + open follow-ups: docs/MODE_RUNNER_TUNING_PLAN.md Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
136 lines
5.0 KiB
TypeScript
#!/usr/bin/env bun
/**
 * Mode experiment harness — sweeps a set of files through every native
 * mode, calling /v1/mode/execute serially. Results land in the
 * mode_experiments.jsonl that the gateway already writes (the runner
 * appends per-call). This script just orchestrates the calls.
 *
 * Usage:
 *   bun run scripts/mode_experiment.ts \
 *     --files crates/queryd/src/delta.rs,crates/queryd/src/service.rs \
 *     --modes codereview_lakehouse,codereview_null,codereview_isolation,codereview_matrix_only \
 *     --model openai/gpt-oss-120b:free
 *
 * Defaults: 5 modes × $LH_EXPERIMENT_FILES files (or 2 default targets) ×
 * one model. Cloud-quota-resilient — uses OpenRouter free model unless
 * --model overrides.
 */
// Gateway base URL; override with LH_GATEWAY when it is not on localhost:3100.
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
// task_class sent with every /v1/mode/execute request.
const TASK_CLASS = process.env.LH_EXPERIMENT_TASK ?? "scrum_review";

// All native modes the sweep exercises when --modes is not given.
// The file header's "5 modes" default refers to this list.
const ALL_MODES = [
  "codereview_lakehouse",
  "codereview_null",
  "codereview_isolation",
  "codereview_matrix_only",
  "codereview_playbook_only",
];

// Review targets used when no --files override is supplied.
const DEFAULT_FILES = [
  "crates/queryd/src/delta.rs",
  "crates/queryd/src/service.rs",
];
function parseArgs(): { files: string[]; modes: string[]; model: string; corpus: string[] } {
|
||
const args = Bun.argv.slice(2);
|
||
const out: Record<string, string> = {};
|
||
for (let i = 0; i < args.length; i++) {
|
||
const a = args[i];
|
||
if (a.startsWith("--")) out[a.slice(2)] = args[++i] ?? "";
|
||
}
|
||
const files = (out.files ?? DEFAULT_FILES.join(",")).split(",").map(s => s.trim()).filter(Boolean);
|
||
const modes = (out.modes ?? ALL_MODES.join(",")).split(",").map(s => s.trim()).filter(Boolean);
|
||
// Default to the paid OpenRouter primary (matches scrum_master_pipeline
|
||
// ladder rung 1). Pass `--model openai/gpt-oss-120b:free` if you want
|
||
// the old free-tier baseline. See SCRUM_MASTER_SPEC.md for the ladder.
|
||
const model = out.model ?? "x-ai/grok-4.1-fast";
|
||
const corpus = (out.corpus ?? "").split(",").map(s => s.trim()).filter(Boolean);
|
||
return { files, modes, model, corpus };
|
||
}
|
||
|
||
/** One flattened row per (file, mode) gateway call, built by runOne. */
interface RunResult {
  file: string;              // file_path sent in the request
  mode: string;              // force_mode sent in the request
  ok: boolean;               // false on a non-2xx response or a thrown fetch error
  latency_ms?: number;       // gateway-reported latency on success; wall-clock delta when the call throws
  response_chars?: number;   // length of the model's response text
  enriched_chars?: number;   // enriched_prompt_chars from the gateway reply
  bug_fingerprints?: number; // sources.bug_fingerprints_count from the reply
  matrix_kept?: number;      // sources.matrix_chunks_kept
  matrix_dropped?: number;   // sources.matrix_chunks_dropped
  error?: string;            // populated only when ok=false
}
async function runOne(file: string, mode: string, model: string, corpus: string[]): Promise<RunResult> {
|
||
const t0 = Date.now();
|
||
try {
|
||
const body: any = {
|
||
task_class: TASK_CLASS,
|
||
file_path: file,
|
||
force_mode: mode,
|
||
force_model: model,
|
||
};
|
||
if (corpus.length === 1) body.force_matrix_corpus = corpus[0];
|
||
else if (corpus.length > 1) body.force_matrix_corpus = corpus;
|
||
const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
|
||
method: "POST",
|
||
headers: { "content-type": "application/json" },
|
||
body: JSON.stringify(body),
|
||
signal: AbortSignal.timeout(240_000),
|
||
});
|
||
if (!r.ok) {
|
||
const body = await r.text().catch(() => "");
|
||
return { file, mode, ok: false, error: `HTTP ${r.status}: ${body.slice(0, 200)}` };
|
||
}
|
||
const j: any = await r.json();
|
||
return {
|
||
file, mode, ok: true,
|
||
latency_ms: j.latency_ms,
|
||
response_chars: (j.response ?? "").length,
|
||
enriched_chars: j.enriched_prompt_chars,
|
||
bug_fingerprints: j.sources?.bug_fingerprints_count,
|
||
matrix_kept: j.sources?.matrix_chunks_kept,
|
||
matrix_dropped: j.sources?.matrix_chunks_dropped,
|
||
};
|
||
} catch (e: any) {
|
||
return { file, mode, ok: false, error: e.message, latency_ms: Date.now() - t0 };
|
||
}
|
||
}
|
||
|
||
async function main() {
|
||
const { files, modes, model, corpus } = parseArgs();
|
||
console.log(`[experiment] files=${files.length} × modes=${modes.length} = ${files.length * modes.length} runs`);
|
||
console.log(`[experiment] model=${model} task=${TASK_CLASS} gateway=${GATEWAY}`);
|
||
if (corpus.length > 0) console.log(`[experiment] corpus override: ${corpus.join(" + ")}`);
|
||
console.log("");
|
||
|
||
const results: RunResult[] = [];
|
||
let i = 0;
|
||
for (const file of files) {
|
||
for (const mode of modes) {
|
||
i++;
|
||
process.stdout.write(` [${i}/${files.length * modes.length}] ${mode.padEnd(28)} ${file} ... `);
|
||
const r = await runOne(file, mode, model, corpus);
|
||
results.push(r);
|
||
if (r.ok) {
|
||
console.log(
|
||
`✓ ${(r.response_chars ?? 0).toString().padStart(5)} chars | ` +
|
||
`prompt ${(r.enriched_chars ?? 0).toString().padStart(5)} chars | ` +
|
||
`${((r.latency_ms ?? 0) / 1000).toFixed(1).padStart(5)}s | ` +
|
||
`bug=${r.bug_fingerprints ?? "-"} mtx=${r.matrix_kept ?? 0}/${(r.matrix_kept ?? 0) + (r.matrix_dropped ?? 0)}`
|
||
);
|
||
} else {
|
||
console.log(`✗ ${r.error}`);
|
||
}
|
||
}
|
||
}
|
||
|
||
console.log("");
|
||
console.log(`[experiment] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded`);
|
||
console.log(`[experiment] full per-call detail in data/_kb/mode_experiments.jsonl`);
|
||
console.log(`[experiment] aggregate with: bun run scripts/mode_compare.ts`);
|
||
}
|
||
|
||
main().catch(e => { console.error(e); process.exit(1); });
|