J's directive (2026-04-26): "Create different modes so we can really
dial in the architecture before it gets further along — pinpoint the
failures and strengths equally so I know what direction to go in.
Loop theater happens when we don't pinpoint the most accurate path."
Refactored execute() to switch on mode name → EnrichmentFlags preset.
Five native modes designed as deliberate experiments — each isolates
one architectural axis so the comparison matrix reads off what's
doing work vs what's adding latency for nothing (a hypothetical
sketch of the preset dispatch follows the list):
codereview_lakehouse — all enrichment on (ceiling)
codereview_null — raw file + generic prompt (baseline)
codereview_isolation — file + pathway only (no matrix)
codereview_matrix_only — file + matrix only (no pathway)
codereview_playbook_only — pathway only, NO file content (lossy ceiling)
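A minimal sketch of what that mode → preset dispatch could look like;
the EnrichmentFlags field names are assumptions, and only the per-mode
on/off pattern comes from the descriptions above:

interface EnrichmentFlags {
  includeFile: boolean;        // raw file content in the prompt
  includePathway: boolean;     // pathway memory
  includeMatrix: boolean;      // matrix corpus chunks
  adversarialFraming: boolean; // review framing vs the generic prompt
}

const MODE_PRESETS: Record<string, EnrichmentFlags> = {
  codereview_lakehouse:     { includeFile: true,  includePathway: true,  includeMatrix: true,  adversarialFraming: true },
  codereview_null:          { includeFile: true,  includePathway: false, includeMatrix: false, adversarialFraming: false },
  codereview_isolation:     { includeFile: true,  includePathway: true,  includeMatrix: false, adversarialFraming: true },
  codereview_matrix_only:   { includeFile: true,  includePathway: false, includeMatrix: true,  adversarialFraming: true },
  codereview_playbook_only: { includeFile: false, includePathway: true,  includeMatrix: false, adversarialFraming: true },
};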
Each call appends a row to data/_kb/mode_experiments.jsonl with full
sources + response. LH_MODE_LOG_OFF=1 to suppress.
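For reference, a hypothetical shape for those rows; any field beyond
mode, file, sources, and response is a guess, not the gateway's actual
schema:

import { appendFileSync } from "node:fs";

interface ModeExperimentRow {
  ts: string;                       // ISO timestamp (assumed field)
  mode: string;
  file_path: string;
  task_class: string;
  latency_ms: number;
  sources: Record<string, unknown>; // full enrichment sources
  response: string;                 // full model response
}

function logExperiment(row: ModeExperimentRow): void {
  if (process.env.LH_MODE_LOG_OFF === "1") return; // suppression switch
  appendFileSync("data/_kb/mode_experiments.jsonl", JSON.stringify(row) + "\n");
}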
scripts/mode_experiment.ts — sweeps files × modes serially, prints
live progress with per-call enrichment stats. Defaults to OpenRouter
free model so cloud quota doesn't gate experiments.
scripts/mode_compare.ts — reads the JSONL and outputs a per-file
matrix, a per-mode aggregate, and mode-vs-baseline win/loss with the
average finding delta. Findings are counted heuristically from
markdown table rows; pathway citations from preamble references.
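Roughly what those two heuristics could look like (the exact patterns
below are assumptions, not the shipped code):

// Each markdown table contributes one header row and one separator row;
// every other pipe-delimited line is counted as a finding.
function countFindings(md: string): number {
  const lines = md.split("\n");
  const pipeRows = lines.filter(l => /^\s*\|.*\|\s*$/.test(l)).length;
  const separators = lines.filter(l => /^\s*\|(\s*:?-+:?\s*\|)+\s*$/.test(l)).length;
  return Math.max(0, pipeRows - 2 * separators);
}

// Pathway citations are counted only in the preamble, i.e. everything
// before the first table row.
function countPathwayCitations(md: string): number {
  const lines = md.split("\n");
  const firstRow = lines.findIndex(l => l.trim().startsWith("|"));
  const preamble = (firstRow === -1 ? lines : lines.slice(0, firstRow)).join("\n");
  return (preamble.match(/pathway/gi) ?? []).length;
}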
scrum_master_pipeline.ts gets a mode-runner fast path gated by
LH_USE_MODE_RUNNER=1: try /v1/mode/execute first, and fall through
to the existing ladder if the response is shorter than
LH_MODE_MIN_CHARS (default 2000) or anything errors. Off by default
until A/B-validated.
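A sketch of that gate under assumed helper names; callModeRunner and
runExistingLadder are illustrative stand-ins, not the pipeline's real
functions:

const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const MODE_MIN_CHARS = Number(process.env.LH_MODE_MIN_CHARS ?? 2000);

// Stand-in for the current review path, which stays unchanged.
declare function runExistingLadder(file: string): Promise<string>;

async function callModeRunner(file: string): Promise<string> {
  const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({ task_class: "scrum_review", file_path: file }),
  });
  if (!r.ok) throw new Error(`HTTP ${r.status}`);
  const j: any = await r.json();
  return j.response ?? "";
}

async function reviewFile(file: string): Promise<string> {
  if (process.env.LH_USE_MODE_RUNNER === "1") {
    try {
      const resp = await callModeRunner(file);
      if (resp.length >= MODE_MIN_CHARS) return resp; // fast path wins
      // response too short, fall through to the ladder
    } catch {
      // any error, fall through to the ladder
    }
  }
  return runExistingLadder(file);
}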
First experiment results (2 files × 5 modes via gpt-oss-120b:free):
- codereview_null produces a 12.6 KB response with ZERO findings
  (proves adversarial framing is load-bearing)
- codereview_playbook_only produces MORE findings than lakehouse
  on average (12 vs 9) at 73% of the latency — pathway memory is
the dominant signal driver
- codereview_matrix_only underperforms isolation by ~0.5 findings
  at the same latency — the matrix corpus is likely underperforming
  for the scrum_review task class
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
scripts/mode_experiment.ts — 128 lines · 4.4 KiB · TypeScript
#!/usr/bin/env bun
/**
 * Mode experiment harness — sweeps a set of files through every native
 * mode, calling /v1/mode/execute serially. Results land in the
 * mode_experiments.jsonl that the gateway already writes (the runner
 * appends per-call). This script just orchestrates the calls.
 *
 * Usage:
 *   bun run scripts/mode_experiment.ts \
 *     --files crates/queryd/src/delta.rs,crates/queryd/src/service.rs \
 *     --modes codereview_lakehouse,codereview_null,codereview_isolation,codereview_matrix_only \
 *     --model openai/gpt-oss-120b:free
 *
 * Defaults: 5 modes × $LH_EXPERIMENT_FILES files (or 2 default targets) ×
 * one model. Cloud-quota-resilient — uses OpenRouter free model unless
 * --model overrides.
 */

const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const TASK_CLASS = process.env.LH_EXPERIMENT_TASK ?? "scrum_review";

const ALL_MODES = [
  "codereview_lakehouse",
  "codereview_null",
  "codereview_isolation",
  "codereview_matrix_only",
  "codereview_playbook_only",
];

const DEFAULT_FILES = [
  "crates/queryd/src/delta.rs",
  "crates/queryd/src/service.rs",
];

function parseArgs(): { files: string[]; modes: string[]; model: string } {
  const args = Bun.argv.slice(2);
  const out: Record<string, string> = {};
  for (let i = 0; i < args.length; i++) {
    const a = args[i];
    if (a.startsWith("--")) out[a.slice(2)] = args[++i] ?? "";
  }
  const files = (out.files ?? DEFAULT_FILES.join(",")).split(",").map(s => s.trim()).filter(Boolean);
  const modes = (out.modes ?? ALL_MODES.join(",")).split(",").map(s => s.trim()).filter(Boolean);
  const model = out.model ?? "openai/gpt-oss-120b:free";
  return { files, modes, model };
}

interface RunResult {
  file: string;
  mode: string;
  ok: boolean;
  latency_ms?: number;
  response_chars?: number;
  enriched_chars?: number;
  bug_fingerprints?: number;
  matrix_kept?: number;
  matrix_dropped?: number;
  error?: string;
}

async function runOne(file: string, mode: string, model: string): Promise<RunResult> {
  const t0 = Date.now();
  try {
    const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        task_class: TASK_CLASS,
        file_path: file,
        force_mode: mode,
        force_model: model,
      }),
      signal: AbortSignal.timeout(180_000),
    });
    if (!r.ok) {
      const body = await r.text().catch(() => "");
      return { file, mode, ok: false, error: `HTTP ${r.status}: ${body.slice(0, 200)}` };
    }
    const j: any = await r.json();
    return {
      file, mode, ok: true,
      latency_ms: j.latency_ms,
      response_chars: (j.response ?? "").length,
      enriched_chars: j.enriched_prompt_chars,
      bug_fingerprints: j.sources?.bug_fingerprints_count,
      matrix_kept: j.sources?.matrix_chunks_kept,
      matrix_dropped: j.sources?.matrix_chunks_dropped,
    };
  } catch (e: any) {
    return { file, mode, ok: false, error: e.message, latency_ms: Date.now() - t0 };
  }
}

async function main() {
  const { files, modes, model } = parseArgs();
  console.log(`[experiment] files=${files.length} × modes=${modes.length} = ${files.length * modes.length} runs`);
  console.log(`[experiment] model=${model} task=${TASK_CLASS} gateway=${GATEWAY}`);
  console.log("");

  const results: RunResult[] = [];
  let i = 0;
  for (const file of files) {
    for (const mode of modes) {
      i++;
      process.stdout.write(`  [${i}/${files.length * modes.length}] ${mode.padEnd(28)} ${file} ... `);
      const r = await runOne(file, mode, model);
      results.push(r);
      if (r.ok) {
        console.log(
          `✓ ${(r.response_chars ?? 0).toString().padStart(5)} chars | ` +
          `prompt ${(r.enriched_chars ?? 0).toString().padStart(5)} chars | ` +
          `${((r.latency_ms ?? 0) / 1000).toFixed(1).padStart(5)}s | ` +
          `bug=${r.bug_fingerprints ?? "-"} mtx=${r.matrix_kept ?? 0}/${(r.matrix_kept ?? 0) + (r.matrix_dropped ?? 0)}`
        );
      } else {
        console.log(`✗ ${r.error}`);
      }
    }
  }

  console.log("");
  console.log(`[experiment] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded`);
  console.log(`[experiment] full per-call detail in data/_kb/mode_experiments.jsonl`);
  console.log(`[experiment] aggregate with: bun run scripts/mode_compare.ts`);
}

main().catch(e => { console.error(e); process.exit(1); });