lakehouse/scripts/mode_experiment.ts
root 7c47734287
v1/mode: parameterized runner + 5 enrichment-experiment modes
J's directive (2026-04-26): "Create different modes so we can really
dial in the architecture before it gets further along — pinpoint the
failures and strengths equally so I know what direction to go in.
Loop theater happens when we don't pinpoint the most accurate path."

Refactored execute() to switch on the mode name and apply the
matching EnrichmentFlags preset (sketched after the mode list below).
Five native modes are designed as deliberate experiments — each
isolates one architectural axis, so the comparison matrix reads off
what's doing work vs. what's adding latency for nothing:

  codereview_lakehouse     — all enrichment on (ceiling)
  codereview_null          — raw file + generic prompt (baseline)
  codereview_isolation     — file + pathway only (no matrix)
  codereview_matrix_only   — file + matrix only (no pathway)
  codereview_playbook_only — pathway only, NO file content (lossy ceiling)
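
A minimal sketch of the mode → preset switch (the flag names are
illustrative, not the real EnrichmentFlags fields; codereview_null
additionally swaps in the generic prompt):

  // Illustrative flag names; only the mode strings come from this change.
  interface EnrichmentFlags { file: boolean; pathway: boolean; matrix: boolean }

  function presetFor(mode: string): EnrichmentFlags {
    switch (mode) {
      case "codereview_lakehouse":     return { file: true,  pathway: true,  matrix: true  };
      case "codereview_null":          return { file: true,  pathway: false, matrix: false }; // + generic prompt
      case "codereview_isolation":     return { file: true,  pathway: true,  matrix: false };
      case "codereview_matrix_only":   return { file: true,  pathway: false, matrix: true  };
      case "codereview_playbook_only": return { file: false, pathway: true,  matrix: false };
      default: throw new Error(`unknown mode: ${mode}`);
    }
  }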

Each call appends a row to data/_kb/mode_experiments.jsonl with full
sources + response. LH_MODE_LOG_OFF=1 to suppress.
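
A minimal sketch of the per-call append, with row keys inferred from
the fields /v1/mode/execute returns (latency_ms, enriched_prompt_chars,
sources, response); the gateway's exact row shape may differ:

  import { appendFileSync } from "node:fs";

  // Sketch only: key names inferred from what the response exposes.
  function logModeCall(row: {
    mode: string; task_class: string; file_path: string;
    latency_ms: number; enriched_prompt_chars: number;
    sources: unknown; response: string;
  }) {
    if (process.env.LH_MODE_LOG_OFF === "1") return; // suppression switch
    appendFileSync(
      "data/_kb/mode_experiments.jsonl",
      JSON.stringify({ ts: new Date().toISOString(), ...row }) + "\n",
    );
  }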

scripts/mode_experiment.ts — sweeps files × modes serially, printing
live progress with per-call enrichment stats. Defaults to an
OpenRouter free model so cloud quota doesn't gate experiments.

scripts/mode_compare.ts — reads the JSONL and outputs a per-file
matrix, a per-mode aggregate, and mode-vs-baseline win/loss with the
average finding delta. Finding counts are a heuristic over markdown
table rows; pathway citations are counted from preamble references.
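
Roughly what those heuristics could look like (both patterns are
assumptions; mode_compare.ts's real regexes may differ):

  // Findings ≈ markdown table data rows: total pipe-rows minus one
  // header and one |---| divider per table.
  function countFindings(md: string): number {
    const rows = md.split("\n").map(l => l.trim())
      .filter(l => l.startsWith("|") && l.endsWith("|"));
    const dividers = rows.filter(l => /^\|(\s*:?-+:?\s*\|)+$/.test(l)).length;
    return Math.max(0, rows.length - 2 * dividers); // each divider implies a header
  }

  // Pathway citations ≈ bracketed references in the preamble; tag format assumed.
  const countCitations = (preamble: string) =>
    (preamble.match(/\[pathway:[^\]]+\]/g) ?? []).length;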

scrum_master_pipeline.ts gets a mode-runner fast path gated by
LH_USE_MODE_RUNNER=1: try /v1/mode/execute first, and fall through to
the existing ladder if the response is shorter than LH_MODE_MIN_CHARS
(default 2000) or anything errors. Off by default until A/B-validated.
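
A sketch of the gate, assuming the existing ladder is passed in as a
callback; only the env vars and the endpoint come from this change:

  const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";

  async function reviewWithFastPath(
    file: string,
    ladder: (f: string) => Promise<string>, // the current review path
  ): Promise<string> {
    if (process.env.LH_USE_MODE_RUNNER === "1") {
      try {
        const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
          method: "POST",
          headers: { "content-type": "application/json" },
          body: JSON.stringify({ task_class: "scrum_review", file_path: file }),
        });
        const j: any = await r.json();
        const min = Number(process.env.LH_MODE_MIN_CHARS ?? 2000);
        if (r.ok && (j.response ?? "").length >= min) return j.response;
      } catch {
        // any error falls through to the ladder
      }
    }
    return ladder(file);
  }

Keeping the ladder reachable on every failure path is what makes the
flag safe to flip before the A/B data is in.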

First experiment results (2 files × 5 modes via gpt-oss-120b:free):
  - codereview_null produces 12.6KB response with ZERO findings
    (proves adversarial framing is load-bearing)
  - codereview_playbook_only produces MORE findings than lakehouse
    on average (12 vs 9) at 73% the latency — pathway memory is
    the dominant signal driver
  - codereview_matrix_only underperforms isolation by ~0.5 findings
    at the same latency — the matrix corpus is likely a poor fit for
    the scrum_review task class

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 01:36:42 -05:00


#!/usr/bin/env bun
/**
 * Mode experiment harness — sweeps a set of files through every native
 * mode, calling /v1/mode/execute serially. Results land in the
 * mode_experiments.jsonl that the gateway already writes (the runner
 * appends per-call). This script just orchestrates the calls.
 *
 * Usage:
 *   bun run scripts/mode_experiment.ts \
 *     --files crates/queryd/src/delta.rs,crates/queryd/src/service.rs \
 *     --modes codereview_lakehouse,codereview_null,codereview_isolation,codereview_matrix_only \
 *     --model openai/gpt-oss-120b:free
 *
 * Defaults: 5 modes × $LH_EXPERIMENT_FILES files (or 2 default targets) ×
 * one model. Cloud-quota-resilient — uses OpenRouter free model unless
 * --model overrides.
 */
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const TASK_CLASS = process.env.LH_EXPERIMENT_TASK ?? "scrum_review";
const ALL_MODES = [
  "codereview_lakehouse",
  "codereview_null",
  "codereview_isolation",
  "codereview_matrix_only",
  "codereview_playbook_only",
];

const DEFAULT_FILES = [
  "crates/queryd/src/delta.rs",
  "crates/queryd/src/service.rs",
];

// Tiny --flag value parser: "--files a,b --modes x" → { files, modes, model }.
function parseArgs(): { files: string[]; modes: string[]; model: string } {
  const args = Bun.argv.slice(2);
  const out: Record<string, string> = {};
  for (let i = 0; i < args.length; i++) {
    const a = args[i];
    if (a.startsWith("--")) out[a.slice(2)] = args[++i] ?? "";
  }
  const files = (out.files ?? DEFAULT_FILES.join(",")).split(",").map(s => s.trim()).filter(Boolean);
  const modes = (out.modes ?? ALL_MODES.join(",")).split(",").map(s => s.trim()).filter(Boolean);
  const model = out.model ?? "openai/gpt-oss-120b:free";
  return { files, modes, model };
}

// Per-call stats distilled from the gateway response; the JSONL row keeps full detail.
interface RunResult {
  file: string;
  mode: string;
  ok: boolean;
  latency_ms?: number;
  response_chars?: number;
  enriched_chars?: number;
  bug_fingerprints?: number;
  matrix_kept?: number;
  matrix_dropped?: number;
  error?: string;
}

// Execute one file × mode pair against the gateway and summarize the result.
async function runOne(file: string, mode: string, model: string): Promise<RunResult> {
  const t0 = Date.now();
  try {
    const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        task_class: TASK_CLASS,
        file_path: file,
        force_mode: mode,
        force_model: model,
      }),
      signal: AbortSignal.timeout(180_000), // 3-minute cap per call
    });
    if (!r.ok) {
      const body = await r.text().catch(() => "");
      return { file, mode, ok: false, error: `HTTP ${r.status}: ${body.slice(0, 200)}` };
    }
    const j: any = await r.json();
    return {
      file, mode, ok: true,
      latency_ms: j.latency_ms,
      response_chars: (j.response ?? "").length,
      enriched_chars: j.enriched_prompt_chars,
      bug_fingerprints: j.sources?.bug_fingerprints_count,
      matrix_kept: j.sources?.matrix_chunks_kept,
      matrix_dropped: j.sources?.matrix_chunks_dropped,
    };
  } catch (e: any) {
    return { file, mode, ok: false, error: e.message, latency_ms: Date.now() - t0 };
  }
}

async function main() {
  const { files, modes, model } = parseArgs();
  console.log(`[experiment] files=${files.length} × modes=${modes.length} = ${files.length * modes.length} runs`);
  console.log(`[experiment] model=${model} task=${TASK_CLASS} gateway=${GATEWAY}`);
  console.log("");
  const results: RunResult[] = [];
  let i = 0;
  for (const file of files) {
    for (const mode of modes) {
      i++;
      process.stdout.write(` [${i}/${files.length * modes.length}] ${mode.padEnd(28)} ${file} ... `);
      const r = await runOne(file, mode, model);
      results.push(r);
      if (r.ok) {
        console.log(
          `${(r.response_chars ?? 0).toString().padStart(5)} chars | ` +
          `prompt ${(r.enriched_chars ?? 0).toString().padStart(5)} chars | ` +
          `${((r.latency_ms ?? 0) / 1000).toFixed(1).padStart(5)}s | ` +
          `bug=${r.bug_fingerprints ?? "-"} mtx=${r.matrix_kept ?? 0}/${(r.matrix_kept ?? 0) + (r.matrix_dropped ?? 0)}`
        );
      } else {
        console.log(`${r.error}`);
      }
    }
  }
  console.log("");
  console.log(`[experiment] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded`);
  console.log(`[experiment] full per-call detail in data/_kb/mode_experiments.jsonl`);
  console.log(`[experiment] aggregate with: bun run scripts/mode_compare.ts`);
}
main().catch(e => { console.error(e); process.exit(1); });