Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
Setup for the corpus-tightening experiment sweep (J 2026-04-26 — "now
is the only cheap window before the corpus gets large and refactoring
costs go up").
Override params on /v1/mode/execute (additive — old callers unaffected):
force_matrix_corpus — Pass 2: try alternate corpora per call
force_relevance_threshold — Pass 2: sweep filter strictness
force_temperature — Pass 3: variance test
New native mode `staffing_inference_lakehouse` (Pass 4):
- Same composer architecture as codereview_lakehouse
- Staffing framing: coordinator producing fillable|contingent|
unfillable verdict + ranked candidate list with playbook citations
- matrix_corpus = workers_500k_v8
- Validates that modes-as-prompt-molders generalizes beyond code
- Framing explicitly says "do NOT fabricate workers" — the staffing
analog of the lakehouse mode's symbol-grounding requirement
Three sweep harnesses:
scripts/mode_pass2_corpus_sweep.ts — 4 corpora × 4 thresholds × 5 files
scripts/mode_pass3_variance.ts — 3 files × 3 temps × 5 reps
scripts/mode_pass4_staffing.ts — 5 fill requests through staffing mode
Each appends per-call rows to data/_kb/mode_experiments.jsonl, which
mode_compare.ts already aggregates with a grounding column.
Pass 1 (10 files × 5 modes broad sweep) is currently running via the
existing scripts/mode_experiment.ts — the gateway restart is deferred until
it completes so the new override knobs aren't enabled mid-experiment.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
122 lines
4.4 KiB
TypeScript
122 lines
4.4 KiB
TypeScript
#!/usr/bin/env bun
/**
 * Pass 2: matrix corpus + relevance threshold sweep.
 *
 * For each (corpus, threshold) combination, run codereview_matrix_only
 * on the same N files. Compares which corpus actually adds grounded
 * findings vs codereview_isolation (matrix-off baseline).
 *
 * Output: data/_kb/mode_experiments.jsonl gets one row per call,
 * tagged via the force_matrix_corpus + force_relevance_threshold
 * fields visible in `sources`. The aggregator can then group by corpus.
 *
 * Usage: bun run scripts/mode_pass2_corpus_sweep.ts
 *
 * Environment overrides: LH_GATEWAY, LH_MODEL, and the comma-separated
 * lists LH_FILES, LH_CORPORA and LH_THRESHOLDS.
 */

/** Base URL of the gateway that is POSTed to at `/v1/mode/execute`; override with `LH_GATEWAY`. */
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
/** Model identifier sent as `force_model` on every call; override with `LH_MODEL`. */
const MODEL = process.env.LH_MODEL ?? "openai/gpt-oss-120b:free";

/**
 * Splits a comma-separated environment value into its entries.
 *
 * Entries are trimmed and empty ones are dropped, so values such as
 * `"a.rs, b.rs,"` do not produce blank or whitespace-padded entries.
 *
 * @param raw - The raw environment value, or `undefined` when the variable is unset.
 * @param fallback - Entries to use when `raw` is unset or contains no usable entry.
 * @returns A fresh array of non-empty, trimmed entries.
 */
function parseEnvList(raw: string | undefined, fallback: readonly string[]): string[] {
  const entries = (raw ?? "")
    .split(",")
    .map((entry) => entry.trim())
    .filter((entry) => entry.length > 0);
  return entries.length > 0 ? entries : [...fallback];
}

/**
 * Parses a comma-separated list of relevance thresholds.
 *
 * @param raw - The raw environment value, or `undefined` when the variable is unset.
 * @param fallback - Thresholds to use when `raw` is unset or contains no usable entry.
 * @returns The thresholds as finite numbers, in the order given.
 * @throws Error when an entry is not a finite number. A `NaN` would otherwise be
 *   serialised as `null` by `JSON.stringify` and the override silently lost.
 */
function parseThresholds(raw: string | undefined, fallback: readonly number[]): number[] {
  return parseEnvList(raw, fallback.map(String)).map((entry) => {
    const value = Number(entry);
    if (!Number.isFinite(value)) {
      throw new Error(`LH_THRESHOLDS: "${entry}" is not a finite number`);
    }
    return value;
  });
}

/** Files reviewed for every (corpus, threshold) pair; override with `LH_FILES`. */
const FILES = parseEnvList(process.env.LH_FILES, [
  "crates/queryd/src/delta.rs",
  "crates/queryd/src/service.rs",
  "crates/vectord/src/pathway_memory.rs",
  "crates/gateway/src/v1/mode.rs",
  "crates/aibridge/src/client.rs",
]);

/** Matrix corpora to sweep; override with `LH_CORPORA`. */
const CORPORA = parseEnvList(process.env.LH_CORPORA, [
  "distilled_procedural_v20260423102847",
  "distilled_factual_v20260423095819",
  "distilled_config_hint_v20260423102847",
  "kb_team_runs_v1",
]);

/** Relevance thresholds to sweep; override with `LH_THRESHOLDS`. */
const THRESHOLDS = parseThresholds(process.env.LH_THRESHOLDS, [0.2, 0.3, 0.4, 0.5]);

/** Outcome of a single gateway call for one (corpus, threshold, file) combination. */
interface Result {
  /** Matrix corpus name sent as `force_matrix_corpus`. */
  corpus: string;
  /** Relevance threshold sent as `force_relevance_threshold`. */
  threshold: number;
  /** Path sent as `file_path`. */
  file: string;
  /** `true` when the HTTP call succeeded and its JSON body was read; `false` otherwise. */
  ok: boolean;
  /** `sources.matrix_chunks_kept` from the gateway response, when present. */
  matrix_kept?: number;
  /** `sources.matrix_chunks_dropped` from the gateway response, when present. */
  matrix_dropped?: number;
  /** Length of the `response` field of the gateway response. */
  response_chars?: number;
  /** `latency_ms` as reported in the gateway response, when present. */
  latency_ms?: number;
  /** Failure description; populated when `ok` is `false`. */
  error?: string;
}

/**
 * Executes one `codereview_matrix_only` review through the gateway with the
 * given matrix corpus and relevance threshold forced.
 *
 * Never rejects: HTTP errors, unexpected payloads, timeouts and thrown values
 * are all reported as a `Result` with `ok: false` and a populated `error`.
 *
 * @param corpus - Value sent as `force_matrix_corpus`.
 * @param threshold - Value sent as `force_relevance_threshold`.
 * @param file - Value sent as `file_path`.
 * @returns The per-call outcome, including kept/dropped chunk counts when the
 *   gateway reports them as numbers.
 */
async function runOne(corpus: string, threshold: number, file: string): Promise<Result> {
  // The response body is untrusted JSON: narrow it instead of assuming a shape.
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null;
  const asNumber = (value: unknown): number | undefined =>
    typeof value === "number" && Number.isFinite(value) ? value : undefined;

  try {
    const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        task_class: "scrum_review",
        file_path: file,
        force_mode: "codereview_matrix_only",
        force_model: MODEL,
        force_matrix_corpus: corpus,
        force_relevance_threshold: threshold,
      }),
      // Abort the request after three minutes.
      signal: AbortSignal.timeout(180_000),
    });

    if (!r.ok) {
      // Best effort: an unreadable error body still yields the status code.
      const body = await r.text().catch(() => "");
      return { corpus, threshold, file, ok: false, error: `HTTP ${r.status}: ${body.slice(0, 150)}` };
    }

    const payload: unknown = await r.json();
    if (!isRecord(payload)) {
      return { corpus, threshold, file, ok: false, error: `unexpected response payload: ${String(payload)}` };
    }
    const sources = isRecord(payload.sources) ? payload.sources : {};

    return {
      corpus,
      threshold,
      file,
      ok: true,
      matrix_kept: asNumber(sources.matrix_chunks_kept),
      matrix_dropped: asNumber(sources.matrix_chunks_dropped),
      response_chars: typeof payload.response === "string" ? payload.response.length : 0,
      latency_ms: asNumber(payload.latency_ms),
    };
  } catch (e: unknown) {
    // Anything can be thrown; only Error instances carry a `message`.
    return { corpus, threshold, file, ok: false, error: e instanceof Error ? e.message : String(e) };
  }
}

/**
 * Builds the kept-rate roll-up table: one header line plus one line per corpus,
 * each cell holding the average `matrix_kept` over the successful calls for
 * that (corpus, threshold) pair, or "—" when there were none.
 *
 * Every column shares one width derived from the widest threshold label, so
 * header and data cells line up for any number or precision of thresholds.
 */
function formatKeptRateTable(
  results: readonly Result[],
  corpora: readonly string[],
  thresholds: readonly number[],
): string[] {
  const labels = thresholds.map((t) => `thr=${t}`);
  const width = Math.max(5, ...labels.map((label) => label.length));
  const header = ` ${"corpus".padEnd(40)} ${labels.map((label) => label.padStart(width)).join(" ")}`;

  const rows = corpora.map((corpus) => {
    const cells = thresholds.map((t) => {
      const matched = results.filter((r) => r.ok && r.corpus === corpus && r.threshold === t);
      if (matched.length === 0) return "—".padStart(width);
      const avgKept = matched.reduce((sum, r) => sum + (r.matrix_kept ?? 0), 0) / matched.length;
      return avgKept.toFixed(1).padStart(width);
    });
    return ` ${corpus.slice(0, 40).padEnd(40)} ${cells.join(" ")}`;
  });

  return [header, ...rows];
}

/**
 * Runs the full corpus × threshold × file sweep, printing one progress line
 * per call followed by a success count and the kept-rate roll-up table.
 *
 * Individual call failures are printed and counted; they do not abort the sweep.
 */
async function main(): Promise<void> {
  const total = CORPORA.length * THRESHOLDS.length * FILES.length;
  console.log(`[pass2] corpora=${CORPORA.length} × thresholds=${THRESHOLDS.length} × files=${FILES.length} = ${total} runs`);
  console.log(`[pass2] model=${MODEL}\n`);

  const results: Result[] = [];
  for (const corpus of CORPORA) {
    for (const threshold of THRESHOLDS) {
      for (const file of FILES) {
        // Calls run one at a time: the progress prefix is written without a
        // newline and the outcome of the same call completes that line.
        process.stdout.write(
          ` [${results.length + 1}/${total}] corpus=${corpus.slice(0, 30).padEnd(30)} thr=${threshold} ${file.slice(-32).padStart(32)} ... `,
        );
        const r = await runOne(corpus, threshold, file);
        results.push(r);

        if (r.ok) {
          const kept = r.matrix_kept ?? 0;
          const totalChunks = kept + (r.matrix_dropped ?? 0);
          const seconds = ((r.latency_ms ?? 0) / 1000).toFixed(1);
          console.log(`✓ k=${kept}/${totalChunks} resp=${r.response_chars ?? 0} ${seconds}s`);
        } else {
          console.log(`✗ ${r.error ?? "unknown error"}`);
        }
      }
    }
  }

  const succeeded = results.filter((r) => r.ok).length;
  console.log(`\n[pass2] complete · ${succeeded}/${results.length} succeeded`);

  // Per-corpus×threshold roll-up of kept-rate (the matrix usefulness proxy).
  console.log(`\n[pass2] kept-rate by corpus × threshold (avg chunks kept per call):`);
  for (const line of formatKeptRateTable(results, CORPORA, THRESHOLDS)) {
    console.log(line);
  }

  console.log(`\n[pass2] aggregate findings/groundedness with: bun run scripts/mode_compare.ts`);
}

// Script entry point: report any unexpected rejection and exit non-zero.
main().catch((failure: unknown) => {
  console.error(failure);
  process.exit(1);
});