From 56bf30cfd8aa7ab83576229c1e173eeab30a482e Mon Sep 17 00:00:00 2001 From: root Date: Sun, 26 Apr 2026 01:55:12 -0500 Subject: [PATCH] v1/mode: override knobs + staffing native runner + pass 2/3/4 harnesses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setup for the corpus-tightening experiment sweep (J 2026-04-26 — "now is the only cheap window before the corpus gets large and refactoring costs go up"). Override params on /v1/mode/execute (additive — old callers unaffected): force_matrix_corpus — Pass 2: try alternate corpora per call force_relevance_threshold — Pass 2: sweep filter strictness force_temperature — Pass 3: variance test New native mode `staffing_inference_lakehouse` (Pass 4): - Same composer architecture as codereview_lakehouse - Staffing framing: coordinator producing fillable|contingent| unfillable verdict + ranked candidate list with playbook citations - matrix_corpus = workers_500k_v8 - Validates that modes-as-prompt-molders generalizes beyond code - Framing explicitly says "do NOT fabricate workers" — the staffing analog of the lakehouse mode's symbol-grounding requirement Three sweep harnesses: scripts/mode_pass2_corpus_sweep.ts — 4 corpora × 4 thresholds × 5 files scripts/mode_pass3_variance.ts — 3 files × 3 temps × 5 reps scripts/mode_pass4_staffing.ts — 5 fill requests through staffing mode Each appends per-call rows to data/_kb/mode_experiments.jsonl which mode_compare.ts already aggregates with grounding column. Pass 1 (10 files × 5 modes broad sweep) currently running via the existing scripts/mode_experiment.ts — gateway restart deferred until it completes so the new override knobs aren't enabled mid-experiment. Co-Authored-By: Claude Opus 4.7 (1M context) --- config/modes.toml | 10 ++- crates/gateway/src/v1/mode.rs | 46 ++++++++++- scripts/mode_pass2_corpus_sweep.ts | 121 +++++++++++++++++++++++++++ scripts/mode_pass3_variance.ts | 109 +++++++++++++++++++++++++ scripts/mode_pass4_staffing.ts | 127 +++++++++++++++++++++++++++++ 5 files changed, 408 insertions(+), 5 deletions(-) create mode 100644 scripts/mode_pass2_corpus_sweep.ts create mode 100644 scripts/mode_pass3_variance.ts create mode 100644 scripts/mode_pass4_staffing.ts diff --git a/config/modes.toml b/config/modes.toml index e021a2f..930a636 100644 --- a/config/modes.toml +++ b/config/modes.toml @@ -31,9 +31,13 @@ matrix_corpus = "chicago_permits_v1" [[task_class]] name = "staffing_inference" -preferred_mode = "ladder" -fallback_modes = ["consensus", "pipeline"] -default_model = "gpt-oss:120b" +# Staffing-domain native enrichment runner — Pass 4 (2026-04-26). +# Same composer architecture as codereview_lakehouse but with staffing +# framing + workers corpus. Validates that the modes-as-prompt-molders +# pattern generalizes beyond code review. 
+preferred_mode = "staffing_inference_lakehouse"
+fallback_modes = ["ladder", "consensus", "pipeline"]
+default_model = "openai/gpt-oss-120b:free"
 matrix_corpus = "workers_500k_v8"
 
 [[task_class]]
diff --git a/crates/gateway/src/v1/mode.rs b/crates/gateway/src/v1/mode.rs
index ebe74e7..a601089 100644
--- a/crates/gateway/src/v1/mode.rs
+++ b/crates/gateway/src/v1/mode.rs
@@ -51,6 +51,7 @@ const VALID_MODES: &[&str] = &[
     "codereview_isolation",       // file + pathway only (no matrix)
     "codereview_matrix_only",     // file + matrix only (no pathway)
     "codereview_playbook_only",   // pathway only, NO file content (lossy ceiling)
+    "staffing_inference_lakehouse", // staffing-domain composer (Pass 4)
 ];
 
 /// Whether a mode is handled natively in this gateway vs proxied to
@@ -63,6 +64,7 @@ fn is_native_mode(mode: &str) -> bool {
         | "codereview_isolation"
         | "codereview_matrix_only"
         | "codereview_playbook_only"
+        | "staffing_inference_lakehouse"
     )
 }
 
@@ -82,6 +84,7 @@ pub struct EnrichmentFlags {
 pub enum ReviewerFraming {
     Adversarial, // forensic, ranked findings + verdict (lakehouse default)
     Generic,     // "review this" — no codebase priors (null baseline)
+    Staffing,    // staffing-domain coordinator framing (Pass 4)
 }
 
 fn flags_for_mode(mode: &str) -> EnrichmentFlags {
@@ -114,6 +117,18 @@
             use_relevance_filter: false,
             framing: ReviewerFraming::Adversarial,
         },
+        "staffing_inference_lakehouse" => EnrichmentFlags {
+            // Staffing reuses the same composer architecture but with
+            // domain-specific framing. file_content here = the request
+            // payload (e.g. "fill 2 welders in Toledo OH"), bug_fingerprints
+            // surface prior playbook patterns from this geo+role, matrix
+            // pulls candidate workers + city/state demand chunks.
+            include_file_content: true,
+            include_bug_fingerprints: true,
+            include_matrix_chunks: true,
+            use_relevance_filter: true,
+            framing: ReviewerFraming::Staffing,
+        },
         // Default (codereview_lakehouse): everything on.
         _ => EnrichmentFlags {
             include_file_content: true,
@@ -404,6 +419,20 @@ pub struct ExecuteRequest {
     /// runner uses its built-in forensic-review framing.
     #[serde(default)]
     pub user_question: Option<String>,
+    /// Override the matrix corpus the runner queries. Defaults to the
+    /// task_class's matrix_corpus from modes.toml. Use for the corpus-
+    /// tightening experiment (Pass 2 of the 2026-04-26 mode sweep).
+    #[serde(default)]
+    pub force_matrix_corpus: Option<String>,
+    /// Override the relevance filter threshold (default 0.3). Setting
+    /// to 0 keeps every chunk; raising rejects more aggressively. Used
+    /// to find the threshold sweet spot per task class.
+    #[serde(default)]
+    pub force_relevance_threshold: Option<f32>,
+    /// Override the LLM temperature (default 0.1). Used by Pass 3
+    /// variance testing to measure run-to-run stability.
+    #[serde(default)]
+    pub force_temperature: Option<f32>,
 }
 
 #[derive(Serialize, Debug, Default)]
@@ -442,10 +471,20 @@
 proven. Do NOT hedge.";
 
 const FRAMING_GENERIC: &str = "You are a code reviewer. Read the file below and produce a \
 markdown review with findings.";
+const FRAMING_STAFFING: &str = "You are a senior staffing coordinator for a light-industrial \
+labor agency. You receive a fill request (role × count × city × deadline) and have access \
+to historical playbook patterns from prior fills in this geo, plus a corpus of candidate \
+workers + demand signals. Produce a markdown plan with: (1) one-line verdict (fillable | \
+contingent | unfillable), (2) ranked candidate list with name, city, role, distance, prior \
+fill citations from the playbook, (3) risks (double-booking, eligibility gaps, geo stretch) \
+with severity + confidence percent, (4) playbook reference IDs you used. Be precise — only \
+recommend candidates whose names appear in the matrix data; do NOT fabricate workers.";
+
 fn framing_text(f: ReviewerFraming) -> &'static str {
     match f {
         ReviewerFraming::Adversarial => FRAMING_ADVERSARIAL,
         ReviewerFraming::Generic => FRAMING_GENERIC,
+        ReviewerFraming::Staffing => FRAMING_STAFFING,
     }
 }
@@ -485,6 +524,9 @@ pub async fn execute(
         ));
     }
 
+    // Caller can override the matrix corpus per-call (Pass 2 corpus
+    // tightening). Falls back to modes.toml default.
+    let matrix_corpus = req.force_matrix_corpus.clone().or(matrix_corpus);
     let flags = flags_for_mode(&mode);
     let mut sources = EnrichmentSources {
         matrix_corpus: matrix_corpus.clone(),
@@ -621,7 +663,7 @@ pub async fn execute(
         let body = serde_json::json!({
             "focus_file": { "path": req.file_path, "content": file_content },
             "chunks": chunks_for_filter,
-            "threshold": 0.3,
+            "threshold": req.force_relevance_threshold.unwrap_or(0.3),
         });
         match client
             .post("http://localhost:3800/relevance")
@@ -723,7 +765,7 @@ pub async fn execute(
             { "role": "system", "content": framing_text(flags.framing) },
             { "role": "user", "content": user_prompt },
         ],
-        "temperature": 0.1,
+        "temperature": req.force_temperature.unwrap_or(0.1),
         "max_tokens": 4096,
     });
     let chat_client = match reqwest::Client::builder()
diff --git a/scripts/mode_pass2_corpus_sweep.ts b/scripts/mode_pass2_corpus_sweep.ts
new file mode 100644
index 0000000..04596a7
--- /dev/null
+++ b/scripts/mode_pass2_corpus_sweep.ts
@@ -0,0 +1,121 @@
+#!/usr/bin/env bun
+/**
+ * Pass 2: matrix corpus + relevance threshold sweep.
+ *
+ * For each (corpus, threshold) combination, run codereview_matrix_only
+ * on the same N files, comparing which corpus actually adds grounded
+ * findings vs codereview_isolation (matrix-off baseline).
+ *
+ * Output: data/_kb/mode_experiments.jsonl gets one row per call,
+ * tagged via the force_matrix_corpus + force_relevance_threshold
+ * fields visible in `sources`. Aggregator can then group by corpus.
+ *
+ * Usage: bun run scripts/mode_pass2_corpus_sweep.ts
+ */
+
+const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
+const MODEL = process.env.LH_MODEL ?? "openai/gpt-oss-120b:free";
+
+const FILES = (process.env.LH_FILES ?? [
+  "crates/queryd/src/delta.rs",
+  "crates/queryd/src/service.rs",
+  "crates/vectord/src/pathway_memory.rs",
+  "crates/gateway/src/v1/mode.rs",
+  "crates/aibridge/src/client.rs",
+].join(",")).split(",");
+
+const CORPORA = (process.env.LH_CORPORA ?? [
+  "distilled_procedural_v20260423102847",
+  "distilled_factual_v20260423095819",
+  "distilled_config_hint_v20260423102847",
+  "kb_team_runs_v1",
+].join(",")).split(",");
+
+const THRESHOLDS = (process.env.LH_THRESHOLDS ?? "0.2,0.3,0.4,0.5").split(",").map(Number);
"0.2,0.3,0.4,0.5").split(",").map(Number); + +interface Result { + corpus: string; + threshold: number; + file: string; + ok: boolean; + matrix_kept?: number; + matrix_dropped?: number; + response_chars?: number; + latency_ms?: number; + error?: string; +} + +async function runOne(corpus: string, threshold: number, file: string): Promise { + try { + const r = await fetch(`${GATEWAY}/v1/mode/execute`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + task_class: "scrum_review", + file_path: file, + force_mode: "codereview_matrix_only", + force_model: MODEL, + force_matrix_corpus: corpus, + force_relevance_threshold: threshold, + }), + signal: AbortSignal.timeout(180_000), + }); + if (!r.ok) { + const body = await r.text().catch(() => ""); + return { corpus, threshold, file, ok: false, error: `HTTP ${r.status}: ${body.slice(0, 150)}` }; + } + const j: any = await r.json(); + return { + corpus, threshold, file, ok: true, + matrix_kept: j.sources?.matrix_chunks_kept, + matrix_dropped: j.sources?.matrix_chunks_dropped, + response_chars: (j.response ?? "").length, + latency_ms: j.latency_ms, + }; + } catch (e: any) { + return { corpus, threshold, file, ok: false, error: e.message }; + } +} + +async function main() { + const total = CORPORA.length * THRESHOLDS.length * FILES.length; + console.log(`[pass2] corpora=${CORPORA.length} × thresholds=${THRESHOLDS.length} × files=${FILES.length} = ${total} runs`); + console.log(`[pass2] model=${MODEL}\n`); + let i = 0; + const results: Result[] = []; + for (const corpus of CORPORA) { + for (const threshold of THRESHOLDS) { + for (const file of FILES) { + i++; + process.stdout.write(` [${i}/${total}] corpus=${corpus.slice(0, 30).padEnd(30)} thr=${threshold.toFixed(1)} ${file.slice(-32).padStart(32)} ... `); + const r = await runOne(corpus, threshold, file); + results.push(r); + if (r.ok) { + const total_chunks = (r.matrix_kept ?? 0) + (r.matrix_dropped ?? 0); + console.log(`✓ k=${r.matrix_kept}/${total_chunks} resp=${r.response_chars} ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`); + } else { + console.log(`✗ ${r.error}`); + } + } + } + } + + console.log(`\n[pass2] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded`); + + // Per-corpus×threshold roll-up of kept-rate (the matrix usefulness proxy). + console.log(`\n[pass2] kept-rate by corpus × threshold (avg chunks kept per call):`); + console.log(` ${"corpus".padEnd(40)} ${THRESHOLDS.map(t => `thr=${t.toFixed(1)}`).join(" ").padStart(35)}`); + for (const corpus of CORPORA) { + const cells = THRESHOLDS.map(t => { + const matched = results.filter(r => r.ok && r.corpus === corpus && r.threshold === t); + if (matched.length === 0) return " — "; + const avgKept = matched.reduce((s, r) => s + (r.matrix_kept ?? 0), 0) / matched.length; + return avgKept.toFixed(1).padStart(5); + }).join(" "); + console.log(` ${corpus.slice(0, 40).padEnd(40)} ${cells}`); + } + + console.log(`\n[pass2] aggregate findings/groundedness with: bun run scripts/mode_compare.ts`); +} + +main().catch(e => { console.error(e); process.exit(1); }); diff --git a/scripts/mode_pass3_variance.ts b/scripts/mode_pass3_variance.ts new file mode 100644 index 0000000..5d2b44e --- /dev/null +++ b/scripts/mode_pass3_variance.ts @@ -0,0 +1,109 @@ +#!/usr/bin/env bun +/** + * Pass 3: variance test. + * + * Runs codereview_lakehouse on the SAME file N times at each of M + * temperatures. Measures run-to-run stability of grounded finding + * count, response size, and latency. 
+ * is a leak; track which symbols got hallucinated.
+ *
+ * Output appends to data/_kb/mode_experiments.jsonl. The aggregator
+ * can group by ts and identify variance buckets.
+ *
+ * Usage: bun run scripts/mode_pass3_variance.ts
+ */
+
+const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
+const MODEL = process.env.LH_MODEL ?? "openai/gpt-oss-120b:free";
+
+const FILES = (process.env.LH_FILES ?? [
+  "crates/queryd/src/delta.rs",
+  "crates/vectord/src/pathway_memory.rs",
+  "crates/gateway/src/v1/mode.rs",
+].join(",")).split(",");
+
+const TEMPS = (process.env.LH_TEMPS ?? "0.0,0.1,0.3").split(",").map(Number);
+const REPS = Number(process.env.LH_REPS ?? 5);
+
+interface Result {
+  file: string;
+  temp: number;
+  rep: number;
+  ok: boolean;
+  response_chars?: number;
+  latency_ms?: number;
+  error?: string;
+}
+
+async function runOne(file: string, temp: number, rep: number): Promise<Result> {
+  try {
+    const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        task_class: "scrum_review",
+        file_path: file,
+        force_mode: "codereview_lakehouse",
+        force_model: MODEL,
+        force_temperature: temp,
+      }),
+      signal: AbortSignal.timeout(180_000),
+    });
+    if (!r.ok) {
+      const body = await r.text().catch(() => "");
+      return { file, temp, rep, ok: false, error: `HTTP ${r.status}: ${body.slice(0, 150)}` };
+    }
+    const j: any = await r.json();
+    return {
+      file, temp, rep, ok: true,
+      response_chars: (j.response ?? "").length,
+      latency_ms: j.latency_ms,
+    };
+  } catch (e: any) {
+    return { file, temp, rep, ok: false, error: e.message };
+  }
+}
+
+async function main() {
+  const total = FILES.length * TEMPS.length * REPS;
+  console.log(`[pass3] files=${FILES.length} × temps=${TEMPS.length} × reps=${REPS} = ${total} runs`);
+  console.log(`[pass3] model=${MODEL}\n`);
+  let i = 0;
+  const results: Result[] = [];
+  for (const file of FILES) {
+    for (const temp of TEMPS) {
+      for (let rep = 1; rep <= REPS; rep++) {
+        i++;
+        process.stdout.write(` [${i}/${total}] temp=${temp.toFixed(1)} rep=${rep}/${REPS} ${file.slice(-32).padStart(32)} ... `);
+        const r = await runOne(file, temp, rep);
+        results.push(r);
+        if (r.ok) {
+          console.log(`✓ resp=${r.response_chars} ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`);
+        } else {
+          console.log(`✗ ${r.error}`);
+        }
+      }
+    }
+  }
+
+  console.log(`\n[pass3] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded`);
+
+  // Per-file × temp variance summary (response_chars stddev as a quick
+  // proxy for output instability).
+  console.log(`\n[pass3] response_chars variance (mean ± stddev) by file × temp:`);
+  console.log(` ${"file".padEnd(40)} ${TEMPS.map(t => `temp=${t.toFixed(1)}`.padStart(20)).join(" ")}`);
+  for (const file of FILES) {
+    const cells = TEMPS.map(t => {
+      const xs = results.filter(r => r.ok && r.file === file && r.temp === t).map(r => r.response_chars ?? 0);
+      if (xs.length === 0) return " — ";
+      const mean = xs.reduce((s, x) => s + x, 0) / xs.length;
+      const sd = Math.sqrt(xs.reduce((s, x) => s + Math.pow(x - mean, 2), 0) / xs.length);
+      return `${Math.round(mean).toString().padStart(7)} ± ${Math.round(sd).toString().padEnd(6)}`.padStart(20);
+    }).join(" ");
+    console.log(` ${file.slice(0, 40).padEnd(40)} ${cells}`);
+  }
+
+  console.log(`\n[pass3] grounding variance via: bun run scripts/mode_compare.ts (look for grounded-N column drift)`);
+}
+
+main().catch(e => { console.error(e); process.exit(1); });
diff --git a/scripts/mode_pass4_staffing.ts b/scripts/mode_pass4_staffing.ts
new file mode 100644
index 0000000..8e68253
--- /dev/null
+++ b/scripts/mode_pass4_staffing.ts
@@ -0,0 +1,127 @@
+#!/usr/bin/env bun
+/**
+ * Pass 4: staffing_inference_lakehouse cross-domain validation.
+ *
+ * Runs the staffing-domain mode against synthetic fill requests.
+ * Validates that the modes-as-prompt-molders architecture generalizes
+ * beyond code review — the composer pattern (file_content + bug
+ * fingerprints + relevance-filtered matrix + domain framing) should
+ * produce grounded staffing recommendations the same way it produces
+ * grounded code reviews.
+ *
+ * Each fill request is posted as `file_content` (since the runner's
+ * shape expects file content; for staffing it's the request payload).
+ * file_path is set to a synthetic path under requests/ so pathway
+ * memory bucketing groups requests by geo+role.
+ *
+ * Usage: bun run scripts/mode_pass4_staffing.ts
+ */
+
+const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
+const MODEL = process.env.LH_MODEL ?? "openai/gpt-oss-120b:free";
+
+interface FillRequest {
+  city: string;
+  state: string;
+  role: string;
+  count: number;
+  deadline: string;
+  notes?: string;
+}
+
+const REQUESTS: FillRequest[] = [
+  { city: "Toledo", state: "OH", role: "Welder", count: 2, deadline: "2026-04-29", notes: "OSHA 10 required" },
+  { city: "Nashville", state: "TN", role: "Forklift Operator", count: 3, deadline: "2026-05-01" },
+  { city: "Chicago", state: "IL", role: "Assembler", count: 5, deadline: "2026-04-30", notes: "second shift" },
+  { city: "South Bend", state: "IN", role: "Electrician", count: 1, deadline: "2026-04-28", notes: "journeyman license" },
+  { city: "Murfreesboro", state: "TN", role: "Packaging Operator", count: 4, deadline: "2026-05-02" },
+];
+
+function requestToPayload(req: FillRequest): string {
+  return [
+    `# Fill Request`,
+    `Role: ${req.role} × ${req.count}`,
+    `Location: ${req.city}, ${req.state}`,
+    `Deadline: ${req.deadline}`,
+    req.notes ? `Notes: ${req.notes}` : "",
+    "",
+    "Recommend candidates from the matrix data. Cite playbook references.",
+  ].filter(Boolean).join("\n");
+}
+
+interface Result {
+  req: FillRequest;
+  ok: boolean;
+  response_chars?: number;
+  bug_fingerprints?: number;
+  matrix_kept?: number;
+  matrix_dropped?: number;
+  latency_ms?: number;
+  error?: string;
+  preview?: string;
+}
+
+async function runOne(req: FillRequest): Promise<Result> {
+  const payload = requestToPayload(req);
+  const file_path = `requests/${req.role.toLowerCase().replace(/\s+/g, "_")}_${req.city.toLowerCase().replace(/\s+/g, "_")}_${req.state}.md`;
+  try {
+    const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        task_class: "staffing_inference",
+        file_path,
+        file_content: payload,
+        force_mode: "staffing_inference_lakehouse",
+        force_model: MODEL,
+      }),
+      signal: AbortSignal.timeout(180_000),
+    });
+    if (!r.ok) {
+      const body = await r.text().catch(() => "");
+      return { req, ok: false, error: `HTTP ${r.status}: ${body.slice(0, 200)}` };
+    }
+    const j: any = await r.json();
+    return {
+      req, ok: true,
+      response_chars: (j.response ?? "").length,
+      bug_fingerprints: j.sources?.bug_fingerprints_count,
+      matrix_kept: j.sources?.matrix_chunks_kept,
+      matrix_dropped: j.sources?.matrix_chunks_dropped,
+      latency_ms: j.latency_ms,
+      preview: (j.response ?? "").slice(0, 400),
+    };
+  } catch (e: any) {
+    return { req, ok: false, error: e.message };
+  }
+}
+
+async function main() {
+  console.log(`[pass4] requests=${REQUESTS.length} model=${MODEL} mode=staffing_inference_lakehouse\n`);
+  let i = 0;
+  const results: Result[] = [];
+  for (const req of REQUESTS) {
+    i++;
+    process.stdout.write(` [${i}/${REQUESTS.length}] ${req.role.padEnd(22)} × ${req.count} in ${req.city}, ${req.state} ... `);
+    const r = await runOne(req);
+    results.push(r);
+    if (r.ok) {
+      console.log(`✓ resp=${r.response_chars} bug=${r.bug_fingerprints ?? 0} mtx=${r.matrix_kept ?? 0}/${(r.matrix_kept ?? 0) + (r.matrix_dropped ?? 0)} ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`);
    } else {
      console.log(`✗ ${r.error}`);
    }
  }

  console.log(`\n[pass4] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded\n`);

  // Show first successful response head to verify the framing actually
  // produced staffing-style output (verdict + ranked candidates), not
  // generic prose.
  const first = results.find(r => r.ok && r.preview);
  if (first) {
    console.log(`[pass4] first successful response preview (${first.req.city} ${first.req.role}):`);
    console.log(first.preview!.split("\n").map(l => " | " + l).join("\n"));
  }
}

main().catch(e => { console.error(e); process.exit(1); });
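
A one-off probe of the new override knobs, for reference. This is a minimal sketch in the same style as the harnesses, assuming the gateway from this patch is listening on localhost:3100 (the harness default) and that any reviewable file path works; the request fields are exactly the ExecuteRequest additions above, and the `sources` counters are the same ones the Pass 2 harness reads back:

    // Exercise all three override knobs in a single /v1/mode/execute call.
    // Assumes a locally running gateway; the file and corpus are arbitrary
    // picks from the sweep lists in the harnesses.
    const res = await fetch("http://localhost:3100/v1/mode/execute", {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        task_class: "scrum_review",
        file_path: "crates/queryd/src/delta.rs",
        force_mode: "codereview_matrix_only",
        force_model: "openai/gpt-oss-120b:free",
        force_matrix_corpus: "kb_team_runs_v1",  // Pass 2 knob: alternate corpus
        force_relevance_threshold: 0.5,          // Pass 2 knob: stricter filter (default 0.3)
        force_temperature: 0.0,                  // Pass 3 knob: pin sampling (default 0.1)
      }),
    });
    const j: any = await res.json();
    console.log(`kept=${j.sources?.matrix_chunks_kept} dropped=${j.sources?.matrix_chunks_dropped}`);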
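
Aggregation stays with the existing mode_compare.ts, but a quick per-corpus roll-up of the appended rows is small enough to sketch here. The JSONL row shape is not shown in this patch, so the field paths below are an assumption (they mirror the `sources` object the gateway returns); check them against the actual writer before trusting the numbers. For narrower sweeps, the harnesses also take their axes from the env vars declared at the top of each script, e.g. LH_CORPORA=kb_team_runs_v1 LH_THRESHOLDS=0.3,0.5 bun run scripts/mode_pass2_corpus_sweep.ts.

    // Sketch: average chunks kept per call, grouped by corpus, read straight
    // from data/_kb/mode_experiments.jsonl. ASSUMPTION: each row embeds the
    // gateway's `sources` object (matrix_corpus, matrix_chunks_kept); adjust
    // the paths to the real row schema if they differ.
    const text = await Bun.file("data/_kb/mode_experiments.jsonl").text();
    const byCorpus = new Map<string, { calls: number; kept: number }>();
    for (const line of text.split("\n").filter(Boolean)) {
      const row: any = JSON.parse(line);
      const corpus: string = row.sources?.matrix_corpus ?? "(none)";
      const agg = byCorpus.get(corpus) ?? { calls: 0, kept: 0 };
      agg.calls += 1;
      agg.kept += row.sources?.matrix_chunks_kept ?? 0;
      byCorpus.set(corpus, agg);
    }
    for (const [corpus, a] of byCorpus) {
      console.log(`${corpus.padEnd(40)} ${(a.kept / a.calls).toFixed(1)} kept/call (${a.calls} calls)`);
    }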