From 2dbc8dbc832d24cd75650eb80437c70ccc9097df Mon Sep 17 00:00:00 2001 From: root Date: Sun, 26 Apr 2026 17:29:17 -0500 Subject: [PATCH] v1/mode: model-aware enrichment downgrade + 3 corpora + variance harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass 5 (5 reps × 4 conditions × 1 file on grok-4.1-fast) showed composing matrix corpora is anti-additive on strong models — composed lakehouse_arch + symbols LOST 5/5 head-to-head vs codereview_isolation (Δ −1.8 grounded findings, p=0.031). Default flips to isolation; matrix path now auto- downgrades when the resolved model is strong. Mode runner: - matrix_corpus is Vec (string OR array via deserialize_string_or_vec) - top_k=6 from each corpus, merge by score, take top 8 globally - chunk tag prefers doc_id over source so reviewer sees [adr:009] vs [lakehouse_arch] - is_weak_model() gate auto-downgrades codereview_lakehouse → codereview_isolation for strong models (default-strong; weak = :free suffix or local last-resort) - LH_FORCE_FULL_ENRICHMENT=1 bypasses for diagnostic runs - EnrichmentSources.downgraded_from records when the gate fires Three corpora indexed via /vectors/index (5849 chunks total): - lakehouse_arch_v1 — ADRs + phases + PRD + scrum spec (93 docs, 2119 chunks) - scrum_findings_v1 — past scrum_reviews.jsonl (168 docs, 1260 chunks; EXCLUDED from defaults — 24% out-of-bounds line citations from cross-file drift) - lakehouse_symbols_v1 — regex-extracted pub items + /// docs (656 docs, 2470 chunks) Experiment infra: - scripts/build_*_corpus.ts — re-runnable when source content changes - scripts/mode_pass5_variance_paid.ts — N reps × M conditions on one file - scripts/mode_pass5_summarize.ts — mean ± σ + head-to-head, parser handles numbered + path-with-line + path-with-symbol finding tables - scripts/mode_compare.ts — groups by mode|corpus when sweeps span corpora - scripts/mode_experiment.ts — default model bumped to x-ai/grok-4.1-fast, --corpus flag for 
per-call override Decisions + open follow-ups: docs/MODE_RUNNER_TUNING_PLAN.md Co-Authored-By: Claude Opus 4.7 (1M context) --- config/modes.toml | 23 ++- crates/gateway/src/v1/mode.rs | 215 +++++++++++++++++++------ docs/MODE_RUNNER_TUNING_PLAN.md | 114 +++++++++++++ scripts/build_lakehouse_corpus.ts | 176 ++++++++++++++++++++ scripts/build_scrum_findings_corpus.ts | 94 +++++++++++ scripts/build_symbols_corpus.ts | 141 ++++++++++++++++ scripts/mode_compare.ts | 63 ++++++-- scripts/mode_experiment.ts | 34 ++-- scripts/mode_pass5_summarize.ts | 169 +++++++++++++++++++ scripts/mode_pass5_variance_paid.ts | 96 +++++++++++ 10 files changed, 1043 insertions(+), 82 deletions(-) create mode 100644 docs/MODE_RUNNER_TUNING_PLAN.md create mode 100644 scripts/build_lakehouse_corpus.ts create mode 100644 scripts/build_scrum_findings_corpus.ts create mode 100644 scripts/build_symbols_corpus.ts create mode 100644 scripts/mode_pass5_summarize.ts create mode 100644 scripts/mode_pass5_variance_paid.ts diff --git a/config/modes.toml b/config/modes.toml index 930a636..e3266fd 100644 --- a/config/modes.toml +++ b/config/modes.toml @@ -12,15 +12,22 @@ [[task_class]] name = "scrum_review" -# `codereview_lakehouse` is the codebase-specific enrichment runner — -# bundles defined/imported symbols, pathway-memory bug fingerprints, -# and relevance-filtered matrix chunks into ONE precise prompt so the -# model gets it right the first call. The generic `codereview` mode -# from LLM Team is still the network fallback if execute fails. -preferred_mode = "codereview_lakehouse" -fallback_modes = ["codereview", "consensus", "ladder"] +# 2026-04-26 pass5 variance test (5 reps × 4 conditions, grok-4.1-fast, +# pathway_memory.rs): composed corpus LOST 5/5 vs isolation (Δ −1.8 +# grounded findings, p=0.031). See docs/MODE_RUNNER_TUNING_PLAN.md. +# Default is now isolation — bug fingerprints + adversarial framing + +# file content carries strong models without matrix noise. 
The +# `codereview_lakehouse` matrix path remains available via force_mode +# (auto-downgrades to isolation on strong models — see the +# is_strong_model gate in crates/gateway/src/v1/mode.rs). +preferred_mode = "codereview_isolation" +fallback_modes = ["codereview_lakehouse", "codereview", "consensus", "ladder"] default_model = "qwen3-coder:480b" -matrix_corpus = "distilled_procedural_v20260423102847" +# Corpora kept defined so experimental modes (codereview_matrix_only, +# pass2/pass5 sweeps) and weak-model rescue rungs can still pull them. +# scrum_findings_v1 is built but EXCLUDED — bake-off showed 24% OOB +# line citations from cross-file drift, only safe with same-file gating. +matrix_corpus = ["lakehouse_arch_v1", "lakehouse_symbols_v1"] [[task_class]] name = "contract_analysis" diff --git a/crates/gateway/src/v1/mode.rs b/crates/gateway/src/v1/mode.rs index a601089..3ca5db0 100644 --- a/crates/gateway/src/v1/mode.rs +++ b/crates/gateway/src/v1/mode.rs @@ -147,8 +147,31 @@ pub struct TaskClassEntry { #[serde(default)] pub fallback_modes: Vec, pub default_model: String, - #[serde(default)] - pub matrix_corpus: Option, + /// One or more corpora the mode runner queries (top-k per corpus, + /// merged by score before the relevance filter). Accepts a single + /// string or an array in modes.toml — `deserialize_string_or_vec` + /// handles both shapes for backward compat. + #[serde(default, deserialize_with = "deserialize_string_or_vec")] + pub matrix_corpus: Vec, +} + +/// Accept `key = "x"` or `key = ["x", "y"]` in TOML/JSON. Empty string or +/// missing field → empty vec. 
+fn deserialize_string_or_vec<'de, D>(d: D) -> Result, D::Error> +where D: serde::Deserializer<'de> { + use serde::de::Error; + let v = serde_json::Value::deserialize(d).map_err(D::Error::custom)?; + match v { + serde_json::Value::Null => Ok(vec![]), + serde_json::Value::String(s) if s.is_empty() => Ok(vec![]), + serde_json::Value::String(s) => Ok(vec![s]), + serde_json::Value::Array(a) => a + .into_iter() + .map(|x| x.as_str().map(String::from) + .ok_or_else(|| D::Error::custom("matrix_corpus array must contain strings"))) + .collect(), + other => Err(D::Error::custom(format!("matrix_corpus must be string or array, got {other:?}"))), + } } #[derive(Clone, Debug, Deserialize)] @@ -234,7 +257,7 @@ pub struct DecisionTrace { pub task_class_matched: bool, pub source: &'static str, // "config" | "default" | "force_mode" pub fallbacks: Vec, - pub matrix_corpus: Option, + pub matrix_corpus: Vec, pub notes: Vec, } @@ -279,7 +302,7 @@ pub async fn route( task_class_matched: cfg.lookup(&req.task_class).is_some(), source: "force_mode", fallbacks: vec![], - matrix_corpus: None, + matrix_corpus: vec![], notes, }, })); @@ -349,7 +372,7 @@ pub async fn route( task_class_matched: false, source: "default", fallbacks: cfg.default.fallback_modes.clone(), - matrix_corpus: None, + matrix_corpus: vec![], notes, }, })) @@ -419,11 +442,13 @@ pub struct ExecuteRequest { /// runner uses its built-in forensic-review framing. #[serde(default)] pub user_question: Option, - /// Override the matrix corpus the runner queries. Defaults to the - /// task_class's matrix_corpus from modes.toml. Use for the corpus- - /// tightening experiment (Pass 2 of the 2026-04-26 mode sweep). - #[serde(default)] - pub force_matrix_corpus: Option, + /// Override the matrix corpus (or corpora) the runner queries. + /// Accepts a single string or array — same semantics as + /// modes.toml's `matrix_corpus`. Empty/missing → use the task + /// class default. 
Multi-corpus path: top-k retrieved from each, + /// merged and re-sorted by score before the relevance filter. + #[serde(default, deserialize_with = "deserialize_string_or_vec")] + pub force_matrix_corpus: Vec, /// Override the relevance filter threshold (default 0.3). Setting /// to 0 keeps every chunk; raising rejects more aggressively. Used /// to find the threshold sweet spot per task class. @@ -441,8 +466,13 @@ pub struct EnrichmentSources { pub bug_fingerprints_count: usize, pub matrix_chunks_kept: usize, pub matrix_chunks_dropped: usize, - pub matrix_corpus: Option, + pub matrix_corpus: Vec, pub relevance_filter_used: bool, + /// Set when the model-aware downgrade fires — records the mode the + /// caller was originally routed to before is_weak_model() flipped + /// it. None means no downgrade happened. + #[serde(skip_serializing_if = "Option::is_none")] + pub downgraded_from: Option, pub enrichment_warnings: Vec, /// Which enrichment knobs the runner used for this mode. Lets /// the comparison aggregator group runs by signal-set. @@ -488,6 +518,30 @@ fn framing_text(f: ReviewerFraming) -> &'static str { } } +/// Strong-model heuristic for the model-aware enrichment downgrade. +/// +/// Pass 5 variance test (2026-04-26, see docs/MODE_RUNNER_TUNING_PLAN.md) +/// proved that on `x-ai/grok-4.1-fast`, composing matrix corpora into the +/// `codereview_lakehouse` prompt LOST 5/5 head-to-head reps against the +/// matrix-free `codereview_isolation` mode. Strong models have enough +/// native capacity that bug fingerprints + adversarial framing + file +/// content carry them; matrix chunks displace depth-of-analysis. +/// +/// We default to "strong" (downgrade matrix off) because most production +/// traffic uses paid models. 
The explicit `weak` predicate keeps the +/// list small and easy to extend — anything matching `:free` (OpenRouter +/// free tier) or the local last-resort qwen3.5 stays on the full +/// `codereview_lakehouse` path where matrix demonstrably helped during +/// the 2026-04-26 free-tier bake-off. +fn is_weak_model(model: &str) -> bool { + if model.ends_with(":free") || model.contains(":free/") { + return true; + } + // Local last-resort rung from the scrum ladder. Other local models + // can be added here as we test them. + matches!(model, "qwen3.5:latest" | "qwen3:latest") +} + pub async fn execute( State(_state): State, Json(req): Json, @@ -507,7 +561,40 @@ pub async fn execute( .clone() .or_else(|| tc.map(|t| t.default_model.clone())) .unwrap_or_else(|| cfg.default.default_model.clone()); - let matrix_corpus = tc.and_then(|t| t.matrix_corpus.clone()); + let matrix_corpus: Vec = tc + .map(|t| t.matrix_corpus.clone()) + .unwrap_or_default(); + + // Model-aware enrichment downgrade (2026-04-26 pass 5 finding). + // If a caller resolves `codereview_lakehouse` against a strong + // model, downgrade to `codereview_isolation` so we don't pollute + // the prompt with matrix chunks the model would do better without. + // `LH_FORCE_FULL_ENRICHMENT=1` bypasses for diagnostic runs. + // `force_mode` from the caller is treated as opt-in to the chosen + // mode and skips the downgrade — experiments need to inspect exact + // mode behavior on whatever model they pass. 
+ let force_full = std::env::var("LH_FORCE_FULL_ENRICHMENT") + .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) + .unwrap_or(false); + let downgraded_from = if mode == "codereview_lakehouse" + && req.force_mode.is_none() + && !force_full + && !is_weak_model(&model) + { + tracing::info!( + target: "v1::mode", + "downgrade codereview_lakehouse -> codereview_isolation for strong model {}", + model + ); + Some(mode.clone()) + } else { + None + }; + let mode = if downgraded_from.is_some() { + "codereview_isolation".to_string() + } else { + mode + }; if !is_native_mode(&mode) { // Native execute is the only path implemented; LLM-Team proxy @@ -525,12 +612,17 @@ pub async fn execute( } // Caller can override the matrix corpus per-call (Pass 2 corpus - // tightening). Falls back to modes.toml default. - let matrix_corpus = req.force_matrix_corpus.clone().or(matrix_corpus); + // tightening). Empty force_matrix_corpus falls back to modes.toml. + let matrix_corpus: Vec = if req.force_matrix_corpus.is_empty() { + matrix_corpus + } else { + req.force_matrix_corpus.clone() + }; let flags = flags_for_mode(&mode); let mut sources = EnrichmentSources { matrix_corpus: matrix_corpus.clone(), flags: Some(flags), + downgraded_from: downgraded_from.clone(), ..Default::default() }; @@ -613,38 +705,64 @@ pub async fn execute( } } - // Step 3: matrix corpus search (if configured for this task class). + // Step 3: matrix corpus search. Multi-corpus path: query top_k from + // each, merge, re-sort by score, take top 8 overall before the + // relevance filter — orthogonal corpora (e.g. arch + symbols) get + // composed without one swamping the other on chunk count alone. 
let mut raw_chunks: Vec = vec![]; - if flags.include_matrix_chunks { - if let Some(corpus) = &matrix_corpus { - let body = serde_json::json!({ - "index_name": corpus, - "query": format!("{} {}\n{}", req.task_class, req.file_path, &file_content[..file_content.len().min(500)]), - "top_k": 8, - }); - match client - .post("http://localhost:3100/vectors/search") - .json(&body) - .send() - .await - { - Ok(r) if r.status().is_success() => { - if let Ok(j) = r.json::().await { - raw_chunks = j - .get("results") - .and_then(|v| v.as_array()) - .cloned() - .unwrap_or_default(); + if flags.include_matrix_chunks && !matrix_corpus.is_empty() { + let query_str = format!( + "{} {}\n{}", + req.task_class, + req.file_path, + &file_content[..file_content.len().min(500)] + ); + let per_corpus_k = if matrix_corpus.len() == 1 { 8 } else { 6 }; + for corpus in &matrix_corpus { + let body = serde_json::json!({ + "index_name": corpus, + "query": query_str, + "top_k": per_corpus_k, + }); + match client + .post("http://localhost:3100/vectors/search") + .json(&body) + .send() + .await + { + Ok(r) if r.status().is_success() => { + if let Ok(j) = r.json::().await { + if let Some(arr) = j.get("results").and_then(|v| v.as_array()) { + for mut c in arr.iter().cloned() { + // Tag the corpus origin on each chunk so + // dropped/kept telemetry can attribute + // signal back to its source corpus. 
+ if let serde_json::Value::Object(ref mut obj) = c { + obj.insert( + "corpus".to_string(), + serde_json::Value::String(corpus.clone()), + ); + } + raw_chunks.push(c); + } + } + } } + Ok(r) => sources + .enrichment_warnings + .push(format!("matrix_search[{}] HTTP {}", corpus, r.status())), + Err(e) => sources + .enrichment_warnings + .push(format!("matrix_search[{}] err: {e}", corpus)), } - Ok(r) => sources - .enrichment_warnings - .push(format!("matrix_search HTTP {}", r.status())), - Err(e) => sources - .enrichment_warnings - .push(format!("matrix_search err: {e}")), } - } // close `if let Some(corpus)` + // Sort merged chunks by score desc and take the global top 8. + raw_chunks.sort_by(|a, b| { + let sa = a.get("score").and_then(|v| v.as_f64()).unwrap_or(0.0); + let sb = b.get("score").and_then(|v| v.as_f64()).unwrap_or(0.0); + sb.partial_cmp(&sa).unwrap_or(std::cmp::Ordering::Equal) + }); + raw_chunks.truncate(8); } // Step 4: relevance filter — drop adjacency pollution. @@ -709,9 +827,16 @@ pub async fn execute( if flags.include_matrix_chunks && !kept_chunks.is_empty() { user_prompt.push_str("📁 RELATED CONTEXT (matrix chunks):\n"); for c in &kept_chunks { - let src = c.get("source").and_then(|v| v.as_str()).unwrap_or("?"); - let txt = c.get("text").and_then(|v| v.as_str()).unwrap_or(""); - user_prompt.push_str(&format!(" [{}] {}\n", src, &txt[..txt.len().min(280)])); + // Prefer doc_id for the tag — corpus builders encode origin + // in doc_id (e.g. `adr:017`, `phase:19`) so the reviewer sees + // useful provenance instead of a generic source label. 
+ let tag = c.get("doc_id").and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .or_else(|| c.get("source").and_then(|v| v.as_str())) + .unwrap_or("?"); + let txt = c.get("text").or_else(|| c.get("chunk_text")) + .and_then(|v| v.as_str()).unwrap_or(""); + user_prompt.push_str(&format!(" [{}] {}\n", tag, &txt[..txt.len().min(280)])); } user_prompt.push_str("\n"); } diff --git a/docs/MODE_RUNNER_TUNING_PLAN.md b/docs/MODE_RUNNER_TUNING_PLAN.md new file mode 100644 index 0000000..3554f8e --- /dev/null +++ b/docs/MODE_RUNNER_TUNING_PLAN.md @@ -0,0 +1,114 @@ +# Mode Runner Tuning Plan + +**Date:** 2026-04-26 +**Branch:** `scrum/auto-apply-19814` (PR #11) +**Status:** Pass 5 variance test complete; conclusions locked. Implementation in progress. + +A fresh Claude session reading this + the pass5 row range in `data/_kb/mode_experiments.jsonl` should be able to continue the work without re-running anything. + +--- + +## What we set out to do + +J's directive 2026-04-26 evening: "Mode runner experiment + corpus tightening." + +Symptom in memory before the session: scrum_review's matrix corpus was kept-rate 0/2 across every call — silent failure. Question: should we tighten the corpus, build new ones, or change retrieval? 
+ +## What we built + +Three new corpora indexed under `/vectors/index`: + +| Corpus | Builder | Docs | Chunks | Source | +|---|---|---|---|---| +| `lakehouse_arch_v1` | `scripts/build_lakehouse_corpus.ts` | 93 | 2119 | DECISIONS.md ADRs + standalone ADRs + PHASES.md + PRD.md + CONTROL_PLANE_PRD.md + SCRUM_MASTER_SPEC.md | +| `scrum_findings_v1` | `scripts/build_scrum_findings_corpus.ts` | 168 | 1260 | Past `scrum_reviews.jsonl` rows | +| `lakehouse_symbols_v1` | `scripts/build_symbols_corpus.ts` | 656 | 2470 | Regex-extracted `pub fn|struct|enum|trait` + `///` docs from `crates/**/*.rs` | + +Multi-corpus support added to the mode runner: +- `crates/gateway/src/v1/mode.rs` — `matrix_corpus` is now `Vec` (string OR array in modes.toml/JSON via `deserialize_string_or_vec`) +- Top-K retrieved from each corpus, merged by score, top 8 globally before relevance filter +- Each chunk tagged with `corpus` for telemetry +- Prompt assembly prefers `doc_id` over `source` so reviewer sees `[adr:009]` not `[lakehouse_arch]` + +Validation infra: +- `scripts/mode_pass5_variance_paid.ts` — N reps × M conditions on one file, paid model +- `scripts/mode_pass5_summarize.ts` — mean ± stddev + head-to-head wins/losses with parser handling 3 finding-table shapes (numbered, path-with-line, path-with-symbol) +- `scripts/mode_compare.ts` — extended grouping key to `mode|corpus` (sorted+joined when multiple corpora) so multi-corpus sweeps don't last-write-wins-clobber + +## What we learned + +### Single-rep bake-off (free-tier `openai/gpt-oss-120b:free`, 3 files) + +Confirmed `lakehouse_arch_v1` adds +1.7 grounded findings/file vs isolation, 100% groundedness, −20s latency. **But:** matrix slightly *hurts* on small files (273-line `delta.rs`: lakehouse 7 vs isolation 9) and unlocks +9 findings on the large file (1355-line `pathway_memory.rs`). + +`scrum_findings_v1` produced 24% out-of-bounds line citations from cross-file line-number drift — **dangerous, excluded from defaults**. 
Only safe with same-file gating (TBD if needed).
+
+### Single-rep bake-off (paid `x-ai/grok-4.1-fast`, 3 files × 4 conditions)
+
+Picture *flips* on a strong model. Composed corpus −1.4 grounded vs isolation. Symbols-alone slightly negative. Arch-alone negative. This suggests kitchen-sink enrichment degrades results when the model is good enough to handle the file directly.
+
+### Pass 5 variance test (paid grok-4.1-fast, 5 reps × 4 conditions on `pathway_memory.rs`)
+
+| Condition | n | mean grounded ± σ | range | H2H vs isolation | Δ mean |
+|---|---|---|---|---|---|
+| **isolation** | 5 | 6.2 ± 1.3 | [5–8] | baseline | — |
+| arch_only | 5 | 5.2 ± 0.8 | [4–6] | 0W–3L–2T | −1.0 |
+| symbols_only | 5 | 6.4 ± 1.5 | [4–8] | 3W–2L–0T | +0.2 |
+| **composed (A+C)** | 5 | 4.4 ± 1.1 | [3–6] | **0W–5L–0T** | **−1.8** |
+
+**Composed loses 5/5 head-to-head against isolation on this file with this model.** Probability under random noise = 1/2⁵ = 3.1%. Statistically significant.
+
+Data window: rows in `data/_kb/mode_experiments.jsonl` where `ts > "2026-04-26T21:50:03Z"` and `file_path == "crates/vectord/src/pathway_memory.rs"`. Re-aggregate any time with `bun run scripts/mode_pass5_summarize.ts --since 2026-04-26T21:50:03Z`.
+
+## Decisions taken
+
+1. **Composed-corpus default is reverted.** `scrum_review.preferred_mode` switches from `codereview_lakehouse` → `codereview_isolation`. Matrix corpora stay defined in modes.toml but only fire when a caller explicitly forces `codereview_lakehouse` or one of the matrix-only experimental modes.
+
+2. **Model-aware enrichment downgrade (α) is wired** in `crates/gateway/src/v1/mode.rs::execute`. When a caller resolves a "strong" model AND the resolved mode is `codereview_lakehouse`, the runner downgrades to the `codereview_isolation` flag-set automatically. The gate is default-strong: every model is treated as strong unless the `is_weak_model` predicate matches — `:free`-suffixed OpenRouter models and the local last-resort `qwen3.5:latest` / `qwen3:latest` rungs, which stay on the full `codereview_lakehouse` path.
Override via `LH_FORCE_FULL_ENRICHMENT=1` for diagnostic runs. + +3. **`scrum_findings_v1` stays excluded from defaults** until same-file gating lands. Built and indexed; do not point any task class at it without that gate. + +## Open follow-ups (not landed in this batch) + +- **Same-file gating for `scrum_findings_v1`** — restrict retrieval to chunks where `file_path == focus_file` so cross-file line-number drift can't happen. Then it becomes a per-file "what was found before" signal. +- **Variance test on small files** — pass 5 was 1 file (the largest, where matrix-hurt was sharpest). Confirm direction holds on 273-line / 333-line files. ~15 min × 2 files = ~30 min. +- **Verify weak-model gain holds with α** — the bake-off showed matrix helps free-tier `gpt-oss-120b:free` on the large file. After α is wired, re-run on a free-tier model to confirm full enrichment still fires for it. ~5 min. +- **Higher-signal matrix (β fork)** — if we ever want matrix back as a default, it can't be whole-ADR/whole-section chunks. Better: only retrieve chunks where the focus file's defined symbols appear. Tighter signal, fewer chunks. Postponed. 
+ +## Reference data + tools + +- **Mode-runner code:** `crates/gateway/src/v1/mode.rs` +- **Mode config:** `config/modes.toml` +- **Per-call experiment log:** `data/_kb/mode_experiments.jsonl` +- **Sweep harnesses:** + - `scripts/mode_experiment.ts` — files × modes × 1 rep (default model: `x-ai/grok-4.1-fast`) + - `scripts/mode_pass2_corpus_sweep.ts` — corpus × threshold sweep + - `scripts/mode_pass3_variance.ts` — temp × reps on one mode + - `scripts/mode_pass5_variance_paid.ts` — N reps × M conditions on one file +- **Aggregators:** + - `scripts/mode_compare.ts` — full per-mode comparison with grounding check + - `scripts/mode_pass5_summarize.ts` — variance + head-to-head, robust to 3 table shapes +- **Corpus builders (re-runnable when source docs / scrum_reviews / source code change):** + - `scripts/build_lakehouse_corpus.ts` + - `scripts/build_scrum_findings_corpus.ts` + - `scripts/build_symbols_corpus.ts` + +## Re-entry recipe (fresh session) + +```bash +cd /home/profit/lakehouse +git log --oneline scrum/auto-apply-19814 -10 # what's recent +cat docs/MODE_RUNNER_TUNING_PLAN.md # this file +bun run scripts/mode_pass5_summarize.ts --since 2026-04-26T21:50:03Z # locked result +curl -s http://localhost:3100/v1/mode/list | jq '.task_classes.scrum_review' # current config +``` + +If you want to reproduce the bake-off: + +```bash +# Strong model variance test (~17 min): +bun run scripts/mode_pass5_variance_paid.ts + +# Weak-model regression (~10 min): +LH_MODEL=openai/gpt-oss-120b:free LH_REPS=3 bun run scripts/mode_pass5_variance_paid.ts +``` diff --git a/scripts/build_lakehouse_corpus.ts b/scripts/build_lakehouse_corpus.ts new file mode 100644 index 0000000..2f15634 --- /dev/null +++ b/scripts/build_lakehouse_corpus.ts @@ -0,0 +1,176 @@ +#!/usr/bin/env bun +/** + * Build the `lakehouse_arch_v1` corpus — Option A from 2026-04-26 + * corpus-tightening pass. 
Sources: DECISIONS.md ADRs, standalone + * ADR-NNN-*.md docs, PHASES.md per-phase entries, PRD.md, + * CONTROL_PLANE_PRD.md, SCRUM_MASTER_SPEC.md sections. + * + * doc_id encodes origin (adr:017, phase:19, prd:executive_summary, ...) + * so the reviewer prompt's [tag] surfaces useful context. + * + * Usage: + * bun run scripts/build_lakehouse_corpus.ts # build + * bun run scripts/build_lakehouse_corpus.ts --dry-run # show docs, don't POST + * bun run scripts/build_lakehouse_corpus.ts --print # dump first chunk + count + */ + +import { readFileSync, readdirSync } from "node:fs"; +import { resolve } from "node:path"; + +const ROOT = resolve(import.meta.dir, ".."); +const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100"; +const INDEX_NAME = process.env.LH_CORPUS_NAME ?? "lakehouse_arch_v1"; +const SOURCE_LABEL = "lakehouse_arch"; +const CHUNK_SIZE = Number(process.env.LH_CHUNK_SIZE ?? 1500); +const OVERLAP = Number(process.env.LH_OVERLAP ?? 150); + +interface Doc { id: string; text: string } + +function slug(s: string): string { + return s + .toLowerCase() + .replace(/[^a-z0-9]+/g, "_") + .replace(/^_+|_+$/g, "") + .slice(0, 60); +} + +// Split DECISIONS.md by `## ADR-NNN: title`. Drop date line so it doesn't +// dilute the embedding (ADRs are about intent, not when they happened). +function chunkDecisionsMd(md: string): Doc[] { + const docs: Doc[] = []; + const sections = md.split(/^## ADR-(\d+):\s*(.+)$/m); + // sections = [preamble, num, title, body, num, title, body, ...] + for (let i = 1; i < sections.length; i += 3) { + const num = sections[i].padStart(3, "0"); + const title = sections[i + 1].trim(); + const body = sections[i + 2] + .replace(/^\*\*Date:\*\*.*$/m, "") + .trim(); + docs.push({ + id: `adr:${num}`, + text: `# ADR-${num}: ${title}\n\n${body}`, + }); + } + return docs; +} + +// Standalone ADR-NNN-*.md files in docs/ — keep one doc per file. 
+function chunkStandaloneAdrs(dir: string): Doc[] { + const docs: Doc[] = []; + for (const f of readdirSync(dir)) { + const m = f.match(/^ADR-(\d+)-(.+)\.md$/); + if (!m) continue; + const num = m[1].padStart(3, "0"); + const slug_ = slug(m[2]); + docs.push({ + id: `adr_doc:${num}_${slug_}`, + text: readFileSync(resolve(dir, f), "utf8"), + }); + } + return docs; +} + +// PHASES.md uses `## Phase N: title` headings + nested checklists. Split +// by phase. Sub-bullets stay with their parent phase so context is intact. +function chunkPhasesMd(md: string): Doc[] { + const docs: Doc[] = []; + const sections = md.split(/^## (Phase[^\n]*)$/m); + for (let i = 1; i < sections.length; i += 2) { + const heading = sections[i].trim(); + const body = sections[i + 1].trim(); + if (!body) continue; + const phase_num_match = heading.match(/Phase\s+(\S+)/); + const id_part = phase_num_match + ? `phase:${slug(phase_num_match[1])}` + : `phase:${slug(heading)}`; + docs.push({ id: id_part, text: `## ${heading}\n${body}` }); + } + return docs; +} + +// Generic doc: split by `## Section` (top-level inside a single doc). If +// the section list is empty, return the whole file as one doc and let the +// server-side chunker handle it. 
+function chunkBySectionH2(filePath: string, originPrefix: string): Doc[] { + const md = readFileSync(filePath, "utf8"); + const sections = md.split(/^## (.+)$/m); + if (sections.length < 3) { + return [{ id: `${originPrefix}:_full`, text: md }]; + } + const docs: Doc[] = []; + // Capture preamble (before any ## heading) if non-trivial + if (sections[0].trim().length > 200) { + docs.push({ + id: `${originPrefix}:_preamble`, + text: sections[0].trim(), + }); + } + for (let i = 1; i < sections.length; i += 2) { + const heading = sections[i].trim(); + const body = sections[i + 1].trim(); + if (!body) continue; + docs.push({ + id: `${originPrefix}:${slug(heading)}`, + text: `## ${heading}\n${body}`, + }); + } + return docs; +} + +function buildAllDocs(): Doc[] { + const docs: Doc[] = []; + docs.push(...chunkDecisionsMd(readFileSync(resolve(ROOT, "docs/DECISIONS.md"), "utf8"))); + docs.push(...chunkStandaloneAdrs(resolve(ROOT, "docs"))); + docs.push(...chunkPhasesMd(readFileSync(resolve(ROOT, "docs/PHASES.md"), "utf8"))); + docs.push(...chunkBySectionH2(resolve(ROOT, "docs/PRD.md"), "prd")); + docs.push(...chunkBySectionH2(resolve(ROOT, "docs/CONTROL_PLANE_PRD.md"), "ctrl_prd")); + docs.push(...chunkBySectionH2(resolve(ROOT, "docs/SCRUM_MASTER_SPEC.md"), "scrum_spec")); + return docs; +} + +async function main() { + const dryRun = process.argv.includes("--dry-run") || process.argv.includes("--print"); + const printOnly = process.argv.includes("--print"); + + const docs = buildAllDocs(); + const totalBytes = docs.reduce((s, d) => s + d.text.length, 0); + const expectedChunks = Math.ceil(totalBytes / (CHUNK_SIZE - OVERLAP)); + + console.log(`[corpus] ${docs.length} documents · ${totalBytes} bytes · ~${expectedChunks} chunks at ${CHUNK_SIZE}/${OVERLAP}`); + console.log(`[corpus] origins: ${[...new Set(docs.map(d => d.id.split(":")[0]))].join(", ")}`); + + if (printOnly) { + console.log("\n[corpus] first 3 doc IDs:"); + docs.slice(0, 3).forEach(d => console.log(` ${d.id} 
(${d.text.length} bytes) ${d.text.slice(0, 80).replace(/\n/g, " ")}…`)); + console.log("\n[corpus] last 3 doc IDs:"); + docs.slice(-3).forEach(d => console.log(` ${d.id} (${d.text.length} bytes) ${d.text.slice(0, 80).replace(/\n/g, " ")}…`)); + return; + } + if (dryRun) return; + + const body = { + index_name: INDEX_NAME, + source: SOURCE_LABEL, + documents: docs, + chunk_size: CHUNK_SIZE, + overlap: OVERLAP, + }; + + console.log(`[corpus] POST ${GATEWAY}/vectors/index → ${INDEX_NAME}`); + const r = await fetch(`${GATEWAY}/vectors/index`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify(body), + signal: AbortSignal.timeout(60_000), + }); + if (!r.ok) { + console.error(`[corpus] HTTP ${r.status}: ${await r.text()}`); + process.exit(1); + } + const j: any = await r.json(); + console.log(`[corpus] job ${j.job_id} · ${j.documents} docs → ${j.chunks} chunks queued`); + console.log(`[corpus] poll: curl -s ${GATEWAY}/vectors/jobs/${j.job_id} | jq`); + console.log(`[corpus] verify: curl -s '${GATEWAY}/vectors/indexes' | jq '.[]|select(.index_name=="${INDEX_NAME}")'`); +} + +main().catch(e => { console.error(e); process.exit(1); }); diff --git a/scripts/build_scrum_findings_corpus.ts b/scripts/build_scrum_findings_corpus.ts new file mode 100644 index 0000000..f25bdb4 --- /dev/null +++ b/scripts/build_scrum_findings_corpus.ts @@ -0,0 +1,94 @@ +#!/usr/bin/env bun +/** + * Build the `scrum_findings_v1` corpus — Option B from 2026-04-26 + * corpus pass. Self-feeding: each accepted scrum review's + * `suggestions_preview` becomes a document, indexed under doc_id + * `review::` so multi-iteration coexists. + * + * Re-run this whenever scrum_reviews.jsonl grows; the index_name stays + * stable and the gateway will re-register metadata. + */ + +import { readFileSync } from "node:fs"; +import { resolve } from "node:path"; + +const ROOT = resolve(import.meta.dir, ".."); +const GATEWAY = process.env.LH_GATEWAY ?? 
"http://localhost:3100"; +const INDEX_NAME = process.env.LH_CORPUS_NAME ?? "scrum_findings_v1"; +const SOURCE_LABEL = "scrum_findings"; +const CHUNK_SIZE = Number(process.env.LH_CHUNK_SIZE ?? 1500); +const OVERLAP = Number(process.env.LH_OVERLAP ?? 150); +const MIN_PREVIEW_BYTES = 200; // skip stub rows + +interface Doc { id: string; text: string } + +function slugFile(path: string): string { + return path.replace(/^crates\//, "").replace(/[^a-z0-9]+/gi, "_").slice(0, 40); +} + +function compactTs(iso: string): string { + return iso.replace(/[-:T]/g, "").slice(0, 14); // 20260424T110656 +} + +function buildDocs(): Doc[] { + const lines = readFileSync(resolve(ROOT, "data/_kb/scrum_reviews.jsonl"), "utf8").split("\n").filter(Boolean); + const docs: Doc[] = []; + const idCounts = new Map(); + + for (const line of lines) { + let row: any; + try { row = JSON.parse(line); } catch { continue; } + + const file = row.file ?? ""; + const preview = row.suggestions_preview ?? ""; + if (!file || preview.length < MIN_PREVIEW_BYTES) continue; + + const ts = compactTs(row.reviewed_at ?? ""); + const baseId = `review:${slugFile(file)}:${ts || "no_ts"}`; + // Multiple reviews with same ts (rare but possible) get a counter. + const count = (idCounts.get(baseId) ?? 0) + 1; + idCounts.set(baseId, count); + const id = count === 1 ? baseId : `${baseId}_${count}`; + + const header = `File: ${file}\nReviewed: ${row.reviewed_at ?? "?"}\nModel: ${row.accepted_model ?? "?"}\nVerdict: ${row.verdict ?? "?"}\nFindings: ${row.findings_count ?? 
"?"}\n\n`; + docs.push({ id, text: header + preview }); + } + return docs; +} + +async function main() { + const dryRun = process.argv.includes("--dry-run") || process.argv.includes("--print"); + const printOnly = process.argv.includes("--print"); + + const docs = buildDocs(); + const totalBytes = docs.reduce((s, d) => s + d.text.length, 0); + console.log(`[corpus-B] ${docs.length} reviews · ${totalBytes} bytes · target chunk_size=${CHUNK_SIZE}`); + console.log(`[corpus-B] file coverage: ${new Set(docs.map(d => d.id.split(":")[1])).size} unique files`); + + if (printOnly) { + docs.slice(0, 3).forEach(d => console.log(` ${d.id} (${d.text.length}b) ${d.text.slice(0, 80).replace(/\n/g, " ")}…`)); + return; + } + if (dryRun) return; + + const r = await fetch(`${GATEWAY}/vectors/index`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + index_name: INDEX_NAME, + source: SOURCE_LABEL, + documents: docs, + chunk_size: CHUNK_SIZE, + overlap: OVERLAP, + }), + signal: AbortSignal.timeout(60_000), + }); + if (!r.ok) { + console.error(`[corpus-B] HTTP ${r.status}: ${await r.text()}`); + process.exit(1); + } + const j: any = await r.json(); + console.log(`[corpus-B] job ${j.job_id} · ${j.documents} docs → ${j.chunks} chunks queued`); +} + +main().catch(e => { console.error(e); process.exit(1); }); diff --git a/scripts/build_symbols_corpus.ts b/scripts/build_symbols_corpus.ts new file mode 100644 index 0000000..72e2038 --- /dev/null +++ b/scripts/build_symbols_corpus.ts @@ -0,0 +1,141 @@ +#!/usr/bin/env bun +/** + * Build the `lakehouse_symbols_v1` corpus — Option C from 2026-04-26 + * pass. Extracts public Rust items with their /// doc comments from + * crates/**\/*.rs. Regex-based — covers ~80% of definitions without + * pulling in a syn-based parser. + * + * doc_id: `symbol:::::` e.g. 
symbol:vectord::struct::PathwayTrace + * + * Each chunk includes: doc comment (if any) + signature + 1-2 lines + * after the brace so reviewer sees field types / variants for structs + * and enums. + */ + +import { readFileSync, readdirSync, statSync } from "node:fs"; +import { resolve, relative } from "node:path"; + +const ROOT = resolve(import.meta.dir, ".."); +const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100"; +const INDEX_NAME = process.env.LH_CORPUS_NAME ?? "lakehouse_symbols_v1"; +const SOURCE_LABEL = "lakehouse_symbols"; +const CHUNK_SIZE = Number(process.env.LH_CHUNK_SIZE ?? 800); +const OVERLAP = Number(process.env.LH_OVERLAP ?? 80); + +interface Doc { id: string; text: string } + +function walkRs(dir: string): string[] { + const out: string[] = []; + for (const entry of readdirSync(dir)) { + if (entry === "target" || entry.startsWith(".")) continue; + const full = resolve(dir, entry); + const st = statSync(full); + if (st.isDirectory()) out.push(...walkRs(full)); + else if (entry.endsWith(".rs")) out.push(full); + } + return out; +} + +function crateOf(rsPath: string): string { + const rel = relative(resolve(ROOT, "crates"), rsPath); + return rel.split("/")[0]; +} + +// Match pub fn|struct|enum|trait declarations. Capture the (optional) +// preceding contiguous /// doc block and a few lines after for signature +// + body preview. Skips items inside `mod tests` blocks and #[cfg(test)]. +const ITEM_RE = /(?:^[ \t]*\/\/\/.*\n)*[ \t]*pub(?:\([^)]+\))?[ \t]+(fn|struct|enum|trait|async[ \t]+fn)[ \t]+([A-Za-z_][A-Za-z0-9_]*)/gm; + +function extractItems(src: string, crate: string, relPath: string): Doc[] { + const docs: Doc[] = []; + const seen = new Set(); + + // Quick test-module guard: drop everything from a `mod tests {` line + // onward. Coarse but adequate — public items inside tests are rare. + const cutoff = src.search(/^(#\[cfg\(test\)\]|mod tests\b)/m); + const usable = cutoff > 0 ? 
src.slice(0, cutoff) : src; + + for (const m of usable.matchAll(ITEM_RE)) { + const matchStart = m.index!; + const kind = m[1].replace(/^async[ \t]+/, "async_"); + const name = m[2]; + + // Walk backward to capture the contiguous /// doc block above. + const lines = usable.slice(0, matchStart).split("\n"); + const docLines: string[] = []; + for (let i = lines.length - 1; i >= 0; i--) { + const t = lines[i].trim(); + if (t.startsWith("///")) docLines.unshift(t.replace(/^\/\/\/\s?/, "")); + else if (t === "" || t.startsWith("#[")) continue; + else break; + } + + // Capture signature + ~6 lines of body preview. + const after = usable.slice(matchStart, matchStart + 800); + const bodyEnd = after.search(/\n\}\n|\n\n[a-z#]/); + const body = bodyEnd > 0 ? after.slice(0, Math.min(bodyEnd, 800)) : after.slice(0, 800); + + const id = `symbol:${crate}::${kind}::${name}`; + if (seen.has(id)) continue; + seen.add(id); + + const header = `${crate}::${name} (${kind}) — ${relPath}`; + const docText = docLines.length > 0 ? `\n${docLines.join("\n")}\n` : "\n"; + docs.push({ id, text: `${header}\n${docText}\n\`\`\`rust\n${body}\n\`\`\`` }); + } + return docs; +} + +function buildDocs(): Doc[] { + const cratesDir = resolve(ROOT, "crates"); + const docs: Doc[] = []; + for (const f of walkRs(cratesDir)) { + const src = readFileSync(f, "utf8"); + const crate = crateOf(f); + const rel = relative(ROOT, f); + docs.push(...extractItems(src, crate, rel)); + } + return docs; +} + +async function main() { + const printOnly = process.argv.includes("--print"); + const dryRun = process.argv.includes("--dry-run") || printOnly; + + const docs = buildDocs(); + const totalBytes = docs.reduce((s, d) => s + d.text.length, 0); + const byCrate = new Map(); + for (const d of docs) { + const c = d.id.split("::")[0].replace("symbol:", ""); + byCrate.set(c, (byCrate.get(c) ?? 
0) + 1); + } + console.log(`[corpus-C] ${docs.length} symbols · ${totalBytes} bytes · chunk_size=${CHUNK_SIZE}`); + console.log(`[corpus-C] by crate: ${[...byCrate.entries()].map(([k, v]) => `${k}=${v}`).join(", ")}`); + + if (printOnly) { + docs.slice(0, 3).forEach(d => console.log(` ${d.id} (${d.text.length}b)\n ${d.text.slice(0, 200).replace(/\n/g, "\n ")}\n`)); + return; + } + if (dryRun) return; + + const r = await fetch(`${GATEWAY}/vectors/index`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + index_name: INDEX_NAME, + source: SOURCE_LABEL, + documents: docs, + chunk_size: CHUNK_SIZE, + overlap: OVERLAP, + }), + signal: AbortSignal.timeout(60_000), + }); + if (!r.ok) { + console.error(`[corpus-C] HTTP ${r.status}: ${await r.text()}`); + process.exit(1); + } + const j: any = await r.json(); + console.log(`[corpus-C] job ${j.job_id} · ${j.documents} docs → ${j.chunks} chunks queued`); +} + +main().catch(e => { console.error(e); process.exit(1); }); diff --git a/scripts/mode_compare.ts b/scripts/mode_compare.ts index ddc0877..ec403d8 100644 --- a/scripts/mode_compare.ts +++ b/scripts/mode_compare.ts @@ -227,13 +227,37 @@ function main() { process.exit(1); } - // Group by file → mode + // Group by file → mode (with corpus appended when matrix-bearing modes + // were swept across multiple corpora — otherwise lakehouse_arch_v1 + // would clobber scrum_findings_v1 etc). matrix_corpus is now a Vec + // on the wire (multi-corpus support); legacy rows have either a + // string or null. Coerce to a stable key. + const matrixCorpus = (r: Row): string => { + const c = (r.sources as any)?.matrix_corpus; + if (!c) return ""; + if (typeof c === "string") return c; + if (Array.isArray(c)) { + if (c.length === 0) return ""; + if (c.length === 1) return c[0]; + // Stable join: sort then "+"-separate so order doesn't matter. 
+ return [...c].sort().join("+"); + } + return ""; + }; + const corporaInPlay = new Set(rows.map(matrixCorpus).filter(c => c)); + const showCorpus = corporaInPlay.size > 1; + const keyOf = (r: Row): string => { + const c = matrixCorpus(r); + return showCorpus && c ? `${r.mode}|${c}` : r.mode; + }; + const byFile: Record> = {}; const allModes = new Set(); for (const r of rows) { byFile[r.file_path] ??= {}; - byFile[r.file_path][r.mode] = r; // last-write-wins per mode per file - allModes.add(r.mode); + const k = keyOf(r); + byFile[r.file_path][k] = r; // last-write-wins per (mode,corpus) per file + allModes.add(k); } const modesSorted = [...allModes].sort(); @@ -246,13 +270,13 @@ function main() { const fileLines = fileContent ? fileContent.split("\n").length : 0; console.log(` (file: ${fileLines} lines${fileContent === null ? ", NOT READABLE — grounding skipped" : ""})`); console.log( - ` ${pad("mode", 30)} ${pad("resp", 6, true)} ${pad("find", 5, true)} ${pad("ground", 9, true)} ${pad("hallu", 6, true)} ${pad("OOB", 4, true)} ${pad("path", 5, true)} ${pad("ms", 7, true)} ${pad("bug_fp", 6, true)}` + ` ${pad("mode", 56)} ${pad("resp", 6, true)} ${pad("find", 5, true)} ${pad("ground", 9, true)} ${pad("hallu", 6, true)} ${pad("OOB", 4, true)} ${pad("path", 5, true)} ${pad("ms", 7, true)} ${pad("bug_fp", 6, true)}` ); - console.log(` ${"─".repeat(30)} ${"─".repeat(6)} ${"─".repeat(5)} ${"─".repeat(9)} ${"─".repeat(6)} ${"─".repeat(4)} ${"─".repeat(5)} ${"─".repeat(7)} ${"─".repeat(6)}`); + console.log(` ${"─".repeat(56)} ${"─".repeat(6)} ${"─".repeat(5)} ${"─".repeat(9)} ${"─".repeat(6)} ${"─".repeat(4)} ${"─".repeat(5)} ${"─".repeat(7)} ${"─".repeat(6)}`); for (const mode of modesSorted) { const r = byFile[file][mode]; if (!r) { - console.log(` ${pad(modeLabel(mode), 30)} ${pad("—", 6, true)}`); + console.log(` ${pad(modeLabel(mode), 56)} ${pad("—", 6, true)}`); continue; } const findings = countFindings(r.response); @@ -261,7 +285,7 @@ function main() { const 
grounding = checkGrounding(r.response, fileContent); const groundedStr = grounding.total === 0 ? "—" : `${grounding.grounded}/${grounding.total}`; console.log( - ` ${pad(modeLabel(mode), 30)} ${pad(r.response_chars, 6, true)} ${pad(findings, 5, true)} ${pad(groundedStr, 9, true)} ${pad(grounding.hallucinated, 6, true)} ${pad(grounding.out_of_bounds_lines, 4, true)} ${pad(cits, 5, true)} ${pad(r.latency_ms, 7, true)} ${pad(bf, 6, true)}` + ` ${pad(modeLabel(mode), 56)} ${pad(r.response_chars, 6, true)} ${pad(findings, 5, true)} ${pad(groundedStr, 9, true)} ${pad(grounding.hallucinated, 6, true)} ${pad(grounding.out_of_bounds_lines, 4, true)} ${pad(cits, 5, true)} ${pad(r.latency_ms, 7, true)} ${pad(bf, 6, true)}` ); } console.log(""); @@ -271,11 +295,11 @@ function main() { // avg_groundedness is the rate at which findings cite real symbols // within file bounds. Modes with low groundedness are confabulating. console.log("═══ PER-MODE AGGREGATE ═══\n"); - console.log(` ${pad("mode", 30)} ${pad("n", 3, true)} ${pad("avg find", 9, true)} ${pad("avg grnd", 9, true)} ${pad("grnd %", 7, true)} ${pad("avg hallu", 10, true)} ${pad("avg ms", 7, true)}`); - console.log(` ${"─".repeat(30)} ${"─".repeat(3)} ${"─".repeat(9)} ${"─".repeat(9)} ${"─".repeat(7)} ${"─".repeat(10)} ${"─".repeat(7)}`); + console.log(` ${pad("mode", 56)} ${pad("n", 3, true)} ${pad("avg find", 9, true)} ${pad("avg grnd", 9, true)} ${pad("grnd %", 7, true)} ${pad("avg hallu", 10, true)} ${pad("avg ms", 7, true)}`); + console.log(` ${"─".repeat(56)} ${"─".repeat(3)} ${"─".repeat(9)} ${"─".repeat(9)} ${"─".repeat(7)} ${"─".repeat(10)} ${"─".repeat(7)}`); const fileCache: Record = {}; for (const mode of modesSorted) { - const modeRows = rows.filter(r => r.mode === mode); + const modeRows = rows.filter(r => keyOf(r) === mode); if (modeRows.length === 0) continue; const n = modeRows.length; let totFind = 0, totGround = 0, totHallu = 0; @@ -292,7 +316,7 @@ function main() { const avgHallu = (totHallu / 
n).toFixed(1); const avgMs = Math.round(modeRows.reduce((s, r) => s + r.latency_ms, 0) / n); console.log( - ` ${pad(modeLabel(mode), 30)} ${pad(n, 3, true)} ${pad(avgFind, 9, true)} ${pad(avgGround, 9, true)} ${pad(grndPct, 7, true)} ${pad(avgHallu, 10, true)} ${pad(avgMs, 7, true)}` + ` ${pad(modeLabel(mode), 56)} ${pad(n, 3, true)} ${pad(avgFind, 9, true)} ${pad(avgGround, 9, true)} ${pad(grndPct, 7, true)} ${pad(avgHallu, 10, true)} ${pad(avgMs, 7, true)}` ); } @@ -301,13 +325,20 @@ function main() { // wins). Comparing grounded findings instead corrects for modes // that produce convincing-but-fake output. console.log("\n═══ MODE vs codereview_lakehouse (grounded findings, per file) ═══\n"); - console.log(` ${pad("mode", 30)} ${pad("wins", 5, true)} ${pad("losses", 7, true)} ${pad("ties", 5, true)} ${pad("Δ avg grounded", 16, true)}`); - console.log(` ${"─".repeat(30)} ${"─".repeat(5)} ${"─".repeat(7)} ${"─".repeat(5)} ${"─".repeat(16)}`); + console.log(` ${pad("mode", 56)} ${pad("wins", 5, true)} ${pad("losses", 7, true)} ${pad("ties", 5, true)} ${pad("Δ avg grounded", 16, true)}`); + console.log(` ${"─".repeat(56)} ${"─".repeat(5)} ${"─".repeat(7)} ${"─".repeat(5)} ${"─".repeat(16)}`); + // Pick whichever codereview_lakehouse key shows up most often as the + // baseline (handles corpus-suffixed keys when showCorpus=true). + const baselineKey = modesSorted + .filter(k => k.startsWith("codereview_lakehouse")) + .sort((a, b) => + Object.values(byFile).filter(f => f[b]).length - + Object.values(byFile).filter(f => f[a]).length)[0] ?? 
"codereview_lakehouse"; for (const mode of modesSorted) { - if (mode === "codereview_lakehouse") continue; + if (mode === baselineKey) continue; let wins = 0, losses = 0, ties = 0, totalDelta = 0, n = 0; for (const file of Object.keys(byFile)) { - const baseline = byFile[file]["codereview_lakehouse"]; + const baseline = byFile[file][baselineKey]; const challenger = byFile[file][mode]; if (!baseline || !challenger) continue; const fc = fileCache[file] ??= readFileSafe(file); @@ -322,7 +353,7 @@ function main() { if (n === 0) continue; const avgDelta = (totalDelta / n).toFixed(1); console.log( - ` ${pad(modeLabel(mode), 30)} ${pad(wins, 5, true)} ${pad(losses, 7, true)} ${pad(ties, 5, true)} ${pad(avgDelta, 16, true)}` + ` ${pad(modeLabel(mode), 56)} ${pad(wins, 5, true)} ${pad(losses, 7, true)} ${pad(ties, 5, true)} ${pad(avgDelta, 16, true)}` ); } console.log("\n[compare] done — ⚗ marks lossy/control modes, exclude from recommendations\n"); diff --git a/scripts/mode_experiment.ts b/scripts/mode_experiment.ts index f4a1f76..96ec73d 100644 --- a/scripts/mode_experiment.ts +++ b/scripts/mode_experiment.ts @@ -32,7 +32,7 @@ const DEFAULT_FILES = [ "crates/queryd/src/service.rs", ]; -function parseArgs(): { files: string[]; modes: string[]; model: string } { +function parseArgs(): { files: string[]; modes: string[]; model: string; corpus: string[] } { const args = Bun.argv.slice(2); const out: Record = {}; for (let i = 0; i < args.length; i++) { @@ -41,8 +41,12 @@ function parseArgs(): { files: string[]; modes: string[]; model: string } { } const files = (out.files ?? DEFAULT_FILES.join(",")).split(",").map(s => s.trim()).filter(Boolean); const modes = (out.modes ?? ALL_MODES.join(",")).split(",").map(s => s.trim()).filter(Boolean); - const model = out.model ?? "openai/gpt-oss-120b:free"; - return { files, modes, model }; + // Default to the paid OpenRouter primary (matches scrum_master_pipeline + // ladder rung 1). 
Pass `--model openai/gpt-oss-120b:free` if you want + // the old free-tier baseline. See SCRUM_MASTER_SPEC.md for the ladder. + const model = out.model ?? "x-ai/grok-4.1-fast"; + const corpus = (out.corpus ?? "").split(",").map(s => s.trim()).filter(Boolean); + return { files, modes, model, corpus }; } interface RunResult { @@ -58,19 +62,22 @@ interface RunResult { error?: string; } -async function runOne(file: string, mode: string, model: string): Promise { +async function runOne(file: string, mode: string, model: string, corpus: string[]): Promise { const t0 = Date.now(); try { + const body: any = { + task_class: TASK_CLASS, + file_path: file, + force_mode: mode, + force_model: model, + }; + if (corpus.length === 1) body.force_matrix_corpus = corpus[0]; + else if (corpus.length > 1) body.force_matrix_corpus = corpus; const r = await fetch(`${GATEWAY}/v1/mode/execute`, { method: "POST", headers: { "content-type": "application/json" }, - body: JSON.stringify({ - task_class: TASK_CLASS, - file_path: file, - force_mode: mode, - force_model: model, - }), - signal: AbortSignal.timeout(180_000), + body: JSON.stringify(body), + signal: AbortSignal.timeout(240_000), }); if (!r.ok) { const body = await r.text().catch(() => ""); @@ -92,9 +99,10 @@ async function runOne(file: string, mode: string, model: string): Promise 0) console.log(`[experiment] corpus override: ${corpus.join(" + ")}`); console.log(""); const results: RunResult[] = []; @@ -103,7 +111,7 @@ async function main() { for (const mode of modes) { i++; process.stdout.write(` [${i}/${files.length * modes.length}] ${mode.padEnd(28)} ${file} ... 
`); - const r = await runOne(file, mode, model); + const r = await runOne(file, mode, model, corpus); results.push(r); if (r.ok) { console.log( diff --git a/scripts/mode_pass5_summarize.ts b/scripts/mode_pass5_summarize.ts new file mode 100644 index 0000000..0b90657 --- /dev/null +++ b/scripts/mode_pass5_summarize.ts @@ -0,0 +1,169 @@ +#!/usr/bin/env bun +/** + * Pass 5 variance summarizer. Reads data/_kb/mode_experiments.jsonl + * since a timestamp, groups by (mode|corpus), reports mean ± stddev + * of grounded finding count, plus a head-to-head wins/losses table + * vs the isolation baseline. + * + * Usage: + * bun run scripts/mode_pass5_summarize.ts # default 2h + * bun run scripts/mode_pass5_summarize.ts --since 2026-04-26T22 # explicit + */ + +import { readFileSync, existsSync } from "node:fs"; + +const argSince = (() => { + const i = Bun.argv.indexOf("--since"); + return i >= 0 ? Bun.argv[i + 1] : new Date(Date.now() - 2 * 60 * 60 * 1000).toISOString(); +})(); + +const JSONL = "data/_kb/mode_experiments.jsonl"; +if (!existsSync(JSONL)) { console.error(`no ${JSONL}`); process.exit(1); } + +interface Row { + ts: string; mode: string; file_path: string; response: string; + sources: { matrix_corpus?: string | string[] | null }; + latency_ms: number; +} + +function corpusKey(c: any): string { + if (!c) return ""; + if (typeof c === "string") return c; + if (Array.isArray(c)) return c.length === 0 ? "" : [...c].sort().join("+"); + return ""; +} +const condKey = (r: Row) => { + const c = corpusKey(r.sources?.matrix_corpus); + return c ? `${r.mode}|${c}` : r.mode; +}; + +// Reuse the same grounding logic as mode_compare — symbols cited in +// findings rows must appear in the focus file, and any line numbers +// must fall within EOF. 
+function extractFindings(md: string): { symbols: string[]; lines: number[] }[] { + const sec = /(?:^|\n)#{1,3}[^\na-zA-Z]*(?:Ranked\s+)?Findings?[^\n]*\n/i; + const m = md.match(sec); + let section = md; + if (m && m.index !== undefined) { + const after = md.slice(m.index + m[0].length); + const stop = after.search(/\n#{1,3}[^\na-zA-Z]*(?:Patch|Suggestion|Reference|Summary|Concrete)/i); + section = stop >= 0 ? after.slice(0, stop) : after; + } + // Three row shapes: + // 1) numbered: `| 1 | ... |` + // 2) path-with-line: `| service.rs:106 | ... |` + // 3) path-with-sym: `| crates/vectord/src/pathway_memory.rs:load_fn (≈L220) | ... |` + // Pick whichever shape matches the most rows (ties favor numbered). + const numbered = section.split("\n").filter(l => /^\|\s*\*?\*?\d+\*?\*?\s*\|/.test(l)); + const pathRows = section.split("\n").filter(l => /^\|\s*[a-z_/\.][a-z_/\.0-9]*\.(rs|ts|py)\b/i.test(l)); + const rows = numbered.length >= pathRows.length ? numbered : pathRows; + return rows.map(row => { + const sym = new Set(); + for (const t of row.matchAll(/`([A-Za-z_][A-Za-z0-9_:]*)`/g)) sym.add(t[1]); + for (const t of row.matchAll(/\b([a-z][a-z0-9_]{4,})\b/g)) sym.add(t[1]); + const lines: number[] = []; + for (const t of row.matchAll(/[:\-](\d{2,5})/g)) lines.push(parseInt(t[1])); + return { symbols: [...sym], lines }; + }); +} + +function grounded(md: string, file: string): { total: number; grounded: number; oob: number } { + const content = readFileSync(file, "utf8"); + const eof = content.split("\n").length; + const findings = extractFindings(md); + let g = 0, oob = 0; + for (const f of findings) { + const symHit = f.symbols.length > 0 && f.symbols.some(s => content.includes(s)); + const lineOob = f.lines.length > 0 && f.lines.some(l => l > eof); + if (lineOob) oob++; + if (symHit && !lineOob) g++; + } + return { total: findings.length, grounded: g, oob }; +} + +const lines = readFileSync(JSONL, "utf8").split("\n").filter(Boolean); +const rows: Row[] = []; +for 
(const l of lines) { + try { + const r: Row = JSON.parse(l); + if (r.ts < argSince) continue; + rows.push(r); + } catch {} +} + +if (rows.length === 0) { console.error(`no rows since ${argSince}`); process.exit(1); } + +// Group: condition → file → array of grounded counts +type CellArr = { grnd: number[]; total: number[]; oob: number[]; ms: number[] }; +const byCond: Record> = {}; +for (const r of rows) { + const k = condKey(r); + byCond[k] ??= {}; + byCond[k][r.file_path] ??= { grnd: [], total: [], oob: [], ms: [] }; + const g = grounded(r.response, r.file_path); + byCond[k][r.file_path].grnd.push(g.grounded); + byCond[k][r.file_path].total.push(g.total); + byCond[k][r.file_path].oob.push(g.oob); + byCond[k][r.file_path].ms.push(r.latency_ms); +} + +function stats(xs: number[]): { n: number; mean: number; sd: number; min: number; max: number } { + const n = xs.length; + if (n === 0) return { n: 0, mean: 0, sd: 0, min: 0, max: 0 }; + const mean = xs.reduce((s, x) => s + x, 0) / n; + const variance = n === 1 ? 
0 : xs.reduce((s, x) => s + (x - mean) ** 2, 0) / (n - 1); + return { n, mean, sd: Math.sqrt(variance), min: Math.min(...xs), max: Math.max(...xs) }; +} + +const conditions = Object.keys(byCond).sort(); +const files = [...new Set(rows.map(r => r.file_path))].sort(); + +console.log(`\n═══ Pass 5 variance — since ${argSince} ═══\n`); +console.log(` ${rows.length} rows · ${conditions.length} conditions · ${files.length} files\n`); + +for (const file of files) { + console.log(`📄 ${file}`); + console.log(` ${"condition".padEnd(56)} n ${"grounded mean ± sd".padStart(20)} ${"range".padStart(8)} ${"oob".padStart(4)} ${"avg ms".padStart(7)}`); + console.log(` ${"─".repeat(56)} ─── ${"─".repeat(20)} ${"─".repeat(8)} ${"─".repeat(4)} ${"─".repeat(7)}`); + for (const c of conditions) { + const cell = byCond[c]?.[file]; + if (!cell || cell.grnd.length === 0) continue; + const s = stats(cell.grnd); + const oobSum = cell.oob.reduce((a, b) => a + b, 0); + const msMean = cell.ms.reduce((a, b) => a + b, 0) / cell.ms.length; + const meanSd = `${s.mean.toFixed(1)} ± ${s.sd.toFixed(1)}`; + const range = `[${s.min}-${s.max}]`; + console.log(` ${c.padEnd(56)} ${String(s.n).padStart(3)} ${meanSd.padStart(20)} ${range.padStart(8)} ${String(oobSum).padStart(4)} ${Math.round(msMean / 1000).toString().padStart(5)}s`); + } + console.log(""); +} + +// Head-to-head: for each condition vs isolation baseline, count rep-by-rep +// wins across the same file. Requires equal rep counts. 
+console.log(`═══ Head-to-head: each condition vs isolation, rep-by-rep ═══\n`); +const isoKey = conditions.find(c => c.startsWith("codereview_isolation")); +if (!isoKey) { + console.log(" no isolation rows in window"); +} else { + console.log(` baseline: ${isoKey}\n`); + console.log(` ${"challenger".padEnd(56)} wins losses ties Δ mean grnd`); + console.log(` ${"─".repeat(56)} ${"─".repeat(4)} ${"─".repeat(6)} ${"─".repeat(4)} ${"─".repeat(12)}`); + for (const c of conditions) { + if (c === isoKey) continue; + let wins = 0, losses = 0, ties = 0, deltaSum = 0, n = 0; + for (const file of files) { + const isoArr = byCond[isoKey]?.[file]?.grnd ?? []; + const cArr = byCond[c]?.[file]?.grnd ?? []; + const k = Math.min(isoArr.length, cArr.length); + for (let i = 0; i < k; i++) { + if (cArr[i] > isoArr[i]) wins++; + else if (cArr[i] < isoArr[i]) losses++; + else ties++; + deltaSum += cArr[i] - isoArr[i]; + n++; + } + } + const dMean = n > 0 ? (deltaSum / n).toFixed(2) : "—"; + console.log(` ${c.padEnd(56)} ${String(wins).padStart(4)} ${String(losses).padStart(6)} ${String(ties).padStart(4)} ${dMean.padStart(12)}`); + } +} diff --git a/scripts/mode_pass5_variance_paid.ts b/scripts/mode_pass5_variance_paid.ts new file mode 100644 index 0000000..2191747 --- /dev/null +++ b/scripts/mode_pass5_variance_paid.ts @@ -0,0 +1,96 @@ +#!/usr/bin/env bun +/** + * Pass 5: variance test for the 2026-04-26 paid-model bake-off. + * + * The pass-4 single-rep sweep showed isolation beating every matrix + * condition by 1.0-1.4 grounded findings/file on grok-4.1-fast. This + * harness runs N reps × M conditions on the file where the effect was + * sharpest (pathway_memory.rs, 1355 lines) so we can decide whether + * the deltas are real signal or run-to-run noise. + * + * Conditions: + * 1. codereview_isolation — no matrix + * 2. codereview_lakehouse + corpus=lakehouse_arch_v1 — A only + * 3. codereview_lakehouse + corpus=lakehouse_symbols_v1 — C only + * 4. 
codereview_lakehouse (modes.toml default) — A+C composed + * + * Output appends per-call to data/_kb/mode_experiments.jsonl. Aggregate + * with `bun run scripts/mode_compare.ts --since ` and read the + * grounded column with multiple rows per (mode|corpus) key. + * + * Usage: + * bun run scripts/mode_pass5_variance_paid.ts + * LH_REPS=3 LH_FILE=crates/queryd/src/delta.rs bun run scripts/mode_pass5_variance_paid.ts + */ + +const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100"; +const MODEL = process.env.LH_MODEL ?? "x-ai/grok-4.1-fast"; +const FILE = process.env.LH_FILE ?? "crates/vectord/src/pathway_memory.rs"; +const REPS = Number(process.env.LH_REPS ?? 5); + +interface Condition { + label: string; + mode: string; + corpus?: string | string[]; +} + +const CONDITIONS: Condition[] = [ + { label: "isolation ", mode: "codereview_isolation" }, + { label: "arch_only ", mode: "codereview_lakehouse", corpus: "lakehouse_arch_v1" }, + { label: "symbols_only ", mode: "codereview_lakehouse", corpus: "lakehouse_symbols_v1" }, + { label: "composed (A+C) ", mode: "codereview_lakehouse" /* uses modes.toml default */ }, +]; + +async function runOne(c: Condition, rep: number): Promise<{ ok: boolean; latency_ms?: number; resp_chars?: number; error?: string }> { + const body: any = { + task_class: "scrum_review", + file_path: FILE, + force_mode: c.mode, + force_model: MODEL, + }; + if (c.corpus !== undefined) body.force_matrix_corpus = c.corpus; + + try { + const r = await fetch(`${GATEWAY}/v1/mode/execute`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify(body), + signal: AbortSignal.timeout(240_000), + }); + if (!r.ok) { + const txt = await r.text().catch(() => ""); + return { ok: false, error: `HTTP ${r.status}: ${txt.slice(0, 160)}` }; + } + const j: any = await r.json(); + return { ok: true, latency_ms: j.latency_ms, resp_chars: (j.response ?? 
"").length }; + } catch (e: any) { + return { ok: false, error: e.message }; + } +} + +async function main() { + const total = CONDITIONS.length * REPS; + console.log(`[pass5] file=${FILE}`); + console.log(`[pass5] model=${MODEL} · ${CONDITIONS.length} conditions × ${REPS} reps = ${total} runs`); + console.log(""); + + let i = 0; + const startTs = new Date().toISOString(); + for (let rep = 1; rep <= REPS; rep++) { + for (const c of CONDITIONS) { + i++; + process.stdout.write(` [${i}/${total}] rep=${rep} ${c.label}... `); + const r = await runOne(c, rep); + if (r.ok) { + console.log(`✓ ${r.resp_chars} chars · ${((r.latency_ms ?? 0) / 1000).toFixed(1)}s`); + } else { + console.log(`✗ ${r.error}`); + } + } + } + + console.log(`\n[pass5] complete · started ${startTs}`); + console.log(`[pass5] aggregate: bun run scripts/mode_compare.ts --since ${startTs}`); +} + +main().catch(e => { console.error(e); process.exit(1); });