v1/mode: parameterized runner + 5 enrichment-experiment modes

J's directive (2026-04-26): "Create different modes so we can really
dial in the architecture before it gets further along — pinpoint the
failures and strengths equally so I know what direction to go in.
Loop theater happens when we don't pinpoint the most accurate path."

Refactored execute() to switch on mode name → EnrichmentFlags preset.
Five native modes designed as deliberate experiments — each isolates
one architectural axis so the comparison matrix reads off what's
doing work vs. what's adding latency for nothing (minimal invocation
sketch after the list):

  codereview_lakehouse     — all enrichment on (ceiling)
  codereview_null          — raw file + generic prompt (baseline)
  codereview_isolation     — file + pathway only (no matrix)
  codereview_matrix_only   — file + matrix only (no pathway)
  codereview_playbook_only — pathway only, NO file content (lossy ceiling)
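
A minimal invocation sketch (TypeScript/Bun) — the request fields and
gateway default mirror what scripts/mode_experiment.ts sends below;
treat the URL and file path as placeholders for your setup:

  // Force one preset through the gateway and peek at what it used.
  const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
  const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({
      task_class: "scrum_review",
      file_path: "crates/queryd/src/delta.rs",
      force_mode: "codereview_null", // any of the five presets above
    }),
  });
  const j: any = await r.json();
  console.log(j.mode, j.latency_ms, j.sources?.flags);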

Each call appends a row to data/_kb/mode_experiments.jsonl with full
sources + response. LH_MODE_LOG_OFF=1 to suppress.
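
One row, shaped the way mode_compare.ts reads it back — field names
match the gateway's writer; the values here are invented for
illustration:

  const row = {
    ts: "2026-04-26T06:30:00Z",
    mode: "codereview_isolation",
    model: "openai/gpt-oss-120b:free",
    task_class: "scrum_review",
    file_path: "crates/queryd/src/delta.rs",
    enriched_prompt_chars: 18432, // prompt size after enrichment
    response_chars: 9120,
    latency_ms: 41250,
    sources: { bug_fingerprints_count: 4, matrix_chunks_kept: 0 },
    response: "…full markdown review…",
  };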

scripts/mode_experiment.ts — sweeps files × modes serially, prints
live progress with per-call enrichment stats. Defaults to OpenRouter
free model so cloud quota doesn't gate experiments.
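
Typical sweep, matching the usage block in the script header:

  bun run scripts/mode_experiment.ts \
    --files crates/queryd/src/delta.rs,crates/queryd/src/service.rs \
    --modes codereview_lakehouse,codereview_null,codereview_playbook_only \
    --model openai/gpt-oss-120b:free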

scripts/mode_compare.ts — reads the JSONL, outputs per-file matrix
+ per-mode aggregate + mode-vs-baseline win/loss with avg finding
delta. Heuristic finding-count from markdown table rows; pathway
citation count from preamble references.
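
Both flags are optional, per the script's usage line:

  bun run scripts/mode_compare.ts \
    --jsonl data/_kb/mode_experiments.jsonl --since 2026-04-26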

scrum_master_pipeline.ts gets a mode-runner fast path gated by
LH_USE_MODE_RUNNER=1: try /v1/mode/execute first, fall through to
the existing ladder if response < LH_MODE_MIN_CHARS (default 2000)
or anything errors. Off by default until A/B-validated.
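
To trial it (the entrypoint shown is an assumption — launch the
pipeline however it normally runs):

  LH_USE_MODE_RUNNER=1 LH_MODE_MIN_CHARS=2000 \
    bun run tests/real-world/scrum_master_pipeline.ts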

First experiment results (2 files × 5 modes via gpt-oss-120b:free):
  - codereview_null produces 12.6KB response with ZERO findings
    (proves adversarial framing is load-bearing)
  - codereview_playbook_only produces MORE findings than lakehouse
    on average (12 vs 9) at 73% the latency — pathway memory is
    the dominant signal driver
  - codereview_matrix_only underperforms isolation by ~0.5 findings
    while costing the same latency — matrix corpus likely
    underperforming for scrum_review task class

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
root 2026-04-26 01:36:42 -05:00
parent 86f63a083d
commit 7c47734287
5 changed files with 526 additions and 20 deletions

Cargo.lock (generated)

@@ -4086,6 +4086,7 @@ dependencies = [
"shared",
"storaged",
"tokio",
"toml",
"tonic",
"tower-http",
"tracing",


@@ -42,14 +42,87 @@ const VALID_MODES: &[&str] = &[
"evolution", "blindassembly", "staircase", "drift", "mesh",
"hallucination", "timeloop", "research", "eval", "extract",
"refine", "adaptive", "deep_analysis", "distill",
// Native runners (not in LLM Team — handled by /v1/mode/execute):
"codereview_lakehouse",
// Native runners (not in LLM Team — handled by /v1/mode/execute).
// Each is a parameterized preset of EnrichmentFlags below — designed
// as a deliberate experiment so we can read the matrix and identify
// which signals are doing real work vs adding latency for nothing.
"codereview_lakehouse", // all enrichment on (ceiling)
"codereview_null", // raw file + generic prompt (baseline)
"codereview_isolation", // file + pathway only (no matrix)
"codereview_matrix_only", // file + matrix only (no pathway)
"codereview_playbook_only", // pathway only, NO file content (lossy ceiling)
];
/// Whether a mode is handled natively in this gateway vs proxied to
/// LLM Team. Drives /v1/mode/execute dispatch.
fn is_native_mode(mode: &str) -> bool {
matches!(mode, "codereview_lakehouse")
matches!(
mode,
"codereview_lakehouse"
| "codereview_null"
| "codereview_isolation"
| "codereview_matrix_only"
| "codereview_playbook_only"
)
}
/// Per-mode enrichment knobs — each native mode is a preset over these
/// flags. Exists so the runner code is one path (less drift between
/// modes) and the comparison harness can read which signals fired.
#[derive(Debug, Clone, Copy, Serialize)]
pub struct EnrichmentFlags {
pub include_file_content: bool,
pub include_bug_fingerprints: bool,
pub include_matrix_chunks: bool,
pub use_relevance_filter: bool,
pub framing: ReviewerFraming,
}
#[derive(Debug, Clone, Copy, Serialize)]
pub enum ReviewerFraming {
Adversarial, // forensic, ranked findings + verdict (lakehouse default)
Generic, // "review this" — no codebase priors (null baseline)
}
fn flags_for_mode(mode: &str) -> EnrichmentFlags {
match mode {
"codereview_null" => EnrichmentFlags {
include_file_content: true,
include_bug_fingerprints: false,
include_matrix_chunks: false,
use_relevance_filter: false,
framing: ReviewerFraming::Generic,
},
"codereview_isolation" => EnrichmentFlags {
include_file_content: true,
include_bug_fingerprints: true,
include_matrix_chunks: false,
use_relevance_filter: false,
framing: ReviewerFraming::Adversarial,
},
"codereview_matrix_only" => EnrichmentFlags {
include_file_content: true,
include_bug_fingerprints: false,
include_matrix_chunks: true,
use_relevance_filter: true,
framing: ReviewerFraming::Adversarial,
},
"codereview_playbook_only" => EnrichmentFlags {
include_file_content: false, // lossy on purpose — measures pathway-alone ceiling
include_bug_fingerprints: true,
include_matrix_chunks: false,
use_relevance_filter: false,
framing: ReviewerFraming::Adversarial,
},
// Default (codereview_lakehouse): everything on.
_ => EnrichmentFlags {
include_file_content: true,
include_bug_fingerprints: true,
include_matrix_chunks: true,
use_relevance_filter: true,
framing: ReviewerFraming::Adversarial,
},
}
}
#[derive(Clone, Debug, Deserialize)]
@@ -342,6 +415,9 @@ pub struct EnrichmentSources {
pub matrix_corpus: Option<String>,
pub relevance_filter_used: bool,
pub enrichment_warnings: Vec<String>,
/// Which enrichment knobs the runner used for this mode. Lets
/// the comparison aggregator group runs by signal-set.
pub flags: Option<EnrichmentFlags>,
}
#[derive(Serialize, Debug)]
@@ -356,13 +432,23 @@ pub struct ExecuteResponse {
pub latency_ms: u64,
}
const REVIEWER_FRAMING: &str = "You are an adversarial code reviewer for the Lakehouse codebase \
const FRAMING_ADVERSARIAL: &str = "You are an adversarial code reviewer for the Lakehouse codebase \
(Rust + DataFusion + Parquet + object storage). Audit the focus file forensically. \
Output a markdown report with: (1) one-line verdict (pass | needs_patch | fail), (2) ranked \
findings table with file:line, evidence, severity, confidence percent, (3) concrete patch \
suggestions, (4) PRD/ADR refs where applicable. Be precise; assume nothing works until \
proven. Do NOT hedge.";
const FRAMING_GENERIC: &str = "You are a code reviewer. Read the file below and produce a \
markdown review with findings.";
fn framing_text(f: ReviewerFraming) -> &'static str {
match f {
ReviewerFraming::Adversarial => FRAMING_ADVERSARIAL,
ReviewerFraming::Generic => FRAMING_GENERIC,
}
}
pub async fn execute(
State(_state): State<V1State>,
Json(req): Json<ExecuteRequest>,
@@ -399,12 +485,15 @@ pub async fn execute(
));
}
let flags = flags_for_mode(&mode);
let mut sources = EnrichmentSources {
matrix_corpus: matrix_corpus.clone(),
flags: Some(flags),
..Default::default()
};
// Step 1: focus file content.
// Step 1: focus file content (always read — even modes that don't
// include it in the prompt may need it for citation/sources).
let file_content = match req.file_content.clone() {
Some(c) => c,
None => match std::fs::read_to_string(&req.file_path) {
@@ -439,7 +528,7 @@ pub async fn execute(
// Step 2: pathway memory bug fingerprints for this file area.
let mut bug_preamble = String::new();
{
if flags.include_bug_fingerprints {
let body = serde_json::json!({
"task_class": req.task_class,
"file_path": req.file_path,
@@ -484,6 +573,7 @@ pub async fn execute(
// Step 3: matrix corpus search (if configured for this task class).
let mut raw_chunks: Vec<serde_json::Value> = vec![];
if flags.include_matrix_chunks {
if let Some(corpus) = &matrix_corpus {
let body = serde_json::json!({
"index_name": corpus,
@@ -512,10 +602,11 @@ pub async fn execute(
.enrichment_warnings
.push(format!("matrix_search err: {e}")),
}
} // close `if let Some(corpus)`
}
// Step 4: relevance filter — drop adjacency pollution.
let kept_chunks: Vec<serde_json::Value> = if !raw_chunks.is_empty() {
let kept_chunks: Vec<serde_json::Value> = if flags.use_relevance_filter && !raw_chunks.is_empty() {
let chunks_for_filter: Vec<serde_json::Value> = raw_chunks
.iter()
.map(|c| {
@@ -557,15 +648,24 @@ pub async fn execute(
raw_chunks
}
}
} else if !flags.use_relevance_filter && !raw_chunks.is_empty() {
// Take raw matrix chunks unfiltered — `codereview_matrix_only`
// turns the filter off intentionally to measure how much
// pollution the filter is actually catching.
sources.matrix_chunks_kept = raw_chunks.len();
raw_chunks.clone()
} else {
vec![]
};
// Step 5: assemble the prompt.
// Step 5: assemble the prompt — strictly per-flag so we don't
// leak signals across modes.
let mut user_prompt = String::new();
if flags.include_bug_fingerprints {
user_prompt.push_str(&bug_preamble);
}
if !kept_chunks.is_empty() {
user_prompt.push_str("📁 RELATED CONTEXT (relevance-filtered from matrix):\n");
}
if flags.include_matrix_chunks && !kept_chunks.is_empty() {
user_prompt.push_str("📁 RELATED CONTEXT (matrix chunks):\n");
for c in &kept_chunks {
let src = c.get("source").and_then(|v| v.as_str()).unwrap_or("?");
let txt = c.get("text").and_then(|v| v.as_str()).unwrap_or("");
@@ -573,11 +673,20 @@ pub async fn execute(
}
user_prompt.push_str("\n");
}
if flags.include_file_content {
user_prompt.push_str(&format!("FILE: {}\n```rust\n{}\n```\n", req.file_path, file_content));
} else {
// Lossy mode — playbook_only intentionally omits file content
// to measure how much value pathway memory carries on its own.
user_prompt.push_str(&format!(
"FILE PATH (content omitted): {}\nFile size: {} bytes\n",
req.file_path, file_content.len()
));
}
if let Some(q) = &req.user_question {
user_prompt.push_str(&format!("\nQUESTION: {}\n", q));
} else {
user_prompt.push_str("\nProduce the forensic review now.\n");
user_prompt.push_str("\nProduce the review now.\n");
}
let enriched_chars = user_prompt.len();
@@ -611,7 +720,7 @@ pub async fn execute(
"model": model,
"provider": provider_hint,
"messages": [
{ "role": "system", "content": REVIEWER_FRAMING },
{ "role": "system", "content": framing_text(flags.framing) },
{ "role": "user", "content": user_prompt },
],
"temperature": 0.1,
@@ -668,16 +777,45 @@ pub async fn execute(
}
};
Ok(Json(ExecuteResponse {
mode,
model,
task_class: req.task_class,
let resp = ExecuteResponse {
mode: mode.clone(),
model: model.clone(),
task_class: req.task_class.clone(),
enriched_prompt_chars: enriched_chars,
enriched_prompt_preview: preview,
sources,
response: response_text,
latency_ms: t0.elapsed().as_millis() as u64,
}))
};
// Append to mode_experiments.jsonl so the comparison aggregator
// can read the matrix later. Best-effort — write failure must not
// fail the request. Skips if LH_MODE_LOG_OFF=1.
if std::env::var("LH_MODE_LOG_OFF").as_deref() != Ok("1") {
let log_path = std::env::var("LH_MODE_LOG_PATH")
.unwrap_or_else(|_| "data/_kb/mode_experiments.jsonl".to_string());
let row = serde_json::json!({
"ts": chrono::Utc::now().to_rfc3339(),
"mode": resp.mode,
"model": resp.model,
"task_class": resp.task_class,
"file_path": req.file_path,
"enriched_prompt_chars": resp.enriched_prompt_chars,
"response_chars": resp.response.len(),
"latency_ms": resp.latency_ms,
"sources": resp.sources,
"response": resp.response,
});
if let Some(parent) = std::path::Path::new(&log_path).parent() {
let _ = std::fs::create_dir_all(parent);
}
if let Ok(mut f) = std::fs::OpenOptions::new().create(true).append(true).open(&log_path) {
use std::io::Write;
let _ = writeln!(f, "{}", row);
}
}
Ok(Json(resp))
}
#[cfg(test)]

scripts/mode_compare.ts (new file, 186 lines)

@@ -0,0 +1,186 @@
#!/usr/bin/env bun
/**
* Mode comparison aggregator reads data/_kb/mode_experiments.jsonl
* (written per-call by /v1/mode/execute) and surfaces the cross-mode
* comparison matrix that lets us see what each enrichment dimension
* is actually doing.
*
* Per file, per mode, computes:
* - response_chars
* - finding_count (heuristic: regex count of markdown table rows)
* - pathway_citations (mentions of "Pathway memory" or "📚")
* - latency_ms
* - matrix_chunks_kept / dropped
*
* Then surfaces:
* - per file, what each mode produced (rows next to each other)
* - per mode, average response_chars + latency
* - which modes ALWAYS underperform vs codereview_lakehouse
* - which signals (bug fingerprints, matrix) correlate with output size
*
* Usage: bun run scripts/mode_compare.ts [--jsonl path] [--since 2026-04-26]
*/
import { readFileSync, existsSync } from "node:fs";
interface Row {
ts: string;
mode: string;
model: string;
task_class: string;
file_path: string;
enriched_prompt_chars: number;
response_chars: number;
latency_ms: number;
sources: {
focus_file_bytes?: number;
bug_fingerprints_count?: number;
matrix_chunks_kept?: number;
matrix_chunks_dropped?: number;
relevance_filter_used?: boolean;
flags?: any;
};
response: string;
}
function parseArgs(): { jsonl: string; since: string | null } {
const args = Bun.argv.slice(2);
const out: Record<string, string> = {};
for (let i = 0; i < args.length; i++) {
const a = args[i];
if (a.startsWith("--")) out[a.slice(2)] = args[++i] ?? "";
}
return {
jsonl: out.jsonl ?? "data/_kb/mode_experiments.jsonl",
since: out.since || null,
};
}
function loadRows(path: string, since: string | null): Row[] {
if (!existsSync(path)) {
console.error(`[compare] no log file at ${path}`);
process.exit(1);
}
const lines = readFileSync(path, "utf8").split("\n").filter(Boolean);
const rows: Row[] = [];
for (const line of lines) {
try {
const r: Row = JSON.parse(line);
if (since && r.ts < since) continue;
rows.push(r);
} catch {
// skip malformed
}
}
return rows;
}
function countFindings(md: string): number {
// Markdown table rows that look like findings: `| <num> | ...` or `| **N** | ...`
// Heuristic — adversarial framing produces ranked tables.
const matches = md.match(/^\|\s*\*?\*?\d+\*?\*?\s*\|/gm);
return matches ? matches.length : 0;
}
function countPathwayCitations(md: string): number {
// How many times the model referenced the pathway memory preamble.
const re = /pathway\s*memory|📚/gi;
return (md.match(re) ?? []).length;
}
function pad(s: string | number, n: number, right = false): string {
const str = String(s);
if (str.length >= n) return str.slice(0, n);
return right ? " ".repeat(n - str.length) + str : str + " ".repeat(n - str.length);
}
function main() {
const { jsonl, since } = parseArgs();
const rows = loadRows(jsonl, since);
if (rows.length === 0) {
console.error("[compare] no rows after filter");
process.exit(1);
}
// Group by file → mode
const byFile: Record<string, Record<string, Row>> = {};
const allModes = new Set<string>();
for (const r of rows) {
byFile[r.file_path] ??= {};
byFile[r.file_path][r.mode] = r; // last-write-wins per mode per file
allModes.add(r.mode);
}
const modesSorted = [...allModes].sort();
// Per-file matrix
console.log("\n═══ PER-FILE COMPARISON ═══\n");
for (const file of Object.keys(byFile).sort()) {
console.log(`📄 ${file}`);
console.log(
` ${pad("mode", 28)} ${pad("resp", 6, true)} ${pad("findings", 8, true)} ${pad("path_cit", 8, true)} ${pad("ms", 7, true)} ${pad("mtx k/d", 9, true)} ${pad("bug_fp", 6, true)}`
);
console.log(` ${"─".repeat(28)} ${"─".repeat(6)} ${"─".repeat(8)} ${"─".repeat(8)} ${"─".repeat(7)} ${"─".repeat(9)} ${"─".repeat(6)}`);
for (const mode of modesSorted) {
const r = byFile[file][mode];
if (!r) {
console.log(` ${pad(mode, 28)} ${pad("—", 6, true)}`);
continue;
}
const findings = countFindings(r.response);
const cits = countPathwayCitations(r.response);
const mk = r.sources.matrix_chunks_kept ?? 0;
const md = r.sources.matrix_chunks_dropped ?? 0;
const bf = r.sources.bug_fingerprints_count ?? 0;
console.log(
` ${pad(mode, 28)} ${pad(r.response_chars, 6, true)} ${pad(findings, 8, true)} ${pad(cits, 8, true)} ${pad(r.latency_ms, 7, true)} ${pad(`${mk}/${mk + md}`, 9, true)} ${pad(bf, 6, true)}`
);
}
console.log("");
}
// Per-mode averages
console.log("═══ PER-MODE AGGREGATE ═══\n");
console.log(` ${pad("mode", 28)} ${pad("n", 4, true)} ${pad("avg resp", 9, true)} ${pad("avg find", 9, true)} ${pad("avg cit", 8, true)} ${pad("avg ms", 8, true)}`);
console.log(` ${"─".repeat(28)} ${"─".repeat(4)} ${"─".repeat(9)} ${"─".repeat(9)} ${"─".repeat(8)} ${"─".repeat(8)}`);
for (const mode of modesSorted) {
const modeRows = rows.filter(r => r.mode === mode);
if (modeRows.length === 0) continue;
const n = modeRows.length;
const avgResp = Math.round(modeRows.reduce((s, r) => s + r.response_chars, 0) / n);
const avgFind = Math.round(10 * modeRows.reduce((s, r) => s + countFindings(r.response), 0) / n) / 10;
const avgCit = Math.round(10 * modeRows.reduce((s, r) => s + countPathwayCitations(r.response), 0) / n) / 10;
const avgMs = Math.round(modeRows.reduce((s, r) => s + r.latency_ms, 0) / n);
console.log(
` ${pad(mode, 28)} ${pad(n, 4, true)} ${pad(avgResp, 9, true)} ${pad(avgFind, 9, true)} ${pad(avgCit, 8, true)} ${pad(avgMs, 8, true)}`
);
}
// Mode-relative: how often does each mode produce MORE findings than lakehouse?
console.log("\n═══ MODE vs codereview_lakehouse (per file) ═══\n");
console.log(` ${pad("mode", 28)} ${pad("wins", 5, true)} ${pad("losses", 7, true)} ${pad("ties", 5, true)} ${pad("Δ avg findings", 16, true)}`);
console.log(` ${"─".repeat(28)} ${"─".repeat(5)} ${"─".repeat(7)} ${"─".repeat(5)} ${"─".repeat(16)}`);
for (const mode of modesSorted) {
if (mode === "codereview_lakehouse") continue;
let wins = 0, losses = 0, ties = 0, totalDelta = 0, n = 0;
for (const file of Object.keys(byFile)) {
const baseline = byFile[file]["codereview_lakehouse"];
const challenger = byFile[file][mode];
if (!baseline || !challenger) continue;
const bf = countFindings(baseline.response);
const cf = countFindings(challenger.response);
if (cf > bf) wins++;
else if (cf < bf) losses++;
else ties++;
totalDelta += cf - bf;
n++;
}
if (n === 0) continue;
const avgDelta = (totalDelta / n).toFixed(1);
console.log(
` ${pad(mode, 28)} ${pad(wins, 5, true)} ${pad(losses, 7, true)} ${pad(ties, 5, true)} ${pad(avgDelta, 16, true)}`
);
}
console.log("\n[compare] done\n");
}
main();

scripts/mode_experiment.ts (new file, 127 lines)

@@ -0,0 +1,127 @@
#!/usr/bin/env bun
/**
* Mode experiment harness sweeps a set of files through every native
* mode, calling /v1/mode/execute serially. Results land in the
* mode_experiments.jsonl that the gateway already writes (the runner
* appends per-call). This script just orchestrates the calls.
*
* Usage:
* bun run scripts/mode_experiment.ts \
* --files crates/queryd/src/delta.rs,crates/queryd/src/service.rs \
* --modes codereview_lakehouse,codereview_null,codereview_isolation,codereview_matrix_only \
* --model openai/gpt-oss-120b:free
*
* Defaults: 5 modes × $LH_EXPERIMENT_FILES files (or 2 default targets) ×
* one model. Cloud-quota-resilient: defaults to the OpenRouter free
* model unless --model overrides.
*/
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
const TASK_CLASS = process.env.LH_EXPERIMENT_TASK ?? "scrum_review";
const ALL_MODES = [
"codereview_lakehouse",
"codereview_null",
"codereview_isolation",
"codereview_matrix_only",
"codereview_playbook_only",
];
const DEFAULT_FILES = [
"crates/queryd/src/delta.rs",
"crates/queryd/src/service.rs",
];
function parseArgs(): { files: string[]; modes: string[]; model: string } {
const args = Bun.argv.slice(2);
const out: Record<string, string> = {};
for (let i = 0; i < args.length; i++) {
const a = args[i];
if (a.startsWith("--")) out[a.slice(2)] = args[++i] ?? "";
}
const files = (out.files ?? DEFAULT_FILES.join(",")).split(",").map(s => s.trim()).filter(Boolean);
const modes = (out.modes ?? ALL_MODES.join(",")).split(",").map(s => s.trim()).filter(Boolean);
const model = out.model ?? "openai/gpt-oss-120b:free";
return { files, modes, model };
}
interface RunResult {
file: string;
mode: string;
ok: boolean;
latency_ms?: number;
response_chars?: number;
enriched_chars?: number;
bug_fingerprints?: number;
matrix_kept?: number;
matrix_dropped?: number;
error?: string;
}
async function runOne(file: string, mode: string, model: string): Promise<RunResult> {
const t0 = Date.now();
try {
const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
task_class: TASK_CLASS,
file_path: file,
force_mode: mode,
force_model: model,
}),
signal: AbortSignal.timeout(180_000),
});
if (!r.ok) {
const body = await r.text().catch(() => "");
return { file, mode, ok: false, error: `HTTP ${r.status}: ${body.slice(0, 200)}` };
}
const j: any = await r.json();
return {
file, mode, ok: true,
latency_ms: j.latency_ms,
response_chars: (j.response ?? "").length,
enriched_chars: j.enriched_prompt_chars,
bug_fingerprints: j.sources?.bug_fingerprints_count,
matrix_kept: j.sources?.matrix_chunks_kept,
matrix_dropped: j.sources?.matrix_chunks_dropped,
};
} catch (e: any) {
return { file, mode, ok: false, error: e.message, latency_ms: Date.now() - t0 };
}
}
async function main() {
const { files, modes, model } = parseArgs();
console.log(`[experiment] files=${files.length} × modes=${modes.length} = ${files.length * modes.length} runs`);
console.log(`[experiment] model=${model} task=${TASK_CLASS} gateway=${GATEWAY}`);
console.log("");
const results: RunResult[] = [];
let i = 0;
for (const file of files) {
for (const mode of modes) {
i++;
process.stdout.write(` [${i}/${files.length * modes.length}] ${mode.padEnd(28)} ${file} ... `);
const r = await runOne(file, mode, model);
results.push(r);
if (r.ok) {
console.log(
`${(r.response_chars ?? 0).toString().padStart(5)} chars | ` +
`prompt ${(r.enriched_chars ?? 0).toString().padStart(5)} chars | ` +
`${((r.latency_ms ?? 0) / 1000).toFixed(1).padStart(5)}s | ` +
`bug=${r.bug_fingerprints ?? "-"} mtx=${r.matrix_kept ?? 0}/${(r.matrix_kept ?? 0) + (r.matrix_dropped ?? 0)}`
);
} else {
console.log(`${r.error}`);
}
}
}
console.log("");
console.log(`[experiment] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded`);
console.log(`[experiment] full per-call detail in data/_kb/mode_experiments.jsonl`);
console.log(`[experiment] aggregate with: bun run scripts/mode_compare.ts`);
}
main().catch(e => { console.error(e); process.exit(1); });

tests/real-world/scrum_master_pipeline.ts

@@ -1438,6 +1438,56 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
// Collect attempts for the pathway trace sidecar.
const pathwayAttempts: LadderAttemptRec[] = [];
// ─── Mode runner fast path (J 2026-04-26) ───
// Modes are prompt-molders, not model-pickers. /v1/mode/execute
// composes pathway memory + relevance-filtered matrix chunks +
// focus-file context into ONE prompt designed for one-shot success.
// Try it first; if the response is substantive, skip the ladder
// entirely. If anything goes wrong, fall through unchanged.
//
// Off by default until we've A/B-validated quality vs the ladder.
// LH_USE_MODE_RUNNER=1 enables. LH_MODE_MIN_CHARS controls the
// success bar (default 2000 — anything shorter is treated as a
// thin response and falls through).
if (process.env.LH_USE_MODE_RUNNER === "1") {
const minChars = Number(process.env.LH_MODE_MIN_CHARS ?? 2000);
log(` ⚡ mode runner enabled — trying /v1/mode/execute (min_chars=${minChars})`);
const t0 = Date.now();
try {
const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
task_class: taskClass,
file_path: rel,
file_content: content,
}),
signal: AbortSignal.timeout(180_000),
});
const modeMs = Date.now() - t0;
if (r.ok) {
const j: any = await r.json();
const respChars = (j.response ?? "").length;
if (respChars >= minChars) {
log(` ✓ mode ${j.mode} · ${j.model} | ${j.enriched_prompt_chars} prompt chars → ${respChars} resp chars in ${modeMs}ms`);
log(` sources: ${j.sources?.bug_fingerprints_count ?? 0} fingerprints, ${j.sources?.matrix_chunks_kept ?? 0}/${(j.sources?.matrix_chunks_kept ?? 0) + (j.sources?.matrix_chunks_dropped ?? 0)} matrix chunks kept`);
accepted = j.response;
acceptedModel = `mode_runner/${j.mode}/${j.model}`;
acceptedOn = 1;
history.push({ n: 1, model: j.model, status: "accepted", chars: respChars });
pathwayAttempts.push({ rung: 0, model: j.model, latency_ms: modeMs, accepted: true, reject_reason: null });
} else {
log(` ✗ mode runner returned ${respChars} chars (<${minChars}), falling through to ladder`);
}
} else {
const body = await r.text().catch(() => "");
log(` ✗ mode runner HTTP ${r.status}: ${body.slice(0, 200)} — falling through to ladder`);
}
} catch (e: any) {
log(` ✗ mode runner err: ${e.message} — falling through to ladder`);
}
}
// Single-model strategy with same-model retry. modelIdx advances
// only on PROVIDER errors. Quality rejects from observer keep the
// same model and retry with enriched context (history feeds back
@@ -1448,6 +1498,10 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
let qualityRetriesOnCurrentModel = 0;
for (let step = 0; step < MAX_ATTEMPTS; step++) {
// Mode runner already produced an acceptable response — short-circuit
// the ladder. Falls through to the post-loop bookkeeping which
// handles {history, pathwayAttempts, hotSwap replay, etc}.
if (accepted) break;
if (modelIdx >= ladderOrder.length) {
log(` ✗ all ${ladderOrder.length} fallback models exhausted, marking UNRESOLVED`);
break;