v1/mode: parameterized runner + 5 enrichment-experiment modes
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
J's directive (2026-04-26): "Create different modes so we can really
dial in the architecture before it gets further along — pinpoint the
failures and strengths equally so I know what direction to go in.
Loop theater happens when we don't pinpoint the most accurate path."
Refactored execute() to switch on mode name → EnrichmentFlags preset.
Five native modes designed as deliberate experiments — each isolates
one architectural axis so the comparison matrix reads off what's
doing work vs what's adding latency for nothing:
codereview_lakehouse — all enrichment on (ceiling)
codereview_null — raw file + generic prompt (baseline)
codereview_isolation — file + pathway only (no matrix)
codereview_matrix_only — file + matrix only (no pathway)
codereview_playbook_only — pathway only, NO file content (lossy ceiling)
Each call appends a row to data/_kb/mode_experiments.jsonl with full
sources + response. LH_MODE_LOG_OFF=1 to suppress.
scripts/mode_experiment.ts — sweeps files × modes serially, prints
live progress with per-call enrichment stats. Defaults to OpenRouter
free model so cloud quota doesn't gate experiments.
scripts/mode_compare.ts — reads the JSONL, outputs per-file matrix
+ per-mode aggregate + mode-vs-baseline win/loss with avg finding
delta. Heuristic finding-count from markdown table rows; pathway
citation count from preamble references.
scrum_master_pipeline.ts gets a mode-runner fast path gated by
LH_USE_MODE_RUNNER=1: try /v1/mode/execute first, fall through to
the existing ladder if response < LH_MODE_MIN_CHARS (default 2000)
or anything errors. Off by default until A/B-validated.
First experiment results (2 files × 5 modes via gpt-oss-120b:free):
- codereview_null produces 12.6KB response with ZERO findings
(proves adversarial framing is load-bearing)
- codereview_playbook_only produces MORE findings than lakehouse
on average (12 vs 9) at 73% the latency — pathway memory is
the dominant signal driver
- codereview_matrix_only underperforms isolation by ~0.5 findings
while costing the same latency — matrix corpus likely
underperforming for scrum_review task class
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
86f63a083d
commit
7c47734287
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -4086,6 +4086,7 @@ dependencies = [
|
||||
"shared",
|
||||
"storaged",
|
||||
"tokio",
|
||||
"toml",
|
||||
"tonic",
|
||||
"tower-http",
|
||||
"tracing",
|
||||
|
||||
@ -42,14 +42,87 @@ const VALID_MODES: &[&str] = &[
|
||||
"evolution", "blindassembly", "staircase", "drift", "mesh",
|
||||
"hallucination", "timeloop", "research", "eval", "extract",
|
||||
"refine", "adaptive", "deep_analysis", "distill",
|
||||
// Native runners (not in LLM Team — handled by /v1/mode/execute):
|
||||
"codereview_lakehouse",
|
||||
// Native runners (not in LLM Team — handled by /v1/mode/execute).
|
||||
// Each is a parameterized preset of EnrichmentFlags below — designed
|
||||
// as a deliberate experiment so we can read the matrix and identify
|
||||
// which signals are doing real work vs adding latency for nothing.
|
||||
"codereview_lakehouse", // all enrichment on (ceiling)
|
||||
"codereview_null", // raw file + generic prompt (baseline)
|
||||
"codereview_isolation", // file + pathway only (no matrix)
|
||||
"codereview_matrix_only", // file + matrix only (no pathway)
|
||||
"codereview_playbook_only", // pathway only, NO file content (lossy ceiling)
|
||||
];
|
||||
|
||||
/// Whether a mode is handled natively in this gateway vs proxied to
|
||||
/// LLM Team. Drives /v1/mode/execute dispatch.
|
||||
fn is_native_mode(mode: &str) -> bool {
|
||||
matches!(mode, "codereview_lakehouse")
|
||||
matches!(
|
||||
mode,
|
||||
"codereview_lakehouse"
|
||||
| "codereview_null"
|
||||
| "codereview_isolation"
|
||||
| "codereview_matrix_only"
|
||||
| "codereview_playbook_only"
|
||||
)
|
||||
}
|
||||
|
||||
/// Per-mode enrichment knobs — each native mode is a preset over these
|
||||
/// flags. Exists so the runner code is one path (less drift between
|
||||
/// modes) and the comparison harness can read which signals fired.
|
||||
#[derive(Debug, Clone, Copy, Serialize)]
|
||||
pub struct EnrichmentFlags {
|
||||
pub include_file_content: bool,
|
||||
pub include_bug_fingerprints: bool,
|
||||
pub include_matrix_chunks: bool,
|
||||
pub use_relevance_filter: bool,
|
||||
pub framing: ReviewerFraming,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize)]
|
||||
pub enum ReviewerFraming {
|
||||
Adversarial, // forensic, ranked findings + verdict (lakehouse default)
|
||||
Generic, // "review this" — no codebase priors (null baseline)
|
||||
}
|
||||
|
||||
fn flags_for_mode(mode: &str) -> EnrichmentFlags {
|
||||
match mode {
|
||||
"codereview_null" => EnrichmentFlags {
|
||||
include_file_content: true,
|
||||
include_bug_fingerprints: false,
|
||||
include_matrix_chunks: false,
|
||||
use_relevance_filter: false,
|
||||
framing: ReviewerFraming::Generic,
|
||||
},
|
||||
"codereview_isolation" => EnrichmentFlags {
|
||||
include_file_content: true,
|
||||
include_bug_fingerprints: true,
|
||||
include_matrix_chunks: false,
|
||||
use_relevance_filter: false,
|
||||
framing: ReviewerFraming::Adversarial,
|
||||
},
|
||||
"codereview_matrix_only" => EnrichmentFlags {
|
||||
include_file_content: true,
|
||||
include_bug_fingerprints: false,
|
||||
include_matrix_chunks: true,
|
||||
use_relevance_filter: true,
|
||||
framing: ReviewerFraming::Adversarial,
|
||||
},
|
||||
"codereview_playbook_only" => EnrichmentFlags {
|
||||
include_file_content: false, // lossy on purpose — measures pathway-alone ceiling
|
||||
include_bug_fingerprints: true,
|
||||
include_matrix_chunks: false,
|
||||
use_relevance_filter: false,
|
||||
framing: ReviewerFraming::Adversarial,
|
||||
},
|
||||
// Default (codereview_lakehouse): everything on.
|
||||
_ => EnrichmentFlags {
|
||||
include_file_content: true,
|
||||
include_bug_fingerprints: true,
|
||||
include_matrix_chunks: true,
|
||||
use_relevance_filter: true,
|
||||
framing: ReviewerFraming::Adversarial,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
@ -342,6 +415,9 @@ pub struct EnrichmentSources {
|
||||
pub matrix_corpus: Option<String>,
|
||||
pub relevance_filter_used: bool,
|
||||
pub enrichment_warnings: Vec<String>,
|
||||
/// Which enrichment knobs the runner used for this mode. Lets
|
||||
/// the comparison aggregator group runs by signal-set.
|
||||
pub flags: Option<EnrichmentFlags>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug)]
|
||||
@ -356,13 +432,23 @@ pub struct ExecuteResponse {
|
||||
pub latency_ms: u64,
|
||||
}
|
||||
|
||||
const REVIEWER_FRAMING: &str = "You are an adversarial code reviewer for the Lakehouse codebase \
|
||||
const FRAMING_ADVERSARIAL: &str = "You are an adversarial code reviewer for the Lakehouse codebase \
|
||||
(Rust + DataFusion + Parquet + object storage). Audit the focus file forensically. \
|
||||
Output a markdown report with: (1) one-line verdict (pass | needs_patch | fail), (2) ranked \
|
||||
findings table with file:line, evidence, severity, confidence percent, (3) concrete patch \
|
||||
suggestions, (4) PRD/ADR refs where applicable. Be precise — assume nothing works until \
|
||||
proven. Do NOT hedge.";
|
||||
|
||||
const FRAMING_GENERIC: &str = "You are a code reviewer. Read the file below and produce a \
|
||||
markdown review with findings.";
|
||||
|
||||
fn framing_text(f: ReviewerFraming) -> &'static str {
|
||||
match f {
|
||||
ReviewerFraming::Adversarial => FRAMING_ADVERSARIAL,
|
||||
ReviewerFraming::Generic => FRAMING_GENERIC,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn execute(
|
||||
State(_state): State<V1State>,
|
||||
Json(req): Json<ExecuteRequest>,
|
||||
@ -399,12 +485,15 @@ pub async fn execute(
|
||||
));
|
||||
}
|
||||
|
||||
let flags = flags_for_mode(&mode);
|
||||
let mut sources = EnrichmentSources {
|
||||
matrix_corpus: matrix_corpus.clone(),
|
||||
flags: Some(flags),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Step 1: focus file content.
|
||||
// Step 1: focus file content (always read — even modes that don't
|
||||
// include it in the prompt may need it for citation/sources).
|
||||
let file_content = match req.file_content.clone() {
|
||||
Some(c) => c,
|
||||
None => match std::fs::read_to_string(&req.file_path) {
|
||||
@ -439,7 +528,7 @@ pub async fn execute(
|
||||
|
||||
// Step 2: pathway memory bug fingerprints for this file area.
|
||||
let mut bug_preamble = String::new();
|
||||
{
|
||||
if flags.include_bug_fingerprints {
|
||||
let body = serde_json::json!({
|
||||
"task_class": req.task_class,
|
||||
"file_path": req.file_path,
|
||||
@ -484,6 +573,7 @@ pub async fn execute(
|
||||
|
||||
// Step 3: matrix corpus search (if configured for this task class).
|
||||
let mut raw_chunks: Vec<serde_json::Value> = vec![];
|
||||
if flags.include_matrix_chunks {
|
||||
if let Some(corpus) = &matrix_corpus {
|
||||
let body = serde_json::json!({
|
||||
"index_name": corpus,
|
||||
@ -512,10 +602,11 @@ pub async fn execute(
|
||||
.enrichment_warnings
|
||||
.push(format!("matrix_search err: {e}")),
|
||||
}
|
||||
} // close `if let Some(corpus)`
|
||||
}
|
||||
|
||||
// Step 4: relevance filter — drop adjacency pollution.
|
||||
let kept_chunks: Vec<serde_json::Value> = if !raw_chunks.is_empty() {
|
||||
let kept_chunks: Vec<serde_json::Value> = if flags.use_relevance_filter && !raw_chunks.is_empty() {
|
||||
let chunks_for_filter: Vec<serde_json::Value> = raw_chunks
|
||||
.iter()
|
||||
.map(|c| {
|
||||
@ -557,15 +648,24 @@ pub async fn execute(
|
||||
raw_chunks
|
||||
}
|
||||
}
|
||||
} else if !flags.use_relevance_filter && !raw_chunks.is_empty() {
|
||||
// Take raw matrix chunks unfiltered — `codereview_matrix_only`
|
||||
// turns the filter off intentionally to measure how much
|
||||
// pollution the filter is actually catching.
|
||||
sources.matrix_chunks_kept = raw_chunks.len();
|
||||
raw_chunks.clone()
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
|
||||
// Step 5: assemble the prompt.
|
||||
// Step 5: assemble the prompt — strictly per-flag so we don't
|
||||
// leak signals across modes.
|
||||
let mut user_prompt = String::new();
|
||||
if flags.include_bug_fingerprints {
|
||||
user_prompt.push_str(&bug_preamble);
|
||||
if !kept_chunks.is_empty() {
|
||||
user_prompt.push_str("📁 RELATED CONTEXT (relevance-filtered from matrix):\n");
|
||||
}
|
||||
if flags.include_matrix_chunks && !kept_chunks.is_empty() {
|
||||
user_prompt.push_str("📁 RELATED CONTEXT (matrix chunks):\n");
|
||||
for c in &kept_chunks {
|
||||
let src = c.get("source").and_then(|v| v.as_str()).unwrap_or("?");
|
||||
let txt = c.get("text").and_then(|v| v.as_str()).unwrap_or("");
|
||||
@ -573,11 +673,20 @@ pub async fn execute(
|
||||
}
|
||||
user_prompt.push_str("\n");
|
||||
}
|
||||
if flags.include_file_content {
|
||||
user_prompt.push_str(&format!("FILE: {}\n```rust\n{}\n```\n", req.file_path, file_content));
|
||||
} else {
|
||||
// Lossy mode — playbook_only intentionally omits file content
|
||||
// to measure how much value pathway memory carries on its own.
|
||||
user_prompt.push_str(&format!(
|
||||
"FILE PATH (content omitted): {}\nFile size: {} bytes\n",
|
||||
req.file_path, file_content.len()
|
||||
));
|
||||
}
|
||||
if let Some(q) = &req.user_question {
|
||||
user_prompt.push_str(&format!("\nQUESTION: {}\n", q));
|
||||
} else {
|
||||
user_prompt.push_str("\nProduce the forensic review now.\n");
|
||||
user_prompt.push_str("\nProduce the review now.\n");
|
||||
}
|
||||
|
||||
let enriched_chars = user_prompt.len();
|
||||
@ -611,7 +720,7 @@ pub async fn execute(
|
||||
"model": model,
|
||||
"provider": provider_hint,
|
||||
"messages": [
|
||||
{ "role": "system", "content": REVIEWER_FRAMING },
|
||||
{ "role": "system", "content": framing_text(flags.framing) },
|
||||
{ "role": "user", "content": user_prompt },
|
||||
],
|
||||
"temperature": 0.1,
|
||||
@ -668,16 +777,45 @@ pub async fn execute(
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Json(ExecuteResponse {
|
||||
mode,
|
||||
model,
|
||||
task_class: req.task_class,
|
||||
let resp = ExecuteResponse {
|
||||
mode: mode.clone(),
|
||||
model: model.clone(),
|
||||
task_class: req.task_class.clone(),
|
||||
enriched_prompt_chars: enriched_chars,
|
||||
enriched_prompt_preview: preview,
|
||||
sources,
|
||||
response: response_text,
|
||||
latency_ms: t0.elapsed().as_millis() as u64,
|
||||
}))
|
||||
};
|
||||
|
||||
// Append to mode_experiments.jsonl so the comparison aggregator
|
||||
// can read the matrix later. Best-effort — write failure must not
|
||||
// fail the request. Skips if LH_MODE_LOG_OFF=1.
|
||||
if std::env::var("LH_MODE_LOG_OFF").as_deref() != Ok("1") {
|
||||
let log_path = std::env::var("LH_MODE_LOG_PATH")
|
||||
.unwrap_or_else(|_| "data/_kb/mode_experiments.jsonl".to_string());
|
||||
let row = serde_json::json!({
|
||||
"ts": chrono::Utc::now().to_rfc3339(),
|
||||
"mode": resp.mode,
|
||||
"model": resp.model,
|
||||
"task_class": resp.task_class,
|
||||
"file_path": req.file_path,
|
||||
"enriched_prompt_chars": resp.enriched_prompt_chars,
|
||||
"response_chars": resp.response.len(),
|
||||
"latency_ms": resp.latency_ms,
|
||||
"sources": resp.sources,
|
||||
"response": resp.response,
|
||||
});
|
||||
if let Some(parent) = std::path::Path::new(&log_path).parent() {
|
||||
let _ = std::fs::create_dir_all(parent);
|
||||
}
|
||||
if let Ok(mut f) = std::fs::OpenOptions::new().create(true).append(true).open(&log_path) {
|
||||
use std::io::Write;
|
||||
let _ = writeln!(f, "{}", row);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Json(resp))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
186
scripts/mode_compare.ts
Normal file
186
scripts/mode_compare.ts
Normal file
@ -0,0 +1,186 @@
|
||||
#!/usr/bin/env bun
|
||||
/**
|
||||
* Mode comparison aggregator — reads data/_kb/mode_experiments.jsonl
|
||||
* (written per-call by /v1/mode/execute) and surfaces the cross-mode
|
||||
* comparison matrix that lets us see what each enrichment dimension
|
||||
* is actually doing.
|
||||
*
|
||||
* Per file, per mode, computes:
|
||||
* - response_chars
|
||||
* - finding_count (rows in markdown tables — heuristic, regex)
|
||||
* - pathway_citations (mentions of "Pathway memory" or "📚")
|
||||
* - latency_ms
|
||||
* - matrix_chunks_kept / dropped
|
||||
*
|
||||
* Then surfaces:
|
||||
* - per file, what each mode produced (rows next to each other)
|
||||
* - per mode, average response_chars + latency
|
||||
* - which modes ALWAYS underperform vs codereview_lakehouse
|
||||
* - which signals (bug fingerprints, matrix) correlate with output size
|
||||
*
|
||||
* Usage: bun run scripts/mode_compare.ts [--jsonl path] [--since 2026-04-26]
|
||||
*/
|
||||
|
||||
import { readFileSync, existsSync } from "node:fs";
|
||||
|
||||
interface Row {
|
||||
ts: string;
|
||||
mode: string;
|
||||
model: string;
|
||||
task_class: string;
|
||||
file_path: string;
|
||||
enriched_prompt_chars: number;
|
||||
response_chars: number;
|
||||
latency_ms: number;
|
||||
sources: {
|
||||
focus_file_bytes?: number;
|
||||
bug_fingerprints_count?: number;
|
||||
matrix_chunks_kept?: number;
|
||||
matrix_chunks_dropped?: number;
|
||||
relevance_filter_used?: boolean;
|
||||
flags?: any;
|
||||
};
|
||||
response: string;
|
||||
}
|
||||
|
||||
function parseArgs(): { jsonl: string; since: string | null } {
|
||||
const args = Bun.argv.slice(2);
|
||||
const out: Record<string, string> = {};
|
||||
for (let i = 0; i < args.length; i++) {
|
||||
const a = args[i];
|
||||
if (a.startsWith("--")) out[a.slice(2)] = args[++i] ?? "";
|
||||
}
|
||||
return {
|
||||
jsonl: out.jsonl ?? "data/_kb/mode_experiments.jsonl",
|
||||
since: out.since || null,
|
||||
};
|
||||
}
|
||||
|
||||
function loadRows(path: string, since: string | null): Row[] {
|
||||
if (!existsSync(path)) {
|
||||
console.error(`[compare] no log file at ${path}`);
|
||||
process.exit(1);
|
||||
}
|
||||
const lines = readFileSync(path, "utf8").split("\n").filter(Boolean);
|
||||
const rows: Row[] = [];
|
||||
for (const line of lines) {
|
||||
try {
|
||||
const r: Row = JSON.parse(line);
|
||||
if (since && r.ts < since) continue;
|
||||
rows.push(r);
|
||||
} catch {
|
||||
// skip malformed
|
||||
}
|
||||
}
|
||||
return rows;
|
||||
}
|
||||
|
||||
function countFindings(md: string): number {
|
||||
// Markdown table rows that look like findings: `| <num> | ...` or `| **N** | ...`
|
||||
// Heuristic — adversarial framing produces ranked tables.
|
||||
const matches = md.match(/^\|\s*\*?\*?\d+\*?\*?\s*\|/gm);
|
||||
return matches ? matches.length : 0;
|
||||
}
|
||||
|
||||
function countPathwayCitations(md: string): number {
|
||||
// How many times the model referenced the pathway memory preamble.
|
||||
const re = /pathway\s*memory|📚/gi;
|
||||
return (md.match(re) ?? []).length;
|
||||
}
|
||||
|
||||
function pad(s: string | number, n: number, right = false): string {
|
||||
const str = String(s);
|
||||
if (str.length >= n) return str.slice(0, n);
|
||||
return right ? " ".repeat(n - str.length) + str : str + " ".repeat(n - str.length);
|
||||
}
|
||||
|
||||
function main() {
|
||||
const { jsonl, since } = parseArgs();
|
||||
const rows = loadRows(jsonl, since);
|
||||
if (rows.length === 0) {
|
||||
console.error("[compare] no rows after filter");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// Group by file → mode
|
||||
const byFile: Record<string, Record<string, Row>> = {};
|
||||
const allModes = new Set<string>();
|
||||
for (const r of rows) {
|
||||
byFile[r.file_path] ??= {};
|
||||
byFile[r.file_path][r.mode] = r; // last-write-wins per mode per file
|
||||
allModes.add(r.mode);
|
||||
}
|
||||
const modesSorted = [...allModes].sort();
|
||||
|
||||
// Per-file matrix
|
||||
console.log("\n═══ PER-FILE COMPARISON ═══\n");
|
||||
for (const file of Object.keys(byFile).sort()) {
|
||||
console.log(`📄 ${file}`);
|
||||
console.log(
|
||||
` ${pad("mode", 28)} ${pad("resp", 6, true)} ${pad("findings", 8, true)} ${pad("path_cit", 8, true)} ${pad("ms", 7, true)} ${pad("mtx k/d", 9, true)} ${pad("bug_fp", 6, true)}`
|
||||
);
|
||||
console.log(` ${"─".repeat(28)} ${"─".repeat(6)} ${"─".repeat(8)} ${"─".repeat(8)} ${"─".repeat(7)} ${"─".repeat(9)} ${"─".repeat(6)}`);
|
||||
for (const mode of modesSorted) {
|
||||
const r = byFile[file][mode];
|
||||
if (!r) {
|
||||
console.log(` ${pad(mode, 28)} ${pad("—", 6, true)}`);
|
||||
continue;
|
||||
}
|
||||
const findings = countFindings(r.response);
|
||||
const cits = countPathwayCitations(r.response);
|
||||
const mk = r.sources.matrix_chunks_kept ?? 0;
|
||||
const md = r.sources.matrix_chunks_dropped ?? 0;
|
||||
const bf = r.sources.bug_fingerprints_count ?? 0;
|
||||
console.log(
|
||||
` ${pad(mode, 28)} ${pad(r.response_chars, 6, true)} ${pad(findings, 8, true)} ${pad(cits, 8, true)} ${pad(r.latency_ms, 7, true)} ${pad(`${mk}/${mk + md}`, 9, true)} ${pad(bf, 6, true)}`
|
||||
);
|
||||
}
|
||||
console.log("");
|
||||
}
|
||||
|
||||
// Per-mode averages
|
||||
console.log("═══ PER-MODE AGGREGATE ═══\n");
|
||||
console.log(` ${pad("mode", 28)} ${pad("n", 4, true)} ${pad("avg resp", 9, true)} ${pad("avg find", 9, true)} ${pad("avg cit", 8, true)} ${pad("avg ms", 8, true)}`);
|
||||
console.log(` ${"─".repeat(28)} ${"─".repeat(4)} ${"─".repeat(9)} ${"─".repeat(9)} ${"─".repeat(8)} ${"─".repeat(8)}`);
|
||||
for (const mode of modesSorted) {
|
||||
const modeRows = rows.filter(r => r.mode === mode);
|
||||
if (modeRows.length === 0) continue;
|
||||
const n = modeRows.length;
|
||||
const avgResp = Math.round(modeRows.reduce((s, r) => s + r.response_chars, 0) / n);
|
||||
const avgFind = Math.round(10 * modeRows.reduce((s, r) => s + countFindings(r.response), 0) / n) / 10;
|
||||
const avgCit = Math.round(10 * modeRows.reduce((s, r) => s + countPathwayCitations(r.response), 0) / n) / 10;
|
||||
const avgMs = Math.round(modeRows.reduce((s, r) => s + r.latency_ms, 0) / n);
|
||||
console.log(
|
||||
` ${pad(mode, 28)} ${pad(n, 4, true)} ${pad(avgResp, 9, true)} ${pad(avgFind, 9, true)} ${pad(avgCit, 8, true)} ${pad(avgMs, 8, true)}`
|
||||
);
|
||||
}
|
||||
|
||||
// Mode-relative: how often does each mode produce MORE findings than lakehouse?
|
||||
console.log("\n═══ MODE vs codereview_lakehouse (per file) ═══\n");
|
||||
console.log(` ${pad("mode", 28)} ${pad("wins", 5, true)} ${pad("losses", 7, true)} ${pad("ties", 5, true)} ${pad("Δ avg findings", 16, true)}`);
|
||||
console.log(` ${"─".repeat(28)} ${"─".repeat(5)} ${"─".repeat(7)} ${"─".repeat(5)} ${"─".repeat(16)}`);
|
||||
for (const mode of modesSorted) {
|
||||
if (mode === "codereview_lakehouse") continue;
|
||||
let wins = 0, losses = 0, ties = 0, totalDelta = 0, n = 0;
|
||||
for (const file of Object.keys(byFile)) {
|
||||
const baseline = byFile[file]["codereview_lakehouse"];
|
||||
const challenger = byFile[file][mode];
|
||||
if (!baseline || !challenger) continue;
|
||||
const bf = countFindings(baseline.response);
|
||||
const cf = countFindings(challenger.response);
|
||||
if (cf > bf) wins++;
|
||||
else if (cf < bf) losses++;
|
||||
else ties++;
|
||||
totalDelta += cf - bf;
|
||||
n++;
|
||||
}
|
||||
if (n === 0) continue;
|
||||
const avgDelta = (totalDelta / n).toFixed(1);
|
||||
console.log(
|
||||
` ${pad(mode, 28)} ${pad(wins, 5, true)} ${pad(losses, 7, true)} ${pad(ties, 5, true)} ${pad(avgDelta, 16, true)}`
|
||||
);
|
||||
}
|
||||
console.log("\n[compare] done\n");
|
||||
}
|
||||
|
||||
main();
|
||||
127
scripts/mode_experiment.ts
Normal file
127
scripts/mode_experiment.ts
Normal file
@ -0,0 +1,127 @@
|
||||
#!/usr/bin/env bun
|
||||
/**
|
||||
* Mode experiment harness — sweeps a set of files through every native
|
||||
* mode, calling /v1/mode/execute serially. Results land in the
|
||||
* mode_experiments.jsonl that the gateway already writes (the runner
|
||||
* appends per-call). This script just orchestrates the calls.
|
||||
*
|
||||
* Usage:
|
||||
* bun run scripts/mode_experiment.ts \
|
||||
* --files crates/queryd/src/delta.rs,crates/queryd/src/service.rs \
|
||||
* --modes codereview_lakehouse,codereview_null,codereview_isolation,codereview_matrix_only \
|
||||
* --model openai/gpt-oss-120b:free
|
||||
*
|
||||
* Defaults: 5 modes × $LH_EXPERIMENT_FILES files (or 2 default targets) ×
|
||||
* one model. Cloud-quota-resilient — uses OpenRouter free model unless
|
||||
* --model overrides.
|
||||
*/
|
||||
|
||||
const GATEWAY = process.env.LH_GATEWAY ?? "http://localhost:3100";
|
||||
const TASK_CLASS = process.env.LH_EXPERIMENT_TASK ?? "scrum_review";
|
||||
|
||||
const ALL_MODES = [
|
||||
"codereview_lakehouse",
|
||||
"codereview_null",
|
||||
"codereview_isolation",
|
||||
"codereview_matrix_only",
|
||||
"codereview_playbook_only",
|
||||
];
|
||||
|
||||
const DEFAULT_FILES = [
|
||||
"crates/queryd/src/delta.rs",
|
||||
"crates/queryd/src/service.rs",
|
||||
];
|
||||
|
||||
function parseArgs(): { files: string[]; modes: string[]; model: string } {
|
||||
const args = Bun.argv.slice(2);
|
||||
const out: Record<string, string> = {};
|
||||
for (let i = 0; i < args.length; i++) {
|
||||
const a = args[i];
|
||||
if (a.startsWith("--")) out[a.slice(2)] = args[++i] ?? "";
|
||||
}
|
||||
const files = (out.files ?? DEFAULT_FILES.join(",")).split(",").map(s => s.trim()).filter(Boolean);
|
||||
const modes = (out.modes ?? ALL_MODES.join(",")).split(",").map(s => s.trim()).filter(Boolean);
|
||||
const model = out.model ?? "openai/gpt-oss-120b:free";
|
||||
return { files, modes, model };
|
||||
}
|
||||
|
||||
interface RunResult {
|
||||
file: string;
|
||||
mode: string;
|
||||
ok: boolean;
|
||||
latency_ms?: number;
|
||||
response_chars?: number;
|
||||
enriched_chars?: number;
|
||||
bug_fingerprints?: number;
|
||||
matrix_kept?: number;
|
||||
matrix_dropped?: number;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
async function runOne(file: string, mode: string, model: string): Promise<RunResult> {
|
||||
const t0 = Date.now();
|
||||
try {
|
||||
const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
task_class: TASK_CLASS,
|
||||
file_path: file,
|
||||
force_mode: mode,
|
||||
force_model: model,
|
||||
}),
|
||||
signal: AbortSignal.timeout(180_000),
|
||||
});
|
||||
if (!r.ok) {
|
||||
const body = await r.text().catch(() => "");
|
||||
return { file, mode, ok: false, error: `HTTP ${r.status}: ${body.slice(0, 200)}` };
|
||||
}
|
||||
const j: any = await r.json();
|
||||
return {
|
||||
file, mode, ok: true,
|
||||
latency_ms: j.latency_ms,
|
||||
response_chars: (j.response ?? "").length,
|
||||
enriched_chars: j.enriched_prompt_chars,
|
||||
bug_fingerprints: j.sources?.bug_fingerprints_count,
|
||||
matrix_kept: j.sources?.matrix_chunks_kept,
|
||||
matrix_dropped: j.sources?.matrix_chunks_dropped,
|
||||
};
|
||||
} catch (e: any) {
|
||||
return { file, mode, ok: false, error: e.message, latency_ms: Date.now() - t0 };
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const { files, modes, model } = parseArgs();
|
||||
console.log(`[experiment] files=${files.length} × modes=${modes.length} = ${files.length * modes.length} runs`);
|
||||
console.log(`[experiment] model=${model} task=${TASK_CLASS} gateway=${GATEWAY}`);
|
||||
console.log("");
|
||||
|
||||
const results: RunResult[] = [];
|
||||
let i = 0;
|
||||
for (const file of files) {
|
||||
for (const mode of modes) {
|
||||
i++;
|
||||
process.stdout.write(` [${i}/${files.length * modes.length}] ${mode.padEnd(28)} ${file} ... `);
|
||||
const r = await runOne(file, mode, model);
|
||||
results.push(r);
|
||||
if (r.ok) {
|
||||
console.log(
|
||||
`✓ ${(r.response_chars ?? 0).toString().padStart(5)} chars | ` +
|
||||
`prompt ${(r.enriched_chars ?? 0).toString().padStart(5)} chars | ` +
|
||||
`${((r.latency_ms ?? 0) / 1000).toFixed(1).padStart(5)}s | ` +
|
||||
`bug=${r.bug_fingerprints ?? "-"} mtx=${r.matrix_kept ?? 0}/${(r.matrix_kept ?? 0) + (r.matrix_dropped ?? 0)}`
|
||||
);
|
||||
} else {
|
||||
console.log(`✗ ${r.error}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.log("");
|
||||
console.log(`[experiment] complete · ${results.filter(r => r.ok).length}/${results.length} succeeded`);
|
||||
console.log(`[experiment] full per-call detail in data/_kb/mode_experiments.jsonl`);
|
||||
console.log(`[experiment] aggregate with: bun run scripts/mode_compare.ts`);
|
||||
}
|
||||
|
||||
main().catch(e => { console.error(e); process.exit(1); });
|
||||
@ -1438,6 +1438,56 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
|
||||
// Collect attempts for the pathway trace sidecar.
|
||||
const pathwayAttempts: LadderAttemptRec[] = [];
|
||||
|
||||
// ─── Mode runner fast path (J 2026-04-26) ───
|
||||
// Modes are prompt-molders, not model-pickers. /v1/mode/execute
|
||||
// composes pathway memory + relevance-filtered matrix chunks +
|
||||
// focus-file context into ONE prompt designed for one-shot success.
|
||||
// Try it first; if the response is substantive, skip the ladder
|
||||
// entirely. If anything goes wrong, fall through unchanged.
|
||||
//
|
||||
// Off by default until we've A/B-validated quality vs the ladder.
|
||||
// LH_USE_MODE_RUNNER=1 enables. LH_MODE_MIN_CHARS controls the
|
||||
// success bar (default 2000 — anything shorter is treated as a
|
||||
// thin response and falls through).
|
||||
if (process.env.LH_USE_MODE_RUNNER === "1") {
|
||||
const minChars = Number(process.env.LH_MODE_MIN_CHARS ?? 2000);
|
||||
log(` ⚡ mode runner enabled — trying /v1/mode/execute (min_chars=${minChars})`);
|
||||
const t0 = Date.now();
|
||||
try {
|
||||
const r = await fetch(`${GATEWAY}/v1/mode/execute`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
task_class: taskClass,
|
||||
file_path: rel,
|
||||
file_content: content,
|
||||
}),
|
||||
signal: AbortSignal.timeout(180_000),
|
||||
});
|
||||
const modeMs = Date.now() - t0;
|
||||
if (r.ok) {
|
||||
const j: any = await r.json();
|
||||
const respChars = (j.response ?? "").length;
|
||||
if (respChars >= minChars) {
|
||||
log(` ✓ mode ${j.mode} → ${j.model} | ${j.enriched_prompt_chars} prompt chars → ${respChars} resp chars in ${modeMs}ms`);
|
||||
log(` sources: ${j.sources?.bug_fingerprints_count ?? 0} fingerprints, ${j.sources?.matrix_chunks_kept ?? 0}/${(j.sources?.matrix_chunks_kept ?? 0) + (j.sources?.matrix_chunks_dropped ?? 0)} matrix chunks kept`);
|
||||
accepted = j.response;
|
||||
acceptedModel = `mode_runner/${j.mode}/${j.model}`;
|
||||
acceptedOn = 1;
|
||||
history.push({ n: 1, model: j.model, status: "accepted", chars: respChars });
|
||||
pathwayAttempts.push({ rung: 0, model: j.model, latency_ms: modeMs, accepted: true, reject_reason: null });
|
||||
} else {
|
||||
log(` ✗ mode runner returned ${respChars} chars (<${minChars}), falling through to ladder`);
|
||||
}
|
||||
} else {
|
||||
const body = await r.text().catch(() => "");
|
||||
log(` ✗ mode runner HTTP ${r.status}: ${body.slice(0, 200)} — falling through to ladder`);
|
||||
}
|
||||
} catch (e: any) {
|
||||
log(` ✗ mode runner err: ${e.message} — falling through to ladder`);
|
||||
}
|
||||
}
|
||||
|
||||
// Single-model strategy with same-model retry. modelIdx advances
|
||||
// only on PROVIDER errors. Quality rejects from observer keep the
|
||||
// same model and retry with enriched context (history feeds back
|
||||
@ -1448,6 +1498,10 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
|
||||
let qualityRetriesOnCurrentModel = 0;
|
||||
|
||||
for (let step = 0; step < MAX_ATTEMPTS; step++) {
|
||||
// Mode runner already produced an acceptable response — short-circuit
|
||||
// the ladder. Falls through to the post-loop bookkeeping which
|
||||
// handles {history, pathwayAttempts, hotSwap replay, etc}.
|
||||
if (accepted) break;
|
||||
if (modelIdx >= ladderOrder.length) {
|
||||
log(` ✗ all ${ladderOrder.length} fallback models exhausted, marking UNRESOLVED`);
|
||||
break;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user