Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
J's directive (2026-04-26): "Create different modes so we can really
dial in the architecture before it gets further along — pinpoint the
failures and strengths equally so I know what direction to go in.
Loop theater happens when we don't pinpoint the most accurate path."
Refactored execute() to switch on mode name → EnrichmentFlags preset.
Five native modes designed as deliberate experiments — each isolates
one architectural axis so the comparison matrix reads off what's
doing work vs what's adding latency for nothing:
codereview_lakehouse — all enrichment on (ceiling)
codereview_null — raw file + generic prompt (baseline)
codereview_isolation — file + pathway only (no matrix)
codereview_matrix_only — file + matrix only (no pathway)
codereview_playbook_only — pathway only, NO file content (lossy ceiling)
Each call appends a row to data/_kb/mode_experiments.jsonl with full
sources + response. LH_MODE_LOG_OFF=1 to suppress.
scripts/mode_experiment.ts — sweeps files × modes serially, prints
live progress with per-call enrichment stats. Defaults to OpenRouter
free model so cloud quota doesn't gate experiments.
scripts/mode_compare.ts — reads the JSONL, outputs per-file matrix
+ per-mode aggregate + mode-vs-baseline win/loss with avg finding
delta. Heuristic finding-count from markdown table rows; pathway
citation count from preamble references.
scrum_master_pipeline.ts gets a mode-runner fast path gated by
LH_USE_MODE_RUNNER=1: try /v1/mode/execute first, fall through to
the existing ladder if response < LH_MODE_MIN_CHARS (default 2000)
or anything errors. Off by default until A/B-validated.
First experiment results (2 files × 5 modes via gpt-oss-120b:free):
- codereview_null produces 12.6KB response with ZERO findings
(proves adversarial framing is load-bearing)
- codereview_playbook_only produces MORE findings than lakehouse
on average (12 vs 9) at 73% the latency — pathway memory is
the dominant signal driver
- codereview_matrix_only underperforms isolation by ~0.5 findings
while costing the same latency — matrix corpus likely
underperforming for scrum_review task class
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
187 lines
6.9 KiB
TypeScript
#!/usr/bin/env bun
|
|
/**
|
|
* Mode comparison aggregator — reads data/_kb/mode_experiments.jsonl
|
|
* (written per-call by /v1/mode/execute) and surfaces the cross-mode
|
|
* comparison matrix that lets us see what each enrichment dimension
|
|
* is actually doing.
|
|
*
|
|
* Per file, per mode, computes:
|
|
* - response_chars
|
|
* - finding_count (rows in markdown tables — heuristic, regex)
|
|
* - pathway_citations (mentions of "Pathway memory" or "📚")
|
|
* - latency_ms
|
|
* - matrix_chunks_kept / dropped
|
|
*
|
|
* Then surfaces:
|
|
* - per file, what each mode produced (rows next to each other)
|
|
* - per mode, average response_chars + latency
|
|
* - which modes ALWAYS underperform vs codereview_lakehouse
|
|
* - which signals (bug fingerprints, matrix) correlate with output size
|
|
*
|
|
* Usage: bun run scripts/mode_compare.ts [--jsonl path] [--since 2026-04-26]
|
|
*/
|
|
|
|
import { readFileSync, existsSync } from "node:fs";
|
|
|
|
interface Row {
|
|
ts: string;
|
|
mode: string;
|
|
model: string;
|
|
task_class: string;
|
|
file_path: string;
|
|
enriched_prompt_chars: number;
|
|
response_chars: number;
|
|
latency_ms: number;
|
|
sources: {
|
|
focus_file_bytes?: number;
|
|
bug_fingerprints_count?: number;
|
|
matrix_chunks_kept?: number;
|
|
matrix_chunks_dropped?: number;
|
|
relevance_filter_used?: boolean;
|
|
flags?: any;
|
|
};
|
|
response: string;
|
|
}
|
|
|
|
function parseArgs(): { jsonl: string; since: string | null } {
|
|
const args = Bun.argv.slice(2);
|
|
const out: Record<string, string> = {};
|
|
for (let i = 0; i < args.length; i++) {
|
|
const a = args[i];
|
|
if (a.startsWith("--")) out[a.slice(2)] = args[++i] ?? "";
|
|
}
|
|
return {
|
|
jsonl: out.jsonl ?? "data/_kb/mode_experiments.jsonl",
|
|
since: out.since || null,
|
|
};
|
|
}
|
|
|
|
function loadRows(path: string, since: string | null): Row[] {
|
|
if (!existsSync(path)) {
|
|
console.error(`[compare] no log file at ${path}`);
|
|
process.exit(1);
|
|
}
|
|
const lines = readFileSync(path, "utf8").split("\n").filter(Boolean);
|
|
const rows: Row[] = [];
|
|
for (const line of lines) {
|
|
try {
|
|
const r: Row = JSON.parse(line);
|
|
if (since && r.ts < since) continue;
|
|
rows.push(r);
|
|
} catch {
|
|
// skip malformed
|
|
}
|
|
}
|
|
return rows;
|
|
}
|
|
|
|
function countFindings(md: string): number {
|
|
// Markdown table rows that look like findings: `| <num> | ...` or `| **N** | ...`
|
|
// Heuristic — adversarial framing produces ranked tables.
|
|
const matches = md.match(/^\|\s*\*?\*?\d+\*?\*?\s*\|/gm);
|
|
return matches ? matches.length : 0;
|
|
}
|
|
|
|
function countPathwayCitations(md: string): number {
|
|
// How many times the model referenced the pathway memory preamble.
|
|
const re = /pathway\s*memory|📚/gi;
|
|
return (md.match(re) ?? []).length;
|
|
}
|
|
|
|
function pad(s: string | number, n: number, right = false): string {
|
|
const str = String(s);
|
|
if (str.length >= n) return str.slice(0, n);
|
|
return right ? " ".repeat(n - str.length) + str : str + " ".repeat(n - str.length);
|
|
}
|
|
|
|
/**
 * Entry point. Loads experiment rows and prints three report sections:
 *   1. PER-FILE COMPARISON — every mode's metrics side by side per file
 *   2. PER-MODE AGGREGATE  — averages across logged rows for each mode
 *   3. MODE vs codereview_lakehouse — win/loss/tie on finding count per
 *      file, with the average finding delta vs the baseline mode
 * Exits with status 1 when the --since filter leaves nothing to report
 * (loadRows already exits when the log file is missing).
 */
function main() {
  const { jsonl, since } = parseArgs();
  const rows = loadRows(jsonl, since);
  if (rows.length === 0) {
    console.error("[compare] no rows after filter");
    process.exit(1);
  }

  // Group by file → mode
  const byFile: Record<string, Record<string, Row>> = {};
  const allModes = new Set<string>();
  for (const r of rows) {
    byFile[r.file_path] ??= {};
    byFile[r.file_path][r.mode] = r; // last-write-wins per mode per file
    allModes.add(r.mode);
  }
  // Sorted so the mode rows line up identically across files.
  const modesSorted = [...allModes].sort();

  // Per-file matrix
  console.log("\n═══ PER-FILE COMPARISON ═══\n");
  for (const file of Object.keys(byFile).sort()) {
    console.log(`📄 ${file}`);
    // Column widths here must match the data rows and separator below.
    console.log(
      `  ${pad("mode", 28)} ${pad("resp", 6, true)} ${pad("findings", 8, true)} ${pad("path_cit", 8, true)} ${pad("ms", 7, true)} ${pad("mtx k/d", 9, true)} ${pad("bug_fp", 6, true)}`
    );
    console.log(`  ${"─".repeat(28)} ${"─".repeat(6)} ${"─".repeat(8)} ${"─".repeat(8)} ${"─".repeat(7)} ${"─".repeat(9)} ${"─".repeat(6)}`);
    for (const mode of modesSorted) {
      const r = byFile[file][mode];
      if (!r) {
        // Mode never ran for this file — placeholder row.
        console.log(`  ${pad(mode, 28)} ${pad("—", 6, true)}`);
        continue;
      }
      const findings = countFindings(r.response);
      const cits = countPathwayCitations(r.response);
      // Missing source counters default to 0 (mode had that input disabled).
      const mk = r.sources.matrix_chunks_kept ?? 0;
      const md = r.sources.matrix_chunks_dropped ?? 0;
      const bf = r.sources.bug_fingerprints_count ?? 0;
      // "mtx k/d" column shows kept / total-considered (kept + dropped).
      console.log(
        `  ${pad(mode, 28)} ${pad(r.response_chars, 6, true)} ${pad(findings, 8, true)} ${pad(cits, 8, true)} ${pad(r.latency_ms, 7, true)} ${pad(`${mk}/${mk + md}`, 9, true)} ${pad(bf, 6, true)}`
      );
    }
    console.log("");
  }

  // Per-mode averages
  // NOTE(review): these averages run over ALL logged rows, including
  // re-runs that the per-file matrix above superseded via
  // last-write-wins — confirm that double-counting re-runs is intended.
  console.log("═══ PER-MODE AGGREGATE ═══\n");
  console.log(`  ${pad("mode", 28)} ${pad("n", 4, true)} ${pad("avg resp", 9, true)} ${pad("avg find", 9, true)} ${pad("avg cit", 8, true)} ${pad("avg ms", 8, true)}`);
  console.log(`  ${"─".repeat(28)} ${"─".repeat(4)} ${"─".repeat(9)} ${"─".repeat(9)} ${"─".repeat(8)} ${"─".repeat(8)}`);
  for (const mode of modesSorted) {
    const modeRows = rows.filter(r => r.mode === mode);
    if (modeRows.length === 0) continue;
    const n = modeRows.length;
    const avgResp = Math.round(modeRows.reduce((s, r) => s + r.response_chars, 0) / n);
    // Round-to-one-decimal via *10 / round / /10.
    const avgFind = Math.round(10 * modeRows.reduce((s, r) => s + countFindings(r.response), 0) / n) / 10;
    const avgCit = Math.round(10 * modeRows.reduce((s, r) => s + countPathwayCitations(r.response), 0) / n) / 10;
    const avgMs = Math.round(modeRows.reduce((s, r) => s + r.latency_ms, 0) / n);
    console.log(
      `  ${pad(mode, 28)} ${pad(n, 4, true)} ${pad(avgResp, 9, true)} ${pad(avgFind, 9, true)} ${pad(avgCit, 8, true)} ${pad(avgMs, 8, true)}`
    );
  }

  // Mode-relative: how often does each mode produce MORE findings than lakehouse?
  console.log("\n═══ MODE vs codereview_lakehouse (per file) ═══\n");
  console.log(`  ${pad("mode", 28)} ${pad("wins", 5, true)} ${pad("losses", 7, true)} ${pad("ties", 5, true)} ${pad("Δ avg findings", 16, true)}`);
  console.log(`  ${"─".repeat(28)} ${"─".repeat(5)} ${"─".repeat(7)} ${"─".repeat(5)} ${"─".repeat(16)}`);
  for (const mode of modesSorted) {
    if (mode === "codereview_lakehouse") continue; // baseline vs itself is meaningless
    let wins = 0, losses = 0, ties = 0, totalDelta = 0, n = 0;
    for (const file of Object.keys(byFile)) {
      const baseline = byFile[file]["codereview_lakehouse"];
      const challenger = byFile[file][mode];
      // Only files where BOTH the baseline and this mode ran count.
      if (!baseline || !challenger) continue;
      const bf = countFindings(baseline.response);
      const cf = countFindings(challenger.response);
      if (cf > bf) wins++;
      else if (cf < bf) losses++;
      else ties++;
      totalDelta += cf - bf;
      n++;
    }
    if (n === 0) continue; // no comparable files — skip the row entirely
    const avgDelta = (totalDelta / n).toFixed(1);
    console.log(
      `  ${pad(mode, 28)} ${pad(wins, 5, true)} ${pad(losses, 7, true)} ${pad(ties, 5, true)} ${pad(avgDelta, 16, true)}`
    );
  }
  console.log("\n[compare] done\n");
}
|
|
|
|
// Run immediately — this file is a CLI script (see shebang), not a library.
main();
|