lakehouse/scripts/mode_compare.ts
root 7c47734287
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
v1/mode: parameterized runner + 5 enrichment-experiment modes
J's directive (2026-04-26): "Create different modes so we can really
dial in the architecture before it gets further along — pinpoint the
failures and strengths equally so I know what direction to go in.
Loop theater happens when we don't pinpoint the most accurate path."

Refactored execute() to switch on mode name → EnrichmentFlags preset.
Five native modes designed as deliberate experiments — each isolates
one architectural axis so the comparison matrix reads off what's
doing work vs what's adding latency for nothing:

  codereview_lakehouse     — all enrichment on (ceiling)
  codereview_null          — raw file + generic prompt (baseline)
  codereview_isolation     — file + pathway only (no matrix)
  codereview_matrix_only   — file + matrix only (no pathway)
  codereview_playbook_only — pathway only, NO file content (lossy ceiling)

Each call appends a row to data/_kb/mode_experiments.jsonl with full
sources + response. LH_MODE_LOG_OFF=1 to suppress.

scripts/mode_experiment.ts — sweeps files × modes serially, prints
live progress with per-call enrichment stats. Defaults to OpenRouter
free model so cloud quota doesn't gate experiments.

scripts/mode_compare.ts — reads the JSONL, outputs per-file matrix
+ per-mode aggregate + mode-vs-baseline win/loss with avg finding
delta. Heuristic finding-count from markdown table rows; pathway
citation count from preamble references.

scrum_master_pipeline.ts gets a mode-runner fast path gated by
LH_USE_MODE_RUNNER=1: try /v1/mode/execute first, fall through to
the existing ladder if response < LH_MODE_MIN_CHARS (default 2000)
or anything errors. Off by default until A/B-validated.

First experiment results (2 files × 5 modes via gpt-oss-120b:free):
  - codereview_null produces 12.6KB response with ZERO findings
    (proves adversarial framing is load-bearing)
  - codereview_playbook_only produces MORE findings than lakehouse
on average (12 vs 9) at 73% of the latency — pathway memory is
    the dominant signal driver
  - codereview_matrix_only underperforms isolation by ~0.5 findings
    while costing the same latency — matrix corpus likely
    underperforming for scrum_review task class

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 01:36:42 -05:00

187 lines
6.9 KiB
TypeScript

#!/usr/bin/env bun
/**
* Mode comparison aggregator — reads data/_kb/mode_experiments.jsonl
* (written per-call by /v1/mode/execute) and surfaces the cross-mode
* comparison matrix that lets us see what each enrichment dimension
* is actually doing.
*
* Per file, per mode, computes:
* - response_chars
* - finding_count (rows in markdown tables — heuristic, regex)
* - pathway_citations (mentions of "Pathway memory" or "📚")
* - latency_ms
* - matrix_chunks_kept / dropped
*
* Then surfaces:
* - per file, what each mode produced (rows next to each other)
* - per mode, average response_chars + latency
* - which modes ALWAYS underperform vs codereview_lakehouse
* - which signals (bug fingerprints, matrix) correlate with output size
*
* Usage: bun run scripts/mode_compare.ts [--jsonl path] [--since 2026-04-26]
*/
import { readFileSync, existsSync } from "node:fs";
/**
 * One logged /v1/mode/execute call, as appended per-call to
 * data/_kb/mode_experiments.jsonl (one JSON object per line).
 */
interface Row {
  /** ISO timestamp of the call; lexicographically comparable, used by --since. */
  ts: string;
  /** Mode preset name, e.g. "codereview_lakehouse". */
  mode: string;
  model: string;
  task_class: string;
  file_path: string;
  enriched_prompt_chars: number;
  response_chars: number;
  latency_ms: number;
  /** Enrichment provenance; all optional since each mode toggles dimensions off. */
  sources: {
    focus_file_bytes?: number;
    bug_fingerprints_count?: number;
    matrix_chunks_kept?: number;
    matrix_chunks_dropped?: number;
    relevance_filter_used?: boolean;
    // `unknown` instead of `any`: the flags-preset shape isn't pinned down
    // here, so consumers must narrow before use rather than silently
    // inheriting an unchecked type.
    flags?: unknown;
  };
  /** Full model response (markdown); findings/citations are parsed out of it. */
  response: string;
}
/**
 * Parse `--name value` CLI flags from Bun.argv.
 *
 * Returns the JSONL log path (defaulting to the standard experiment log)
 * and an optional ISO-date lower bound for row filtering.
 *
 * Fix: the previous version did `out[flag] = args[++i]`, which consumed the
 * NEXT token unconditionally — so `--jsonl --since 2026-04-26` would swallow
 * `--since` as the value of `--jsonl`. Now a following token is only taken
 * as a value when it is not itself a flag.
 */
function parseArgs(): { jsonl: string; since: string | null } {
  const args = Bun.argv.slice(2);
  const out: Record<string, string> = {};
  for (let i = 0; i < args.length; i++) {
    const a = args[i];
    if (!a.startsWith("--")) continue;
    const next = args[i + 1];
    if (next !== undefined && !next.startsWith("--")) {
      out[a.slice(2)] = next;
      i++; // value consumed
    } else {
      out[a.slice(2)] = ""; // flag given with no value
    }
  }
  return {
    jsonl: out.jsonl ?? "data/_kb/mode_experiments.jsonl",
    // `||` (not `??`) is deliberate: an empty --since means "no filter".
    since: out.since || null,
  };
}
/**
 * Read the experiment log at `path`, one JSON Row per line.
 *
 * Exits the process (code 1) when the file does not exist. Lines that fail
 * to parse are silently dropped (best-effort ingest), as are rows whose
 * timestamp sorts before `since` when a filter is given.
 */
function loadRows(path: string, since: string | null): Row[] {
  if (!existsSync(path)) {
    console.error(`[compare] no log file at ${path}`);
    process.exit(1);
  }
  const kept: Row[] = [];
  const raw = readFileSync(path, "utf8");
  for (const line of raw.split("\n")) {
    if (!line) continue; // blank / trailing newline
    let row: Row;
    try {
      row = JSON.parse(line);
    } catch {
      continue; // skip malformed
    }
    if (since && row.ts < since) continue;
    kept.push(row);
  }
  return kept;
}
/**
 * Heuristic finding count: markdown table rows whose first cell is a
 * (possibly bold) number, e.g. `| 3 | ...` or `| **3** | ...`.
 * Adversarial framing produces ranked tables, so this approximates findings.
 */
function countFindings(md: string): number {
  const numberedRow = /^\|\s*\*?\*?\d+\*?\*?\s*\|/gm;
  return (md.match(numberedRow) ?? []).length;
}
/**
 * Count references to the pathway-memory preamble: case-insensitive
 * "pathway memory" mentions plus the 📚 marker emoji.
 */
function countPathwayCitations(md: string): number {
  const hits = md.match(/pathway\s*memory|📚/gi);
  return hits === null ? 0 : hits.length;
}
/**
 * Fit `s` into exactly `n` characters: truncate when too long, otherwise
 * space-pad — on the left when `right` is true (right-align, used for
 * numeric columns), on the right otherwise (left-align).
 */
function pad(s: string | number, n: number, right = false): string {
  const text = `${s}`;
  if (text.length >= n) return text.slice(0, n);
  const fill = " ".repeat(n - text.length);
  return right ? fill + text : text + fill;
}
/**
 * Entry point: load logged rows, then print three report sections —
 * a per-file × per-mode matrix, per-mode averages, and a win/loss
 * comparison of every mode against the codereview_lakehouse baseline.
 * All output goes to stdout as fixed-width text tables.
 */
function main() {
const { jsonl, since } = parseArgs();
const rows = loadRows(jsonl, since);
if (rows.length === 0) {
console.error("[compare] no rows after filter");
process.exit(1);
}
// Group by file → mode
// Re-running an experiment overwrites the earlier row for that file/mode
// pair, so the per-file matrix and the baseline comparison always reflect
// the latest run.
const byFile: Record<string, Record<string, Row>> = {};
const allModes = new Set<string>();
for (const r of rows) {
byFile[r.file_path] ??= {};
byFile[r.file_path][r.mode] = r; // last-write-wins per mode per file
allModes.add(r.mode);
}
const modesSorted = [...allModes].sort();
// Per-file matrix
// One fixed-width table per file: response size, heuristic finding count,
// pathway citations, latency, matrix chunks kept/total, bug fingerprints.
console.log("\n═══ PER-FILE COMPARISON ═══\n");
for (const file of Object.keys(byFile).sort()) {
console.log(`📄 ${file}`);
console.log(
` ${pad("mode", 28)} ${pad("resp", 6, true)} ${pad("findings", 8, true)} ${pad("path_cit", 8, true)} ${pad("ms", 7, true)} ${pad("mtx k/d", 9, true)} ${pad("bug_fp", 6, true)}`
);
console.log(` ${"─".repeat(28)} ${"─".repeat(6)} ${"─".repeat(8)} ${"─".repeat(8)} ${"─".repeat(7)} ${"─".repeat(9)} ${"─".repeat(6)}`);
for (const mode of modesSorted) {
const r = byFile[file][mode];
if (!r) {
// Mode never ran against this file: print a placeholder row.
console.log(` ${pad(mode, 28)} ${pad("—", 6, true)}`);
continue;
}
const findings = countFindings(r.response);
const cits = countPathwayCitations(r.response);
const mk = r.sources.matrix_chunks_kept ?? 0;
const md = r.sources.matrix_chunks_dropped ?? 0;
const bf = r.sources.bug_fingerprints_count ?? 0;
console.log(
` ${pad(mode, 28)} ${pad(r.response_chars, 6, true)} ${pad(findings, 8, true)} ${pad(cits, 8, true)} ${pad(r.latency_ms, 7, true)} ${pad(`${mk}/${mk + md}`, 9, true)} ${pad(bf, 6, true)}`
);
}
console.log("");
}
// Per-mode averages
// NOTE(review): averages here run over ALL loaded rows — including
// earlier runs that the last-write-wins byFile grouping superseded —
// so denominators can differ from the per-file matrix above; confirm
// whether duplicates should be excluded.
console.log("═══ PER-MODE AGGREGATE ═══\n");
console.log(` ${pad("mode", 28)} ${pad("n", 4, true)} ${pad("avg resp", 9, true)} ${pad("avg find", 9, true)} ${pad("avg cit", 8, true)} ${pad("avg ms", 8, true)}`);
console.log(` ${"─".repeat(28)} ${"─".repeat(4)} ${"─".repeat(9)} ${"─".repeat(9)} ${"─".repeat(8)} ${"─".repeat(8)}`);
for (const mode of modesSorted) {
const modeRows = rows.filter(r => r.mode === mode);
if (modeRows.length === 0) continue;
const n = modeRows.length;
const avgResp = Math.round(modeRows.reduce((s, r) => s + r.response_chars, 0) / n);
// Round-to-one-decimal via the ×10 / ÷10 trick.
const avgFind = Math.round(10 * modeRows.reduce((s, r) => s + countFindings(r.response), 0) / n) / 10;
const avgCit = Math.round(10 * modeRows.reduce((s, r) => s + countPathwayCitations(r.response), 0) / n) / 10;
const avgMs = Math.round(modeRows.reduce((s, r) => s + r.latency_ms, 0) / n);
console.log(
` ${pad(mode, 28)} ${pad(n, 4, true)} ${pad(avgResp, 9, true)} ${pad(avgFind, 9, true)} ${pad(avgCit, 8, true)} ${pad(avgMs, 8, true)}`
);
}
// Mode-relative: how often does each mode produce MORE findings than lakehouse?
// Win = strictly more findings than the baseline on the same file; files
// missing either the baseline or the challenger row are skipped entirely.
console.log("\n═══ MODE vs codereview_lakehouse (per file) ═══\n");
console.log(` ${pad("mode", 28)} ${pad("wins", 5, true)} ${pad("losses", 7, true)} ${pad("ties", 5, true)} ${pad("Δ avg findings", 16, true)}`);
console.log(` ${"─".repeat(28)} ${"─".repeat(5)} ${"─".repeat(7)} ${"─".repeat(5)} ${"─".repeat(16)}`);
for (const mode of modesSorted) {
if (mode === "codereview_lakehouse") continue;
let wins = 0, losses = 0, ties = 0, totalDelta = 0, n = 0;
for (const file of Object.keys(byFile)) {
const baseline = byFile[file]["codereview_lakehouse"];
const challenger = byFile[file][mode];
if (!baseline || !challenger) continue;
const bf = countFindings(baseline.response);
const cf = countFindings(challenger.response);
if (cf > bf) wins++;
else if (cf < bf) losses++;
else ties++;
totalDelta += cf - bf;
n++;
}
// No comparable file pairs for this mode → omit the row.
if (n === 0) continue;
const avgDelta = (totalDelta / n).toFixed(1);
console.log(
` ${pad(mode, 28)} ${pad(wins, 5, true)} ${pad(losses, 7, true)} ${pad(ties, 5, true)} ${pad(avgDelta, 16, true)}`
);
}
console.log("\n[compare] done\n");
}
main();