// Compounding Stress Battery — the rigorous smoke test.
//
// Three iterations against /v1/respond, each running:
//   α baseline (3 easy tasks) — should complete local-only with boost
//   β drift (3 niche tasks) — forces executor miss → overseer fires
//   γ impossible (2 zero-supply) — must fail honestly, no token explosion
//   δ distill outcomes — writes distilled_*.jsonl + vector indexes
//   ε overseer meta-review — gpt-oss:120b judges the iteration
//   ζ scrum judgment — gpt-oss:120b reviews overseer proposals
//
// Iteration N+1 runs the same tasks as iteration N. We measure compounding:
// does turns_per_task drop? does overseer_called_rate drop? does
// correction_effective rise? If 3/5 metrics trend favorably, architecture
// validated; otherwise the scrum verdict points at what to fix.
//
// Fail-fast: every error bubbles. No silent catches — the run ABORTS with
// the underlying stack so we see exactly where the architecture broke.
//
// Runtime: ~60-90 min. Cloud cost: ~24-32 gpt-oss calls (well under daily cap).

import { writeFile, mkdir, readFile } from "node:fs/promises";
import { join } from "node:path";

// Service endpoints and artifact directory. Each is overridable via the
// environment so the battery can run against non-default local stacks.
const GATEWAY = process.env.GATEWAY_URL ?? "http://localhost:3100";
const LLM_TEAM = process.env.LLM_TEAM_URL ?? "http://localhost:5000";
const BATTERY_DIR = process.env.BATTERY_DIR ?? "/home/profit/lakehouse/data/_kb/battery";

// 10-minute timeout per /v1/respond call — cloud executor on a hard task
// can chew for a while, and we want to see real behavior, not premature aborts.
const RESPOND_TIMEOUT_MS = 10 * 60 * 1000; const META_TIMEOUT_MS = 5 * 60 * 1000; interface Task { task_class: string; operation: string; spec: Record; } interface Tasks { phases: { alpha_baseline: Task[]; beta_drift: Task[]; gamma_impossible: Task[]; }; models: { executor_cloud: string; reviewer_cloud: string; overseer_cloud: string; }; } interface RunResult { status: "ok" | "failed" | "blocked"; iterations: number; artifact: any; log: any[]; error?: string | null; _elapsed_ms: number; } interface TaskRun { task: Task; phase: "alpha" | "beta" | "gamma"; result: RunResult; } // ─── HTTP helpers ─── async function runRespond(task: Task, models: Tasks["models"]): Promise { const body = { task_class: task.task_class, operation: task.operation, spec: task.spec, executor_model: models.executor_cloud, reviewer_model: models.reviewer_cloud, }; const start = Date.now(); const resp = await fetch(`${GATEWAY}/v1/respond`, { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify(body), signal: AbortSignal.timeout(RESPOND_TIMEOUT_MS), }); if (!resp.ok) { const txt = await resp.text(); throw new Error(`/v1/respond HTTP ${resp.status}: ${txt.slice(0, 500)}`); } const j = (await resp.json()) as RunResult; j._elapsed_ms = Date.now() - start; return j; } async function runDistill(source: string): Promise { const body = { mode: "distill", prompt: "battery iteration distill", source }; const resp = await fetch(`${LLM_TEAM}/api/run?mode=distill`, { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify(body), signal: AbortSignal.timeout(META_TIMEOUT_MS), }); if (!resp.ok) throw new Error(`distill HTTP ${resp.status}`); const text = await resp.text(); // SSE stream — parse data: lines, return parsed event objects const events: any[] = []; for (const line of text.split("\n")) { if (!line.startsWith("data: ")) continue; try { events.push(JSON.parse(line.slice(6))); } catch { /* skip */ } } return events; } async function 
cloudChat( model: string, prompt: string, temperature: number, think: boolean, ): Promise { const resp = await fetch(`${GATEWAY}/v1/chat`, { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ model, messages: [{ role: "user", content: prompt }], temperature, think, provider: "ollama_cloud", }), signal: AbortSignal.timeout(META_TIMEOUT_MS), }); if (!resp.ok) { const txt = await resp.text(); throw new Error(`/v1/chat ${model} HTTP ${resp.status}: ${txt.slice(0, 500)}`); } const j = await resp.json() as any; return j.choices?.[0]?.message?.content ?? ""; } // ─── Meta-review + scrum ─── async function overseerReview( iterNum: number, artifacts: any, models: Tasks["models"], ): Promise { const prompt = `You are the OVERSEER reviewing iteration ${iterNum} of a stress battery run against Lakehouse /v1/respond. For each task in the battery below, examine: status (ok/failed/blocked), iterations used, error signature, whether the in-loop overseer fired, total tokens. Produce a PR-style meta-review in markdown with these sections: ## What worked List specific tasks (by operation string) that completed correctly and the evidence — turns_used, citations, tokens. Be concrete. ## What failed List specific tasks that failed or needed overseer correction. Classify: was it a real failure (impossible task), a drift we should repair, or a false positive from the test? ## Proposed changes for iteration ${iterNum + 1} At least 3 concrete architectural changes, each with: - **Target file** (e.g. \`crates/gateway/src/execution_loop/mod.rs\`) - **Rationale** (what the metrics show) - **Expected impact** (which metric should move in iter ${iterNum + 1}) Be honest about weaknesses. Do NOT propose generic best practices — reference specific observations from the artifacts below. 
ARTIFACTS (iteration ${iterNum}): ${JSON.stringify(artifacts, null, 2).slice(0, 30000)}`; return cloudChat(models.overseer_cloud, prompt, 0.2, true); } async function scrumJudge( iterNum: number, review: string, models: Tasks["models"], ): Promise { const prompt = `You are the SCRUM MASTER. The OVERSEER proposed these architectural changes for iteration ${iterNum + 1} based on iteration ${iterNum}'s results. For each proposal, produce a verdict in markdown: - **Proposal N**: - **Verdict**: APPROVE | REVISE | REJECT - **Reason**: why - **If APPROVE**: is the expected impact realistic? what's the blast radius? is the target file correct? - **If REVISE**: what should change about the proposal before applying? - **If REJECT**: why is the proposal wrong or out of scope? Final section: ## PR-ready changes Bulleted list of only the APPROVE proposals, ready to apply. Be rigorous. Don't rubber-stamp. If a proposal references a file that probably doesn't exist, REJECT and say so. If a proposal is a generic "improve X" without concrete plan, REVISE. OVERSEER PROPOSED: ${review.slice(0, 15000)}`; return cloudChat(models.overseer_cloud, prompt, 0.1, true); } // ─── Iteration driver ─── async function runIteration(iterNum: number, tasks: Tasks): Promise { console.log(`\n${"═".repeat(60)}`); console.log(`▶ ITERATION ${iterNum}`); console.log(`${"═".repeat(60)}\n`); const iterDir = join(BATTERY_DIR, `iter_${iterNum}`); await mkdir(iterDir, { recursive: true }); const runs: TaskRun[] = []; for (const [phaseKey, phaseName] of [ ["alpha_baseline", "alpha"], ["beta_drift", "beta"], ["gamma_impossible", "gamma"], ] as const) { console.log(`\n── Phase ${phaseName} ──`); for (const task of tasks.phases[phaseKey]) { console.log(` ▶ ${task.operation}`); const result = await runRespond(task, tasks.models); const overseerFired = (result.log ?? 
[]).some(e => e.kind === "overseer_correction"); console.log( ` status=${result.status} turns=${result.iterations}` + ` tokens=${result.artifact?.usage?.total_tokens ?? 0}` + ` overseer=${overseerFired}` + ` elapsed=${Math.round(result._elapsed_ms / 1000)}s` ); if (result.error) console.log(` error: ${result.error.slice(0, 200)}`); runs.push({ task, phase: phaseName, result }); } } // Phase δ console.log(`\n── Phase δ: distill outcomes_tail:20 ──`); const distillEvents = await runDistill("outcomes_tail:20"); const distillFinal = [...distillEvents].reverse() .find(e => e.role === "final") ?? distillEvents[distillEvents.length - 1]; const distillText = distillFinal?.text ?? JSON.stringify(distillFinal ?? {}).slice(0, 200); console.log(` ${distillText.split("\n")[0]}`); await writeFile(join(iterDir, "distill_output.txt"), distillText); // Metrics const collectPhase = (p: string) => runs.filter(r => r.phase === p); const phaseMetrics = (p: string) => { const ps = collectPhase(p); if (ps.length === 0) return { count: 0 }; return { count: ps.length, ok: ps.filter(r => r.result.status === "ok").length, failed: ps.filter(r => r.result.status === "failed").length, avg_turns: ps.reduce((s, r) => s + (r.result.iterations || 0), 0) / ps.length, total_tokens: ps.reduce((s, r) => s + (r.result.artifact?.usage?.total_tokens ?? 0), 0), overseer_called: ps.filter(r => (r.result.log ?? []).some(e => e.kind === "overseer_correction")).length, avg_elapsed_s: ps.reduce((s, r) => s + (r.result._elapsed_ms || 0), 0) / ps.length / 1000, }; }; const metrics = { iteration: iterNum, total_tasks: runs.length, ok_tasks: runs.filter(r => r.result.status === "ok").length, failed_tasks: runs.filter(r => r.result.status === "failed").length, blocked_tasks: runs.filter(r => r.result.status === "blocked").length, total_tokens: runs.reduce((s, r) => s + (r.result.artifact?.usage?.total_tokens ?? 
0), 0), avg_turns_per_task: runs.reduce((s, r) => s + (r.result.iterations || 0), 0) / runs.length, overseer_called_rate: runs.filter(r => (r.result.log ?? []).some(e => e.kind === "overseer_correction")).length / runs.length, total_elapsed_s: runs.reduce((s, r) => s + (r.result._elapsed_ms || 0), 0) / 1000, by_phase: { alpha: phaseMetrics("alpha"), beta: phaseMetrics("beta"), gamma: phaseMetrics("gamma"), }, }; console.log(`\n── Metrics ──`); console.log(` total_tokens: ${metrics.total_tokens}`); console.log(` avg_turns_per_task: ${metrics.avg_turns_per_task.toFixed(2)}`); console.log(` overseer_called_rate: ${(metrics.overseer_called_rate * 100).toFixed(1)}%`); console.log(` ok/total: ${metrics.ok_tasks}/${metrics.total_tasks}`); await writeFile(join(iterDir, "runs.json"), JSON.stringify(runs, null, 2)); await writeFile(join(iterDir, "metrics.json"), JSON.stringify(metrics, null, 2)); // Phase ε: overseer review console.log(`\n── Phase ε: overseer meta-review ──`); const reviewInput = { metrics, task_summary: runs.map(r => ({ operation: r.task.operation, phase: r.phase, status: r.result.status, iterations: r.result.iterations, tokens: r.result.artifact?.usage?.total_tokens ?? 0, overseer_called: (r.result.log ?? []).some(e => e.kind === "overseer_correction"), error: r.result.error ?? 
null, elapsed_s: Math.round((r.result._elapsed_ms || 0) / 1000), })), }; const review = await overseerReview(iterNum, reviewInput, tasks.models); await writeFile(join(iterDir, "overseer_review.md"), review); console.log(` ✓ ${review.length} chars`); // Phase ζ: scrum console.log(`\n── Phase ζ: scrum judgment ──`); const verdict = await scrumJudge(iterNum, review, tasks.models); await writeFile(join(iterDir, "scrum_findings.md"), verdict); console.log(` ✓ ${verdict.length} chars`); return metrics; } // ─── Main ─── async function main() { const tasks = JSON.parse( await readFile("/home/profit/lakehouse/tests/battery/tasks.json", "utf8"), ) as Tasks; await mkdir(BATTERY_DIR, { recursive: true }); const iterations: any[] = []; const batteryStart = Date.now(); for (let i = 1; i <= 3; i++) { const m = await runIteration(i, tasks); iterations.push(m); } const batteryElapsed = (Date.now() - batteryStart) / 1000; // Summary const delta = (k: keyof any, inverted = false) => { const vals = iterations.map((m: any) => m[k]); if (vals.some(v => v === undefined)) return "—"; const diff = vals[2] - vals[0]; const pct = vals[0] !== 0 ? (diff / vals[0]) * 100 : 0; const arrow = inverted ? (diff < 0 ? "↓ better" : "↑ worse") : (diff > 0 ? "↑ better" : "↓ worse"); return `${arrow} (${diff > 0 ? "+" : ""}${diff.toFixed?.(2) ?? 
diff}, ${pct.toFixed(1)}%)`; }; const rows = [ ["total_tokens", "inverted", "want ↓ — fewer tokens for same work"], ["avg_turns_per_task", "inverted", "want ↓ — executor gets smarter"], ["overseer_called_rate", "inverted", "want ↓ — fewer cloud escalations"], ["ok_tasks", "normal", "want ↑ — more successes"], ["total_elapsed_s", "inverted", "want ↓ — faster iterations"], ]; let summary = `# Compounding Stress Battery — Summary\n\n`; summary += `**Run:** ${new Date().toISOString()}\n`; summary += `**Elapsed:** ${Math.round(batteryElapsed)}s (${(batteryElapsed/60).toFixed(1)} min)\n`; summary += `**Models:** executor=${tasks.models.executor_cloud}, reviewer=${tasks.models.reviewer_cloud}, overseer=${tasks.models.overseer_cloud}\n\n`; summary += `## Compounding Metrics\n\n`; summary += `| Metric | iter 1 | iter 2 | iter 3 | Trend (1→3) | Goal |\n`; summary += `|---|---|---|---|---|---|\n`; for (const [key, inv, goal] of rows) { const vals = iterations.map((m: any) => { const v = m[key as string]; return typeof v === "number" ? v.toFixed(2) : String(v); }); summary += `| ${key} | ${vals[0]} | ${vals[1]} | ${vals[2]} | ${delta(key as any, inv === "inverted")} | ${goal} |\n`; } summary += "\n"; // Count trending metrics const trends = rows.map(([k, inv]) => { const vs = iterations.map((m: any) => m[k as string]) as number[]; const improved = inv === "inverted" ? vs[2] < vs[0] : vs[2] > vs[0]; return { metric: k, improved }; }); const improvedCount = trends.filter(t => t.improved).length; summary += `## Verdict\n\n`; if (improvedCount >= 3) { summary += `**✓ Architecture validated** — ${improvedCount}/${trends.length} compounding metrics improved from iteration 1 to 3.\n\n`; } else { summary += `**✗ Compounding NOT demonstrated** — only ${improvedCount}/${trends.length} metrics improved. See scrum_findings.md in each iter_N/ directory for the overseer's proposals and the scrum master's review of what to change.\n\n`; } summary += `Metrics that ${improvedCount >= 3 ? 
"improved" : "regressed"}:\n`; for (const t of trends) { summary += `- ${t.metric}: ${t.improved ? "✓ improved" : "✗ flat or worse"}\n`; } summary += `\n## Artifacts\n\n`; summary += `- \`iter_1/\`, \`iter_2/\`, \`iter_3/\` — per-iteration runs.json, metrics.json, overseer_review.md, scrum_findings.md, distill_output.txt\n`; summary += `- \`summary.md\` — this file\n`; await writeFile(join(BATTERY_DIR, "summary.md"), summary); console.log(`\n${"═".repeat(60)}`); console.log(`✓ BATTERY COMPLETE — ${Math.round(batteryElapsed)}s`); console.log(` Summary: ${join(BATTERY_DIR, "summary.md")}`); console.log(`${"═".repeat(60)}\n`); console.log(summary); } main().catch(e => { console.error(`\n${"═".repeat(60)}`); console.error(`✗ BATTERY FAILED: ${e.message}`); console.error(`${"═".repeat(60)}\n`); if (e.stack) console.error(e.stack); process.exit(1); });