lakehouse/tests/battery/compounding_battery.ts
root 41b0a99ed2 chore: add real content that was sitting untracked
Surfaced by today's untracked-files audit. None of these are accidents —
multiple are referenced by name in CLAUDE.md and memory files but were
never added.

Categories:
- docs/PHASE_AUDIT_GUIDE.md (106 LOC) — Claude Code phase audit guidance
- ops/systemd/lakehouse-langfuse-bridge.service — Langfuse bridge unit
- package.json — top-level npm manifest
- scripts/e2e_pipeline_check.sh + production_smoke.sh — real test scripts
- reports/kimi/audit-last-week*.md — the "Two reports live" that CLAUDE.md cites
- tests/multi-agent/scenarios/ — 44 staffing scenarios (cutover decision A)
- tests/multi-agent/playbooks/ — 102 playbook records
- tests/battery/, tests/agent_test/PRD.md, tests/real-world/* — real tests
- sidecar/sidecar/{lab_ui,pipeline_lab}.py — 888 LOC dev-only UIs that
  remain in service post-sidecar-drop (commit ba928b1 explicitly kept them)

Sensitivity check: scenarios use synthetic company names ("Heritage Foods",
"Cornerstone Fabrication"); audit reports describe code findings only;
no PII or secrets surfaced.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 22:22:10 -05:00

// Compounding Stress Battery — the rigorous smoke test.
//
// Three iterations against /v1/respond, each running:
// α baseline (3 easy tasks) — should complete local-only with boost
// β drift (3 niche tasks) — forces executor miss → overseer fires
// γ impossible (2 zero-supply) — must fail honestly, no token explosion
// δ distill outcomes — writes distilled_*.jsonl + vector indexes
// ε overseer meta-review — gpt-oss:120b judges the iteration
// ζ scrum judgment — gpt-oss:120b reviews overseer proposals
//
// Iteration N+1 runs the same tasks as iteration N. We measure compounding:
// does turns_per_task drop? does overseer_called_rate drop? does
// correction_effective rise? If 3/5 metrics trend favorably, architecture
// validated; otherwise the scrum verdict points at what to fix.
//
// Fail-fast: every error bubbles. No silent catches — the run ABORTS with
// the underlying stack so we see exactly where the architecture broke.
//
// Runtime: ~60-90 min. Cloud cost: ~24-32 gpt-oss calls (well under daily cap).
import { writeFile, mkdir, readFile } from "node:fs/promises";
import { join } from "node:path";
const GATEWAY = process.env.GATEWAY_URL ?? "http://localhost:3100";
const LLM_TEAM = process.env.LLM_TEAM_URL ?? "http://localhost:5000";
const BATTERY_DIR = process.env.BATTERY_DIR
  ?? "/home/profit/lakehouse/data/_kb/battery";
// 10-minute timeout per /v1/respond call — cloud executor on a hard task
// can chew for a while, and we want to see real behavior, not premature aborts.
const RESPOND_TIMEOUT_MS = 10 * 60 * 1000;
const META_TIMEOUT_MS = 5 * 60 * 1000;
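// Example invocation (the TS runner and the /tmp path are hypothetical
// illustrations, not specified by this repo):
//   GATEWAY_URL=http://localhost:3100 BATTERY_DIR=/tmp/battery \
//     npx tsx lakehouse/tests/battery/compounding_battery.ts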
interface Task {
  task_class: string;
  operation: string;
  spec: Record<string, any>;
}

interface Tasks {
  phases: {
    alpha_baseline: Task[];
    beta_drift: Task[];
    gamma_impossible: Task[];
  };
  models: {
    executor_cloud: string;
    reviewer_cloud: string;
    overseer_cloud: string;
  };
}
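// Minimal tasks.json sketch matching the Tasks interface above (the operation
// string and spec are hypothetical placeholders; gpt-oss:120b is the cloud
// model the header comment names):
//   {
//     "phases": {
//       "alpha_baseline": [{ "task_class": "kb", "operation": "summarize_doc", "spec": {} }],
//       "beta_drift": [],
//       "gamma_impossible": []
//     },
//     "models": {
//       "executor_cloud": "gpt-oss:120b",
//       "reviewer_cloud": "gpt-oss:120b",
//       "overseer_cloud": "gpt-oss:120b"
//     }
//   }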
interface RunResult {
  status: "ok" | "failed" | "blocked";
  iterations: number;
  artifact: any;
  log: any[];
  error?: string | null;
  _elapsed_ms: number;
}

interface TaskRun {
  task: Task;
  phase: "alpha" | "beta" | "gamma";
  result: RunResult;
}
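// RunResult.log is a list of loosely-typed gateway events. The only shape the
// battery relies on is the overseer-correction marker, assumed to look roughly
// like this (the exact payload is defined by the gateway, not here):
//   { kind: "overseer_correction", ... }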
// ─── HTTP helpers ───
async function runRespond(task: Task, models: Tasks["models"]): Promise<RunResult> {
  const body = {
    task_class: task.task_class,
    operation: task.operation,
    spec: task.spec,
    executor_model: models.executor_cloud,
    reviewer_model: models.reviewer_cloud,
  };
  const start = Date.now();
  const resp = await fetch(`${GATEWAY}/v1/respond`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(RESPOND_TIMEOUT_MS),
  });
  if (!resp.ok) {
    const txt = await resp.text();
    throw new Error(`/v1/respond HTTP ${resp.status}: ${txt.slice(0, 500)}`);
  }
  const j = (await resp.json()) as RunResult;
  j._elapsed_ms = Date.now() - start;
  return j;
}
async function runDistill(source: string): Promise<any[]> {
  const body = { mode: "distill", prompt: "battery iteration distill", source };
  const resp = await fetch(`${LLM_TEAM}/api/run?mode=distill`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(META_TIMEOUT_MS),
  });
  if (!resp.ok) throw new Error(`distill HTTP ${resp.status}`);
  const text = await resp.text();
  // SSE stream — parse `data:` lines, return the parsed event objects
  const events: any[] = [];
  for (const line of text.split("\n")) {
    if (!line.startsWith("data: ")) continue;
    try { events.push(JSON.parse(line.slice(6))); } catch { /* skip malformed event */ }
  }
  return events;
}
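// Sketch of the SSE lines runDistill parses — the `role`/`text` shape is
// inferred from the Phase δ reads below; payload values are hypothetical:
//   data: {"role":"step","text":"indexing outcomes_tail:20 ..."}
//   data: {"role":"final","text":"wrote distilled_0001.jsonl + vector index"}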
async function cloudChat(
  model: string,
  prompt: string,
  temperature: number,
  think: boolean,
): Promise<string> {
  const resp = await fetch(`${GATEWAY}/v1/chat`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({
      model,
      messages: [{ role: "user", content: prompt }],
      temperature,
      think,
      provider: "ollama_cloud",
    }),
    signal: AbortSignal.timeout(META_TIMEOUT_MS),
  });
  if (!resp.ok) {
    const txt = await resp.text();
    throw new Error(`/v1/chat ${model} HTTP ${resp.status}: ${txt.slice(0, 500)}`);
  }
  const j = await resp.json() as any;
  return j.choices?.[0]?.message?.content ?? "";
}
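// cloudChat assumes /v1/chat returns an OpenAI-style chat-completion body,
// e.g. (abridged, values hypothetical):
//   { "choices": [ { "message": { "role": "assistant", "content": "..." } } ] }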
// ─── Meta-review + scrum ───
async function overseerReview(
  iterNum: number,
  artifacts: any,
  models: Tasks["models"],
): Promise<string> {
  const prompt = `You are the OVERSEER reviewing iteration ${iterNum} of a stress battery run against Lakehouse /v1/respond.

For each task in the battery below, examine: status (ok/failed/blocked), iterations used, error signature, whether the in-loop overseer fired, total tokens.

Produce a PR-style meta-review in markdown with these sections:

## What worked
List specific tasks (by operation string) that completed correctly and the evidence — turns_used, citations, tokens. Be concrete.

## What failed
List specific tasks that failed or needed overseer correction. Classify: was it a real failure (impossible task), a drift we should repair, or a false positive from the test?

## Proposed changes for iteration ${iterNum + 1}
At least 3 concrete architectural changes, each with:
- **Target file** (e.g. \`crates/gateway/src/execution_loop/mod.rs\`)
- **Rationale** (what the metrics show)
- **Expected impact** (which metric should move in iter ${iterNum + 1})

Be honest about weaknesses. Do NOT propose generic best practices — reference specific observations from the artifacts below.

ARTIFACTS (iteration ${iterNum}):
${JSON.stringify(artifacts, null, 2).slice(0, 30000)}`;
  return cloudChat(models.overseer_cloud, prompt, 0.2, true);
}
async function scrumJudge(
  iterNum: number,
  review: string,
  models: Tasks["models"],
): Promise<string> {
  const prompt = `You are the SCRUM MASTER. The OVERSEER proposed these architectural changes for iteration ${iterNum + 1} based on iteration ${iterNum}'s results.

For each proposal, produce a verdict in markdown:
- **Proposal N**: <short name>
- **Verdict**: APPROVE | REVISE | REJECT
- **Reason**: why
- **If APPROVE**: is the expected impact realistic? what's the blast radius? is the target file correct?
- **If REVISE**: what should change about the proposal before applying?
- **If REJECT**: why is the proposal wrong or out of scope?

Final section:

## PR-ready changes
Bulleted list of only the APPROVE proposals, ready to apply.

Be rigorous. Don't rubber-stamp. If a proposal references a file that probably doesn't exist, REJECT and say so. If a proposal is a generic "improve X" without a concrete plan, REVISE.

OVERSEER PROPOSED:
${review.slice(0, 15000)}`;
  return cloudChat(models.overseer_cloud, prompt, 0.1, true);
}
// ─── Iteration driver ───
async function runIteration(iterNum: number, tasks: Tasks): Promise<any> {
  console.log(`\n${"═".repeat(60)}`);
  console.log(`▶ ITERATION ${iterNum}`);
  console.log(`${"═".repeat(60)}\n`);
  const iterDir = join(BATTERY_DIR, `iter_${iterNum}`);
  await mkdir(iterDir, { recursive: true });

  const runs: TaskRun[] = [];
  for (const [phaseKey, phaseName] of [
    ["alpha_baseline", "alpha"],
    ["beta_drift", "beta"],
    ["gamma_impossible", "gamma"],
  ] as const) {
    console.log(`\n── Phase ${phaseName} ──`);
    for (const task of tasks.phases[phaseKey]) {
      console.log(`${task.operation}`);
      const result = await runRespond(task, tasks.models);
      const overseerFired = (result.log ?? []).some(e => e.kind === "overseer_correction");
      console.log(
        ` status=${result.status} turns=${result.iterations}` +
        ` tokens=${result.artifact?.usage?.total_tokens ?? 0}` +
        ` overseer=${overseerFired}` +
        ` elapsed=${Math.round(result._elapsed_ms / 1000)}s`
      );
      if (result.error) console.log(` error: ${result.error.slice(0, 200)}`);
      runs.push({ task, phase: phaseName, result });
    }
  }

  // Phase δ
  console.log(`\n── Phase δ: distill outcomes_tail:20 ──`);
  const distillEvents = await runDistill("outcomes_tail:20");
  const distillFinal = [...distillEvents].reverse()
    .find(e => e.role === "final") ?? distillEvents[distillEvents.length - 1];
  const distillText = distillFinal?.text ?? JSON.stringify(distillFinal ?? {}).slice(0, 200);
  console.log(` ${distillText.split("\n")[0]}`);
  await writeFile(join(iterDir, "distill_output.txt"), distillText);
  // Metrics
  const collectPhase = (p: string) => runs.filter(r => r.phase === p);
  const phaseMetrics = (p: string) => {
    const ps = collectPhase(p);
    if (ps.length === 0) return { count: 0 };
    return {
      count: ps.length,
      ok: ps.filter(r => r.result.status === "ok").length,
      failed: ps.filter(r => r.result.status === "failed").length,
      avg_turns: ps.reduce((s, r) => s + (r.result.iterations || 0), 0) / ps.length,
      total_tokens: ps.reduce((s, r) => s + (r.result.artifact?.usage?.total_tokens ?? 0), 0),
      overseer_called: ps.filter(r => (r.result.log ?? []).some(e => e.kind === "overseer_correction")).length,
      avg_elapsed_s: ps.reduce((s, r) => s + (r.result._elapsed_ms || 0), 0) / ps.length / 1000,
    };
  };
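  // Illustrative phaseMetrics output for a 3-task phase (all values hypothetical):
  //   { count: 3, ok: 2, failed: 1, avg_turns: 2.33, total_tokens: 18400,
  //     overseer_called: 1, avg_elapsed_s: 74.5 }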
  const metrics = {
    iteration: iterNum,
    total_tasks: runs.length,
    ok_tasks: runs.filter(r => r.result.status === "ok").length,
    failed_tasks: runs.filter(r => r.result.status === "failed").length,
    blocked_tasks: runs.filter(r => r.result.status === "blocked").length,
    total_tokens: runs.reduce((s, r) => s + (r.result.artifact?.usage?.total_tokens ?? 0), 0),
    avg_turns_per_task: runs.reduce((s, r) => s + (r.result.iterations || 0), 0) / runs.length,
    overseer_called_rate: runs.filter(r => (r.result.log ?? []).some(e => e.kind === "overseer_correction")).length / runs.length,
    total_elapsed_s: runs.reduce((s, r) => s + (r.result._elapsed_ms || 0), 0) / 1000,
    by_phase: {
      alpha: phaseMetrics("alpha"),
      beta: phaseMetrics("beta"),
      gamma: phaseMetrics("gamma"),
    },
  };
  console.log(`\n── Metrics ──`);
  console.log(` total_tokens: ${metrics.total_tokens}`);
  console.log(` avg_turns_per_task: ${metrics.avg_turns_per_task.toFixed(2)}`);
  console.log(` overseer_called_rate: ${(metrics.overseer_called_rate * 100).toFixed(1)}%`);
  console.log(` ok/total: ${metrics.ok_tasks}/${metrics.total_tasks}`);
  await writeFile(join(iterDir, "runs.json"), JSON.stringify(runs, null, 2));
  await writeFile(join(iterDir, "metrics.json"), JSON.stringify(metrics, null, 2));

  // Phase ε: overseer review
  console.log(`\n── Phase ε: overseer meta-review ──`);
  const reviewInput = {
    metrics,
    task_summary: runs.map(r => ({
      operation: r.task.operation,
      phase: r.phase,
      status: r.result.status,
      iterations: r.result.iterations,
      tokens: r.result.artifact?.usage?.total_tokens ?? 0,
      overseer_called: (r.result.log ?? []).some(e => e.kind === "overseer_correction"),
      error: r.result.error ?? null,
      elapsed_s: Math.round((r.result._elapsed_ms || 0) / 1000),
    })),
  };
  const review = await overseerReview(iterNum, reviewInput, tasks.models);
  await writeFile(join(iterDir, "overseer_review.md"), review);
  console.log(`${review.length} chars`);

  // Phase ζ: scrum
  console.log(`\n── Phase ζ: scrum judgment ──`);
  const verdict = await scrumJudge(iterNum, review, tasks.models);
  await writeFile(join(iterDir, "scrum_findings.md"), verdict);
  console.log(`${verdict.length} chars`);

  return metrics;
}
// ─── Main ───
async function main() {
  const tasks = JSON.parse(
    await readFile("/home/profit/lakehouse/tests/battery/tasks.json", "utf8"),
  ) as Tasks;
  await mkdir(BATTERY_DIR, { recursive: true });

  const iterations: any[] = [];
  const batteryStart = Date.now();
  for (let i = 1; i <= 3; i++) {
    const m = await runIteration(i, tasks);
    iterations.push(m);
  }
  const batteryElapsed = (Date.now() - batteryStart) / 1000;

  // Summary
  const delta = (k: string, inverted = false) => {
    const vals = iterations.map((m: any) => m[k]);
    if (vals.some(v => v === undefined)) return "—";
    const diff = vals[2] - vals[0];
    const pct = vals[0] !== 0 ? (diff / vals[0]) * 100 : 0;
    const arrow = inverted ? (diff < 0 ? "↓ better" : "↑ worse") : (diff > 0 ? "↑ better" : "↓ worse");
    return `${arrow} (${diff > 0 ? "+" : ""}${diff.toFixed?.(2) ?? diff}, ${pct.toFixed(1)}%)`;
  };
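  // Worked example (hypothetical values): for total_tokens with inverted=true
  // and iteration values [52000, 48000, 41600], diff = -10400 and pct = -20.0,
  // so delta returns "↓ better (-10400.00, -20.0%)".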
  const rows = [
    ["total_tokens", "inverted", "want ↓ — fewer tokens for same work"],
    ["avg_turns_per_task", "inverted", "want ↓ — executor gets smarter"],
    ["overseer_called_rate", "inverted", "want ↓ — fewer cloud escalations"],
    ["ok_tasks", "normal", "want ↑ — more successes"],
    ["total_elapsed_s", "inverted", "want ↓ — faster iterations"],
  ];

  let summary = `# Compounding Stress Battery — Summary\n\n`;
  summary += `**Run:** ${new Date().toISOString()}\n`;
  summary += `**Elapsed:** ${Math.round(batteryElapsed)}s (${(batteryElapsed / 60).toFixed(1)} min)\n`;
  summary += `**Models:** executor=${tasks.models.executor_cloud}, reviewer=${tasks.models.reviewer_cloud}, overseer=${tasks.models.overseer_cloud}\n\n`;

  summary += `## Compounding Metrics\n\n`;
  summary += `| Metric | iter 1 | iter 2 | iter 3 | Trend (1→3) | Goal |\n`;
  summary += `|---|---|---|---|---|---|\n`;
  for (const [key, inv, goal] of rows) {
    const vals = iterations.map((m: any) => {
      const v = m[key];
      return typeof v === "number" ? v.toFixed(2) : String(v);
    });
    summary += `| ${key} | ${vals[0]} | ${vals[1]} | ${vals[2]} | ${delta(key, inv === "inverted")} | ${goal} |\n`;
  }
  summary += "\n";

  // Count trending metrics
  const trends = rows.map(([k, inv]) => {
    const vs = iterations.map((m: any) => m[k]) as number[];
    const improved = inv === "inverted" ? vs[2] < vs[0] : vs[2] > vs[0];
    return { metric: k, improved };
  });
  const improvedCount = trends.filter(t => t.improved).length;

  summary += `## Verdict\n\n`;
  if (improvedCount >= 3) {
    summary += `**✓ Architecture validated** — ${improvedCount}/${trends.length} compounding metrics improved from iteration 1 to 3.\n\n`;
  } else {
    summary += `**✗ Compounding NOT demonstrated** — only ${improvedCount}/${trends.length} metrics improved. See scrum_findings.md in each iter_N/ directory for the overseer's proposals and the scrum master's review of what to change.\n\n`;
  }
  summary += `Per-metric trend (iter 1 → 3):\n`;
  for (const t of trends) {
    summary += `- ${t.metric}: ${t.improved ? "✓ improved" : "✗ flat or worse"}\n`;
  }

  summary += `\n## Artifacts\n\n`;
  summary += `- \`iter_1/\`, \`iter_2/\`, \`iter_3/\` — per-iteration runs.json, metrics.json, overseer_review.md, scrum_findings.md, distill_output.txt\n`;
  summary += `- \`summary.md\` — this file\n`;
  await writeFile(join(BATTERY_DIR, "summary.md"), summary);

  console.log(`\n${"═".repeat(60)}`);
  console.log(`✓ BATTERY COMPLETE — ${Math.round(batteryElapsed)}s`);
  console.log(` Summary: ${join(BATTERY_DIR, "summary.md")}`);
  console.log(`${"═".repeat(60)}\n`);
  console.log(summary);
}
main().catch(e => {
  console.error(`\n${"═".repeat(60)}`);
  console.error(`✗ BATTERY FAILED: ${e.message}`);
  console.error(`${"═".repeat(60)}\n`);
  if (e.stack) console.error(e.stack);
  process.exit(1);
});