Surfaced by today's untracked-files audit. None of these are accidents —
several are referenced by name in CLAUDE.md and memory files but were
never added to the repo.
Categories:
- docs/PHASE_AUDIT_GUIDE.md (106 LOC) — Claude Code phase audit guidance
- ops/systemd/lakehouse-langfuse-bridge.service — Langfuse bridge unit
- package.json — top-level npm manifest
- scripts/e2e_pipeline_check.sh + production_smoke.sh — real test scripts
- reports/kimi/audit-last-week*.md — the "Two reports live" that CLAUDE.md cites
- tests/multi-agent/scenarios/ — 44 staffing scenarios (cutover decision A)
- tests/multi-agent/playbooks/ — 102 playbook records
- tests/battery/, tests/agent_test/PRD.md, tests/real-world/* — real tests
- sidecar/sidecar/{lab_ui,pipeline_lab}.py — 888 LOC dev-only UIs that
remain in service post-sidecar-drop (commit ba928b1 explicitly kept them)
Sensitivity check: scenarios use synthetic company names ("Heritage Foods",
"Cornerstone Fabrication"); audit reports describe code findings only;
no PII or secrets surfaced.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

// Compounding Stress Battery — the rigorous smoke test.
//
// Three iterations against /v1/respond, each running:
//   α baseline (3 easy tasks) — should complete local-only with boost
//   β drift (3 niche tasks) — forces executor miss → overseer fires
//   γ impossible (2 zero-supply) — must fail honestly, no token explosion
//   δ distill outcomes — writes distilled_*.jsonl + vector indexes
//   ε overseer meta-review — gpt-oss:120b judges the iteration
//   ζ scrum judgment — gpt-oss:120b reviews overseer proposals
//
// Iteration N+1 runs the same tasks as iteration N. We measure compounding:
// does turns_per_task drop? does overseer_called_rate drop? does
// correction_effective rise? If 3/5 metrics trend favorably, architecture
// validated; otherwise the scrum verdict points at what to fix.
//
// Fail-fast: every error bubbles. No silent catches — the run ABORTS with
// the underlying stack so we see exactly where the architecture broke.
//
// Runtime: ~60-90 min. Cloud cost: ~24-32 gpt-oss calls (well under daily cap).

import { writeFile, mkdir, readFile } from "node:fs/promises";
import { join } from "node:path";

const GATEWAY = process.env.GATEWAY_URL ?? "http://localhost:3100";
const LLM_TEAM = process.env.LLM_TEAM_URL ?? "http://localhost:5000";
const BATTERY_DIR = process.env.BATTERY_DIR
  ?? "/home/profit/lakehouse/data/_kb/battery";

// 10-minute timeout per /v1/respond call — cloud executor on a hard task
// can chew for a while, and we want to see real behavior, not premature aborts.
const RESPOND_TIMEOUT_MS = 10 * 60 * 1000;
const META_TIMEOUT_MS = 5 * 60 * 1000;

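// Invocation sketch: GATEWAY_URL, LLM_TEAM_URL, and BATTERY_DIR above are the
// whole configuration surface. The filename and runner below are assumptions
// for illustration, not recorded in this file:
//
//   GATEWAY_URL=http://localhost:3100 BATTERY_DIR=/tmp/battery \
//     npx tsx tests/battery/battery.ts
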
interface Task {
  task_class: string;
  operation: string;
  spec: Record<string, any>;
}

interface Tasks {
  phases: {
    alpha_baseline: Task[];
    beta_drift: Task[];
    gamma_impossible: Task[];
  };
  models: {
    executor_cloud: string;
    reviewer_cloud: string;
    overseer_cloud: string;
  };
}
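
// Shape reminder: a minimal tasks.json satisfying the Tasks interface above.
// Operation names and model ids are hypothetical placeholders, not the real
// battery spec (the header only pins the overseer to gpt-oss:120b):
//
// {
//   "phases": {
//     "alpha_baseline":   [{ "task_class": "docs", "operation": "summarize_readme",  "spec": {} }],
//     "beta_drift":       [{ "task_class": "docs", "operation": "explain_niche_api", "spec": {} }],
//     "gamma_impossible": [{ "task_class": "docs", "operation": "cite_missing_spec", "spec": {} }]
//   },
//   "models": { "executor_cloud": "…", "reviewer_cloud": "…", "overseer_cloud": "gpt-oss:120b" }
// }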

interface RunResult {
  status: "ok" | "failed" | "blocked";
  iterations: number;
  artifact: any;
  log: any[];
  error?: string | null;
  _elapsed_ms: number;
}

interface TaskRun {
  task: Task;
  phase: "alpha" | "beta" | "gamma";
  result: RunResult;
}

// ─── HTTP helpers ───

async function runRespond(task: Task, models: Tasks["models"]): Promise<RunResult> {
  const body = {
    task_class: task.task_class,
    operation: task.operation,
    spec: task.spec,
    executor_model: models.executor_cloud,
    reviewer_model: models.reviewer_cloud,
  };
  const start = Date.now();
  const resp = await fetch(`${GATEWAY}/v1/respond`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(RESPOND_TIMEOUT_MS),
  });
  if (!resp.ok) {
    const txt = await resp.text();
    throw new Error(`/v1/respond HTTP ${resp.status}: ${txt.slice(0, 500)}`);
  }
  const j = (await resp.json()) as RunResult;
  j._elapsed_ms = Date.now() - start;
  return j;
}

async function runDistill(source: string): Promise<any[]> {
  const body = { mode: "distill", prompt: "battery iteration distill", source };
  const resp = await fetch(`${LLM_TEAM}/api/run?mode=distill`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(META_TIMEOUT_MS),
  });
  if (!resp.ok) throw new Error(`distill HTTP ${resp.status}`);
  const text = await resp.text();
  // SSE stream — parse data: lines, return parsed event objects
  const events: any[] = [];
  for (const line of text.split("\n")) {
    if (!line.startsWith("data: ")) continue;
    try { events.push(JSON.parse(line.slice(6))); } catch { /* skip */ }
  }
  return events;
}
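
// Example of the SSE lines this parser consumes. The payload fields are
// assumptions inferred from how runIteration reads the events (`role`, `text`);
// the real emitter may send more:
//
//   data: {"role":"assistant","text":"reading outcomes_tail:20 …"}
//   data: {"role":"final","text":"distilled 20 outcomes → distilled_batch.jsonl"}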

async function cloudChat(
  model: string,
  prompt: string,
  temperature: number,
  think: boolean,
): Promise<string> {
  const resp = await fetch(`${GATEWAY}/v1/chat`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({
      model,
      messages: [{ role: "user", content: prompt }],
      temperature,
      think,
      provider: "ollama_cloud",
    }),
    signal: AbortSignal.timeout(META_TIMEOUT_MS),
  });
  if (!resp.ok) {
    const txt = await resp.text();
    throw new Error(`/v1/chat ${model} HTTP ${resp.status}: ${txt.slice(0, 500)}`);
  }
  const j = await resp.json() as any;
  return j.choices?.[0]?.message?.content ?? "";
}

// ─── Meta-review + scrum ───

async function overseerReview(
  iterNum: number,
  artifacts: any,
  models: Tasks["models"],
): Promise<string> {
  const prompt = `You are the OVERSEER reviewing iteration ${iterNum} of a stress battery run against Lakehouse /v1/respond.

For each task in the battery below, examine: status (ok/failed/blocked), iterations used, error signature, whether the in-loop overseer fired, total tokens.

Produce a PR-style meta-review in markdown with these sections:

## What worked
List specific tasks (by operation string) that completed correctly and the evidence — turns_used, citations, tokens. Be concrete.

## What failed
List specific tasks that failed or needed overseer correction. Classify: was it a real failure (impossible task), a drift we should repair, or a false positive from the test?

## Proposed changes for iteration ${iterNum + 1}
At least 3 concrete architectural changes, each with:
- **Target file** (e.g. \`crates/gateway/src/execution_loop/mod.rs\`)
- **Rationale** (what the metrics show)
- **Expected impact** (which metric should move in iter ${iterNum + 1})

Be honest about weaknesses. Do NOT propose generic best practices — reference specific observations from the artifacts below.

ARTIFACTS (iteration ${iterNum}):
${JSON.stringify(artifacts, null, 2).slice(0, 30000)}`;

  return cloudChat(models.overseer_cloud, prompt, 0.2, true);
}

async function scrumJudge(
  iterNum: number,
  review: string,
  models: Tasks["models"],
): Promise<string> {
  const prompt = `You are the SCRUM MASTER. The OVERSEER proposed these architectural changes for iteration ${iterNum + 1} based on iteration ${iterNum}'s results.

For each proposal, produce a verdict in markdown:

- **Proposal N**: <short name>
- **Verdict**: APPROVE | REVISE | REJECT
- **Reason**: why
- **If APPROVE**: is the expected impact realistic? what's the blast radius? is the target file correct?
- **If REVISE**: what should change about the proposal before applying?
- **If REJECT**: why is the proposal wrong or out of scope?

Final section:
## PR-ready changes
Bulleted list of only the APPROVE proposals, ready to apply.

Be rigorous. Don't rubber-stamp. If a proposal references a file that probably doesn't exist, REJECT and say so. If a proposal is a generic "improve X" without a concrete plan, REVISE.

OVERSEER PROPOSED:
${review.slice(0, 15000)}`;

  return cloudChat(models.overseer_cloud, prompt, 0.1, true);
}

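// Note: overseerReview and scrumJudge both run on models.overseer_cloud; the
// judge is the same model re-prompted at a lower temperature (0.1 vs 0.2).
// Whether that reuse is intentional isn't stated in this file; it's just what
// the two cloudChat calls above select.
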
// ─── Iteration driver ───

async function runIteration(iterNum: number, tasks: Tasks): Promise<any> {
  console.log(`\n${"═".repeat(60)}`);
  console.log(`▶ ITERATION ${iterNum}`);
  console.log(`${"═".repeat(60)}\n`);

  const iterDir = join(BATTERY_DIR, `iter_${iterNum}`);
  await mkdir(iterDir, { recursive: true });

  const runs: TaskRun[] = [];

  for (const [phaseKey, phaseName] of [
    ["alpha_baseline", "alpha"],
    ["beta_drift", "beta"],
    ["gamma_impossible", "gamma"],
  ] as const) {
    console.log(`\n── Phase ${phaseName} ──`);
    for (const task of tasks.phases[phaseKey]) {
      console.log(` ▶ ${task.operation}`);
      const result = await runRespond(task, tasks.models);
      const overseerFired = (result.log ?? []).some(e => e.kind === "overseer_correction");
      console.log(
        ` status=${result.status} turns=${result.iterations}` +
        ` tokens=${result.artifact?.usage?.total_tokens ?? 0}` +
        ` overseer=${overseerFired}` +
        ` elapsed=${Math.round(result._elapsed_ms / 1000)}s`
      );
      if (result.error) console.log(` error: ${result.error.slice(0, 200)}`);
      runs.push({ task, phase: phaseName, result });
    }
  }

  // Phase δ
  console.log(`\n── Phase δ: distill outcomes_tail:20 ──`);
  const distillEvents = await runDistill("outcomes_tail:20");
  const distillFinal = [...distillEvents].reverse()
    .find(e => e.role === "final") ?? distillEvents[distillEvents.length - 1];
  const distillText = distillFinal?.text ?? JSON.stringify(distillFinal ?? {}).slice(0, 200);
  console.log(` ${distillText.split("\n")[0]}`);
  await writeFile(join(iterDir, "distill_output.txt"), distillText);

  // Metrics
  const collectPhase = (p: string) => runs.filter(r => r.phase === p);
  const phaseMetrics = (p: string) => {
    const ps = collectPhase(p);
    if (ps.length === 0) return { count: 0 };
    return {
      count: ps.length,
      ok: ps.filter(r => r.result.status === "ok").length,
      failed: ps.filter(r => r.result.status === "failed").length,
      avg_turns: ps.reduce((s, r) => s + (r.result.iterations || 0), 0) / ps.length,
      total_tokens: ps.reduce((s, r) => s + (r.result.artifact?.usage?.total_tokens ?? 0), 0),
      overseer_called: ps.filter(r => (r.result.log ?? []).some(e => e.kind === "overseer_correction")).length,
      avg_elapsed_s: ps.reduce((s, r) => s + (r.result._elapsed_ms || 0), 0) / ps.length / 1000,
    };
  };
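
  // Illustrative phaseMetrics output for an alpha phase of 3 tasks, all ok,
  // one overseer correction (numbers are made up):
  //   { count: 3, ok: 3, failed: 0, avg_turns: 1.67,
  //     total_tokens: 5400, overseer_called: 1, avg_elapsed_s: 42.3 }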

  const metrics = {
    iteration: iterNum,
    total_tasks: runs.length,
    ok_tasks: runs.filter(r => r.result.status === "ok").length,
    failed_tasks: runs.filter(r => r.result.status === "failed").length,
    blocked_tasks: runs.filter(r => r.result.status === "blocked").length,
    total_tokens: runs.reduce((s, r) => s + (r.result.artifact?.usage?.total_tokens ?? 0), 0),
    avg_turns_per_task: runs.reduce((s, r) => s + (r.result.iterations || 0), 0) / runs.length,
    overseer_called_rate: runs.filter(r => (r.result.log ?? []).some(e => e.kind === "overseer_correction")).length / runs.length,
    total_elapsed_s: runs.reduce((s, r) => s + (r.result._elapsed_ms || 0), 0) / 1000,
    by_phase: {
      alpha: phaseMetrics("alpha"),
      beta: phaseMetrics("beta"),
      gamma: phaseMetrics("gamma"),
    },
  };

  console.log(`\n── Metrics ──`);
  console.log(` total_tokens: ${metrics.total_tokens}`);
  console.log(` avg_turns_per_task: ${metrics.avg_turns_per_task.toFixed(2)}`);
  console.log(` overseer_called_rate: ${(metrics.overseer_called_rate * 100).toFixed(1)}%`);
  console.log(` ok/total: ${metrics.ok_tasks}/${metrics.total_tasks}`);

  await writeFile(join(iterDir, "runs.json"), JSON.stringify(runs, null, 2));
  await writeFile(join(iterDir, "metrics.json"), JSON.stringify(metrics, null, 2));

  // Phase ε: overseer review
  console.log(`\n── Phase ε: overseer meta-review ──`);
  const reviewInput = {
    metrics,
    task_summary: runs.map(r => ({
      operation: r.task.operation,
      phase: r.phase,
      status: r.result.status,
      iterations: r.result.iterations,
      tokens: r.result.artifact?.usage?.total_tokens ?? 0,
      overseer_called: (r.result.log ?? []).some(e => e.kind === "overseer_correction"),
      error: r.result.error ?? null,
      elapsed_s: Math.round((r.result._elapsed_ms || 0) / 1000),
    })),
  };
  const review = await overseerReview(iterNum, reviewInput, tasks.models);
  await writeFile(join(iterDir, "overseer_review.md"), review);
  console.log(` ✓ ${review.length} chars`);

  // Phase ζ: scrum
  console.log(`\n── Phase ζ: scrum judgment ──`);
  const verdict = await scrumJudge(iterNum, review, tasks.models);
  await writeFile(join(iterDir, "scrum_findings.md"), verdict);
  console.log(` ✓ ${verdict.length} chars`);

  return metrics;
}

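// On-disk layout after a full run, derived from the writeFile calls above and
// in main() below:
//
//   $BATTERY_DIR/
//     iter_1/   runs.json · metrics.json · distill_output.txt
//               overseer_review.md · scrum_findings.md
//     iter_2/   (same files)
//     iter_3/   (same files)
//     summary.md
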
// ─── Main ───

async function main() {
  const tasks = JSON.parse(
    await readFile("/home/profit/lakehouse/tests/battery/tasks.json", "utf8"),
  ) as Tasks;

  await mkdir(BATTERY_DIR, { recursive: true });

  const iterations: any[] = [];
  const batteryStart = Date.now();

  for (let i = 1; i <= 3; i++) {
    const m = await runIteration(i, tasks);
    iterations.push(m);
  }

  const batteryElapsed = (Date.now() - batteryStart) / 1000;

  // Summary
  const delta = (k: keyof any, inverted = false) => {
    const vals = iterations.map((m: any) => m[k]);
    if (vals.some(v => v === undefined)) return "—";
    const diff = vals[2] - vals[0];
    const pct = vals[0] !== 0 ? (diff / vals[0]) * 100 : 0;
    const arrow = inverted ? (diff < 0 ? "↓ better" : "↑ worse") : (diff > 0 ? "↑ better" : "↓ worse");
    return `${arrow} (${diff > 0 ? "+" : ""}${diff.toFixed?.(2) ?? diff}, ${pct.toFixed(1)}%)`;
  };
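
  // Worked example: total_tokens falling 12000 → 9000 from iter 1 to iter 3
  // with inverted=true gives diff = -3000 and pct = -25.0%, rendered as
  // "↓ better (-3000.00, -25.0%)".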

  const rows = [
    ["total_tokens", "inverted", "want ↓ — fewer tokens for same work"],
    ["avg_turns_per_task", "inverted", "want ↓ — executor gets smarter"],
    ["overseer_called_rate", "inverted", "want ↓ — fewer cloud escalations"],
    ["ok_tasks", "normal", "want ↑ — more successes"],
    ["total_elapsed_s", "inverted", "want ↓ — faster iterations"],
  ];

  let summary = `# Compounding Stress Battery — Summary\n\n`;
  summary += `**Run:** ${new Date().toISOString()}\n`;
  summary += `**Elapsed:** ${Math.round(batteryElapsed)}s (${(batteryElapsed/60).toFixed(1)} min)\n`;
  summary += `**Models:** executor=${tasks.models.executor_cloud}, reviewer=${tasks.models.reviewer_cloud}, overseer=${tasks.models.overseer_cloud}\n\n`;

  summary += `## Compounding Metrics\n\n`;
  summary += `| Metric | iter 1 | iter 2 | iter 3 | Trend (1→3) | Goal |\n`;
  summary += `|---|---|---|---|---|---|\n`;
  for (const [key, inv, goal] of rows) {
    const vals = iterations.map((m: any) => {
      const v = m[key as string];
      return typeof v === "number" ? v.toFixed(2) : String(v);
    });
    summary += `| ${key} | ${vals[0]} | ${vals[1]} | ${vals[2]} | ${delta(key as any, inv === "inverted")} | ${goal} |\n`;
  }
  summary += "\n";

  // Count trending metrics
  const trends = rows.map(([k, inv]) => {
    const vs = iterations.map((m: any) => m[k as string]) as number[];
    const improved = inv === "inverted" ? vs[2] < vs[0] : vs[2] > vs[0];
    return { metric: k, improved };
  });
  const improvedCount = trends.filter(t => t.improved).length;
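
  // e.g. if tokens, turns, and overseer rate all drop between iter 1 and 3
  // while ok_tasks stays flat and elapsed regresses, improvedCount = 3 and the
  // verdict below still passes; 3/5 is the bar set in the header comment.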

  summary += `## Verdict\n\n`;
  if (improvedCount >= 3) {
    summary += `**✓ Architecture validated** — ${improvedCount}/${trends.length} compounding metrics improved from iteration 1 to 3.\n\n`;
  } else {
    summary += `**✗ Compounding NOT demonstrated** — only ${improvedCount}/${trends.length} metrics improved. See scrum_findings.md in each iter_N/ directory for the overseer's proposals and the scrum master's review of what to change.\n\n`;
  }

  summary += `Per-metric trend:\n`;
  for (const t of trends) {
    summary += `- ${t.metric}: ${t.improved ? "✓ improved" : "✗ flat or worse"}\n`;
  }

  summary += `\n## Artifacts\n\n`;
  summary += `- \`iter_1/\`, \`iter_2/\`, \`iter_3/\` — per-iteration runs.json, metrics.json, overseer_review.md, scrum_findings.md, distill_output.txt\n`;
  summary += `- \`summary.md\` — this file\n`;

  await writeFile(join(BATTERY_DIR, "summary.md"), summary);
  console.log(`\n${"═".repeat(60)}`);
  console.log(`✓ BATTERY COMPLETE — ${Math.round(batteryElapsed)}s`);
  console.log(` Summary: ${join(BATTERY_DIR, "summary.md")}`);
  console.log(`${"═".repeat(60)}\n`);
  console.log(summary);
}

main().catch(e => {
  console.error(`\n${"═".repeat(60)}`);
  console.error(`✗ BATTERY FAILED: ${e.message}`);
  console.error(`${"═".repeat(60)}\n`);
  if (e.stack) console.error(e.stack);
  process.exit(1);
});