lakehouse/tests/battery/compounding_battery.ts
root 41b0a99ed2 chore: add real content that was sitting untracked
Surfaced by today's untracked-files audit. None of these are accidents —
multiple are referenced by name in CLAUDE.md and memory files but were
never added.

Categories:
- docs/PHASE_AUDIT_GUIDE.md (106 LOC) — Claude Code phase audit guidance
- ops/systemd/lakehouse-langfuse-bridge.service — Langfuse bridge unit
- package.json — top-level npm manifest
- scripts/e2e_pipeline_check.sh + production_smoke.sh — real test scripts
- reports/kimi/audit-last-week*.md — the "Two reports live" that CLAUDE.md cites
- tests/multi-agent/scenarios/ — 44 staffing scenarios (cutover decision A)
- tests/multi-agent/playbooks/ — 102 playbook records
- tests/battery/, tests/agent_test/PRD.md, tests/real-world/* — real tests
- sidecar/sidecar/{lab_ui,pipeline_lab}.py — 888 LOC dev-only UIs that
  remain in service post-sidecar-drop (commit ba928b1 explicitly kept them)

Sensitivity check: scenarios use synthetic company names ("Heritage Foods",
"Cornerstone Fabrication"); audit reports describe code findings only;
no PII or secrets surfaced.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 22:22:10 -05:00

// Compounding Stress Battery — the rigorous smoke test.
//
// Three iterations against /v1/respond, each running:
// α baseline (3 easy tasks) — should complete local-only with boost
// β drift (3 niche tasks) — forces executor miss → overseer fires
// γ impossible (2 zero-supply) — must fail honestly, no token explosion
// δ distill outcomes — writes distilled_*.jsonl + vector indexes
// ε overseer meta-review — gpt-oss:120b judges the iteration
// ζ scrum judgment — gpt-oss:120b reviews overseer proposals
//
// Iteration N+1 runs the same tasks as iteration N. We measure compounding:
// does turns_per_task drop? does overseer_called_rate drop? does
// correction_effective rise? If 3/5 metrics trend favorably, architecture
// validated; otherwise the scrum verdict points at what to fix.
//
// Fail-fast: every error bubbles. No silent catches — the run ABORTS with
// the underlying stack so we see exactly where the architecture broke.
//
// Runtime: ~60-90 min. Cloud cost: ~24-32 gpt-oss calls (well under daily cap).
import { writeFile, mkdir, readFile } from "node:fs/promises";
import { join } from "node:path";
const GATEWAY = process.env.GATEWAY_URL ?? "http://localhost:3100";
const LLM_TEAM = process.env.LLM_TEAM_URL ?? "http://localhost:5000";
const BATTERY_DIR = process.env.BATTERY_DIR
  ?? "/home/profit/lakehouse/data/_kb/battery";
// 10-minute timeout per /v1/respond call — cloud executor on a hard task
// can chew for a while, and we want to see real behavior, not premature aborts.
const RESPOND_TIMEOUT_MS = 10 * 60 * 1000;
const META_TIMEOUT_MS = 5 * 60 * 1000;
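// Example invocation (the TS runner and the /tmp path are hypothetical
// illustrations, not specified by this repo):
//   GATEWAY_URL=http://localhost:3100 BATTERY_DIR=/tmp/battery \
//     npx tsx lakehouse/tests/battery/compounding_battery.ts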
interface Task {
  task_class: string;
  operation: string;
  spec: Record<string, any>;
}

interface Tasks {
  phases: {
    alpha_baseline: Task[];
    beta_drift: Task[];
    gamma_impossible: Task[];
  };
  models: {
    executor_cloud: string;
    reviewer_cloud: string;
    overseer_cloud: string;
  };
}
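// Minimal tasks.json sketch matching the Tasks interface above (the operation
// string and spec are hypothetical placeholders; gpt-oss:120b is the cloud
// model the header comment names):
//   {
//     "phases": {
//       "alpha_baseline": [{ "task_class": "kb", "operation": "summarize_doc", "spec": {} }],
//       "beta_drift": [],
//       "gamma_impossible": []
//     },
//     "models": {
//       "executor_cloud": "gpt-oss:120b",
//       "reviewer_cloud": "gpt-oss:120b",
//       "overseer_cloud": "gpt-oss:120b"
//     }
//   }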
interface RunResult {
  status: "ok" | "failed" | "blocked";
  iterations: number;
  artifact: any;
  log: any[];
  error?: string | null;
  _elapsed_ms: number;
}

interface TaskRun {
  task: Task;
  phase: "alpha" | "beta" | "gamma";
  result: RunResult;
}
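// RunResult.log is a list of loosely-typed gateway events. The only shape the
// battery relies on is the overseer-correction marker, assumed to look roughly
// like this (the exact payload is defined by the gateway, not here):
//   { kind: "overseer_correction", ... }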
// ─── HTTP helpers ───
async function runRespond(task: Task, models: Tasks["models"]): Promise<RunResult> {
  const body = {
    task_class: task.task_class,
    operation: task.operation,
    spec: task.spec,
    executor_model: models.executor_cloud,
    reviewer_model: models.reviewer_cloud,
  };
  const start = Date.now();
  const resp = await fetch(`${GATEWAY}/v1/respond`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(RESPOND_TIMEOUT_MS),
  });
  if (!resp.ok) {
    const txt = await resp.text();
    throw new Error(`/v1/respond HTTP ${resp.status}: ${txt.slice(0, 500)}`);
  }
  const j = (await resp.json()) as RunResult;
  j._elapsed_ms = Date.now() - start;
  return j;
}
async function runDistill(source: string): Promise<any[]> {
  const body = { mode: "distill", prompt: "battery iteration distill", source };
  const resp = await fetch(`${LLM_TEAM}/api/run?mode=distill`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(META_TIMEOUT_MS),
  });
  if (!resp.ok) throw new Error(`distill HTTP ${resp.status}`);
  const text = await resp.text();
  // SSE stream — parse `data:` lines, return the parsed event objects
  const events: any[] = [];
  for (const line of text.split("\n")) {
    if (!line.startsWith("data: ")) continue;
    try { events.push(JSON.parse(line.slice(6))); } catch { /* skip malformed event */ }
  }
  return events;
}
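// Sketch of the SSE lines runDistill parses — the `role`/`text` shape is
// inferred from the Phase δ reads below; payload values are hypothetical:
//   data: {"role":"step","text":"indexing outcomes_tail:20 ..."}
//   data: {"role":"final","text":"wrote distilled_0001.jsonl + vector index"}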
async function cloudChat(
  model: string,
  prompt: string,
  temperature: number,
  think: boolean,
): Promise<string> {
  const resp = await fetch(`${GATEWAY}/v1/chat`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({
      model,
      messages: [{ role: "user", content: prompt }],
      temperature,
      think,
      provider: "ollama_cloud",
    }),
    signal: AbortSignal.timeout(META_TIMEOUT_MS),
  });
  if (!resp.ok) {
    const txt = await resp.text();
    throw new Error(`/v1/chat ${model} HTTP ${resp.status}: ${txt.slice(0, 500)}`);
  }
  const j = await resp.json() as any;
  return j.choices?.[0]?.message?.content ?? "";
}
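// cloudChat assumes /v1/chat returns an OpenAI-style chat-completion body,
// e.g. (abridged, values hypothetical):
//   { "choices": [ { "message": { "role": "assistant", "content": "..." } } ] }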
// ─── Meta-review + scrum ───
async function overseerReview(
  iterNum: number,
  artifacts: any,
  models: Tasks["models"],
): Promise<string> {
  const prompt = `You are the OVERSEER reviewing iteration ${iterNum} of a stress battery run against Lakehouse /v1/respond.

For each task in the battery below, examine: status (ok/failed/blocked), iterations used, error signature, whether the in-loop overseer fired, total tokens.

Produce a PR-style meta-review in markdown with these sections:

## What worked
List specific tasks (by operation string) that completed correctly and the evidence — turns_used, citations, tokens. Be concrete.

## What failed
List specific tasks that failed or needed overseer correction. Classify: was it a real failure (impossible task), a drift we should repair, or a false positive from the test?

## Proposed changes for iteration ${iterNum + 1}
At least 3 concrete architectural changes, each with:
- **Target file** (e.g. \`crates/gateway/src/execution_loop/mod.rs\`)
- **Rationale** (what the metrics show)
- **Expected impact** (which metric should move in iter ${iterNum + 1})

Be honest about weaknesses. Do NOT propose generic best practices — reference specific observations from the artifacts below.

ARTIFACTS (iteration ${iterNum}):
${JSON.stringify(artifacts, null, 2).slice(0, 30000)}`;
  return cloudChat(models.overseer_cloud, prompt, 0.2, true);
}
async function scrumJudge(
  iterNum: number,
  review: string,
  models: Tasks["models"],
): Promise<string> {
  const prompt = `You are the SCRUM MASTER. The OVERSEER proposed these architectural changes for iteration ${iterNum + 1} based on iteration ${iterNum}'s results.

For each proposal, produce a verdict in markdown:
- **Proposal N**: <short name>
- **Verdict**: APPROVE | REVISE | REJECT
- **Reason**: why
- **If APPROVE**: is the expected impact realistic? what's the blast radius? is the target file correct?
- **If REVISE**: what should change about the proposal before applying?
- **If REJECT**: why is the proposal wrong or out of scope?

Final section:

## PR-ready changes
Bulleted list of only the APPROVE proposals, ready to apply.

Be rigorous. Don't rubber-stamp. If a proposal references a file that probably doesn't exist, REJECT and say so. If a proposal is a generic "improve X" without a concrete plan, REVISE.

OVERSEER PROPOSED:
${review.slice(0, 15000)}`;
  return cloudChat(models.overseer_cloud, prompt, 0.1, true);
}
// ─── Iteration driver ───
async function runIteration(iterNum: number, tasks: Tasks): Promise<any> {
  console.log(`\n${"═".repeat(60)}`);
  console.log(`▶ ITERATION ${iterNum}`);
  console.log(`${"═".repeat(60)}\n`);
  const iterDir = join(BATTERY_DIR, `iter_${iterNum}`);
  await mkdir(iterDir, { recursive: true });

  const runs: TaskRun[] = [];
  for (const [phaseKey, phaseName] of [
    ["alpha_baseline", "alpha"],
    ["beta_drift", "beta"],
    ["gamma_impossible", "gamma"],
  ] as const) {
    console.log(`\n── Phase ${phaseName} ──`);
    for (const task of tasks.phases[phaseKey]) {
      console.log(`${task.operation}`);
      const result = await runRespond(task, tasks.models);
      const overseerFired = (result.log ?? []).some(e => e.kind === "overseer_correction");
      console.log(
        ` status=${result.status} turns=${result.iterations}` +
        ` tokens=${result.artifact?.usage?.total_tokens ?? 0}` +
        ` overseer=${overseerFired}` +
        ` elapsed=${Math.round(result._elapsed_ms / 1000)}s`
      );
      if (result.error) console.log(` error: ${result.error.slice(0, 200)}`);
      runs.push({ task, phase: phaseName, result });
    }
  }

  // Phase δ
  console.log(`\n── Phase δ: distill outcomes_tail:20 ──`);
  const distillEvents = await runDistill("outcomes_tail:20");
  const distillFinal = [...distillEvents].reverse()
    .find(e => e.role === "final") ?? distillEvents[distillEvents.length - 1];
  const distillText = distillFinal?.text ?? JSON.stringify(distillFinal ?? {}).slice(0, 200);
  console.log(` ${distillText.split("\n")[0]}`);
  await writeFile(join(iterDir, "distill_output.txt"), distillText);
  // Metrics
  const collectPhase = (p: string) => runs.filter(r => r.phase === p);
  const phaseMetrics = (p: string) => {
    const ps = collectPhase(p);
    if (ps.length === 0) return { count: 0 };
    return {
      count: ps.length,
      ok: ps.filter(r => r.result.status === "ok").length,
      failed: ps.filter(r => r.result.status === "failed").length,
      avg_turns: ps.reduce((s, r) => s + (r.result.iterations || 0), 0) / ps.length,
      total_tokens: ps.reduce((s, r) => s + (r.result.artifact?.usage?.total_tokens ?? 0), 0),
      overseer_called: ps.filter(r => (r.result.log ?? []).some(e => e.kind === "overseer_correction")).length,
      avg_elapsed_s: ps.reduce((s, r) => s + (r.result._elapsed_ms || 0), 0) / ps.length / 1000,
    };
  };
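  // Illustrative phaseMetrics output for a 3-task phase (all values hypothetical):
  //   { count: 3, ok: 2, failed: 1, avg_turns: 2.33, total_tokens: 18400,
  //     overseer_called: 1, avg_elapsed_s: 74.5 }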
  const metrics = {
    iteration: iterNum,
    total_tasks: runs.length,
    ok_tasks: runs.filter(r => r.result.status === "ok").length,
    failed_tasks: runs.filter(r => r.result.status === "failed").length,
    blocked_tasks: runs.filter(r => r.result.status === "blocked").length,
    total_tokens: runs.reduce((s, r) => s + (r.result.artifact?.usage?.total_tokens ?? 0), 0),
    avg_turns_per_task: runs.reduce((s, r) => s + (r.result.iterations || 0), 0) / runs.length,
    overseer_called_rate: runs.filter(r => (r.result.log ?? []).some(e => e.kind === "overseer_correction")).length / runs.length,
    total_elapsed_s: runs.reduce((s, r) => s + (r.result._elapsed_ms || 0), 0) / 1000,
    by_phase: {
      alpha: phaseMetrics("alpha"),
      beta: phaseMetrics("beta"),
      gamma: phaseMetrics("gamma"),
    },
  };
  console.log(`\n── Metrics ──`);
  console.log(` total_tokens: ${metrics.total_tokens}`);
  console.log(` avg_turns_per_task: ${metrics.avg_turns_per_task.toFixed(2)}`);
  console.log(` overseer_called_rate: ${(metrics.overseer_called_rate * 100).toFixed(1)}%`);
  console.log(` ok/total: ${metrics.ok_tasks}/${metrics.total_tasks}`);
  await writeFile(join(iterDir, "runs.json"), JSON.stringify(runs, null, 2));
  await writeFile(join(iterDir, "metrics.json"), JSON.stringify(metrics, null, 2));

  // Phase ε: overseer review
  console.log(`\n── Phase ε: overseer meta-review ──`);
  const reviewInput = {
    metrics,
    task_summary: runs.map(r => ({
      operation: r.task.operation,
      phase: r.phase,
      status: r.result.status,
      iterations: r.result.iterations,
      tokens: r.result.artifact?.usage?.total_tokens ?? 0,
      overseer_called: (r.result.log ?? []).some(e => e.kind === "overseer_correction"),
      error: r.result.error ?? null,
      elapsed_s: Math.round((r.result._elapsed_ms || 0) / 1000),
    })),
  };
  const review = await overseerReview(iterNum, reviewInput, tasks.models);
  await writeFile(join(iterDir, "overseer_review.md"), review);
  console.log(`${review.length} chars`);

  // Phase ζ: scrum
  console.log(`\n── Phase ζ: scrum judgment ──`);
  const verdict = await scrumJudge(iterNum, review, tasks.models);
  await writeFile(join(iterDir, "scrum_findings.md"), verdict);
  console.log(`${verdict.length} chars`);

  return metrics;
}
// ─── Main ───
async function main() {
  const tasks = JSON.parse(
    await readFile("/home/profit/lakehouse/tests/battery/tasks.json", "utf8"),
  ) as Tasks;
  await mkdir(BATTERY_DIR, { recursive: true });

  const iterations: any[] = [];
  const batteryStart = Date.now();
  for (let i = 1; i <= 3; i++) {
    const m = await runIteration(i, tasks);
    iterations.push(m);
  }
  const batteryElapsed = (Date.now() - batteryStart) / 1000;

  // Summary
  const delta = (k: string, inverted = false) => {
    const vals = iterations.map((m: any) => m[k]);
    if (vals.some(v => v === undefined)) return "—";
    const diff = vals[2] - vals[0];
    const pct = vals[0] !== 0 ? (diff / vals[0]) * 100 : 0;
    const arrow = inverted ? (diff < 0 ? "↓ better" : "↑ worse") : (diff > 0 ? "↑ better" : "↓ worse");
    return `${arrow} (${diff > 0 ? "+" : ""}${diff.toFixed?.(2) ?? diff}, ${pct.toFixed(1)}%)`;
  };
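  // Worked example (hypothetical values): for total_tokens with inverted=true
  // and iteration values [52000, 48000, 41600], diff = -10400 and pct = -20.0,
  // so delta returns "↓ better (-10400.00, -20.0%)".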
  const rows = [
    ["total_tokens", "inverted", "want ↓ — fewer tokens for same work"],
    ["avg_turns_per_task", "inverted", "want ↓ — executor gets smarter"],
    ["overseer_called_rate", "inverted", "want ↓ — fewer cloud escalations"],
    ["ok_tasks", "normal", "want ↑ — more successes"],
    ["total_elapsed_s", "inverted", "want ↓ — faster iterations"],
  ];

  let summary = `# Compounding Stress Battery — Summary\n\n`;
  summary += `**Run:** ${new Date().toISOString()}\n`;
  summary += `**Elapsed:** ${Math.round(batteryElapsed)}s (${(batteryElapsed / 60).toFixed(1)} min)\n`;
  summary += `**Models:** executor=${tasks.models.executor_cloud}, reviewer=${tasks.models.reviewer_cloud}, overseer=${tasks.models.overseer_cloud}\n\n`;

  summary += `## Compounding Metrics\n\n`;
  summary += `| Metric | iter 1 | iter 2 | iter 3 | Trend (1→3) | Goal |\n`;
  summary += `|---|---|---|---|---|---|\n`;
  for (const [key, inv, goal] of rows) {
    const vals = iterations.map((m: any) => {
      const v = m[key];
      return typeof v === "number" ? v.toFixed(2) : String(v);
    });
    summary += `| ${key} | ${vals[0]} | ${vals[1]} | ${vals[2]} | ${delta(key, inv === "inverted")} | ${goal} |\n`;
  }
  summary += "\n";

  // Count trending metrics
  const trends = rows.map(([k, inv]) => {
    const vs = iterations.map((m: any) => m[k]) as number[];
    const improved = inv === "inverted" ? vs[2] < vs[0] : vs[2] > vs[0];
    return { metric: k, improved };
  });
  const improvedCount = trends.filter(t => t.improved).length;

  summary += `## Verdict\n\n`;
  if (improvedCount >= 3) {
    summary += `**✓ Architecture validated** — ${improvedCount}/${trends.length} compounding metrics improved from iteration 1 to 3.\n\n`;
  } else {
    summary += `**✗ Compounding NOT demonstrated** — only ${improvedCount}/${trends.length} metrics improved. See scrum_findings.md in each iter_N/ directory for the overseer's proposals and the scrum master's review of what to change.\n\n`;
  }
  summary += `Per-metric trend (iter 1 → 3):\n`;
  for (const t of trends) {
    summary += `- ${t.metric}: ${t.improved ? "✓ improved" : "✗ flat or worse"}\n`;
  }

  summary += `\n## Artifacts\n\n`;
  summary += `- \`iter_1/\`, \`iter_2/\`, \`iter_3/\` — per-iteration runs.json, metrics.json, overseer_review.md, scrum_findings.md, distill_output.txt\n`;
  summary += `- \`summary.md\` — this file\n`;
  await writeFile(join(BATTERY_DIR, "summary.md"), summary);

  console.log(`\n${"═".repeat(60)}`);
  console.log(`✓ BATTERY COMPLETE — ${Math.round(batteryElapsed)}s`);
  console.log(` Summary: ${join(BATTERY_DIR, "summary.md")}`);
  console.log(`${"═".repeat(60)}\n`);
  console.log(summary);
}
main().catch(e => {
  console.error(`\n${"═".repeat(60)}`);
  console.error(`✗ BATTERY FAILED: ${e.message}`);
  console.error(`${"═".repeat(60)}\n`);
  if (e.stack) console.error(e.stack);
  process.exit(1);
});