remove 5 orphaned dev experiments from tests/real-world/
Per J: "all of our test code ended up in the main." These are 5 one-time
dev experiments that were never wired into any automation and have zero
live consumers in the production code path. Deleting them.
Removed (1418 LOC total):
- enrich_prd_pipeline.ts (528 LOC) — Phase 21 architecture stress test
- nine_consecutive_audits.ts (185 LOC) — empirical study of audit compounding
- hard_task_escalation.ts (267 LOC) — escalation-ladder test (refs retired
  cloud models gpt-oss:20b/120b)
- autonomous_loop.ts (214 LOC) — wrapper experiment around scrum_master_pipeline + scrum_applier
- consensus_reducer_design.ts (224 LOC) — N=3 design consultation; output JSON
  referenced from a pathway_memory.rs comment, but the script itself has no
  consumer
Verified: 0 references in package.json / justfile / Makefile / any
production .ts/.rs/.sh file. The single mention from pathway_memory.rs
is a //! doc comment referencing the JSON output
(data/_kb/consensus_reducer_design_*.json), not the script. Build clean
post-delete.
KEPT:
- scrum_master_pipeline.ts — referenced from observer.ts, vectord, scripts
- scrum_applier.ts — referenced from auditor schemas
If you need any of these back, they're in git history. A cherry-pick or
git show HEAD~1:tests/real-world/<file>.ts will recover the source.
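For example, to pull one back from the parent commit:

    git show bb5a3b3f5e:tests/real-world/autonomous_loop.ts > autonomous_loop.ts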
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
parent bb5a3b3f5e
commit 6aafd41785
tests/real-world/autonomous_loop.ts
@@ -1,214 +0,0 @@
#!/usr/bin/env bun
// Autonomous scrum loop — wraps scrum_master_pipeline.ts + scrum_applier.ts
// in a goal-driven retry loop. Observer is POSTed an iteration summary at
// every boundary so it can build meta-commentary outside the loop's epistemic
// scope.
//
// Usage:
//   LOOP_TARGETS="crates/a/src/x.rs,crates/b/src/y.rs" \
//   LOOP_MAX_ITERS=5 \
//   LOOP_PUSH=1 \
//   bun run tests/real-world/autonomous_loop.ts
//
// Stop conditions: max_iters reached OR 2 consecutive iters with 0 commits.

import { spawn } from "node:child_process";
import { appendFile, readFile } from "node:fs/promises";
import { existsSync } from "node:fs";

const REPO = "/home/profit/lakehouse";
const OBSERVER = process.env.LOOP_OBSERVER ?? "http://localhost:3800";
const BRANCH = process.env.LOOP_BRANCH ?? "scrum/auto-apply-19814";
const MAX_ITERS = Number(process.env.LOOP_MAX_ITERS ?? 3);
const PUSH = process.env.LOOP_PUSH === "1";
const MIN_CONF = process.env.LOOP_MIN_CONF ?? "85";
// Optional override — when unset, let scrum_applier.ts use ITS default
// (currently x-ai/grok-4.1-fast on openrouter). The prior hardcoded
// qwen3-coder:480b default was clobbering the applier's own default
// and forcing every iter to hit the throttled ollama_cloud account.
const APPLIER_MODEL = process.env.LOOP_APPLIER_MODEL;
const APPLIER_PROVIDER = process.env.LOOP_APPLIER_PROVIDER;
const TARGETS = (process.env.LOOP_TARGETS ?? "crates/queryd/src/service.rs,crates/gateway/src/main.rs,crates/gateway/src/v1/mod.rs")
  .split(",").map(s => s.trim()).filter(Boolean);

const FORENSIC = process.env.LH_SCRUM_FORENSIC ?? `${REPO}/docs/SCRUM_FORENSIC_PROMPT.md`;
const PROPOSAL = process.env.LH_SCRUM_PROPOSAL ?? `${REPO}/docs/SCRUM_FIX_WAVE.md`;

const LOOP_ID = `loop_${Date.now().toString(36)}`;
const JOURNAL = `${REPO}/data/_kb/autonomous_loops.jsonl`;

interface IterResult {
  iter: number;
  scrum_reviews_added: number;
  applier_outcomes: Record<string, number>;
  commits_landed: number;
  commit_shas: string[];
  build_status: "green" | "red" | "unknown";
  duration_ms: number;
}

function log(msg: string) {
  const ts = new Date().toISOString().slice(11, 19);
  console.log(`[loop ${LOOP_ID} ${ts}] ${msg}`);
}

function runCmd(cmd: string, args: string[], env: Record<string, string> = {}): Promise<{ code: number; stdout: string; stderr: string }> {
  return new Promise((resolve) => {
    const child = spawn(cmd, args, { cwd: REPO, env: { ...process.env, ...env } });
    let stdout = "", stderr = "";
    child.stdout.on("data", (d) => { stdout += d; process.stdout.write(d); });
    child.stderr.on("data", (d) => { stderr += d; process.stderr.write(d); });
    child.on("close", (code) => resolve({ code: code ?? -1, stdout, stderr }));
  });
}

async function countLines(path: string): Promise<number> {
  if (!existsSync(path)) return 0;
  const text = await readFile(path, "utf8");
  return text.split("\n").filter(Boolean).length;
}
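// Note: countLines counts non-empty lines only, so trailing newlines in the
// JSONL journals don't skew the before/after row deltas taken in runIter.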

async function gitHeadSha(): Promise<string> {
  const r = await runCmd("git", ["rev-parse", "HEAD"]);
  return r.stdout.trim();
}

async function commitsSince(baseSha: string): Promise<string[]> {
  const r = await runCmd("git", ["log", "--oneline", `${baseSha}..HEAD`]);
  return r.stdout.trim().split("\n").filter(Boolean);
}

async function cargoCheckGreen(): Promise<boolean> {
  log("cargo check --workspace …");
  const r = await runCmd("cargo", ["check", "--workspace", "--quiet"]);
  return r.code === 0;
}

async function postObserver(payload: object) {
  try {
    const r = await fetch(`${OBSERVER}/event`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(payload),
      signal: AbortSignal.timeout(5000),
    });
    if (!r.ok) log(`observer POST returned ${r.status}`);
  } catch (e: any) {
    log(`observer POST failed: ${e.message}`);
  }
}

async function runIter(iter: number, baseSha: string): Promise<IterResult> {
  const t0 = Date.now();
  log(`══ iter ${iter} start (base ${baseSha.slice(0, 8)}) targets=${TARGETS.length}`);

  const reviewsBefore = await countLines(`${REPO}/data/_kb/scrum_reviews.jsonl`);
  const applyBefore = await countLines(`${REPO}/data/_kb/auto_apply.jsonl`);

  log(`scrum_master_pipeline.ts → ${TARGETS.length} files`);
  await runCmd("bun", ["run", "tests/real-world/scrum_master_pipeline.ts"], {
    LH_SCRUM_FILES: TARGETS.join(","),
    LH_SCRUM_FORENSIC: FORENSIC,
    LH_SCRUM_PROPOSAL: PROPOSAL,
  });

  log(`scrum_applier.ts COMMIT=1 MIN_CONF=${MIN_CONF} files=${TARGETS.length}`);
  // Only forward model/provider when explicitly overridden — otherwise
  // let scrum_applier.ts use its own defaults (Grok 4.1 fast on openrouter).
  const applierEnv: Record<string, string> = {
    LH_APPLIER_COMMIT: "1",
    LH_APPLIER_MIN_CONF: MIN_CONF,
    LH_APPLIER_MAX_FILES: String(TARGETS.length),
    LH_APPLIER_BRANCH: BRANCH,
    // Constrain applier to THIS iter's targets so it patches what we
    // just reviewed instead of the highest-confidence file from history.
    LH_APPLIER_FILES: TARGETS.join(","),
  };
  if (APPLIER_MODEL) applierEnv.LH_APPLIER_MODEL = APPLIER_MODEL;
  if (APPLIER_PROVIDER) applierEnv.LH_APPLIER_PROVIDER = APPLIER_PROVIDER;
  await runCmd("bun", ["run", "tests/real-world/scrum_applier.ts"], applierEnv);

  const reviewsAfter = await countLines(`${REPO}/data/_kb/scrum_reviews.jsonl`);
  const applyAfterText = existsSync(`${REPO}/data/_kb/auto_apply.jsonl`)
    ? await readFile(`${REPO}/data/_kb/auto_apply.jsonl`, "utf8")
    : "";
  const applyRows = applyAfterText.split("\n").filter(Boolean).slice(applyBefore);
  const outcomes: Record<string, number> = {};
  for (const line of applyRows) {
    try {
      const o = JSON.parse(line);
      outcomes[o.action ?? "?"] = (outcomes[o.action ?? "?"] ?? 0) + 1;
    } catch { /* skip malformed */ }
  }

  const commitShas = await commitsSince(baseSha);
  const buildStatus = commitShas.length > 0 ? (await cargoCheckGreen() ? "green" : "red") : "unknown";

  const result: IterResult = {
    iter,
    scrum_reviews_added: reviewsAfter - reviewsBefore,
    applier_outcomes: outcomes,
    commits_landed: commitShas.length,
    commit_shas: commitShas.map(s => s.split(" ")[0]),
    build_status: buildStatus,
    duration_ms: Date.now() - t0,
  };

  log(`iter ${iter} done — reviews+${result.scrum_reviews_added} commits=${result.commits_landed} build=${buildStatus} (${(result.duration_ms / 1000).toFixed(1)}s)`);

  await postObserver({
    source: "autonomous_loop",
    loop_id: LOOP_ID,
    event_kind: "iteration_complete",
    iter,
    targets: TARGETS,
    success: buildStatus !== "red",
    scrum_reviews_added: result.scrum_reviews_added,
    applier_outcomes: result.applier_outcomes,
    commits_landed: result.commits_landed,
    commit_shas: result.commit_shas,
    build_status: buildStatus,
    duration_ms: result.duration_ms,
    ts: new Date().toISOString(),
  });

  await appendFile(JOURNAL, JSON.stringify({ loop_id: LOOP_ID, ...result, ts: new Date().toISOString() }) + "\n");
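  // A journal row is one JSON object per line, e.g. (illustrative values):
  // {"loop_id":"loop_m9x","iter":1,"scrum_reviews_added":3,"applier_outcomes":{"committed":2},
  //  "commits_landed":2,"commit_shas":["abc1234","def5678"],"build_status":"green","duration_ms":84210,"ts":"..."}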

  return result;
}

async function main() {
  log(`autonomous loop starting · branch=${BRANCH} max_iters=${MAX_ITERS} push=${PUSH}`);
  log(`targets: ${TARGETS.join(", ")}`);

  const branchR = await runCmd("git", ["branch", "--show-current"]);
  if (branchR.stdout.trim() !== BRANCH) {
    log(`ERROR: on branch ${branchR.stdout.trim()}, expected ${BRANCH}. Refusing to run.`);
    process.exit(1);
  }

  let consecutiveZero = 0;
  for (let iter = 1; iter <= MAX_ITERS; iter++) {
    const baseSha = await gitHeadSha();
    const result = await runIter(iter, baseSha);

    if (PUSH && result.commits_landed > 0) {
      log(`git push origin ${BRANCH}`);
      const pushR = await runCmd("git", ["push", "origin", BRANCH]);
      if (pushR.code !== 0) log(`push failed (continuing): ${pushR.stderr.slice(0, 200)}`);
    }

    consecutiveZero = result.commits_landed === 0 ? consecutiveZero + 1 : 0;
    if (consecutiveZero >= 2) {
      log(`STOP: 2 consecutive iters with 0 commits. Loop converged or stuck.`);
      break;
    }
  }

  log(`loop ${LOOP_ID} complete. Journal: ${JOURNAL}`);
}

main().catch((e) => {
  log(`FATAL: ${e.message}`);
  process.exit(1);
});
tests/real-world/consensus_reducer_design.ts
@@ -1,224 +0,0 @@
// consensus_reducer_design.ts — N=3 design consultation.
//
// J's ask: enhance the tree-split reducer to preserve FULL backtrack-able
// context (endpoints tried, attempt count per model in the ladder, KB
// sources retrieved, context7 bridge hits, MCP observer signals, audit
// verdicts) instead of collapsing to a summary. Then index the full
// context through our existing vectord matrix indexing (HNSW + Lance +
// playbook_memory) so successful pathways become hot-swappable — the
// system asks "what did we try, what worked, in what order" for a
// similar task class and gets a ranked playbook back.
//
// Before building, consult three diverse models and print their proposals
// side-by-side so we can pick the convergent design.

const GATEWAY = "http://localhost:3100";

const DESIGN_BRIEF = `
# Context — Lakehouse signal→commit loop

We run 6x scrum-master iterations that audit Rust crates for PRD
alignment, produce findings + confidence, and feed an auto-applier that
lands small mechanical commits through a cargo-green-and-warning-stable
gate. Key components:

- \`tests/real-world/scrum_master_pipeline.ts\` — orchestrator. 9-rung
  model LADDER (kimi-k2:1t → qwen3-coder:480b → deepseek → mistral-large
  → gpt-oss:120b → qwen3.5:397b → openrouter free rescues → local
  qwen3.5:latest). Each target file retrieves 5 PRD chunks + 5
  proposal-doc chunks via vectord RAG, tree-splits large files into 3.5K
  shards, asks each rung in order, accepts first response passing
  structural checks.
- \`mcp-server/observer.ts\` — receives scrum \`/event\` emissions
  (file, verdict, critical_failures_count, gradient_tier, attempts,
  reviewer_model, tree_split_fired). Escalates failure clusters to LLM
  Team by POSTing to /v1/chat with qwen3-coder:480b.
- \`context7-bridge\` — external library docs lookup.
- \`auditor/audit.ts\` — independent N=3 consensus re-check of scrum
  findings; writes to data/_kb/audit_facts.jsonl via LLM Team
  \`/api/run?mode=extract\`.
- \`crates/vectord/src/playbook_memory.rs\` — indexing for proven
  playbooks: PlaybookEntry, DocRef, FailureRecord, BoostEntry,
  PatternReport. Uses HNSW index + Lance columnar backend + promotion
  pipeline. Already battle-tested for workers/staffing queries.
- Tree-split REDUCER: after shards return map-style summaries, they are
  concatenated with internal §N§ markers and fed to a reviewer model to
  produce ONE file-level review. Currently the reducer sees summaries,
  not the full context behind each shard's conclusion.

# The problem

The reducer currently TRUNCATES to a short summary. When the auditor or
a future iteration wants to backtrack WHY the reducer concluded what it
did — which attempt succeeded, which failed, what KB chunks were
retrieved, what observer signal classified the file as LOOPING vs
CONVERGING — that context is lost. So:

1. Auditor can't verify citation provenance beyond the summary line.
2. Applier can't tell a "tried X, failed, qwen fixed it" playbook from a
   "tried X and it was easy" playbook — they look identical downstream.
3. The matrix indexing is only used for RAG chunks during the scrum
   pass, NOT for storing the full end-to-end pathway of a successful
   review.

# The design question

Propose an enhanced reducer + indexing design that:

(a) Preserves the FULL backtrack context per reviewed file:
    - every ladder attempt (model, ms, accepted_y/n, reject_reason)
    - every retrieved KB chunk (source doc, chunk id, cosine score, rank)
    - every observer signal (class, priors, prior-iter outcomes)
    - every context7 bridge hit (library, version pulled)
    - every sub-pipeline call (LLM Team extract results, audit consensus)

(b) Stores this pathway into vectord's matrix indexing alongside the
    review verdict so it becomes retrievable by similarity. When a new
    file's fingerprint (task_class + file-path prefix + signal class)
    matches a past successful pathway, the system can hot-swap by
    replaying or short-circuiting to the model/KB combo that worked.

(c) Surfaces the matrix-index hit rate as a feedback signal on the
    scrum's UI — "this file was solved 3 times before by the same ladder
    rung; consider short-circuiting to rung 5."

(d) Is compatible with the existing playbook_memory.rs primitives
    (PlaybookEntry, DocRef, FailureRecord, BoostEntry) — extend don't
    replace. The indexing layer is in production for workers/staffing;
    we want the reducer pathway to piggyback on proven infrastructure.

# Constraints
- NO new crate. Extend vectord + scrum_master_pipeline.
- Full context can be LARGE — a reviewed file might have 5 retrievals,
  4 ladder attempts, 8 observer priors. Design the embedding /
  fingerprint so similar-but-not-identical pathways cluster.
- The reducer summary is still needed for the reviewer LLM input —
  don't remove it, ADD the full-context sidecar.
- Audit trail: every pathway must be replayable deterministically from
  what's stored (i.e., enough context to re-run without the original
  prompt cache).

# Required output (STRICT JSON, no prose, no markdown fences):

{
  "approach": "one-paragraph summary of your proposed design",
  "data_model": {
    "new_fields_on_playbook_entry": [...],
    "new_types": [ {"name": "...", "purpose": "...", "fields": [...]} ]
  },
  "storage_strategy": {
    "what_to_vectorize": "the text that becomes the embedding",
    "fingerprint_key": "the deterministic key for similarity retrieval",
    "backend": "HNSW, Lance, playbook_memory — pick"
  },
  "reducer_changes": {
    "inputs_added": [...],
    "outputs_added": [...],
    "compatibility_notes": "how existing callers stay working"
  },
  "hot_swap_logic": "concrete rule for when to skip the ladder and replay a past pathway",
  "ui_signal": "what to surface so J sees whether matrix indexing is earning its keep",
  "risks": [...],
  "why_this_beats_summarization": "one-paragraph argument"
}
`.trim();

interface Probe {
  name: string;
  provider: "ollama" | "ollama_cloud" | "openrouter";
  model: string;
}

// Round-3 probe set — 4 probes covering the remaining ladder rungs +
// architecture/provider diversity. J wanted all 4 of the untouched
// options so the aggregated 10-model signal is saturated across the
// usable ladder.
const PROBES: Probe[] = [
  { name: "qwen35-397b", provider: "ollama_cloud", model: "qwen3.5:397b" },
  { name: "openrouter-gpt-oss", provider: "openrouter", model: "openai/gpt-oss-120b:free" },
  { name: "openrouter-gemma3", provider: "openrouter", model: "google/gemma-3-27b-it:free" },
  { name: "qwen3-coder-480b-2", provider: "ollama_cloud", model: "qwen3-coder:480b" }, // second probe of the coding specialist — stability check
];

async function ask(p: Probe): Promise<{ name: string; raw: string; ms: number; error?: string }> {
  const started = Date.now();
  try {
    const r = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        provider: p.provider,
        model: p.model,
        messages: [
          { role: "system", content: "You are a senior architect. Output STRICT JSON only." },
          { role: "user", content: DESIGN_BRIEF },
        ],
        max_tokens: 3000,
        temperature: 0,
      }),
    });
    const ms = Date.now() - started;
    if (!r.ok) return { name: p.name, raw: "", ms, error: `HTTP ${r.status}: ${(await r.text()).slice(0, 200)}` };
    const j = await r.json();
    const content = j.content ?? j.message?.content ?? j.choices?.[0]?.message?.content ?? "";
    return { name: p.name, raw: String(content), ms };
  } catch (e: any) {
    return { name: p.name, raw: "", ms: Date.now() - started, error: String(e).slice(0, 200) };
  }
}

function extractJson(raw: string): any | null {
  let s = raw.trim();
  const fence = s.match(/^```(?:json)?\s*/);
  if (fence) s = s.slice(fence[0].length);
  if (s.endsWith("```")) s = s.slice(0, -3).trim();
  const first = s.indexOf("{");
  const last = s.lastIndexOf("}");
  if (first < 0 || last <= first) return null;
  try {
    return JSON.parse(s.slice(first, last + 1));
  } catch {
    return null;
  }
}
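// e.g. extractJson("```json\n{\"approach\": \"sidecar\"}\n```") yields
// { approach: "sidecar" }; prose before the first "{" or after the last
// "}" is ignored, and an unparseable body comes back as null.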

function summarize(obj: any, max = 240): string {
  if (obj == null) return "(no JSON parsed)"; // null/undefined only; 0 and "" are real values
  if (typeof obj === "string") return obj.length > max ? obj.slice(0, max) + "…" : obj;
  if (Array.isArray(obj)) return obj.map((x) => summarize(x, max)).join("; ");
  if (typeof obj !== "object") return String(obj); // numbers/booleans would otherwise vanish through Object.entries
  return Object.entries(obj)
    .map(([k, v]) => `${k}=${summarize(v, max)}`)
    .join(" | ");
}
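// e.g. summarize({ approach: "sidecar", risks: ["size", "drift"] })
// renders as: approach=sidecar | risks=size; drift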

async function main() {
  console.log(`\n── N=${PROBES.length} design consensus ──`);
  console.log(`models: ${PROBES.map((p) => p.model).join(", ")}\n`);

  const results = await Promise.all(PROBES.map(ask));

  for (const r of results) {
    console.log(`\n── ${r.name} (${r.ms}ms) ──`);
    if (r.error) { console.log(`  ERROR: ${r.error}`); continue; }
    const j = extractJson(r.raw);
    if (!j) {
      console.log(`  raw (no JSON): ${r.raw.slice(0, 600)}…`);
      continue;
    }
    console.log(`  approach: ${summarize(j.approach, 400)}`);
    console.log(`  fingerprint: ${summarize(j.storage_strategy?.fingerprint_key, 200)}`);
    console.log(`  vectorize: ${summarize(j.storage_strategy?.what_to_vectorize, 200)}`);
    console.log(`  backend: ${summarize(j.storage_strategy?.backend, 200)}`);
    console.log(`  hot_swap: ${summarize(j.hot_swap_logic, 300)}`);
    console.log(`  new_types: ${summarize(j.data_model?.new_types, 400)}`);
    console.log(`  risks: ${summarize(j.risks, 300)}`);
    console.log(`  why>summary: ${summarize(j.why_this_beats_summarization, 300)}`);
  }

  // Write full JSON to disk so we can inspect later.
  const outPath = `/home/profit/lakehouse/data/_kb/consensus_reducer_design_${Date.now().toString(36)}.json`;
  await Bun.write(outPath, JSON.stringify(results, null, 2));
  console.log(`\nfull responses → ${outPath}`);
}

await main();
tests/real-world/enrich_prd_pipeline.ts
@@ -1,528 +0,0 @@
// Real-world architecture stress test — 6 iterations of the full pipeline
// against the PRD as a corpus. Goal: prove at scale what Phase 21
// promised (context continuation + tree-split), plus Phase 19
// compounding across iterations.
//
// Run: bun run tests/real-world/enrich_prd_pipeline.ts
//
// No mocks. No skipped layers. On any error, the test triggers
// cloud-rescue rather than fail — it's the architecture's job to
// recover. The test FAILS only if we can't complete 6 iterations.

import { readFile, writeFile, mkdir } from "node:fs/promises";
import { createHash } from "node:crypto";

const PRD_PATH = "/home/profit/lakehouse/docs/PRD.md";
const CHUNK_SIZE = 800;            // chars per chunk — ~200 tokens
const CHUNK_OVERLAP = 120;
const TOP_K_RETRIEVE = 12;         // chunks per iteration — pulled up to force overflow
const CONTEXT_BUDGET_CHARS = 4000; // tight budget — forces tree-split on every iter
const INJECT_FAIL_ON_ITER = 3;     // force the TASK-retry loop on iter 3

// Continuation controls (per-cloud-call) — used for output-overflow.
// Separate from the task-retry loop (per-task) — that handles errors
// across attempts.
const PRIMARY_MAX_TOKENS = 150;      // tight — forces truncation
const CONTINUATION_MAX_TOKENS = 300; // each continuation doubles headroom
const MAX_CONTINUATIONS = 6;         // max stitch pieces per cloud call

// Task-level retry loop (J's clarification, 2026-04-22):
// When a TASK errors, retry the whole task up to 6 times. Each
// retry gets prior attempts' failures injected as learning context,
// so attempt N+1 is informed by what N failed at. The loop caps at
// 6 to avoid infinite spinning on genuinely unsolvable tasks.
const MAX_TASK_RETRIES = 6;

// To FORCE the retry loop on iter INJECT_FAIL_ON_ITER, cycle through
// 5 deliberately-invalid models + 1 valid one. Attempts 1-5 will
// 502/404 from Ollama Cloud; attempt 6 finally succeeds. Proves the
// loop fires all 6 with compounding failure context.
const FORCE_RETRY_MODEL_SEQUENCE = [
  "deliberately-invalid-model-attempt-1",
  "deliberately-invalid-model-attempt-2",
  "deliberately-invalid-model-attempt-3",
  "deliberately-invalid-model-attempt-4",
  "deliberately-invalid-model-attempt-5",
  "gpt-oss:20b", // 6th attempt succeeds
];
const GATEWAY = "http://localhost:3100";
const SIDECAR = "http://localhost:3200";
const CLOUD_MODEL = "gpt-oss:120b";
const RESCUE_MODEL = "gpt-oss:20b"; // fallback local cloud model via sidecar
const RUN_NONCE = Date.now().toString(36);
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/${RUN_NONCE}`;

// The 6 progressively-compounding questions. #6 explicitly requires
// synthesis across prior 5 answers.
const QUESTIONS: string[] = [
  "Summarize the Lakehouse project's one-paragraph thesis: what problem does it solve, what's the unique approach?",
  "How does Phase 19 playbook memory turn successful fills into a signal that boosts future rankings?",
  "Explain the role of Phase 24 observer in the learning loop — what does it see, and what does it feed into?",
  "What's the VRAM-aware profile swap mechanism in Phase 17, and why does it matter for multi-model serving?",
  "How do Phase 25 validity windows and Phase 27 playbook versioning interact when a schema drifts?",
  "Synthesize the prior 5 answers: how do the pieces (playbook memory, observer, profile swap, validity windows, versioning) compose into a system that measurably gets smarter over time? Cite specific prior answers.",
];

type Chunk = { id: string; text: string; embedding: number[]; offset: number };

interface IterationResult {
  iteration: number;
  question: string;
  retrieval_top_k: number;
  context_chars_before_budget: number;
  tree_split_fired: boolean;
  cloud_calls_total: number;
  continuation_retries: number;
  rescue_triggered: boolean;
  // Task-level retry telemetry
  task_attempts_made: number; // how many attempts fired (1 = first succeeded)
  task_retry_history: Array<{ n: number; model: string; error: string }>;
  playbook_id: string | null;
  tokens_prompt: number;
  tokens_completion: number;
  citations_from_prior_iterations: string[];
  duration_ms: number;
  answer_preview: string;
  errors_recovered: string[];
}

function log(msg: string) { console.log(`[enrich] ${msg}`); }

function sleep(ms: number) { return new Promise(r => setTimeout(r, ms)); }

function cosine(a: number[], b: number[]): number {
  let dot = 0, na = 0, nb = 0;
  for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
  if (na === 0 || nb === 0) return 0;
  return dot / (Math.sqrt(na) * Math.sqrt(nb));
}

function hash(s: string): string { return createHash("sha256").update(s).digest("hex").slice(0, 10); }

async function embedBatch(texts: string[]): Promise<number[][]> {
  // Sidecar /embed accepts a list. On partial failure, retry individually.
  const r = await fetch(`${SIDECAR}/embed`, {
    method: "POST", headers: { "content-type": "application/json" },
    body: JSON.stringify({ texts }),
    signal: AbortSignal.timeout(120000),
  });
  if (!r.ok) throw new Error(`embed batch ${r.status}: ${await r.text()}`);
  const j: any = await r.json();
  return j.embeddings;
}

function chunkText(text: string): Array<{ text: string; offset: number }> {
  const out: Array<{ text: string; offset: number }> = [];
  let i = 0;
  while (i < text.length) {
    const end = Math.min(i + CHUNK_SIZE, text.length);
    const slice = text.slice(i, end).trim();
    if (slice.length > 50) out.push({ text: slice, offset: i });
    if (end >= text.length) break;
    i = end - CHUNK_OVERLAP;
  }
  return out;
}
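// e.g. a 2000-char PRD with CHUNK_SIZE=800 / CHUNK_OVERLAP=120 yields
// chunks at offsets 0, 680, and 1360: each window starts 680 chars after
// the last, so neighboring chunks share a 120-char seam.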

async function chat(opts: {
  provider: "ollama" | "ollama_cloud",
  model: string,
  messages: Array<{ role: string; content: string }>,
  max_tokens: number,
  think: boolean,
}): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; finish_reason: string }> {
  const r = await fetch(`${GATEWAY}/v1/chat`, {
    method: "POST", headers: { "content-type": "application/json" },
    body: JSON.stringify({ ...opts }),
    signal: AbortSignal.timeout(180000),
  });
  if (!r.ok) throw new Error(`/v1/chat ${r.status}: ${await r.text()}`);
  const j: any = await r.json();
  return {
    content: j.choices?.[0]?.message?.content ?? "",
    prompt_tokens: j.usage?.prompt_tokens ?? 0,
    completion_tokens: j.usage?.completion_tokens ?? 0,
    finish_reason: j.choices?.[0]?.finish_reason ?? "?",
  };
}

// ─── Tree-split over oversized chunk set ──────────────────────────
async function treeSplitSummarize(
  chunks: Chunk[],
  question: string,
): Promise<{ scratchpad: string; cloud_calls: number }> {
  // Shard into groups fitting within half the budget each.
  const perShard = Math.max(1, Math.floor((CONTEXT_BUDGET_CHARS / 2) / CHUNK_SIZE));
  const shards: Chunk[][] = [];
  for (let i = 0; i < chunks.length; i += perShard) {
    shards.push(chunks.slice(i, i + perShard));
  }
  log(`  tree-split: ${chunks.length} chunks → ${shards.length} shards of up to ${perShard}`);
  let scratchpad = "";
  let cloud_calls = 0;
  for (let si = 0; si < shards.length; si++) {
    const shard = shards[si];
    const shardText = shard.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n");
    const userMsg = `Question: ${question}\n\nShard ${si + 1}/${shards.length} of source material:\n\n${shardText}\n\nScratchpad so far:\n${scratchpad || "(empty)"}\n\nUpdate the scratchpad: extract only facts from THIS shard that help answer the question. Be terse. No prose.`;
    const r = await chat({
      provider: "ollama_cloud",
      model: CLOUD_MODEL,
      messages: [
        { role: "system", content: "You maintain a concise factual scratchpad across multiple shards of source text. No prose outside the scratchpad. Each shard, append ≤80 words of relevant facts." },
        { role: "user", content: userMsg },
      ],
      max_tokens: 500,
      think: false,
    });
    cloud_calls += 1;
    scratchpad += `\n--- shard ${si + 1} notes ---\n${r.content.trim()}`;
    if (scratchpad.length > CONTEXT_BUDGET_CHARS) {
      // keep the newest tail of the scratchpad; the oldest notes drop first
      scratchpad = scratchpad.slice(-CONTEXT_BUDGET_CHARS);
      log(`  tree-split: scratchpad truncated to ${scratchpad.length} chars`);
    }
  }
  return { scratchpad, cloud_calls };
}
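// With the defaults above, perShard = floor((4000 / 2) / 800) = 2, so the
// 12 retrieved chunks fold into 6 sequential shard calls and the final
// answer prompt sees only the distilled scratchpad, never the raw chunks.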

// ─── Continuable generate — up to max_continuations stitches ──────
//
// Two failure modes handled:
//   A) Empty response — typically thinking model burned the budget
//      on hidden reasoning. Retry the same prompt with the larger
//      CONTINUATION_MAX_TOKENS budget (2× the primary).
//   B) Truncated response (finish_reason=length) — answer got cut off
//      mid-sentence. Pass the partial back as an assistant turn and
//      ask the model to continue from where it stopped.
//
// Stitching: keep appending content across retries; prompt_tokens and
// completion_tokens accumulate; finish_reason reflects the LAST call.
// Loop exits on the first call that finishes cleanly (stop) with
// non-empty content, OR when retries hit the cap.
async function generateContinuable(
  opts: Parameters<typeof chat>[0] & { max_continuations?: number },
): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string }> {
  const maxCont = opts.max_continuations ?? 1;
  let total = await chat(opts);
  let retries = 0;
  while (retries < maxCont && (total.content.length === 0 || total.finish_reason === "length")) {
    retries += 1;
    const mode = total.content.length === 0 ? "empty" : "truncated";
    log(`  continuation retry ${retries}/${maxCont} (${mode}: finish=${total.finish_reason}, content=${total.content.length} chars)`);
    // Continuation prompt — branch on failure mode:
    //   empty  → retry with the bigger continuation budget, same prompt
    //   length → pass the partial as assistant turn, ask to continue
    const continuationMessages = total.content.length === 0
      ? opts.messages
      : [
          ...opts.messages,
          { role: "assistant", content: total.content },
          { role: "user", content: "Continue from exactly where you stopped. Do not repeat. Finish the answer." },
        ];
    const continued = await chat({
      ...opts,
      max_tokens: CONTINUATION_MAX_TOKENS,
      messages: continuationMessages,
    });
    total = {
      content: total.content + continued.content,
      prompt_tokens: total.prompt_tokens + continued.prompt_tokens,
      completion_tokens: total.completion_tokens + continued.completion_tokens,
      finish_reason: continued.finish_reason,
    };
  }
  return { ...total, continuation_retries: retries };
}
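// Worked example (illustrative): with PRIMARY_MAX_TOKENS=150, a call that
// returns 150 tokens with finish_reason="length" is stitched with up to
// MAX_CONTINUATIONS follow-ups of 300 tokens each, so one logical answer
// can span roughly 150 + 6×300 tokens before the cap forces a return.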

// ─── Single iteration: retrieve → budget-check → chat → seed ─────
async function runIteration(
  iteration: number,
  question: string,
  allChunks: Chunk[],
  priorPlaybookIds: string[],
  priorAnswers: string[],
): Promise<IterationResult> {
  const started = Date.now();
  const errorsRecovered: string[] = [];
  log(`iter ${iteration}: "${question.slice(0, 70)}..."`);

  // 1. Embed the question
  const qEmb = (await embedBatch([question]))[0];

  // 2. Retrieve top-K chunks by cosine
  const scored = allChunks
    .map(c => ({ c, score: cosine(qEmb, c.embedding) }))
    .sort((a, b) => b.score - a.score)
    .slice(0, TOP_K_RETRIEVE);
  const chunks = scored.map(x => x.c);
  log(`  retrieved top ${chunks.length} chunks (score range ${scored[0].score.toFixed(3)} .. ${scored[scored.length - 1].score.toFixed(3)})`);

  // 3. Context budget check — tree-split if over
  const contextChars = chunks.map(c => c.text).join("\n\n").length;
  let contextForPrompt: string;
  let treeSplit = false;
  let cloudCallsTotal = 0;
  if (contextChars > CONTEXT_BUDGET_CHARS) {
    treeSplit = true;
    log(`  context ${contextChars} chars > budget ${CONTEXT_BUDGET_CHARS} → tree-split`);
    const { scratchpad, cloud_calls } = await treeSplitSummarize(chunks, question);
    contextForPrompt = `Distilled scratchpad from ${chunks.length} source chunks (too large to fit directly):\n${scratchpad}`;
    cloudCallsTotal += cloud_calls;
  } else {
    contextForPrompt = chunks.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n");
  }

  // 4. Seed prompt with prior iteration answers (real compounding).
  //    Not just IDs — the model needs the CONTENT to synthesize.
  let citationBlock = "";
  let citationsReceived: string[] = [];
  if (priorPlaybookIds.length > 0 && priorAnswers.length > 0) {
    const lines = priorAnswers.map((ans, i) => {
      const pid = priorPlaybookIds[i]?.slice(0, 12) ?? "unknown";
      // Trim each prior answer to ~400 chars so we don't blow budget
      return `[pb:${pid}] iter ${i + 1} answer:\n${ans.slice(0, 400)}\n`;
    });
    citationBlock = `\n\n═══ PRIOR ITERATIONS (compounding context) ═══\n${lines.join("\n")}═══ end prior iterations ═══\n\nYour answer MUST cite specific prior iterations using [pb:ID] notation when drawing on them. Synthesis questions require explicit cross-iteration reasoning.`;
    citationsReceived = priorPlaybookIds.slice();
  }

  // 5. TASK-LEVEL RETRY LOOP — per J's clarification 2026-04-22.
  //    Try the task up to MAX_TASK_RETRIES times. Each retry:
  //      a) Picks a model (normally CLOUD_MODEL; on INJECT_FAIL_ON_ITER,
  //         cycles through 5 invalid models + 1 valid to force full loop)
  //      b) Injects prior attempt errors as learning context
  //      c) If the attempt succeeds (non-empty, >100 chars), loop exits
  //      d) Otherwise, records failure and tries again with the learning
  //
  //    Cap at 6 so we don't spin forever on unsolvable tasks.
  let result: { content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string } | null = null;
  let rescueTriggered = false;
  const taskAttemptHistory: Array<{ n: number; model: string; error: string }> = [];
  const forceRetries = iteration === INJECT_FAIL_ON_ITER;
  if (forceRetries) log(`  FORCING TASK-RETRY LOOP — iter ${iteration} will cycle through 5 invalid models + 1 valid`);

  for (let attempt = 1; attempt <= MAX_TASK_RETRIES; attempt++) {
    const modelForAttempt = forceRetries
      ? FORCE_RETRY_MODEL_SEQUENCE[attempt - 1]
      : CLOUD_MODEL;
    // Compose a prior-attempts learning block for attempts 2+
    const learningBlock = taskAttemptHistory.length > 0
      ? `\n\n═══ PRIOR ATTEMPTS THIS TASK (do NOT repeat these failures; adjust approach) ═══\n${taskAttemptHistory.map(a => `Attempt ${a.n} (model ${a.model}) failed: ${a.error.slice(0, 160)}`).join("\n")}\n═══ end prior attempts ═══\n`
      : "";
    log(`  task attempt ${attempt}/${MAX_TASK_RETRIES}: model=${modelForAttempt}${learningBlock ? " [with prior-failure context]" : ""}`);
    try {
      const r = await generateContinuable({
        provider: "ollama_cloud",
        model: modelForAttempt,
        messages: [
          { role: "system", content: "You answer questions about the Lakehouse PRD using only the provided source material and prior iteration answers. Be specific. Cite chunk offsets OR [pb:ID] markers. Write a detailed 250-word answer." },
          { role: "user", content: `Question: ${question}\n\nSource material:\n${contextForPrompt}${citationBlock}${learningBlock}` },
        ],
        max_tokens: PRIMARY_MAX_TOKENS,
        think: false,
        max_continuations: MAX_CONTINUATIONS,
      });
      cloudCallsTotal += 1 + r.continuation_retries;
      if (r.content && r.content.length > 100) {
        // Acceptable answer — exit loop
        result = r;
        if (attempt > 1) {
          log(`  task attempt ${attempt} SUCCEEDED (${r.content.length} chars) after ${attempt - 1} prior failures`);
          rescueTriggered = true;
        }
        break;
      }
      // Thin response — count as failure with learning signal
      const err = `thin-answer: ${r.content.length} chars, finish=${r.finish_reason}`;
      taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
      errorsRecovered.push(`attempt ${attempt}: ${err}`);
    } catch (e) {
      const err = (e as Error).message;
      taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
      errorsRecovered.push(`attempt ${attempt}: ${err.slice(0, 120)}`);
      cloudCallsTotal += 1;
    }
  }

  // Last-ditch: if all 6 task attempts failed, try the local fallback
  // once more so we at least return SOMETHING. This is the "don't get
  // caught in a loop, accept best-so-far" rule J stated explicitly.
  if (!result) {
    errorsRecovered.push(`all ${MAX_TASK_RETRIES} task attempts failed — local fallback`);
    rescueTriggered = true;
    try {
      result = await generateContinuable({
        provider: "ollama",
        model: "qwen3.5:latest",
        messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
        max_tokens: 300,
        think: false,
        max_continuations: 2,
      });
      cloudCallsTotal += 1 + result.continuation_retries;
    } catch (e) {
      // Absolute last resort — fabricate a skeleton result
      result = {
        content: `[task failed after ${MAX_TASK_RETRIES} retries + local fallback: ${(e as Error).message}]`,
        prompt_tokens: 0,
        completion_tokens: 0,
        continuation_retries: 0,
        finish_reason: "error",
      };
    }
  }
  if (result.content.length === 0) {
    errorsRecovered.push("even rescue returned empty — last-ditch local fallback");
    rescueTriggered = true;
    result = await generateContinuable({
      provider: "ollama",
      model: "qwen3.5:latest",
      messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
      max_tokens: 300,
      think: false,
    });
    cloudCallsTotal += 1;
  }

  // 6. Seed playbook with the answer
  let playbook_id: string | null = null;
  try {
    const ts = new Date().toISOString();
    const seedOp = `TEST: enrich_prd_run_${RUN_NONCE} iter${iteration} in Corpus, PRD`;
    const r = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
      method: "POST", headers: { "content-type": "application/json" },
      body: JSON.stringify({
        operation: seedOp,
        approach: `q="${question.slice(0, 80)}" context_chars=${contextChars} tree_split=${treeSplit}`,
        context: result.content.slice(0, 600),
        endorsed_names: [`iter${iteration}_${RUN_NONCE}`],
        append: true,
      }),
      signal: AbortSignal.timeout(15000),
    });
    if (r.ok) {
      const j: any = await r.json();
      playbook_id = j.outcome?.playbook_id ?? null;
    } else {
      errorsRecovered.push(`seed ${r.status}: ${(await r.text()).slice(0, 100)}`);
    }
  } catch (e) {
    errorsRecovered.push(`seed exception: ${(e as Error).message}`);
  }

  return {
    iteration,
    question,
    retrieval_top_k: chunks.length,
    context_chars_before_budget: contextChars,
    tree_split_fired: treeSplit,
    cloud_calls_total: cloudCallsTotal,
    continuation_retries: result.continuation_retries,
    rescue_triggered: rescueTriggered,
    task_attempts_made: taskAttemptHistory.length + 1, // recorded failures + the attempt that finally produced a result
    task_retry_history: taskAttemptHistory,
    playbook_id,
    tokens_prompt: result.prompt_tokens,
    tokens_completion: result.completion_tokens,
    citations_from_prior_iterations: citationsReceived,
    duration_ms: Date.now() - started,
    answer_preview: result.content.slice(0, 500),
    errors_recovered: errorsRecovered,
  };
}

async function main() {
  await mkdir(OUT_DIR, { recursive: true });
  log(`run nonce: ${RUN_NONCE}`);
  log(`output dir: ${OUT_DIR}`);

  // ─── Phase 1: load, chunk, embed the PRD ───────────────────────
  log(`loading PRD from ${PRD_PATH}`);
  const prd = await readFile(PRD_PATH, "utf8");
  log(`PRD: ${prd.length} chars, ${prd.split("\n").length} lines`);

  const raw_chunks = chunkText(prd);
  log(`chunked into ${raw_chunks.length} pieces (size ${CHUNK_SIZE}, overlap ${CHUNK_OVERLAP})`);

  // Embed in batches of 32 to avoid sidecar overload
  const allChunks: Chunk[] = [];
  const BATCH = 32;
  const t0 = Date.now();
  for (let i = 0; i < raw_chunks.length; i += BATCH) {
    const batch = raw_chunks.slice(i, i + BATCH);
    const embs = await embedBatch(batch.map(b => b.text));
    for (let j = 0; j < batch.length; j++) {
      allChunks.push({
        id: hash(batch[j].text),
        text: batch[j].text,
        embedding: embs[j].map(x => Number(x)),
        offset: batch[j].offset,
      });
    }
    log(`  embedded ${allChunks.length}/${raw_chunks.length}`);
  }
  log(`embedded all ${allChunks.length} chunks in ${((Date.now() - t0) / 1000).toFixed(1)}s`);

  // ─── Phase 2: 6 iterations ─────────────────────────────────────
  const results: IterationResult[] = [];
  const priorIds: string[] = [];
  const priorAnswers: string[] = [];
  for (let i = 1; i <= QUESTIONS.length; i++) {
    const q = QUESTIONS[i - 1];
    const r = await runIteration(i, q, allChunks, priorIds, priorAnswers);
    results.push(r);
    if (r.playbook_id) priorIds.push(r.playbook_id);
    priorAnswers.push(r.answer_preview);
    log(`  → iter ${i}: ${r.errors_recovered.length} errors recovered, ${r.continuation_retries} continuations, tree-split=${r.tree_split_fired}, rescue=${r.rescue_triggered}, ${r.duration_ms}ms`);
    await writeFile(`${OUT_DIR}/iter_${i}.json`, JSON.stringify(r, null, 2));
  }
  // Check whether iter 6 actually cited prior pb:IDs in its answer.
  // Playbook IDs look like `pb-seed-<hex>` so the regex needs to allow
  // hyphens + letters inside the brackets, not just hex chars.
  const iter6 = results[5];
  const citationsHonored = iter6 ? (iter6.answer_preview.match(/\[pb:[\w-]+\]/gi)?.length ?? 0) : 0;

  // ─── Phase 3: summary ──────────────────────────────────────────
  const summary = {
    run_nonce: RUN_NONCE,
    ran_at: new Date().toISOString(),
    prd_chars: prd.length,
    prd_chunks: allChunks.length,
    iterations: results.length,
    total_cloud_calls: results.reduce((s, r) => s + r.cloud_calls_total, 0),
    total_continuation_retries: results.reduce((s, r) => s + r.continuation_retries, 0),
    total_errors_recovered: results.reduce((s, r) => s + r.errors_recovered.length, 0),
    tree_splits_fired: results.filter(r => r.tree_split_fired).length,
    rescues_triggered: results.filter(r => r.rescue_triggered).length,
    iter6_received_prior_ids: results[5]?.citations_from_prior_iterations.length ?? 0,
    iter6_actually_cited_in_answer: citationsHonored,
    iter3_task_attempts: results[2]?.task_attempts_made ?? 0,
    iter3_task_retries: results[2]?.task_retry_history.length ?? 0,
    max_task_attempts_any_iter: Math.max(...results.map(r => r.task_attempts_made)),
    total_duration_ms: results.reduce((s, r) => s + r.duration_ms, 0),
    overall: results.length === 6 && results.every(r => r.playbook_id !== null) ? "PASS" : "PARTIAL",
  };
  await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));

  log("");
  log(`══════ SUMMARY ${summary.overall} ══════`);
  log(`  6 iterations, ${summary.total_cloud_calls} cloud calls, ${summary.total_errors_recovered} errors recovered`);
  log(`  tree-splits: ${summary.tree_splits_fired}/6  continuations: ${summary.total_continuation_retries}  rescues: ${summary.rescues_triggered}`);
  log(`  iter 6 received ${summary.iter6_received_prior_ids} prior IDs, cited ${summary.iter6_actually_cited_in_answer} [pb:...] markers in its answer`);
  log(`  iter 3 task-retry loop: ${summary.iter3_task_attempts} attempts (${summary.iter3_task_retries} prior-failure retries before success)`);
  log(`  total duration: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
  log("");
  for (const r of results) {
    const flags = [
      r.tree_split_fired ? "tree-split" : "",
      r.continuation_retries > 0 ? `cont=${r.continuation_retries}` : "",
      r.rescue_triggered ? "rescued" : "",
      r.errors_recovered.length > 0 ? `err=${r.errors_recovered.length}` : "",
    ].filter(Boolean).join(" ");
    log(`  iter ${r.iteration}: ${r.tokens_prompt}+${r.tokens_completion} tok, ${r.duration_ms}ms ${flags ? `[${flags}]` : ""}`);
  }
  log("");
  log(`artifacts: ${OUT_DIR}/{iter_1..6.json, summary.json}`);
  process.exit(summary.overall === "PASS" ? 0 : 1);
}

main().catch(e => { console.error("[enrich] fatal:", e); process.exit(2); });
tests/real-world/hard_task_escalation.ts
@@ -1,267 +0,0 @@
// Hard-task escalation test. The task is deliberately constructed so
// that a local 7B model (qwen3.5:latest) will miss at least one of the
// validation rules. Watch the escalation ladder (see LADDER below):
//   1. qwen3.5:latest (local 7B) — likely fails
//   2. qwen3:latest (local 7B) — likely fails differently
//   3. gpt-oss:20b (local 20B) — may fail
//   4. gpt-oss:120b (cloud 120B) — should get close
//   5. devstral-2:123b (cloud 123B coding specialist)
//   6. mistral-large-3:675b (cloud 675B): absolute last rung; if it
//      fails too, the run ends with best-so-far artifacts on disk
//
// Each attempt:
//   - Calls the model via /v1/chat
//   - Validates the output against a strict rubric
//   - On fail: records the specific rubric violations + the partial
//     output, injects both into the next attempt's prompt as "here's
//     what's wrong, fix it specifically"
//   - On success: exit loop
//
// Run: bun run tests/real-world/hard_task_escalation.ts

import { writeFile, mkdir } from "node:fs/promises";

const GATEWAY = "http://localhost:3100";
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/hard_task_${Date.now().toString(36)}`;
const MAX_ATTEMPTS = 6;

// The hard task. Specific enough that a small model will miss at
// least one rule. Not purely knowledge-based — it's a code-generation
// task with strict structural constraints.
const TASK = `Write a complete Rust async function with the EXACT signature:

pub async fn check_drift_batched(refs: Vec<DocRef>) -> Result<Vec<String>, String>

It must:
1. Group refs by tool name (case-insensitive — use .to_ascii_lowercase())
2. Issue parallel HTTP GET requests to http://localhost:3900/docs/{tool}/diff?since={snippet_hash}
3. Use reqwest and a JoinSet/Semaphore to cap concurrent in-flight requests at 4
4. On HTTP 5xx, retry with exponential backoff: sleep 250ms, then 500ms, then 1000ms, then give up on that tool
5. Parse the response JSON: {"drifted": bool, ...}. Return a Vec<String> of tool names where drifted == true
6. All errors bubble via ? or Result — NO .unwrap(), NO .expect(), NO panic!()
7. Include rustdoc /// comments on the function and each internal helper

Assume this struct is already imported:

pub struct DocRef { pub tool: String, pub snippet_hash: Option<String>, pub version_seen: String }

Output ONLY the Rust code. No prose, no markdown fences, no explanation. Start directly with the /// doc comment.`;

// Escalation ladder — small-local → large-local → cloud → specialist
// cloud → trillion-param cloud. Corrected 2026-04-22 per J:
// gpt-oss:20b is LOCAL (ollama list confirms 13 GB on disk), and the
// final escalation tier should be kimi-k2:1t (the biggest model
// we have access to on Ollama Cloud).
const LADDER: Array<{ provider: "ollama" | "ollama_cloud"; model: string; note: string }> = [
  { provider: "ollama", model: "qwen3.5:latest", note: "local 7B" },
  { provider: "ollama", model: "qwen3:latest", note: "local 7B (different)" },
  { provider: "ollama", model: "gpt-oss:20b", note: "local 20B" }, // FIXED: local, not cloud
  { provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B" },
  { provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B (coding specialist)" },
  // NOTE 2026-04-22 — J wanted Kimi as the last escalation but Kimi
  // K2.5/K2.6 both return "this model requires a subscription" on our
  // current Ollama Cloud key. mistral-large-3:675b is the biggest
  // model actually provisioned on this key (verified via direct curl
  // to ollama.com/api/generate). Upgrade path: Ollama Cloud Pro →
  // swap this line to kimi-k2.5 or kimi-k2.6:cloud.
  { provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B (biggest available on current key; kimi-k2.x needs pro subscription)" },
];
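// Attempts beyond the ladder's length clamp to the last rung (see
// `LADDER[i] ?? LADDER[LADDER.length - 1]` in main), so with
// MAX_ATTEMPTS=6 and 6 rungs each attempt maps 1:1 onto a rung.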

// Validation rubric — the answer must pass all of these to be accepted.
interface RubricResult {
  passed: boolean;
  violations: string[];
  passed_rules: string[];
}

function validate(code: string): RubricResult {
  const violations: string[] = [];
  const passed: string[] = [];

  const check = (rule: string, ok: boolean) => { ok ? passed.push(rule) : violations.push(rule); };

  check("has pub async fn check_drift_batched signature",
    /pub\s+async\s+fn\s+check_drift_batched\s*\(/.test(code));
  check("takes Vec<DocRef> argument",
    /refs\s*:\s*Vec\s*<\s*DocRef\s*>/.test(code));
  check("returns Result<Vec<String>, String>",
    /Result\s*<\s*Vec\s*<\s*String\s*>\s*,\s*String\s*>/.test(code));
  check("uses reqwest",
    /\breqwest\b/i.test(code));
  check("references JoinSet or Semaphore for concurrency",
    /\bJoinSet\b|\bSemaphore\b/i.test(code));
  check("bounds concurrency at 4",
    /\b4\b/.test(code) && (/Semaphore\s*::\s*new\s*\(\s*4\b/.test(code) || /permits\s*:\s*4\b/.test(code) || /limit\s*:\s*4\b/.test(code) || /max\s*:\s*4\b/.test(code) || /capacity\s*:\s*4\b/.test(code)));
  // Exponential backoff — models express this several ways. Accept
  // any recognizable doubling pattern starting at 250ms. 2026-04-22:
  // devstral-2:123b wrote `retry_delay *= 2` which my earlier regex
  // rejected even though the code is correct. Broadening rubric to
  // match all idiomatic doubling forms.
  const hasSeed250 = /Duration\s*::\s*from_millis\s*\(\s*250\b/.test(code)
    || /millis\s*\(\s*250\b/.test(code);
  const hasDoublingPattern = /250\s*\*\s*2/.test(code)  // 250 * 2^n literal
    || /<<\s*\d+/.test(code)                            // bit-shift
    || /\.pow\s*\(/.test(code)                          // 2u32.pow(attempt)
    || /\*=\s*2\b/.test(code)                           // delay *= 2  ← was missing
    || /\*\s*2\s*;/.test(code)                          // delay = delay * 2;
    || /saturating_mul\s*\(\s*2\b/.test(code);          // saturating doubling
  check("has 250ms backoff seed",
    hasSeed250);
  check("reaches 500ms backoff (literal or doubling from 250)",
    /Duration\s*::\s*from_millis\s*\(\s*500\b/.test(code)
    || /millis\s*\(\s*500\b/.test(code)
    || (hasSeed250 && hasDoublingPattern));
  check("reaches 1000ms backoff (literal or doubling to 1000)",
    /Duration\s*::\s*from_millis\s*\(\s*1000\b/.test(code)
    || /millis\s*\(\s*1000\b/.test(code)
    || (hasSeed250 && hasDoublingPattern));
  check("case-insensitive tool grouping (to_ascii_lowercase)",
    /to_ascii_lowercase|to_lowercase/.test(code));
  check("NO .unwrap() — all errors bubble via ?",
    !/\.unwrap\s*\(\s*\)/.test(code));
  check("NO .expect(...) — all errors bubble via ?",
    !/\.expect\s*\(/.test(code));
  check("NO panic!() / unimplemented!() / todo!()",
    !/\bpanic!\s*\(|\bunimplemented!\s*\(|\btodo!\s*\(/.test(code));
  check("has rustdoc /// comments",
    /\/\/\//.test(code));
  check("reasonable length (> 500 chars)",
    code.length > 500);

  return { passed: violations.length === 0, violations, passed_rules: passed };
}
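// Quick sanity example: a submission containing `resp.text().unwrap()` trips
// the "NO .unwrap()" rule even if every structural rule passes, so validate()
// returns passed: false with that single violation listed.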
|
||||
|
||||
function log(msg: string) { console.log(`[hard] ${msg}`); }
|
||||
|
||||
async function chat(opts: {
|
||||
provider: "ollama" | "ollama_cloud",
|
||||
model: string,
|
||||
prompt: string,
|
||||
}): Promise<{ content: string; error?: string }> {
|
||||
try {
|
||||
const r = await fetch(`${GATEWAY}/v1/chat`, {
|
||||
method: "POST", headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
provider: opts.provider,
|
||||
model: opts.model,
|
||||
messages: [{ role: "user", content: opts.prompt }],
|
||||
max_tokens: 2500,
|
||||
temperature: 0.2,
|
||||
think: false,
|
||||
}),
|
||||
signal: AbortSignal.timeout(240000),
|
||||
});
|
||||
if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 300)}` };
|
||||
const j: any = await r.json();
|
||||
return { content: j.choices?.[0]?.message?.content ?? "" };
|
||||
} catch (e) {
|
||||
return { content: "", error: (e as Error).message };
|
||||
}
|
||||
}

interface AttemptRecord {
  n: number;
  provider: string;
  model: string;
  duration_ms: number;
  content_chars: number;
  error: string | null;
  rubric_violations: string[];
  rubric_passed: string[];
  accepted: boolean;
}

function extractCode(raw: string): string {
  // Strip common fence wrappers
  const m = raw.match(/```(?:rust)?\s*\n([\s\S]*?)```/);
  if (m) return m[1].trim();
  return raw.trim();
}
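// e.g. (hypothetical response): extractCode("Sure:\n```rust\nfn main() {}\n```")
// → "fn main() {}". Responses without a fence pass through trimmed.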

async function main() {
  await mkdir(OUT_DIR, { recursive: true });
  log(`output: ${OUT_DIR}`);
  log(`task: ${TASK.slice(0, 120)}...`);
  log("");

  const attempts: AttemptRecord[] = [];
  let acceptedCode: string | null = null;

  for (let i = 0; i < MAX_ATTEMPTS; i++) {
    const n = i + 1;
    const rung = LADDER[i] ?? LADDER[LADDER.length - 1];

    // Build the prompt: base task + prior failures' learning blocks
    let priorLearning = "";
    if (attempts.length > 0) {
      priorLearning = `\n\n═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══\n`;
      for (const a of attempts) {
        priorLearning += `Attempt ${a.n} (${a.provider}/${a.model}, ${a.content_chars} chars) violations:\n`;
        for (const v of a.rubric_violations) priorLearning += ` - ${v}\n`;
        if (a.error) priorLearning += ` [error: ${a.error.slice(0, 120)}]\n`;
      }
      priorLearning += `═══ end prior attempts ═══\n\nDO NOT repeat the above violations. Address each one explicitly.`;
    }
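    // Rendered learning block (hypothetical values for illustration):
    //   ═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══
    //   Attempt 1 (ollama/some-model:7b, 1843 chars) violations:
    //    - NO .unwrap() — all errors bubble via ?
    //   ═══ end prior attempts ═══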

    log(`attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${priorLearning ? " [w/ learning]" : ""}`);
    const t0 = Date.now();
    const r = await chat({ provider: rung.provider, model: rung.model, prompt: TASK + priorLearning });
    const dur = Date.now() - t0;

    const code = extractCode(r.content);
    const rubric = code ? validate(code) : { passed: false, violations: ["empty response"], passed_rules: [] };

    const record: AttemptRecord = {
      n,
      provider: rung.provider,
      model: rung.model,
      duration_ms: dur,
      content_chars: code.length,
      error: r.error ?? null,
      rubric_violations: rubric.violations,
      rubric_passed: rubric.passed_rules,
      accepted: rubric.passed,
    };
    attempts.push(record);

    log(` → ${dur}ms, ${code.length} chars, ${rubric.passed_rules.length} rules passed / ${rubric.violations.length} failed${r.error ? `, err: ${r.error.slice(0, 80)}` : ""}`);
    for (const v of rubric.violations.slice(0, 5)) log(` ✗ ${v}`);

    await writeFile(`${OUT_DIR}/attempt_${n}.txt`, code);
    await writeFile(`${OUT_DIR}/attempt_${n}.json`, JSON.stringify(record, null, 2));

    if (rubric.passed) {
      log(` ✅ ACCEPTED on attempt ${n}`);
      acceptedCode = code;
      break;
    }
  }

  const summary = {
    task: TASK.slice(0, 200),
    total_attempts: attempts.length,
    accepted: acceptedCode !== null,
    accepted_on_attempt: acceptedCode ? attempts.findIndex(a => a.accepted) + 1 : null,
    escalation_path: attempts.map(a => `${a.provider}/${a.model}`),
    per_attempt_pass_counts: attempts.map(a => a.rubric_passed.length),
    per_attempt_violation_counts: attempts.map(a => a.rubric_violations.length),
    total_duration_ms: attempts.reduce((s, a) => s + a.duration_ms, 0),
  };
  await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));

  log("");
  log(`═══ RESULT ═══`);
  log(`attempts: ${summary.total_attempts}`);
  log(`accepted: ${summary.accepted} ${summary.accepted ? `on attempt ${summary.accepted_on_attempt}` : ""}`);
  log(`escalation path:`);
  for (const [i, a] of attempts.entries()) {
    const mark = a.accepted ? "✅" : "❌";
    log(` ${mark} attempt ${i + 1}: ${a.provider}/${a.model} — ${a.rubric_passed.length}/${a.rubric_passed.length + a.rubric_violations.length} rules passed, ${a.duration_ms}ms`);
  }
  log("");
  log(`total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
  log(`artifacts: ${OUT_DIR}/{attempt_1..N.{txt,json}, summary.json}`);

  process.exit(summary.accepted ? 0 : 1);
}

main().catch(e => { console.error("[hard] fatal:", e); process.exit(2); });
tests/real-world/nine_consecutive_audits.ts
@@ -1,186 +0,0 @@
// Nine-consecutive audit runner — empirical test of the predictive-
// compounding property. Runs the audit pipeline 9 times against the
// same PR (each time with a new diff from Gitea), captures the
// verdict + audit_lessons state after each run, and reports whether
// the KB stabilizes or drifts.
//
// What we expect (favorable compounding):
//  - signature_count grows sublinearly (same patterns recur, so
//    distinct-signature count stabilizes fast)
//  - verdict settles on a stable value after run 2-3 (first audit
//    establishes baseline, rest repeat)
//  - confidence stays LOW for all signatures (same PR repeatedly)
//  - NO new recurring findings fire because confidence < 0.3 on
//    same-PR noise (kb_index rating policy)
//
// What would indicate drift (the thing we want to prove DOESN'T happen):
//  - signature_count grows linearly — each run produces new signatures
//  - verdict oscillates (block → approve → block ...)
//  - confidence inflates — kb_index rating escalates on repeated runs
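//
// Illustration only (hypothetical numbers, not from a real run):
//   compounding: sig_count per run ≈ 12, 14, 15, 15, 15, 15 ... (plateaus)
//   drift:       sig_count per run ≈ 12, 24, 36, 48 ...         (linear)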
//
// Run: bun run tests/real-world/nine_consecutive_audits.ts

import { readFile, writeFile } from "node:fs/promises";
import { join } from "node:path";
import { aggregate } from "../../auditor/kb_index.ts";
import { getPrSnapshot } from "../../auditor/gitea.ts";
import { auditPr } from "../../auditor/audit.ts";

const REPO = "/home/profit/lakehouse";
const AUDIT_LESSONS = `${REPO}/data/_kb/audit_lessons.jsonl`;
const VERDICTS_DIR = `${REPO}/data/_auditor/verdicts`;
const POLL_INTERVAL_MS = 5_000;
const RUNS = Number(process.env.LH_AUDIT_RUNS ?? 9);
const TARGET_PR = Number(process.env.LH_AUDIT_PR ?? 8);
const SKIP_INFERENCE = process.env.LH_AUDITOR_SKIP_INFERENCE !== "0";
const RESET_KB = process.env.LH_RESET_KB === "1";

async function waitForVerdict(prNum: number, sha: string, deadlineMs: number): Promise<any> {
  const short = sha.slice(0, 12);
  const path = join(VERDICTS_DIR, `${prNum}-${short}.json`);
  const start = Date.now();
  while (Date.now() - start < deadlineMs) {
    try {
      const raw = await readFile(path, "utf8");
      return JSON.parse(raw);
    } catch { /* not yet */ }
    await Bun.sleep(POLL_INTERVAL_MS);
  }
  throw new Error(`no verdict file after ${deadlineMs}ms: ${path}`);
}

async function captureAggState(): Promise<{
  sig_count: number;
  max_count: number;
  max_confidence: number;
  recurring_max_count: number;
  top3: Array<{ sig: string; count: number; conf: number; summary: string }>;
}> {
  const agg = await aggregate<any>(AUDIT_LESSONS, {
    keyFn: (r) => r?.signature,
    scopeFn: (r) => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
  });
  const list = Array.from(agg.values()).sort((a, b) => b.count - a.count);
  const recurring = list.filter(r => r.count >= 2);
  const recurringMaxCount = recurring.length > 0 ? Math.max(...recurring.map(a => a.count)) : 0;
  const recurringMaxConf = recurring.length > 0 ? Math.max(...recurring.map(a => a.confidence)) : 0;
  return {
    sig_count: list.length,
    max_count: list[0]?.count ?? 0,
    max_confidence: recurringMaxConf,
    recurring_max_count: recurringMaxCount,
    top3: list.slice(0, 3).map(a => ({
      sig: a.signature,
      count: a.count,
      conf: a.confidence,
      summary: a.representative_summary.slice(0, 80),
    })),
  };
}
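
// aggregate() is fed audit_lessons.jsonl records assumed to carry at least a
// `signature` field and, optionally, a `pr_number`, e.g. (hypothetical):
//   {"signature":"unchecked-unwrap","pr_number":8,"summary":"..."}
// keyFn groups lessons by signature; scopeFn buckets recurrences per PR.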

interface RunRecord {
  run: number;
  sha: string;
  verdict_overall: string;
  findings_total: number;
  findings_block: number;
  findings_warn: number;
  findings_info: number;
  audit_duration_ms: number;
  claims_total: number;
  claims_empirical: number;
  kb_sig_count_after: number;
  kb_max_count_after: number;
  kb_max_confidence_after: number;
  kb_recurring_max_count: number;
}

async function main() {
  console.log(`[nine] target PR: #${TARGET_PR}`);
  console.log(`[nine] runs: ${RUNS}`);
  console.log(`[nine] skip_inference: ${SKIP_INFERENCE}`);
  console.log(`[nine] reset_kb: ${RESET_KB}`);
  console.log(`[nine] audit_lessons.jsonl: ${AUDIT_LESSONS}`);

  if (RESET_KB) {
    console.log("[nine] clearing audit_lessons.jsonl for clean test...");
    await writeFile(AUDIT_LESSONS, "");
  }
  console.log("");

  const pr = await getPrSnapshot(TARGET_PR);
  console.log(`[nine] PR #${pr.number}: "${pr.title}" (head=${pr.head_sha.slice(0, 12)})`);
  console.log(`[nine] files in diff: ${pr.files.length}`);
  console.log("");

  const baseline = await captureAggState();
  console.log(`[nine] baseline: sig_count=${baseline.sig_count} max_count=${baseline.max_count} max_conf=${baseline.max_confidence.toFixed(2)}`);
  console.log("");

  const records: RunRecord[] = [];
  for (let n = 1; n <= RUNS; n++) {
    const t0 = Date.now();
    console.log(`─── run ${n}/${RUNS} ───`);

    const verdict = await auditPr(pr, {
      dry_run: true,
      skip_dynamic: true,
      skip_inference: SKIP_INFERENCE,
    });

    console.log(` sha ${verdict.head_sha.slice(0, 12)}`);
    const after = await captureAggState();
    const rec: RunRecord = {
      run: n,
      sha: verdict.head_sha.slice(0, 12),
      verdict_overall: String(verdict.overall),
      findings_total: Number(verdict.metrics?.findings_total ?? 0),
      findings_block: Number(verdict.metrics?.findings_block ?? 0),
      findings_warn: Number(verdict.metrics?.findings_warn ?? 0),
      findings_info: Number(verdict.metrics?.findings_info ?? 0),
      audit_duration_ms: Number(verdict.metrics?.audit_duration_ms ?? 0),
      claims_total: Number(verdict.metrics?.claims_total ?? 0),
      claims_empirical: Number(verdict.metrics?.claims_empirical ?? 0),
      kb_sig_count_after: after.sig_count,
      kb_max_count_after: after.max_count,
      kb_max_confidence_after: after.max_confidence,
      kb_recurring_max_count: after.recurring_max_count,
    };
    records.push(rec);
    console.log(` verdict=${rec.verdict_overall} findings=${rec.findings_total} (b=${rec.findings_block} w=${rec.findings_warn})`);
    console.log(` kb after: sig=${rec.kb_sig_count_after} max_count=${rec.kb_max_count_after} recurring_max=${rec.kb_recurring_max_count} max_conf=${rec.kb_max_confidence_after.toFixed(2)}`);
    console.log(` elapsed: ${((Date.now() - t0) / 1000).toFixed(1)}s`);
    console.log("");
  }

  console.log("═══ FINAL ═══");
  console.log("run | verdict | find | block warn info | dur_s | kb_sig max_count max_conf");
  for (const r of records) {
    console.log(
      ` ${String(r.run).padStart(1)} | ${r.verdict_overall.padEnd(16)} | ${String(r.findings_total).padStart(4)} | ${String(r.findings_block).padStart(5)} ${String(r.findings_warn).padStart(5)} ${String(r.findings_info).padStart(5)} | ${(r.audit_duration_ms / 1000).toFixed(1).padStart(5)} | ${String(r.kb_sig_count_after).padStart(6)} ${String(r.kb_max_count_after).padStart(9)} ${r.kb_max_confidence_after.toFixed(2)}`,
    );
  }

  console.log("");
  console.log("═══ COMPOUNDING PROPERTY ═══");
  const sigDelta = records[records.length - 1].kb_sig_count_after - baseline.sig_count;
  const maxConf = records[records.length - 1].kb_max_confidence_after;
  const recurringMax = records[records.length - 1].kb_recurring_max_count;
  console.log(` signatures added over ${RUNS} runs: ${sigDelta}`);
  console.log(` max recurring count after run ${RUNS}: ${recurringMax} (same-PR recurrences per signature)`);
  console.log(` max confidence after run ${RUNS}: ${maxConf.toFixed(2)} (expect LOW — same-PR should not inflate)`);

  const verdictSet = new Set(records.map(r => r.verdict_overall));
  if (verdictSet.size === 1) {
    console.log(` verdict stable: all ${RUNS} runs returned '${[...verdictSet][0]}' ✓`);
  } else {
    console.log(` verdict oscillated across runs: ${[...verdictSet].join(" | ")} ✗`);
  }

  if (maxConf < 0.6 && recurringMax < 5) {
    console.log(` confidence policy holding: same-PR noise stays below escalation threshold ✓`);
  } else {
    console.log(` ⚠ cross-cutting pattern detected (conf=${maxConf.toFixed(2)}, recurring=${recurringMax}) — kb_index policy escalated`);
  }

  const jsonOut = `${REPO}/tests/real-world/runs/nine_consecutive_${Date.now().toString(36)}.json`;
  await Bun.write(jsonOut, JSON.stringify({ target_pr: TARGET_PR, baseline, records }, null, 2));
  console.log("");
  console.log(` report: ${jsonOut}`);
}

main().catch(e => { console.error("[nine] fatal:", e); process.exit(1); });