remove 5 orphaned dev experiments from tests/real-world/

Per J: "all of our test code ended up in the main." These are 5 one-time
dev experiments that were never wired into any automation and have zero
live consumers in the production code path. Deleting them.

Removed (1418 LOC total):
- enrich_prd_pipeline.ts      (528 LOC) — Phase 21 architecture stress test
- nine_consecutive_audits.ts  (185 LOC) — empirical study of audit compounding
- hard_task_escalation.ts     (267 LOC) — escalation-ladder test (refs retired
                                            cloud models gpt-oss:20b/120b)
- autonomous_loop.ts          (214 LOC) — wrapper experiment around scrum_master
- consensus_reducer_design.ts (224 LOC) — N=3 design consultation; output JSON
                                            referenced from pathway_memory.rs
                                            comment but the script itself has
                                            no consumer

Verified: 0 references in package.json / justfile / Makefile / any
production .ts/.rs/.sh file. The single mention from pathway_memory.rs
is a //! doc comment referencing the JSON output (data/_kb/
consensus_reducer_design_*.json), not the script. Build clean post-delete.
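
To re-verify, any equivalent sweep works, e.g.:

  rg -l 'enrich_prd_pipeline|nine_consecutive_audits|hard_task_escalation|autonomous_loop|consensus_reducer_design'

which should surface only that pathway_memory.rs doc comment.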

KEPT:
- scrum_master_pipeline.ts — referenced from observer.ts, vectord, scripts
- scrum_applier.ts          — referenced from auditor schemas

If you need any of these back, they're in git history. cherry-pick or
git show HEAD~1 -- tests/real-world/<file>.ts will recover the source.
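
For example, to restore one of them into the working tree (assumes this
commit is HEAD, same as the git show above):

  git restore --source=HEAD~1 -- tests/real-world/autonomous_loop.ts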

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
root 2026-05-03 02:05:24 -05:00
parent bb5a3b3f5e
commit 6aafd41785
5 changed files with 0 additions and 1419 deletions

tests/real-world/autonomous_loop.ts
@@ -1,214 +0,0 @@
#!/usr/bin/env bun
// Autonomous scrum loop — wraps scrum_master_pipeline.ts + scrum_applier.ts
// in a goal-driven retry loop. Observer is POSTed an iteration summary at
// every boundary so it can build meta-commentary outside the loop's epistemic
// scope.
//
// Usage:
// LOOP_TARGETS="crates/a/src/x.rs,crates/b/src/y.rs" \
// LOOP_MAX_ITERS=5 \
// LOOP_PUSH=1 \
// bun run tests/real-world/autonomous_loop.ts
//
// Stop conditions: max_iters reached OR 2 consecutive iters with 0 commits.
import { spawn } from "node:child_process";
import { appendFile, readFile } from "node:fs/promises";
import { existsSync } from "node:fs";
const REPO = "/home/profit/lakehouse";
const OBSERVER = process.env.LOOP_OBSERVER ?? "http://localhost:3800";
const BRANCH = process.env.LOOP_BRANCH ?? "scrum/auto-apply-19814";
const MAX_ITERS = Number(process.env.LOOP_MAX_ITERS ?? 3);
const PUSH = process.env.LOOP_PUSH === "1";
const MIN_CONF = process.env.LOOP_MIN_CONF ?? "85";
// Optional override — when unset, let scrum_applier.ts use ITS default
// (currently x-ai/grok-4.1-fast on openrouter). The prior hardcoded
// qwen3-coder:480b default was clobbering the applier's own default
// and forcing every iter to hit the throttled ollama_cloud account.
const APPLIER_MODEL = process.env.LOOP_APPLIER_MODEL;
const APPLIER_PROVIDER = process.env.LOOP_APPLIER_PROVIDER;
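// Example override (values illustrative; normally leave both unset):
//   LOOP_APPLIER_MODEL="qwen3-coder:480b" LOOP_APPLIER_PROVIDER="ollama_cloud" \
//   bun run tests/real-world/autonomous_loop.ts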
const TARGETS = (process.env.LOOP_TARGETS ?? "crates/queryd/src/service.rs,crates/gateway/src/main.rs,crates/gateway/src/v1/mod.rs")
.split(",").map(s => s.trim()).filter(Boolean);
const FORENSIC = process.env.LH_SCRUM_FORENSIC ?? `${REPO}/docs/SCRUM_FORENSIC_PROMPT.md`;
const PROPOSAL = process.env.LH_SCRUM_PROPOSAL ?? `${REPO}/docs/SCRUM_FIX_WAVE.md`;
const LOOP_ID = `loop_${Date.now().toString(36)}`;
const JOURNAL = `${REPO}/data/_kb/autonomous_loops.jsonl`;
interface IterResult {
iter: number;
scrum_reviews_added: number;
applier_outcomes: Record<string, number>;
commits_landed: number;
commit_shas: string[];
build_status: "green" | "red" | "unknown";
duration_ms: number;
}
function log(msg: string) {
const ts = new Date().toISOString().slice(11, 19);
console.log(`[loop ${LOOP_ID} ${ts}] ${msg}`);
}
function runCmd(cmd: string, args: string[], env: Record<string, string> = {}): Promise<{ code: number; stdout: string; stderr: string }> {
return new Promise((resolve) => {
const child = spawn(cmd, args, { cwd: REPO, env: { ...process.env, ...env } });
let stdout = "", stderr = "";
child.stdout.on("data", (d) => { stdout += d; process.stdout.write(d); });
child.stderr.on("data", (d) => { stderr += d; process.stderr.write(d); });
child.on("close", (code) => resolve({ code: code ?? -1, stdout, stderr }));
});
}
async function countLines(path: string): Promise<number> {
if (!existsSync(path)) return 0;
const text = await readFile(path, "utf8");
return text.split("\n").filter(Boolean).length;
}
async function gitHeadSha(): Promise<string> {
const r = await runCmd("git", ["rev-parse", "HEAD"]);
return r.stdout.trim();
}
async function commitsSince(baseSha: string): Promise<string[]> {
const r = await runCmd("git", ["log", "--oneline", `${baseSha}..HEAD`]);
return r.stdout.trim().split("\n").filter(Boolean);
}
async function cargoCheckGreen(): Promise<boolean> {
log("cargo check --workspace …");
const r = await runCmd("cargo", ["check", "--workspace", "--quiet"]);
return r.code === 0;
}
async function postObserver(payload: object) {
try {
const r = await fetch(`${OBSERVER}/event`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify(payload),
signal: AbortSignal.timeout(5000),
});
if (!r.ok) log(`observer POST returned ${r.status}`);
} catch (e: any) {
log(`observer POST failed: ${e.message}`);
}
}
async function runIter(iter: number, baseSha: string): Promise<IterResult> {
const t0 = Date.now();
log(`══ iter ${iter} start (base ${baseSha.slice(0, 8)}) targets=${TARGETS.length}`);
const reviewsBefore = await countLines(`${REPO}/data/_kb/scrum_reviews.jsonl`);
const applyBefore = await countLines(`${REPO}/data/_kb/auto_apply.jsonl`);
log(`scrum_master_pipeline.ts → ${TARGETS.length} files`);
await runCmd("bun", ["run", "tests/real-world/scrum_master_pipeline.ts"], {
LH_SCRUM_FILES: TARGETS.join(","),
LH_SCRUM_FORENSIC: FORENSIC,
LH_SCRUM_PROPOSAL: PROPOSAL,
});
log(`scrum_applier.ts COMMIT=1 MIN_CONF=${MIN_CONF} files=${TARGETS.length}`);
// Only forward model/provider when explicitly overridden — otherwise
// let scrum_applier.ts use its own defaults (Grok 4.1 fast on openrouter).
const applierEnv: Record<string, string> = {
LH_APPLIER_COMMIT: "1",
LH_APPLIER_MIN_CONF: MIN_CONF,
LH_APPLIER_MAX_FILES: String(TARGETS.length),
LH_APPLIER_BRANCH: BRANCH,
// Constrain applier to THIS iter's targets so it patches what we
// just reviewed instead of the highest-confidence file from history.
LH_APPLIER_FILES: TARGETS.join(","),
};
if (APPLIER_MODEL) applierEnv.LH_APPLIER_MODEL = APPLIER_MODEL;
if (APPLIER_PROVIDER) applierEnv.LH_APPLIER_PROVIDER = APPLIER_PROVIDER;
await runCmd("bun", ["run", "tests/real-world/scrum_applier.ts"], applierEnv);
const reviewsAfter = await countLines(`${REPO}/data/_kb/scrum_reviews.jsonl`);
const applyAfterText = existsSync(`${REPO}/data/_kb/auto_apply.jsonl`)
? await readFile(`${REPO}/data/_kb/auto_apply.jsonl`, "utf8")
: "";
const applyRows = applyAfterText.split("\n").filter(Boolean).slice(applyBefore);
const outcomes: Record<string, number> = {};
for (const line of applyRows) {
try {
const o = JSON.parse(line);
outcomes[o.action ?? "?"] = (outcomes[o.action ?? "?"] ?? 0) + 1;
} catch { /* skip malformed */ }
}
const commitShas = await commitsSince(baseSha);
const buildStatus = commitShas.length > 0 ? (await cargoCheckGreen() ? "green" : "red") : "unknown";
const result: IterResult = {
iter,
scrum_reviews_added: reviewsAfter - reviewsBefore,
applier_outcomes: outcomes,
commits_landed: commitShas.length,
commit_shas: commitShas.map(s => s.split(" ")[0]),
build_status: buildStatus,
duration_ms: Date.now() - t0,
};
log(`iter ${iter} done — reviews+${result.scrum_reviews_added} commits=${result.commits_landed} build=${buildStatus} (${(result.duration_ms / 1000).toFixed(1)}s)`);
await postObserver({
source: "autonomous_loop",
loop_id: LOOP_ID,
event_kind: "iteration_complete",
iter,
targets: TARGETS,
success: buildStatus !== "red",
scrum_reviews_added: result.scrum_reviews_added,
applier_outcomes: result.applier_outcomes,
commits_landed: result.commits_landed,
commit_shas: result.commit_shas,
build_status: buildStatus,
duration_ms: result.duration_ms,
ts: new Date().toISOString(),
});
await appendFile(JOURNAL, JSON.stringify({ loop_id: LOOP_ID, ...result, ts: new Date().toISOString() }) + "\n");
return result;
}
async function main() {
log(`autonomous loop starting · branch=${BRANCH} max_iters=${MAX_ITERS} push=${PUSH}`);
log(`targets: ${TARGETS.join(", ")}`);
const branchR = await runCmd("git", ["branch", "--show-current"]);
if (branchR.stdout.trim() !== BRANCH) {
log(`ERROR: on branch ${branchR.stdout.trim()}, expected ${BRANCH}. Refusing to run.`);
process.exit(1);
}
let consecutiveZero = 0;
for (let iter = 1; iter <= MAX_ITERS; iter++) {
const baseSha = await gitHeadSha();
const result = await runIter(iter, baseSha);
if (PUSH && result.commits_landed > 0) {
log(`git push origin ${BRANCH}`);
const pushR = await runCmd("git", ["push", "origin", BRANCH]);
if (pushR.code !== 0) log(`push failed (continuing): ${pushR.stderr.slice(0, 200)}`);
}
consecutiveZero = result.commits_landed === 0 ? consecutiveZero + 1 : 0;
if (consecutiveZero >= 2) {
log(`STOP: 2 consecutive iters with 0 commits. Loop converged or stuck.`);
break;
}
}
log(`loop ${LOOP_ID} complete. Journal: ${JOURNAL}`);
}
main().catch((e) => {
log(`FATAL: ${e.message}`);
process.exit(1);
});

tests/real-world/consensus_reducer_design.ts
@@ -1,224 +0,0 @@
// consensus_reducer_design.ts — N=3 design consultation.
//
// J's ask: enhance the tree-split reducer to preserve FULL backtrack-able
// context (endpoints tried, attempt count per model in the ladder, KB
// sources retrieved, context7 bridge hits, MCP observer signals, audit
// verdicts) instead of collapsing to a summary. Then index the full
// context through our existing vectord matrix indexing (HNSW + Lance +
// playbook_memory) so successful pathways become hot-swappable — the
// system asks "what did we try, what worked, in what order" for a
// similar task class and gets a ranked playbook back.
//
// Before building, consult three diverse models and print their proposals
// side-by-side so we can pick the convergent design.
const GATEWAY = "http://localhost:3100";
const DESIGN_BRIEF = `
# Context: the Lakehouse signal→commit loop
We run 6x scrum-master iterations that audit Rust crates for PRD
alignment, produce findings + confidence, and feed an auto-applier that
lands small mechanical commits through a cargo-green-and-warning-stable
gate. Key components:
- \`tests/real-world/scrum_master_pipeline.ts\` — orchestrator. 9-rung
model LADDER (kimi-k2:1t → qwen3-coder:480b → deepseek → mistral-large →
gpt-oss:120b → qwen3.5:397b → openrouter free rescues → local
qwen3.5:latest). Each target file retrieves 5 PRD chunks + 5
proposal-doc chunks via vectord RAG, tree-splits large files into 3.5K
shards, asks each rung in order, accepts first response passing
structural checks.
- \`mcp-server/observer.ts\` — receives scrum \`/event\` emissions
(file, verdict, critical_failures_count, gradient_tier, attempts,
reviewer_model, tree_split_fired). Escalates failure clusters to LLM
Team by POSTing to /v1/chat with qwen3-coder:480b.
- \`context7-bridge\` — external library docs lookup.
- \`auditor/audit.ts\` — independent N=3 consensus re-check of scrum
findings; writes to data/_kb/audit_facts.jsonl via LLM Team
\`/api/run?mode=extract\`.
- \`crates/vectord/src/playbook_memory.rs\` — indexing for proven
playbooks: PlaybookEntry, DocRef, FailureRecord, BoostEntry,
PatternReport. Uses HNSW index + Lance columnar backend + promotion
pipeline. Already battle-tested for workers/staffing queries.
- Tree-split REDUCER: after shards return map-style summaries, they are
concatenated with internal §N§ markers and fed to a reviewer model to
produce ONE file-level review. Currently the reducer sees summaries,
not the full context behind each shard's conclusion.
# The problem
The reducer currently TRUNCATES to a short summary. When the auditor or
a future iteration wants to backtrack WHY the reducer concluded what it
did (which attempt succeeded, which failed, what KB chunks were
retrieved, what observer signal classified the file as LOOPING vs
CONVERGING), that context is lost. So:
1. Auditor can't verify citation provenance beyond the summary line.
2. Applier can't tell a "tried X, failed, qwen fixed it" playbook from a
"tried X and it was easy" playbook they look identical downstream.
3. The matrix indexing is only used for RAG chunks during the scrum
pass, NOT for storing the full end-to-end pathway of a successful
review.
# The design question
Propose an enhanced reducer + indexing design that:
(a) Preserves the FULL backtrack context per reviewed file:
- every ladder attempt (model, ms, accepted_y/n, reject_reason)
- every retrieved KB chunk (source doc, chunk id, cosine score, rank)
- every observer signal (class, priors, prior-iter outcomes)
- every context7 bridge hit (library, version pulled)
- every sub-pipeline call (LLM Team extract results, audit consensus)
(b) Stores this pathway into vectord's matrix indexing alongside the
review verdict so it becomes retrievable by similarity. When a new
file's fingerprint (task_class + file-path prefix + signal class)
matches a past successful pathway, the system can hot-swap by
replaying or short-circuiting to the model/KB combo that worked.
(c) Surfaces the matrix-index hit rate as a feedback signal on the
scrum's UI "this file was solved 3 times before by the same ladder
rung; consider short-circuiting to rung 5."
(d) Is compatible with the existing playbook_memory.rs primitives
(PlaybookEntry, DocRef, FailureRecord, BoostEntry): extend, don't
replace. The indexing layer is in production for workers/staffing;
we want the reducer pathway to piggyback on proven infrastructure.
# Constraints
- NO new crate. Extend vectord + scrum_master_pipeline.
- Full context can be LARGE: a reviewed file might have 5 retrievals,
4 ladder attempts, 8 observer priors. Design the embedding /
fingerprint so similar-but-not-identical pathways cluster.
- The reducer summary is still needed for the reviewer LLM input;
don't remove it, ADD the full-context sidecar.
- Audit trail: every pathway must be replayable deterministically from
what's stored (i.e., enough context to re-run without the original
prompt cache).
# Required output (STRICT JSON, no prose, no markdown fences):
{
"approach": "one-paragraph summary of your proposed design",
"data_model": {
"new_fields_on_playbook_entry": [...],
"new_types": [ {"name": "...", "purpose": "...", "fields": [...]} ]
},
"storage_strategy": {
"what_to_vectorize": "the text that becomes the embedding",
"fingerprint_key": "the deterministic key for similarity retrieval",
"backend": "HNSW, Lance, playbook_memory — pick"
},
"reducer_changes": {
"inputs_added": [...],
"outputs_added": [...],
"compatibility_notes": "how existing callers stay working"
},
"hot_swap_logic": "concrete rule for when to skip the ladder and replay a past pathway",
"ui_signal": "what to surface so J sees whether matrix indexing is earning its keep",
"risks": [...],
"why_this_beats_summarization": "one-paragraph argument"
}
`.trim();
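// A TS mirror of the "Required output" schema in the brief above, kept
// here purely for readability. Documentary only: the script parses
// responses dynamically and never validates against this type, and the
// array item types are assumptions (the brief leaves them open).
interface DesignProposal {
  approach: string;
  data_model: {
    new_fields_on_playbook_entry: string[];
    new_types: Array<{ name: string; purpose: string; fields: string[] }>;
  };
  storage_strategy: {
    what_to_vectorize: string;
    fingerprint_key: string;
    backend: string;
  };
  reducer_changes: {
    inputs_added: string[];
    outputs_added: string[];
    compatibility_notes: string;
  };
  hot_swap_logic: string;
  ui_signal: string;
  risks: string[];
  why_this_beats_summarization: string;
}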
interface Probe {
name: string;
provider: "ollama" | "ollama_cloud" | "openrouter";
model: string;
}
// Round-3 probe set — 4 probes covering the remaining ladder rungs +
// architecture/provider diversity. J wanted all 4 of the untouched
// options so the aggregated 10-model signal is saturated across the
// usable ladder.
const PROBES: Probe[] = [
{ name: "qwen35-397b", provider: "ollama_cloud", model: "qwen3.5:397b" },
{ name: "openrouter-gpt-oss", provider: "openrouter", model: "openai/gpt-oss-120b:free" },
{ name: "openrouter-gemma3", provider: "openrouter", model: "google/gemma-3-27b-it:free" },
{ name: "qwen3-coder-480b-2", provider: "ollama_cloud", model: "qwen3-coder:480b" }, // second probe of the coding specialist — stability check
];
async function ask(p: Probe): Promise<{ name: string; raw: string; ms: number; error?: string }> {
const started = Date.now();
try {
const r = await fetch(`${GATEWAY}/v1/chat`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({
provider: p.provider,
model: p.model,
messages: [
{ role: "system", content: "You are a senior architect. Output STRICT JSON only." },
{ role: "user", content: DESIGN_BRIEF },
],
max_tokens: 3000,
temperature: 0,
}),
});
const ms = Date.now() - started;
if (!r.ok) return { name: p.name, raw: "", ms, error: `HTTP ${r.status}: ${(await r.text()).slice(0, 200)}` };
const j = await r.json();
const content = j.content ?? j.message?.content ?? j.choices?.[0]?.message?.content ?? "";
return { name: p.name, raw: String(content), ms };
} catch (e: any) {
return { name: p.name, raw: "", ms: Date.now() - started, error: String(e).slice(0, 200) };
}
}
function extractJson(raw: string): any | null {
let s = raw.trim();
const fence = s.match(/^```(?:json)?\s*/);
if (fence) s = s.slice(fence[0].length);
if (s.endsWith("```")) s = s.slice(0, -3).trim();
const first = s.indexOf("{");
const last = s.lastIndexOf("}");
if (first < 0 || last <= first) return null;
try {
return JSON.parse(s.slice(first, last + 1));
} catch {
return null;
}
}
function summarize(obj: any, max = 240): string {
if (!obj) return "(no JSON parsed)";
if (typeof obj === "string") return obj.length > max ? obj.slice(0, max) + "…" : obj;
if (Array.isArray(obj)) return obj.map((x) => summarize(x, max)).join("; ");
return Object.entries(obj)
.map(([k, v]) => `${k}=${summarize(v, max)}`)
.join(" | ");
}
async function main() {
console.log(`\n── N=${PROBES.length} design consensus ──`);
console.log(`models: ${PROBES.map((p) => p.model).join(", ")}\n`);
const results = await Promise.all(PROBES.map(ask));
for (const r of results) {
console.log(`\n── ${r.name} (${r.ms}ms) ──`);
if (r.error) { console.log(` ERROR: ${r.error}`); continue; }
const j = extractJson(r.raw);
if (!j) {
console.log(` raw (no JSON): ${r.raw.slice(0, 600)}`);
continue;
}
console.log(` approach: ${summarize(j.approach, 400)}`);
console.log(` fingerprint: ${summarize(j.storage_strategy?.fingerprint_key, 200)}`);
console.log(` vectorize: ${summarize(j.storage_strategy?.what_to_vectorize, 200)}`);
console.log(` backend: ${summarize(j.storage_strategy?.backend, 200)}`);
console.log(` hot_swap: ${summarize(j.hot_swap_logic, 300)}`);
console.log(` new_types: ${summarize(j.data_model?.new_types, 400)}`);
console.log(` risks: ${summarize(j.risks, 300)}`);
console.log(` why>summary: ${summarize(j.why_this_beats_summarization, 300)}`);
}
// Write full JSON to disk so we can inspect later.
const outPath = `/home/profit/lakehouse/data/_kb/consensus_reducer_design_${Date.now().toString(36)}.json`;
await Bun.write(outPath, JSON.stringify(results, null, 2));
console.log(`\nfull responses → ${outPath}`);
}
await main();

tests/real-world/enrich_prd_pipeline.ts
@@ -1,528 +0,0 @@
// Real-world architecture stress test — 6 iterations of the full pipeline
// against the PRD as a corpus. Goal: prove at scale what Phase 21
// promised (context continuation + tree-split), plus Phase 19
// compounding across iterations.
//
// Run: bun run tests/real-world/enrich_prd_pipeline.ts
//
// No mocks. No skipped layers. On any error, the test triggers
// cloud-rescue rather than fail — it's the architecture's job to
// recover. The test FAILS only if we can't complete 6 iterations.
import { readFile, writeFile, mkdir } from "node:fs/promises";
import { createHash } from "node:crypto";
const PRD_PATH = "/home/profit/lakehouse/docs/PRD.md";
const CHUNK_SIZE = 800; // chars per chunk — ~200 tokens
const CHUNK_OVERLAP = 120;
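// Each step advances CHUNK_SIZE - CHUNK_OVERLAP = 680 chars, so adjacent
// chunks share a 120-char overlap and a sentence cut at one boundary
// still appears whole in a neighbor.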
const TOP_K_RETRIEVE = 12; // chunks per iteration — pulled up to force overflow
const CONTEXT_BUDGET_CHARS = 4000; // tight budget — forces tree-split on every iter
const INJECT_FAIL_ON_ITER = 3; // force the TASK-retry loop on iter 3
// Continuation controls (per-cloud-call) — used for output-overflow.
// Separate from the task-retry loop (per-task) — that handles errors
// across attempts.
const PRIMARY_MAX_TOKENS = 150; // tight — forces truncation
const CONTINUATION_MAX_TOKENS = 300; // each continuation doubles headroom
const MAX_CONTINUATIONS = 6; // max stitch pieces per cloud call
// Task-level retry loop (J's clarification, 2026-04-22):
// When a TASK errors, retry the whole task up to 6 times. Each
// retry gets prior attempts' failures injected as learning context,
// so attempt N+1 is informed by what N failed at. The loop caps at
// 6 to avoid infinite spinning on genuinely unsolvable tasks.
const MAX_TASK_RETRIES = 6;
// To FORCE the retry loop on iter INJECT_FAIL_ON_ITER, cycle through
// 5 deliberately-invalid models + 1 valid one. Attempts 1-5 will
// 502/404 from Ollama Cloud; attempt 6 finally succeeds. Proves the
// loop fires all 6 with compounding failure context.
const FORCE_RETRY_MODEL_SEQUENCE = [
"deliberately-invalid-model-attempt-1",
"deliberately-invalid-model-attempt-2",
"deliberately-invalid-model-attempt-3",
"deliberately-invalid-model-attempt-4",
"deliberately-invalid-model-attempt-5",
"gpt-oss:20b", // 6th attempt succeeds
];
const GATEWAY = "http://localhost:3100";
const SIDECAR = "http://localhost:3200";
const CLOUD_MODEL = "gpt-oss:120b";
const RESCUE_MODEL = "gpt-oss:20b"; // fallback local cloud model via sidecar
const RUN_NONCE = Date.now().toString(36);
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/${RUN_NONCE}`;
// The 6 progressively-compounding questions. #6 explicitly requires
// synthesis across prior 5 answers.
const QUESTIONS: string[] = [
"Summarize the Lakehouse project's one-paragraph thesis: what problem does it solve, what's the unique approach?",
"How does Phase 19 playbook memory turn successful fills into a signal that boosts future rankings?",
"Explain the role of Phase 24 observer in the learning loop — what does it see, and what does it feed into?",
"What's the VRAM-aware profile swap mechanism in Phase 17, and why does it matter for multi-model serving?",
"How do Phase 25 validity windows and Phase 27 playbook versioning interact when a schema drifts?",
"Synthesize the prior 5 answers: how do the pieces (playbook memory, observer, profile swap, validity windows, versioning) compose into a system that measurably gets smarter over time? Cite specific prior answers.",
];
type Chunk = { id: string; text: string; embedding: number[]; offset: number };
interface IterationResult {
iteration: number;
question: string;
retrieval_top_k: number;
context_chars_before_budget: number;
tree_split_fired: boolean;
cloud_calls_total: number;
continuation_retries: number;
rescue_triggered: boolean;
// Task-level retry telemetry
task_attempts_made: number; // how many attempts fired (1 = first succeeded)
task_retry_history: Array<{ n: number; model: string; error: string }>;
playbook_id: string | null;
tokens_prompt: number;
tokens_completion: number;
citations_from_prior_iterations: string[];
duration_ms: number;
answer_preview: string;
errors_recovered: string[];
}
function log(msg: string) { console.log(`[enrich] ${msg}`); }
function sleep(ms: number) { return new Promise(r => setTimeout(r, ms)); }
function cosine(a: number[], b: number[]): number {
let dot = 0, na = 0, nb = 0;
for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
if (na === 0 || nb === 0) return 0;
return dot / (Math.sqrt(na) * Math.sqrt(nb));
}
function hash(s: string): string { return createHash("sha256").update(s).digest("hex").slice(0, 10); }
async function embedBatch(texts: string[]): Promise<number[][]> {
// Sidecar /embed accepts a list; any non-OK response throws.
const r = await fetch(`${SIDECAR}/embed`, {
method: "POST", headers: { "content-type": "application/json" },
body: JSON.stringify({ texts }),
signal: AbortSignal.timeout(120000),
});
if (!r.ok) throw new Error(`embed batch ${r.status}: ${await r.text()}`);
const j: any = await r.json();
return j.embeddings;
}
function chunkText(text: string): Array<{ text: string; offset: number }> {
const out: Array<{ text: string; offset: number }> = [];
let i = 0;
while (i < text.length) {
const end = Math.min(i + CHUNK_SIZE, text.length);
const slice = text.slice(i, end).trim();
if (slice.length > 50) out.push({ text: slice, offset: i });
if (end >= text.length) break;
i = end - CHUNK_OVERLAP;
}
return out;
}
async function chat(opts: {
provider: "ollama" | "ollama_cloud",
model: string,
messages: Array<{ role: string; content: string }>,
max_tokens: number,
think: boolean,
}): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; finish_reason: string }> {
const r = await fetch(`${GATEWAY}/v1/chat`, {
method: "POST", headers: { "content-type": "application/json" },
body: JSON.stringify({ ...opts }),
signal: AbortSignal.timeout(180000),
});
if (!r.ok) throw new Error(`/v1/chat ${r.status}: ${await r.text()}`);
const j: any = await r.json();
return {
content: j.choices?.[0]?.message?.content ?? "",
prompt_tokens: j.usage?.prompt_tokens ?? 0,
completion_tokens: j.usage?.completion_tokens ?? 0,
finish_reason: j.choices?.[0]?.finish_reason ?? "?",
};
}
// ─── Tree-split over oversized chunk set ──────────────────────────
async function treeSplitSummarize(
chunks: Chunk[],
question: string,
): Promise<{ scratchpad: string; cloud_calls: number }> {
// Shard into groups fitting within half the budget each.
const perShard = Math.max(1, Math.floor((CONTEXT_BUDGET_CHARS / 2) / CHUNK_SIZE));
const shards: Chunk[][] = [];
for (let i = 0; i < chunks.length; i += perShard) {
shards.push(chunks.slice(i, i + perShard));
}
log(` tree-split: ${chunks.length} chunks → ${shards.length} shards of up to ${perShard}`);
let scratchpad = "";
let cloud_calls = 0;
for (let si = 0; si < shards.length; si++) {
const shard = shards[si];
const shardText = shard.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n");
const userMsg = `Question: ${question}\n\nShard ${si + 1}/${shards.length} of source material:\n\n${shardText}\n\nScratchpad so far:\n${scratchpad || "(empty)"}\n\nUpdate the scratchpad: extract only facts from THIS shard that help answer the question. Be terse. No prose.`;
const r = await chat({
provider: "ollama_cloud",
model: CLOUD_MODEL,
messages: [
{ role: "system", content: "You maintain a concise factual scratchpad across multiple shards of source text. No prose outside the scratchpad. Each shard, append ≤80 words of relevant facts." },
{ role: "user", content: userMsg },
],
max_tokens: 500,
think: false,
});
cloud_calls += 1;
scratchpad += `\n--- shard ${si + 1} notes ---\n${r.content.trim()}`;
if (scratchpad.length > CONTEXT_BUDGET_CHARS) {
// drop the oldest notes; keep only the newest CONTEXT_BUDGET_CHARS chars
scratchpad = scratchpad.slice(-CONTEXT_BUDGET_CHARS);
log(` tree-split: scratchpad truncated to ${scratchpad.length} chars`);
}
}
return { scratchpad, cloud_calls };
}
// ─── Continuable generate — up to max_continuations stitches ──────
//
// Two failure modes handled:
// A) Empty response — typically thinking model burned the budget
// on hidden reasoning. Retry with 2× max_tokens.
// B) Truncated response (finish_reason=length) — answer got cut off
// mid-sentence. Pass the partial back as scratchpad and ask the
// model to continue from where it stopped.
//
// Stitching: keep appending content across retries; prompt_tokens and
// completion_tokens accumulate; finish_reason reflects the LAST call.
// Loop exits on the first call that finishes cleanly (stop) with
// non-empty content, OR when retries hit the cap.
async function generateContinuable(
opts: Parameters<typeof chat>[0] & { max_continuations?: number },
): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string }> {
const maxCont = opts.max_continuations ?? 1;
let total = await chat(opts);
let retries = 0;
while (retries < maxCont && (total.content.length === 0 || total.finish_reason === "length")) {
retries += 1;
const mode = total.content.length === 0 ? "empty" : "truncated";
log(` continuation retry ${retries}/${maxCont} (${mode}: finish=${total.finish_reason}, content=${total.content.length} chars)`);
// Continuation prompt — branch on failure mode:
// empty → retry with 2× tokens, same prompt (thinking budget)
// length → pass the partial as assistant turn, ask to continue
const continuationMessages = total.content.length === 0
? opts.messages
: [
...opts.messages,
{ role: "assistant", content: total.content },
{ role: "user", content: "Continue from exactly where you stopped. Do not repeat. Finish the answer." },
];
const continued = await chat({
...opts,
max_tokens: CONTINUATION_MAX_TOKENS,
messages: continuationMessages,
});
total = {
content: total.content + continued.content,
prompt_tokens: total.prompt_tokens + continued.prompt_tokens,
completion_tokens: total.completion_tokens + continued.completion_tokens,
finish_reason: continued.finish_reason,
};
}
return { ...total, continuation_retries: retries };
}
// ─── Single iteration: retrieve → budget-check → chat → seed ─────
async function runIteration(
iteration: number,
question: string,
allChunks: Chunk[],
priorPlaybookIds: string[],
priorAnswers: string[],
): Promise<IterationResult> {
const started = Date.now();
const errorsRecovered: string[] = [];
log(`iter ${iteration}: "${question.slice(0, 70)}..."`);
// 1. Embed the question
const qEmb = (await embedBatch([question]))[0];
// 2. Retrieve top-K chunks by cosine
const scored = allChunks
.map(c => ({ c, score: cosine(qEmb, c.embedding) }))
.sort((a, b) => b.score - a.score)
.slice(0, TOP_K_RETRIEVE);
const chunks = scored.map(x => x.c);
log(` retrieved top ${chunks.length} chunks (score range ${scored[0].score.toFixed(3)} .. ${scored[scored.length - 1].score.toFixed(3)})`);
// 3. Context budget check — tree-split if over
const contextChars = chunks.map(c => c.text).join("\n\n").length;
let contextForPrompt: string;
let treeSplit = false;
let cloudCallsTotal = 0;
if (contextChars > CONTEXT_BUDGET_CHARS) {
treeSplit = true;
log(` context ${contextChars} chars > budget ${CONTEXT_BUDGET_CHARS} → tree-split`);
const { scratchpad, cloud_calls } = await treeSplitSummarize(chunks, question);
contextForPrompt = `Distilled scratchpad from ${chunks.length} source chunks (too large to fit directly):\n${scratchpad}`;
cloudCallsTotal += cloud_calls;
} else {
contextForPrompt = chunks.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n");
}
// 4. Seed prompt with prior iteration answers (real compounding).
// Not just IDs — the model needs the CONTENT to synthesize.
let citationBlock = "";
let citationsReceived: string[] = [];
if (priorPlaybookIds.length > 0 && priorAnswers.length > 0) {
const lines = priorAnswers.map((ans, i) => {
const pid = priorPlaybookIds[i]?.slice(0, 12) ?? "unknown";
// Trim each prior answer to ~400 chars so we don't blow budget
return `[pb:${pid}] iter ${i + 1} answer:\n${ans.slice(0, 400)}\n`;
});
citationBlock = `\n\n═══ PRIOR ITERATIONS (compounding context) ═══\n${lines.join("\n")}═══ end prior iterations ═══\n\nYour answer MUST cite specific prior iterations using [pb:ID] notation when drawing on them. Synthesis questions require explicit cross-iteration reasoning.`;
citationsReceived = priorPlaybookIds.slice();
}
// 5. TASK-LEVEL RETRY LOOP — per J's clarification 2026-04-22.
// Try the task up to MAX_TASK_RETRIES times. Each retry:
// a) Picks a model (normally CLOUD_MODEL; on INJECT_FAIL_ON_ITER,
// cycles through 5 invalid models + 1 valid to force full loop)
// b) Injects prior attempt errors as learning context
// c) If the attempt succeeds (non-empty, >100 chars), loop exits
// d) Otherwise, records failure and tries again with the learning
//
// Cap at 6 so we don't spin forever on unsolvable tasks.
let result: { content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string } | null = null;
let rescueTriggered = false;
const taskAttemptHistory: Array<{ n: number; model: string; error: string }> = [];
const forceRetries = iteration === INJECT_FAIL_ON_ITER;
if (forceRetries) log(` FORCING TASK-RETRY LOOP — iter ${iteration} will cycle through 5 invalid models + 1 valid`);
for (let attempt = 1; attempt <= MAX_TASK_RETRIES; attempt++) {
const modelForAttempt = forceRetries
? FORCE_RETRY_MODEL_SEQUENCE[attempt - 1]
: CLOUD_MODEL;
// Compose a prior-attempts learning block for attempts 2+
const learningBlock = taskAttemptHistory.length > 0
? `\n\n═══ PRIOR ATTEMPTS THIS TASK (do NOT repeat these failures; adjust approach) ═══\n${taskAttemptHistory.map(a => `Attempt ${a.n} (model ${a.model}) failed: ${a.error.slice(0, 160)}`).join("\n")}\n═══ end prior attempts ═══\n`
: "";
log(` task attempt ${attempt}/${MAX_TASK_RETRIES}: model=${modelForAttempt}${learningBlock ? " [with prior-failure context]" : ""}`);
try {
const r = await generateContinuable({
provider: "ollama_cloud",
model: modelForAttempt,
messages: [
{ role: "system", content: "You answer questions about the Lakehouse PRD using only the provided source material and prior iteration answers. Be specific. Cite chunk offsets OR [pb:ID] markers. Write a detailed 250-word answer." },
{ role: "user", content: `Question: ${question}\n\nSource material:\n${contextForPrompt}${citationBlock}${learningBlock}` },
],
max_tokens: PRIMARY_MAX_TOKENS,
think: false,
max_continuations: MAX_CONTINUATIONS,
});
cloudCallsTotal += 1 + r.continuation_retries;
if (r.content && r.content.length > 100) {
// Acceptable answer — exit loop
result = r;
if (attempt > 1) {
log(` task attempt ${attempt} SUCCEEDED (${r.content.length} chars) after ${attempt - 1} prior failures`);
rescueTriggered = true;
}
break;
}
// Thin response — count as failure with learning signal
const err = `thin-answer: ${r.content.length} chars, finish=${r.finish_reason}`;
taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
errorsRecovered.push(`attempt ${attempt}: ${err}`);
} catch (e) {
const err = (e as Error).message;
taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
errorsRecovered.push(`attempt ${attempt}: ${err.slice(0, 120)}`);
cloudCallsTotal += 1;
}
}
// Last-ditch: if all 6 task attempts failed, try the local fallback
// once more so we at least return SOMETHING. This is the "don't get
// caught in a loop, accept best-so-far" rule J stated explicitly.
if (!result) {
errorsRecovered.push(`all ${MAX_TASK_RETRIES} task attempts failed — local fallback`);
rescueTriggered = true;
try {
result = await generateContinuable({
provider: "ollama",
model: "qwen3.5:latest",
messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
max_tokens: 300,
think: false,
max_continuations: 2,
});
cloudCallsTotal += 1 + result.continuation_retries;
} catch (e) {
// Absolute last resort — fabricate a skeleton result
result = {
content: `[task failed after ${MAX_TASK_RETRIES} retries + local fallback: ${(e as Error).message}]`,
prompt_tokens: 0,
completion_tokens: 0,
continuation_retries: 0,
finish_reason: "error",
};
}
}
if (result.content.length === 0) {
errorsRecovered.push("even rescue returned empty — last-ditch local fallback");
rescueTriggered = true;
result = await generateContinuable({
provider: "ollama",
model: "qwen3.5:latest",
messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
max_tokens: 300,
think: false,
});
cloudCallsTotal += 1;
}
// 6. Seed playbook with the answer
let playbook_id: string | null = null;
try {
const ts = new Date().toISOString();
const seedOp = `TEST: enrich_prd_run_${RUN_NONCE} iter${iteration} in Corpus, PRD`;
const r = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
method: "POST", headers: { "content-type": "application/json" },
body: JSON.stringify({
operation: seedOp,
approach: `q="${question.slice(0, 80)}" context_chars=${contextChars} tree_split=${treeSplit}`,
context: result.content.slice(0, 600),
endorsed_names: [`iter${iteration}_${RUN_NONCE}`],
append: true,
}),
signal: AbortSignal.timeout(15000),
});
if (r.ok) {
const j: any = await r.json();
playbook_id = j.outcome?.playbook_id ?? null;
} else {
errorsRecovered.push(`seed ${r.status}: ${(await r.text()).slice(0, 100)}`);
}
} catch (e) {
errorsRecovered.push(`seed exception: ${(e as Error).message}`);
}
return {
iteration,
question,
retrieval_top_k: chunks.length,
context_chars_before_budget: contextChars,
tree_split_fired: treeSplit,
cloud_calls_total: cloudCallsTotal,
continuation_retries: result.continuation_retries,
rescue_triggered: rescueTriggered,
task_attempts_made: taskAttemptHistory.length + 1, // +1 for the successful attempt
task_retry_history: taskAttemptHistory,
playbook_id,
tokens_prompt: result.prompt_tokens,
tokens_completion: result.completion_tokens,
citations_from_prior_iterations: citationsReceived,
duration_ms: Date.now() - started,
answer_preview: result.content.slice(0, 500),
errors_recovered: errorsRecovered,
};
}
async function main() {
await mkdir(OUT_DIR, { recursive: true });
log(`run nonce: ${RUN_NONCE}`);
log(`output dir: ${OUT_DIR}`);
// ─── Phase 1: load, chunk, embed the PRD ───────────────────────
log(`loading PRD from ${PRD_PATH}`);
const prd = await readFile(PRD_PATH, "utf8");
log(`PRD: ${prd.length} chars, ${prd.split("\n").length} lines`);
const raw_chunks = chunkText(prd);
log(`chunked into ${raw_chunks.length} pieces (size ${CHUNK_SIZE}, overlap ${CHUNK_OVERLAP})`);
// Embed in batches of 32 to avoid sidecar overload
const allChunks: Chunk[] = [];
const BATCH = 32;
const t0 = Date.now();
for (let i = 0; i < raw_chunks.length; i += BATCH) {
const batch = raw_chunks.slice(i, i + BATCH);
const embs = await embedBatch(batch.map(b => b.text));
for (let j = 0; j < batch.length; j++) {
allChunks.push({
id: hash(batch[j].text),
text: batch[j].text,
embedding: embs[j].map(x => Number(x)),
offset: batch[j].offset,
});
}
log(` embedded ${allChunks.length}/${raw_chunks.length}`);
}
log(`embedded all ${allChunks.length} chunks in ${((Date.now() - t0) / 1000).toFixed(1)}s`);
// ─── Phase 2: 6 iterations ─────────────────────────────────────
const results: IterationResult[] = [];
const priorIds: string[] = [];
const priorAnswers: string[] = [];
for (let i = 1; i <= QUESTIONS.length; i++) {
const q = QUESTIONS[i - 1];
const r = await runIteration(i, q, allChunks, priorIds, priorAnswers);
results.push(r);
if (r.playbook_id) priorIds.push(r.playbook_id);
priorAnswers.push(r.answer_preview);
log(` → iter ${i}: ${r.errors_recovered.length} errors recovered, ${r.continuation_retries} continuations, tree-split=${r.tree_split_fired}, rescue=${r.rescue_triggered}, ${r.duration_ms}ms`);
await writeFile(`${OUT_DIR}/iter_${i}.json`, JSON.stringify(r, null, 2));
}
// Check whether iter 6 actually cited prior pb:IDs in its answer.
// Playbook IDs look like `pb-seed-<hex>` so the regex needs to allow
// hyphens + letters inside the brackets, not just hex chars.
const iter6 = results[5];
const citationsHonored = iter6 ? (iter6.answer_preview.match(/\[pb:[\w-]+\]/gi)?.length ?? 0) : 0;
// ─── Phase 3: summary ──────────────────────────────────────────
const summary = {
run_nonce: RUN_NONCE,
ran_at: new Date().toISOString(),
prd_chars: prd.length,
prd_chunks: allChunks.length,
iterations: results.length,
total_cloud_calls: results.reduce((s, r) => s + r.cloud_calls_total, 0),
total_continuation_retries: results.reduce((s, r) => s + r.continuation_retries, 0),
total_errors_recovered: results.reduce((s, r) => s + r.errors_recovered.length, 0),
tree_splits_fired: results.filter(r => r.tree_split_fired).length,
rescues_triggered: results.filter(r => r.rescue_triggered).length,
iter6_received_prior_ids: results[5]?.citations_from_prior_iterations.length ?? 0,
iter6_actually_cited_in_answer: citationsHonored,
iter3_task_attempts: results[2]?.task_attempts_made ?? 0,
iter3_task_retries: results[2]?.task_retry_history.length ?? 0,
max_task_attempts_any_iter: Math.max(...results.map(r => r.task_attempts_made)),
total_duration_ms: results.reduce((s, r) => s + r.duration_ms, 0),
overall: results.length === 6 && results.every(r => r.playbook_id !== null) ? "PASS" : "PARTIAL",
};
await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));
log("");
log(`══════ SUMMARY ${summary.overall} ══════`);
log(` 6 iterations, ${summary.total_cloud_calls} cloud calls, ${summary.total_errors_recovered} errors recovered`);
log(` tree-splits: ${summary.tree_splits_fired}/6 continuations: ${summary.total_continuation_retries} rescues: ${summary.rescues_triggered}`);
log(` iter 6 received ${summary.iter6_received_prior_ids} prior IDs, cited ${summary.iter6_actually_cited_in_answer} [pb:...] markers in its answer`);
log(` iter 3 task-retry loop: ${summary.iter3_task_attempts} attempts (${summary.iter3_task_retries} prior-failure retries before success)`);
log(` total duration: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
log("");
for (const r of results) {
const flags = [
r.tree_split_fired ? "tree-split" : "",
r.continuation_retries > 0 ? `cont=${r.continuation_retries}` : "",
r.rescue_triggered ? "rescued" : "",
r.errors_recovered.length > 0 ? `err=${r.errors_recovered.length}` : "",
].filter(Boolean).join(" ");
log(` iter ${r.iteration}: ${r.tokens_prompt}+${r.tokens_completion} tok, ${r.duration_ms}ms ${flags ? `[${flags}]` : ""}`);
}
log("");
log(`artifacts: ${OUT_DIR}/{iter_1..6.json, summary.json}`);
process.exit(summary.overall === "PASS" ? 0 : 1);
}
main().catch(e => { console.error("[enrich] fatal:", e); process.exit(2); });

tests/real-world/hard_task_escalation.ts
@@ -1,267 +0,0 @@
// Hard-task escalation test. The task is deliberately constructed so
// that a local 7B model (qwen3.5:latest) will miss at least one of the
// validation rules. Watch the escalation ladder:
// 1. qwen3.5:latest (local 7B) — likely fails
// 2. qwen3:latest (local 7B) — likely fails differently
// 3. gpt-oss:20b (local 20B) — may fail
// 4. gpt-oss:120b (cloud 120B) — should succeed
// 5. devstral-2:123b (cloud 123B coding specialist) w/ prior-attempt
//    errors injected — retry with context
// 6. mistral-large-3:675b (cloud 675B; biggest on the current key, see
//    the LADDER note below); on failure, exit nonzero with every
//    attempt saved to the run artifacts
//
// Each attempt:
// - Calls the model via /v1/chat
// - Validates the output against a strict rubric
// - On fail: records the specific rubric violations + the partial
// output, injects both into the next attempt's prompt as "here's
// what's wrong, fix it specifically"
// - On success: exit loop
//
// Run: bun run tests/real-world/hard_task_escalation.ts
import { writeFile, mkdir } from "node:fs/promises";
const GATEWAY = "http://localhost:3100";
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/hard_task_${Date.now().toString(36)}`;
const MAX_ATTEMPTS = 6;
// The hard task. Specific enough that a small model will miss at
// least one rule. Not purely knowledge-based — it's a code-generation
// task with strict structural constraints.
const TASK = `Write a complete Rust async function with the EXACT signature:
pub async fn check_drift_batched(refs: Vec<DocRef>) -> Result<Vec<String>, String>
It must:
1. Group refs by tool name (case-insensitive; use .to_ascii_lowercase())
2. Issue parallel HTTP GET requests to http://localhost:3900/docs/{tool}/diff?since={snippet_hash}
3. Use reqwest and a JoinSet/Semaphore to cap concurrent in-flight requests at 4
4. On HTTP 5xx, retry with exponential backoff: sleep 250ms, then 500ms, then 1000ms, then give up on that tool
5. Parse the response JSON: {"drifted": bool, ...}. Return a Vec<String> of tool names where drifted == true
6. All errors bubble via ? or Result: NO .unwrap(), NO .expect(), NO panic!()
7. Include rustdoc /// comments on the function and each internal helper
Assume this struct is already imported:
pub struct DocRef { pub tool: String, pub snippet_hash: Option<String>, pub version_seen: String }
Output ONLY the Rust code. No prose, no markdown fences, no explanation. Start directly with the /// doc comment.`;
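// Rules 1-7 above map onto the validate() rubric below, so each
// violation string injected into a retry names the task rule it broke.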
// Escalation ladder — small-local → large-local → cloud → specialist
// cloud → trillion-param cloud. Corrected 2026-04-22 per J:
// gpt-oss:20b is LOCAL (ollama list confirms 13 GB on disk), and the
// final escalation tier should be kimi-k2:1t (the biggest model
// we have access to on Ollama Cloud).
const LADDER: Array<{ provider: "ollama" | "ollama_cloud"; model: string; note: string }> = [
{ provider: "ollama", model: "qwen3.5:latest", note: "local 7B" },
{ provider: "ollama", model: "qwen3:latest", note: "local 7B (different) " },
{ provider: "ollama", model: "gpt-oss:20b", note: "local 20B" }, // FIXED: local, not cloud
{ provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B" },
{ provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B (coding specialist)" },
// NOTE 2026-04-22 — J wanted Kimi as the last escalation but Kimi
// K2.5/K2.6 both return "this model requires a subscription" on our
// current Ollama Cloud key. mistral-large-3:675b is the biggest
// model actually provisioned on this key (verified via direct curl
// to ollama.com/api/generate). Upgrade path: Ollama Cloud Pro →
// swap this line to kimi-k2.5 or kimi-k2.6:cloud.
{ provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B (biggest available on current key; kimi-k2.x needs pro subscription)" },
];
// Validation rubric — the answer must pass all of these to be accepted.
interface RubricResult {
passed: boolean;
violations: string[];
passed_rules: string[];
}
function validate(code: string): RubricResult {
const violations: string[] = [];
const passed: string[] = [];
const check = (rule: string, ok: boolean) => { ok ? passed.push(rule) : violations.push(rule); };
check("has pub async fn check_drift_batched signature",
/pub\s+async\s+fn\s+check_drift_batched\s*\(/.test(code));
check("takes Vec<DocRef> argument",
/refs\s*:\s*Vec\s*<\s*DocRef\s*>/.test(code));
check("returns Result<Vec<String>, String>",
/Result\s*<\s*Vec\s*<\s*String\s*>\s*,\s*String\s*>/.test(code));
check("uses reqwest",
/\breqwest\b/i.test(code));
check("references JoinSet or Semaphore for concurrency",
/\bJoinSet\b|\bSemaphore\b/i.test(code));
check("bounds concurrency at 4",
/\b4\b/.test(code) && (/Semaphore\s*::\s*new\s*\(\s*4\b/.test(code) || /permits\s*:\s*4\b/.test(code) || /limit\s*:\s*4\b/.test(code) || /max\s*:\s*4\b/.test(code) || /capacity\s*:\s*4\b/.test(code)));
// Exponential backoff — models express this several ways. Accept
// any recognizable doubling pattern starting at 250ms. 2026-04-22:
// devstral-2:123b wrote `retry_delay *= 2` which my earlier regex
// rejected even though the code is correct. Broadening rubric to
// match all idiomatic doubling forms.
const hasSeed250 = /Duration\s*::\s*from_millis\s*\(\s*250\b/.test(code)
|| /millis\s*\(\s*250\b/.test(code);
const hasDoublingPattern = /250\s*\*\s*2/.test(code) // 250 * 2^n literal
|| /<<\s*\d+/.test(code) // bit-shift
|| /\.pow\s*\(/.test(code) // 2u32.pow(attempt)
|| /\*=\s*2\b/.test(code) // delay *= 2 ← was missing
|| /\*\s*2\s*;/.test(code) // delay = delay * 2;
|| /saturating_mul\s*\(\s*2\b/.test(code); // saturating doubling
check("has 250ms backoff seed",
hasSeed250);
check("reaches 500ms backoff (literal or doubling from 250)",
/Duration\s*::\s*from_millis\s*\(\s*500\b/.test(code)
|| /millis\s*\(\s*500\b/.test(code)
|| (hasSeed250 && hasDoublingPattern));
check("reaches 1000ms backoff (literal or doubling to 1000)",
/Duration\s*::\s*from_millis\s*\(\s*1000\b/.test(code)
|| /millis\s*\(\s*1000\b/.test(code)
|| (hasSeed250 && hasDoublingPattern));
check("case-insensitive tool grouping (to_ascii_lowercase)",
/to_ascii_lowercase|to_lowercase/.test(code));
check("NO .unwrap() — all errors bubble via ?",
!/\.unwrap\s*\(\s*\)/.test(code));
check("NO .expect(...) — all errors bubble via ?",
!/\.expect\s*\(/.test(code));
check("NO panic!() / unimplemented!() / todo!()",
!/\bpanic!\s*\(|\bunimplemented!\s*\(|\btodo!\s*\(/.test(code));
check("has rustdoc /// comments",
/\/\/\//.test(code));
check("reasonable length (> 500 chars)",
code.length > 500);
return { passed: violations.length === 0, violations, passed_rules: passed };
}
function log(msg: string) { console.log(`[hard] ${msg}`); }
async function chat(opts: {
provider: "ollama" | "ollama_cloud",
model: string,
prompt: string,
}): Promise<{ content: string; error?: string }> {
try {
const r = await fetch(`${GATEWAY}/v1/chat`, {
method: "POST", headers: { "content-type": "application/json" },
body: JSON.stringify({
provider: opts.provider,
model: opts.model,
messages: [{ role: "user", content: opts.prompt }],
max_tokens: 2500,
temperature: 0.2,
think: false,
}),
signal: AbortSignal.timeout(240000),
});
if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 300)}` };
const j: any = await r.json();
return { content: j.choices?.[0]?.message?.content ?? "" };
} catch (e) {
return { content: "", error: (e as Error).message };
}
}
interface AttemptRecord {
n: number;
provider: string;
model: string;
duration_ms: number;
content_chars: number;
error: string | null;
rubric_violations: string[];
rubric_passed: string[];
accepted: boolean;
}
function extractCode(raw: string): string {
// Strip common fence wrappers
const m = raw.match(/```(?:rust)?\s*\n([\s\S]*?)```/);
if (m) return m[1].trim();
return raw.trim();
}
async function main() {
await mkdir(OUT_DIR, { recursive: true });
log(`output: ${OUT_DIR}`);
log(`task: ${TASK.slice(0, 120)}...`);
log("");
const attempts: AttemptRecord[] = [];
let acceptedCode: string | null = null;
for (let i = 0; i < MAX_ATTEMPTS; i++) {
const n = i + 1;
const rung = LADDER[i] ?? LADDER[LADDER.length - 1];
// Build the prompt: base task + prior failures' learning blocks
let priorLearning = "";
if (attempts.length > 0) {
priorLearning = `\n\n═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══\n`;
for (const a of attempts) {
priorLearning += `Attempt ${a.n} (${a.provider}/${a.model}, ${a.content_chars} chars) violations:\n`;
for (const v of a.rubric_violations) priorLearning += ` - ${v}\n`;
if (a.error) priorLearning += ` [error: ${a.error.slice(0, 120)}]\n`;
}
priorLearning += `═══ end prior attempts ═══\n\nDO NOT repeat the above violations. Address each one explicitly.`;
}
log(`attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${priorLearning ? " [w/ learning]" : ""}`);
const t0 = Date.now();
const r = await chat({ provider: rung.provider, model: rung.model, prompt: TASK + priorLearning });
const dur = Date.now() - t0;
const code = extractCode(r.content);
const rubric = code ? validate(code) : { passed: false, violations: ["empty response"], passed_rules: [] };
const record: AttemptRecord = {
n,
provider: rung.provider,
model: rung.model,
duration_ms: dur,
content_chars: code.length,
error: r.error ?? null,
rubric_violations: rubric.violations,
rubric_passed: rubric.passed_rules,
accepted: rubric.passed,
};
attempts.push(record);
log(`${dur}ms, ${code.length} chars, ${rubric.passed_rules.length} rules passed / ${rubric.violations.length} failed${r.error ? `, err: ${r.error.slice(0, 80)}` : ""}`);
for (const v of rubric.violations.slice(0, 5)) log(`${v}`);
await writeFile(`${OUT_DIR}/attempt_${n}.txt`, code);
await writeFile(`${OUT_DIR}/attempt_${n}.json`, JSON.stringify(record, null, 2));
if (rubric.passed) {
log(` ✅ ACCEPTED on attempt ${n}`);
acceptedCode = code;
break;
}
}
const summary = {
task: TASK.slice(0, 200),
total_attempts: attempts.length,
accepted: acceptedCode !== null,
accepted_on_attempt: acceptedCode ? attempts.findIndex(a => a.accepted) + 1 : null,
escalation_path: attempts.map(a => `${a.provider}/${a.model}`),
per_attempt_pass_counts: attempts.map(a => a.rubric_passed.length),
per_attempt_violation_counts: attempts.map(a => a.rubric_violations.length),
total_duration_ms: attempts.reduce((s, a) => s + a.duration_ms, 0),
};
await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));
log("");
log(`═══ RESULT ═══`);
log(`attempts: ${summary.total_attempts}`);
log(`accepted: ${summary.accepted} ${summary.accepted ? `on attempt ${summary.accepted_on_attempt}` : ""}`);
log(`escalation path:`);
for (const [i, a] of attempts.entries()) {
const mark = a.accepted ? "✅" : "❌";
log(` ${mark} attempt ${i + 1}: ${a.provider}/${a.model} · ${a.rubric_passed.length}/${a.rubric_passed.length + a.rubric_violations.length} rules passed, ${a.duration_ms}ms`);
}
log("");
log(`total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
log(`artifacts: ${OUT_DIR}/{attempt_1..N.{txt,json}, summary.json}`);
process.exit(summary.accepted ? 0 : 1);
}
main().catch(e => { console.error("[hard] fatal:", e); process.exit(2); });

tests/real-world/nine_consecutive_audits.ts
@@ -1,186 +0,0 @@
// Nine-consecutive audit runner — empirical test of the predictive-
// compounding property. Runs the audit pipeline 9 times against the
// same PR (each time with a new diff from Gitea), captures the
// verdict + audit_lessons state after each run, and reports whether
// the KB stabilizes or drifts.
//
// What we expect (favorable compounding):
// - signature_count grows sublinearly (same patterns recur, so
// distinct-signature count stabilizes fast)
// - verdict settles on a stable value after run 2-3 (first audit
// establishes baseline, rest repeat)
// - confidence stays LOW for all signatures (same PR repeatedly)
// - NO new recurring findings fire because confidence < 0.3 on
// same-PR noise (kb_index rating policy)
//
// What would indicate drift (the thing we want to prove DOESN'T happen):
// - signature_count grows linearly — each run produces new signatures
// - verdict oscillates (block → approve → block ...)
// - confidence inflates — kb_index rating escalates on repeated runs
//
// Run: bun run tests/real-world/nine_consecutive_audits.ts
import { readFile, writeFile } from "node:fs/promises";
import { join } from "node:path";
import { aggregate } from "../../auditor/kb_index.ts";
import { getPrSnapshot } from "../../auditor/gitea.ts";
import { auditPr } from "../../auditor/audit.ts";
const REPO = "/home/profit/lakehouse";
const AUDIT_LESSONS = `${REPO}/data/_kb/audit_lessons.jsonl`;
const VERDICTS_DIR = `${REPO}/data/_auditor/verdicts`;
const POLL_INTERVAL_MS = 5_000;
const RUNS = Number(process.env.LH_AUDIT_RUNS ?? 9);
const TARGET_PR = Number(process.env.LH_AUDIT_PR ?? 8);
const SKIP_INFERENCE = process.env.LH_AUDITOR_SKIP_INFERENCE !== "0";
const RESET_KB = process.env.LH_RESET_KB === "1";
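// Example: a clean-KB pass against the default PR:
//   LH_RESET_KB=1 bun run tests/real-world/nine_consecutive_audits.ts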
async function waitForVerdict(prNum: number, sha: string, deadlineMs: number): Promise<any> {
const short = sha.slice(0, 12);
const path = join(VERDICTS_DIR, `${prNum}-${short}.json`);
const start = Date.now();
while (Date.now() - start < deadlineMs) {
try {
const raw = await readFile(path, "utf8");
return JSON.parse(raw);
} catch { /* not yet */ }
await Bun.sleep(POLL_INTERVAL_MS);
}
throw new Error(`no verdict file after ${deadlineMs}ms: ${path}`);
}
async function captureAggState(): Promise<{ sig_count: number; max_count: number; max_confidence: number; recurring_max_count: number; top3: Array<{ sig: string; count: number; conf: number; summary: string }> }> {
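// keyFn buckets lessons by recurring-finding signature; scopeFn tags each
// with its PR so the kb_index rating policy can treat same-PR recurrence
// as low-confidence noise (the property this test checks).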
const agg = await aggregate<any>(AUDIT_LESSONS, {
keyFn: (r) => r?.signature,
scopeFn: (r) => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
});
const list = Array.from(agg.values()).sort((a, b) => b.count - a.count);
const recurring = list.filter(r => r.count >= 2);
const recurringMaxCount = recurring.length > 0 ? Math.max(...recurring.map(a => a.count)) : 0;
const recurringMaxConf = recurring.length > 0 ? Math.max(...recurring.map(a => a.confidence)) : 0;
return {
sig_count: list.length,
max_count: list[0]?.count ?? 0,
max_confidence: recurringMaxConf,
recurring_max_count: recurringMaxCount,
top3: list.slice(0, 3).map(a => ({
sig: a.signature,
count: a.count,
conf: a.confidence,
summary: a.representative_summary.slice(0, 80),
})),
};
}
interface RunRecord {
run: number;
sha: string;
verdict_overall: string;
findings_total: number;
findings_block: number;
findings_warn: number;
findings_info: number;
audit_duration_ms: number;
claims_total: number;
claims_empirical: number;
kb_sig_count_after: number;
kb_max_count_after: number;
kb_max_confidence_after: number;
kb_recurring_max_count: number;
}
async function main() {
console.log(`[nine] target PR: #${TARGET_PR}`);
console.log(`[nine] runs: ${RUNS}`);
console.log(`[nine] skip_inference: ${SKIP_INFERENCE}`);
console.log(`[nine] reset_kb: ${RESET_KB}`);
console.log(`[nine] audit_lessons.jsonl: ${AUDIT_LESSONS}`);
if (RESET_KB) {
console.log("[nine] clearing audit_lessons.jsonl for clean test...");
await writeFile(AUDIT_LESSONS, "");
}
console.log("");
const pr = await getPrSnapshot(TARGET_PR);
console.log(`[nine] PR #${pr.number}: "${pr.title}" (head=${pr.head_sha.slice(0, 12)})`);
console.log(`[nine] files in diff: ${pr.files.length}`);
console.log("");
const baseline = await captureAggState();
console.log(`[nine] baseline: sig_count=${baseline.sig_count} max_count=${baseline.max_count} max_conf=${baseline.max_confidence.toFixed(2)}`);
console.log("");
const records: RunRecord[] = [];
for (let n = 1; n <= RUNS; n++) {
const t0 = Date.now();
console.log(`─── run ${n}/${RUNS} ───`);
const verdict = await auditPr(pr, {
dry_run: true,
skip_dynamic: true,
skip_inference: SKIP_INFERENCE,
});
console.log(` sha ${verdict.head_sha.slice(0, 12)}`);
const after = await captureAggState();
const rec: RunRecord = {
run: n,
sha: verdict.head_sha.slice(0, 12),
verdict_overall: String(verdict.overall),
findings_total: Number(verdict.metrics?.findings_total ?? 0),
findings_block: Number(verdict.metrics?.findings_block ?? 0),
findings_warn: Number(verdict.metrics?.findings_warn ?? 0),
findings_info: Number(verdict.metrics?.findings_info ?? 0),
audit_duration_ms: Number(verdict.metrics?.audit_duration_ms ?? 0),
claims_total: Number(verdict.metrics?.claims_total ?? 0),
claims_empirical: Number(verdict.metrics?.claims_empirical ?? 0),
kb_sig_count_after: after.sig_count,
kb_max_count_after: after.max_count,
kb_max_confidence_after: after.max_confidence,
kb_recurring_max_count: after.recurring_max_count,
};
records.push(rec);
console.log(` verdict=${rec.verdict_overall} findings=${rec.findings_total} (b=${rec.findings_block} w=${rec.findings_warn})`);
console.log(` kb after: sig=${rec.kb_sig_count_after} max_count=${rec.kb_max_count_after} recurring_max=${rec.kb_recurring_max_count} max_conf=${rec.kb_max_confidence_after.toFixed(2)}`);
console.log(` elapsed: ${((Date.now() - t0) / 1000).toFixed(1)}s`);
console.log("");
}
console.log("═══ FINAL ═══");
console.log("run | verdict | find | block warn info | dur_s | kb_sig max_count max_conf");
for (const r of records) {
console.log(
` ${String(r.run).padStart(1)} | ${r.verdict_overall.padEnd(16)} | ${String(r.findings_total).padStart(4)} | ${String(r.findings_block).padStart(5)} ${String(r.findings_warn).padStart(5)} ${String(r.findings_info).padStart(5)} | ${(r.audit_duration_ms / 1000).toFixed(1).padStart(5)} | ${String(r.kb_sig_count_after).padStart(6)} ${String(r.kb_max_count_after).padStart(9)} ${r.kb_max_confidence_after.toFixed(2)}`,
);
}
console.log("");
console.log("═══ COMPOUNDING PROPERTY ═══");
const sigDelta = records[records.length - 1].kb_sig_count_after - baseline.sig_count;
const maxConf = records[records.length - 1].kb_max_confidence_after;
const recurringMax = records[records.length - 1].kb_recurring_max_count;
console.log(` signatures added over ${RUNS} runs: ${sigDelta}`);
console.log(` max recurring count after run ${RUNS}: ${recurringMax} (same-PR recurrences per signature)`);
console.log(` max confidence after run ${RUNS}: ${maxConf.toFixed(2)} (expect LOW — same-PR should not inflate)`);
const verdictSet = new Set(records.map(r => r.verdict_overall));
if (verdictSet.size === 1) {
console.log(` verdict stable: all ${RUNS} runs returned '${[...verdictSet][0]}' ✓`);
} else {
console.log(` verdict oscillated across runs: ${[...verdictSet].join(" | ")}`);
}
if (maxConf < 0.6 && recurringMax < 5) {
console.log(` confidence policy holding: same-PR noise stays below escalation threshold ✓`);
} else {
console.log(` ⚠ cross-cutting pattern detected (conf=${maxConf.toFixed(2)}, recurring=${recurringMax}) — kb_index policy escalated`);
}
const jsonOut = `${REPO}/tests/real-world/runs/nine_consecutive_${Date.now().toString(36)}.json`;
await Bun.write(jsonOut, JSON.stringify({ target_pr: TARGET_PR, baseline, records }, null, 2));
console.log("");
console.log(` report: ${jsonOut}`);
}
main().catch(e => { console.error("[nine] fatal:", e); process.exit(1); });