Auditor self-test branch: real-world pipelines + cohesion Phase C + KB index (PR #8)
Bundles 12 commits validating the auditor + scrum_master architecture end-to-end: - enrich_prd_pipeline / hard_task_escalation / scrum_master_pipeline stress tests - Tree-split + scrum_reviews.jsonl + kb_query surfacing - Verdict → audit_lessons feedback loop (closed) - kb_index aggregator with confidence-based severity policy - 9-run + 5-run empirical tests proved the predictive-compounding property - Level 1 correction: temp=0 cloud inference for deterministic per-claim verdicts - audit_one.ts dry-run CLI - Fixes: static quoted-string guard, empirical-claim classification, symbol-resolver gate, repo-file size cap See PR #8 for run-by-run commit history.
This commit is contained in:
parent
6d7b251607
commit
156dae6732
@ -33,6 +33,13 @@ Defaults: polls every 90s, stops on `auditor.paused` file present.
|
||||
|
||||
- `data/_auditor/state.json` — last-audited head SHA per PR
|
||||
- `data/_auditor/verdicts/{pr}-{sha}.json` — per-run verdict record
|
||||
- `data/_kb/audit_lessons.jsonl` — one row per block/warn finding,
|
||||
path-agnostic signature for dedup. Tailed by kb_query on each audit
|
||||
to surface recurring patterns (2+ distinct PRs with same signature
|
||||
→ info, 3-4 → warn, 5+ → block). This is how the auditor learns.
|
||||
- `data/_kb/scrum_reviews.jsonl` — scrum-master per-file reviews. If
|
||||
a file in the current PR has been scrum-reviewed, kb_query surfaces
|
||||
the review as a finding with the accepted model and attempt count.
|
||||
|
||||
## Where YOU edit
|
||||
|
||||
|
||||
@ -12,7 +12,8 @@
|
||||
// review — reviews have self-review restrictions on Gitea and the
|
||||
// auditor currently uses the same PAT as the PR author).
|
||||
|
||||
import { readFile, writeFile, mkdir } from "node:fs/promises";
|
||||
import { readFile, writeFile, mkdir, appendFile } from "node:fs/promises";
|
||||
import { createHash } from "node:crypto";
|
||||
import { join } from "node:path";
|
||||
import type { PrSnapshot, Verdict, Finding } from "./types.ts";
|
||||
import { getPrDiff, postCommitStatus, postIssueComment } from "./gitea.ts";
|
||||
@ -24,6 +25,10 @@ import { runInferenceCheck } from "./checks/inference.ts";
|
||||
import { runKbCheck } from "./checks/kb_query.ts";
|
||||
|
||||
const VERDICTS_DIR = "/home/profit/lakehouse/data/_auditor/verdicts";
|
||||
// Playbook for audit findings — one row per block/warn finding from a
|
||||
// verdict. kb_query tails this next audit and escalates recurrences.
|
||||
// Structured as JSONL so it's cheap to append and cheap to tail.
|
||||
const AUDIT_LESSONS_JSONL = "/home/profit/lakehouse/data/_kb/audit_lessons.jsonl";
|
||||
|
||||
export interface AuditOptions {
|
||||
// Skip the cloud inference call (fast path for iteration). Default false.
|
||||
@ -52,7 +57,7 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
|
||||
runStaticCheck(diff),
|
||||
opts.skip_dynamic ? Promise.resolve(stubFinding("dynamic", "skipped by options")) : runDynamicCheck(),
|
||||
opts.skip_inference ? Promise.resolve(stubFinding("inference", "skipped by options")) : runInferenceCheck(claims, diff),
|
||||
runKbCheck(claims),
|
||||
runKbCheck(claims, pr.files.map(f => f.path)),
|
||||
]);
|
||||
|
||||
const allFindings: Finding[] = [
|
||||
@ -72,6 +77,7 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
|
||||
claims_strong: claims.filter(c => c.strength === "strong").length,
|
||||
claims_moderate: claims.filter(c => c.strength === "moderate").length,
|
||||
claims_weak: claims.filter(c => c.strength === "weak").length,
|
||||
claims_empirical: claims.filter(c => c.strength === "empirical").length,
|
||||
claims_total: claims.length,
|
||||
diff_bytes: diff.length,
|
||||
};
|
||||
@ -80,6 +86,15 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
|
||||
|
||||
await persistVerdict(verdict);
|
||||
|
||||
// Feedback loop — every block/warn finding becomes a row in
|
||||
// audit_lessons.jsonl, dedup-keyed by (check, normalized-summary).
|
||||
// The next audit's kb_query reads these and escalates recurring
|
||||
// findings so we don't lose the "this pattern has been flagged
|
||||
// before" signal across runs. Fire-and-forget; failure here must
|
||||
// not break the audit.
|
||||
appendAuditLessons(verdict).catch(e =>
|
||||
console.error(`[audit] audit_lessons append failed: ${(e as Error).message}`));
|
||||
|
||||
if (!opts.dry_run) {
|
||||
await postToGitea(verdict);
|
||||
}
|
||||
@ -87,6 +102,42 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
|
||||
return verdict;
|
||||
}
|
||||
|
||||
// Normalizes a finding summary for dedup: strips path-specific tails
|
||||
// ("in path/to/file.ts" → "in <file>"), line numbers, and long
|
||||
// commit-hash snippets. The goal is: the SAME class of finding on
|
||||
// DIFFERENT files should share a signature, so we can measure
|
||||
// "this pattern keeps showing up."
|
||||
function normalizedSignature(f: Finding): string {
|
||||
const summary = String(f.summary)
|
||||
.replace(/\bin\s+\S+\.(ts|rs|js|py|md)\b/gi, "in <file>")
|
||||
.replace(/:\+?\d+\b/g, ":<line>")
|
||||
.replace(/[0-9a-f]{8,}/gi, "<hash>")
|
||||
.replace(/\s+/g, " ")
|
||||
.trim()
|
||||
.slice(0, 240);
|
||||
const src = `${f.check}::${f.severity}::${summary}`;
|
||||
return createHash("sha256").update(src).digest("hex").slice(0, 16);
|
||||
}
|
||||
|
||||
async function appendAuditLessons(v: Verdict): Promise<void> {
|
||||
const actionable = v.findings.filter(f => f.severity === "block" || f.severity === "warn");
|
||||
if (actionable.length === 0) return;
|
||||
await mkdir(join(AUDIT_LESSONS_JSONL, ".."), { recursive: true });
|
||||
const rows: string[] = [];
|
||||
for (const f of actionable) {
|
||||
rows.push(JSON.stringify({
|
||||
signature: normalizedSignature(f),
|
||||
check: f.check,
|
||||
severity: f.severity,
|
||||
summary: f.summary,
|
||||
pr_number: v.pr_number,
|
||||
head_sha: v.head_sha,
|
||||
audited_at: v.audited_at,
|
||||
}));
|
||||
}
|
||||
await appendFile(AUDIT_LESSONS_JSONL, rows.join("\n") + "\n");
|
||||
}
|
||||
|
||||
async function persistVerdict(v: Verdict): Promise<void> {
|
||||
await mkdir(VERDICTS_DIR, { recursive: true });
|
||||
const filename = `${v.pr_number}-${v.head_sha.slice(0, 12)}.json`;
|
||||
|
||||
68
auditor/audit_one.ts
Normal file
68
auditor/audit_one.ts
Normal file
@ -0,0 +1,68 @@
|
||||
// One-shot dry-run audit of a single PR. Useful for verifying check
|
||||
// behavior (kb_query scrum surfacing, inference prompts, etc.) without
|
||||
// posting to Gitea. Does NOT touch state.json and does NOT post
|
||||
// commit status or PR comments.
|
||||
//
|
||||
// Run: bun run auditor/audit_one.ts <pr-number>
|
||||
|
||||
import { getPrSnapshot } from "./gitea.ts";
|
||||
import { auditPr } from "./audit.ts";
|
||||
|
||||
// Entry point: parse the PR number from argv, fetch the PR snapshot,
// run a dry-run audit (no Gitea posting, no dynamic fixture), and
// pretty-print the verdict with kb_query scrum-surfacing highlighted.
// Exits 2 on usage errors, 0 on completion.
async function main() {
  const prNumRaw = process.argv[2];
  if (!prNumRaw) {
    console.error("usage: bun run auditor/audit_one.ts <pr-number>");
    process.exit(2);
  }
  const prNum = Number(prNumRaw);
  // Number("12abc") is NaN, so this also rejects trailing garbage.
  if (!Number.isFinite(prNum)) {
    console.error(`invalid PR number: ${prNumRaw}`);
    process.exit(2);
  }

  console.log(`[audit_one] fetching PR #${prNum}...`);
  const pr = await getPrSnapshot(prNum);
  console.log(`[audit_one] PR #${pr.number}: "${pr.title}" (head=${pr.head_sha.slice(0, 12)})`);
  console.log(`[audit_one] files in diff: ${pr.files.length}`);
  for (const f of pr.files) console.log(` - ${f.path} (+${f.additions}/-${f.deletions})`);
  console.log("");

  // Dry-run configuration: never post to Gitea, never run the dynamic
  // fixture; the cloud inference call is opt-out via env var so fast
  // iterations can skip the slow path.
  const verdict = await auditPr(pr, {
    dry_run: true, // no Gitea posting
    skip_dynamic: true, // don't run fixture
    skip_inference: process.env.LH_AUDITOR_SKIP_INFERENCE === "1",
  });

  console.log("\n═══ VERDICT ═══");
  console.log(`overall: ${verdict.overall}`);
  console.log(`one-liner: ${verdict.one_liner}`);
  console.log(`findings: total=${verdict.metrics.findings_total} block=${verdict.metrics.findings_block} warn=${verdict.metrics.findings_warn} info=${verdict.metrics.findings_info}`);
  console.log("");

  // Print findings grouped by check, highlighting kb_query scrum surfacing
  const byCheck: Record<string, typeof verdict.findings> = {};
  for (const f of verdict.findings) (byCheck[f.check] ||= []).push(f);

  for (const [check, findings] of Object.entries(byCheck)) {
    console.log(`── ${check} (${findings.length}) ──`);
    for (const f of findings) {
      const tag = f.severity === "block" ? "🛑" : f.severity === "warn" ? "⚠️ " : "ℹ️ ";
      console.log(` ${tag} [${f.severity}] ${f.summary}`);
      // Scrum-master findings get their evidence expanded — that's the
      // wire this CLI exists to verify.
      if (f.summary.includes("scrum-master")) {
        for (const e of f.evidence) {
          console.log(` → ${e.slice(0, 200)}`);
        }
      }
    }
  }

  // Explicit summary of the scrum wire: detection is by summary text,
  // so this depends on checkScrumReviews' summary format staying stable.
  const scrumFindings = verdict.findings.filter(f => f.summary.includes("scrum-master"));
  console.log("");
  console.log(`═══ SCRUM WIRE CHECK: ${scrumFindings.length} scrum-master findings surfaced by kb_query ═══`);
  if (scrumFindings.length === 0) {
    console.log(" (none — either no matching scrum_reviews.jsonl rows, or files didn't match PR diff)");
  }
  process.exit(0);
}

main().catch(e => { console.error("[audit_one] fatal:", e); process.exit(1); });
|
||||
@ -13,6 +13,8 @@
|
||||
// with a 15KB diff + claim list).
|
||||
|
||||
import type { Claim, Finding } from "../types.ts";
|
||||
import { Glob } from "bun";
|
||||
import { readFile } from "node:fs/promises";
|
||||
|
||||
const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
|
||||
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
|
||||
@ -22,6 +24,7 @@ const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
|
||||
// block finding when the file was simply outside the truncation window.
|
||||
const MAX_DIFF_CHARS = 40000;
|
||||
const CALL_TIMEOUT_MS = 120_000;
|
||||
const REPO_ROOT = "/home/profit/lakehouse";
|
||||
|
||||
export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
|
||||
if (claims.length === 0) {
|
||||
@ -33,6 +36,21 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
|
||||
}];
|
||||
}
|
||||
|
||||
// Empirical claims (runtime metrics / observed outcomes) can't be
|
||||
// verified from the diff. Drop them from the cloud prompt so the
|
||||
// reviewer doesn't chase ghosts. A future `runtime_evidence` check
|
||||
// can validate these against data/_kb/*/summary.json outputs.
|
||||
const verifiable = claims.filter(c => c.strength !== "empirical");
|
||||
const empiricalCount = claims.length - verifiable.length;
|
||||
if (verifiable.length === 0) {
|
||||
return [{
|
||||
check: "inference",
|
||||
severity: "info",
|
||||
summary: `all ${claims.length} claims are empirical (runtime metrics) — skipping cloud inference`,
|
||||
evidence: [`empirical claims can't be verified from a static diff; needs runtime-evidence check`],
|
||||
}];
|
||||
}
|
||||
|
||||
const truncated = diff.length > MAX_DIFF_CHARS
|
||||
? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
|
||||
: diff;
|
||||
@ -70,7 +88,7 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
|
||||
|
||||
const userMsg = [
|
||||
`Ship-claims the author made (numbered 0..N-1):`,
|
||||
claims.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
|
||||
verifiable.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
|
||||
"",
|
||||
`Diff:`,
|
||||
"```",
|
||||
@ -94,9 +112,22 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
|
||||
{ role: "system", content: systemMsg },
|
||||
{ role: "user", content: userMsg },
|
||||
],
|
||||
// Deterministic classification — temp=0 is greedy-sample, so
|
||||
// identical input yields identical output on the same model
|
||||
// version. This kills the signature creep we observed in the
|
||||
// 9-run empirical test (sig_count 16→27 from cloud phrasing
|
||||
// variance at temp=0.2).
|
||||
//
|
||||
// IMPORTANT: keep think=true. gpt-oss:120b is a reasoning
|
||||
// model; setting think=false caused it to return empty content
|
||||
// on large prompts (observed during Level 1 validation: 13421
|
||||
// tokens used, empty content returned). The reasoning trace is
|
||||
// variable prose, but at temp=0 the FINAL classification is
|
||||
// still deterministic because greedy sampling converges to
|
||||
// the same conclusion from the same starting state.
|
||||
max_tokens: 3000,
|
||||
temperature: 0.2,
|
||||
think: true, // T3 overseer should reason — JSON shape is still required
|
||||
temperature: 0,
|
||||
think: true,
|
||||
}),
|
||||
signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
|
||||
});
|
||||
@ -152,7 +183,9 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
|
||||
for (const v of parsed.claim_verdicts ?? []) {
|
||||
if (v?.backed === false) {
|
||||
const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
|
||||
const claim = claims[idx];
|
||||
// Indices point at the verifiable[] list we sent the cloud,
|
||||
// not the full claims[] list. Translate back.
|
||||
const claim = verifiable[idx];
|
||||
if (!claim) continue;
|
||||
// Strong+unbacked = BLOCK. That's the whole point of the auditor.
|
||||
const sev: Finding["severity"] = claim.strength === "strong" ? "block"
|
||||
@ -172,17 +205,113 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
|
||||
}
|
||||
|
||||
for (const g of parsed.unflagged_gaps ?? []) {
|
||||
const summary = String(g?.summary ?? "?");
|
||||
const location = String(g?.location ?? "?");
|
||||
// False-positive guard — when the cloud says "X not defined in this
|
||||
// diff" or "missing implementation of X", the cloud may just mean
|
||||
// "X is not in the added lines," not "X doesn't exist in the repo."
|
||||
// Extract candidate symbol names and grep the repo. If any symbol
|
||||
// is defined elsewhere, drop the finding — it's a known-symbol
|
||||
// reference, not a placeholder.
|
||||
if (/not\s+defined|missing\s+implementation|never\s+referenced\s+or\s+integrated/i.test(summary)) {
|
||||
const symbols = extractSymbols(summary);
|
||||
if (symbols.length > 0) {
|
||||
const resolved = await symbolsExistInRepo(symbols);
|
||||
if (resolved.length === symbols.length) {
|
||||
// Every named symbol exists somewhere in the repo — silent drop.
|
||||
continue;
|
||||
}
|
||||
if (resolved.length > 0) {
|
||||
// Partially resolved — demote to info with a note.
|
||||
findings.push({
|
||||
check: "inference",
|
||||
severity: "info",
|
||||
summary: `cloud gap partially resolved by repo grep: ${summary.slice(0, 120)}`,
|
||||
evidence: [
|
||||
`location: ${location.slice(0, 140)}`,
|
||||
`resolved via grep: ${resolved.join(",")}`,
|
||||
`unresolved: ${symbols.filter(s => !resolved.includes(s)).join(",")}`,
|
||||
],
|
||||
});
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
findings.push({
|
||||
check: "inference",
|
||||
severity: "warn",
|
||||
summary: `cloud-flagged gap not in any claim: ${String(g?.summary ?? "?").slice(0, 120)}`,
|
||||
evidence: [`location: ${String(g?.location ?? "?").slice(0, 140)}`],
|
||||
summary: `cloud-flagged gap not in any claim: ${summary.slice(0, 120)}`,
|
||||
evidence: [`location: ${location.slice(0, 140)}`],
|
||||
});
|
||||
}
|
||||
|
||||
return findings;
|
||||
}
|
||||
|
||||
// Pull out plausible code-symbol names from a summary string.
|
||||
// Matches:
|
||||
// - identifier with backticks: `foo_bar`
|
||||
// - identifier followed by parens: foo_bar()
|
||||
// - CamelCase types
|
||||
// - snake_case_functions
|
||||
// Filters out common English words that could be matched accidentally.
|
||||
const STOPWORDS = new Set([
|
||||
"not","the","and","for","this","that","with","but","are","was","has",
|
||||
"have","been","any","missing","implementation","diff","defined","never",
|
||||
"referenced","integrated","flow","code","file","some","only","when",
|
||||
]);
|
||||
function extractSymbols(text: string): string[] {
|
||||
const out = new Set<string>();
|
||||
// `backticked` symbols
|
||||
for (const m of text.matchAll(/`([A-Za-z_][A-Za-z0-9_]{2,})`/g)) out.add(m[1]);
|
||||
// foo() or foo_bar() calls
|
||||
for (const m of text.matchAll(/\b([A-Za-z_][A-Za-z0-9_]{2,})\s*\(/g)) out.add(m[1]);
|
||||
// CamelCase types (3+ chars, must start with uppercase)
|
||||
for (const m of text.matchAll(/\b([A-Z][A-Za-z0-9]{2,})\b/g)) out.add(m[1]);
|
||||
return Array.from(out).filter(s => !STOPWORDS.has(s.toLowerCase()));
|
||||
}
|
||||
|
||||
// Scan the repo for at least one definition of each symbol. Uses Bun's
|
||||
// Glob to walk TS/Rust/Python/JS sources; ignores node_modules, data/,
|
||||
// and target/. Skips files > 500KB — those are fixtures/snapshots that
|
||||
// won't contain a definition line and slurping them slows the audit.
|
||||
async function symbolsExistInRepo(symbols: string[]): Promise<string[]> {
|
||||
const patterns = ["**/*.ts", "**/*.tsx", "**/*.rs", "**/*.py", "**/*.js"];
|
||||
const skip = (p: string) => p.includes("/node_modules/") || p.startsWith("data/") || p.includes("/target/") || p.startsWith("dist/");
|
||||
const MAX_FILE_BYTES = 500_000;
|
||||
const { stat } = await import("node:fs/promises");
|
||||
const resolved = new Set<string>();
|
||||
const toFind = new Set(symbols);
|
||||
for (const pat of patterns) {
|
||||
if (toFind.size === 0) break;
|
||||
const glob = new Glob(pat);
|
||||
for await (const f of glob.scan({ cwd: REPO_ROOT, onlyFiles: true })) {
|
||||
if (skip(f)) continue;
|
||||
try { const s = await stat(`${REPO_ROOT}/${f}`); if (s.size > MAX_FILE_BYTES) continue; } catch { continue; }
|
||||
let content: string;
|
||||
try { content = await readFile(`${REPO_ROOT}/${f}`, "utf8"); } catch { continue; }
|
||||
for (const sym of Array.from(toFind)) {
|
||||
// Definition heuristics: `function sym`, `fn sym`, `const sym`,
|
||||
// `let sym`, `def sym`, `class sym`, `struct sym`, `enum sym`,
|
||||
// `trait sym`, `async function sym`, `pub (async )?fn sym`.
|
||||
const re = new RegExp(
|
||||
`\\b(function|async\\s+function|const|let|var|def|class|struct|enum|trait|impl|type|interface|fn|pub\\s+(async\\s+)?fn)\\s+${escapeRe(sym)}\\b`
|
||||
);
|
||||
if (re.test(content)) {
|
||||
resolved.add(sym);
|
||||
toFind.delete(sym);
|
||||
if (toFind.size === 0) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return Array.from(resolved);
|
||||
}
|
||||
|
||||
function escapeRe(s: string): string {
|
||||
return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
||||
}
|
||||
|
||||
// Lift the first balanced JSON object out of the response. Tolerates
|
||||
// leading prose, code fences, and model reasoning preamble when the
|
||||
// cloud model ignored "strict JSON only."
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
// What this check reads (all file-backed, append-only or periodic):
|
||||
// data/_kb/outcomes.jsonl — per-scenario outcomes (kb.ts)
|
||||
// data/_kb/error_corrections.jsonl — fail→succeed deltas on same sig
|
||||
// data/_kb/scrum_reviews.jsonl — scrum-master accepted reviews
|
||||
// data/_observer/ops.jsonl — observer ring → disk stream
|
||||
// data/_bot/cycles/*.json — bot cycle results
|
||||
//
|
||||
@ -17,14 +18,17 @@
|
||||
import { readFile, readdir, stat } from "node:fs/promises";
|
||||
import { join } from "node:path";
|
||||
import type { Claim, Finding } from "../types.ts";
|
||||
import { aggregate, ratingSeverity, formatAgg } from "../kb_index.ts";
|
||||
|
||||
const KB_DIR = "/home/profit/lakehouse/data/_kb";
|
||||
const OBSERVER_OPS = "/home/profit/lakehouse/data/_observer/ops.jsonl";
|
||||
const BOT_CYCLES_DIR = "/home/profit/lakehouse/data/_bot/cycles";
|
||||
const SCRUM_REVIEWS_JSONL = "/home/profit/lakehouse/data/_kb/scrum_reviews.jsonl";
|
||||
const AUDIT_LESSONS_JSONL = "/home/profit/lakehouse/data/_kb/audit_lessons.jsonl";
|
||||
const TAIL_LINES = 500;
|
||||
const MAX_BOT_CYCLE_FILES = 30;
|
||||
|
||||
export async function runKbCheck(claims: Claim[]): Promise<Finding[]> {
|
||||
export async function runKbCheck(claims: Claim[], prFiles: string[] = []): Promise<Finding[]> {
|
||||
const findings: Finding[] = [];
|
||||
|
||||
// 1. Recent scenario outcomes: are strong-claim-style phrases showing
|
||||
@ -48,6 +52,27 @@ export async function runKbCheck(claims: Claim[]): Promise<Finding[]> {
|
||||
const obsFindings = await checkObserverStream();
|
||||
findings.push(...obsFindings);
|
||||
|
||||
// 5. Scrum-master reviews — surface prior accepted reviews for any
|
||||
// file in this PR's diff. Cohesion plan Phase C wire: the
|
||||
// auditor gets to "borrow" the scrum-master's deeper per-file
|
||||
// analysis instead of re-doing that work.
|
||||
if (prFiles.length > 0) {
|
||||
const scrumFindings = await checkScrumReviews(prFiles);
|
||||
findings.push(...scrumFindings);
|
||||
}
|
||||
|
||||
// 6. Audit-lessons feedback loop — summarize the top recurring
|
||||
// patterns from prior audits' block/warn findings. If the same
|
||||
// pattern signature has fired 3+ times across prior audits,
|
||||
// emit it as a block-severity finding so reviewers know this
|
||||
// is a known-recurring class, not a one-off. Does NOT couple
|
||||
// to the current audit's static/inference findings (those run
|
||||
// in parallel and we can't see them here) — the amplification
|
||||
// is emergent: if the current audit's finding-summary matches
|
||||
// a top recurrence, the reviewer sees both.
|
||||
const auditLessonFindings = await checkAuditLessons();
|
||||
findings.push(...auditLessonFindings);
|
||||
|
||||
return findings;
|
||||
}
|
||||
|
||||
@ -181,3 +206,90 @@ function observerBySource(ops: any[]): string {
|
||||
}
|
||||
return Object.entries(c).sort((a, b) => b[1] - a[1]).map(([k, v]) => `${k}=${v}`).join(", ") || "empty";
|
||||
}
|
||||
|
||||
// Audit-lessons — reads data/_kb/audit_lessons.jsonl (populated by
|
||||
// every audit's appendAuditLessons). Uses the shared kb_index
|
||||
// aggregator: groups by `signature`, distinct-scopes keyed by PR
|
||||
// number, severity from ratingSeverity(agg) which applies the
|
||||
// confidence × count rating (see kb_index.ts). This is the same
|
||||
// aggregation any other KB reader uses — shared discipline, not
|
||||
// per-check custom logic.
|
||||
async function checkAuditLessons(): Promise<Finding[]> {
|
||||
const bySig = await aggregate<any>(AUDIT_LESSONS_JSONL, {
|
||||
keyFn: (r) => r?.signature,
|
||||
scopeFn: (r) => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
|
||||
checkFn: (r) => r?.check,
|
||||
tailLimit: TAIL_LINES * 4,
|
||||
});
|
||||
if (bySig.size === 0) return [];
|
||||
|
||||
const findings: Finding[] = [];
|
||||
for (const [sig, agg] of bySig) {
|
||||
// Silent on first-ever occurrence — not yet signal.
|
||||
if (agg.count < 2) continue;
|
||||
const sev = ratingSeverity(agg);
|
||||
findings.push({
|
||||
check: "kb_query",
|
||||
severity: sev,
|
||||
summary: `recurring audit pattern (${agg.distinct_scopes} distinct PRs, ${agg.count} flaggings, conf=${agg.confidence.toFixed(2)}): ${agg.representative_summary.slice(0, 160)}`,
|
||||
evidence: [
|
||||
`signature=${sig}`,
|
||||
`checks: ${agg.checks.join(",")}`,
|
||||
`scopes: ${agg.scopes.slice(-6).join(",")}`,
|
||||
formatAgg(agg),
|
||||
],
|
||||
});
|
||||
}
|
||||
return findings;
|
||||
}
|
||||
|
||||
// Scrum-master reviews — the scrum pipeline writes one row per
// accepted per-file review. We match reviews whose `file` equals a
// path in the PR's diff (after normalization), then surface the
// *preview* + which model the escalation ladder had to reach. If the
// scrum-master needed the 123B specialist or larger to resolve a file,
// that's a meaningful signal about the code's complexity — and it's
// surfaced to the PR without the auditor having to re-run the
// escalation ladder itself.
async function checkScrumReviews(prFiles: string[]): Promise<Finding[]> {
  // Only the recent tail of the review log is consulted.
  const rows = await tailJsonl<any>(SCRUM_REVIEWS_JSONL, TAIL_LINES);
  if (rows.length === 0) return [];

  // Normalize both sides to repo-relative paths: strip leading slashes
  // and the absolute repo-root prefix. NOTE(review): matching is EXACT
  // on the normalized path — there is no filename-suffix fallback here;
  // confirm exact matching is sufficient for how scrum stores `file`.
  const norm = (p: string) => p.replace(/^\/+/, "").replace(/^home\/profit\/lakehouse\//, "");
  const prSet = new Set(prFiles.map(norm));

  // Keep only the most recent review per file (later rows overwrite
  // earlier ones — last-wins).
  const latestByFile = new Map<string, any>();
  for (const r of rows) {
    const f = norm(String(r.file ?? ""));
    if (!f) continue;
    if (!prSet.has(f)) continue;
    latestByFile.set(f, r);
  }
  if (latestByFile.size === 0) return [];

  const findings: Finding[] = [];
  for (const [file, r] of latestByFile) {
    const model = String(r.accepted_model ?? "?");
    const attempt = r.accepted_on_attempt ?? "?";
    const treeSplit = !!r.tree_split_fired;
    // Heuristic: if the scrum-master had to escalate past attempt 3,
    // or had to tree-split, that's context the PR reviewer should see.
    // Severity: info for low-escalation, warn if escalated far up the
    // ladder. Note Number("?") is NaN, so a missing attempt count
    // falls through to "info".
    const heavyEscalation = Number(attempt) >= 4;
    const sev: "warn" | "info" = heavyEscalation ? "warn" : "info";
    findings.push({
      check: "kb_query",
      severity: sev,
      summary: `scrum-master review for \`${file}\` — accepted on attempt ${attempt} by \`${model}\`${treeSplit ? " (tree-split)" : ""}`,
      evidence: [
        `reviewed_at: ${r.reviewed_at ?? "?"}`,
        `preview: ${String(r.suggestions_preview ?? "").slice(0, 300).replace(/\n/g, " ")}`,
      ],
    });
  }
  return findings;
}
|
||||
|
||||
@ -61,7 +61,13 @@ export function runStaticCheck(diff: string): Finding[] {
|
||||
|
||||
if (!isAuditorCheckerFile) {
|
||||
for (const { re, why } of BLOCK_PATTERNS) {
|
||||
if (re.test(added)) {
|
||||
const m = added.match(re);
|
||||
if (m && typeof m.index === "number") {
|
||||
// Skip if the match sits inside a quoted string literal —
|
||||
// this is how rubric files (tests/real-world/*, prompt
|
||||
// templates) legitimately reference the patterns they
|
||||
// guard against, without actually executing them.
|
||||
if (isInsideQuotedString(added, m.index)) continue;
|
||||
findings.push({
|
||||
check: "static",
|
||||
severity: "block",
|
||||
@ -154,6 +160,25 @@ function extractNewFields(addedLines: string[]): string[] {
|
||||
return Array.from(fields);
|
||||
}
|
||||
|
||||
// True if `pos` falls inside a double- or single-quoted string on this
|
||||
// line (backtick template literals too). Walks left→right toggling the
|
||||
// "in quote" state on each unescaped quote. Good enough for single-
|
||||
// line matches; multi-line strings aren't parsed (they're extremely
|
||||
// rare in the patterns we're blocking on, and would require a proper
|
||||
// tokenizer to handle correctly).
|
||||
function isInsideQuotedString(line: string, pos: number): boolean {
|
||||
let inDouble = false, inSingle = false, inBacktick = false;
|
||||
for (let i = 0; i < pos; i++) {
|
||||
const c = line[i];
|
||||
const esc = i > 0 && line[i - 1] === "\\";
|
||||
if (esc) continue;
|
||||
if (c === '"' && !inSingle && !inBacktick) inDouble = !inDouble;
|
||||
else if (c === "'" && !inDouble && !inBacktick) inSingle = !inSingle;
|
||||
else if (c === "`" && !inDouble && !inSingle) inBacktick = !inBacktick;
|
||||
}
|
||||
return inDouble || inSingle || inBacktick;
|
||||
}
|
||||
|
||||
function escape(s: string): string {
|
||||
return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
||||
}
|
||||
|
||||
@ -49,6 +49,25 @@ const WEAK_PATTERNS: RegExp[] = [
|
||||
/\bprobably\b/i,
|
||||
];
|
||||
|
||||
// Empirical claims: runtime measurements / observed outcomes that can't
// be verified from a diff (only from the actual run that produced
// them). Example: "6/6 iterations complete, 58 cloud calls, 306s
// end-to-end" — true, but only the test's own summary.json can
// confirm it. Classifying as empirical lets the inference check skip
// diff-verification and saves the ladder for falsifiable claims.
// All patterns are case-insensitive; a line matching ANY of them is
// classified empirical.
const EMPIRICAL_PATTERNS: RegExp[] = [
  // Iteration / attempt counts: "6/6 iterations", "accepted on attempt 3"
  /\b\d+\s*\/\s*\d+\s+(iterations?|attempts?|cycles?|runs?|shards?)\b/i,
  /\b(accepted|resolved|converged)\s+on\s+attempt\s+\d+\b/i,
  // Runtime metrics: "58 cloud calls", "306s end-to-end", "245s total",
  // "5931 chars accepted"
  /\b\d+\s+(cloud\s+)?calls?\b/i,
  /\b\d+\s*(ms|s|seconds?|minutes?|m)\s+(end[- ]to[- ]end|total|elapsed|duration)\b/i,
  /\b\d+\s+chars?\b.*\b(accepted|generated|produced)\b/i,
  // Escalation shape: "escalated through N tiers", "N distinct models"
  /\bescalated\s+through\s+\d+\b/i,
  /\b\d+\s+distinct\s+(model|tier)s?\b/i,
];
|
||||
|
||||
export interface ParsedClaims {
|
||||
claims: Claim[];
|
||||
commits_scanned: number;
|
||||
@ -77,8 +96,21 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
|
||||
const line = lines[i];
|
||||
if (line.length < 3) continue;
|
||||
|
||||
// Strong patterns first — if a line matches strong, it's strong,
|
||||
// don't double-count as moderate.
|
||||
// Empirical match wins over everything else — if a line ALSO
|
||||
// contains a moderate word like "complete", we still want to
|
||||
// classify it as empirical so the inference check doesn't ask
|
||||
// the cloud to prove "58 cloud calls" from the diff. Order:
|
||||
// empirical → strong → moderate → weak.
|
||||
const empirical = firstMatch(line, EMPIRICAL_PATTERNS);
|
||||
if (empirical) {
|
||||
out.push({
|
||||
text: line.trim().slice(0, 200),
|
||||
commit_sha,
|
||||
location: `${location_prefix}:${i + 1}`,
|
||||
strength: "empirical",
|
||||
});
|
||||
continue;
|
||||
}
|
||||
const strong = firstMatch(line, STRONG_PATTERNS);
|
||||
if (strong) {
|
||||
out.push({
|
||||
|
||||
161
auditor/kb_index.ts
Normal file
161
auditor/kb_index.ts
Normal file
@ -0,0 +1,161 @@
|
||||
// kb_index — generic on-the-fly aggregation over append-only JSONL
|
||||
// scratchpads (audit_lessons, scrum_reviews, outcomes, observer ops).
|
||||
//
|
||||
// The mem0 insight: raw rows are CHEAP and tell the full story, but
|
||||
// downstream prompts need a DEFINITION, not a log. A definition is
|
||||
// the aggregate: "this signature has fired N times across M distinct
|
||||
// scopes, first_seen=X, last_seen=Y, confidence=M/N."
|
||||
//
|
||||
// This library is the single shared aggregator. Every KB writer keeps
|
||||
// appending raw rows; every KB reader uses aggregate() instead of
|
||||
// tailing the raw stream. No second file to sync, no ADD/UPDATE/NOOP
|
||||
// routing — the stats roll up from the raw rows every time.
|
||||
//
|
||||
// Why this works past hundreds of runs:
|
||||
// - aggregate() is bounded by distinct_signatures, not total_rows.
|
||||
// - confidence = distinct_scopes / count — low for same-scope noise,
|
||||
// high for cross-scope patterns. Downstream severity ramps on
|
||||
// confidence × count, not raw count, so one unfixed PR can't
|
||||
// inflate its own recurrence score (the classic mem0 failure).
|
||||
// - rotation (later) moves old raw to archive files; aggregate()
|
||||
// can still read both to compute lifetime counts when needed.
|
||||
|
||||
import { readFile } from "node:fs/promises";
|
||||
|
||||
export interface AggregateRow {
|
||||
signature: string;
|
||||
count: number;
|
||||
distinct_scopes: number;
|
||||
first_seen: string;
|
||||
last_seen: string;
|
||||
confidence: number; // distinct_scopes / count — capped at 1.0
|
||||
representative_summary: string; // most-recent summary for this signature
|
||||
scopes: string[]; // up to 20 most-recent scopes for debugging
|
||||
checks: string[]; // distinct `check` values (audit_lessons-specific)
|
||||
}
|
||||
|
||||
export interface AggregateOptions<T> {
|
||||
/** How to extract the dedup key from a row. */
|
||||
keyFn: (row: T) => string | undefined;
|
||||
/** How to extract the "scope" — distinct scopes count gives confidence. */
|
||||
scopeFn: (row: T) => string | undefined;
|
||||
/** How to extract the timestamp (defaults to row.audited_at / row.reviewed_at / row.timestamp). */
|
||||
timeFn?: (row: T) => string | undefined;
|
||||
/** How to extract a representative summary (defaults to row.summary). */
|
||||
summaryFn?: (row: T) => string | undefined;
|
||||
/** Max rows to read from the JSONL tail; 0 = read all. */
|
||||
tailLimit?: number;
|
||||
/** Include per-row check field (for multi-check aggregates). */
|
||||
checkFn?: (row: T) => string | undefined;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read a JSONL file and produce the aggregate map keyed by signature.
|
||||
* Safe on missing or malformed files — returns empty map.
|
||||
*/
|
||||
export async function aggregate<T = any>(
|
||||
jsonlPath: string,
|
||||
opts: AggregateOptions<T>,
|
||||
): Promise<Map<string, AggregateRow>> {
|
||||
const out = new Map<string, AggregateRow>();
|
||||
let raw: string;
|
||||
try { raw = await readFile(jsonlPath, "utf8"); } catch { return out; }
|
||||
const lines = raw.split("\n").filter(l => l.length > 0);
|
||||
const sliceFrom = opts.tailLimit && opts.tailLimit > 0 ? Math.max(0, lines.length - opts.tailLimit) : 0;
|
||||
|
||||
const timeFn = opts.timeFn ?? ((r: any) => r?.audited_at ?? r?.reviewed_at ?? r?.timestamp ?? r?.ran_at);
|
||||
const summaryFn = opts.summaryFn ?? ((r: any) => r?.summary ?? r?.representative_summary);
|
||||
|
||||
// Per-signature scope tracking — need counts by scope to compute
|
||||
// distinct_scopes without double-counting a scope that appears 50
|
||||
// times. Using a Set<scope> per signature.
|
||||
const scopeSets = new Map<string, Set<string>>();
|
||||
const checkSets = new Map<string, Set<string>>();
|
||||
|
||||
for (let i = sliceFrom; i < lines.length; i++) {
|
||||
let row: T;
|
||||
try { row = JSON.parse(lines[i]) as T; } catch { continue; }
|
||||
const sig = opts.keyFn(row);
|
||||
if (!sig) continue;
|
||||
|
||||
let agg = out.get(sig);
|
||||
if (!agg) {
|
||||
agg = {
|
||||
signature: sig,
|
||||
count: 0,
|
||||
distinct_scopes: 0,
|
||||
first_seen: "",
|
||||
last_seen: "",
|
||||
confidence: 0,
|
||||
representative_summary: "",
|
||||
scopes: [],
|
||||
checks: [],
|
||||
};
|
||||
out.set(sig, agg);
|
||||
scopeSets.set(sig, new Set<string>());
|
||||
checkSets.set(sig, new Set<string>());
|
||||
}
|
||||
|
||||
agg.count += 1;
|
||||
|
||||
const scope = opts.scopeFn(row);
|
||||
if (scope !== undefined && scope !== null && scope !== "") {
|
||||
scopeSets.get(sig)!.add(String(scope));
|
||||
// Keep scopes array ordered by recency (newest wins — shift
|
||||
// oldest when at cap).
|
||||
const arr = agg.scopes;
|
||||
const s = String(scope);
|
||||
const existing = arr.indexOf(s);
|
||||
if (existing >= 0) arr.splice(existing, 1);
|
||||
arr.push(s);
|
||||
if (arr.length > 20) arr.shift();
|
||||
}
|
||||
|
||||
if (opts.checkFn) {
|
||||
const c = opts.checkFn(row);
|
||||
if (c) checkSets.get(sig)!.add(String(c));
|
||||
}
|
||||
|
||||
const t = timeFn(row);
|
||||
if (t) {
|
||||
if (!agg.first_seen || t < agg.first_seen) agg.first_seen = t;
|
||||
if (!agg.last_seen || t > agg.last_seen) agg.last_seen = t;
|
||||
}
|
||||
|
||||
const s = summaryFn(row);
|
||||
if (s) agg.representative_summary = String(s);
|
||||
}
|
||||
|
||||
// Finalize derived fields.
|
||||
for (const [sig, agg] of out) {
|
||||
const scopes = scopeSets.get(sig) ?? new Set<string>();
|
||||
agg.distinct_scopes = scopes.size;
|
||||
agg.confidence = agg.count > 0 ? Math.min(1, agg.distinct_scopes / agg.count) : 0;
|
||||
const checks = checkSets.get(sig);
|
||||
if (checks) agg.checks = Array.from(checks).sort();
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
/**
|
||||
* Severity policy derived from aggregate stats. The rating lives here
|
||||
* (not in each check) so all KB readers ramp severity consistently.
|
||||
*
|
||||
* - confidence × count product is the real signal.
|
||||
* - Low confidence (< 0.3) = same-scope noise → info regardless of count.
|
||||
* - Mid confidence (0.3-0.6) = mixed signal → warn at count ≥ 3.
|
||||
* - High confidence (> 0.6) with count ≥ 5 = block-worthy cross-cutting pattern.
|
||||
*
|
||||
* Callers can override by reading agg directly; this is the default
|
||||
* policy that matches the "don't escalate one unfixed PR" discipline.
|
||||
*/
|
||||
export function ratingSeverity(agg: AggregateRow): "info" | "warn" | "block" {
|
||||
if (agg.confidence >= 0.6 && agg.count >= 5) return "block";
|
||||
if (agg.confidence >= 0.3 && agg.count >= 3) return "warn";
|
||||
return "info";
|
||||
}
|
||||
|
||||
/** Human-friendly one-line summary of an aggregate row for finding evidence. */
|
||||
export function formatAgg(agg: AggregateRow): string {
|
||||
return `count=${agg.count} distinct_scopes=${agg.distinct_scopes} confidence=${agg.confidence.toFixed(2)} seen=[${agg.first_seen.slice(0, 10)}..${agg.last_seen.slice(0, 10)}]`;
|
||||
}
|
||||
@ -18,7 +18,14 @@ export interface Claim {
|
||||
// Heuristic rating of how strong the claim is. "green+tested"
|
||||
// is strong; "should work" is weak. Drives sensitivity — stronger
|
||||
// claims get harder-blocked on weak evidence.
|
||||
strength: "weak" | "moderate" | "strong";
|
||||
//
|
||||
// "empirical" is a separate class: runtime measurements like
|
||||
// "N cloud calls" / "306s end-to-end" / "accepted on attempt N".
|
||||
// These cannot be verified from a static diff — only from the test
|
||||
// output that produced them. Inference skips diff-verification for
|
||||
// empirical claims; they become info-level context unless a future
|
||||
// runtime_evidence check contradicts them.
|
||||
strength: "weak" | "moderate" | "strong" | "empirical";
|
||||
}
|
||||
|
||||
export interface Finding {
|
||||
|
||||
528
tests/real-world/enrich_prd_pipeline.ts
Normal file
528
tests/real-world/enrich_prd_pipeline.ts
Normal file
@ -0,0 +1,528 @@
|
||||
// Real-world architecture stress test — 6 iterations of the full pipeline
|
||||
// against the PRD as a corpus. Goal: prove at scale what Phase 21
|
||||
// promised (context continuation + tree-split), plus Phase 19
|
||||
// compounding across iterations.
|
||||
//
|
||||
// Run: bun run tests/real-world/enrich_prd_pipeline.ts
|
||||
//
|
||||
// No mocks. No skipped layers. On any error, the test triggers
|
||||
// cloud-rescue rather than fail — it's the architecture's job to
|
||||
// recover. The test FAILS only if we can't complete 6 iterations.
|
||||
|
||||
import { readFile, writeFile, mkdir } from "node:fs/promises";
|
||||
import { createHash } from "node:crypto";
|
||||
|
||||
// ─── Corpus + retrieval tuning ────────────────────────────────────
const PRD_PATH = "/home/profit/lakehouse/docs/PRD.md";
const CHUNK_SIZE = 800; // chars per chunk — ~200 tokens
const CHUNK_OVERLAP = 120; // chars of overlap between adjacent chunks
const TOP_K_RETRIEVE = 12; // chunks per iteration — pulled up to force overflow
const CONTEXT_BUDGET_CHARS = 4000; // tight budget — forces tree-split on every iter
const INJECT_FAIL_ON_ITER = 3; // force the TASK-retry loop on iter 3

// Continuation controls (per-cloud-call) — used for output-overflow.
// Separate from the task-retry loop (per-task) — that handles errors
// across attempts.
const PRIMARY_MAX_TOKENS = 150; // tight — forces truncation
// NOTE(review): every continuation uses this same fixed budget (2× the
// primary); it does not keep doubling per retry.
const CONTINUATION_MAX_TOKENS = 300;
const MAX_CONTINUATIONS = 6; // max stitch pieces per cloud call

// Task-level retry loop (J's clarification, 2026-04-22):
// When a TASK errors, retry the whole task up to 6 times. Each
// retry gets prior attempts' failures injected as learning context,
// so attempt N+1 is informed by what N failed at. The loop caps at
// 6 to avoid infinite spinning on genuinely unsolvable tasks.
const MAX_TASK_RETRIES = 6;

// To FORCE the retry loop on iter INJECT_FAIL_ON_ITER, cycle through
// 5 deliberately-invalid models + 1 valid one. Attempts 1-5 will
// 502/404 from Ollama Cloud; attempt 6 finally succeeds. Proves the
// loop fires all 6 with compounding failure context.
const FORCE_RETRY_MODEL_SEQUENCE = [
  "deliberately-invalid-model-attempt-1",
  "deliberately-invalid-model-attempt-2",
  "deliberately-invalid-model-attempt-3",
  "deliberately-invalid-model-attempt-4",
  "deliberately-invalid-model-attempt-5",
  "gpt-oss:20b", // 6th attempt succeeds
];

// ─── Service endpoints + models ───────────────────────────────────
const GATEWAY = "http://localhost:3100"; // /v1/chat + playbook seed endpoint
const SIDECAR = "http://localhost:3200"; // /embed endpoint
const CLOUD_MODEL = "gpt-oss:120b";
const RESCUE_MODEL = "gpt-oss:20b"; // fallback local cloud model via sidecar
// Base-36 timestamp — unique-enough per run, used in paths and seed names.
const RUN_NONCE = Date.now().toString(36);
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/${RUN_NONCE}`;

// The 6 progressively-compounding questions. #6 explicitly requires
// synthesis across prior 5 answers.
const QUESTIONS: string[] = [
  "Summarize the Lakehouse project's one-paragraph thesis: what problem does it solve, what's the unique approach?",
  "How does Phase 19 playbook memory turn successful fills into a signal that boosts future rankings?",
  "Explain the role of Phase 24 observer in the learning loop — what does it see, and what does it feed into?",
  "What's the VRAM-aware profile swap mechanism in Phase 17, and why does it matter for multi-model serving?",
  "How do Phase 25 validity windows and Phase 27 playbook versioning interact when a schema drifts?",
  "Synthesize the prior 5 answers: how do the pieces (playbook memory, observer, profile swap, validity windows, versioning) compose into a system that measurably gets smarter over time? Cite specific prior answers.",
];
|
||||
|
||||
// One embedded slice of the PRD: content hash id, text, embedding
// vector from the sidecar, and the char offset it was cut from.
type Chunk = { id: string; text: string; embedding: number[]; offset: number };

// Per-iteration telemetry, written to iter_N.json and rolled up into
// the run summary.
interface IterationResult {
  iteration: number;
  question: string;
  retrieval_top_k: number; // chunks actually retrieved (≤ TOP_K_RETRIEVE)
  context_chars_before_budget: number; // joined chunk size before the budget check
  tree_split_fired: boolean; // true when context exceeded CONTEXT_BUDGET_CHARS
  cloud_calls_total: number;
  continuation_retries: number; // stitches from the LAST (successful) attempt only
  rescue_triggered: boolean;
  // Task-level retry telemetry
  task_attempts_made: number; // how many attempts fired (1 = first succeeded)
  task_retry_history: Array<{ n: number; model: string; error: string }>;
  playbook_id: string | null; // null when the seed call failed
  tokens_prompt: number;
  tokens_completion: number;
  citations_from_prior_iterations: string[]; // prior playbook IDs offered to the prompt
  duration_ms: number;
  answer_preview: string; // first 500 chars of the accepted answer
  errors_recovered: string[]; // every failure the iteration absorbed without dying
}
|
||||
|
||||
function log(msg: string) { console.log(`[enrich] ${msg}`); }
|
||||
|
||||
function sleep(ms: number) { return new Promise(r => setTimeout(r, ms)); }
|
||||
|
||||
function cosine(a: number[], b: number[]): number {
|
||||
let dot = 0, na = 0, nb = 0;
|
||||
for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
|
||||
if (na === 0 || nb === 0) return 0;
|
||||
return dot / (Math.sqrt(na) * Math.sqrt(nb));
|
||||
}
|
||||
|
||||
/** Short content fingerprint: first 10 hex chars of SHA-256. */
function hash(s: string): string {
  const digest = createHash("sha256").update(s).digest("hex");
  return digest.slice(0, 10);
}
|
||||
|
||||
// POST `texts` to the sidecar's /embed endpoint; returns one embedding
// vector per input text.
// NOTE(review): an earlier comment claimed "on partial failure, retry
// individually" — no such retry exists here. Any non-OK response (or the
// 2-minute timeout) throws for the whole batch.
async function embedBatch(texts: string[]): Promise<number[][]> {
  const r = await fetch(`${SIDECAR}/embed`, {
    method: "POST", headers: { "content-type": "application/json" },
    body: JSON.stringify({ texts }),
    signal: AbortSignal.timeout(120000), // 2-minute cap for the whole batch
  });
  if (!r.ok) throw new Error(`embed batch ${r.status}: ${await r.text()}`);
  const j: any = await r.json();
  // presumably the sidecar responds { embeddings: number[][] } — verify
  // against the sidecar's /embed handler.
  return j.embeddings;
}
|
||||
|
||||
function chunkText(text: string): Array<{ text: string; offset: number }> {
|
||||
const out: Array<{ text: string; offset: number }> = [];
|
||||
let i = 0;
|
||||
while (i < text.length) {
|
||||
const end = Math.min(i + CHUNK_SIZE, text.length);
|
||||
const slice = text.slice(i, end).trim();
|
||||
if (slice.length > 50) out.push({ text: slice, offset: i });
|
||||
if (end >= text.length) break;
|
||||
i = end - CHUNK_OVERLAP;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
// Single chat completion through the gateway's /v1/chat endpoint.
// Throws on any non-OK HTTP status or on the 3-minute timeout. Missing
// response fields degrade to "" / 0 / "?" rather than throwing, so
// callers can treat an empty `content` as a retryable failure mode.
async function chat(opts: {
  provider: "ollama" | "ollama_cloud",
  model: string,
  messages: Array<{ role: string; content: string }>,
  max_tokens: number,
  think: boolean,
}): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; finish_reason: string }> {
  const r = await fetch(`${GATEWAY}/v1/chat`, {
    method: "POST", headers: { "content-type": "application/json" },
    body: JSON.stringify({ ...opts }),
    signal: AbortSignal.timeout(180000), // 3-minute cap per call
  });
  if (!r.ok) throw new Error(`/v1/chat ${r.status}: ${await r.text()}`);
  const j: any = await r.json();
  // OpenAI-style response shape: choices[0].message.content + usage.
  return {
    content: j.choices?.[0]?.message?.content ?? "",
    prompt_tokens: j.usage?.prompt_tokens ?? 0,
    completion_tokens: j.usage?.completion_tokens ?? 0,
    finish_reason: j.choices?.[0]?.finish_reason ?? "?",
  };
}
|
||||
|
||||
// ─── Tree-split over oversized chunk set ──────────────────────────
// Sequentially summarizes `chunks` shard-by-shard into a rolling
// scratchpad, making one cloud call per shard. Returns the final
// scratchpad plus how many cloud calls were made. The scratchpad is
// clamped to CONTEXT_BUDGET_CHARS by dropping its OLDEST text (slice
// from the end keeps the newest notes).
async function treeSplitSummarize(
  chunks: Chunk[],
  question: string,
): Promise<{ scratchpad: string; cloud_calls: number }> {
  // Shard into groups fitting within half the budget each.
  // With defaults (4000/2)/800 → 2 chunks per shard.
  const perShard = Math.max(1, Math.floor((CONTEXT_BUDGET_CHARS / 2) / CHUNK_SIZE));
  const shards: Chunk[][] = [];
  for (let i = 0; i < chunks.length; i += perShard) {
    shards.push(chunks.slice(i, i + perShard));
  }
  log(` tree-split: ${chunks.length} chunks → ${shards.length} shards of up to ${perShard}`);
  let scratchpad = "";
  let cloud_calls = 0;
  for (let si = 0; si < shards.length; si++) {
    const shard = shards[si];
    const shardText = shard.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n");
    // Each shard sees the question, its own text, and the scratchpad so
    // far — the model updates the scratchpad incrementally.
    const userMsg = `Question: ${question}\n\nShard ${si + 1}/${shards.length} of source material:\n\n${shardText}\n\nScratchpad so far:\n${scratchpad || "(empty)"}\n\nUpdate the scratchpad: extract only facts from THIS shard that help answer the question. Be terse. No prose.`;
    const r = await chat({
      provider: "ollama_cloud",
      model: CLOUD_MODEL,
      messages: [
        { role: "system", content: "You maintain a concise factual scratchpad across multiple shards of source text. No prose outside the scratchpad. Each shard, append ≤80 words of relevant facts." },
        { role: "user", content: userMsg },
      ],
      max_tokens: 500,
      think: false,
    });
    cloud_calls += 1;
    scratchpad += `\n--- shard ${si + 1} notes ---\n${r.content.trim()}`;
    if (scratchpad.length > CONTEXT_BUDGET_CHARS) {
      // Keep only the newest CONTEXT_BUDGET_CHARS chars (drops oldest).
      scratchpad = scratchpad.slice(-CONTEXT_BUDGET_CHARS);
      log(` tree-split: scratchpad truncated to ${scratchpad.length} chars`);
    }
  }
  return { scratchpad, cloud_calls };
}
|
||||
|
||||
// ─── Continuable generate — up to max_continuations stitches ──────
//
// Two failure modes handled:
//   A) Empty response — typically thinking model burned the budget
//      on hidden reasoning. Retry same prompt with the (larger, fixed)
//      CONTINUATION_MAX_TOKENS budget.
//   B) Truncated response (finish_reason=length) — answer got cut off
//      mid-sentence. Pass the partial back as an assistant turn and ask
//      the model to continue from where it stopped.
//
// Stitching: content concatenates across retries; prompt_tokens and
// completion_tokens accumulate; finish_reason reflects the LAST call.
// Loop exits on the first call that finishes cleanly (stop) with
// non-empty content, OR when retries hit the cap.
// NOTE(review): when max_continuations is omitted it defaults to 1, not
// MAX_CONTINUATIONS — callers must pass the cap explicitly.
async function generateContinuable(
  opts: Parameters<typeof chat>[0] & { max_continuations?: number },
): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string }> {
  const maxCont = opts.max_continuations ?? 1;
  let total = await chat(opts);
  let retries = 0;
  while (retries < maxCont && (total.content.length === 0 || total.finish_reason === "length")) {
    retries += 1;
    const mode = total.content.length === 0 ? "empty" : "truncated";
    log(` continuation retry ${retries}/${maxCont} (${mode}: finish=${total.finish_reason}, content=${total.content.length} chars)`);
    // Continuation prompt — branch on failure mode:
    //   empty  → same prompt, bigger token budget (thinking budget case)
    //   length → pass the partial as an assistant turn, ask to continue
    const continuationMessages = total.content.length === 0
      ? opts.messages
      : [
          ...opts.messages,
          { role: "assistant", content: total.content },
          { role: "user", content: "Continue from exactly where you stopped. Do not repeat. Finish the answer." },
        ];
    const continued = await chat({
      ...opts,
      max_tokens: CONTINUATION_MAX_TOKENS,
      messages: continuationMessages,
    });
    // Accumulate the stitch: concat content, sum token counts, keep the
    // latest finish_reason.
    total = {
      content: total.content + continued.content,
      prompt_tokens: total.prompt_tokens + continued.prompt_tokens,
      completion_tokens: total.completion_tokens + continued.completion_tokens,
      finish_reason: continued.finish_reason,
    };
  }
  return { ...total, continuation_retries: retries };
}
|
||||
|
||||
// ─── Single iteration: retrieve → budget-check → chat → seed ─────
// Runs one full question through the pipeline: embed the question,
// retrieve top-K chunks, tree-split if over budget, generate with the
// task-level retry loop (+ local fallback), then seed the answer into
// playbook memory. Never throws for recoverable failures — they are
// recorded in errors_recovered and the iteration still returns a result.
async function runIteration(
  iteration: number,
  question: string,
  allChunks: Chunk[],
  priorPlaybookIds: string[],
  priorAnswers: string[],
): Promise<IterationResult> {
  const started = Date.now();
  const errorsRecovered: string[] = [];
  log(`iter ${iteration}: "${question.slice(0, 70)}..."`);

  // 1. Embed the question
  const qEmb = (await embedBatch([question]))[0];

  // 2. Retrieve top-K chunks by cosine
  const scored = allChunks
    .map(c => ({ c, score: cosine(qEmb, c.embedding) }))
    .sort((a, b) => b.score - a.score)
    .slice(0, TOP_K_RETRIEVE);
  const chunks = scored.map(x => x.c);
  // NOTE(review): assumes at least one chunk exists — scored[0] would be
  // undefined on an empty corpus.
  log(` retrieved top ${chunks.length} chunks (score range ${scored[0].score.toFixed(3)} .. ${scored[scored.length - 1].score.toFixed(3)})`);

  // 3. Context budget check — tree-split if over
  const contextChars = chunks.map(c => c.text).join("\n\n").length;
  let contextForPrompt: string;
  let treeSplit = false;
  let cloudCallsTotal = 0;
  if (contextChars > CONTEXT_BUDGET_CHARS) {
    treeSplit = true;
    log(` context ${contextChars} chars > budget ${CONTEXT_BUDGET_CHARS} → tree-split`);
    const { scratchpad, cloud_calls } = await treeSplitSummarize(chunks, question);
    contextForPrompt = `Distilled scratchpad from ${chunks.length} source chunks (too large to fit directly):\n${scratchpad}`;
    cloudCallsTotal += cloud_calls;
  } else {
    contextForPrompt = chunks.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n");
  }

  // 4. Seed prompt with prior iteration answers (real compounding).
  //    Not just IDs — the model needs the CONTENT to synthesize.
  let citationBlock = "";
  let citationsReceived: string[] = [];
  if (priorPlaybookIds.length > 0 && priorAnswers.length > 0) {
    const lines = priorAnswers.map((ans, i) => {
      // NOTE(review): pairs answer i with priorPlaybookIds[i]; if an
      // earlier seed failed (id not pushed by main), these can desync.
      const pid = priorPlaybookIds[i]?.slice(0, 12) ?? "unknown";
      // Trim each prior answer to ~400 chars so we don't blow budget
      return `[pb:${pid}] iter ${i + 1} answer:\n${ans.slice(0, 400)}\n`;
    });
    citationBlock = `\n\n═══ PRIOR ITERATIONS (compounding context) ═══\n${lines.join("\n")}═══ end prior iterations ═══\n\nYour answer MUST cite specific prior iterations using [pb:ID] notation when drawing on them. Synthesis questions require explicit cross-iteration reasoning.`;
    citationsReceived = priorPlaybookIds.slice();
  }

  // 5. TASK-LEVEL RETRY LOOP — per J's clarification 2026-04-22.
  //    Try the task up to MAX_TASK_RETRIES times. Each retry:
  //    a) Picks a model (normally CLOUD_MODEL; on INJECT_FAIL_ON_ITER,
  //       cycles through 5 invalid models + 1 valid to force full loop)
  //    b) Injects prior attempt errors as learning context
  //    c) If the attempt succeeds (non-empty, >100 chars), loop exits
  //    d) Otherwise, records failure and tries again with the learning
  //
  //    Cap at 6 so we don't spin forever on unsolvable tasks.
  let result: { content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string } | null = null;
  let rescueTriggered = false;
  const taskAttemptHistory: Array<{ n: number; model: string; error: string }> = [];
  const forceRetries = iteration === INJECT_FAIL_ON_ITER;
  if (forceRetries) log(` FORCING TASK-RETRY LOOP — iter ${iteration} will cycle through 5 invalid models + 1 valid`);

  for (let attempt = 1; attempt <= MAX_TASK_RETRIES; attempt++) {
    const modelForAttempt = forceRetries
      ? FORCE_RETRY_MODEL_SEQUENCE[attempt - 1]
      : CLOUD_MODEL;
    // Compose a prior-attempts learning block for attempts 2+
    const learningBlock = taskAttemptHistory.length > 0
      ? `\n\n═══ PRIOR ATTEMPTS THIS TASK (do NOT repeat these failures; adjust approach) ═══\n${taskAttemptHistory.map(a => `Attempt ${a.n} (model ${a.model}) failed: ${a.error.slice(0, 160)}`).join("\n")}\n═══ end prior attempts ═══\n`
      : "";
    log(` task attempt ${attempt}/${MAX_TASK_RETRIES}: model=${modelForAttempt}${learningBlock ? " [with prior-failure context]" : ""}`);
    try {
      const r = await generateContinuable({
        provider: "ollama_cloud",
        model: modelForAttempt,
        messages: [
          { role: "system", content: "You answer questions about the Lakehouse PRD using only the provided source material and prior iteration answers. Be specific. Cite chunk offsets OR [pb:ID] markers. Write a detailed 250-word answer." },
          { role: "user", content: `Question: ${question}\n\nSource material:\n${contextForPrompt}${citationBlock}${learningBlock}` },
        ],
        max_tokens: PRIMARY_MAX_TOKENS,
        think: false,
        max_continuations: MAX_CONTINUATIONS,
      });
      cloudCallsTotal += 1 + r.continuation_retries;
      if (r.content && r.content.length > 100) {
        // Acceptable answer — exit loop
        result = r;
        if (attempt > 1) {
          log(` task attempt ${attempt} SUCCEEDED (${r.content.length} chars) after ${attempt - 1} prior failures`);
          rescueTriggered = true;
        }
        break;
      }
      // Thin response — count as failure with learning signal
      const err = `thin-answer: ${r.content.length} chars, finish=${r.finish_reason}`;
      taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
      errorsRecovered.push(`attempt ${attempt}: ${err}`);
    } catch (e) {
      const err = (e as Error).message;
      taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
      errorsRecovered.push(`attempt ${attempt}: ${err.slice(0, 120)}`);
      // Count the failed call too — it still hit the gateway.
      cloudCallsTotal += 1;
    }
  }

  // Last-ditch: if all 6 task attempts failed, try the local fallback
  // once more so we at least return SOMETHING. This is the "don't get
  // caught in a loop, accept best-so-far" rule J stated explicitly.
  if (!result) {
    errorsRecovered.push(`all ${MAX_TASK_RETRIES} task attempts failed — local fallback`);
    rescueTriggered = true;
    try {
      result = await generateContinuable({
        provider: "ollama",
        model: "qwen3.5:latest",
        messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
        max_tokens: 300,
        think: false,
        max_continuations: 2,
      });
      cloudCallsTotal += 1 + result.continuation_retries;
    } catch (e) {
      // Absolute last resort — fabricate a skeleton result
      result = {
        content: `[task failed after ${MAX_TASK_RETRIES} retries + local fallback: ${(e as Error).message}]`,
        prompt_tokens: 0,
        completion_tokens: 0,
        continuation_retries: 0,
        finish_reason: "error",
      };
    }
  }
  if (result.content.length === 0) {
    errorsRecovered.push("even rescue returned empty — last-ditch local fallback");
    rescueTriggered = true;
    result = await generateContinuable({
      provider: "ollama",
      model: "qwen3.5:latest",
      messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
      max_tokens: 300,
      think: false,
    });
    cloudCallsTotal += 1;
  }

  // 6. Seed playbook with the answer
  let playbook_id: string | null = null;
  try {
    // NOTE(review): `ts` is never used below — dead local.
    const ts = new Date().toISOString();
    const seedOp = `TEST: enrich_prd_run_${RUN_NONCE} iter${iteration} in Corpus, PRD`;
    const r = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
      method: "POST", headers: { "content-type": "application/json" },
      body: JSON.stringify({
        operation: seedOp,
        approach: `q="${question.slice(0, 80)}" context_chars=${contextChars} tree_split=${treeSplit}`,
        context: result.content.slice(0, 600),
        endorsed_names: [`iter${iteration}_${RUN_NONCE}`],
        append: true,
      }),
      signal: AbortSignal.timeout(15000),
    });
    if (r.ok) {
      const j: any = await r.json();
      playbook_id = j.outcome?.playbook_id ?? null;
    } else {
      errorsRecovered.push(`seed ${r.status}: ${(await r.text()).slice(0, 100)}`);
    }
  } catch (e) {
    errorsRecovered.push(`seed exception: ${(e as Error).message}`);
  }

  return {
    iteration,
    question,
    retrieval_top_k: chunks.length,
    context_chars_before_budget: contextChars,
    tree_split_fired: treeSplit,
    cloud_calls_total: cloudCallsTotal,
    continuation_retries: result.continuation_retries,
    rescue_triggered: rescueTriggered,
    // NOTE(review): when every attempt failed and the fallback produced
    // the answer, this reports MAX_TASK_RETRIES + 1 "attempts" — the +1
    // is only accurate when an in-loop attempt succeeded.
    task_attempts_made: taskAttemptHistory.length + 1, // +1 for the successful attempt
    task_retry_history: taskAttemptHistory,
    playbook_id,
    tokens_prompt: result.prompt_tokens,
    tokens_completion: result.completion_tokens,
    citations_from_prior_iterations: citationsReceived,
    duration_ms: Date.now() - started,
    answer_preview: result.content.slice(0, 500),
    errors_recovered: errorsRecovered,
  };
}
|
||||
|
||||
async function main() {
|
||||
await mkdir(OUT_DIR, { recursive: true });
|
||||
log(`run nonce: ${RUN_NONCE}`);
|
||||
log(`output dir: ${OUT_DIR}`);
|
||||
|
||||
// ─── Phase 1: load, chunk, embed the PRD ───────────────────────
|
||||
log(`loading PRD from ${PRD_PATH}`);
|
||||
const prd = await readFile(PRD_PATH, "utf8");
|
||||
log(`PRD: ${prd.length} chars, ${prd.split("\n").length} lines`);
|
||||
|
||||
const raw_chunks = chunkText(prd);
|
||||
log(`chunked into ${raw_chunks.length} pieces (size ${CHUNK_SIZE}, overlap ${CHUNK_OVERLAP})`);
|
||||
|
||||
// Embed in batches of 32 to avoid sidecar overload
|
||||
const allChunks: Chunk[] = [];
|
||||
const BATCH = 32;
|
||||
const t0 = Date.now();
|
||||
for (let i = 0; i < raw_chunks.length; i += BATCH) {
|
||||
const batch = raw_chunks.slice(i, i + BATCH);
|
||||
const embs = await embedBatch(batch.map(b => b.text));
|
||||
for (let j = 0; j < batch.length; j++) {
|
||||
allChunks.push({
|
||||
id: hash(batch[j].text),
|
||||
text: batch[j].text,
|
||||
embedding: embs[j].map(x => Number(x)),
|
||||
offset: batch[j].offset,
|
||||
});
|
||||
}
|
||||
log(` embedded ${allChunks.length}/${raw_chunks.length}`);
|
||||
}
|
||||
log(`embedded all ${allChunks.length} chunks in ${((Date.now() - t0) / 1000).toFixed(1)}s`);
|
||||
|
||||
// ─── Phase 2: 6 iterations ─────────────────────────────────────
|
||||
const results: IterationResult[] = [];
|
||||
const priorIds: string[] = [];
|
||||
const priorAnswers: string[] = [];
|
||||
for (let i = 1; i <= QUESTIONS.length; i++) {
|
||||
const q = QUESTIONS[i - 1];
|
||||
const r = await runIteration(i, q, allChunks, priorIds, priorAnswers);
|
||||
results.push(r);
|
||||
if (r.playbook_id) priorIds.push(r.playbook_id);
|
||||
priorAnswers.push(r.answer_preview);
|
||||
log(` → iter ${i}: ${r.errors_recovered.length} errors recovered, ${r.continuation_retries} continuations, tree-split=${r.tree_split_fired}, rescue=${r.rescue_triggered}, ${r.duration_ms}ms`);
|
||||
await writeFile(`${OUT_DIR}/iter_${i}.json`, JSON.stringify(r, null, 2));
|
||||
}
|
||||
// Check whether iter 6 actually cited prior pb:IDs in its answer.
|
||||
// Playbook IDs look like `pb-seed-<hex>` so the regex needs to allow
|
||||
// hyphens + letters inside the brackets, not just hex chars.
|
||||
const iter6 = results[5];
|
||||
const citationsHonored = iter6 ? (iter6.answer_preview.match(/\[pb:[\w-]+\]/gi)?.length ?? 0) : 0;
|
||||
|
||||
// ─── Phase 3: summary ──────────────────────────────────────────
|
||||
const summary = {
|
||||
run_nonce: RUN_NONCE,
|
||||
ran_at: new Date().toISOString(),
|
||||
prd_chars: prd.length,
|
||||
prd_chunks: allChunks.length,
|
||||
iterations: results.length,
|
||||
total_cloud_calls: results.reduce((s, r) => s + r.cloud_calls_total, 0),
|
||||
total_continuation_retries: results.reduce((s, r) => s + r.continuation_retries, 0),
|
||||
total_errors_recovered: results.reduce((s, r) => s + r.errors_recovered.length, 0),
|
||||
tree_splits_fired: results.filter(r => r.tree_split_fired).length,
|
||||
rescues_triggered: results.filter(r => r.rescue_triggered).length,
|
||||
iter6_received_prior_ids: results[5]?.citations_from_prior_iterations.length ?? 0,
|
||||
iter6_actually_cited_in_answer: citationsHonored,
|
||||
iter3_task_attempts: results[2]?.task_attempts_made ?? 0,
|
||||
iter3_task_retries: results[2]?.task_retry_history.length ?? 0,
|
||||
max_task_attempts_any_iter: Math.max(...results.map(r => r.task_attempts_made)),
|
||||
total_duration_ms: results.reduce((s, r) => s + r.duration_ms, 0),
|
||||
overall: results.length === 6 && results.every(r => r.playbook_id !== null) ? "PASS" : "PARTIAL",
|
||||
};
|
||||
await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));
|
||||
|
||||
log("");
|
||||
log(`══════ SUMMARY ${summary.overall} ══════`);
|
||||
log(` 6 iterations, ${summary.total_cloud_calls} cloud calls, ${summary.total_errors_recovered} errors recovered`);
|
||||
log(` tree-splits: ${summary.tree_splits_fired}/6 continuations: ${summary.total_continuation_retries} rescues: ${summary.rescues_triggered}`);
|
||||
log(` iter 6 received ${summary.iter6_received_prior_ids} prior IDs, cited ${summary.iter6_actually_cited_in_answer} [pb:...] markers in its answer`);
|
||||
log(` iter 3 task-retry loop: ${summary.iter3_task_attempts} attempts (${summary.iter3_task_retries} prior-failure retries before success)`);
|
||||
log(` total duration: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
|
||||
log("");
|
||||
for (const r of results) {
|
||||
const flags = [
|
||||
r.tree_split_fired ? "tree-split" : "",
|
||||
r.continuation_retries > 0 ? `cont=${r.continuation_retries}` : "",
|
||||
r.rescue_triggered ? "rescued" : "",
|
||||
r.errors_recovered.length > 0 ? `err=${r.errors_recovered.length}` : "",
|
||||
].filter(Boolean).join(" ");
|
||||
log(` iter ${r.iteration}: ${r.tokens_prompt}+${r.tokens_completion} tok, ${r.duration_ms}ms ${flags ? `[${flags}]` : ""}`);
|
||||
}
|
||||
log("");
|
||||
log(`artifacts: ${OUT_DIR}/{iter_1..6.json, summary.json}`);
|
||||
process.exit(summary.overall === "PASS" ? 0 : 1);
|
||||
}
|
||||
|
||||
// Top-level guard: exit 2 marks an infrastructure crash, distinct from
// exit 1 (pipeline ran but only reached PARTIAL).
main().catch(e => { console.error("[enrich] fatal:", e); process.exit(2); });
|
||||
267
tests/real-world/hard_task_escalation.ts
Normal file
267
tests/real-world/hard_task_escalation.ts
Normal file
@ -0,0 +1,267 @@
|
||||
// Hard-task escalation test. The task is deliberately constructed so
|
||||
// that a local 7B model (qwen3.5:latest) will miss at least one of the
|
||||
// validation rules. Watch the escalation ladder:
|
||||
// 1. qwen3.5:latest (local 7B) — likely fails
|
||||
// 2. qwen3:latest (local 7B) — likely fails differently
|
||||
// 3. gpt-oss:20b (cloud 20B) — may fail
|
||||
// 4. gpt-oss:120b (cloud 120B) — should succeed
|
||||
// 5. gpt-oss:120b w/ prior-attempt errors injected — retry with context
|
||||
// 6. absolute last ditch: return best-so-far with failure annotation
|
||||
//
|
||||
// Each attempt:
|
||||
// - Calls the model via /v1/chat
|
||||
// - Validates the output against a strict rubric
|
||||
// - On fail: records the specific rubric violations + the partial
|
||||
// output, injects both into the next attempt's prompt as "here's
|
||||
// what's wrong, fix it specifically"
|
||||
// - On success: exit loop
|
||||
//
|
||||
// Run: bun run tests/real-world/hard_task_escalation.ts
|
||||
|
||||
import { writeFile, mkdir } from "node:fs/promises";
|
||||
|
||||
const GATEWAY = "http://localhost:3100"; // model gateway exposing /v1/chat
// Unique per-run artifact directory (base36 timestamp suffix).
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/hard_task_${Date.now().toString(36)}`;
const MAX_ATTEMPTS = 6; // one attempt per LADDER rung
|
||||
|
||||
// The hard task. Specific enough that a small model will miss at
|
||||
// least one rule. Not purely knowledge-based — it's a code-generation
|
||||
// task with strict structural constraints.
|
||||
const TASK = `Write a complete Rust async function with the EXACT signature:
|
||||
|
||||
pub async fn check_drift_batched(refs: Vec<DocRef>) -> Result<Vec<String>, String>
|
||||
|
||||
It must:
|
||||
1. Group refs by tool name (case-insensitive — use .to_ascii_lowercase())
|
||||
2. Issue parallel HTTP GET requests to http://localhost:3900/docs/{tool}/diff?since={snippet_hash}
|
||||
3. Use reqwest and a JoinSet/Semaphore to cap concurrent in-flight requests at 4
|
||||
4. On HTTP 5xx, retry with exponential backoff: sleep 250ms, then 500ms, then 1000ms, then give up on that tool
|
||||
5. Parse the response JSON: {"drifted": bool, ...}. Return a Vec<String> of tool names where drifted == true
|
||||
6. All errors bubble via ? or Result — NO .unwrap(), NO .expect(), NO panic!()
|
||||
7. Include rustdoc /// comments on the function and each internal helper
|
||||
|
||||
Assume this struct is already imported:
|
||||
|
||||
pub struct DocRef { pub tool: String, pub snippet_hash: Option<String>, pub version_seen: String }
|
||||
|
||||
Output ONLY the Rust code. No prose, no markdown fences, no explanation. Start directly with the /// doc comment.`;
|
||||
|
||||
// Escalation ladder — small-local → large-local → cloud → specialist
|
||||
// cloud → trillion-param cloud. Corrected 2026-04-22 per J:
|
||||
// gpt-oss:20b is LOCAL (ollama list confirms 13 GB on disk), and the
|
||||
// final escalation tier should be kimi-k2:1t (the biggest model
|
||||
// we have access to on Ollama Cloud).
|
||||
const LADDER: Array<{ provider: "ollama" | "ollama_cloud"; model: string; note: string }> = [
|
||||
{ provider: "ollama", model: "qwen3.5:latest", note: "local 7B" },
|
||||
{ provider: "ollama", model: "qwen3:latest", note: "local 7B (different) " },
|
||||
{ provider: "ollama", model: "gpt-oss:20b", note: "local 20B" }, // FIXED: local, not cloud
|
||||
{ provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B" },
|
||||
{ provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B (coding specialist)" },
|
||||
// NOTE 2026-04-22 — J wanted Kimi as the last escalation but Kimi
|
||||
// K2.5/K2.6 both return "this model requires a subscription" on our
|
||||
// current Ollama Cloud key. mistral-large-3:675b is the biggest
|
||||
// model actually provisioned on this key (verified via direct curl
|
||||
// to ollama.com/api/generate). Upgrade path: Ollama Cloud Pro →
|
||||
// swap this line to kimi-k2.5 or kimi-k2.6:cloud.
|
||||
{ provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B (biggest available on current key; kimi-k2.x needs pro subscription)" },
|
||||
];
|
||||
|
||||
// Validation rubric — the answer must pass all of these to be accepted.
interface RubricResult {
  passed: boolean;         // true iff violations is empty
  violations: string[];    // names of rules the candidate code failed
  passed_rules: string[];  // names of rules the candidate code satisfied
}
|
||||
|
||||
function validate(code: string): RubricResult {
|
||||
const violations: string[] = [];
|
||||
const passed: string[] = [];
|
||||
|
||||
const check = (rule: string, ok: boolean) => { ok ? passed.push(rule) : violations.push(rule); };
|
||||
|
||||
check("has pub async fn check_drift_batched signature",
|
||||
/pub\s+async\s+fn\s+check_drift_batched\s*\(/.test(code));
|
||||
check("takes Vec<DocRef> argument",
|
||||
/refs\s*:\s*Vec\s*<\s*DocRef\s*>/.test(code));
|
||||
check("returns Result<Vec<String>, String>",
|
||||
/Result\s*<\s*Vec\s*<\s*String\s*>\s*,\s*String\s*>/.test(code));
|
||||
check("uses reqwest",
|
||||
/\breqwest\b/i.test(code));
|
||||
check("references JoinSet or Semaphore for concurrency",
|
||||
/\bJoinSet\b|\bSemaphore\b/i.test(code));
|
||||
check("bounds concurrency at 4",
|
||||
/\b4\b/.test(code) && (/Semaphore\s*::\s*new\s*\(\s*4\b/.test(code) || /permits\s*:\s*4\b/.test(code) || /limit\s*:\s*4\b/.test(code) || /max\s*:\s*4\b/.test(code) || /capacity\s*:\s*4\b/.test(code)));
|
||||
// Exponential backoff — models express this several ways. Accept
|
||||
// any recognizable doubling pattern starting at 250ms. 2026-04-22:
|
||||
// devstral-2:123b wrote `retry_delay *= 2` which my earlier regex
|
||||
// rejected even though the code is correct. Broadening rubric to
|
||||
// match all idiomatic doubling forms.
|
||||
const hasSeed250 = /Duration\s*::\s*from_millis\s*\(\s*250\b/.test(code)
|
||||
|| /millis\s*\(\s*250\b/.test(code);
|
||||
const hasDoublingPattern = /250\s*\*\s*2/.test(code) // 250 * 2^n literal
|
||||
|| /<<\s*\d+/.test(code) // bit-shift
|
||||
|| /\.pow\s*\(/.test(code) // 2u32.pow(attempt)
|
||||
|| /\*=\s*2\b/.test(code) // delay *= 2 ← was missing
|
||||
|| /\*\s*2\s*;/.test(code) // delay = delay * 2;
|
||||
|| /saturating_mul\s*\(\s*2\b/.test(code); // saturating doubling
|
||||
check("has 250ms backoff seed",
|
||||
hasSeed250);
|
||||
check("reaches 500ms backoff (literal or doubling from 250)",
|
||||
/Duration\s*::\s*from_millis\s*\(\s*500\b/.test(code)
|
||||
|| /millis\s*\(\s*500\b/.test(code)
|
||||
|| (hasSeed250 && hasDoublingPattern));
|
||||
check("reaches 1000ms backoff (literal or doubling to 1000)",
|
||||
/Duration\s*::\s*from_millis\s*\(\s*1000\b/.test(code)
|
||||
|| /millis\s*\(\s*1000\b/.test(code)
|
||||
|| (hasSeed250 && hasDoublingPattern));
|
||||
check("case-insensitive tool grouping (to_ascii_lowercase)",
|
||||
/to_ascii_lowercase|to_lowercase/.test(code));
|
||||
check("NO .unwrap() — all errors bubble via ?",
|
||||
!/\.unwrap\s*\(\s*\)/.test(code));
|
||||
check("NO .expect(...) — all errors bubble via ?",
|
||||
!/\.expect\s*\(/.test(code));
|
||||
check("NO panic!() / unimplemented!() / todo!()",
|
||||
!/\bpanic!\s*\(|\bunimplemented!\s*\(|\btodo!\s*\(/.test(code));
|
||||
check("has rustdoc /// comments",
|
||||
/\/\/\//.test(code));
|
||||
check("reasonable length (> 500 chars)",
|
||||
code.length > 500);
|
||||
|
||||
return { passed: violations.length === 0, violations, passed_rules: passed };
|
||||
}
|
||||
|
||||
// Prefixed console logger for this test's output.
function log(msg: string) { console.log(`[hard] ${msg}`); }
|
||||
|
||||
// One model call through the gateway. Never throws: transport/HTTP
// failures come back as { content: "", error } so the caller can treat
// them as a failed attempt rather than a crash.
async function chat(opts: {
  provider: "ollama" | "ollama_cloud",
  model: string,
  prompt: string,
}): Promise<{ content: string; error?: string }> {
  try {
    const r = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST", headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: opts.provider,
        model: opts.model,
        messages: [{ role: "user", content: opts.prompt }],
        max_tokens: 2500,
        temperature: 0.2, // low temp: deterministic-ish code generation
        think: false,
      }),
      signal: AbortSignal.timeout(240000), // 4 min hard cap per attempt
    });
    // Non-2xx: surface status + a bounded slice of the body as the error.
    if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 300)}` };
    const j: any = await r.json();
    return { content: j.choices?.[0]?.message?.content ?? "" };
  } catch (e) {
    // Timeouts (AbortSignal) and network errors land here.
    return { content: "", error: (e as Error).message };
  }
}
|
||||
|
||||
// One row per escalation attempt — persisted as attempt_{n}.json.
interface AttemptRecord {
  n: number;                    // 1-based attempt number
  provider: string;             // "ollama" | "ollama_cloud"
  model: string;
  duration_ms: number;
  content_chars: number;        // length of the extracted code, not the raw reply
  error: string | null;         // transport/gateway error, if any
  rubric_violations: string[];  // failed rule names (injected into the next prompt)
  rubric_passed: string[];
  accepted: boolean;            // true iff every rubric rule passed
}
|
||||
|
||||
function extractCode(raw: string): string {
|
||||
// Strip common fence wrappers
|
||||
const m = raw.match(/```(?:rust)?\s*\n([\s\S]*?)```/);
|
||||
if (m) return m[1].trim();
|
||||
return raw.trim();
|
||||
}
|
||||
|
||||
// Drives the escalation ladder: one attempt per rung, injecting every
// prior attempt's rubric violations into each subsequent prompt.
// Exits 0 when an answer is accepted, 1 otherwise; all artifacts land
// in OUT_DIR.
async function main() {
  await mkdir(OUT_DIR, { recursive: true });
  log(`output: ${OUT_DIR}`);
  log(`task: ${TASK.slice(0, 120)}...`);
  log("");

  const attempts: AttemptRecord[] = [];
  let acceptedCode: string | null = null;

  for (let i = 0; i < MAX_ATTEMPTS; i++) {
    const n = i + 1;
    // Clamp to the last rung if MAX_ATTEMPTS ever exceeds the ladder length.
    const rung = LADDER[i] ?? LADDER[LADDER.length - 1];

    // Build the prompt: base task + prior failures' learning blocks
    let priorLearning = "";
    if (attempts.length > 0) {
      priorLearning = `\n\n═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══\n`;
      for (const a of attempts) {
        priorLearning += `Attempt ${a.n} (${a.provider}/${a.model}, ${a.content_chars} chars) violations:\n`;
        for (const v of a.rubric_violations) priorLearning += `  - ${v}\n`;
        if (a.error) priorLearning += `  [error: ${a.error.slice(0, 120)}]\n`;
      }
      priorLearning += `═══ end prior attempts ═══\n\nDO NOT repeat the above violations. Address each one explicitly.`;
    }

    log(`attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${priorLearning ? " [w/ learning]" : ""}`);
    const t0 = Date.now();
    const r = await chat({ provider: rung.provider, model: rung.model, prompt: TASK + priorLearning });
    const dur = Date.now() - t0;

    const code = extractCode(r.content);
    // An empty reply never reaches the rubric — it is its own violation.
    const rubric = code ? validate(code) : { passed: false, violations: ["empty response"], passed_rules: [] };

    const record: AttemptRecord = {
      n,
      provider: rung.provider,
      model: rung.model,
      duration_ms: dur,
      content_chars: code.length,
      error: r.error ?? null,
      rubric_violations: rubric.violations,
      rubric_passed: rubric.passed_rules,
      accepted: rubric.passed,
    };
    attempts.push(record);

    log(`  → ${dur}ms, ${code.length} chars, ${rubric.passed_rules.length} rules passed / ${rubric.violations.length} failed${r.error ? `, err: ${r.error.slice(0, 80)}` : ""}`);
    // Show at most 5 violations to keep the console readable.
    for (const v of rubric.violations.slice(0, 5)) log(`    ✗ ${v}`);

    // Persist the raw code and the structured record for every attempt.
    await writeFile(`${OUT_DIR}/attempt_${n}.txt`, code);
    await writeFile(`${OUT_DIR}/attempt_${n}.json`, JSON.stringify(record, null, 2));

    if (rubric.passed) {
      log(`  ✅ ACCEPTED on attempt ${n}`);
      acceptedCode = code;
      break;
    }
  }

  const summary = {
    task: TASK.slice(0, 200),
    total_attempts: attempts.length,
    accepted: acceptedCode !== null,
    accepted_on_attempt: acceptedCode ? attempts.findIndex(a => a.accepted) + 1 : null,
    escalation_path: attempts.map(a => `${a.provider}/${a.model}`),
    per_attempt_pass_counts: attempts.map(a => a.rubric_passed.length),
    per_attempt_violation_counts: attempts.map(a => a.rubric_violations.length),
    total_duration_ms: attempts.reduce((s, a) => s + a.duration_ms, 0),
  };
  await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));

  log("");
  log(`═══ RESULT ═══`);
  log(`attempts: ${summary.total_attempts}`);
  log(`accepted: ${summary.accepted} ${summary.accepted ? `on attempt ${summary.accepted_on_attempt}` : ""}`);
  log(`escalation path:`);
  for (const [i, a] of attempts.entries()) {
    const mark = a.accepted ? "✅" : "❌";
    log(`  ${mark} attempt ${i + 1}: ${a.provider}/${a.model} — ${a.rubric_passed.length}/${a.rubric_passed.length + a.rubric_violations.length} rules passed, ${a.duration_ms}ms`);
  }
  log("");
  log(`total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
  log(`artifacts: ${OUT_DIR}/{attempt_1..N.{txt,json}, summary.json}`);

  process.exit(summary.accepted ? 0 : 1);
}
|
||||
|
||||
// Top-level guard: exit 2 marks an infrastructure crash, distinct from
// exit 1 (ran fine, rubric never accepted an answer).
main().catch(e => { console.error("[hard] fatal:", e); process.exit(2); });
|
||||
181
tests/real-world/nine_consecutive_audits.ts
Normal file
181
tests/real-world/nine_consecutive_audits.ts
Normal file
@ -0,0 +1,181 @@
|
||||
// Nine-consecutive audit runner — empirical test of the predictive-
|
||||
// compounding property. Pushes 9 empty commits to the current branch,
|
||||
// waits for each audit to complete on the new SHA, captures the
|
||||
// verdict + audit_lessons state after each run, and reports whether
|
||||
// the KB stabilizes or drifts.
|
||||
//
|
||||
// What we expect (favorable compounding):
|
||||
// - signature_count grows sublinearly (same patterns recur, so
|
||||
// distinct-signature count stabilizes fast)
|
||||
// - verdict settles on a stable value after run 2-3 (first audit
|
||||
// establishes baseline, rest repeat)
|
||||
// - confidence stays LOW for all signatures (same PR repeatedly)
|
||||
// - NO new recurring findings fire because confidence < 0.3 on
|
||||
// same-PR noise (kb_index rating policy)
|
||||
//
|
||||
// What would indicate drift (the thing we want to prove DOESN'T happen):
|
||||
// - signature_count grows linearly — each run produces new signatures
|
||||
// - verdict oscillates (block → approve → block ...)
|
||||
// - confidence inflates — kb_index rating escalates on repeated runs
|
||||
//
|
||||
// Run: bun run tests/real-world/nine_consecutive_audits.ts
|
||||
|
||||
import { readFile } from "node:fs/promises";
|
||||
import { aggregate } from "../../auditor/kb_index.ts";
|
||||
|
||||
const REPO = "/home/profit/lakehouse";
const AUDIT_LESSONS = `${REPO}/data/_kb/audit_lessons.jsonl`; // auditor KB, one lesson per finding
const VERDICTS_DIR = `${REPO}/data/_auditor/verdicts`;        // per-run verdicts: {pr}-{sha12}.json
const POLL_INTERVAL_MS = 5_000;    // verdict-file poll cadence
const AUDIT_TIMEOUT_MS = 180_000;  // give up waiting for a verdict after 3 min
const RUNS = Number(process.env.LH_AUDIT_RUNS ?? 9);    // empty commits to push
const TARGET_PR = Number(process.env.LH_AUDIT_PR ?? 8); // PR whose audits we observe
|
||||
|
||||
// Run a shell command in REPO via `bash -lc`, capturing both streams.
// Returns stdout/stderr text plus the exit code; never throws on a
// non-zero exit — callers inspect `code` themselves.
async function sh(cmd: string): Promise<{ stdout: string; stderr: string; code: number }> {
  const p = Bun.spawn(["bash", "-lc", cmd], { cwd: REPO, stdout: "pipe", stderr: "pipe" });
  // Drain both pipes concurrently before awaiting exit, so a full pipe
  // buffer can't deadlock the child.
  const [stdout, stderr] = await Promise.all([new Response(p.stdout).text(), new Response(p.stderr).text()]);
  const code = await p.exited;
  return { stdout, stderr, code };
}
|
||||
|
||||
async function getHeadSha(): Promise<string> {
|
||||
const r = await sh("git rev-parse HEAD");
|
||||
return r.stdout.trim();
|
||||
}
|
||||
|
||||
async function pushEmptyCommit(n: number): Promise<string> {
|
||||
const msg = `test: nine-consecutive audit run ${n}/${RUNS} (compounding probe)`;
|
||||
await sh(`GIT_AUTHOR_NAME=profit GIT_AUTHOR_EMAIL=profit@lakehouse GIT_COMMITTER_NAME=profit GIT_COMMITTER_EMAIL=profit@lakehouse git commit --allow-empty -m "${msg}"`);
|
||||
const sha = await getHeadSha();
|
||||
const pushCmd = `PAT="dead60d1160a02f81d241197d5d18f4608794fb2"; git -c credential.helper='!f() { echo "username=profit"; echo "password='$PAT'"; }; f' push origin HEAD 2>&1`;
|
||||
const pr = await sh(pushCmd);
|
||||
if (pr.code !== 0) throw new Error(`push failed: ${pr.stderr || pr.stdout}`);
|
||||
return sha;
|
||||
}
|
||||
|
||||
// Poll for the auditor's verdict file for `sha` (the filename uses the
// 12-char short form). Resolves with the parsed verdict; throws once
// deadlineMs elapses without a readable file.
async function waitForVerdict(sha: string, deadlineMs: number): Promise<any> {
  const short = sha.slice(0, 12);
  const path = `${VERDICTS_DIR}/${TARGET_PR}-${short}.json`;
  const start = Date.now();
  while (Date.now() - start < deadlineMs) {
    try {
      const raw = await readFile(path, "utf8");
      return JSON.parse(raw);
    } catch { /* not yet */ }
    // Missing file AND half-written JSON both land in the catch above,
    // so a partially flushed verdict is simply retried next tick.
    await new Promise(r => setTimeout(r, POLL_INTERVAL_MS));
  }
  throw new Error(`no verdict file after ${deadlineMs}ms: ${path}`);
}
|
||||
|
||||
// Snapshot the aggregated state of audit_lessons.jsonl via the kb_index
// aggregator: distinct-signature count, the hottest signature's
// recurrence count, the max confidence across all signatures, and the
// top-3 by count (for drift diagnosis in the final report).
async function captureAggState(): Promise<{ sig_count: number; max_count: number; max_confidence: number; top3: Array<{ sig: string; count: number; conf: number; summary: string }> }> {
  const agg = await aggregate<any>(AUDIT_LESSONS, {
    keyFn: (r) => r?.signature,
    // Scope rows by PR so repeats within one PR don't look like
    // cross-PR recurrence (see kb_index confidence policy).
    scopeFn: (r) => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
  });
  const list = Array.from(agg.values()).sort((a, b) => b.count - a.count);
  return {
    sig_count: list.length,
    max_count: list[0]?.count ?? 0,
    max_confidence: list.reduce((m, a) => Math.max(m, a.confidence), 0),
    top3: list.slice(0, 3).map(a => ({
      sig: a.signature,
      count: a.count,
      conf: a.confidence,
      summary: a.representative_summary.slice(0, 80),
    })),
  };
}
|
||||
|
||||
// One row per audit run — serialized into the final JSON report.
interface RunRecord {
  run: number;                     // 1-based run index
  sha: string;                     // 12-char short SHA of the pushed commit
  verdict_overall: string;
  findings_total: number;
  findings_block: number;
  findings_warn: number;
  findings_info: number;
  audit_duration_ms: number;
  claims_total: number;
  claims_empirical: number;
  kb_sig_count_after: number;      // distinct KB signatures after this run
  kb_max_count_after: number;      // hottest signature's recurrence count
  kb_max_confidence_after: number; // max confidence (test expects it to stay < 0.3)
}
|
||||
|
||||
// Push RUNS empty commits, wait for each audit verdict, and snapshot the
// lessons KB after every run. The final section checks the two
// compounding invariants: verdict stability and confidence staying
// below the 0.3 escalation threshold on same-PR noise.
async function main() {
  console.log(`[nine] target PR: #${TARGET_PR}`);
  console.log(`[nine] runs: ${RUNS}`);
  console.log(`[nine] audit_lessons.jsonl: ${AUDIT_LESSONS}`);
  console.log("");

  // KB state before any pushes — the delta against this is the signal.
  const baseline = await captureAggState();
  console.log(`[nine] baseline: sig_count=${baseline.sig_count} max_count=${baseline.max_count} max_conf=${baseline.max_confidence.toFixed(2)}`);
  console.log("");

  const records: RunRecord[] = [];
  for (let n = 1; n <= RUNS; n++) {
    const t0 = Date.now();
    console.log(`─── run ${n}/${RUNS} ───`);
    const sha = await pushEmptyCommit(n);
    console.log(`  pushed ${sha.slice(0, 12)}`);
    const verdict = await waitForVerdict(sha, AUDIT_TIMEOUT_MS);
    const after = await captureAggState();
    const rec: RunRecord = {
      run: n,
      sha: sha.slice(0, 12),
      verdict_overall: String(verdict.overall),
      findings_total: Number(verdict.metrics?.findings_total ?? 0),
      findings_block: Number(verdict.metrics?.findings_block ?? 0),
      findings_warn: Number(verdict.metrics?.findings_warn ?? 0),
      findings_info: Number(verdict.metrics?.findings_info ?? 0),
      audit_duration_ms: Number(verdict.metrics?.audit_duration_ms ?? 0),
      claims_total: Number(verdict.metrics?.claims_total ?? 0),
      claims_empirical: Number(verdict.metrics?.claims_empirical ?? 0),
      kb_sig_count_after: after.sig_count,
      kb_max_count_after: after.max_count,
      kb_max_confidence_after: after.max_confidence,
    };
    records.push(rec);
    console.log(`  verdict=${rec.verdict_overall} findings=${rec.findings_total} (b=${rec.findings_block} w=${rec.findings_warn})`);
    console.log(`  kb after: sig=${rec.kb_sig_count_after} max_count=${rec.kb_max_count_after} max_conf=${rec.kb_max_confidence_after.toFixed(2)}`);
    console.log(`  elapsed: ${((Date.now() - t0) / 1000).toFixed(1)}s`);
    console.log("");
  }

  console.log("═══ FINAL ═══");
  console.log("run | verdict | find | block warn info | dur_s | kb_sig max_count max_conf");
  for (const r of records) {
    console.log(
      ` ${String(r.run).padStart(1)} | ${r.verdict_overall.padEnd(16)} | ${String(r.findings_total).padStart(4)} | ${String(r.findings_block).padStart(5)} ${String(r.findings_warn).padStart(5)} ${String(r.findings_info).padStart(5)} | ${(r.audit_duration_ms / 1000).toFixed(1).padStart(5)} | ${String(r.kb_sig_count_after).padStart(6)} ${String(r.kb_max_count_after).padStart(9)} ${r.kb_max_confidence_after.toFixed(2)}`,
    );
  }

  console.log("");
  console.log("═══ COMPOUNDING PROPERTY ═══");
  const sigDelta = records[records.length - 1].kb_sig_count_after - baseline.sig_count;
  const maxCount = records[records.length - 1].kb_max_count_after;
  const maxConf = records[records.length - 1].kb_max_confidence_after;
  console.log(`  signatures added over ${RUNS} runs: ${sigDelta}`);
  console.log(`  max count after run ${RUNS}: ${maxCount} (same-PR recurrences per signature)`);
  console.log(`  max confidence after run ${RUNS}: ${maxConf.toFixed(2)} (expect LOW — same-PR should not inflate)`);

  // Invariant 1: the verdict must not oscillate across identical runs.
  const verdictSet = new Set(records.map(r => r.verdict_overall));
  if (verdictSet.size === 1) {
    console.log(`  verdict stable: all ${RUNS} runs returned '${[...verdictSet][0]}' ✓`);
  } else {
    console.log(`  verdict oscillated across runs: ${[...verdictSet].join(" | ")} ✗`);
  }

  // Invariant 2: same-PR repetition must not push confidence past 0.3.
  if (maxConf < 0.3) {
    console.log(`  confidence policy holding: same-PR noise stays below escalation threshold ✓`);
  } else {
    console.log(`  ⚠ confidence escalated above 0.3 on same-PR noise — kb_index policy needs tightening`);
  }

  const jsonOut = `${REPO}/tests/real-world/runs/nine_consecutive_${Date.now().toString(36)}.json`;
  await Bun.write(jsonOut, JSON.stringify({ target_pr: TARGET_PR, baseline, records }, null, 2));
  console.log("");
  console.log(`  report: ${jsonOut}`);
}
|
||||
|
||||
// Top-level guard. NOTE(review): exits 1 on a crash, while the sibling
// tests reserve 1 for "ran but failed" and 2 for crashes — confirm
// before relying on exit codes across runners.
main().catch(e => { console.error("[nine] fatal:", e); process.exit(1); });
|
||||
443
tests/real-world/scrum_master_pipeline.ts
Normal file
443
tests/real-world/scrum_master_pipeline.ts
Normal file
@ -0,0 +1,443 @@
|
||||
// Scrum-master orchestrator — pulls git repo source + PRD + a change
|
||||
// proposal, chunks everything, hands each code piece to the proven
|
||||
// escalation ladder (small-local → big-local → cloud → specialist →
|
||||
// biggest) with learning context between attempts. Collects per-file
|
||||
// suggestions in a coherent handoff report.
|
||||
//
|
||||
// What it composes (everything below is already shipped + proven):
|
||||
// - Chunker + embeddings (sidecar /embed, nomic-embed-text)
|
||||
// - In-memory cosine retrieval (top-K PRD + plan chunks per file)
|
||||
// - Escalation ladder (6 tiers, cycling on empty/error/thin-answer)
|
||||
// - Per-attempt learning-context injection (prior failures → prompt)
|
||||
// - Tree-split fallback when combined context exceeds budget
|
||||
// - JSONL output per file + summary
|
||||
//
|
||||
// Deliberate scope limit: TARGET_FILES is 3 files by default. The
|
||||
// pipeline works at larger N, but at ~90s/file × 3 files = 4-5 min,
|
||||
// 15 files = 22 min. Bump via env LH_SCRUM_FILES="path1,path2,...".
|
||||
//
|
||||
// Run: bun run tests/real-world/scrum_master_pipeline.ts
|
||||
|
||||
import { readFile, writeFile, mkdir } from "node:fs/promises";
|
||||
import { createHash } from "node:crypto";
|
||||
|
||||
const GATEWAY = "http://localhost:3100";
|
||||
const SIDECAR = "http://localhost:3200";
|
||||
const CHUNK_SIZE = 800;
|
||||
const CHUNK_OVERLAP = 120;
|
||||
const TOP_K_CONTEXT = 5;
|
||||
const MAX_ATTEMPTS = 6;
|
||||
// Files larger than this get tree-split instead of truncated. Fixes the
|
||||
// 6KB false-positive class (model claiming a field is "missing" when
|
||||
// it exists past the context cutoff).
|
||||
const FILE_TREE_SPLIT_THRESHOLD = 6000;
|
||||
const FILE_SHARD_SIZE = 3500;
|
||||
// Appended jsonl so auditor's kb_query can surface scrum findings for
|
||||
// files touched by a PR under review. Part of cohesion plan Phase C.
|
||||
const SCRUM_REVIEWS_JSONL = "/home/profit/lakehouse/data/_kb/scrum_reviews.jsonl";
|
||||
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/scrum_${Date.now().toString(36)}`;
|
||||
|
||||
const PRD_PATH = "/home/profit/lakehouse/docs/PRD.md";
|
||||
// Using CONTROL_PLANE_PRD as the "suggested changes" doc since it
|
||||
// describes the Phase 38-44 target architecture and is on main.
|
||||
// COHESION_INTEGRATION_PLAN.md is still on PR #7 branch.
|
||||
const PROPOSAL_PATH = "/home/profit/lakehouse/docs/CONTROL_PLANE_PRD.md";
|
||||
|
||||
// Scoped target: 3 representative source files by default.
|
||||
// The scrum-master walks these in order and produces one suggestion
|
||||
// set per file. Override via env for a wider sweep.
|
||||
const DEFAULT_TARGETS = [
|
||||
"/home/profit/lakehouse/crates/vectord/src/playbook_memory.rs",
|
||||
"/home/profit/lakehouse/crates/vectord/src/doc_drift.rs",
|
||||
"/home/profit/lakehouse/auditor/audit.ts",
|
||||
];
|
||||
const TARGET_FILES: string[] = process.env.LH_SCRUM_FILES
|
||||
? process.env.LH_SCRUM_FILES.split(",").map(s => s.trim())
|
||||
: DEFAULT_TARGETS;
|
||||
|
||||
const LADDER: Array<{ provider: "ollama" | "ollama_cloud"; model: string; note: string }> = [
|
||||
{ provider: "ollama", model: "qwen3.5:latest", note: "local 7B" },
|
||||
{ provider: "ollama", model: "qwen3:latest", note: "local 7B (peer)" },
|
||||
{ provider: "ollama", model: "gpt-oss:20b", note: "local 20B" },
|
||||
{ provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B" },
|
||||
{ provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B coding specialist" },
|
||||
{ provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B last-ditch" },
|
||||
];
|
||||
|
||||
// One embedded slice of context (PRD or proposal text); `origin` names
// the source document and `offset` its char position there.
type Chunk = { id: string; text: string; embedding: number[]; origin: string; offset: number };

// Per-file review result — one per target file.
interface FileReview {
  file: string;
  file_bytes: number;
  tree_split_fired: boolean;   // file exceeded FILE_TREE_SPLIT_THRESHOLD
  shards_summarized: number;
  top_prd_chunks: Array<{ origin: string; offset: number; score: number }>;
  top_proposal_chunks: Array<{ origin: string; offset: number; score: number }>;
  attempts_made: number;
  attempts_history: Array<{ n: number; model: string; status: "accepted" | "thin" | "error"; chars: number; error?: string }>;
  accepted_on: number | null;  // presumably the accepted attempt number — confirm in reviewFile
  escalated_to_model: string;
  suggestions: string;         // presumably the accepted review text — confirm in reviewFile
  duration_ms: number;
}
|
||||
|
||||
// Prefixed console logger for this pipeline's output.
function log(msg: string) { console.log(`[scrum] ${msg}`); }
|
||||
|
||||
function cosine(a: number[], b: number[]): number {
|
||||
let dot = 0, na = 0, nb = 0;
|
||||
for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
|
||||
return na && nb ? dot / (Math.sqrt(na) * Math.sqrt(nb)) : 0;
|
||||
}
|
||||
|
||||
function chunkText(text: string): Array<{ text: string; offset: number }> {
|
||||
const out: Array<{ text: string; offset: number }> = [];
|
||||
for (let i = 0; i < text.length; ) {
|
||||
const end = Math.min(i + CHUNK_SIZE, text.length);
|
||||
const slice = text.slice(i, end).trim();
|
||||
if (slice.length > 60) out.push({ text: slice, offset: i });
|
||||
if (end >= text.length) break;
|
||||
i = end - CHUNK_OVERLAP;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
// Embed a batch of texts via the sidecar's /embed endpoint. Unlike
// chat(), this THROWS on a non-2xx response — retrieval without
// embeddings is not worth continuing.
async function embedBatch(texts: string[]): Promise<number[][]> {
  const r = await fetch(`${SIDECAR}/embed`, {
    method: "POST", headers: { "content-type": "application/json" },
    body: JSON.stringify({ texts }),
    signal: AbortSignal.timeout(120000), // 2 min cap for the whole batch
  });
  if (!r.ok) throw new Error(`embed ${r.status}`);
  return (await r.json() as any).embeddings;
}
|
||||
|
||||
// One model call through the gateway. Mirrors the hard-task runner's
// chat() but also reports token usage; all failures come back as a
// value ({ content: "", error }), never a throw.
async function chat(opts: {
  provider: "ollama" | "ollama_cloud",
  model: string,
  prompt: string,
  max_tokens?: number,
}): Promise<{ content: string; error?: string; prompt_tokens: number; completion_tokens: number }> {
  try {
    const r = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST", headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: opts.provider,
        model: opts.model,
        messages: [{ role: "user", content: opts.prompt }],
        max_tokens: opts.max_tokens ?? 1500,
        temperature: 0.2,
        think: false,
      }),
      signal: AbortSignal.timeout(180000), // 3 min cap per call
    });
    if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 200)}`, prompt_tokens: 0, completion_tokens: 0 };
    const j: any = await r.json();
    return {
      content: j.choices?.[0]?.message?.content ?? "",
      prompt_tokens: j.usage?.prompt_tokens ?? 0,
      completion_tokens: j.usage?.completion_tokens ?? 0,
    };
  } catch (e) {
    // Timeouts (AbortSignal) and network errors land here.
    return { content: "", error: (e as Error).message, prompt_tokens: 0, completion_tokens: 0 };
  }
}
|
||||
|
||||
// Accept a file-review answer if it's substantive + structured.
|
||||
// We're not validating Rust here — we're validating that the model
|
||||
// produced a coherent suggestion set.
|
||||
function isAcceptable(answer: string): boolean {
|
||||
if (answer.length < 200) return false; // too thin
|
||||
// Must at least try a structured form — numbered list, bullets,
|
||||
// or sections. Models that just hand-wave fail.
|
||||
const hasStructure = /^\s*[-*]\s/m.test(answer)
|
||||
|| /^\s*\d+\.\s/m.test(answer)
|
||||
|| /^\s*#/m.test(answer);
|
||||
return hasStructure;
|
||||
}
|
||||
|
||||
function retrieveTopK(query_emb: number[], pool: Chunk[], k: number): Chunk[] {
|
||||
return pool
|
||||
.map(c => ({ c, score: cosine(query_emb, c.embedding) }))
|
||||
.sort((a, b) => b.score - a.score)
|
||||
.slice(0, k)
|
||||
.map(x => ({ ...x.c, _score: x.score } as any));
|
||||
}
|
||||
|
||||
// Tree-split a large file: shard it, summarize each shard against
// the review question, merge into a scratchpad. Uses cloud because
// the summarization step needs quality > speed. Returns the
// scratchpad (full-file distillation) and the cloud-call count.
// NOTE: shards are summarized sequentially on purpose — each shard's
// prompt includes the scratchpad built from the prior shards.
async function treeSplitFile(
  filePath: string,
  content: string,
): Promise<{ scratchpad: string; shards: number; cloud_calls: number }> {
  const shards: Array<{ from: number; to: number; text: string }> = [];
  for (let i = 0; i < content.length; i += FILE_SHARD_SIZE) {
    const end = Math.min(i + FILE_SHARD_SIZE, content.length);
    shards.push({ from: i, to: end, text: content.slice(i, end) });
  }
  let scratchpad = "";
  let cloud_calls = 0;
  log(`  tree-split: ${content.length} chars → ${shards.length} shards of ${FILE_SHARD_SIZE}`);
  for (const [si, shard] of shards.entries()) {
    const prompt = `You are summarizing ONE SHARD of a source file as part of a multi-shard review. File: ${filePath}. Shard ${si + 1}/${shards.length} (bytes ${shard.from}..${shard.to}).

─────── shard source ───────
${shard.text}
─────── end shard ───────

Scratchpad of prior shards (if empty, this is shard 1):
${scratchpad || "(empty)"}

Extract ONLY facts useful for reviewing this file against its PRD: function names + purposes, struct fields + types, invariants, edge cases, TODO markers, error-handling style. Under 150 words. No prose outside the extracted facts.`;
    const r = await chat({
      provider: "ollama_cloud",
      model: "gpt-oss:120b",
      prompt,
      max_tokens: 400,
    });
    cloud_calls += 1;
    // A failed shard call is tolerated: chat() returns content "" on
    // error, so the shard simply contributes nothing to the scratchpad.
    if (r.content) {
      scratchpad += `\n--- shard ${si + 1} (bytes ${shard.from}..${shard.to}) ---\n${r.content.trim()}`;
    }
  }
  return { scratchpad, shards: shards.length, cloud_calls };
}
|
||||
|
||||
async function reviewFile(
|
||||
filePath: string,
|
||||
prd_chunks: Chunk[],
|
||||
proposal_chunks: Chunk[],
|
||||
): Promise<FileReview> {
|
||||
const t0 = Date.now();
|
||||
log(`file: ${filePath}`);
|
||||
const content = await readFile(filePath, "utf8");
|
||||
const rel = filePath.replace("/home/profit/lakehouse/", "");
|
||||
|
||||
// Build a query embedding from the first ~800 chars of the file
|
||||
// (good enough for topical retrieval).
|
||||
const seed = content.slice(0, 800);
|
||||
const [seedEmb] = await embedBatch([seed]);
|
||||
|
||||
const topPrd = retrieveTopK(seedEmb, prd_chunks, TOP_K_CONTEXT);
|
||||
const topPlan = retrieveTopK(seedEmb, proposal_chunks, TOP_K_CONTEXT);
|
||||
log(` retrieved ${topPrd.length} PRD chunks + ${topPlan.length} proposal chunks`);
|
||||
|
||||
const contextBlock = [
|
||||
"═══ RELEVANT PRD EXCERPTS ═══",
|
||||
...topPrd.map(c => `[PRD @${c.offset}]\n${c.text.slice(0, 600)}`),
|
||||
"",
|
||||
"═══ RELEVANT CHANGE PROPOSAL EXCERPTS ═══",
|
||||
...topPlan.map(c => `[PLAN @${c.offset}]\n${c.text.slice(0, 600)}`),
|
||||
].join("\n\n");
|
||||
|
||||
// Files bigger than FILE_TREE_SPLIT_THRESHOLD get tree-split.
|
||||
// Summarize each shard to a scratchpad, then review against the
|
||||
// scratchpad instead of the truncated first chunk. Prevents the
|
||||
// false-positive pattern where the model claims a field is
|
||||
// "missing" because it's past the context cutoff.
|
||||
let sourceForPrompt: string;
|
||||
let treeSplitFired = false;
|
||||
let shardsSummarized = 0;
|
||||
let extraCloudCalls = 0;
|
||||
if (content.length > FILE_TREE_SPLIT_THRESHOLD) {
|
||||
treeSplitFired = true;
|
||||
const ts = await treeSplitFile(rel, content);
|
||||
shardsSummarized = ts.shards;
|
||||
extraCloudCalls = ts.cloud_calls;
|
||||
sourceForPrompt = `[FULL-FILE SCRATCHPAD — distilled from ${ts.shards} shards via tree-split]\n${ts.scratchpad}`;
|
||||
} else {
|
||||
sourceForPrompt = content;
|
||||
}
|
||||
|
||||
// Prompt — when tree-split fired, include an explicit instruction
|
||||
// not to claim a field/function is "missing" because the scratchpad
|
||||
// is a distillation not the full file. Attacks the rubric-tuning
|
||||
// concern J called out.
|
||||
const truncationWarning = treeSplitFired
|
||||
? `\nIMPORTANT: the "source" below is a multi-shard distillation (tree-split across ${shardsSummarized} shards), NOT the full raw file. DO NOT claim any field, function, or feature is "missing" based on its absence from this distillation — the distillation may have elided it. Only call out gaps that appear DIRECTLY contradicted by the PRD excerpts.\n`
|
||||
: "";
|
||||
|
||||
const baseTask = `You are reviewing one source file against the Lakehouse PRD and an active cohesion-integration plan.
|
||||
|
||||
FILE: ${rel} (${content.length} bytes${treeSplitFired ? `, tree-split into ${shardsSummarized} shards` : ""})
|
||||
${truncationWarning}
|
||||
─────── source ───────
|
||||
${sourceForPrompt}
|
||||
─────── end source ───────
|
||||
|
||||
${contextBlock}
|
||||
|
||||
Produce a structured review with:
|
||||
1. Alignment score (1-10) between this file and the PRD intent
|
||||
2. 3-5 concrete suggested changes (bullet points), each naming a specific function/line and what to change
|
||||
3. Any gap where this file's behavior contradicts the PRD or the proposal
|
||||
|
||||
Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-offset when relevant.`;
|
||||
|
||||
const history: FileReview["attempts_history"] = [];
|
||||
let accepted: string | null = null;
|
||||
let acceptedModel = "";
|
||||
let acceptedOn = 0;
|
||||
|
||||
for (let i = 0; i < MAX_ATTEMPTS; i++) {
|
||||
const n = i + 1;
|
||||
const rung = LADDER[i];
|
||||
const learning = history.length > 0
|
||||
? `\n\n═══ PRIOR ATTEMPTS FAILED. Specific issues to fix: ═══\n${history.map(h => `Attempt ${h.n} (${h.model}, ${h.chars} chars): ${h.status} — ${h.error ?? "thin/unstructured answer"}`).join("\n")}\n═══`
|
||||
: "";
|
||||
|
||||
log(` attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${learning ? " [w/ learning]" : ""}`);
|
||||
const r = await chat({
|
||||
provider: rung.provider,
|
||||
model: rung.model,
|
||||
prompt: baseTask + learning,
|
||||
max_tokens: 1500,
|
||||
});
|
||||
|
||||
if (r.error) {
|
||||
history.push({ n, model: rung.model, status: "error", chars: 0, error: r.error.slice(0, 180) });
|
||||
log(` ✗ error: ${r.error.slice(0, 80)}`);
|
||||
continue;
|
||||
}
|
||||
if (!isAcceptable(r.content)) {
|
||||
history.push({ n, model: rung.model, status: "thin", chars: r.content.length, error: `thin/unstructured (${r.content.length} chars)` });
|
||||
log(` ✗ thin/unstructured (${r.content.length} chars)`);
|
||||
continue;
|
||||
}
|
||||
history.push({ n, model: rung.model, status: "accepted", chars: r.content.length });
|
||||
accepted = r.content;
|
||||
acceptedModel = `${rung.provider}/${rung.model}`;
|
||||
acceptedOn = n;
|
||||
log(` ✓ ACCEPTED on attempt ${n} (${rung.model}, ${r.content.length} chars)`);
|
||||
break;
|
||||
}
|
||||
|
||||
const review: FileReview = {
|
||||
file: rel,
|
||||
file_bytes: content.length,
|
||||
tree_split_fired: treeSplitFired,
|
||||
shards_summarized: shardsSummarized,
|
||||
top_prd_chunks: topPrd.map(c => ({ origin: c.origin, offset: c.offset, score: (c as any)._score })),
|
||||
top_proposal_chunks: topPlan.map(c => ({ origin: c.origin, offset: c.offset, score: (c as any)._score })),
|
||||
attempts_made: history.length,
|
||||
attempts_history: history,
|
||||
accepted_on: acceptedOn || null,
|
||||
escalated_to_model: acceptedModel,
|
||||
suggestions: accepted ?? "[no acceptable answer after escalation ladder exhausted]",
|
||||
duration_ms: Date.now() - t0,
|
||||
};
|
||||
|
||||
// Append to the shared scrum-reviews jsonl so the auditor's
|
||||
// kb_query check can surface relevant reviews for files in a
|
||||
// PR diff. Cohesion plan Phase C wire.
|
||||
if (accepted) {
|
||||
const { appendFile, mkdir } = await import("node:fs/promises");
|
||||
const { dirname } = await import("node:path");
|
||||
await mkdir(dirname(SCRUM_REVIEWS_JSONL), { recursive: true });
|
||||
const row = {
|
||||
file: rel,
|
||||
reviewed_at: new Date().toISOString(),
|
||||
accepted_model: acceptedModel,
|
||||
accepted_on_attempt: acceptedOn,
|
||||
attempts_made: history.length,
|
||||
tree_split_fired: treeSplitFired,
|
||||
suggestions_preview: accepted.slice(0, 2000),
|
||||
};
|
||||
try {
|
||||
await appendFile(SCRUM_REVIEWS_JSONL, JSON.stringify(row) + "\n");
|
||||
} catch (e) {
|
||||
console.error(`[scrum] failed to append scrum_reviews.jsonl: ${(e as Error).message}`);
|
||||
}
|
||||
}
|
||||
|
||||
return review;
|
||||
}
|
||||
|
||||
async function loadAndChunk(path: string, origin_tag: string): Promise<Chunk[]> {
|
||||
const text = await readFile(path, "utf8");
|
||||
const raw = chunkText(text);
|
||||
const embs = await embedBatch(raw.map(r => r.text));
|
||||
return raw.map((r, i) => ({
|
||||
id: createHash("sha256").update(r.text).digest("hex").slice(0, 10),
|
||||
text: r.text,
|
||||
embedding: embs[i],
|
||||
origin: origin_tag,
|
||||
offset: r.offset,
|
||||
}));
|
||||
}
|
||||
|
||||
async function main() {
|
||||
await mkdir(OUT_DIR, { recursive: true });
|
||||
log(`output: ${OUT_DIR}`);
|
||||
log(`targets: ${TARGET_FILES.length} files`);
|
||||
|
||||
log("loading + embedding PRD...");
|
||||
const prd_chunks = await loadAndChunk(PRD_PATH, "PRD");
|
||||
log(` PRD: ${prd_chunks.length} chunks`);
|
||||
|
||||
log("loading + embedding cohesion plan...");
|
||||
const plan_chunks = await loadAndChunk(PROPOSAL_PATH, "COHESION_PLAN");
|
||||
log(` plan: ${plan_chunks.length} chunks`);
|
||||
|
||||
log("");
|
||||
log("─── scrum master: walking target files ───");
|
||||
|
||||
const reviews: FileReview[] = [];
|
||||
for (const f of TARGET_FILES) {
|
||||
const review = await reviewFile(f, prd_chunks, plan_chunks);
|
||||
reviews.push(review);
|
||||
await writeFile(
|
||||
`${OUT_DIR}/review_${review.file.replace(/\//g, "_")}.json`,
|
||||
JSON.stringify(review, null, 2),
|
||||
);
|
||||
log(` → ${review.file}: ${review.accepted_on ? `accepted on ${review.accepted_on} by ${review.escalated_to_model}` : "UNRESOLVED"} (${review.duration_ms}ms)`);
|
||||
}
|
||||
|
||||
// Consolidated scrum-master report
|
||||
const report_md: string[] = [];
|
||||
report_md.push(`# Scrum-master review\n`);
|
||||
report_md.push(`Generated: ${new Date().toISOString()}`);
|
||||
report_md.push(`Files reviewed: ${reviews.length}`);
|
||||
report_md.push(`Total duration: ${(reviews.reduce((s, r) => s + r.duration_ms, 0) / 1000).toFixed(1)}s\n`);
|
||||
for (const r of reviews) {
|
||||
report_md.push(`\n## ${r.file}`);
|
||||
report_md.push(`- **Accepted on attempt:** ${r.accepted_on ?? "NOT resolved after 6 attempts"}`);
|
||||
report_md.push(`- **Escalated to:** \`${r.escalated_to_model || "—"}\``);
|
||||
report_md.push(`- **Total attempts:** ${r.attempts_made}`);
|
||||
if (r.attempts_history.length > 1) {
|
||||
report_md.push(`- **Attempt history:**`);
|
||||
for (const h of r.attempts_history) {
|
||||
report_md.push(` - ${h.n}: \`${h.model}\` → ${h.status}${h.error ? ` (${h.error.slice(0, 100)})` : ""}`);
|
||||
}
|
||||
}
|
||||
report_md.push(`\n### Suggestions\n\n${r.suggestions}\n`);
|
||||
}
|
||||
await writeFile(`${OUT_DIR}/scrum_report.md`, report_md.join("\n"));
|
||||
|
||||
const summary = {
|
||||
ran_at: new Date().toISOString(),
|
||||
target_count: TARGET_FILES.length,
|
||||
resolved: reviews.filter(r => r.accepted_on !== null).length,
|
||||
total_attempts: reviews.reduce((s, r) => s + r.attempts_made, 0),
|
||||
total_duration_ms: reviews.reduce((s, r) => s + r.duration_ms, 0),
|
||||
per_file: reviews.map(r => ({ file: r.file, accepted_on: r.accepted_on, model: r.escalated_to_model, attempts: r.attempts_made, ms: r.duration_ms })),
|
||||
};
|
||||
await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));
|
||||
|
||||
log("");
|
||||
log("═══ SCRUM REPORT ═══");
|
||||
log(` files: ${summary.target_count}, resolved: ${summary.resolved}, total attempts: ${summary.total_attempts}`);
|
||||
log(` total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
|
||||
log("");
|
||||
for (const p of summary.per_file) {
|
||||
const mark = p.accepted_on ? "✓" : "✗";
|
||||
log(` ${mark} ${p.file.padEnd(60)} attempt ${p.accepted_on ?? "—"}/${p.attempts} ${p.model} ${p.ms}ms`);
|
||||
}
|
||||
log("");
|
||||
log(`report: ${OUT_DIR}/scrum_report.md`);
|
||||
|
||||
process.exit(summary.resolved === summary.target_count ? 0 : 1);
|
||||
}
|
||||
|
||||
// Entry point: surface any uncaught failure and exit non-zero.
main().catch(e => {
  console.error("[scrum] fatal:", e);
  process.exit(2);
});
|
||||
Loading…
x
Reference in New Issue
Block a user