4 changed files with 73 additions and 553 deletions
--- a/auditor/checks/inference.ts
+++ b/auditor/checks/inference.ts
@@ -19,15 +19,6 @@ import { extractFacts } from "../fact_extractor.ts";

 const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
 const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
-// Tie-breaker for claims where the N=3 consensus produces a 1-1-1
-// split (genuinely borderline). Different architecture from the
-// primary reviewer (gpt-oss) so the tie-break isn't correlated with
-// the original disagreement. qwen3-coder:480b is a newer coding
-// specialist at 480B params, well-suited to PR-diff claim verification
-// and distinct in training lineage from gpt-oss.
-const TIEBREAKER_MODEL = process.env.LH_AUDITOR_TIEBREAKER_MODEL ?? "qwen3-coder:480b";
-const N_CONSENSUS = Number(process.env.LH_AUDITOR_CONSENSUS_N ?? 3);
-const AUDIT_DISCREPANCIES_JSONL = "/home/profit/lakehouse/data/_kb/audit_discrepancies.jsonl";
 // 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was
 // previously truncated at 15KB causing the reviewer to miss later
 // files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
@@ -177,131 +168,94 @@ export async function runInferenceCheck(
    `Strict JSON only, matching the shape described. No prose outside JSON.`,
  ].join("\n");

-  // N=3 consensus — run the primary reviewer in parallel, collect
-  // all three parsed responses, majority-vote per claim. Parallel
-  // (Promise.all) because each call is ~20-30s and they're independent;
-  // wall-clock stays ~same as single call, cost 3x tokens. Empirical
-  // justification: in 3-run determinism tests, 7/8 findings were
-  // stable but 1 flipped across runs — majority vote stabilizes the
-  // flipping class without losing the stable signal.
-  const primaryRuns = await Promise.all(
-    Array.from({ length: N_CONSENSUS }, () =>
-      runCloudInference(systemMsg, userMsg, MODEL)),
-  );
-
-  const parsedRuns = primaryRuns.filter(r => r.parsed !== null);
-  if (parsedRuns.length === 0) {
-    // All N calls failed. Surface the first-run diagnostic so the
-    // operator sees *why* (unreachable / non-200 / unparseable).
-    const first = primaryRuns[0];
+  let resp: Response;
+  try {
+    resp = await fetch(`${GATEWAY}/v1/chat`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        provider: "ollama_cloud",
+        model: MODEL,
+        messages: [
+          { role: "system", content: systemMsg },
+          { role: "user", content: userMsg },
+        ],
+        // Classification call — temperature=0 selects greedy decoding,
+        // which removes sampling randomness and is meant to make
+        // repeated runs on the same input agree. This addressed the
+        // signature creep we observed in the 9-run empirical test
+        // (sig_count 16→27 from cloud phrasing variance at temp=0.2).
+        //
+        // IMPORTANT: keep think=true. gpt-oss:120b is a reasoning
+        // model; setting think=false caused it to return empty content
+        // on large prompts (observed during Level 1 validation: 13421
+        // tokens used, empty content returned). NOTE(review): the
+        // reasoning trace is variable prose, so the final verdict is
+        // expected to be stable at temp=0 but is not guaranteed to be
+        // identical on every run — confirm before relying on it.
+        max_tokens: 3000,
+        temperature: 0,
+        think: true,
+      }),
+      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
+    });
+  } catch (e) {
+    // Cloud unreachable → soft-fail. Don't block a PR because the
+    // reviewer model is down. Static + dynamic + kb still run.
    return [{
      check: "inference",
      severity: "info",
-      summary: `cloud inference all ${N_CONSENSUS} consensus runs failed — ${first.error ?? "unknown"}`,
+      summary: "cloud inference unreachable — skipped",
+      evidence: [`fetch failed: ${(e as Error).message.slice(0, 180)}`],
+    }];
+  }
+
+  if (!resp.ok) {
+    return [{
+      check: "inference",
+      severity: "info",
+      summary: `cloud inference returned ${resp.status} — skipped`,
+      evidence: [`body: ${(await resp.text()).slice(0, 200)}`],
+    }];
+  }
+
+  const body: any = await resp.json();
+  const content: string = body?.choices?.[0]?.message?.content ?? "";
+  const usage = body?.usage ?? {};
+
+  const parsed = extractJson(content);
+  if (!parsed) {
+    return [{
+      check: "inference",
+      severity: "info",
+      summary: "cloud returned unparseable output — skipped",
      evidence: [
-        `first-run diagnostic: ${first.diagnostic ?? "(none)"}`,
-        `successful runs: 0 / ${N_CONSENSUS}`,
+        `head: ${content.slice(0, 200)}`,
+        `tokens: ${usage.total_tokens ?? "?"}`,
      ],
    }];
  }

-  // Aggregate votes per claim_idx.
-  interface Votes { trues: number; falses: number; evidences: string[] }
-  const votesByClaim = new Map<number, Votes>();
-  const unflaggedByRun: any[][] = [];
-  let totalTokens = 0;
-  for (const run of parsedRuns) {
-    totalTokens += run.tokens;
-    unflaggedByRun.push(Array.isArray(run.parsed?.unflagged_gaps) ? run.parsed.unflagged_gaps : []);
-    for (const v of run.parsed?.claim_verdicts ?? []) {
-      const idx = Number(v?.claim_idx);
-      if (!Number.isFinite(idx)) continue;
-      const rec = votesByClaim.get(idx) ?? { trues: 0, falses: 0, evidences: [] };
-      if (v.backed === false) {
-        rec.falses++;
-        rec.evidences.push(String(v.evidence ?? ""));
-      } else if (v.backed === true) {
-        rec.trues++;
-      }
-      votesByClaim.set(idx, rec);
-    }
-  }
-
  const findings: Finding[] = [];

-  // Summary finding so the verdict layer knows the check ran.
+  // One summary info finding so the verdict layer knows the check ran.
  findings.push({
    check: "inference",
    severity: "info",
-    summary: `cloud review completed (model=${MODEL}, consensus=${parsedRuns.length}/${N_CONSENSUS}, tokens=${totalTokens})${curationNote}`,
+    summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})${curationNote}`,
    evidence: [
-      `claims voted: ${votesByClaim.size}`,
-      `parsed runs: ${parsedRuns.length} / ${N_CONSENSUS}`,
+      `claim_verdicts: ${parsed.claim_verdicts?.length ?? 0}, unflagged_gaps: ${parsed.unflagged_gaps?.length ?? 0}`,
    ],
  });

-  // Per-claim majority vote; tie-break if no majority.
-  const discrepancies: Array<{
-    claim_idx: number;
-    claim_text: string;
-    votes: { trues: number; falses: number };
-    resolution: "majority_backed" | "majority_not_backed" | "tiebreaker_backed" | "tiebreaker_not_backed" | "unresolved";
-    tiebreaker_model?: string;
-  }> = [];
-
-  for (const [idx, votes] of votesByClaim) {
-    const claim = verifiable[idx];
-    if (!claim) continue;
-    const totalVotes = votes.trues + votes.falses;
-    let notBacked: boolean | null = null;
-    let resolution: typeof discrepancies[number]["resolution"] = "majority_backed";
-    let evidenceText = "";
-    let tbModel: string | undefined;
-
-    if (votes.falses > votes.trues) {
-      notBacked = true;
-      resolution = "majority_not_backed";
-      evidenceText = votes.evidences[0] ?? "(no reason given)";
-    } else if (votes.trues > votes.falses) {
-      notBacked = false;
-      resolution = "majority_backed";
-    } else {
-      // Tie. Run tie-breaker with a different-architecture model.
-      const tb = await runCloudInference(systemMsg, userMsg, TIEBREAKER_MODEL);
-      if (tb.parsed) {
-        const tv = (tb.parsed.claim_verdicts ?? []).find((v: any) => Number(v?.claim_idx) === idx);
-        if (tv?.backed === false) {
-          notBacked = true;
-          resolution = "tiebreaker_not_backed";
-          evidenceText = `(tie-breaker ${TIEBREAKER_MODEL}) ${String(tv.evidence ?? "")}`;
-          tbModel = TIEBREAKER_MODEL;
-        } else if (tv?.backed === true) {
-          notBacked = false;
-          resolution = "tiebreaker_backed";
-          tbModel = TIEBREAKER_MODEL;
-        } else {
-          resolution = "unresolved";
-        }
-      } else {
-        resolution = "unresolved";
-      }
-    }
-
-    // Log every case where the N runs disagreed — discrepancies are
-    // signal, not noise. Separate from audit_lessons.jsonl because
-    // they're about the *auditor's* quality, not the PR's quality.
-    const disagreed = totalVotes >= 2 && votes.trues > 0 && votes.falses > 0;
-    if (disagreed || resolution.startsWith("tiebreaker") || resolution === "unresolved") {
-      discrepancies.push({
-        claim_idx: idx,
-        claim_text: claim.text,
-        votes: { trues: votes.trues, falses: votes.falses },
-        resolution,
-        tiebreaker_model: tbModel,
-      });
-    }
-
-    if (notBacked === true) {
+  for (const v of parsed.claim_verdicts ?? []) {
+    if (v?.backed === false) {
+      const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
+      // Indices point at the verifiable[] list we sent the cloud,
+      // not the full claims[] list. Translate back.
+      const claim = verifiable[idx];
+      if (!claim) continue;
+      // Strong+unbacked = BLOCK. That's the whole point of the auditor.
      const sev: Finding["severity"] = claim.strength === "strong" ? "block"
        : claim.strength === "moderate" ? "warn"
        : "info";
@@ -312,22 +266,12 @@ export async function runInferenceCheck(
        summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`,
        evidence: [
          `at ${claim.location}`,
-          `consensus: ${votes.falses}/${totalVotes} not-backed (resolution: ${resolution})`,
-          `cloud reason: ${evidenceText.slice(0, 200)}`,
+          `cloud reason: ${String(v.evidence ?? "no reason given").slice(0, 200)}`,
        ],
      });
    }
  }

-  // Persist discrepancies so we can measure consensus drift over time.
-  if (discrepancies.length > 0 && ctx) {
-    persistDiscrepancies(ctx, discrepancies).catch(e =>
-      console.error(`[inference] discrepancy log failed: ${(e as Error).message}`));
-  }
-
-  // Use first run's parsed for downstream unflagged_gaps processing.
-  const parsed = parsedRuns[0].parsed;
-
  // Route the curated scratchpad through llm_team's extract-facts
  // pipeline when we have (a) a curated scratchpad (best signal about
  // what the PR actually changed) and (b) PR context to scope facts.
@@ -394,71 +338,6 @@ export async function runInferenceCheck(
  return findings;
 }

-// Single cloud call — the consensus loop calls this N times in
-// parallel. Returns the parsed JSON shape + token usage + any error
-// diagnostic. NEVER throws; the consensus aggregator handles partial
-// failures by dropping non-parsed runs from the vote.
-interface CloudRunResult {
-  parsed: any | null;
-  tokens: number;
-  error?: string;       // "unreachable" | "non_200" | "unparseable"
-  diagnostic?: string;  // first 200 chars for debugging
-  model: string;
-}
-
-async function runCloudInference(systemMsg: string, userMsg: string, model: string): Promise<CloudRunResult> {
-  let resp: Response;
-  try {
-    resp = await fetch(`${GATEWAY}/v1/chat`, {
-      method: "POST",
-      headers: { "content-type": "application/json" },
-      body: JSON.stringify({
-        provider: "ollama_cloud",
-        model,
-        messages: [
-          { role: "system", content: systemMsg },
-          { role: "user", content: userMsg },
-        ],
-        // temp=0 (greedy) + think=true. think=true is required for
-        // gpt-oss:120b — without it the model returns empty content
-        // on large prompts. Variance from the think trace is observed
-        // in practice, which is why we use N=3 consensus, not single-
-        // call determinism.
-        max_tokens: 3000,
-        temperature: 0,
-        think: true,
-      }),
-      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
-    });
-  } catch (e) {
-    return { parsed: null, tokens: 0, error: "unreachable", diagnostic: (e as Error).message.slice(0, 200), model };
-  }
-  if (!resp.ok) {
-    return { parsed: null, tokens: 0, error: "non_200", diagnostic: `${resp.status}: ${(await resp.text()).slice(0, 160)}`, model };
-  }
-  let body: any;
-  try { body = await resp.json(); }
-  catch (e) { return { parsed: null, tokens: 0, error: "unparseable", diagnostic: (e as Error).message, model }; }
-  const content: string = body?.choices?.[0]?.message?.content ?? "";
-  const tokens: number = body?.usage?.total_tokens ?? 0;
-  const parsed = extractJson(content);
-  if (!parsed) {
-    return { parsed: null, tokens, error: "unparseable", diagnostic: content.slice(0, 200), model };
-  }
-  return { parsed, tokens, model };
-}
-
-async function persistDiscrepancies(ctx: InferenceContext, discrepancies: any[]): Promise<void> {
-  await mkdir("/home/profit/lakehouse/data/_kb", { recursive: true });
-  const rows = discrepancies.map(d => JSON.stringify({
-    pr_number: ctx.pr_number,
-    head_sha: ctx.head_sha,
-    logged_at: new Date().toISOString(),
-    ...d,
-  }));
-  await appendFile(AUDIT_DISCREPANCIES_JSONL, rows.join("\n") + "\n");
-}
-
 // Extract structured knowledge from the curated scratchpad and append
 // to data/_kb/audit_facts.jsonl — one row per extract run, keyed by
 // PR number + head SHA for scope tracking. kb_query tails this next
@@ -481,10 +360,6 @@ async function extractAndPersistFacts(scratchpad: string, ctx: InferenceContext)
    entities: ex.entities,
    relationships: ex.relationships,
    verification_preview: ex.verification.slice(0, 400),
-    verifier_verdicts: ex.verifier_verdicts,
-    facts_dropped_by_verifier: ex.facts_dropped_by_verifier ?? 0,
-    schema_version: 2,
-    source: "audit_inference",
  };
  await mkdir("/home/profit/lakehouse/data/_kb", { recursive: true });
  await appendFile(AUDIT_FACTS_JSONL, JSON.stringify(row) + "\n");
--- a/auditor/fact_extractor.ts
+++ b/auditor/fact_extractor.ts
@@ -48,12 +48,6 @@ export interface ExtractedFacts {
  // persists to its own team_runs; this is for our own cross-ref).
  llm_team_run_id?: number;
  extracted_at: string;
-  // Per-fact verdicts from the verifier pass (CORRECT/INCORRECT/
-  // UNVERIFIABLE/UNCHECKED). Aligned 1:1 with the *raw* fact list
-  // pre-drop so operators can see which verdicts mapped to dropped
-  // facts if needed.
-  verifier_verdicts?: Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED">;
-  facts_dropped_by_verifier?: number;
  error?: string;
 }

@@ -148,25 +142,9 @@ export async function extractFacts(source: string): Promise<ExtractedFacts> {
    return { ...base, error: "extractor returned no parseable JSON", verification: verifierText };
  }

-  const rawFacts: string[] = Array.isArray(parsed.facts)
-    ? parsed.facts.slice(0, 50).map(String)
-    : [];
-
-  // Parse the verifier's free-form prose into per-fact verdicts, then
-  // drop any fact the verifier explicitly marked INCORRECT. Leave
-  // UNVERIFIABLE in place: many of our extractions are domain-specific
-  // (Lakehouse internals) and the verifier has no prior-knowledge
-  // anchor, so UNVERIFIABLE is the expected verdict for new signal,
-  // not a quality fail. This is verifier-gated persistence: drop only
-  // what's affirmatively wrong, not what's novel.
-  const verdicts = parseVerifierVerdicts(verifierText, rawFacts.length);
-  const incorrectIdx = new Set<number>();
-  verdicts.forEach((v, i) => { if (v === "INCORRECT") incorrectIdx.add(i); });
-  const kept = rawFacts.filter((_, i) => !incorrectIdx.has(i));
-
  return {
    ...base,
-    facts: kept,
+    facts: Array.isArray(parsed.facts) ? parsed.facts.slice(0, 50).map(String) : [],
    entities: Array.isArray(parsed.entities)
      ? parsed.entities.slice(0, 30).map((e: any) => ({
          name: String(e?.name ?? ""),
@@ -182,36 +160,10 @@ export async function extractFacts(source: string): Promise<ExtractedFacts> {
        })).filter(r => r.from.length > 0 && r.to.length > 0)
      : [],
    verification: verifierText.slice(0, 1500),
-    facts_dropped_by_verifier: incorrectIdx.size,
-    verifier_verdicts: verdicts,
    llm_team_run_id: runId,
  };
 }

-// Parse verifier's free-form output into a per-fact verdict array.
-// The verifier output typically looks like:
-//   **1.** The claim...
-//   * **Verdict:** CORRECT
-//   **2.** ...
-//   **Verdict:** UNVERIFIABLE
-// Using matchAll to iterate — returns a verdict array of length
-// numFacts; unmatched positions stay UNCHECKED.
-function parseVerifierVerdicts(
-  verifierText: string,
-  numFacts: number,
-): Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED"> {
-  const out: Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED"> =
-    Array(numFacts).fill("UNCHECKED");
-  const re = /(?:\*\*|#+\s*)?(\d+)[.):]\s[\s\S]*?\bVerdict\s*:\s*\*?\*?\s*(CORRECT|INCORRECT|UNVERIFIABLE)/gi;
-  for (const m of verifierText.matchAll(re)) {
-    const idx = Number(m[1]) - 1;
-    if (idx >= 0 && idx < numFacts) {
-      out[idx] = m[2].toUpperCase() as "CORRECT" | "INCORRECT" | "UNVERIFIABLE";
-    }
-  }
-  return out;
-}
-
 // Lift the first balanced JSON object out of (possibly fenced) text.
 // Same discipline as inference.ts::extractJson.
 function extractFirstJsonObject(text: string): any | null {
--- a/auditor/kb_stats.ts
+++ b/auditor/kb_stats.ts
@@ -1,269 +0,0 @@
-// kb_stats — on-demand dashboard numbers from the KB scratchpad
-// files. Reads data/_auditor/verdicts/*, data/_kb/audit_lessons.jsonl,
-// data/_kb/audit_facts.jsonl, data/_kb/audit_discrepancies.jsonl,
-// data/_kb/scrum_reviews.jsonl and prints:
-//
-//   - verdict flip-flop rate (same SHA re-audited, verdict changed?)
-//   - consensus discrepancy rate (N runs disagreed on a claim)
-//   - confidence distribution from kb_index aggregator
-//   - top N recurring entities from audit_facts
-//   - fact growth over time
-//   - scrum vs inference KB split
-//
-// Run:  bun run auditor/kb_stats.ts
-//       bun run auditor/kb_stats.ts --top 15     # show top 15 entities
-//       bun run auditor/kb_stats.ts --json       # machine-readable
-//
-// This is the "dashboard" without running Grafana. If someone really
-// wants a dashboard, wire this output into a static HTML page + cron.
-
-import { readFile, readdir } from "node:fs/promises";
-import { join } from "node:path";
-import { aggregate } from "./kb_index.ts";
-
-const REPO = "/home/profit/lakehouse";
-const VERDICTS_DIR = `${REPO}/data/_auditor/verdicts`;
-const AUDIT_LESSONS = `${REPO}/data/_kb/audit_lessons.jsonl`;
-const AUDIT_FACTS = `${REPO}/data/_kb/audit_facts.jsonl`;
-const AUDIT_DISCREPANCIES = `${REPO}/data/_kb/audit_discrepancies.jsonl`;
-const SCRUM_REVIEWS = `${REPO}/data/_kb/scrum_reviews.jsonl`;
-
-interface Args {
-  top: number;
-  json: boolean;
-}
-
-function parseArgs(argv: string[]): Args {
-  const a: Args = { top: 10, json: false };
-  for (let i = 2; i < argv.length; i++) {
-    if (argv[i] === "--top") a.top = Number(argv[++i] ?? 10);
-    else if (argv[i] === "--json") a.json = true;
-  }
-  return a;
-}
-
-async function readJsonl<T = any>(path: string): Promise<T[]> {
-  try {
-    const raw = await readFile(path, "utf8");
-    return raw.split("\n").filter(l => l.length > 0).map(l => {
-      try { return JSON.parse(l) as T; } catch { return null as any; }
-    }).filter(r => r !== null);
-  } catch { return []; }
-}
-
-async function loadVerdicts(): Promise<Array<{ pr: number; sha: string; overall: string; findings_total: number; findings_block: number; findings_warn: number }>> {
-  let files: string[] = [];
-  try { files = await readdir(VERDICTS_DIR); } catch { return []; }
-  const out = [];
-  for (const f of files) {
-    if (!f.endsWith(".json")) continue;
-    const m = f.match(/^(\d+)-([0-9a-f]+)\.json$/);
-    if (!m) continue;
-    try {
-      const v = JSON.parse(await readFile(join(VERDICTS_DIR, f), "utf8"));
-      out.push({
-        pr: Number(m[1]),
-        sha: m[2],
-        overall: String(v.overall),
-        findings_total: Number(v.metrics?.findings_total ?? 0),
-        findings_block: Number(v.metrics?.findings_block ?? 0),
-        findings_warn: Number(v.metrics?.findings_warn ?? 0),
-      });
-    } catch { /* skip corrupt */ }
-  }
-  return out;
-}
-
-interface Stats {
-  audit_count: number;
-  verdict_distribution: Record<string, number>;
-  // Same PR with multiple SHAs — if verdicts differ, that's drift across
-  // the PR's commit history. Not a flip-flop in the classical sense,
-  // but worth surfacing (e.g. "PR #8 was block block req req block").
-  per_pr_verdict_sequences: Record<number, string[]>;
-  // For each PR with ≥ 2 audits, how many distinct verdicts did it
-  // produce? 1 = stable; 2+ = some flipping.
-  verdict_instability: { pr_count: number; pr_with_multiple_verdicts: number; pr_with_3plus: number };
-  consensus: { discrepancy_count: number; tiebreaker_used: number; unresolved: number };
-  kb: {
-    audit_lessons_rows: number;
-    audit_facts_rows: number;
-    scrum_reviews_rows: number;
-    distinct_finding_signatures: number;
-    distinct_entities_across_prs: number;
-    entities_in_2plus_prs: number;
-    entities_in_5plus_prs: number;
-  };
-  fact_quality: {
-    verifier_verdict_distribution: Record<string, number>;
-    facts_dropped_by_verifier_total: number;
-    extraction_success_rate: number;
-  };
-  top_entities: Array<{ name: string; distinct_prs: number; count: number; types: string[] }>;
-  kb_by_source: Record<string, number>;
-}
-
-async function collect(args: Args): Promise<Stats> {
-  const verdicts = await loadVerdicts();
-  const lessons = await readJsonl<any>(AUDIT_LESSONS);
-  const facts = await readJsonl<any>(AUDIT_FACTS);
-  const disc = await readJsonl<any>(AUDIT_DISCREPANCIES);
-  const reviews = await readJsonl<any>(SCRUM_REVIEWS);
-
-  // Verdict stability
-  const byPr: Record<number, string[]> = {};
-  const verdictDist: Record<string, number> = {};
-  for (const v of verdicts) {
-    (byPr[v.pr] ??= []).push(v.overall);
-    verdictDist[v.overall] = (verdictDist[v.overall] ?? 0) + 1;
-  }
-  let multi = 0, tri = 0;
-  for (const [_, seq] of Object.entries(byPr)) {
-    const distinct = new Set(seq);
-    if (distinct.size >= 2) multi++;
-    if (distinct.size >= 3) tri++;
-  }
-
-  // Consensus drift
-  const consensus = {
-    discrepancy_count: disc.length,
-    tiebreaker_used: disc.filter(d => String(d.resolution).startsWith("tiebreaker")).length,
-    unresolved: disc.filter(d => d.resolution === "unresolved").length,
-  };
-
-  // Lesson signatures
-  const lessonAgg = await aggregate<any>(AUDIT_LESSONS, {
-    keyFn: r => r?.signature,
-    scopeFn: r => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
-  });
-
-  // Entity aggregation across audit_facts rows
-  interface EntAgg { distinct_prs: Set<number>; count: number; types: Set<string>; name: string; sources: Set<string> }
-  const entAgg = new Map<string, EntAgg>();
-  const sourceCount: Record<string, number> = {};
-  let totalVerdictDist: Record<string, number> = { CORRECT: 0, INCORRECT: 0, UNVERIFIABLE: 0, UNCHECKED: 0 };
-  let factsDroppedTotal = 0;
-  let extractionsWithFacts = 0;
-
-  for (const row of facts) {
-    const src = String(row.source ?? "unknown");
-    sourceCount[src] = (sourceCount[src] ?? 0) + 1;
-    const pr = Number(row.pr_number);
-    if (Array.isArray(row.verifier_verdicts)) {
-      for (const v of row.verifier_verdicts) {
-        totalVerdictDist[v] = (totalVerdictDist[v] ?? 0) + 1;
-      }
-    }
-    factsDroppedTotal += Number(row.facts_dropped_by_verifier ?? 0);
-    if ((Array.isArray(row.facts) && row.facts.length > 0) || (Array.isArray(row.entities) && row.entities.length > 0)) {
-      extractionsWithFacts++;
-    }
-    for (const e of Array.isArray(row.entities) ? row.entities : []) {
-      const name = String(e?.name ?? "").trim();
-      if (name.length < 3) continue;
-      const key = name.toLowerCase();
-      const agg = entAgg.get(key) ?? { distinct_prs: new Set(), count: 0, types: new Set(), name, sources: new Set() };
-      agg.count++;
-      if (Number.isFinite(pr) && pr > 0) agg.distinct_prs.add(pr);
-      if (e?.type) agg.types.add(String(e.type));
-      agg.sources.add(src);
-      entAgg.set(key, agg);
-    }
-  }
-
-  const entitiesIn2Plus = Array.from(entAgg.values()).filter(a => a.distinct_prs.size >= 2).length;
-  const entitiesIn5Plus = Array.from(entAgg.values()).filter(a => a.distinct_prs.size >= 5).length;
-  const topEntities = Array.from(entAgg.values())
-    .sort((a, b) => b.distinct_prs.size - a.distinct_prs.size || b.count - a.count)
-    .slice(0, args.top)
-    .map(a => ({
-      name: a.name,
-      distinct_prs: a.distinct_prs.size,
-      count: a.count,
-      types: Array.from(a.types),
-    }));
-
-  const stats: Stats = {
-    audit_count: verdicts.length,
-    verdict_distribution: verdictDist,
-    per_pr_verdict_sequences: byPr,
-    verdict_instability: {
-      pr_count: Object.keys(byPr).length,
-      pr_with_multiple_verdicts: multi,
-      pr_with_3plus: tri,
-    },
-    consensus,
-    kb: {
-      audit_lessons_rows: lessons.length,
-      audit_facts_rows: facts.length,
-      scrum_reviews_rows: reviews.length,
-      distinct_finding_signatures: lessonAgg.size,
-      distinct_entities_across_prs: entAgg.size,
-      entities_in_2plus_prs: entitiesIn2Plus,
-      entities_in_5plus_prs: entitiesIn5Plus,
-    },
-    fact_quality: {
-      verifier_verdict_distribution: totalVerdictDist,
-      facts_dropped_by_verifier_total: factsDroppedTotal,
-      extraction_success_rate: facts.length > 0 ? extractionsWithFacts / facts.length : 0,
-    },
-    top_entities: topEntities,
-    kb_by_source: sourceCount,
-  };
-  return stats;
-}
-
-function renderHuman(s: Stats): string {
-  const lines: string[] = [];
-  lines.push("═══ KB STATS ═══");
-  lines.push("");
-  lines.push(`Audits: ${s.audit_count} total across ${s.verdict_instability.pr_count} distinct PRs`);
-  lines.push(`Verdicts: ${Object.entries(s.verdict_distribution).map(([k, v]) => `${k}=${v}`).join("  ")}`);
-  const multiplePct = s.verdict_instability.pr_count > 0
-    ? Math.round(100 * s.verdict_instability.pr_with_multiple_verdicts / s.verdict_instability.pr_count)
-    : 0;
-  lines.push(`Verdict instability: ${s.verdict_instability.pr_with_multiple_verdicts}/${s.verdict_instability.pr_count} PRs had 2+ distinct verdicts (${multiplePct}%) — 3+ distinct: ${s.verdict_instability.pr_with_3plus}`);
-  lines.push("");
-  lines.push("─── Consensus ───");
-  lines.push(`  discrepancies logged: ${s.consensus.discrepancy_count}`);
-  lines.push(`  tiebreaker used: ${s.consensus.tiebreaker_used}`);
-  lines.push(`  unresolved: ${s.consensus.unresolved}`);
-  const dRate = s.audit_count > 0 ? (100 * s.consensus.discrepancy_count / s.audit_count).toFixed(1) : "0";
-  lines.push(`  discrepancy rate: ${dRate}% of audits`);
-  lines.push("");
-  lines.push("─── KB size ───");
-  lines.push(`  audit_lessons.jsonl:     ${s.kb.audit_lessons_rows} rows, ${s.kb.distinct_finding_signatures} distinct signatures`);
-  lines.push(`  audit_facts.jsonl:       ${s.kb.audit_facts_rows} rows, ${s.kb.distinct_entities_across_prs} distinct entities`);
-  lines.push(`  scrum_reviews.jsonl:     ${s.kb.scrum_reviews_rows} rows`);
-  lines.push(`  entities in 2+ PRs:      ${s.kb.entities_in_2plus_prs}`);
-  lines.push(`  entities in 5+ PRs:      ${s.kb.entities_in_5plus_prs} ← strong cross-cutting signal`);
-  lines.push("");
-  lines.push("─── Fact quality ───");
-  const v = s.fact_quality.verifier_verdict_distribution;
-  lines.push(`  verifier verdicts:  CORRECT=${v.CORRECT ?? 0}  UNVERIFIABLE=${v.UNVERIFIABLE ?? 0}  UNCHECKED=${v.UNCHECKED ?? 0}  INCORRECT=${v.INCORRECT ?? 0}`);
-  lines.push(`  facts dropped by verifier: ${s.fact_quality.facts_dropped_by_verifier_total}`);
-  lines.push(`  extraction success rate: ${(s.fact_quality.extraction_success_rate * 100).toFixed(1)}%`);
-  lines.push("");
-  lines.push("─── KB sources ───");
-  for (const [src, n] of Object.entries(s.kb_by_source)) {
-    lines.push(`  ${src}: ${n}`);
-  }
-  lines.push("");
-  lines.push(`─── Top ${s.top_entities.length} recurring entities ───`);
-  for (const e of s.top_entities) {
-    lines.push(`  [${e.distinct_prs} PRs × ${e.count} obs]  ${e.name}  (${e.types.join(",")})`);
-  }
-  return lines.join("\n");
-}
-
-async function main() {
-  const args = parseArgs(process.argv);
-  const stats = await collect(args);
-  if (args.json) {
-    console.log(JSON.stringify(stats, (_, v) => v instanceof Set ? Array.from(v) : v, 2));
-  } else {
-    console.log(renderHuman(stats));
-  }
-}
-
-main().catch(e => { console.error("[kb_stats] fatal:", e); process.exit(1); });
--- a/tests/real-world/scrum_master_pipeline.ts
+++ b/tests/real-world/scrum_master_pipeline.ts
@@ -343,50 +343,12 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
      attempts_made: history.length,
      tree_split_fired: treeSplitFired,
      suggestions_preview: accepted.slice(0, 2000),
-      schema_version: 2,
-      scrum_master_reviewed: true,
    };
    try {
      await appendFile(SCRUM_REVIEWS_JSONL, JSON.stringify(row) + "\n");
    } catch (e) {
      console.error(`[scrum] failed to append scrum_reviews.jsonl: ${(e as Error).message}`);
    }
-
-    // Route the accepted review through llm_team's fact extractor so
-    // its entities + relationships land in audit_facts.jsonl alongside
-    // inference-side extractions. Same index, two sources. Tagged
-    // source:"scrum_review" + scrum_master_reviewed:true so downstream
-    // queries can filter by provenance. Reviews shorter than 120
-    // chars are skipped — they're usually one-liners ("LGTM") with
-    // no extractable knowledge.
-    if (accepted.length >= 120 && process.env.LH_SCRUM_SKIP_EXTRACT !== "1") {
-      try {
-        const { extractFacts } = await import("../../auditor/fact_extractor.ts");
-        const ex = await extractFacts(accepted);
-        if (!ex.error || ex.entities.length + ex.facts.length > 0) {
-          const factRow = {
-            pr_number: 0,                   // scrum runs outside a PR scope
-            file: rel,
-            head_sha: "",                    // no SHA scope; scope is the file+timestamp
-            extracted_at: ex.extracted_at,
-            extractor: ex.extractor_model,
-            verifier: ex.verifier_model,
-            llm_team_run_id: ex.llm_team_run_id ?? null,
-            facts: ex.facts,
-            entities: ex.entities,
-            relationships: ex.relationships,
-            verification_preview: ex.verification.slice(0, 400),
-            schema_version: 2,
-            source: "scrum_review",
-            scrum_master_reviewed: true,
-          };
-          const AUDIT_FACTS_JSONL = "/home/profit/lakehouse/data/_kb/audit_facts.jsonl";
-          await appendFile(AUDIT_FACTS_JSONL, JSON.stringify(factRow) + "\n");
-        }
-      } catch (e) {
-        console.error(`[scrum] fact extraction failed for ${rel}: ${(e as Error).message}`);
-      }
-    }
  }

  return review;