From 77650c4ba34a1efc6be9de94990ff82a63fea3cf Mon Sep 17 00:00:00 2001
From: profit
Date: Wed, 22 Apr 2026 23:09:14 -0500
Subject: [PATCH] auditor: inference curation layer + llm_team fact
 extraction → KB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the cycle J asked for: curated cloud output lands structured
knowledge in the KB, so future audits have architectural context, not
just a log of per-finding signatures.

Three pieces:

1. Inference curation (tree-split) — when the diff exceeds 30KB, shard
   it at 4.5KB, summarize each shard via cloud (temp=0; think=false on
   the small shards, think=true on the main call), and merge the
   summaries into a scratchpad. The cloud verification then runs
   against the scratchpad, not a truncated raw diff. This eliminates
   the 40KB MAX_DIFF_CHARS truncation path for large PRs (PR #8 is
   102KB — we were losing 62KB). Anti-false-positive guard in the
   prompt: the cloud is told that absence from the scratchpad is NOT
   absence from the diff, so it doesn't flag curated-out symbols as
   missing. The unflagged_gaps section is dropped entirely when
   curated (the scratchpad can't ground them).

2. fact_extractor — TS client for llm_team_ui's extract-facts mode at
   localhost:5000/api/run. Sends the curated scratchpad through the
   qwen2.5 extractor + gemma2 verifier, parses the SSE stream, and
   returns structured {facts, entities, relationships, verification,
   llm_team_run_id}. Best-effort: if llm_team is down, extraction
   fails silently and the audit still completes. AWAITED so CLI tools
   (audit_one.ts) don't exit before extraction lands — the systemd
   poller has 90s of headroom, so the extra ~15s doesn't matter.

3. audit_facts.jsonl + checkAuditFacts() — one row per curated audit
   with the extraction result. kb_query tails the jsonl, explodes
   entity rows, aggregates by entity name with distinct-PR counting,
   and surfaces entities recurring in 2+ PRs as info findings. Filters
   out short names (<3 chars, extractor truncation artifacts) and
   generic types (string/number/etc.) so signal isn't drowned.

Verified end-to-end on PR #8: 102KB diff → 23 shards → 1KB scratchpad →
qwen2.5 extracted 4 facts + 6 entities + 6 relationships (real
code-level knowledge: the AggregateOptions type, the aggregate async
function with its real signature, typed relationships).
llm_team_run_id cross-references llm_team's own team_runs table.

Also: audit.ts passes (pr_number, head_sha) as InferenceContext so
extracted facts are scope-tagged for the KB index.
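
audit_facts.jsonl row shape (illustrative; field names match
extractAndPersistFacts, the SHA, run id, and values are stand-ins):

  {"pr_number":8,"head_sha":"<sha>","extracted_at":"<iso8601>",
   "extractor":"qwen2.5:latest","verifier":"gemma2:latest",
   "llm_team_run_id":123,"facts":["..."],
   "entities":[{"name":"AggregateOptions","type":"type","description":"..."}],
   "relationships":[{"from":"aggregate","to":"AggregateOptions","type":"uses"}],
   "verification_preview":"..."}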
---
 auditor/audit.ts            |   2 +-
 auditor/checks/inference.ts | 220 ++++++++++++++++++++++++++++++++++--
 auditor/checks/kb_query.ts  | 102 +++++++++++++++++
 auditor/fact_extractor.ts   | 183 ++++++++++++++++++++++++++++++
 4 files changed, 499 insertions(+), 8 deletions(-)
 create mode 100644 auditor/fact_extractor.ts

diff --git a/auditor/audit.ts b/auditor/audit.ts
index 18bf732..91d23fc 100644
--- a/auditor/audit.ts
+++ b/auditor/audit.ts
@@ -56,7 +56,7 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
   const [staticFindings, dynamicFindings, inferenceFindings, kbFindings] = await Promise.all([
     runStaticCheck(diff),
     opts.skip_dynamic ? Promise.resolve(stubFinding("dynamic", "skipped by options")) : runDynamicCheck(),
-    opts.skip_inference ? Promise.resolve(stubFinding("inference", "skipped by options")) : runInferenceCheck(claims, diff),
+    opts.skip_inference ? Promise.resolve(stubFinding("inference", "skipped by options")) : runInferenceCheck(claims, diff, { pr_number: pr.number, head_sha: pr.head_sha }),
     runKbCheck(claims, pr.files.map(f => f.path)),
   ]);
 
diff --git a/auditor/checks/inference.ts b/auditor/checks/inference.ts
index c6c3dbf..488be8f 100644
--- a/auditor/checks/inference.ts
+++ b/auditor/checks/inference.ts
@@ -14,7 +14,8 @@ import type { Claim, Finding } from "../types.ts";
 import { Glob } from "bun";
-import { readFile } from "node:fs/promises";
+import { readFile, mkdir, appendFile } from "node:fs/promises";
+import { extractFacts } from "../fact_extractor.ts";
 
 const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
 const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
@@ -22,11 +23,36 @@ const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
 // previously truncated at 15KB causing the reviewer to miss later
 // files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
 // block finding when the file was simply outside the truncation window.
+//
+// Above this threshold we curate via tree-split rather than truncate,
+// following the scrum_master pattern: shard the diff, summarize each
+// shard against the claim-verification task, merge into a compact
+// scratchpad, then ask the cloud to verify claims against the
+// scratchpad. This gives the cloud full-PR fidelity without bursting
+// its context window (observed failure mode: empty response or
+// unparseable output when prompt exceeds model's comfortable range).
 const MAX_DIFF_CHARS = 40000;
+// Tree-split kicks in above this. 30KB is below MAX_DIFF_CHARS so we
+// curate BEFORE truncation would happen — never lose signal to a hard
+// cut. Shard size is chosen so ~23 shards cover a PR #8-size diff in a
+// reasonable round-trip budget.
+const CURATION_THRESHOLD = 30000;
+const DIFF_SHARD_SIZE = 4500;
 const CALL_TIMEOUT_MS = 120_000;
 const REPO_ROOT = "/home/profit/lakehouse";
 
-export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
+export interface InferenceContext {
+  pr_number: number;
+  head_sha: string;
+}
+
+const AUDIT_FACTS_JSONL = "/home/profit/lakehouse/data/_kb/audit_facts.jsonl";
+
+export async function runInferenceCheck(
+  claims: Claim[],
+  diff: string,
+  ctx?: InferenceContext,
+): Promise<Finding[]> {
   if (claims.length === 0) {
     return [{
       check: "inference",
@@ -51,9 +77,26 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
     }];
   }
 
-  const truncated = diff.length > MAX_DIFF_CHARS
-    ? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
-    : diff;
+  // Diff source for the cloud prompt — either the raw diff (small
+  // enough to fit), or a tree-split scratchpad (curation layer). We
+  // prefer curation to truncation: truncation silently drops files
+  // past the window; curation summarizes them so the cloud still sees
+  // what changed, just densified.
+  let diffForPrompt: string;
+  let curationNote = "";
+  if (diff.length > CURATION_THRESHOLD) {
+    const ts = await treeSplitDiff(diff, verifiable);
+    diffForPrompt = ts.scratchpad;
+    curationNote = ` (curated: ${diff.length} chars → ${ts.shards} shards → scratchpad ${ts.scratchpad.length} chars)`;
+  } else {
+    diffForPrompt = diff;
+  }
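+  // Worked example (numbers from the PR #8 validation run; char counts
+  // are rounded): a 102KB diff clears CURATION_THRESHOLD, shards into
+  // 23 pieces of at most 4,500 chars, and merges back into a ~1KB
+  // scratchpad, so curationNote reads roughly:
+  //   " (curated: 102000 chars → 23 shards → scratchpad 1000 chars)"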
+  // Belt-and-suspenders truncation — even a tree-split scratchpad
+  // shouldn't exceed MAX_DIFF_CHARS in practice, but guard anyway so
+  // pathological inputs can't burst the prompt.
+  const truncated = diffForPrompt.length > MAX_DIFF_CHARS
+    ? diffForPrompt.slice(0, MAX_DIFF_CHARS) + `\n...[${diffForPrompt.length - MAX_DIFF_CHARS} more chars truncated]`
+    : diffForPrompt;
 
   // Build the reviewer prompt in the same shape as run_codereview's
@@ -61,6 +104,30 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
   // review stage (llm_team_ui.py:10950), adapted for claim verification:
   // "Code: ..."
   // "Review: bugs/security/perf/style/edge. Provide corrected code."
   // We add: claim list upfront + ask for structured JSON verdict.
+  //
+  // When the diff was curated (tree-split scratchpad), we add an
+  // explicit anti-false-positive instruction: the scratchpad is a
+  // distillation, not the full source, so absence-from-scratchpad is
+  // NOT evidence of absence-from-diff. Mirrors the fix we made in
+  // scrum_master's review prompt for the same class of error.
+  const isCurated = curationNote.length > 0;
+  const curationGuard = isCurated
+    ? [
+      "",
+      "CRITICAL: the 'Diff' below is a curated multi-shard scratchpad,",
+      "NOT the full raw diff. The scratchpad distills each shard down",
+      "to facts useful for claim verification and drops the rest.",
+      "DO NOT flag a function/field/feature as 'missing' or 'not",
+      "implemented' based solely on its absence from the scratchpad —",
+      "absence in a distillation is NOT evidence of absence in the",
+      "actual diff. Only judge a claim NOT BACKED when the scratchpad",
+      "DIRECTLY contradicts it (e.g. scratchpad shows the function was",
+      "added empty, or shows the claimed code path is a stub).",
+      "Skip the unflagged_gaps section entirely when operating on a",
+      "curated scratchpad — you can't reliably detect gaps from a",
+      "distillation, and false positives there are worse than misses.",
+    ].join("\n")
+    : "";
   const systemMsg = [
     "You review pull-request diffs against the author's own ship-claims.",
     "For each claim, decide: is it backed by actual code in the diff, or is",
@@ -74,6 +141,7 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
     " - the claim claims integration but the integration point is a stub",
     " - the diff contains unimplemented!() / todo!() / TODO comments",
     " - the claim says 'works end-to-end' but the diff has no end-to-end test",
+    curationGuard,
     "",
     "Respond with strict JSON only. No prose before or after. Shape:",
     "{",
@@ -174,7 +242,7 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
     findings.push({
       check: "inference",
       severity: "info",
-      summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})`,
+      summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})${curationNote}`,
       evidence: [
         `claim_verdicts: ${parsed.claim_verdicts?.length ?? 0}, unflagged_gaps: ${parsed.unflagged_gaps?.length ?? 0}`,
       ],
@@ -204,7 +272,29 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
     }
   }
 
-  for (const g of parsed.unflagged_gaps ?? []) {
+  // Route the curated scratchpad through llm_team's extract-facts
+  // pipeline when we have (a) a curated scratchpad (best signal about
+  // what the PR actually changed) and (b) PR context to scope facts.
+  // AWAITED (not fire-and-forget) so CLI callers like audit_one.ts
+  // don't exit before extraction lands; the systemd poller has plenty
+  // of headroom (90s cycle vs ~15s extraction). A failure inside
+  // extractAndPersistFacts is caught + logged but never throws.
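+  // LH_AUDITOR_SKIP_EXTRACT=1 is the manual kill-switch (assumed use:
+  // keep audits flowing while llm_team is down for a redeploy, without
+  // paying the extraction timeout on every curated PR).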
+  if (isCurated && ctx && process.env.LH_AUDITOR_SKIP_EXTRACT !== "1") {
+    try {
+      await extractAndPersistFacts(diffForPrompt, ctx);
+    } catch (e) {
+      console.error(`[inference] fact extraction failed: ${(e as Error).message}`);
+    }
+  }
+
+  // Belt-and-suspenders: when operating on a curated scratchpad, drop
+  // the unflagged_gaps section entirely. The distillation can't
+  // reliably ground gap-detection, and false positives are worse than
+  // misses for this signal class. The systemMsg already asks the
+  // cloud to skip this section when curated — but the model may still
+  // emit it, so we filter here too.
+  const gapsToEmit = isCurated ? [] : (parsed.unflagged_gaps ?? []);
+  for (const g of gapsToEmit) {
     const summary = String(g?.summary ?? "?");
     const location = String(g?.location ?? "?");
     // False-positive guard — when the cloud says "X not defined in this
@@ -248,6 +338,122 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
   return findings;
 }
 
+// Extract structured knowledge from the curated scratchpad and append
+// to data/_kb/audit_facts.jsonl — one row per extract run, keyed by
+// PR number + head SHA for scope tracking. kb_query tails this next
+// audit to surface recurring entities/relationships across PRs.
+async function extractAndPersistFacts(scratchpad: string, ctx: InferenceContext): Promise<void> {
+  const ex = await extractFacts(scratchpad);
+  if (ex.error && ex.entities.length === 0 && ex.facts.length === 0) {
+    // Full failure — log but don't write an empty row.
+    console.error(`[inference] extractFacts skipped row: ${ex.error}`);
+    return;
+  }
+  const row = {
+    pr_number: ctx.pr_number,
+    head_sha: ctx.head_sha,
+    extracted_at: ex.extracted_at,
+    extractor: ex.extractor_model,
+    verifier: ex.verifier_model,
+    llm_team_run_id: ex.llm_team_run_id ?? null,
+    facts: ex.facts,
+    entities: ex.entities,
+    relationships: ex.relationships,
+    verification_preview: ex.verification.slice(0, 400),
+  };
+  await mkdir("/home/profit/lakehouse/data/_kb", { recursive: true });
+  await appendFile(AUDIT_FACTS_JSONL, JSON.stringify(row) + "\n");
+}
+
+// Curation via tree-split — ports the scrum_master pattern into the
+// inference check. Shards the raw diff into DIFF_SHARD_SIZE chunks,
+// summarizes each shard *against the claim-verification task* so the
+// summary preserves exactly what the cloud needs to judge claims
+// (function signatures, struct fields, deletions, new files), drops
+// everything else. Merges into a compact scratchpad.
+//
+// Cost: N cloud calls for the shard summaries + 1 cloud call for the
+// final verification = N+1 calls instead of 1. Mitigation: shards run
+// serially (not parallel) to keep gateway load bounded; summary calls
+// use max_tokens=400 so they're fast (~2s each on gpt-oss:120b).
+//
+// Determinism: each shard summary call uses temp=0 + think=false (see
+// callCloud below), so identical input yields an identical scratchpad.
+// The final verification call then sees a stable scratchpad, giving
+// stable verdicts.
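+//
+// Scratchpad shape, illustrative only (the shard headers come from the
+// template in the loop below; the bullet content is invented):
+//
+//   --- shard 1 (chars 0..4500) ---
+//   - new file auditor/fact_extractor.ts; adds extractFacts(source)
+//   --- shard 2 (chars 4500..9000) ---
+//   - runInferenceCheck gains optional ctx?: InferenceContext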
+async function treeSplitDiff(
+  fullDiff: string,
+  claims: Claim[],
+): Promise<{ scratchpad: string; shards: number }> {
+  const shards: Array<{ from: number; to: number; text: string }> = [];
+  for (let i = 0; i < fullDiff.length; i += DIFF_SHARD_SIZE) {
+    const end = Math.min(i + DIFF_SHARD_SIZE, fullDiff.length);
+    shards.push({ from: i, to: end, text: fullDiff.slice(i, end) });
+  }
+  // Curate the claim list into a short form the summary prompt can
+  // use to bias extraction toward relevant facts.
+  const claimDigest = claims.map((c, i) =>
+    `${i}. [${c.strength}] "${c.text.slice(0, 100)}"`
+  ).join("\n");
+
+  let scratchpad = "";
+  for (const [si, shard] of shards.entries()) {
+    const prompt = [
+      `You are summarizing shard ${si + 1}/${shards.length} (chars ${shard.from}..${shard.to}) of a PR diff.`,
+      `The downstream task will verify these ship-claims against the full-PR summary. Extract ONLY facts that could confirm or refute these claims:`,
+      "",
+      claimDigest,
+      "",
+      "Extract: new function/method signatures, struct fields, deletions, new files, wiring (function X calls Y), absence-of-implementation markers, TODO comments on added lines.",
+      "Skip: comment-only edits, whitespace, import reordering, unrelated cosmetic changes.",
+      "",
+      "─────── shard diff ───────",
+      shard.text,
+      "─────── end shard ───────",
+      "",
+      "Output: up to 180 words of facts in bullet form. No prose preamble, no claim verdicts (that's for the downstream step).",
+    ].join("\n");
+
+    const r = await callCloud(prompt, 400);
+    if (r.content) {
+      scratchpad += `\n--- shard ${si + 1} (chars ${shard.from}..${shard.to}) ---\n${r.content.trim()}\n`;
+    }
+  }
+  return { scratchpad: scratchpad.trim(), shards: shards.length };
+}
+
+// Minimal cloud caller used only by treeSplitDiff — same gateway +
+// model as the top-level call, but think=false. Shards are small
+// (≤DIFF_SHARD_SIZE ~4500 chars) and the task is pure fact
+// extraction, not reasoning. think=true on the shards introduced
+// variance in reasoning traces that compounded across 23 calls into
+// a non-deterministic scratchpad (observed during curation
+// validation: same-SHA runs produced 5/7/8 final findings).
+// think=false is stable on small prompts and only degrades at
+// main-call scale (10K+ char prompts), so the main verification
+// call keeps think=true.
+async function callCloud(prompt: string, maxTokens: number): Promise<{ content: string }> {
+  try {
+    const r = await fetch(`${GATEWAY}/v1/chat`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        provider: "ollama_cloud",
+        model: MODEL,
+        messages: [{ role: "user", content: prompt }],
+        max_tokens: maxTokens,
+        temperature: 0,
+        think: false,
+      }),
+      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
+    });
+    if (!r.ok) return { content: "" };
+    const j: any = await r.json();
+    return { content: j?.choices?.[0]?.message?.content ?? "" };
+  } catch {
+    return { content: "" };
+  }
+}
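+
+// The gateway reply is assumed to be an OpenAI-style chat completion;
+// callCloud only ever reads choices[0].message.content, e.g.
+// (illustrative): {"choices":[{"message":{"content":"- adds treeSplitDiff"}}]}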
 
 // Pull out plausible code-symbol names from a summary string.
 // Matches:
 //  - identifier with backticks: `foo_bar`
diff --git a/auditor/checks/kb_query.ts b/auditor/checks/kb_query.ts
index 8daa410..475a7a8 100644
--- a/auditor/checks/kb_query.ts
+++ b/auditor/checks/kb_query.ts
@@ -25,6 +25,7 @@ const OBSERVER_OPS = "/home/profit/lakehouse/data/_observer/ops.jsonl";
 const BOT_CYCLES_DIR = "/home/profit/lakehouse/data/_bot/cycles";
 const SCRUM_REVIEWS_JSONL = "/home/profit/lakehouse/data/_kb/scrum_reviews.jsonl";
 const AUDIT_LESSONS_JSONL = "/home/profit/lakehouse/data/_kb/audit_lessons.jsonl";
+const AUDIT_FACTS_JSONL = "/home/profit/lakehouse/data/_kb/audit_facts.jsonl";
 const TAIL_LINES = 500;
 const MAX_BOT_CYCLE_FILES = 30;
 
@@ -61,6 +62,14 @@ export async function runKbCheck(claims: Claim[], prFiles: string[] = []): Promi
     findings.push(...scrumFindings);
   }
 
+  // 6b. Audit-facts (llm_team extract pipeline output) — surface
+  //     entities that recur across multiple PRs. These are the
+  //     "core system entities" accumulating in the knowledge base;
+  //     showing them as info on future audits gives reviewers
+  //     architectural context the raw diff doesn't convey.
+  const factFindings = await checkAuditFacts();
+  findings.push(...factFindings);
+
   // 6. Audit-lessons feedback loop — summarize the top recurring
   //    patterns from prior audits' block/warn findings. If the same
   //    pattern signature has fired 3+ times across prior audits,
@@ -207,6 +216,99 @@ function observerBySource(ops: any[]): string {
   return Object.entries(c).sort((a, b) => b[1] - a[1]).map(([k, v]) => `${k}=${v}`).join(", ") || "empty";
 }
 
+// Audit-facts — reads data/_kb/audit_facts.jsonl (populated by every
+// curated inference run via llm_team's extract pipeline). Each row
+// has arrays: facts, entities, relationships. We explode entities and
+// aggregate them across PRs by hand (kb_index's aggregate() is
+// one-signature-per-row, so it can't do the explode step). An entity
+// seen in 2+ PRs is a "core system entity" — we surface the top N as
+// info context.
+//
+// Filters out short names (<3 chars, likely qwen2.5 truncation
+// artifacts) and generic types ("string", "number") that would
+// otherwise dominate the ranking.
+const ENTITY_NAME_MIN_LEN = 3;
+const GENERIC_ENTITY_NAMES = new Set([
+  "string", "number", "boolean", "any", "void", "unknown", "never",
+  "object", "array", "function", "const", "let", "var", "true", "false",
+  "null", "undefined", "promise", "map", "set", "record",
+]);
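+
+// Illustrative finding from this check (shape matches the ranked.map()
+// in checkAuditFacts below; the entity name and counts are stand-ins):
+//   summary:  core entity `AggregateOptions` recurs in 2 PRs (types: type)
+//   evidence: count=3 distinct_PRs=2, PRs: 8,12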
"?"), + name, + description: String(e?.description ?? "").slice(0, 160), + }); + } + } + if (entityRows.length === 0) return []; + + // Aggregate manually — one key per entity name, distinct_scopes by PR. + type Agg = { count: number; scopes: Set; types: Set; last_name: string; last_desc: string }; + const byEntity = new Map(); + for (const r of entityRows) { + const a = byEntity.get(r.entity_key) ?? { + count: 0, scopes: new Set(), types: new Set(), last_name: "", last_desc: "", + }; + a.count += 1; + a.scopes.add(r.pr_number); + a.types.add(r.type); + a.last_name = r.name; + a.last_desc = r.description; + byEntity.set(r.entity_key, a); + } + + // Rank: require 2+ distinct PRs (same-PR entity-repeats don't count + // as "cross-cutting"). Take the top 5 to avoid flooding the verdict. + const ranked = Array.from(byEntity.entries()) + .filter(([_, a]) => a.scopes.size >= 2) + .sort((a, b) => b[1].scopes.size - a[1].scopes.size || b[1].count - a[1].count) + .slice(0, 5); + + if (ranked.length === 0) { + // Useful to know the KB is being populated — emit a single + // summary so operators see fact extraction is alive. + return [{ + check: "kb_query", + severity: "info", + summary: `audit_facts KB has ${entityRows.length} entity-observations across ${new Set(entityRows.map(r => r.pr_number)).size} PRs (no cross-PR recurrences yet)`, + evidence: [`source: ${AUDIT_FACTS_JSONL}`], + }]; + } + + return ranked.map(([_, a]) => ({ + check: "kb_query" as const, + severity: "info" as const, + summary: `core entity \`${a.last_name}\` recurs in ${a.scopes.size} PRs (types: ${Array.from(a.types).join(",")})`, + evidence: [ + `count=${a.count} distinct_PRs=${a.scopes.size}`, + `description: ${a.last_desc.slice(0, 200)}`, + `PRs: ${Array.from(a.scopes).sort((x, y) => x - y).join(",")}`, + ], + })); +} + // Audit-lessons — reads data/_kb/audit_lessons.jsonl (populated by // every audit's appendAuditLessons). Uses the shared kb_index // aggregator: groups by `signature`, distinct-scopes keyed by PR diff --git a/auditor/fact_extractor.ts b/auditor/fact_extractor.ts new file mode 100644 index 0000000..eb26710 --- /dev/null +++ b/auditor/fact_extractor.ts @@ -0,0 +1,183 @@ +// fact_extractor — routes curated TEXT through llm_team_ui's +// "knowledge extract facts" mode (mode=extract at /api/run). +// +// What it gives us: structured {facts, entities, relationships} from +// whatever curated blob we send. Auditor sends the tree-split +// inference scratchpad (the best distillation of what a PR changed). +// Scrum_master will later send its accepted review bodies. +// +// Why route through llm_team and not just extract directly from our +// own checks: llm_team's extract uses a local EXTRACTOR model +// (qwen2.5) + a separate VERIFIER (gemma2). This cross-check is the +// discipline J wants for knowledge going into the playbook — facts +// go in only after a second model has rated them CORRECT / +// UNVERIFIABLE. Fast (local models, ~10-20s), free, and matches the +// codereview pattern J already trusts. +// +// SSE parsing: llm_team streams SSE events. We're only interested in +// the final "response" event with role="final" + the extraction +// response (role="extraction N"). Parse the JSON from the extractor's +// response text. + +const LLM_TEAM = process.env.LH_LLM_TEAM_URL ?? "http://localhost:5000"; +const EXTRACTOR = process.env.LH_FACT_EXTRACTOR ?? "qwen2.5:latest"; +const VERIFIER = process.env.LH_FACT_VERIFIER ?? 
"gemma2:latest"; +const EXTRACT_TIMEOUT_MS = 120_000; + +export interface Entity { + name: string; + type: string; + description?: string; +} + +export interface Relationship { + from: string; + to: string; + type: string; +} + +export interface ExtractedFacts { + facts: string[]; + entities: Entity[]; + relationships: Relationship[]; + verification: string; + extractor_model: string; + verifier_model: string; + source_preview: string; + // Populated when the extract run completed server-side (llm_team + // persists to its own team_runs; this is for our own cross-ref). + llm_team_run_id?: number; + extracted_at: string; + error?: string; +} + +/** + * Run the llm_team extract pipeline on `source` text. Returns + * structured {facts, entities, relationships}. + * + * Returns an object with `error` set if the pipeline failed — never + * throws, because fact extraction is best-effort enrichment (the + * primary audit must not break if llm_team is down). + */ +export async function extractFacts(source: string): Promise { + const base: ExtractedFacts = { + facts: [], + entities: [], + relationships: [], + verification: "", + extractor_model: EXTRACTOR, + verifier_model: VERIFIER, + source_preview: source.slice(0, 240), + extracted_at: new Date().toISOString(), + }; + + let resp: Response; + try { + resp = await fetch(`${LLM_TEAM}/api/run`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + mode: "extract", + prompt: source, + extractor: EXTRACTOR, + verifier: VERIFIER, + source: "prompt", + skip_cache: true, // cache by prompt would dedup identical + // scratchpads, but we want fresh extraction + // for per-audit facts; cheap since local. + }), + signal: AbortSignal.timeout(EXTRACT_TIMEOUT_MS), + }); + } catch (e) { + return { ...base, error: `fetch failed: ${(e as Error).message}` }; + } + + if (!resp.ok) { + const body = await resp.text().catch(() => ""); + return { ...base, error: `llm_team /api/run ${resp.status}: ${body.slice(0, 200)}` }; + } + + // Stream SSE lines; collect the one extraction response + the run_saved event + // so we can capture the team-runs ID for cross-ref. + const decoder = new TextDecoder(); + const reader = resp.body?.getReader(); + if (!reader) return { ...base, error: "no response body" }; + + let buffer = ""; + let extractionText = ""; + let verifierText = ""; + let runId: number | undefined = undefined; + + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + buffer += decoder.decode(value, { stream: true }); + let nl: number; + while ((nl = buffer.indexOf("\n\n")) >= 0) { + const chunk = buffer.slice(0, nl); + buffer = buffer.slice(nl + 2); + const dataLine = chunk.split("\n").find(l => l.startsWith("data: ")); + if (!dataLine) continue; + try { + const ev = JSON.parse(dataLine.slice(6)); + if (ev.type === "response") { + const role = String(ev.role ?? ""); + if (role.startsWith("extraction")) extractionText = String(ev.text ?? ""); + else if (role === "verifier") verifierText = String(ev.text ?? ""); + } else if (ev.type === "run_saved") { + const id = Number(ev.run_id); + if (Number.isFinite(id)) runId = id; + } + } catch { /* skip malformed SSE */ } + } + } + } catch (e) { + return { ...base, error: `SSE read failed: ${(e as Error).message}` }; + } + + // Pull the JSON object out of extractionText (may be wrapped in ```json fences). 
+  const parsed = extractFirstJsonObject(extractionText);
+  if (!parsed) {
+    return { ...base, error: "extractor returned no parseable JSON", verification: verifierText };
+  }
+
+  return {
+    ...base,
+    facts: Array.isArray(parsed.facts) ? parsed.facts.slice(0, 50).map(String) : [],
+    entities: Array.isArray(parsed.entities)
+      ? parsed.entities.slice(0, 30).map((e: any) => ({
+          name: String(e?.name ?? ""),
+          type: String(e?.type ?? ""),
+          description: typeof e?.description === "string" ? e.description.slice(0, 240) : undefined,
+        })).filter(e => e.name.length > 0)
+      : [],
+    relationships: Array.isArray(parsed.relationships)
+      ? parsed.relationships.slice(0, 30).map((r: any) => ({
+          from: String(r?.from ?? ""),
+          to: String(r?.to ?? ""),
+          type: String(r?.type ?? ""),
+        })).filter(r => r.from.length > 0 && r.to.length > 0)
+      : [],
+    verification: verifierText.slice(0, 1500),
+    llm_team_run_id: runId,
+  };
+}
+
+// Lift the first balanced JSON object out of (possibly fenced) text.
+// Same discipline as inference.ts::extractJson.
+function extractFirstJsonObject(text: string): any | null {
+  const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, "");
+  let depth = 0, start = -1;
+  for (let i = 0; i < cleaned.length; i++) {
+    const c = cleaned[i];
+    if (c === "{") { if (depth === 0) start = i; depth++; }
+    else if (c === "}") {
+      depth--;
+      if (depth === 0 && start >= 0) {
+        try { return JSON.parse(cleaned.slice(start, i + 1)); } catch { start = -1; }
+      }
+    }
+  }
+  return null;
+}
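+
+// Usage sketch (hypothetical caller; inference.ts does the real wiring.
+// extractFacts never throws, so callers branch on .error):
+//   const ex = await extractFacts(scratchpad);
+//   if (!ex.error) console.log(ex.entities.map(e => e.name).join(", "));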