// fact_extractor — routes curated TEXT through llm_team_ui's
// "knowledge extract facts" mode (mode=extract at /api/run).
//
// What it gives us: structured {facts, entities, relationships} from
// whatever curated blob we send. Auditor sends the tree-split
// inference scratchpad (the best distillation of what a PR changed).
// Scrum_master will later send its accepted review bodies.
//
// Why route through llm_team and not just extract directly from our
// own checks: llm_team's extract uses a local EXTRACTOR model
// (qwen2.5) + a separate VERIFIER (gemma2). This cross-check is the
// discipline J wants for knowledge going into the playbook — facts
// go in only after a second model has rated them CORRECT /
// UNVERIFIABLE. Fast (local models, ~10-20s), free, and matches the
// codereview pattern J already trusts.
//
// SSE parsing: llm_team streams SSE events. We're only interested in
// the final "response" event with role="final" + the extraction
// response (role="extraction N"). Parse the JSON from the extractor's
// response text.

const LLM_TEAM = process.env.LH_LLM_TEAM_URL ?? "http://localhost:5000";
const EXTRACTOR = process.env.LH_FACT_EXTRACTOR ?? "qwen2.5:latest";
const VERIFIER = process.env.LH_FACT_VERIFIER ?? "gemma2:latest";
const EXTRACT_TIMEOUT_MS = 120_000;
const PROJECT_CONTEXT_FILE =
  process.env.LH_AUDITOR_CONTEXT_FILE ??
  "/home/profit/lakehouse/docs/AUDITOR_CONTEXT.md";

// Process-lifetime cache for the project-context preamble.
// null = not loaded yet; "" = load failed, run without a preamble.
let cachedContext: string | null = null;

// Read PROJECT_CONTEXT_FILE once and cache it. Never throws: a
// missing/unreadable file degrades to "" (extraction still runs,
// just without the grounding preamble).
async function loadProjectContext(): Promise<string> {
  if (cachedContext !== null) return cachedContext;
  try {
    const { readFile } = await import("node:fs/promises");
    const raw = await readFile(PROJECT_CONTEXT_FILE, "utf8");
    // Cap at 4KB — anything past that is more noise than signal for
    // the extractor/verifier's attention budget.
    cachedContext = raw.slice(0, 4000);
  } catch {
    cachedContext = ""; // context file missing → extractor runs without preamble
  }
  return cachedContext;
}

export interface Entity {
  name: string;
  type: string;
  description?: string;
}

export interface Relationship {
  from: string;
  to: string;
  type: string;
}

export interface ExtractedFacts {
  facts: string[];
  entities: Entity[];
  relationships: Relationship[];
  verification: string;
  extractor_model: string;
  verifier_model: string;
  source_preview: string;
  // Populated when the extract run completed server-side (llm_team
  // persists to its own team_runs; this is for our own cross-ref).
  llm_team_run_id?: number;
  extracted_at: string;
  // Per-fact verdicts from the verifier pass (CORRECT/INCORRECT/
  // UNVERIFIABLE/UNCHECKED). Aligned 1:1 with the *raw* fact list
  // pre-drop so operators can see which verdicts mapped to dropped
  // facts if needed.
  verifier_verdicts?: Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED">;
  facts_dropped_by_verifier?: number;
  error?: string;
}

/**
 * Run the llm_team extract pipeline on `source` text. Returns
 * structured {facts, entities, relationships}.
 *
 * Returns an object with `error` set if the pipeline failed — never
 * throws, because fact extraction is best-effort enrichment (the
 * primary audit must not break if llm_team is down).
 */
export async function extractFacts(source: string): Promise<ExtractedFacts> {
  const base: ExtractedFacts = {
    facts: [],
    entities: [],
    relationships: [],
    verification: "",
    extractor_model: EXTRACTOR,
    verifier_model: VERIFIER,
    source_preview: source.slice(0, 240),
    extracted_at: new Date().toISOString(),
  };

  // Prepend project context to the source so the extractor + verifier
  // know what codebase/framework these facts belong to. Without this,
  // the verifier marks most domain-specific facts as UNVERIFIABLE ("I
  // don't know what Lakehouse is"). With it, the verifier can CORRECT-
  // stamp facts that align with the stated architecture.
const context = await loadProjectContext(); const prompt = context.length > 0 ? `=== PROJECT CONTEXT (for grounding facts; do NOT extract facts from this section) ===\n${context}\n\n=== CONTENT TO EXTRACT FACTS FROM ===\n${source}` : source; let resp: Response; try { resp = await fetch(`${LLM_TEAM}/api/run`, { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ mode: "extract", prompt, extractor: EXTRACTOR, verifier: VERIFIER, source: "prompt", skip_cache: true, // cache by prompt would dedup identical // scratchpads, but we want fresh extraction // for per-audit facts; cheap since local. }), signal: AbortSignal.timeout(EXTRACT_TIMEOUT_MS), }); } catch (e) { return { ...base, error: `fetch failed: ${(e as Error).message}` }; } if (!resp.ok) { const body = await resp.text().catch(() => ""); return { ...base, error: `llm_team /api/run ${resp.status}: ${body.slice(0, 200)}` }; } // Stream SSE lines; collect the one extraction response + the run_saved event // so we can capture the team-runs ID for cross-ref. const decoder = new TextDecoder(); const reader = resp.body?.getReader(); if (!reader) return { ...base, error: "no response body" }; let buffer = ""; let extractionText = ""; let verifierText = ""; let runId: number | undefined = undefined; try { while (true) { const { done, value } = await reader.read(); if (done) break; buffer += decoder.decode(value, { stream: true }); let nl: number; while ((nl = buffer.indexOf("\n\n")) >= 0) { const chunk = buffer.slice(0, nl); buffer = buffer.slice(nl + 2); const dataLine = chunk.split("\n").find(l => l.startsWith("data: ")); if (!dataLine) continue; try { const ev = JSON.parse(dataLine.slice(6)); if (ev.type === "response") { const role = String(ev.role ?? ""); if (role.startsWith("extraction")) extractionText = String(ev.text ?? ""); else if (role === "verifier") verifierText = String(ev.text ?? 
""); } else if (ev.type === "run_saved") { const id = Number(ev.run_id); if (Number.isFinite(id)) runId = id; } } catch { /* skip malformed SSE */ } } } } catch (e) { return { ...base, error: `SSE read failed: ${(e as Error).message}` }; } // Pull the JSON object out of extractionText (may be wrapped in ```json fences). const parsed = extractFirstJsonObject(extractionText); if (!parsed) { return { ...base, error: "extractor returned no parseable JSON", verification: verifierText }; } const rawFacts: string[] = Array.isArray(parsed.facts) ? parsed.facts.slice(0, 50).map(String) : []; // Parse the verifier's free-form prose into per-fact verdicts, then // drop any fact the verifier explicitly marked INCORRECT. Leave // UNVERIFIABLE in place: many of our extractions are domain-specific // (Lakehouse internals) and the verifier has no prior-knowledge // anchor, so UNVERIFIABLE is the expected verdict for new signal, // not a quality fail. This is verifier-gated persistence: drop only // what's affirmatively wrong, not what's novel. const verdicts = parseVerifierVerdicts(verifierText, rawFacts.length); const incorrectIdx = new Set(); verdicts.forEach((v, i) => { if (v === "INCORRECT") incorrectIdx.add(i); }); const kept = rawFacts.filter((_, i) => !incorrectIdx.has(i)); return { ...base, facts: kept, entities: Array.isArray(parsed.entities) ? parsed.entities.slice(0, 30).map((e: any) => ({ name: String(e?.name ?? ""), type: String(e?.type ?? ""), description: typeof e?.description === "string" ? e.description.slice(0, 240) : undefined, })).filter(e => e.name.length > 0) : [], relationships: Array.isArray(parsed.relationships) ? parsed.relationships.slice(0, 30).map((r: any) => ({ from: String(r?.from ?? ""), to: String(r?.to ?? ""), type: String(r?.type ?? 
""), })).filter(r => r.from.length > 0 && r.to.length > 0) : [], verification: verifierText.slice(0, 1500), facts_dropped_by_verifier: incorrectIdx.size, verifier_verdicts: verdicts, llm_team_run_id: runId, }; } // Parse verifier's free-form output into a per-fact verdict array. // Gemma2 uses several formats depending on prompt mood: // Format A: **1.** claim... * **Verdict:** CORRECT // Format B: **1.** claim... * **CORRECT** (no "Verdict:" label) // Format C: 1. claim... CORRECT // Strategy: split on fact numbers, then find the first // CORRECT|INCORRECT|UNVERIFIABLE token in each section. Handles all // three formats without regex gymnastics. function parseVerifierVerdicts( verifierText: string, numFacts: number, ): Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED"> { const out: Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED"> = Array(numFacts).fill("UNCHECKED"); if (!verifierText) return out; // Find each fact section start — "**N.**" or "N." at line start — // and slice out the content up to the NEXT fact number. Each section // gets scanned for the first CORRECT/INCORRECT/UNVERIFIABLE token. const starts: Array<{ idx: number; pos: number }> = []; const header = /(?:^|\n)\s*(?:\*\*)?(\d+)[.)]/g; for (const m of verifierText.matchAll(header)) { const factNum = Number(m[1]); if (!Number.isFinite(factNum)) continue; starts.push({ idx: factNum - 1, pos: m.index! }); } for (let i = 0; i < starts.length; i++) { const s = starts[i]; const end = i + 1 < starts.length ? starts[i + 1].pos : verifierText.length; if (s.idx < 0 || s.idx >= numFacts) continue; const section = verifierText.slice(s.pos, end); const v = section.match(/\b(CORRECT|INCORRECT|UNVERIFIABLE)\b/i); if (v) out[s.idx] = v[1].toUpperCase() as "CORRECT" | "INCORRECT" | "UNVERIFIABLE"; } return out; } // Lift the first balanced JSON object out of (possibly fenced) text. // Same discipline as inference.ts::extractJson. 
// Scan for the first balanced top-level JSON object and parse it.
// String-aware: braces inside JSON string literals (e.g. {"a": "}"})
// are data, not structure — the original naive depth counter broke on
// them. A candidate that fails JSON.parse is discarded and scanning
// continues, so prose braces before the real object are tolerated.
function extractFirstJsonObject(text: string): any | null {
  // Drop a leading ``` / ```json fence and a trailing ``` fence, if any.
  const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, "");
  let depth = 0;
  let start = -1;
  let inString = false; // inside a JSON string literal of the current candidate
  let escaped = false;  // previous char was a backslash inside that string
  for (let i = 0; i < cleaned.length; i++) {
    const c = cleaned[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (c === "\\") escaped = true;
      else if (c === '"') inString = false;
      continue;
    }
    // Only treat quotes as string delimiters inside a candidate object;
    // at depth 0 a quote is just surrounding prose.
    if (c === '"' && depth > 0) {
      inString = true;
    } else if (c === "{") {
      if (depth === 0) start = i;
      depth++;
    } else if (c === "}" && depth > 0) { // stray closers at depth 0 are ignored
      depth--;
      if (depth === 0 && start >= 0) {
        try {
          return JSON.parse(cleaned.slice(start, i + 1));
        } catch {
          start = -1; // balanced but not valid JSON — keep scanning
        }
      }
    }
  }
  return null;
}