// fact_extractor — routes curated TEXT through llm_team_ui's // "knowledge extract facts" mode (mode=extract at /api/run). // // What it gives us: structured {facts, entities, relationships} from // whatever curated blob we send. Auditor sends the tree-split // inference scratchpad (the best distillation of what a PR changed). // Scrum_master will later send its accepted review bodies. // // Why route through llm_team and not just extract directly from our // own checks: llm_team's extract uses a local EXTRACTOR model // (qwen2.5) + a separate VERIFIER (gemma2). This cross-check is the // discipline J wants for knowledge going into the playbook — facts // go in only after a second model has rated them CORRECT / // UNVERIFIABLE. Fast (local models, ~10-20s), free, and matches the // codereview pattern J already trusts. // // SSE parsing: llm_team streams SSE events. We're only interested in // the final "response" event with role="final" + the extraction // response (role="extraction N"). Parse the JSON from the extractor's // response text. const LLM_TEAM = process.env.LH_LLM_TEAM_URL ?? "http://localhost:5000"; const EXTRACTOR = process.env.LH_FACT_EXTRACTOR ?? "qwen2.5:latest"; const VERIFIER = process.env.LH_FACT_VERIFIER ?? "gemma2:latest"; const EXTRACT_TIMEOUT_MS = 120_000; export interface Entity { name: string; type: string; description?: string; } export interface Relationship { from: string; to: string; type: string; } export interface ExtractedFacts { facts: string[]; entities: Entity[]; relationships: Relationship[]; verification: string; extractor_model: string; verifier_model: string; source_preview: string; // Populated when the extract run completed server-side (llm_team // persists to its own team_runs; this is for our own cross-ref). llm_team_run_id?: number; extracted_at: string; // Per-fact verdicts from the verifier pass (CORRECT/INCORRECT/ // UNVERIFIABLE/UNCHECKED). 
Aligned 1:1 with the *raw* fact list // pre-drop so operators can see which verdicts mapped to dropped // facts if needed. verifier_verdicts?: Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED">; facts_dropped_by_verifier?: number; error?: string; } /** * Run the llm_team extract pipeline on `source` text. Returns * structured {facts, entities, relationships}. * * Returns an object with `error` set if the pipeline failed — never * throws, because fact extraction is best-effort enrichment (the * primary audit must not break if llm_team is down). */ export async function extractFacts(source: string): Promise { const base: ExtractedFacts = { facts: [], entities: [], relationships: [], verification: "", extractor_model: EXTRACTOR, verifier_model: VERIFIER, source_preview: source.slice(0, 240), extracted_at: new Date().toISOString(), }; let resp: Response; try { resp = await fetch(`${LLM_TEAM}/api/run`, { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ mode: "extract", prompt: source, extractor: EXTRACTOR, verifier: VERIFIER, source: "prompt", skip_cache: true, // cache by prompt would dedup identical // scratchpads, but we want fresh extraction // for per-audit facts; cheap since local. }), signal: AbortSignal.timeout(EXTRACT_TIMEOUT_MS), }); } catch (e) { return { ...base, error: `fetch failed: ${(e as Error).message}` }; } if (!resp.ok) { const body = await resp.text().catch(() => ""); return { ...base, error: `llm_team /api/run ${resp.status}: ${body.slice(0, 200)}` }; } // Stream SSE lines; collect the one extraction response + the run_saved event // so we can capture the team-runs ID for cross-ref. 
const decoder = new TextDecoder(); const reader = resp.body?.getReader(); if (!reader) return { ...base, error: "no response body" }; let buffer = ""; let extractionText = ""; let verifierText = ""; let runId: number | undefined = undefined; try { while (true) { const { done, value } = await reader.read(); if (done) break; buffer += decoder.decode(value, { stream: true }); let nl: number; while ((nl = buffer.indexOf("\n\n")) >= 0) { const chunk = buffer.slice(0, nl); buffer = buffer.slice(nl + 2); const dataLine = chunk.split("\n").find(l => l.startsWith("data: ")); if (!dataLine) continue; try { const ev = JSON.parse(dataLine.slice(6)); if (ev.type === "response") { const role = String(ev.role ?? ""); if (role.startsWith("extraction")) extractionText = String(ev.text ?? ""); else if (role === "verifier") verifierText = String(ev.text ?? ""); } else if (ev.type === "run_saved") { const id = Number(ev.run_id); if (Number.isFinite(id)) runId = id; } } catch { /* skip malformed SSE */ } } } } catch (e) { return { ...base, error: `SSE read failed: ${(e as Error).message}` }; } // Pull the JSON object out of extractionText (may be wrapped in ```json fences). const parsed = extractFirstJsonObject(extractionText); if (!parsed) { return { ...base, error: "extractor returned no parseable JSON", verification: verifierText }; } const rawFacts: string[] = Array.isArray(parsed.facts) ? parsed.facts.slice(0, 50).map(String) : []; // Parse the verifier's free-form prose into per-fact verdicts, then // drop any fact the verifier explicitly marked INCORRECT. Leave // UNVERIFIABLE in place: many of our extractions are domain-specific // (Lakehouse internals) and the verifier has no prior-knowledge // anchor, so UNVERIFIABLE is the expected verdict for new signal, // not a quality fail. This is verifier-gated persistence: drop only // what's affirmatively wrong, not what's novel. 
const verdicts = parseVerifierVerdicts(verifierText, rawFacts.length); const incorrectIdx = new Set(); verdicts.forEach((v, i) => { if (v === "INCORRECT") incorrectIdx.add(i); }); const kept = rawFacts.filter((_, i) => !incorrectIdx.has(i)); return { ...base, facts: kept, entities: Array.isArray(parsed.entities) ? parsed.entities.slice(0, 30).map((e: any) => ({ name: String(e?.name ?? ""), type: String(e?.type ?? ""), description: typeof e?.description === "string" ? e.description.slice(0, 240) : undefined, })).filter(e => e.name.length > 0) : [], relationships: Array.isArray(parsed.relationships) ? parsed.relationships.slice(0, 30).map((r: any) => ({ from: String(r?.from ?? ""), to: String(r?.to ?? ""), type: String(r?.type ?? ""), })).filter(r => r.from.length > 0 && r.to.length > 0) : [], verification: verifierText.slice(0, 1500), facts_dropped_by_verifier: incorrectIdx.size, verifier_verdicts: verdicts, llm_team_run_id: runId, }; } // Parse verifier's free-form output into a per-fact verdict array. // The verifier output typically looks like: // **1.** The claim... // * **Verdict:** CORRECT // **2.** ... // **Verdict:** UNVERIFIABLE // Using matchAll to iterate — returns a verdict array of length // numFacts; unmatched positions stay UNCHECKED. function parseVerifierVerdicts( verifierText: string, numFacts: number, ): Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED"> { const out: Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED"> = Array(numFacts).fill("UNCHECKED"); const re = /(?:\*\*|#+\s*)?(\d+)[.):]\s[\s\S]*?\bVerdict\s*:\s*\*?\*?\s*(CORRECT|INCORRECT|UNVERIFIABLE)/gi; for (const m of verifierText.matchAll(re)) { const idx = Number(m[1]) - 1; if (idx >= 0 && idx < numFacts) { out[idx] = m[2].toUpperCase() as "CORRECT" | "INCORRECT" | "UNVERIFIABLE"; } } return out; } // Lift the first balanced JSON object out of (possibly fenced) text. // Same discipline as inference.ts::extractJson. 
/**
 * Lift the first balanced JSON object out of (possibly fenced) text
 * and parse it; returns null when no parseable object is found.
 *
 * The depth scan is string-aware: braces inside JSON string literals
 * (e.g. {"a": "}"}) do not affect nesting depth, and backslash
 * escapes inside strings (\" and \\) are honored. A stray closing
 * brace outside any object resets depth to 0 so later objects can
 * still be found.
 */
function extractFirstJsonObject(text: string): any | null {
  const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, "");
  let depth = 0;
  let start = -1;
  let inString = false;
  let escaped = false;
  for (let i = 0; i < cleaned.length; i++) {
    const c = cleaned[i];
    if (inString) {
      // Inside a JSON string literal: consume chars, tracking escapes
      // so \" does not terminate the string and braces are ignored.
      if (escaped) escaped = false;
      else if (c === "\\") escaped = true;
      else if (c === '"') inString = false;
      continue;
    }
    if (c === '"' && depth > 0) {
      // Only treat quotes as string delimiters inside an object —
      // surrounding prose may contain unbalanced quotes.
      inString = true;
    } else if (c === "{") {
      if (depth === 0) start = i;
      depth++;
    } else if (c === "}") {
      depth--;
      if (depth === 0 && start >= 0) {
        try {
          return JSON.parse(cleaned.slice(start, i + 1));
        } catch {
          start = -1; // not valid JSON after all; keep scanning
        }
      }
      if (depth < 0) depth = 0; // stray closer in prose — recover
    }
  }
  return null;
}