From 77650c4ba34a1efc6be9de94990ff82a63fea3cf Mon Sep 17 00:00:00 2001
From: profit
Date: Wed, 22 Apr 2026 23:09:14 -0500
Subject: [PATCH] auditor: inference curation layer + llm_team fact
 extraction → KB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the cycle J asked for: curated cloud output lands structured
knowledge in the KB, so future audits have architectural context, not
just a log of per-finding signatures.

Three pieces:

1. Inference curation (tree-split) — when the diff exceeds 30KB, shard
   it at 4.5KB, summarize each shard via cloud (temp=0; think=false on
   the small shards, think=true on the main call), and merge the
   summaries into a scratchpad. The cloud verification then runs
   against the scratchpad, not a truncated raw diff. This eliminates
   the 40KB MAX_DIFF_CHARS truncation path for large PRs (PR #8 is
   102KB — we were losing 62KB). Anti-false-positive guard in the
   prompt: the cloud is told that absence from the scratchpad is NOT
   absence from the diff, so it doesn't flag curated-out symbols as
   missing. The unflagged_gaps section is dropped entirely when
   curated (the scratchpad can't ground them).

2. fact_extractor — TS client for llm_team_ui's extract-facts mode at
   localhost:5000/api/run. Sends the curated scratchpad through the
   qwen2.5 extractor + gemma2 verifier, parses the SSE stream, and
   returns structured {facts, entities, relationships, verification,
   llm_team_run_id}. Best-effort: if llm_team is down, extraction
   fails silently and the audit still completes. AWAITED so CLI tools
   (audit_one.ts) don't exit before extraction lands — the systemd
   poller has 90s of headroom, so the extra ~15s doesn't matter.

3. audit_facts.jsonl + checkAuditFacts() — one row per curated audit
   with the extraction result. kb_query tails the jsonl, explodes
   entity rows, aggregates by entity name with distinct-PR counting,
   and surfaces entities recurring in 2+ PRs as info findings. Filters
   out short names (<3 chars, extractor truncation artifacts) and
   generic types (string/number/etc.) so signal isn't drowned.

Verified end-to-end on PR #8: 102KB diff → 23 shards → 1KB scratchpad →
qwen2.5 extracted 4 facts + 6 entities + 6 relationships (real
code-level knowledge: the AggregateOptions type, the aggregate async
function with its real signature, typed relationships).
llm_team_run_id cross-references llm_team's own team_runs table.

Also: audit.ts passes (pr_number, head_sha) as InferenceContext so
extracted facts are scope-tagged for the KB index.
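
audit_facts.jsonl row shape (illustrative; field names match
extractAndPersistFacts, the SHA, run id, and values are stand-ins):

  {"pr_number":8,"head_sha":"<sha>","extracted_at":"<iso8601>",
   "extractor":"qwen2.5:latest","verifier":"gemma2:latest",
   "llm_team_run_id":123,"facts":["..."],
   "entities":[{"name":"AggregateOptions","type":"type","description":"..."}],
   "relationships":[{"from":"aggregate","to":"AggregateOptions","type":"uses"}],
   "verification_preview":"..."}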
---
 auditor/audit.ts            |   2 +-
 auditor/checks/inference.ts | 220 ++++++++++++++++++++++++++++++++++--
 auditor/checks/kb_query.ts  | 102 +++++++++++++++++
 auditor/fact_extractor.ts   | 183 ++++++++++++++++++++++++++++++
 4 files changed, 499 insertions(+), 8 deletions(-)
 create mode 100644 auditor/fact_extractor.ts

diff --git a/auditor/audit.ts b/auditor/audit.ts
index 18bf732..91d23fc 100644
--- a/auditor/audit.ts
+++ b/auditor/audit.ts
@@ -56,7 +56,7 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
   const [staticFindings, dynamicFindings, inferenceFindings, kbFindings] = await Promise.all([
     runStaticCheck(diff),
     opts.skip_dynamic ? Promise.resolve(stubFinding("dynamic", "skipped by options")) : runDynamicCheck(),
-    opts.skip_inference ? Promise.resolve(stubFinding("inference", "skipped by options")) : runInferenceCheck(claims, diff),
+    opts.skip_inference ? Promise.resolve(stubFinding("inference", "skipped by options")) : runInferenceCheck(claims, diff, { pr_number: pr.number, head_sha: pr.head_sha }),
     runKbCheck(claims, pr.files.map(f => f.path)),
   ]);
 
diff --git a/auditor/checks/inference.ts b/auditor/checks/inference.ts
index c6c3dbf..488be8f 100644
--- a/auditor/checks/inference.ts
+++ b/auditor/checks/inference.ts
@@ -14,7 +14,8 @@ import type { Claim, Finding } from "../types.ts";
 import { Glob } from "bun";
-import { readFile } from "node:fs/promises";
+import { readFile, mkdir, appendFile } from "node:fs/promises";
+import { extractFacts } from "../fact_extractor.ts";
 
 const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
 const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
@@ -22,11 +23,36 @@ const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
 // previously truncated at 15KB causing the reviewer to miss later
 // files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
 // block finding when the file was simply outside the truncation window.
+//
+// Above this threshold we curate via tree-split rather than truncate,
+// following the scrum_master pattern: shard the diff, summarize each
+// shard against the claim-verification task, merge into a compact
+// scratchpad, then ask the cloud to verify claims against the
+// scratchpad. This gives the cloud full-PR fidelity without bursting
+// its context window (observed failure mode: empty response or
+// unparseable output when prompt exceeds model's comfortable range).
 const MAX_DIFF_CHARS = 40000;
+// Tree-split kicks in above this. 30KB is below MAX_DIFF_CHARS so we
+// curate BEFORE truncation would happen — never lose signal to a hard
+// cut. Shard size is chosen so ~23 shards cover a PR #8-size diff in a
+// reasonable round-trip budget.
+const CURATION_THRESHOLD = 30000;
+const DIFF_SHARD_SIZE = 4500;
 const CALL_TIMEOUT_MS = 120_000;
 const REPO_ROOT = "/home/profit/lakehouse";
 
-export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
+export interface InferenceContext {
+  pr_number: number;
+  head_sha: string;
+}
+
+const AUDIT_FACTS_JSONL = "/home/profit/lakehouse/data/_kb/audit_facts.jsonl";
+
+export async function runInferenceCheck(
+  claims: Claim[],
+  diff: string,
+  ctx?: InferenceContext,
+): Promise<Finding[]> {
   if (claims.length === 0) {
     return [{
       check: "inference",
@@ -51,9 +77,26 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
     }];
   }
 
-  const truncated = diff.length > MAX_DIFF_CHARS
-    ? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
-    : diff;
+  // Diff source for the cloud prompt — either the raw diff (small
+  // enough to fit), or a tree-split scratchpad (curation layer). We
+  // prefer curation to truncation: truncation silently drops files
+  // past the window; curation summarizes them so the cloud still sees
+  // what changed, just densified.
+  let diffForPrompt: string;
+  let curationNote = "";
+  if (diff.length > CURATION_THRESHOLD) {
+    const ts = await treeSplitDiff(diff, verifiable);
+    diffForPrompt = ts.scratchpad;
+    curationNote = ` (curated: ${diff.length} chars → ${ts.shards} shards → scratchpad ${ts.scratchpad.length} chars)`;
+  } else {
+    diffForPrompt = diff;
+  }
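+  // Worked example (numbers from the PR #8 validation run; char counts
+  // are rounded): a 102KB diff clears CURATION_THRESHOLD, shards into
+  // 23 pieces of at most 4,500 chars, and merges back into a ~1KB
+  // scratchpad, so curationNote reads roughly:
+  //   " (curated: 102000 chars → 23 shards → scratchpad 1000 chars)"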
+  // Belt-and-suspenders truncation — even a tree-split scratchpad
+  // shouldn't exceed MAX_DIFF_CHARS in practice, but guard anyway so
+  // pathological inputs can't burst the prompt.
+  const truncated = diffForPrompt.length > MAX_DIFF_CHARS
+    ? diffForPrompt.slice(0, MAX_DIFF_CHARS) + `\n...[${diffForPrompt.length - MAX_DIFF_CHARS} more chars truncated]`
+    : diffForPrompt;
 
   // Build the reviewer prompt in the same shape as run_codereview's
@@ -61,6 +104,30 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
   // review stage (llm_team_ui.py:10950), adapted for claim verification:
   // "Code: ..."
   // "Review: bugs/security/perf/style/edge. Provide corrected code."
   // We add: claim list upfront + ask for structured JSON verdict.
+  //
+  // When the diff was curated (tree-split scratchpad), we add an
+  // explicit anti-false-positive instruction: the scratchpad is a
+  // distillation, not the full source, so absence-from-scratchpad is
+  // NOT evidence of absence-from-diff. Mirrors the fix we made in
+  // scrum_master's review prompt for the same class of error.
+  const isCurated = curationNote.length > 0;
+  const curationGuard = isCurated
+    ? [
+      "",
+      "CRITICAL: the 'Diff' below is a curated multi-shard scratchpad,",
+      "NOT the full raw diff. The scratchpad distills each shard down",
+      "to facts useful for claim verification and drops the rest.",
+      "DO NOT flag a function/field/feature as 'missing' or 'not",
+      "implemented' based solely on its absence from the scratchpad —",
+      "absence in a distillation is NOT evidence of absence in the",
+      "actual diff. Only judge a claim NOT BACKED when the scratchpad",
+      "DIRECTLY contradicts it (e.g. scratchpad shows the function was",
+      "added empty, or shows the claimed code path is a stub).",
+      "Skip the unflagged_gaps section entirely when operating on a",
+      "curated scratchpad — you can't reliably detect gaps from a",
+      "distillation, and false positives there are worse than misses.",
+    ].join("\n")
+    : "";
   const systemMsg = [
     "You review pull-request diffs against the author's own ship-claims.",
     "For each claim, decide: is it backed by actual code in the diff, or is",
@@ -74,6 +141,7 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
     " - the claim claims integration but the integration point is a stub",
     " - the diff contains unimplemented!() / todo!() / TODO comments",
     " - the claim says 'works end-to-end' but the diff has no end-to-end test",
+    curationGuard,
     "",
     "Respond with strict JSON only. No prose before or after. Shape:",
     "{",
@@ -174,7 +242,7 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
     findings.push({
       check: "inference",
       severity: "info",
-      summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})`,
+      summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})${curationNote}`,
       evidence: [
         `claim_verdicts: ${parsed.claim_verdicts?.length ?? 0}, unflagged_gaps: ${parsed.unflagged_gaps?.length ?? 0}`,
       ],
@@ -204,7 +272,29 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
     }
   }
 
-  for (const g of parsed.unflagged_gaps ?? []) {
+  // Route the curated scratchpad through llm_team's extract-facts
+  // pipeline when we have (a) a curated scratchpad (best signal about
+  // what the PR actually changed) and (b) PR context to scope facts.
+  // AWAITED (not fire-and-forget) so CLI callers like audit_one.ts
+  // don't exit before extraction lands; the systemd poller has plenty
+  // of headroom (90s cycle vs ~15s extraction). A failure inside
+  // extractAndPersistFacts is caught + logged but never throws.
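+  // LH_AUDITOR_SKIP_EXTRACT=1 is the manual kill-switch (assumed use:
+  // keep audits flowing while llm_team is down for a redeploy, without
+  // paying the extraction timeout on every curated PR).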
+  if (isCurated && ctx && process.env.LH_AUDITOR_SKIP_EXTRACT !== "1") {
+    try {
+      await extractAndPersistFacts(diffForPrompt, ctx);
+    } catch (e) {
+      console.error(`[inference] fact extraction failed: ${(e as Error).message}`);
+    }
+  }
+
+  // Belt-and-suspenders: when operating on a curated scratchpad, drop
+  // the unflagged_gaps section entirely. The distillation can't
+  // reliably ground gap-detection, and false positives are worse than
+  // misses for this signal class. The systemMsg already asks the
+  // cloud to skip this section when curated — but the model may still
+  // emit it, so we filter here too.
+  const gapsToEmit = isCurated ? [] : (parsed.unflagged_gaps ?? []);
+  for (const g of gapsToEmit) {
     const summary = String(g?.summary ?? "?");
     const location = String(g?.location ?? "?");
     // False-positive guard — when the cloud says "X not defined in this
@@ -248,6 +338,122 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
   return findings;
 }
 
+// Extract structured knowledge from the curated scratchpad and append
+// to data/_kb/audit_facts.jsonl — one row per extract run, keyed by
+// PR number + head SHA for scope tracking. kb_query tails this next
+// audit to surface recurring entities/relationships across PRs.
+async function extractAndPersistFacts(scratchpad: string, ctx: InferenceContext): Promise<void> {
+  const ex = await extractFacts(scratchpad);
+  if (ex.error && ex.entities.length === 0 && ex.facts.length === 0) {
+    // Full failure — log but don't write an empty row.
+    console.error(`[inference] extractFacts skipped row: ${ex.error}`);
+    return;
+  }
+  const row = {
+    pr_number: ctx.pr_number,
+    head_sha: ctx.head_sha,
+    extracted_at: ex.extracted_at,
+    extractor: ex.extractor_model,
+    verifier: ex.verifier_model,
+    llm_team_run_id: ex.llm_team_run_id ?? null,
+    facts: ex.facts,
+    entities: ex.entities,
+    relationships: ex.relationships,
+    verification_preview: ex.verification.slice(0, 400),
+  };
+  await mkdir("/home/profit/lakehouse/data/_kb", { recursive: true });
+  await appendFile(AUDIT_FACTS_JSONL, JSON.stringify(row) + "\n");
+}
+
+// Curation via tree-split — ports the scrum_master pattern into the
+// inference check. Shards the raw diff into DIFF_SHARD_SIZE chunks,
+// summarizes each shard *against the claim-verification task* so the
+// summary preserves exactly what the cloud needs to judge claims
+// (function signatures, struct fields, deletions, new files), drops
+// everything else. Merges into a compact scratchpad.
+//
+// Cost: N cloud calls for the shard summaries + 1 cloud call for the
+// final verification = N+1 calls instead of 1. Mitigation: shards run
+// serially (not parallel) to keep gateway load bounded; summary calls
+// use max_tokens=400 so they're fast (~2s each on gpt-oss:120b).
+//
+// Determinism: each shard summary call uses temp=0 + think=false (see
+// callCloud below), so identical input yields an identical scratchpad.
+// The final verification call then sees a stable scratchpad, giving
+// stable verdicts.
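+//
+// Scratchpad shape, illustrative only (the shard headers come from the
+// template in the loop below; the bullet content is invented):
+//
+//   --- shard 1 (chars 0..4500) ---
+//   - new file auditor/fact_extractor.ts; adds extractFacts(source)
+//   --- shard 2 (chars 4500..9000) ---
+//   - runInferenceCheck gains optional ctx?: InferenceContext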
+async function treeSplitDiff(
+  fullDiff: string,
+  claims: Claim[],
+): Promise<{ scratchpad: string; shards: number }> {
+  const shards: Array<{ from: number; to: number; text: string }> = [];
+  for (let i = 0; i < fullDiff.length; i += DIFF_SHARD_SIZE) {
+    const end = Math.min(i + DIFF_SHARD_SIZE, fullDiff.length);
+    shards.push({ from: i, to: end, text: fullDiff.slice(i, end) });
+  }
+  // Curate the claim list into a short form the summary prompt can
+  // use to bias extraction toward relevant facts.
+  const claimDigest = claims.map((c, i) =>
+    `${i}. [${c.strength}] "${c.text.slice(0, 100)}"`
+  ).join("\n");
+
+  let scratchpad = "";
+  for (const [si, shard] of shards.entries()) {
+    const prompt = [
+      `You are summarizing shard ${si + 1}/${shards.length} (chars ${shard.from}..${shard.to}) of a PR diff.`,
+      `The downstream task will verify these ship-claims against the full-PR summary. Extract ONLY facts that could confirm or refute these claims:`,
+      "",
+      claimDigest,
+      "",
+      "Extract: new function/method signatures, struct fields, deletions, new files, wiring (function X calls Y), absence-of-implementation markers, TODO comments on added lines.",
+      "Skip: comment-only edits, whitespace, import reordering, unrelated cosmetic changes.",
+      "",
+      "─────── shard diff ───────",
+      shard.text,
+      "─────── end shard ───────",
+      "",
+      "Output: up to 180 words of facts in bullet form. No prose preamble, no claim verdicts (that's for the downstream step).",
+    ].join("\n");
+
+    const r = await callCloud(prompt, 400);
+    if (r.content) {
+      scratchpad += `\n--- shard ${si + 1} (chars ${shard.from}..${shard.to}) ---\n${r.content.trim()}\n`;
+    }
+  }
+  return { scratchpad: scratchpad.trim(), shards: shards.length };
+}
+
+// Minimal cloud caller used only by treeSplitDiff — same gateway +
+// model as the top-level call, but think=false. Shards are small
+// (≤DIFF_SHARD_SIZE ~4500 chars) and the task is pure fact
+// extraction, not reasoning. think=true on the shards introduced
+// variance in reasoning traces that compounded across 23 calls into
+// a non-deterministic scratchpad (observed during curation
+// validation: same-SHA runs produced 5/7/8 final findings).
+// think=false is stable on small prompts and only degrades at
+// main-call scale (10K+ char prompts), so the main verification
+// call keeps think=true.
+async function callCloud(prompt: string, maxTokens: number): Promise<{ content: string }> {
+  try {
+    const r = await fetch(`${GATEWAY}/v1/chat`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        provider: "ollama_cloud",
+        model: MODEL,
+        messages: [{ role: "user", content: prompt }],
+        max_tokens: maxTokens,
+        temperature: 0,
+        think: false,
+      }),
+      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
+    });
+    if (!r.ok) return { content: "" };
+    const j: any = await r.json();
+    return { content: j?.choices?.[0]?.message?.content ?? "" };
+  } catch {
+    return { content: "" };
+  }
+}
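+
+// The gateway reply is assumed to be an OpenAI-style chat completion;
+// callCloud only ever reads choices[0].message.content, e.g.
+// (illustrative): {"choices":[{"message":{"content":"- adds treeSplitDiff"}}]}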
 
 // Pull out plausible code-symbol names from a summary string.
 // Matches:
 //  - identifier with backticks: `foo_bar`
diff --git a/auditor/checks/kb_query.ts b/auditor/checks/kb_query.ts
index 8daa410..475a7a8 100644
--- a/auditor/checks/kb_query.ts
+++ b/auditor/checks/kb_query.ts
@@ -25,6 +25,7 @@ const OBSERVER_OPS = "/home/profit/lakehouse/data/_observer/ops.jsonl";
 const BOT_CYCLES_DIR = "/home/profit/lakehouse/data/_bot/cycles";
 const SCRUM_REVIEWS_JSONL = "/home/profit/lakehouse/data/_kb/scrum_reviews.jsonl";
 const AUDIT_LESSONS_JSONL = "/home/profit/lakehouse/data/_kb/audit_lessons.jsonl";
+const AUDIT_FACTS_JSONL = "/home/profit/lakehouse/data/_kb/audit_facts.jsonl";
 const TAIL_LINES = 500;
 const MAX_BOT_CYCLE_FILES = 30;
 
@@ -61,6 +62,14 @@ export async function runKbCheck(claims: Claim[], prFiles: string[] = []): Promi
     findings.push(...scrumFindings);
   }
 
+  // 6b. Audit-facts (llm_team extract pipeline output) — surface
+  //     entities that recur across multiple PRs. These are the
+  //     "core system entities" accumulating in the knowledge base;
+  //     showing them as info on future audits gives reviewers
+  //     architectural context the raw diff doesn't convey.
+  const factFindings = await checkAuditFacts();
+  findings.push(...factFindings);
+
   // 6. Audit-lessons feedback loop — summarize the top recurring
   //    patterns from prior audits' block/warn findings. If the same
   //    pattern signature has fired 3+ times across prior audits,
@@ -207,6 +216,99 @@ function observerBySource(ops: any[]): string {
   return Object.entries(c).sort((a, b) => b[1] - a[1]).map(([k, v]) => `${k}=${v}`).join(", ") || "empty";
 }
 
+// Audit-facts — reads data/_kb/audit_facts.jsonl (populated by every
+// curated inference run via llm_team's extract pipeline). Each row
+// has arrays: facts, entities, relationships. We explode entities and
+// aggregate them across PRs by hand (kb_index's aggregate() is
+// one-signature-per-row, so it can't do the explode step). An entity
+// seen in 2+ PRs is a "core system entity" — we surface the top N as
+// info context.
+//
+// Filters out short names (<3 chars, likely qwen2.5 truncation
+// artifacts) and generic types ("string", "number") that would
+// otherwise dominate the ranking.
+const ENTITY_NAME_MIN_LEN = 3;
+const GENERIC_ENTITY_NAMES = new Set([
+  "string", "number", "boolean", "any", "void", "unknown", "never",
+  "object", "array", "function", "const", "let", "var", "true", "false",
+  "null", "undefined", "promise", "map", "set", "record",
+]);
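+
+// Illustrative finding from this check (shape matches the ranked.map()
+// in checkAuditFacts below; the entity name and counts are stand-ins):
+//   summary:  core entity `AggregateOptions` recurs in 2 PRs (types: type)
+//   evidence: count=3 distinct_PRs=2, PRs: 8,12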
"?"), + name, + description: String(e?.description ?? "").slice(0, 160), + }); + } + } + if (entityRows.length === 0) return []; + + // Aggregate manually — one key per entity name, distinct_scopes by PR. + type Agg = { count: number; scopes: Set; types: Set; last_name: string; last_desc: string }; + const byEntity = new Map(); + for (const r of entityRows) { + const a = byEntity.get(r.entity_key) ?? { + count: 0, scopes: new Set(), types: new Set(), last_name: "", last_desc: "", + }; + a.count += 1; + a.scopes.add(r.pr_number); + a.types.add(r.type); + a.last_name = r.name; + a.last_desc = r.description; + byEntity.set(r.entity_key, a); + } + + // Rank: require 2+ distinct PRs (same-PR entity-repeats don't count + // as "cross-cutting"). Take the top 5 to avoid flooding the verdict. + const ranked = Array.from(byEntity.entries()) + .filter(([_, a]) => a.scopes.size >= 2) + .sort((a, b) => b[1].scopes.size - a[1].scopes.size || b[1].count - a[1].count) + .slice(0, 5); + + if (ranked.length === 0) { + // Useful to know the KB is being populated — emit a single + // summary so operators see fact extraction is alive. + return [{ + check: "kb_query", + severity: "info", + summary: `audit_facts KB has ${entityRows.length} entity-observations across ${new Set(entityRows.map(r => r.pr_number)).size} PRs (no cross-PR recurrences yet)`, + evidence: [`source: ${AUDIT_FACTS_JSONL}`], + }]; + } + + return ranked.map(([_, a]) => ({ + check: "kb_query" as const, + severity: "info" as const, + summary: `core entity \`${a.last_name}\` recurs in ${a.scopes.size} PRs (types: ${Array.from(a.types).join(",")})`, + evidence: [ + `count=${a.count} distinct_PRs=${a.scopes.size}`, + `description: ${a.last_desc.slice(0, 200)}`, + `PRs: ${Array.from(a.scopes).sort((x, y) => x - y).join(",")}`, + ], + })); +} + // Audit-lessons — reads data/_kb/audit_lessons.jsonl (populated by // every audit's appendAuditLessons). Uses the shared kb_index // aggregator: groups by `signature`, distinct-scopes keyed by PR diff --git a/auditor/fact_extractor.ts b/auditor/fact_extractor.ts new file mode 100644 index 0000000..eb26710 --- /dev/null +++ b/auditor/fact_extractor.ts @@ -0,0 +1,183 @@ +// fact_extractor — routes curated TEXT through llm_team_ui's +// "knowledge extract facts" mode (mode=extract at /api/run). +// +// What it gives us: structured {facts, entities, relationships} from +// whatever curated blob we send. Auditor sends the tree-split +// inference scratchpad (the best distillation of what a PR changed). +// Scrum_master will later send its accepted review bodies. +// +// Why route through llm_team and not just extract directly from our +// own checks: llm_team's extract uses a local EXTRACTOR model +// (qwen2.5) + a separate VERIFIER (gemma2). This cross-check is the +// discipline J wants for knowledge going into the playbook — facts +// go in only after a second model has rated them CORRECT / +// UNVERIFIABLE. Fast (local models, ~10-20s), free, and matches the +// codereview pattern J already trusts. +// +// SSE parsing: llm_team streams SSE events. We're only interested in +// the final "response" event with role="final" + the extraction +// response (role="extraction N"). Parse the JSON from the extractor's +// response text. + +const LLM_TEAM = process.env.LH_LLM_TEAM_URL ?? "http://localhost:5000"; +const EXTRACTOR = process.env.LH_FACT_EXTRACTOR ?? "qwen2.5:latest"; +const VERIFIER = process.env.LH_FACT_VERIFIER ?? 
"gemma2:latest"; +const EXTRACT_TIMEOUT_MS = 120_000; + +export interface Entity { + name: string; + type: string; + description?: string; +} + +export interface Relationship { + from: string; + to: string; + type: string; +} + +export interface ExtractedFacts { + facts: string[]; + entities: Entity[]; + relationships: Relationship[]; + verification: string; + extractor_model: string; + verifier_model: string; + source_preview: string; + // Populated when the extract run completed server-side (llm_team + // persists to its own team_runs; this is for our own cross-ref). + llm_team_run_id?: number; + extracted_at: string; + error?: string; +} + +/** + * Run the llm_team extract pipeline on `source` text. Returns + * structured {facts, entities, relationships}. + * + * Returns an object with `error` set if the pipeline failed — never + * throws, because fact extraction is best-effort enrichment (the + * primary audit must not break if llm_team is down). + */ +export async function extractFacts(source: string): Promise { + const base: ExtractedFacts = { + facts: [], + entities: [], + relationships: [], + verification: "", + extractor_model: EXTRACTOR, + verifier_model: VERIFIER, + source_preview: source.slice(0, 240), + extracted_at: new Date().toISOString(), + }; + + let resp: Response; + try { + resp = await fetch(`${LLM_TEAM}/api/run`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + mode: "extract", + prompt: source, + extractor: EXTRACTOR, + verifier: VERIFIER, + source: "prompt", + skip_cache: true, // cache by prompt would dedup identical + // scratchpads, but we want fresh extraction + // for per-audit facts; cheap since local. + }), + signal: AbortSignal.timeout(EXTRACT_TIMEOUT_MS), + }); + } catch (e) { + return { ...base, error: `fetch failed: ${(e as Error).message}` }; + } + + if (!resp.ok) { + const body = await resp.text().catch(() => ""); + return { ...base, error: `llm_team /api/run ${resp.status}: ${body.slice(0, 200)}` }; + } + + // Stream SSE lines; collect the one extraction response + the run_saved event + // so we can capture the team-runs ID for cross-ref. + const decoder = new TextDecoder(); + const reader = resp.body?.getReader(); + if (!reader) return { ...base, error: "no response body" }; + + let buffer = ""; + let extractionText = ""; + let verifierText = ""; + let runId: number | undefined = undefined; + + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + buffer += decoder.decode(value, { stream: true }); + let nl: number; + while ((nl = buffer.indexOf("\n\n")) >= 0) { + const chunk = buffer.slice(0, nl); + buffer = buffer.slice(nl + 2); + const dataLine = chunk.split("\n").find(l => l.startsWith("data: ")); + if (!dataLine) continue; + try { + const ev = JSON.parse(dataLine.slice(6)); + if (ev.type === "response") { + const role = String(ev.role ?? ""); + if (role.startsWith("extraction")) extractionText = String(ev.text ?? ""); + else if (role === "verifier") verifierText = String(ev.text ?? ""); + } else if (ev.type === "run_saved") { + const id = Number(ev.run_id); + if (Number.isFinite(id)) runId = id; + } + } catch { /* skip malformed SSE */ } + } + } + } catch (e) { + return { ...base, error: `SSE read failed: ${(e as Error).message}` }; + } + + // Pull the JSON object out of extractionText (may be wrapped in ```json fences). 
+  const parsed = extractFirstJsonObject(extractionText);
+  if (!parsed) {
+    return { ...base, error: "extractor returned no parseable JSON", verification: verifierText };
+  }
+
+  return {
+    ...base,
+    facts: Array.isArray(parsed.facts) ? parsed.facts.slice(0, 50).map(String) : [],
+    entities: Array.isArray(parsed.entities)
+      ? parsed.entities.slice(0, 30).map((e: any) => ({
+          name: String(e?.name ?? ""),
+          type: String(e?.type ?? ""),
+          description: typeof e?.description === "string" ? e.description.slice(0, 240) : undefined,
+        })).filter(e => e.name.length > 0)
+      : [],
+    relationships: Array.isArray(parsed.relationships)
+      ? parsed.relationships.slice(0, 30).map((r: any) => ({
+          from: String(r?.from ?? ""),
+          to: String(r?.to ?? ""),
+          type: String(r?.type ?? ""),
+        })).filter(r => r.from.length > 0 && r.to.length > 0)
+      : [],
+    verification: verifierText.slice(0, 1500),
+    llm_team_run_id: runId,
+  };
+}
+
+// Lift the first balanced JSON object out of (possibly fenced) text.
+// Same discipline as inference.ts::extractJson.
+function extractFirstJsonObject(text: string): any | null {
+  const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, "");
+  let depth = 0, start = -1;
+  for (let i = 0; i < cleaned.length; i++) {
+    const c = cleaned[i];
+    if (c === "{") { if (depth === 0) start = i; depth++; }
+    else if (c === "}") {
+      depth--;
+      if (depth === 0 && start >= 0) {
+        try { return JSON.parse(cleaned.slice(start, i + 1)); } catch { start = -1; }
+      }
+    }
+  }
+  return null;
+}
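+
+// Usage sketch (hypothetical caller; inference.ts does the real wiring.
+// extractFacts never throws, so callers branch on .error):
+//   const ex = await extractFacts(scratchpad);
+//   if (!ex.error) console.log(ex.entities.map(e => e.name).join(", "));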