Some checks failed
lakehouse/auditor 8 warnings — see review
Two bundled changes. Both came out of J's observation that the
verifier was defaulting to UNVERIFIABLE on domain-specific facts
because it had no idea what Lakehouse was, which project's code it
was reading, or what framework the types belonged to.
1. Project context preamble. Added docs/AUDITOR_CONTEXT.md — a <400-
word concise description of the project (crates, services,
architecture phases, the auditor's role itself). fact_extractor
reads it once, caches it, prepends it to the extract prompt as a
"PROJECT CONTEXT (for grounding; do NOT extract from this)"
section. Both extractor and verifier now see this context, so
statements like "aggregate<T> returns Map<string, AggregateRow>"
get grounded as "this is a TypeScript function in the Lakehouse
auditor subsystem" and the verifier can reason about plausibility
instead of guessing.
2. Verifier-verdict parser fix. Gemma2's output format varies between
"**Verdict:** CORRECT" and just "* **CORRECT**" inline (observed
variance across runs). The old regex required "Verdict:" as a
label and missed the second format — causing all verdicts to
stay UNCHECKED. Replaced with a two-pass approach: find each
fact section start ("**N.**" or "N."), slice to the next section,
scan the slice for the first CORRECT|INCORRECT|UNVERIFIABLE
token. Handles both formats plus unfenced fallback.
Verified: 4-fact test extraction went from 0/4 verdicts scored
(pre-fix) to 2/4 CORRECT + 2/4 UNVERIFIABLE (post-fix). The 2
UNVERIFIABLE cases are domain-specific code behavior the verifier
legitimately can't confirm without reading source — correct stance,
not a parser miss.
No new consensus modes yet. J suggested adding codereview or
validator as a second pass; holding until we see whether context
injection alone gives sufficient signal lift.
272 lines · 10 KiB · TypeScript
// fact_extractor — routes curated TEXT through llm_team_ui's
// "knowledge extract facts" mode (mode=extract at /api/run).
//
// What it gives us: structured {facts, entities, relationships} from
// whatever curated blob we send. Auditor sends the tree-split
// inference scratchpad (the best distillation of what a PR changed).
// Scrum_master will later send its accepted review bodies.
//
// Why route through llm_team and not just extract directly from our
// own checks: llm_team's extract uses a local EXTRACTOR model
// (qwen2.5) + a separate VERIFIER (gemma2). This cross-check is the
// discipline J wants for knowledge going into the playbook — facts
// go in only after a second model has rated them CORRECT /
// UNVERIFIABLE. Fast (local models, ~10-20s), free, and matches the
// codereview pattern J already trusts.
//
// SSE parsing: llm_team streams SSE events. We're only interested in
// the final "response" event with role="final" + the extraction
// response (role="extraction N"). Parse the JSON from the extractor's
// response text.

const LLM_TEAM = process.env.LH_LLM_TEAM_URL ?? "http://localhost:5000";
|
|
const EXTRACTOR = process.env.LH_FACT_EXTRACTOR ?? "qwen2.5:latest";
|
|
const VERIFIER = process.env.LH_FACT_VERIFIER ?? "gemma2:latest";
|
|
const EXTRACT_TIMEOUT_MS = 120_000;
|
|
const PROJECT_CONTEXT_FILE = process.env.LH_AUDITOR_CONTEXT_FILE
|
|
?? "/home/profit/lakehouse/docs/AUDITOR_CONTEXT.md";
|
|
|
|
let cachedContext: string | null = null;
|
|
async function loadProjectContext(): Promise<string> {
|
|
if (cachedContext !== null) return cachedContext;
|
|
try {
|
|
const { readFile } = await import("node:fs/promises");
|
|
const raw = await readFile(PROJECT_CONTEXT_FILE, "utf8");
|
|
// Cap at 4KB — anything past that is more noise than signal for
|
|
// the extractor/verifier's attention budget.
|
|
cachedContext = raw.slice(0, 4000);
|
|
} catch {
|
|
cachedContext = ""; // context file missing → extractor runs without preamble
|
|
}
|
|
return cachedContext;
|
|
}
|
|
|
|
export interface Entity {
|
|
name: string;
|
|
type: string;
|
|
description?: string;
|
|
}
|
|
|
|
export interface Relationship {
|
|
from: string;
|
|
to: string;
|
|
type: string;
|
|
}
|
|
|
|
export interface ExtractedFacts {
|
|
facts: string[];
|
|
entities: Entity[];
|
|
relationships: Relationship[];
|
|
verification: string;
|
|
extractor_model: string;
|
|
verifier_model: string;
|
|
source_preview: string;
|
|
// Populated when the extract run completed server-side (llm_team
|
|
// persists to its own team_runs; this is for our own cross-ref).
|
|
llm_team_run_id?: number;
|
|
extracted_at: string;
|
|
// Per-fact verdicts from the verifier pass (CORRECT/INCORRECT/
|
|
// UNVERIFIABLE/UNCHECKED). Aligned 1:1 with the *raw* fact list
|
|
// pre-drop so operators can see which verdicts mapped to dropped
|
|
// facts if needed.
|
|
verifier_verdicts?: Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED">;
|
|
facts_dropped_by_verifier?: number;
|
|
error?: string;
|
|
}
|
|
|
|
/**
|
|
* Run the llm_team extract pipeline on `source` text. Returns
|
|
* structured {facts, entities, relationships}.
|
|
*
|
|
* Returns an object with `error` set if the pipeline failed — never
|
|
* throws, because fact extraction is best-effort enrichment (the
|
|
* primary audit must not break if llm_team is down).
|
|
*/
|
|
export async function extractFacts(source: string): Promise<ExtractedFacts> {
|
|
const base: ExtractedFacts = {
|
|
facts: [],
|
|
entities: [],
|
|
relationships: [],
|
|
verification: "",
|
|
extractor_model: EXTRACTOR,
|
|
verifier_model: VERIFIER,
|
|
source_preview: source.slice(0, 240),
|
|
extracted_at: new Date().toISOString(),
|
|
};
|
|
|
|
// Prepend project context to the source so the extractor + verifier
|
|
// know what codebase/framework these facts belong to. Without this,
|
|
// the verifier marks most domain-specific facts as UNVERIFIABLE ("I
|
|
// don't know what Lakehouse is"). With it, the verifier can CORRECT-
|
|
// stamp facts that align with the stated architecture.
|
|
const context = await loadProjectContext();
|
|
const prompt = context.length > 0
|
|
? `=== PROJECT CONTEXT (for grounding facts; do NOT extract facts from this section) ===\n${context}\n\n=== CONTENT TO EXTRACT FACTS FROM ===\n${source}`
|
|
: source;
|
|
|
|
let resp: Response;
|
|
try {
|
|
resp = await fetch(`${LLM_TEAM}/api/run`, {
|
|
method: "POST",
|
|
headers: { "content-type": "application/json" },
|
|
body: JSON.stringify({
|
|
mode: "extract",
|
|
prompt,
|
|
extractor: EXTRACTOR,
|
|
verifier: VERIFIER,
|
|
source: "prompt",
|
|
skip_cache: true, // cache by prompt would dedup identical
|
|
// scratchpads, but we want fresh extraction
|
|
// for per-audit facts; cheap since local.
|
|
}),
|
|
signal: AbortSignal.timeout(EXTRACT_TIMEOUT_MS),
|
|
});
|
|
} catch (e) {
|
|
return { ...base, error: `fetch failed: ${(e as Error).message}` };
|
|
}
|
|
|
|
if (!resp.ok) {
|
|
const body = await resp.text().catch(() => "");
|
|
return { ...base, error: `llm_team /api/run ${resp.status}: ${body.slice(0, 200)}` };
|
|
}
|
|
|
|
// Stream SSE lines; collect the one extraction response + the run_saved event
|
|
// so we can capture the team-runs ID for cross-ref.
|
|
const decoder = new TextDecoder();
|
|
const reader = resp.body?.getReader();
|
|
if (!reader) return { ...base, error: "no response body" };
|
|
|
|
let buffer = "";
|
|
let extractionText = "";
|
|
let verifierText = "";
|
|
let runId: number | undefined = undefined;
|
|
|
|
try {
|
|
while (true) {
|
|
const { done, value } = await reader.read();
|
|
if (done) break;
|
|
buffer += decoder.decode(value, { stream: true });
|
|
let nl: number;
|
|
while ((nl = buffer.indexOf("\n\n")) >= 0) {
|
|
const chunk = buffer.slice(0, nl);
|
|
buffer = buffer.slice(nl + 2);
|
|
const dataLine = chunk.split("\n").find(l => l.startsWith("data: "));
|
|
if (!dataLine) continue;
|
|
try {
|
|
const ev = JSON.parse(dataLine.slice(6));
|
|
if (ev.type === "response") {
|
|
const role = String(ev.role ?? "");
|
|
if (role.startsWith("extraction")) extractionText = String(ev.text ?? "");
|
|
else if (role === "verifier") verifierText = String(ev.text ?? "");
|
|
} else if (ev.type === "run_saved") {
|
|
const id = Number(ev.run_id);
|
|
if (Number.isFinite(id)) runId = id;
|
|
}
|
|
} catch { /* skip malformed SSE */ }
|
|
}
|
|
}
|
|
} catch (e) {
|
|
return { ...base, error: `SSE read failed: ${(e as Error).message}` };
|
|
}
|
|
|
|
// Pull the JSON object out of extractionText (may be wrapped in ```json fences).
|
|
const parsed = extractFirstJsonObject(extractionText);
|
|
if (!parsed) {
|
|
return { ...base, error: "extractor returned no parseable JSON", verification: verifierText };
|
|
}
|
|
|
|
const rawFacts: string[] = Array.isArray(parsed.facts)
|
|
? parsed.facts.slice(0, 50).map(String)
|
|
: [];
|
|
|
|
// Parse the verifier's free-form prose into per-fact verdicts, then
|
|
// drop any fact the verifier explicitly marked INCORRECT. Leave
|
|
// UNVERIFIABLE in place: many of our extractions are domain-specific
|
|
// (Lakehouse internals) and the verifier has no prior-knowledge
|
|
// anchor, so UNVERIFIABLE is the expected verdict for new signal,
|
|
// not a quality fail. This is verifier-gated persistence: drop only
|
|
// what's affirmatively wrong, not what's novel.
|
|
const verdicts = parseVerifierVerdicts(verifierText, rawFacts.length);
|
|
const incorrectIdx = new Set<number>();
|
|
verdicts.forEach((v, i) => { if (v === "INCORRECT") incorrectIdx.add(i); });
|
|
const kept = rawFacts.filter((_, i) => !incorrectIdx.has(i));
|
|
|
|
return {
|
|
...base,
|
|
facts: kept,
|
|
entities: Array.isArray(parsed.entities)
|
|
? parsed.entities.slice(0, 30).map((e: any) => ({
|
|
name: String(e?.name ?? ""),
|
|
type: String(e?.type ?? ""),
|
|
description: typeof e?.description === "string" ? e.description.slice(0, 240) : undefined,
|
|
})).filter(e => e.name.length > 0)
|
|
: [],
|
|
relationships: Array.isArray(parsed.relationships)
|
|
? parsed.relationships.slice(0, 30).map((r: any) => ({
|
|
from: String(r?.from ?? ""),
|
|
to: String(r?.to ?? ""),
|
|
type: String(r?.type ?? ""),
|
|
})).filter(r => r.from.length > 0 && r.to.length > 0)
|
|
: [],
|
|
verification: verifierText.slice(0, 1500),
|
|
facts_dropped_by_verifier: incorrectIdx.size,
|
|
verifier_verdicts: verdicts,
|
|
llm_team_run_id: runId,
|
|
};
|
|
}
|
|
|
|
// Parse verifier's free-form output into a per-fact verdict array.
|
|
// Gemma2 uses several formats depending on prompt mood:
|
|
// Format A: **1.** claim... * **Verdict:** CORRECT
|
|
// Format B: **1.** claim... * **CORRECT** (no "Verdict:" label)
|
|
// Format C: 1. claim... CORRECT
|
|
// Strategy: split on fact numbers, then find the first
|
|
// CORRECT|INCORRECT|UNVERIFIABLE token in each section. Handles all
|
|
// three formats without regex gymnastics.
|
|
function parseVerifierVerdicts(
|
|
verifierText: string,
|
|
numFacts: number,
|
|
): Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED"> {
|
|
const out: Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED"> =
|
|
Array(numFacts).fill("UNCHECKED");
|
|
if (!verifierText) return out;
|
|
|
|
// Find each fact section start — "**N.**" or "N." at line start —
|
|
// and slice out the content up to the NEXT fact number. Each section
|
|
// gets scanned for the first CORRECT/INCORRECT/UNVERIFIABLE token.
|
|
const starts: Array<{ idx: number; pos: number }> = [];
|
|
const header = /(?:^|\n)\s*(?:\*\*)?(\d+)[.)]/g;
|
|
for (const m of verifierText.matchAll(header)) {
|
|
const factNum = Number(m[1]);
|
|
if (!Number.isFinite(factNum)) continue;
|
|
starts.push({ idx: factNum - 1, pos: m.index! });
|
|
}
|
|
for (let i = 0; i < starts.length; i++) {
|
|
const s = starts[i];
|
|
const end = i + 1 < starts.length ? starts[i + 1].pos : verifierText.length;
|
|
if (s.idx < 0 || s.idx >= numFacts) continue;
|
|
const section = verifierText.slice(s.pos, end);
|
|
const v = section.match(/\b(CORRECT|INCORRECT|UNVERIFIABLE)\b/i);
|
|
if (v) out[s.idx] = v[1].toUpperCase() as "CORRECT" | "INCORRECT" | "UNVERIFIABLE";
|
|
}
|
|
return out;
|
|
}
|
|
|
|
// Lift the first balanced JSON object out of (possibly fenced) text.
|
|
// Same discipline as inference.ts::extractJson.
|
|
function extractFirstJsonObject(text: string): any | null {
|
|
const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, "");
|
|
let depth = 0, start = -1;
|
|
for (let i = 0; i < cleaned.length; i++) {
|
|
const c = cleaned[i];
|
|
if (c === "{") { if (depth === 0) start = i; depth++; }
|
|
else if (c === "}") {
|
|
depth--;
|
|
if (depth === 0 && start >= 0) {
|
|
try { return JSON.parse(cleaned.slice(start, i + 1)); } catch { start = -1; }
|
|
}
|
|
}
|
|
}
|
|
return null;
|
|
}
|