Some checks failed
lakehouse/auditor 8 warnings — see review
Two bundled changes. Both came out of J's observation that the
verifier was defaulting to UNVERIFIABLE on domain-specific facts
because it had no idea what Lakehouse was, which project's code it
was reading, or what framework the types belonged to.
1. Project context preamble. Added docs/AUDITOR_CONTEXT.md — a <400-
word concise description of the project (crates, services,
architecture phases, the auditor's role itself). fact_extractor
reads it once, caches it, prepends it to the extract prompt as a
"PROJECT CONTEXT (for grounding; do NOT extract from this)"
section. Both extractor and verifier now see this context, so
statements like "aggregate<T> returns Map<string, AggregateRow>"
get grounded as "this is a TypeScript function in the Lakehouse
auditor subsystem" and the verifier can reason about plausibility
instead of guessing.
2. Verifier-verdict parser fix. Gemma2's output format varies between
"**Verdict:** CORRECT" and just "* **CORRECT**" inline (observed
variance across runs). The old regex required "Verdict:" as a
label and missed the second format — causing all verdicts to
stay UNCHECKED. Replaced with a two-pass approach: find each
fact section start ("**N.**" or "N."), slice to the next section,
scan the slice for the first CORRECT|INCORRECT|UNVERIFIABLE
token. Handles both formats plus unfenced fallback.
Verified: 4-fact test extraction went from 0/4 verdicts scored
(pre-fix) to 2/4 CORRECT + 2/4 UNVERIFIABLE (post-fix). The 2
UNVERIFIABLE cases are domain-specific code behavior the verifier
legitimately can't confirm without reading source — correct stance,
not a parser miss.
No new consensus modes yet. J suggested adding codereview or
validator as a second pass; holding until we see whether context
injection alone gives sufficient signal lift.
272 lines · 10 KiB · TypeScript
// fact_extractor — routes curated TEXT through llm_team_ui's
// "knowledge extract facts" mode (mode=extract at /api/run).
//
// What it gives us: structured {facts, entities, relationships} from
// whatever curated blob we send. Auditor sends the tree-split
// inference scratchpad (the best distillation of what a PR changed).
// Scrum_master will later send its accepted review bodies.
//
// Why route through llm_team and not just extract directly from our
// own checks: llm_team's extract uses a local EXTRACTOR model
// (qwen2.5) + a separate VERIFIER (gemma2). This cross-check is the
// discipline J wants for knowledge going into the playbook — facts
// go in only after a second model has rated them CORRECT /
// UNVERIFIABLE. Fast (local models, ~10-20s), free, and matches the
// codereview pattern J already trusts.
//
// SSE parsing: llm_team streams SSE events. We're only interested in
// the final "response" event with role="final" + the extraction
// response (role="extraction N"). Parse the JSON from the extractor's
// response text.

const LLM_TEAM = process.env.LH_LLM_TEAM_URL ?? "http://localhost:5000";
|
|
const EXTRACTOR = process.env.LH_FACT_EXTRACTOR ?? "qwen2.5:latest";
|
|
const VERIFIER = process.env.LH_FACT_VERIFIER ?? "gemma2:latest";
|
|
const EXTRACT_TIMEOUT_MS = 120_000;
|
|
const PROJECT_CONTEXT_FILE = process.env.LH_AUDITOR_CONTEXT_FILE
|
|
?? "/home/profit/lakehouse/docs/AUDITOR_CONTEXT.md";
|
|
|
|
let cachedContext: string | null = null;
|
|
async function loadProjectContext(): Promise<string> {
|
|
if (cachedContext !== null) return cachedContext;
|
|
try {
|
|
const { readFile } = await import("node:fs/promises");
|
|
const raw = await readFile(PROJECT_CONTEXT_FILE, "utf8");
|
|
// Cap at 4KB — anything past that is more noise than signal for
|
|
// the extractor/verifier's attention budget.
|
|
cachedContext = raw.slice(0, 4000);
|
|
} catch {
|
|
cachedContext = ""; // context file missing → extractor runs without preamble
|
|
}
|
|
return cachedContext;
|
|
}
|
|
|
|
export interface Entity {
|
|
name: string;
|
|
type: string;
|
|
description?: string;
|
|
}
|
|
|
|
export interface Relationship {
|
|
from: string;
|
|
to: string;
|
|
type: string;
|
|
}
|
|
|
|
export interface ExtractedFacts {
|
|
facts: string[];
|
|
entities: Entity[];
|
|
relationships: Relationship[];
|
|
verification: string;
|
|
extractor_model: string;
|
|
verifier_model: string;
|
|
source_preview: string;
|
|
// Populated when the extract run completed server-side (llm_team
|
|
// persists to its own team_runs; this is for our own cross-ref).
|
|
llm_team_run_id?: number;
|
|
extracted_at: string;
|
|
// Per-fact verdicts from the verifier pass (CORRECT/INCORRECT/
|
|
// UNVERIFIABLE/UNCHECKED). Aligned 1:1 with the *raw* fact list
|
|
// pre-drop so operators can see which verdicts mapped to dropped
|
|
// facts if needed.
|
|
verifier_verdicts?: Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED">;
|
|
facts_dropped_by_verifier?: number;
|
|
error?: string;
|
|
}
|
|
|
|
/**
|
|
* Run the llm_team extract pipeline on `source` text. Returns
|
|
* structured {facts, entities, relationships}.
|
|
*
|
|
* Returns an object with `error` set if the pipeline failed — never
|
|
* throws, because fact extraction is best-effort enrichment (the
|
|
* primary audit must not break if llm_team is down).
|
|
*/
|
|
export async function extractFacts(source: string): Promise<ExtractedFacts> {
|
|
const base: ExtractedFacts = {
|
|
facts: [],
|
|
entities: [],
|
|
relationships: [],
|
|
verification: "",
|
|
extractor_model: EXTRACTOR,
|
|
verifier_model: VERIFIER,
|
|
source_preview: source.slice(0, 240),
|
|
extracted_at: new Date().toISOString(),
|
|
};
|
|
|
|
// Prepend project context to the source so the extractor + verifier
|
|
// know what codebase/framework these facts belong to. Without this,
|
|
// the verifier marks most domain-specific facts as UNVERIFIABLE ("I
|
|
// don't know what Lakehouse is"). With it, the verifier can CORRECT-
|
|
// stamp facts that align with the stated architecture.
|
|
const context = await loadProjectContext();
|
|
const prompt = context.length > 0
|
|
? `=== PROJECT CONTEXT (for grounding facts; do NOT extract facts from this section) ===\n${context}\n\n=== CONTENT TO EXTRACT FACTS FROM ===\n${source}`
|
|
: source;
|
|
|
|
let resp: Response;
|
|
try {
|
|
resp = await fetch(`${LLM_TEAM}/api/run`, {
|
|
method: "POST",
|
|
headers: { "content-type": "application/json" },
|
|
body: JSON.stringify({
|
|
mode: "extract",
|
|
prompt,
|
|
extractor: EXTRACTOR,
|
|
verifier: VERIFIER,
|
|
source: "prompt",
|
|
skip_cache: true, // cache by prompt would dedup identical
|
|
// scratchpads, but we want fresh extraction
|
|
// for per-audit facts; cheap since local.
|
|
}),
|
|
signal: AbortSignal.timeout(EXTRACT_TIMEOUT_MS),
|
|
});
|
|
} catch (e) {
|
|
return { ...base, error: `fetch failed: ${(e as Error).message}` };
|
|
}
|
|
|
|
if (!resp.ok) {
|
|
const body = await resp.text().catch(() => "");
|
|
return { ...base, error: `llm_team /api/run ${resp.status}: ${body.slice(0, 200)}` };
|
|
}
|
|
|
|
// Stream SSE lines; collect the one extraction response + the run_saved event
|
|
// so we can capture the team-runs ID for cross-ref.
|
|
const decoder = new TextDecoder();
|
|
const reader = resp.body?.getReader();
|
|
if (!reader) return { ...base, error: "no response body" };
|
|
|
|
let buffer = "";
|
|
let extractionText = "";
|
|
let verifierText = "";
|
|
let runId: number | undefined = undefined;
|
|
|
|
try {
|
|
while (true) {
|
|
const { done, value } = await reader.read();
|
|
if (done) break;
|
|
buffer += decoder.decode(value, { stream: true });
|
|
let nl: number;
|
|
while ((nl = buffer.indexOf("\n\n")) >= 0) {
|
|
const chunk = buffer.slice(0, nl);
|
|
buffer = buffer.slice(nl + 2);
|
|
const dataLine = chunk.split("\n").find(l => l.startsWith("data: "));
|
|
if (!dataLine) continue;
|
|
try {
|
|
const ev = JSON.parse(dataLine.slice(6));
|
|
if (ev.type === "response") {
|
|
const role = String(ev.role ?? "");
|
|
if (role.startsWith("extraction")) extractionText = String(ev.text ?? "");
|
|
else if (role === "verifier") verifierText = String(ev.text ?? "");
|
|
} else if (ev.type === "run_saved") {
|
|
const id = Number(ev.run_id);
|
|
if (Number.isFinite(id)) runId = id;
|
|
}
|
|
} catch { /* skip malformed SSE */ }
|
|
}
|
|
}
|
|
} catch (e) {
|
|
return { ...base, error: `SSE read failed: ${(e as Error).message}` };
|
|
}
|
|
|
|
// Pull the JSON object out of extractionText (may be wrapped in ```json fences).
|
|
const parsed = extractFirstJsonObject(extractionText);
|
|
if (!parsed) {
|
|
return { ...base, error: "extractor returned no parseable JSON", verification: verifierText };
|
|
}
|
|
|
|
const rawFacts: string[] = Array.isArray(parsed.facts)
|
|
? parsed.facts.slice(0, 50).map(String)
|
|
: [];
|
|
|
|
// Parse the verifier's free-form prose into per-fact verdicts, then
|
|
// drop any fact the verifier explicitly marked INCORRECT. Leave
|
|
// UNVERIFIABLE in place: many of our extractions are domain-specific
|
|
// (Lakehouse internals) and the verifier has no prior-knowledge
|
|
// anchor, so UNVERIFIABLE is the expected verdict for new signal,
|
|
// not a quality fail. This is verifier-gated persistence: drop only
|
|
// what's affirmatively wrong, not what's novel.
|
|
const verdicts = parseVerifierVerdicts(verifierText, rawFacts.length);
|
|
const incorrectIdx = new Set<number>();
|
|
verdicts.forEach((v, i) => { if (v === "INCORRECT") incorrectIdx.add(i); });
|
|
const kept = rawFacts.filter((_, i) => !incorrectIdx.has(i));
|
|
|
|
return {
|
|
...base,
|
|
facts: kept,
|
|
entities: Array.isArray(parsed.entities)
|
|
? parsed.entities.slice(0, 30).map((e: any) => ({
|
|
name: String(e?.name ?? ""),
|
|
type: String(e?.type ?? ""),
|
|
description: typeof e?.description === "string" ? e.description.slice(0, 240) : undefined,
|
|
})).filter(e => e.name.length > 0)
|
|
: [],
|
|
relationships: Array.isArray(parsed.relationships)
|
|
? parsed.relationships.slice(0, 30).map((r: any) => ({
|
|
from: String(r?.from ?? ""),
|
|
to: String(r?.to ?? ""),
|
|
type: String(r?.type ?? ""),
|
|
})).filter(r => r.from.length > 0 && r.to.length > 0)
|
|
: [],
|
|
verification: verifierText.slice(0, 1500),
|
|
facts_dropped_by_verifier: incorrectIdx.size,
|
|
verifier_verdicts: verdicts,
|
|
llm_team_run_id: runId,
|
|
};
|
|
}
|
|
|
|
// Parse verifier's free-form output into a per-fact verdict array.
|
|
// Gemma2 uses several formats depending on prompt mood:
|
|
// Format A: **1.** claim... * **Verdict:** CORRECT
|
|
// Format B: **1.** claim... * **CORRECT** (no "Verdict:" label)
|
|
// Format C: 1. claim... CORRECT
|
|
// Strategy: split on fact numbers, then find the first
|
|
// CORRECT|INCORRECT|UNVERIFIABLE token in each section. Handles all
|
|
// three formats without regex gymnastics.
|
|
function parseVerifierVerdicts(
|
|
verifierText: string,
|
|
numFacts: number,
|
|
): Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED"> {
|
|
const out: Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED"> =
|
|
Array(numFacts).fill("UNCHECKED");
|
|
if (!verifierText) return out;
|
|
|
|
// Find each fact section start — "**N.**" or "N." at line start —
|
|
// and slice out the content up to the NEXT fact number. Each section
|
|
// gets scanned for the first CORRECT/INCORRECT/UNVERIFIABLE token.
|
|
const starts: Array<{ idx: number; pos: number }> = [];
|
|
const header = /(?:^|\n)\s*(?:\*\*)?(\d+)[.)]/g;
|
|
for (const m of verifierText.matchAll(header)) {
|
|
const factNum = Number(m[1]);
|
|
if (!Number.isFinite(factNum)) continue;
|
|
starts.push({ idx: factNum - 1, pos: m.index! });
|
|
}
|
|
for (let i = 0; i < starts.length; i++) {
|
|
const s = starts[i];
|
|
const end = i + 1 < starts.length ? starts[i + 1].pos : verifierText.length;
|
|
if (s.idx < 0 || s.idx >= numFacts) continue;
|
|
const section = verifierText.slice(s.pos, end);
|
|
const v = section.match(/\b(CORRECT|INCORRECT|UNVERIFIABLE)\b/i);
|
|
if (v) out[s.idx] = v[1].toUpperCase() as "CORRECT" | "INCORRECT" | "UNVERIFIABLE";
|
|
}
|
|
return out;
|
|
}
|
|
|
|
// Lift the first balanced JSON object out of (possibly fenced) text.
|
|
// Same discipline as inference.ts::extractJson.
|
|
function extractFirstJsonObject(text: string): any | null {
|
|
const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, "");
|
|
let depth = 0, start = -1;
|
|
for (let i = 0; i < cleaned.length; i++) {
|
|
const c = cleaned[i];
|
|
if (c === "{") { if (depth === 0) start = i; depth++; }
|
|
else if (c === "}") {
|
|
depth--;
|
|
if (depth === 0 && start >= 0) {
|
|
try { return JSON.parse(cleaned.slice(start, i + 1)); } catch { start = -1; }
|
|
}
|
|
}
|
|
}
|
|
return null;
|
|
}
|