lakehouse/auditor/fact_extractor.ts
commit 181c35b829 scrum_master fact extraction + verifier gate + schema_version bump
Three bundled changes that round out the KB enrichment pipeline
(PR #9 commits B/C/D compressed into one — they all touch the same
persist surfaces so splitting them would just add noise):

B. scrum_master reviews now route accepted review bodies through
   fact_extractor (same llm_team extract pipeline as inference) and
   append to data/_kb/audit_facts.jsonl tagged source:"scrum_review".
   One KB, two producers — downstream consumers can filter by source
   when they care about provenance. Skips reviews <120 chars
   (one-liners / LGTM-type comments with no extractable knowledge).

C. Verifier-gated fact persistence. fact_extractor now parses the
   verifier's free-form prose into per-fact verdicts (CORRECT /
   INCORRECT / UNVERIFIABLE / UNCHECKED). Facts marked INCORRECT are
   dropped on write; CORRECT + UNVERIFIABLE + UNCHECKED are kept
   (dropping UNVERIFIABLE would lose ~90% of real signal — the
   verifier's prior-knowledge base doesn't know Lakehouse internals,
   so domain-specific facts read as UNVERIFIABLE by default).

   verifier_verdicts array is persisted alongside facts so downstream
   queries can surface high-confidence facts (CORRECT) separately
   from provisional ones (UNVERIFIABLE).

   schema_version:2 added to both scrum_reviews.jsonl and
   audit_facts.jsonl writes. Old (v1) rows remain readable; new rows
   get the field so the forward-compat reader in kb_query can
   differentiate.

D. scrum_master_reviewed:true flag added to scrum_reviews.jsonl
   rows on accept. Future kb_query surfacing can filter by this
   (e.g., "show me PRs where a scrum review exists vs only inference"
   as governance signal). Also carried into audit_facts.jsonl when
   the scrum_review source path writes there.
2026-04-22 23:40:21 -05:00

232 lines
8.5 KiB
TypeScript

// fact_extractor — routes curated TEXT through llm_team_ui's
// "knowledge extract facts" mode (mode=extract at /api/run).
//
// What it gives us: structured {facts, entities, relationships} from
// whatever curated blob we send. Auditor sends the tree-split
// inference scratchpad (the best distillation of what a PR changed).
// Scrum_master will later send its accepted review bodies.
//
// Why route through llm_team and not just extract directly from our
// own checks: llm_team's extract uses a local EXTRACTOR model
// (qwen2.5) + a separate VERIFIER (gemma2). This cross-check is the
// discipline J wants for knowledge going into the playbook — facts
// go in only after a second model has rated them CORRECT /
// UNVERIFIABLE. Fast (local models, ~10-20s), free, and matches the
// codereview pattern J already trusts.
//
// SSE parsing: llm_team streams SSE events. We only care about the
// extractor's response event (role="extraction N"), the verifier's
// response event (role="verifier"), and the "run_saved" event that
// carries the team-run id. Parse the JSON from the extractor's text.
// Base URL of the llm_team_ui service hosting the extract pipeline.
const LLM_TEAM = process.env.LH_LLM_TEAM_URL ?? "http://localhost:5000";
// Extractor / verifier model names forwarded to /api/run; both
// overridable per-deployment via env.
const EXTRACTOR = process.env.LH_FACT_EXTRACTOR ?? "qwen2.5:latest";
const VERIFIER = process.env.LH_FACT_VERIFIER ?? "gemma2:latest";
// Abort ceiling for the whole run (fetch + SSE stream): 2 minutes.
const EXTRACT_TIMEOUT_MS = 120_000;
// A named entity pulled out of the source text by the extractor model.
export interface Entity {
  name: string;
  type: string;
  description?: string; // truncated to 240 chars on ingest (see extractFacts)
}

// A directed edge between two extracted entities, referenced by name.
export interface Relationship {
  from: string;
  to: string;
  type: string;
}
// Result shape of extractFacts. Always fully populated: on pipeline
// failure the list fields are empty and `error` explains why.
export interface ExtractedFacts {
  facts: string[];          // verifier-gated facts (INCORRECT ones dropped)
  entities: Entity[];
  relationships: Relationship[];
  verification: string;     // verifier's raw prose, truncated to 1500 chars
  extractor_model: string;
  verifier_model: string;
  source_preview: string;   // first 240 chars of the input text
  // Populated when the extract run completed server-side (llm_team
  // persists to its own team_runs; this is for our own cross-ref).
  llm_team_run_id?: number;
  extracted_at: string;     // ISO-8601 timestamp taken at call time
  // Per-fact verdicts from the verifier pass (CORRECT/INCORRECT/
  // UNVERIFIABLE/UNCHECKED). Aligned 1:1 with the *raw* fact list
  // pre-drop so operators can see which verdicts mapped to dropped
  // facts if needed.
  verifier_verdicts?: Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED">;
  facts_dropped_by_verifier?: number;
  error?: string;           // set (with empty lists) when the pipeline failed
}
/**
 * Run the llm_team extract pipeline on `source` text. Returns
 * structured {facts, entities, relationships}.
 *
 * Returns an object with `error` set if the pipeline failed — never
 * throws, because fact extraction is best-effort enrichment (the
 * primary audit must not break if llm_team is down).
 */
export async function extractFacts(source: string): Promise<ExtractedFacts> {
  // Skeleton result: every exit path spreads over this so callers
  // always receive a fully-shaped ExtractedFacts, error or not.
  const base: ExtractedFacts = {
    facts: [],
    entities: [],
    relationships: [],
    verification: "",
    extractor_model: EXTRACTOR,
    verifier_model: VERIFIER,
    source_preview: source.slice(0, 240),
    extracted_at: new Date().toISOString(),
  };

  let resp: Response;
  try {
    resp = await fetch(`${LLM_TEAM}/api/run`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        mode: "extract",
        prompt: source,
        extractor: EXTRACTOR,
        verifier: VERIFIER,
        source: "prompt",
        skip_cache: true, // cache by prompt would dedup identical
                          // scratchpads, but we want fresh extraction
                          // for per-audit facts; cheap since local.
      }),
      signal: AbortSignal.timeout(EXTRACT_TIMEOUT_MS),
    });
  } catch (e) {
    return { ...base, error: `fetch failed: ${(e as Error).message}` };
  }
  if (!resp.ok) {
    const body = await resp.text().catch(() => "");
    return { ...base, error: `llm_team /api/run ${resp.status}: ${body.slice(0, 200)}` };
  }

  // Stream SSE; collect the extraction response, the verifier response,
  // and the run_saved event (team-runs ID for cross-ref).
  const decoder = new TextDecoder();
  const reader = resp.body?.getReader();
  if (!reader) return { ...base, error: "no response body" };

  let buffer = "";
  let extractionText = "";
  let verifierText = "";
  let runId: number | undefined = undefined;

  // Parse one raw SSE event (the text between blank-line separators)
  // and fold anything interesting into the locals above.
  const consumeEvent = (chunk: string): void => {
    const dataLine = chunk.split("\n").find(l => l.startsWith("data: "));
    if (!dataLine) return;
    try {
      const ev = JSON.parse(dataLine.slice(6));
      if (ev.type === "response") {
        const role = String(ev.role ?? "");
        if (role.startsWith("extraction")) extractionText = String(ev.text ?? "");
        else if (role === "verifier") verifierText = String(ev.text ?? "");
      } else if (ev.type === "run_saved") {
        const id = Number(ev.run_id);
        if (Number.isFinite(id)) runId = id;
      }
    } catch { /* skip malformed SSE */ }
  };

  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      // Normalize CRLF so the "\n\n" separator scan below also handles
      // servers emitting \r\n line endings. Re-running the replace over
      // the unconsumed tail also heals a CRLF pair split across reads.
      buffer = (buffer + decoder.decode(value, { stream: true })).replace(/\r\n/g, "\n");
      let nl: number;
      while ((nl = buffer.indexOf("\n\n")) >= 0) {
        const chunk = buffer.slice(0, nl);
        buffer = buffer.slice(nl + 2);
        consumeEvent(chunk);
      }
    }
    // BUGFIX: the original dropped a final event that arrived without a
    // trailing blank line (e.g. run_saved immediately before stream
    // close). Flush the decoder, then whatever remains in the buffer.
    buffer += decoder.decode();
    if (buffer.trim().length > 0) consumeEvent(buffer);
  } catch (e) {
    return { ...base, error: `SSE read failed: ${(e as Error).message}` };
  }

  // Pull the JSON object out of extractionText (may be wrapped in ```json fences).
  const parsed = extractFirstJsonObject(extractionText);
  if (!parsed) {
    return { ...base, error: "extractor returned no parseable JSON", verification: verifierText };
  }
  const rawFacts: string[] = Array.isArray(parsed.facts)
    ? parsed.facts.slice(0, 50).map(String)
    : [];

  // Parse the verifier's free-form prose into per-fact verdicts, then
  // drop any fact the verifier explicitly marked INCORRECT. Leave
  // UNVERIFIABLE in place: many of our extractions are domain-specific
  // (Lakehouse internals) and the verifier has no prior-knowledge
  // anchor, so UNVERIFIABLE is the expected verdict for new signal,
  // not a quality fail. This is verifier-gated persistence: drop only
  // what's affirmatively wrong, not what's novel.
  const verdicts = parseVerifierVerdicts(verifierText, rawFacts.length);
  const incorrectIdx = new Set<number>();
  verdicts.forEach((v, i) => { if (v === "INCORRECT") incorrectIdx.add(i); });
  const kept = rawFacts.filter((_, i) => !incorrectIdx.has(i));

  return {
    ...base,
    facts: kept,
    entities: Array.isArray(parsed.entities)
      ? parsed.entities.slice(0, 30).map((e: any) => ({
          name: String(e?.name ?? ""),
          type: String(e?.type ?? ""),
          description: typeof e?.description === "string" ? e.description.slice(0, 240) : undefined,
        })).filter(e => e.name.length > 0)
      : [],
    relationships: Array.isArray(parsed.relationships)
      ? parsed.relationships.slice(0, 30).map((r: any) => ({
          from: String(r?.from ?? ""),
          to: String(r?.to ?? ""),
          type: String(r?.type ?? ""),
        })).filter(r => r.from.length > 0 && r.to.length > 0)
      : [],
    verification: verifierText.slice(0, 1500),
    facts_dropped_by_verifier: incorrectIdx.size,
    // Verdicts stay aligned with the *raw* (pre-drop) fact list so
    // operators can map dropped facts back to their verdicts.
    verifier_verdicts: verdicts,
    llm_team_run_id: runId,
  };
}
// Parse verifier's free-form output into a per-fact verdict array.
// The verifier output typically looks like:
//   **1.** The claim...
//   * **Verdict:** CORRECT
//   **2.** ...
//   **Verdict:** UNVERIFIABLE
// Returns a verdict array of length numFacts; unmatched positions stay
// UNCHECKED.
//
// BUGFIX: the previous single regex required whitespace immediately
// after the fact number's punctuation, so markdown-bold headings like
// "**1.**" (the documented common case — "**" follows the period)
// never matched and every verdict stayed UNCHECKED. Its lazy middle
// could also skip across a verdict-less fact and steal the next
// fact's verdict. We now locate numbered headings first, then search
// for a Verdict strictly within each heading's own section.
function parseVerifierVerdicts(
  verifierText: string,
  numFacts: number,
): Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED"> {
  const out: Array<"CORRECT" | "INCORRECT" | "UNVERIFIABLE" | "UNCHECKED"> =
    Array(numFacts).fill("UNCHECKED");
  if (numFacts === 0 || verifierText.length === 0) return out;

  // Numbered headings: optional "**" or "#"-run prefix, digits, one of
  // ".)" ":", optional closing "**", then whitespace.
  const headingRe = /(?:\*\*|#+\s*)?(\d+)[.):]\*{0,2}\s/g;
  const sections: Array<{ idx: number; start: number }> = [];
  for (const m of verifierText.matchAll(headingRe)) {
    sections.push({ idx: Number(m[1]) - 1, start: (m.index ?? 0) + m[0].length });
  }

  // A verdict line such as "* **Verdict:** CORRECT" or "Verdict: correct".
  const verdictRe = /\bVerdict\s*:\s*\*{0,2}\s*(CORRECT|INCORRECT|UNVERIFIABLE)/i;
  for (let s = 0; s < sections.length; s++) {
    const { idx, start } = sections[s];
    if (idx < 0 || idx >= numFacts) continue;
    // Confine the search to this fact's section so a missing verdict
    // can't absorb the next fact's.
    const end = s + 1 < sections.length ? sections[s + 1].start : verifierText.length;
    const vm = verifierText.slice(start, end).match(verdictRe);
    if (vm) {
      out[idx] = vm[1].toUpperCase() as "CORRECT" | "INCORRECT" | "UNVERIFIABLE";
    }
  }
  return out;
}
// Lift the first balanced JSON object out of (possibly fenced) text.
// Same discipline as inference.ts::extractJson.
//
// BUGFIX: the previous scan counted every "{"/"}" byte, so a brace
// inside a JSON string value (e.g. {"a": "}"}) corrupted the depth
// tracking and the object failed to parse. We now skip over string
// literals (with escape handling) while inside a candidate object.
// Quotes at depth 0 (surrounding prose) are deliberately ignored so a
// stray quote before the JSON can't desync the scan.
function extractFirstJsonObject(text: string): any | null {
  const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, "");
  let depth = 0;
  let start = -1;
  let inString = false;
  let escaped = false;
  for (let i = 0; i < cleaned.length; i++) {
    const c = cleaned[i];
    if (inString) {
      // Inside a JSON string: braces don't count; watch for the
      // unescaped closing quote.
      if (escaped) escaped = false;
      else if (c === "\\") escaped = true;
      else if (c === '"') inString = false;
      continue;
    }
    if (c === '"') {
      if (depth > 0) inString = true;
    } else if (c === "{") {
      if (depth === 0) start = i;
      depth++;
    } else if (c === "}") {
      if (depth > 0) depth--;
      if (depth === 0 && start >= 0) {
        // Candidate balanced span: if it isn't valid JSON, discard and
        // keep scanning for a later object.
        try { return JSON.parse(cleaned.slice(start, i + 1)); } catch { start = -1; }
      }
    }
  }
  return null;
}