// lakehouse/auditor/checks/inference.ts

// Cloud inference check — wraps the proven run_codereview pattern
// from llm_team_ui.py (same 3-stage framing, same cloud model) to
// critique a PR's claims against its diff.
//
// Proved out 2026-04-22 via /tmp/codereview_runner.py — gpt-oss:120b
// caught a real ternary bug in auditor/fixtures/hybrid_38_40_45.ts
// that unit tests missed. This module reuses the reviewer prompt
// shape (bugs / security / performance / style / edge cases) and
// adds claim-vs-diff specific framing.
//
// Call surface: runInferenceCheck(claims, diff) → Finding[].
// Cloud latency budget: ~60s (gpt-oss:120b reviewer typically 35-50s
// with a 15KB diff + claim list).

import type { Claim, Finding } from "../types.ts";
import { Glob } from "bun";
import { readFile } from "node:fs/promises";

// Base URL of the gateway that fronts the cloud model; the check POSTs
// to `${GATEWAY}/v1/chat`.
const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
// Reviewer model name sent in the chat request body.
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
// 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was
// previously truncated at 15KB causing the reviewer to miss later
// files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
// block finding when the file was simply outside the truncation window.
const MAX_DIFF_CHARS = 40000;
// Abort deadline (ms) handed to AbortSignal.timeout for the gateway call.
const CALL_TIMEOUT_MS = 120_000;
// Checkout root walked by the repo grep that double-checks "symbol not
// defined" gaps. Overridable via LH_REPO_ROOT so the auditor can run
// from a checkout that does not live at the default path.
const REPO_ROOT = process.env.LH_REPO_ROOT ?? "/home/profit/lakehouse";

/**
 * Ask the cloud reviewer model whether each ship-claim is backed by the diff.
 *
 * Every failure mode of the cloud round-trip (unreachable gateway, non-2xx
 * status, unreadable/invalid response body, unparseable model output) is
 * reported as a single `info` finding rather than thrown, so a reviewer
 * outage never blocks a PR.
 *
 * @param claims Claims extracted from the PR; entries whose `strength` is
 *   "empirical" are not sent to the cloud.
 * @param diff Unified diff text; truncated to MAX_DIFF_CHARS before sending.
 * @returns Findings: one `info` summary when the review ran, one finding per
 *   unbacked claim (strong → block, moderate → warn, otherwise info), and one
 *   per cloud-flagged gap that survives the repo-grep false-positive guard.
 */
export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
  if (claims.length === 0) {
    return [{
      check: "inference",
      severity: "info",
      summary: "no ship-claims extracted — skipping cloud inference",
      evidence: ["parser returned empty claim list; nothing to verify against cloud"],
    }];
  }

  // Empirical claims (runtime metrics / observed outcomes) can't be
  // verified from the diff. Drop them from the cloud prompt so the
  // reviewer doesn't chase ghosts. A future `runtime_evidence` check
  // can validate these against data/_kb/*/summary.json outputs.
  const verifiable = claims.filter(c => c.strength !== "empirical");
  if (verifiable.length === 0) {
    return [{
      check: "inference",
      severity: "info",
      summary: `all ${claims.length} claims are empirical (runtime metrics) — skipping cloud inference`,
      evidence: [`empirical claims can't be verified from a static diff; needs runtime-evidence check`],
    }];
  }

  const truncated = diff.length > MAX_DIFF_CHARS
    ? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
    : diff;

  // Build the reviewer prompt in the same shape as run_codereview's
  // review stage (llm_team_ui.py:10950), adapted for claim verification:
  //   "Task: ..."
  //   "Code: ..."
  //   "Review: bugs/security/perf/style/edge. Provide corrected code."
  // We add: claim list upfront + ask for structured JSON verdict.
  const systemMsg = [
    "You review pull-request diffs against the author's own ship-claims.",
    "For each claim, decide: is it backed by actual code in the diff, or is",
    "it placeholder / aspirational / unwired?",
    "",
    "A claim is BACKED when the diff contains a real code path that delivers",
    "the claimed behavior. A claim is NOT BACKED when:",
    "  - the claim asserts functionality but the diff only adds types/fields",
    "    with no consumer",
    "  - the claim mentions tests but no test function was added",
    "  - the claim claims integration but the integration point is a stub",
    "  - the diff contains unimplemented!() / todo!() / TODO comments",
    "  - the claim says 'works end-to-end' but the diff has no end-to-end test",
    "",
    "Respond with strict JSON only. No prose before or after. Shape:",
    "{",
    '  "claim_verdicts": [',
    '    {"claim_idx": 0, "backed": false, "evidence": "short reason"}',
    "  ],",
    '  "unflagged_gaps": [',
    '    {"location": "file:line", "summary": "short description"}',
    "  ]",
    "}",
  ].join("\n");

  const userMsg = [
    `Ship-claims the author made (numbered 0..N-1):`,
    verifiable.map((c, i) => `  ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
    "",
    `Diff:`,
    "```",
    truncated,
    "```",
    "",
    `For each numbered claim above, emit a claim_verdicts entry. For gaps the`,
    `author DIDN'T claim but that look like placeholder code, emit unflagged_gaps.`,
    `Strict JSON only, matching the shape described. No prose outside JSON.`,
  ].join("\n");

  let resp: Response;
  try {
    resp = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: "ollama_cloud",
        model: MODEL,
        messages: [
          { role: "system", content: systemMsg },
          { role: "user", content: userMsg },
        ],
        // Deterministic classification — temp=0 is greedy-sample, so
        // identical input yields identical output on the same model
        // version. This kills the signature creep we observed in the
        // 9-run empirical test (sig_count 16→27 from cloud phrasing
        // variance at temp=0.2).
        //
        // IMPORTANT: keep think=true. gpt-oss:120b is a reasoning
        // model; setting think=false caused it to return empty content
        // on large prompts (observed during Level 1 validation: 13421
        // tokens used, empty content returned). The reasoning trace is
        // variable prose, but at temp=0 the FINAL classification is
        // still deterministic because greedy sampling converges to
        // the same conclusion from the same starting state.
        max_tokens: 3000,
        temperature: 0,
        think: true,
      }),
      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
    });
  } catch (e) {
    // Cloud unreachable → soft-fail. Don't block a PR because the
    // reviewer model is down. Static + dynamic + kb still run.
    const reason = e instanceof Error ? e.message : String(e);
    return [{
      check: "inference",
      severity: "info",
      summary: "cloud inference unreachable — skipped",
      evidence: [`fetch failed: ${reason.slice(0, 180)}`],
    }];
  }

  if (!resp.ok) {
    // Reading the error body is best-effort; a failed read must not
    // turn a soft-fail into a thrown exception.
    let errBody = "";
    try { errBody = await resp.text(); } catch { /* keep empty body */ }
    return [{
      check: "inference",
      severity: "info",
      summary: `cloud inference returned ${resp.status} — skipped`,
      evidence: [`body: ${errBody.slice(0, 200)}`],
    }];
  }

  // resp.json() rejects when the gateway sends a non-JSON body or when
  // the timeout signal fires while the body is still streaming. Both are
  // reviewer-side failures, so soft-fail here as well.
  let body: any;
  try {
    body = await resp.json();
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    return [{
      check: "inference",
      severity: "info",
      summary: "cloud returned unparseable output — skipped",
      evidence: [`response body could not be read as JSON: ${reason.slice(0, 180)}`],
    }];
  }

  // Anything other than a string in `content` is treated as empty
  // output so it lands in the "unparseable" branch below.
  const rawContent: unknown = body?.choices?.[0]?.message?.content;
  const content = typeof rawContent === "string" ? rawContent : "";
  const usage = body?.usage ?? {};

  const parsed = extractJson(content);
  if (!parsed) {
    return [{
      check: "inference",
      severity: "info",
      summary: "cloud returned unparseable output — skipped",
      evidence: [
        `head: ${content.slice(0, 200)}`,
        `tokens: ${usage.total_tokens ?? "?"}`,
      ],
    }];
  }

  // The model's JSON is untrusted: only iterate fields that really are
  // arrays, otherwise treat them as absent.
  const verdicts: any[] = Array.isArray(parsed.claim_verdicts) ? parsed.claim_verdicts : [];
  const gaps: any[] = Array.isArray(parsed.unflagged_gaps) ? parsed.unflagged_gaps : [];

  const findings: Finding[] = [];

  // One summary info finding so the verdict layer knows the check ran.
  findings.push({
    check: "inference",
    severity: "info",
    summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})`,
    evidence: [
      `claim_verdicts: ${verdicts.length}, unflagged_gaps: ${gaps.length}`,
    ],
  });

  for (const v of verdicts) {
    if (v?.backed === false) {
      const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
      // Indices point at the verifiable[] list we sent the cloud,
      // not the full claims[] list. Translate back.
      const claim = verifiable[idx];
      if (!claim) continue;
      // Strong+unbacked = BLOCK. That's the whole point of the auditor.
      const sev: Finding["severity"] = claim.strength === "strong" ? "block"
        : claim.strength === "moderate" ? "warn"
        : "info";
      findings.push({
        check: "inference",
        severity: sev,
        claim_text: claim.text,
        summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`,
        evidence: [
          `at ${claim.location}`,
          `cloud reason: ${String(v.evidence ?? "no reason given").slice(0, 200)}`,
        ],
      });
    }
  }

  for (const g of gaps) {
    const summary = String(g?.summary ?? "?");
    const location = String(g?.location ?? "?");
    // False-positive guard — when the cloud says "X not defined in this
    // diff" or "missing implementation of X", the cloud may just mean
    // "X is not in the added lines," not "X doesn't exist in the repo."
    // Extract candidate symbol names and grep the repo. If any symbol
    // is defined elsewhere, drop the finding — it's a known-symbol
    // reference, not a placeholder.
    if (/not\s+defined|missing\s+implementation|never\s+referenced\s+or\s+integrated/i.test(summary)) {
      const symbols = extractSymbols(summary);
      if (symbols.length > 0) {
        // If the repo scan itself fails, nothing counts as resolved and
        // the gap is reported as a normal warn below.
        let resolved: string[] = [];
        try { resolved = await symbolsExistInRepo(symbols); } catch { /* scan unavailable */ }
        if (resolved.length === symbols.length) {
          // Every named symbol exists somewhere in the repo — silent drop.
          continue;
        }
        if (resolved.length > 0) {
          // Partially resolved — demote to info with a note.
          findings.push({
            check: "inference",
            severity: "info",
            summary: `cloud gap partially resolved by repo grep: ${summary.slice(0, 120)}`,
            evidence: [
              `location: ${location.slice(0, 140)}`,
              `resolved via grep: ${resolved.join(",")}`,
              `unresolved: ${symbols.filter(s => !resolved.includes(s)).join(",")}`,
            ],
          });
          continue;
        }
      }
    }
    findings.push({
      check: "inference",
      severity: "warn",
      summary: `cloud-flagged gap not in any claim: ${summary.slice(0, 120)}`,
      evidence: [`location: ${location.slice(0, 140)}`],
    });
  }

  return findings;
}

// Pull out plausible code-symbol names from a summary string.
// Matches:
//   - identifier with backticks: `foo_bar`
//   - identifier followed by parens: foo_bar()
//   - CamelCase types
//   - snake_case_functions
// Filters out common English words that could be matched accidentally.
const STOPWORDS = new Set([
  "not","the","and","for","this","that","with","but","are","was","has",
  "have","been","any","missing","implementation","diff","defined","never",
  "referenced","integrated","flow","code","file","some","only","when",
]);

/**
 * Collect candidate identifiers mentioned in free-form reviewer text.
 *
 * @param text Summary sentence produced by the cloud reviewer.
 * @returns De-duplicated symbol names in first-seen order (backticked,
 *   then call-style, then capitalised, then bare snake_case), with any
 *   name whose lowercase form is in STOPWORDS removed.
 */
function extractSymbols(text: string): string[] {
  const out = new Set<string>();
  // `backticked` symbols
  for (const m of text.matchAll(/`([A-Za-z_][A-Za-z0-9_]{2,})`/g)) out.add(m[1]);
  // foo() or foo_bar() calls
  for (const m of text.matchAll(/\b([A-Za-z_][A-Za-z0-9_]{2,})\s*\(/g)) out.add(m[1]);
  // CamelCase types (3+ chars, must start with uppercase)
  for (const m of text.matchAll(/\b([A-Z][A-Za-z0-9]{2,})\b/g)) out.add(m[1]);
  // Bare snake_case names (at least one interior underscore). Ordinary
  // English words carry no underscore, so an un-quoted mention such as
  // "missing implementation of run_codereview" still yields a symbol.
  for (const m of text.matchAll(/\b([A-Za-z_][A-Za-z0-9]*(?:_[A-Za-z0-9]+)+)\b/g)) out.add(m[1]);
  return Array.from(out).filter(s => !STOPWORDS.has(s.toLowerCase()));
}

// Scan the repo for at least one definition of each symbol. Uses Bun's
// Glob to walk TS/Rust/Python/JS sources under REPO_ROOT; ignores any
// node_modules/ or target/ directory plus top-level data/ and dist/.
// Skips files > 500KB — those are fixtures/snapshots that won't contain
// a definition line and slurping them slows the audit.
//
// Returns the subset of `symbols` for which a definition-looking line
// was found, in the order they were resolved. Files that cannot be
// stat'ed or read are skipped.
async function symbolsExistInRepo(symbols: string[]): Promise<string[]> {
  const patterns = ["**/*.ts", "**/*.tsx", "**/*.rs", "**/*.py", "**/*.js"];
  // Scan results are relative to cwd (e.g. "node_modules/x/y.ts"), so a
  // "/node_modules/" substring test misses the top-level directory.
  // Compare whole path segments instead.
  const skip = (p: string) => {
    const segs = p.split("/");
    return segs.includes("node_modules")
      || segs.includes("target")
      || segs[0] === "data"
      || segs[0] === "dist";
  };
  const MAX_FILE_BYTES = 500_000;
  const { stat } = await import("node:fs/promises");

  // Definition heuristics: `function sym`, `fn sym`, `const sym`,
  // `let sym`, `def sym`, `class sym`, `struct sym`, `enum sym`,
  // `trait sym`, `async function sym`, `pub (async )?fn sym`.
  // One regex per symbol, compiled once up front rather than per file.
  const pending = new Map<string, RegExp>();
  for (const sym of symbols) {
    pending.set(sym, new RegExp(
      `\\b(function|async\\s+function|const|let|var|def|class|struct|enum|trait|impl|type|interface|fn|pub\\s+(async\\s+)?fn)\\s+${escapeRe(sym)}\\b`
    ));
  }

  const resolved: string[] = [];
  scan: for (const pat of patterns) {
    if (pending.size === 0) break;
    const glob = new Glob(pat);
    for await (const f of glob.scan({ cwd: REPO_ROOT, onlyFiles: true })) {
      if (skip(f)) continue;
      const fullPath = `${REPO_ROOT}/${f}`;
      let content: string;
      try {
        if ((await stat(fullPath)).size > MAX_FILE_BYTES) continue;
        content = await readFile(fullPath, "utf8");
      } catch {
        continue;
      }
      for (const [sym, re] of pending) {
        if (re.test(content)) {
          resolved.push(sym);
          pending.delete(sym);
        }
      }
      // Nothing left to look for — stop walking instead of reading every
      // remaining file in the tree.
      if (pending.size === 0) break scan;
    }
  }
  return resolved;
}

/**
 * Backslash-escape every regular-expression metacharacter in `s` so the
 * result matches the input literally when embedded in a RegExp source.
 */
function escapeRe(s: string): string {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metaChars, (hit) => "\\" + hit);
}

// Lift the first balanced JSON object out of the response. Tolerates
// leading prose, code fences, and model reasoning preamble when the
// cloud model ignored "strict JSON only."
//
// Returns the parsed object, or null when no balanced `{...}` span in
// the text parses as JSON.
function extractJson(text: string): any | null {
  const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, "");
  // First pass honours JSON string literals, so braces inside a value
  // (e.g. an "evidence" string quoting code) do not break the balance.
  // The second pass is plain brace counting: it rescues output where a
  // stray quote in surrounding prose would confuse the string tracking.
  return scanForJsonObject(cleaned, true) ?? scanForJsonObject(cleaned, false);
}

// Walk `src` once, tracking brace depth, and return the first top-level
// `{...}` span that JSON.parse accepts. With `stringAware`, characters
// between double quotes inside a candidate object are not counted as
// braces (backslash escapes respected). A `}` seen at depth 0 is ignored
// so stray closers in prose cannot push the depth negative.
function scanForJsonObject(src: string, stringAware: boolean): any | null {
  let depth = 0;
  let start = -1;
  let inString = false;
  let escaped = false;
  for (let i = 0; i < src.length; i++) {
    const c = src[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (c === "\\") escaped = true;
      else if (c === '"') inString = false;
      continue;
    }
    if (c === '"') {
      // Quotes only matter inside a candidate object.
      if (stringAware && depth > 0) inString = true;
    } else if (c === "{") {
      if (depth === 0) start = i;
      depth++;
    } else if (c === "}" && depth > 0) {
      depth--;
      if (depth === 0) {
        // Balanced span closed: try it, and keep scanning on failure.
        try { return JSON.parse(src.slice(start, i + 1)); } catch { start = -1; }
      }
    }
  }
  return null;
}