// Cloud inference check — wraps the proven run_codereview pattern // from llm_team_ui.py (same 3-stage framing, same cloud model) to // critique a PR's claims against its diff. // // Proved out 2026-04-22 via /tmp/codereview_runner.py — gpt-oss:120b // caught a real ternary bug in auditor/fixtures/hybrid_38_40_45.ts // that unit tests missed. This module reuses the reviewer prompt // shape (bugs / security / performance / style / edge cases) and // adds claim-vs-diff specific framing. // // Call surface: runInferenceCheck(claims, diff) → Finding[]. // Cloud latency budget: ~60s (gpt-oss:120b reviewer typically 35-50s // with a 15KB diff + claim list). import type { Claim, Finding } from "../types.ts"; import { Glob } from "bun"; import { readFile } from "node:fs/promises"; const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100"; const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b"; // 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was // previously truncated at 15KB causing the reviewer to miss later // files (gitea.ts, policy.ts) and flag "no Gitea client present" as a // block finding when the file was simply outside the truncation window. const MAX_DIFF_CHARS = 40000; const CALL_TIMEOUT_MS = 120_000; const REPO_ROOT = "/home/profit/lakehouse"; export async function runInferenceCheck(claims: Claim[], diff: string): Promise { if (claims.length === 0) { return [{ check: "inference", severity: "info", summary: "no ship-claims extracted — skipping cloud inference", evidence: ["parser returned empty claim list; nothing to verify against cloud"], }]; } // Empirical claims (runtime metrics / observed outcomes) can't be // verified from the diff. Drop them from the cloud prompt so the // reviewer doesn't chase ghosts. A future `runtime_evidence` check // can validate these against data/_kb/*/summary.json outputs. const verifiable = claims.filter(c => c.strength !== "empirical"); const empiricalCount = claims.length - verifiable.length; if (verifiable.length === 0) { return [{ check: "inference", severity: "info", summary: `all ${claims.length} claims are empirical (runtime metrics) — skipping cloud inference`, evidence: [`empirical claims can't be verified from a static diff; needs runtime-evidence check`], }]; } const truncated = diff.length > MAX_DIFF_CHARS ? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]` : diff; // Build the reviewer prompt in the same shape as run_codereview's // review stage (llm_team_ui.py:10950), adapted for claim verification: // "Task: ..." // "Code: ..." // "Review: bugs/security/perf/style/edge. Provide corrected code." // We add: claim list upfront + ask for structured JSON verdict. const systemMsg = [ "You review pull-request diffs against the author's own ship-claims.", "For each claim, decide: is it backed by actual code in the diff, or is", "it placeholder / aspirational / unwired?", "", "A claim is BACKED when the diff contains a real code path that delivers", "the claimed behavior. A claim is NOT BACKED when:", " - the claim asserts functionality but the diff only adds types/fields", " with no consumer", " - the claim mentions tests but no test function was added", " - the claim claims integration but the integration point is a stub", " - the diff contains unimplemented!() / todo!() / TODO comments", " - the claim says 'works end-to-end' but the diff has no end-to-end test", "", "Respond with strict JSON only. No prose before or after. Shape:", "{", ' "claim_verdicts": [', ' {"claim_idx": 0, "backed": false, "evidence": "short reason"}', " ],", ' "unflagged_gaps": [', ' {"location": "file:line", "summary": "short description"}', " ]", "}", ].join("\n"); const userMsg = [ `Ship-claims the author made (numbered 0..N-1):`, verifiable.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"), "", `Diff:`, "```", truncated, "```", "", `For each numbered claim above, emit a claim_verdicts entry. For gaps the`, `author DIDN'T claim but that look like placeholder code, emit unflagged_gaps.`, `Strict JSON only, matching the shape described. No prose outside JSON.`, ].join("\n"); let resp: Response; try { resp = await fetch(`${GATEWAY}/v1/chat`, { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ provider: "ollama_cloud", model: MODEL, messages: [ { role: "system", content: systemMsg }, { role: "user", content: userMsg }, ], // Deterministic classification mode — temp=0 is greedy-sample, // so identical input → identical output on the same model // version. think=false disables the reasoning trace that was // letting variable prose leak into the classification output // and inflate the audit_lessons signature set (observed as // sig_count creep across the 9-run empirical test). // // max_tokens tightened to 1500 — the structured JSON response // fits comfortably in 1500 tokens for typical PRs (~7 claims); // the old 3000 just gave the model room to wander. max_tokens: 1500, temperature: 0, think: false, }), signal: AbortSignal.timeout(CALL_TIMEOUT_MS), }); } catch (e) { // Cloud unreachable → soft-fail. Don't block a PR because the // reviewer model is down. Static + dynamic + kb still run. return [{ check: "inference", severity: "info", summary: "cloud inference unreachable — skipped", evidence: [`fetch failed: ${(e as Error).message.slice(0, 180)}`], }]; } if (!resp.ok) { return [{ check: "inference", severity: "info", summary: `cloud inference returned ${resp.status} — skipped`, evidence: [`body: ${(await resp.text()).slice(0, 200)}`], }]; } const body: any = await resp.json(); const content: string = body?.choices?.[0]?.message?.content ?? ""; const usage = body?.usage ?? {}; const parsed = extractJson(content); if (!parsed) { return [{ check: "inference", severity: "info", summary: "cloud returned unparseable output — skipped", evidence: [ `head: ${content.slice(0, 200)}`, `tokens: ${usage.total_tokens ?? "?"}`, ], }]; } const findings: Finding[] = []; // One summary info finding so the verdict layer knows the check ran. findings.push({ check: "inference", severity: "info", summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})`, evidence: [ `claim_verdicts: ${parsed.claim_verdicts?.length ?? 0}, unflagged_gaps: ${parsed.unflagged_gaps?.length ?? 0}`, ], }); for (const v of parsed.claim_verdicts ?? []) { if (v?.backed === false) { const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1; // Indices point at the verifiable[] list we sent the cloud, // not the full claims[] list. Translate back. const claim = verifiable[idx]; if (!claim) continue; // Strong+unbacked = BLOCK. That's the whole point of the auditor. const sev: Finding["severity"] = claim.strength === "strong" ? "block" : claim.strength === "moderate" ? "warn" : "info"; findings.push({ check: "inference", severity: sev, claim_text: claim.text, summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`, evidence: [ `at ${claim.location}`, `cloud reason: ${String(v.evidence ?? "no reason given").slice(0, 200)}`, ], }); } } for (const g of parsed.unflagged_gaps ?? []) { const summary = String(g?.summary ?? "?"); const location = String(g?.location ?? "?"); // False-positive guard — when the cloud says "X not defined in this // diff" or "missing implementation of X", the cloud may just mean // "X is not in the added lines," not "X doesn't exist in the repo." // Extract candidate symbol names and grep the repo. If any symbol // is defined elsewhere, drop the finding — it's a known-symbol // reference, not a placeholder. if (/not\s+defined|missing\s+implementation|never\s+referenced\s+or\s+integrated/i.test(summary)) { const symbols = extractSymbols(summary); if (symbols.length > 0) { const resolved = await symbolsExistInRepo(symbols); if (resolved.length === symbols.length) { // Every named symbol exists somewhere in the repo — silent drop. continue; } if (resolved.length > 0) { // Partially resolved — demote to info with a note. findings.push({ check: "inference", severity: "info", summary: `cloud gap partially resolved by repo grep: ${summary.slice(0, 120)}`, evidence: [ `location: ${location.slice(0, 140)}`, `resolved via grep: ${resolved.join(",")}`, `unresolved: ${symbols.filter(s => !resolved.includes(s)).join(",")}`, ], }); continue; } } } findings.push({ check: "inference", severity: "warn", summary: `cloud-flagged gap not in any claim: ${summary.slice(0, 120)}`, evidence: [`location: ${location.slice(0, 140)}`], }); } return findings; } // Pull out plausible code-symbol names from a summary string. // Matches: // - identifier with backticks: `foo_bar` // - identifier followed by parens: foo_bar() // - CamelCase types // - snake_case_functions // Filters out common English words that could be matched accidentally. const STOPWORDS = new Set([ "not","the","and","for","this","that","with","but","are","was","has", "have","been","any","missing","implementation","diff","defined","never", "referenced","integrated","flow","code","file","some","only","when", ]); function extractSymbols(text: string): string[] { const out = new Set(); // `backticked` symbols for (const m of text.matchAll(/`([A-Za-z_][A-Za-z0-9_]{2,})`/g)) out.add(m[1]); // foo() or foo_bar() calls for (const m of text.matchAll(/\b([A-Za-z_][A-Za-z0-9_]{2,})\s*\(/g)) out.add(m[1]); // CamelCase types (3+ chars, must start with uppercase) for (const m of text.matchAll(/\b([A-Z][A-Za-z0-9]{2,})\b/g)) out.add(m[1]); return Array.from(out).filter(s => !STOPWORDS.has(s.toLowerCase())); } // Scan the repo for at least one definition of each symbol. Uses Bun's // Glob to walk TS/Rust/Python/JS sources; ignores node_modules, data/, // and target/. Skips files > 500KB — those are fixtures/snapshots that // won't contain a definition line and slurping them slows the audit. async function symbolsExistInRepo(symbols: string[]): Promise { const patterns = ["**/*.ts", "**/*.tsx", "**/*.rs", "**/*.py", "**/*.js"]; const skip = (p: string) => p.includes("/node_modules/") || p.startsWith("data/") || p.includes("/target/") || p.startsWith("dist/"); const MAX_FILE_BYTES = 500_000; const { stat } = await import("node:fs/promises"); const resolved = new Set(); const toFind = new Set(symbols); for (const pat of patterns) { if (toFind.size === 0) break; const glob = new Glob(pat); for await (const f of glob.scan({ cwd: REPO_ROOT, onlyFiles: true })) { if (skip(f)) continue; try { const s = await stat(`${REPO_ROOT}/${f}`); if (s.size > MAX_FILE_BYTES) continue; } catch { continue; } let content: string; try { content = await readFile(`${REPO_ROOT}/${f}`, "utf8"); } catch { continue; } for (const sym of Array.from(toFind)) { // Definition heuristics: `function sym`, `fn sym`, `const sym`, // `let sym`, `def sym`, `class sym`, `struct sym`, `enum sym`, // `trait sym`, `async function sym`, `pub (async )?fn sym`. const re = new RegExp( `\\b(function|async\\s+function|const|let|var|def|class|struct|enum|trait|impl|type|interface|fn|pub\\s+(async\\s+)?fn)\\s+${escapeRe(sym)}\\b` ); if (re.test(content)) { resolved.add(sym); toFind.delete(sym); if (toFind.size === 0) break; } } } } return Array.from(resolved); } function escapeRe(s: string): string { return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); } // Lift the first balanced JSON object out of the response. Tolerates // leading prose, code fences, and model reasoning preamble when the // cloud model ignored "strict JSON only." function extractJson(text: string): any | null { const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, ""); let depth = 0; let start = -1; for (let i = 0; i < cleaned.length; i++) { const c = cleaned[i]; if (c === "{") { if (depth === 0) start = i; depth++; } else if (c === "}") { depth--; if (depth === 0 && start >= 0) { try { return JSON.parse(cleaned.slice(start, i + 1)); } catch { start = -1; } } } } return null; }