// Cloud inference check — wraps the proven run_codereview pattern
// from llm_team_ui.py (same 3-stage framing, same cloud model) to
// critique a PR's claims against its diff.
//
// Proved out 2026-04-22 via /tmp/codereview_runner.py — gpt-oss:120b
// caught a real ternary bug in auditor/fixtures/hybrid_38_40_45.ts
// that unit tests missed. This module reuses the reviewer prompt
// shape (bugs / security / performance / style / edge cases) and
// adds claim-vs-diff specific framing.
//
// Call surface: runInferenceCheck(claims, diff) → Finding[].
// Cloud latency budget: ~60s (gpt-oss:120b reviewer typically 35-50s
// with a 15KB diff + claim list).

import type { Claim, Finding } from "../types.ts";

const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";

// 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was
// previously truncated at 15KB causing the reviewer to miss later
// files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
// block finding when the file was simply outside the truncation window.
const MAX_DIFF_CHARS = 40000;
const CALL_TIMEOUT_MS = 120_000;

// Subset of the gateway's chat response that this module reads. Every
// field is optional/unknown because the payload is untrusted JSON.
interface ChatResponse {
  choices?: Array<{ message?: { content?: unknown } }>;
  usage?: { total_tokens?: unknown };
}

/**
 * Asks the cloud reviewer model whether each ship-claim is backed by
 * real code in the diff, and converts its JSON verdict into findings.
 *
 * Failures of the cloud path (gateway unreachable, non-2xx status,
 * unreadable or unparseable response) are reported as a single `info`
 * finding instead of being thrown, so a reviewer outage never blocks a PR.
 *
 * @param claims Ship-claims extracted from the PR; an empty list skips the
 *   cloud call entirely.
 * @param diff Raw PR diff; only the first MAX_DIFF_CHARS characters are sent.
 * @returns One `info` finding recording that the check ran, one finding per
 *   claim the model marked `backed: false` (strong → block, moderate → warn,
 *   otherwise info), and one `warn` finding per unflagged gap.
 */
export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
  if (claims.length === 0) {
    return [
      infoFinding("no ship-claims extracted — skipping cloud inference", [
        "parser returned empty claim list; nothing to verify against cloud",
      ]),
    ];
  }

  const truncated =
    diff.length > MAX_DIFF_CHARS
      ? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
      : diff;

  // Build the reviewer prompt in the same shape as run_codereview's
  // review stage (llm_team_ui.py:10950), adapted for claim verification:
  //   "Task: ..."
  //   "Code: ..."
  //   "Review: bugs/security/perf/style/edge. Provide corrected code."
  // We add: claim list upfront + ask for structured JSON verdict.
  const systemMsg = [
    "You review pull-request diffs against the author's own ship-claims.",
    "For each claim, decide: is it backed by actual code in the diff, or is",
    "it placeholder / aspirational / unwired?",
    "",
    "A claim is BACKED when the diff contains a real code path that delivers",
    "the claimed behavior. A claim is NOT BACKED when:",
    " - the claim asserts functionality but the diff only adds types/fields",
    " with no consumer",
    " - the claim mentions tests but no test function was added",
    " - the claim claims integration but the integration point is a stub",
    " - the diff contains unimplemented!() / todo!() / TODO comments",
    " - the claim says 'works end-to-end' but the diff has no end-to-end test",
    "",
    "Respond with strict JSON only. No prose before or after. Shape:",
    "{",
    ' "claim_verdicts": [',
    ' {"claim_idx": 0, "backed": false, "evidence": "short reason"}',
    " ],",
    ' "unflagged_gaps": [',
    ' {"location": "file:line", "summary": "short description"}',
    " ]",
    "}",
  ].join("\n");

  const userMsg = [
    `Ship-claims the author made (numbered 0..N-1):`,
    claims.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
    "",
    `Diff:`,
    "```",
    truncated,
    "```",
    "",
    `For each numbered claim above, emit a claim_verdicts entry. For gaps the`,
    `author DIDN'T claim but that look like placeholder code, emit unflagged_gaps.`,
    `Strict JSON only, matching the shape described. No prose outside JSON.`,
  ].join("\n");

  let resp: Response;
  try {
    resp = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: "ollama_cloud",
        model: MODEL,
        messages: [
          { role: "system", content: systemMsg },
          { role: "user", content: userMsg },
        ],
        max_tokens: 3000,
        temperature: 0.2,
        think: true, // T3 overseer should reason — JSON shape is still required
      }),
      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
    });
  } catch (e: unknown) {
    // Cloud unreachable → soft-fail. Don't block a PR because the
    // reviewer model is down. Static + dynamic + kb still run.
    return [
      infoFinding("cloud inference unreachable — skipped", [
        `fetch failed: ${errorMessage(e).slice(0, 180)}`,
      ]),
    ];
  }

  if (!resp.ok) {
    // Reading the error body can itself fail (connection reset, timeout);
    // that must not turn a soft-fail into a thrown error.
    let errorBody: string;
    try {
      errorBody = await resp.text();
    } catch (e: unknown) {
      errorBody = `<unreadable: ${errorMessage(e)}>`;
    }
    return [
      infoFinding(`cloud inference returned ${resp.status} — skipped`, [
        `body: ${errorBody.slice(0, 200)}`,
      ]),
    ];
  }

  // The abort signal also covers the body read, and a 2xx body is not
  // guaranteed to be JSON — treat both like any other unparseable output.
  let body: ChatResponse | null;
  try {
    body = (await resp.json()) as ChatResponse | null;
  } catch (e: unknown) {
    return [
      infoFinding("cloud returned unparseable output — skipped", [
        `response body was not valid JSON: ${errorMessage(e).slice(0, 180)}`,
      ]),
    ];
  }

  // Anything other than a string (missing, null, structured parts) is
  // treated as empty so it falls through to the "unparseable" soft-fail.
  const rawContent = body?.choices?.[0]?.message?.content;
  const content = typeof rawContent === "string" ? rawContent : "";
  const totalTokens = body?.usage?.total_tokens ?? "?";

  const parsed = extractJson(content);
  if (!parsed) {
    return [
      infoFinding("cloud returned unparseable output — skipped", [
        `head: ${content.slice(0, 200)}`,
        `tokens: ${totalTokens}`,
      ]),
    ];
  }

  // The model may emit either field with the wrong type; only arrays are iterated.
  const verdicts = asArray(parsed.claim_verdicts);
  const gaps = asArray(parsed.unflagged_gaps);

  // One summary info finding so the verdict layer knows the check ran.
  const findings: Finding[] = [
    infoFinding(`cloud review completed (model=${MODEL}, tokens=${totalTokens})`, [
      `claim_verdicts: ${verdicts.length}, unflagged_gaps: ${gaps.length}`,
    ]),
  ];

  for (const v of verdicts) {
    if (!isRecord(v) || v.backed !== false) continue;
    const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
    const claim = claims[idx];
    // Out-of-range or non-integer index: the verdict cannot be tied to a claim.
    if (!claim) continue;
    // Strong+unbacked = BLOCK. That's the whole point of the auditor.
    const sev: Finding["severity"] =
      claim.strength === "strong" ? "block" : claim.strength === "moderate" ? "warn" : "info";
    findings.push({
      check: "inference",
      severity: sev,
      claim_text: claim.text,
      summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`,
      evidence: [
        `at ${claim.location}`,
        `cloud reason: ${toText(v.evidence, "no reason given").slice(0, 200)}`,
      ],
    });
  }

  for (const g of gaps) {
    const gap = isRecord(g) ? g : {};
    findings.push({
      check: "inference",
      severity: "warn",
      summary: `cloud-flagged gap not in any claim: ${toText(gap.summary, "?").slice(0, 120)}`,
      evidence: [`location: ${toText(gap.location, "?").slice(0, 140)}`],
    });
  }

  return findings;
}

// Builds an info-severity finding for this check.
function infoFinding(summary: string, evidence: string[]): Finding {
  return { check: "inference", severity: "info", summary, evidence };
}

// Message of a caught value, without assuming it is an Error instance.
function errorMessage(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

// True for any non-null object (plain objects and arrays alike).
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null;
}

// Returns the value when it is an array, otherwise an empty array.
function asArray(value: unknown): unknown[] {
  return Array.isArray(value) ? value : [];
}

// Renders a model-supplied field as text: strings as-is, null/undefined as
// the fallback, objects as JSON (String() would give "[object Object]").
function toText(value: unknown, fallback: string): string {
  if (typeof value === "string") return value;
  if (value == null) return fallback;
  if (typeof value === "object") return JSON.stringify(value) ?? fallback;
  return String(value);
}

// Lift the first balanced JSON object that parses out of the response.
// Tolerates leading prose, code fences, and model reasoning preamble when
// the cloud model ignored "strict JSON only." Text outside any object
// (including ``` fences) is skipped by the scan itself, so no fence
// stripping is needed. Returns null when no candidate parses.
function extractJson(text: string): Record<string, unknown> | null {
  let depth = 0;
  let start = -1;
  let inString = false;
  let escaped = false;

  for (let i = 0; i < text.length; i++) {
    const c = text[i];

    if (inString) {
      // Braces inside string values must not affect nesting depth.
      if (escaped) {
        escaped = false;
      } else if (c === "\\") {
        escaped = true;
      } else if (c === '"' || c === "\n") {
        // JSON strings cannot contain a raw newline, so a newline here means
        // the quote belonged to prose; bail out of string mode so one stray
        // quote cannot swallow the rest of the response.
        inString = false;
      }
      continue;
    }

    if (c === '"') {
      // Quotes in prose outside any object are irrelevant to brace matching.
      if (depth > 0) inString = true;
    } else if (c === "{") {
      if (depth === 0) start = i;
      depth++;
    } else if (c === "}") {
      // A stray closer in preamble prose must not push depth negative,
      // or no later object could ever balance back to zero.
      if (depth === 0) continue;
      depth--;
      if (depth === 0 && start >= 0) {
        try {
          // The candidate starts with "{" and ends with "}", so a successful
          // parse always yields a plain object.
          return JSON.parse(text.slice(start, i + 1)) as Record<string, unknown>;
        } catch {
          // Balanced but not JSON (e.g. a code snippet) — keep scanning.
          start = -1;
        }
      }
    }
  }
  return null;
}