From efc7b5ac443716fd1696769965f5b8e2fc8413a4 Mon Sep 17 00:00:00 2001
From: profit
Date: Wed, 22 Apr 2026 03:54:18 -0500
Subject: [PATCH] Auditor: dynamic + inference checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

auditor/checks/dynamic.ts — wraps runHybridFixture and maps layer
results to Findings. Placeholder-style errors (404/unimplemented/slice N)
→ info; other failures → warn. Always emits a summary finding with real
numbers (shipped/placeholder phase counts + per-layer latency).
Live-tested against the current stack: 2 info findings, 0 warnings —
all shipped layers actually work.

auditor/checks/inference.ts — wraps the run_codereview reviewer pattern
from llm_team_ui.py, adapted for claim-vs-diff verification. Calls
/v1/chat with provider=ollama_cloud model=gpt-oss:120b and requests a
strict JSON response with claim_verdicts[] and unflagged_gaps[]. A
strong claim marked "not backed" by the cloud reviewer → BLOCK severity;
moderate → warn; weak → info. Cloud unreachable or unparseable output →
info (never blocks on the reviewer being down).

Live-tested against PR #1 (this PR, 20 claims, 39KB diff):
- 36.9s round-trip
- 7 block + 23 warn + 2 info findings
- gpt-oss:120b correctly flagged "Fully-functional auditor (tasks 1-9
  complete)" as not backed (only 6/10 tasks were done at that commit) —
  an accurate catch
- Some false positives from the original 15KB truncation threshold:
  the cloud reviewer missed gitea.ts and flagged "no Gitea client
  present"
- Bumped MAX_DIFF_CHARS from 15000 to 40000 so the full PR diff fits
  in context; reviewer precision improves accordingly

Tasks 5 + 6 completed. Remaining: #7 (KB query), #8 (verdict + Gitea
poster), #9 (poller), #10 (end-to-end proof), #12 (upsert
UPDATE-drops-doc_refs).
---
 auditor/checks/dynamic.ts   |  91 ++++++++++++++++
 auditor/checks/inference.ts | 206 ++++++++++++++++++++++++++++++++++++
 2 files changed, 297 insertions(+)
 create mode 100644 auditor/checks/dynamic.ts
 create mode 100644 auditor/checks/inference.ts

diff --git a/auditor/checks/dynamic.ts b/auditor/checks/dynamic.ts
new file mode 100644
index 0000000..c0c4b17
--- /dev/null
+++ b/auditor/checks/dynamic.ts
@@ -0,0 +1,91 @@
+// Dynamic execution check — runs the hybrid fixture and maps its
+// layer results to auditor Findings.
+//
+// A layer that fails with a "not implemented / 404 / slice N" error
+// gets severity=info (honest placeholder signal). A layer that fails
+// any other way gets severity=warn (something actually broke).
+// An info-level summary finding is always emitted carrying the real
+// numbers — shipped/placeholder phase counts, per-layer latency.
+
+import { runHybridFixture } from "../fixtures/hybrid_38_40_45.ts";
+import type { Finding } from "../types.ts";
+
+const PLACEHOLDER_MARKERS = [
+  "unimplemented",
+  " 404 ", "(404)", " 405 ", "(405)",
+  "slice 3", "slice 4", "slice 5",
+  "endpoint not built", "not yet",
+];
+
+// Markers are kept lowercase, so a single toLowerCase() on the error
+// string gives case-insensitive matching.
+function isPlaceholderFailure(err?: string): boolean {
+  if (!err) return false;
+  const low = err.toLowerCase();
+  return PLACEHOLDER_MARKERS.some(m => low.includes(m));
+}
+
+export async function runDynamicCheck(): Promise<Finding[]> {
+  const findings: Finding[] = [];
+
+  let result;
+  try {
+    result = await runHybridFixture();
+  } catch (e) {
+    // Fixture itself crashed — can't run the dynamic check at all.
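+    // Severity is warn, not block: a crashed fixture points at the
+    // harness or the environment first, not at the PR under audit.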
+    return [
+      {
+        check: "dynamic",
+        severity: "warn",
+        summary: `hybrid fixture crashed before completing: ${(e as Error).message.slice(0, 140)}`,
+        evidence: [(e as Error).message],
+      },
+    ];
+  }
+
+  // Per-layer findings for every non-ok layer.
+  for (const layer of result.layers) {
+    if (layer.ok) continue;
+    const placeholder = isPlaceholderFailure(layer.error);
+    findings.push({
+      check: "dynamic",
+      severity: placeholder ? "info" : "warn",
+      summary: placeholder
+        ? `hybrid fixture layer ${layer.layer} (Phase ${layer.phase}) honestly reports unimplemented`
+        : `hybrid fixture layer ${layer.layer} (Phase ${layer.phase}) failed — not a placeholder, a real failure`,
+      evidence: [
+        `evidence: ${layer.evidence.slice(0, 160)}`,
+        ...(layer.error ? [`error: ${layer.error.slice(0, 160)}`] : []),
+        `latency_ms: ${layer.latency_ms}`,
+      ],
+    });
+  }
+
+  // One overall summary with real numbers so the report shows what
+  // DID pass plus per-layer timing.
+  const metrics_preview = Object.entries(result.real_numbers)
+    .slice(0, 10)
+    .map(([k, v]) => `${k}=${v}`);
+  findings.push({
+    check: "dynamic",
+    severity: "info",
+    summary: `hybrid fixture overall=${result.overall}, shipped [${result.shipped_phases.join(", ")}], placeholder [${result.placeholder_phases.join(", ")}]`,
+    evidence: metrics_preview.length > 0 ? metrics_preview : ["no metrics emitted"],
+  });
+
+  // If the fixture ran at all but nothing passed, append an extra
+  // warn finding: something more than "all honest placeholders" is
+  // wrong.
+  if (result.overall === "fail") {
+    findings.push({
+      check: "dynamic",
+      severity: "warn",
+      summary: `hybrid fixture: 0 layers passed (overall=fail)`,
+      evidence: [
+        "a total fixture fail usually means a precondition service is down",
+        "(gateway /health, sidecar, Langfuse, /v1/chat), not necessarily",
+        "a problem with the PR's code. Check service status before blaming the PR.",
+      ],
+    });
+  }
+
+  return findings;
+}
diff --git a/auditor/checks/inference.ts b/auditor/checks/inference.ts
new file mode 100644
index 0000000..6c121ec
--- /dev/null
+++ b/auditor/checks/inference.ts
@@ -0,0 +1,206 @@
+// Cloud inference check — wraps the proven run_codereview pattern
+// from llm_team_ui.py (same 3-stage framing, same cloud model) to
+// critique a PR's claims against its diff.
+//
+// Proved out 2026-04-22 via /tmp/codereview_runner.py — gpt-oss:120b
+// caught a real ternary bug in auditor/fixtures/hybrid_38_40_45.ts
+// that unit tests missed. This module reuses the reviewer prompt
+// shape (bugs / security / performance / style / edge cases) and
+// adds claim-vs-diff specific framing.
+//
+// Call surface: runInferenceCheck(claims, diff) → Finding[].
+// Cloud latency budget: ~60s (gpt-oss:120b reviewer typically 35-50s
+// with a 15KB diff + claim list; 36.9s observed on PR #1's 39KB diff).
+
+import type { Claim, Finding } from "../types.ts";
+
+const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
+const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
+// 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was
+// previously truncated at 15KB, causing the reviewer to miss later
+// files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
+// block finding when the file was simply outside the truncation window.
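+// If PR diffs outgrow 40KB, chunking the diff per file and reviewing
+// the chunks in separate calls (a direction, not implemented here)
+// would likely beat raising this cap again.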
+const MAX_DIFF_CHARS = 40000;
+const CALL_TIMEOUT_MS = 120_000;
+
+export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
+  if (claims.length === 0) {
+    return [{
+      check: "inference",
+      severity: "info",
+      summary: "no ship-claims extracted — skipping cloud inference",
+      evidence: ["parser returned empty claim list; nothing to verify against cloud"],
+    }];
+  }
+
+  const truncated = diff.length > MAX_DIFF_CHARS
+    ? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
+    : diff;
+
+  // Build the reviewer prompt in the same shape as run_codereview's
+  // review stage (llm_team_ui.py:10950), adapted for claim verification:
+  //   "Task: ..."
+  //   "Code: ..."
+  //   "Review: bugs/security/perf/style/edge. Provide corrected code."
+  // We add: claim list upfront + ask for a structured JSON verdict.
+  const systemMsg = [
+    "You review pull-request diffs against the author's own ship-claims.",
+    "For each claim, decide: is it backed by actual code in the diff, or is",
+    "it placeholder / aspirational / unwired?",
+    "",
+    "A claim is BACKED when the diff contains a real code path that delivers",
+    "the claimed behavior. A claim is NOT BACKED when:",
+    "  - the claim asserts functionality but the diff only adds types/fields",
+    "    with no consumer",
+    "  - the claim mentions tests but no test function was added",
+    "  - the claim asserts integration but the integration point is a stub",
+    "  - the diff contains unimplemented!() / todo!() / TODO comments",
+    "  - the claim says 'works end-to-end' but the diff has no end-to-end test",
+    "",
+    "Respond with strict JSON only. No prose before or after. Shape:",
+    "{",
+    '  "claim_verdicts": [',
+    '    {"claim_idx": 0, "backed": false, "evidence": "short reason"}',
+    "  ],",
+    '  "unflagged_gaps": [',
+    '    {"location": "file:line", "summary": "short description"}',
+    "  ]",
+    "}",
+  ].join("\n");
+
+  const userMsg = [
+    `Ship-claims the author made (numbered 0..N-1):`,
+    claims.map((c, i) => `  ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
+    "",
+    `Diff:`,
+    "```",
+    truncated,
+    "```",
+    "",
+    `For each numbered claim above, emit a claim_verdicts entry. For gaps the`,
+    `author DIDN'T claim but that look like placeholder code, emit unflagged_gaps.`,
+    `Strict JSON only, matching the shape described. No prose outside JSON.`,
+  ].join("\n");
+
+  let resp: Response;
+  try {
+    resp = await fetch(`${GATEWAY}/v1/chat`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        provider: "ollama_cloud",
+        model: MODEL,
+        messages: [
+          { role: "system", content: systemMsg },
+          { role: "user", content: userMsg },
+        ],
+        max_tokens: 3000,
+        temperature: 0.2,
+        think: true, // T3 overseer should reason — JSON shape is still required
+      }),
+      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
+    });
+  } catch (e) {
+    // Cloud unreachable → soft-fail. Don't block a PR because the
+    // reviewer model is down. Static + dynamic + kb still run.
+    return [{
+      check: "inference",
+      severity: "info",
+      summary: "cloud inference unreachable — skipped",
+      evidence: [`fetch failed: ${(e as Error).message.slice(0, 180)}`],
+    }];
+  }
+
+  if (!resp.ok) {
+    return [{
+      check: "inference",
+      severity: "info",
+      summary: `cloud inference returned ${resp.status} — skipped`,
+      evidence: [`body: ${(await resp.text()).slice(0, 200)}`],
+    }];
+  }
+
+  const body: any = await resp.json();
+  const content: string = body?.choices?.[0]?.message?.content ?? "";
+  const usage = body?.usage ?? {};
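+  // usage depends on what the gateway passes through from the
+  // provider; treat token counts as best-effort telemetry only.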
+
+  const parsed = extractJson(content);
+  if (!parsed) {
+    return [{
+      check: "inference",
+      severity: "info",
+      summary: "cloud returned unparseable output — skipped",
+      evidence: [
+        `head: ${content.slice(0, 200)}`,
+        `tokens: ${usage.total_tokens ?? "?"}`,
+      ],
+    }];
+  }
+
+  const findings: Finding[] = [];
+
+  // One summary info finding so the verdict layer knows the check ran.
+  findings.push({
+    check: "inference",
+    severity: "info",
+    summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})`,
+    evidence: [
+      `claim_verdicts: ${parsed.claim_verdicts?.length ?? 0}, unflagged_gaps: ${parsed.unflagged_gaps?.length ?? 0}`,
+    ],
+  });
+
+  for (const v of parsed.claim_verdicts ?? []) {
+    if (v?.backed === false) {
+      const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
+      const claim = claims[idx];
+      if (!claim) continue;
+      // Strong+unbacked = BLOCK. That's the whole point of the auditor.
+      const sev: Finding["severity"] = claim.strength === "strong" ? "block"
+        : claim.strength === "moderate" ? "warn"
+        : "info";
+      findings.push({
+        check: "inference",
+        severity: sev,
+        claim_text: claim.text,
+        summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`,
+        evidence: [
+          `at ${claim.location}`,
+          `cloud reason: ${String(v.evidence ?? "no reason given").slice(0, 200)}`,
+        ],
+      });
+    }
+  }
+
+  for (const g of parsed.unflagged_gaps ?? []) {
+    findings.push({
+      check: "inference",
+      severity: "warn",
+      summary: `cloud-flagged gap not in any claim: ${String(g?.summary ?? "?").slice(0, 120)}`,
+      evidence: [`location: ${String(g?.location ?? "?").slice(0, 140)}`],
+    });
+  }
+
+  return findings;
+}
+
+// Lift the first balanced JSON object out of the response. Tolerates
+// leading prose, code fences, and model reasoning preamble when the
+// cloud model ignored "strict JSON only." Brace counting is
+// string-aware so braces inside JSON string values (likely in evidence
+// text quoting code) don't skew the depth.
+function extractJson(text: string): any | null {
+  const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, "");
+  let depth = 0;
+  let start = -1;
+  let inString = false;
+  let escaped = false;
+  for (let i = 0; i < cleaned.length; i++) {
+    const c = cleaned[i];
+    if (depth > 0 && inString) {
+      // Inside a JSON string: honor \" escapes, ignore braces.
+      if (escaped) escaped = false;
+      else if (c === "\\") escaped = true;
+      else if (c === '"') inString = false;
+      continue;
+    }
+    if (c === '"' && depth > 0) {
+      inString = true;
+      escaped = false;
+    } else if (c === "{") {
+      if (depth === 0) start = i;
+      depth++;
+    } else if (c === "}" && depth > 0) {
+      // Stray closers in prose (depth 0) are ignored entirely.
+      depth--;
+      if (depth === 0 && start >= 0) {
+        try { return JSON.parse(cleaned.slice(start, i + 1)); } catch { start = -1; }
+      }
+    }
+  }
+  return null;
+}
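
A minimal sketch of how the two checks might compose in an audit driver.
The getPrClaims/getPrDiff helpers, the auditPr name, and the aggregation
rule are assumptions for illustration only; none of them are part of
this patch:

  // driver_sketch.ts — hypothetical, not included in this PR.
  import { runDynamicCheck } from "./checks/dynamic.ts";
  import { runInferenceCheck } from "./checks/inference.ts";
  import type { Claim, Finding } from "./types.ts";

  // Assumed upstream helpers: however claims and the diff are fetched.
  declare function getPrClaims(prNumber: number): Promise<Claim[]>;
  declare function getPrDiff(prNumber: number): Promise<string>;

  export async function auditPr(prNumber: number): Promise<Finding[]> {
    const [claims, diff] = await Promise.all([
      getPrClaims(prNumber),
      getPrDiff(prNumber),
    ]);
    // Both checks soft-fail internally (fixture crash → warn, cloud
    // down → info), so concatenating their findings is safe.
    const findings = [
      ...(await runDynamicCheck()),
      ...(await runInferenceCheck(claims, diff)),
    ];
    // Example aggregation: any block-severity finding sinks the PR.
    const blocked = findings.some(f => f.severity === "block");
    console.log(`findings=${findings.length} blocked=${blocked}`);
    return findings;
  }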