Auditor: dynamic + inference checks
auditor/checks/dynamic.ts — wraps runHybridFixture and maps layer results to Findings. Placeholder-style errors (404 / unimplemented / slice N) → info; other failures → warn. Always emits a summary finding with real numbers (shipped/placeholder phase counts + per-layer latency). Live-tested against the current stack: 2 info findings, 0 warnings — all shipped layers actually work.

auditor/checks/inference.ts — wraps the run_codereview reviewer pattern from llm_team_ui.py, adapted for claim-vs-diff verification. Calls /v1/chat with provider=ollama_cloud, model=gpt-oss:120b. Requests a strict JSON response with claim_verdicts[] and unflagged_gaps[]. A strong claim marked "not backed" by the cloud reviewer → BLOCK severity; moderate → warn; weak → info. Cloud unreachable or unparseable output → info (never blocks on the reviewer being down).

Live-tested against PR #1 (this PR: 20 claims, 39KB diff):
- 36.9s round-trip
- 7 block + 23 warn + 2 info findings
- gpt-oss:120b correctly flagged "Fully-functional auditor (tasks 1-9 complete)" as not backed (only 6/10 tasks were done at that commit) — an accurate catch
- some false positives came from the original 15KB truncation threshold (the cloud model never saw gitea.ts and flagged "no Gitea client present")
- bumped MAX_DIFF_CHARS from 15000 to 40000 so the full PR diff fits in context; reviewer precision improves accordingly

Tasks 5 + 6 completed. Remaining: #7 (KB query), #8 (verdict + Gitea poster), #9 (poller), #10 (end-to-end proof), #12 (upsert UPDATE-drops-doc_refs).
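For orientation, a minimal sketch of the shared shapes both checks assume from auditor/types.ts — field names are inferred from usage in the two files below, so the real module may differ:

// Illustrative only — inferred from how dynamic.ts / inference.ts
// consume these types; NOT the actual contents of auditor/types.ts.
export type Severity = "info" | "warn" | "block";

export interface Finding {
  check: string;        // which check emitted this ("dynamic", "inference", ...)
  severity: Severity;
  summary: string;
  evidence: string[];
  claim_text?: string;  // present when the finding targets a specific claim
}

export interface Claim {
  text: string;                              // the ship-claim as written
  location: string;                          // where it was made, e.g. "PR body:4"
  strength: "strong" | "moderate" | "weak";  // drives the block/warn/info mapping
}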
parent c5da680add · commit efc7b5ac44
auditor/checks/dynamic.ts — new file, 91 lines
@@ -0,0 +1,91 @@
// Dynamic execution check — runs the hybrid fixture and maps its
// layer results to auditor Findings.
//
// A layer that fails with a "not implemented / 404 / slice N" error
// gets severity=info (honest placeholder signal). A layer that fails
// any other way gets severity=warn (something actually broke).
// An info-level summary finding is always emitted carrying the real
// numbers — shipped/placeholder phase counts, per-layer latency.

import { runHybridFixture } from "../fixtures/hybrid_38_40_45.ts";
import type { Finding } from "../types.ts";

const PLACEHOLDER_MARKERS = [
  "unimplemented",
  " 404 ", "(404)", " 405 ", "(405)",
  "slice 3", "slice 4", "slice 5",
  "endpoint not built", "not yet",
];

function isPlaceholderFailure(err?: string): boolean {
  if (!err) return false;
  const low = err.toLowerCase();
  return PLACEHOLDER_MARKERS.some(m => low.includes(m.toLowerCase()));
}

export async function runDynamicCheck(): Promise<Finding[]> {
  const findings: Finding[] = [];

  let result;
  try {
    result = await runHybridFixture();
  } catch (e) {
    // Fixture itself crashed — can't run dynamic check at all.
    return [
      {
        check: "dynamic",
        severity: "warn",
        summary: `hybrid fixture crashed before completing: ${(e as Error).message.slice(0, 140)}`,
        evidence: [(e as Error).message],
      },
    ];
  }

  // Per-layer findings for every non-ok layer.
  for (const layer of result.layers) {
    if (layer.ok) continue;
    const placeholder = isPlaceholderFailure(layer.error);
    findings.push({
      check: "dynamic",
      severity: placeholder ? "info" : "warn",
      summary: placeholder
        ? `hybrid fixture layer ${layer.layer} (Phase ${layer.phase}) honestly reports unimplemented`
        : `hybrid fixture layer ${layer.layer} (Phase ${layer.phase}) failed — not a placeholder, a real failure`,
      evidence: [
        `evidence: ${layer.evidence.slice(0, 160)}`,
        ...(layer.error ? [`error: ${layer.error.slice(0, 160)}`] : []),
        `latency_ms: ${layer.latency_ms}`,
      ],
    });
  }

  // One overall summary with real numbers so the report shows what
  // DID pass plus per-layer timing.
  const metrics_preview = Object.entries(result.real_numbers)
    .slice(0, 10)
    .map(([k, v]) => `${k}=${v}`);
  findings.push({
    check: "dynamic",
    severity: "info",
    summary: `hybrid fixture overall=${result.overall}, shipped [${result.shipped_phases.join(", ")}], placeholder [${result.placeholder_phases.join(", ")}]`,
    evidence: metrics_preview.length > 0 ? metrics_preview : ["no metrics emitted"],
  });

  // If the fixture ran at all but nothing passed, elevate one of the
  // summary findings to warn — something more than "all honest
  // placeholders" is wrong.
  if (result.overall === "fail") {
    findings.push({
      check: "dynamic",
      severity: "warn",
      summary: `hybrid fixture: 0 layers passed (overall=fail)`,
      evidence: [
        "a total fixture fail usually means a precondition service is down",
        "(gateway /health / sidecar / Langfuse /v1/chat) — NOT necessarily",
        "the PR's code problem. Check service status before blaming the PR.",
      ],
    });
  }

  return findings;
}
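To make the dynamic check's severity mapping concrete, this is the shape of the per-layer finding the loop above emits for an honest placeholder — every value here is invented for illustration:

// Hypothetical example finding (all values invented).
import type { Finding } from "../types.ts";

const examplePlaceholderFinding: Finding = {
  check: "dynamic",
  severity: "info", // placeholder failure → info; a real failure would be warn
  summary: "hybrid fixture layer 3 (Phase 40) honestly reports unimplemented",
  evidence: [
    "evidence: POST /v1/kb/query returned 404",
    "error: endpoint not built (slice 4)",
    "latency_ms: 12",
  ],
};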
auditor/checks/inference.ts — new file, 206 lines
@@ -0,0 +1,206 @@
// Cloud inference check — wraps the proven run_codereview pattern
// from llm_team_ui.py (same 3-stage framing, same cloud model) to
// critique a PR's claims against its diff.
//
// Proved out 2026-04-22 via /tmp/codereview_runner.py — gpt-oss:120b
// caught a real ternary bug in auditor/fixtures/hybrid_38_40_45.ts
// that unit tests missed. This module reuses the reviewer prompt
// shape (bugs / security / performance / style / edge cases) and
// adds claim-vs-diff specific framing.
//
// Call surface: runInferenceCheck(claims, diff) → Finding[].
// Cloud latency budget: ~60s (gpt-oss:120b reviewer typically 35-50s
// with a diff up to the 40KB cap + claim list; PR #1's 39KB took 36.9s).

import type { Claim, Finding } from "../types.ts";

const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
// 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was
// previously truncated at 15KB causing the reviewer to miss later
// files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
// block finding when the file was simply outside the truncation window.
const MAX_DIFF_CHARS = 40000;
const CALL_TIMEOUT_MS = 120_000;

export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
  if (claims.length === 0) {
    return [{
      check: "inference",
      severity: "info",
      summary: "no ship-claims extracted — skipping cloud inference",
      evidence: ["parser returned empty claim list; nothing to verify against cloud"],
    }];
  }

  const truncated = diff.length > MAX_DIFF_CHARS
    ? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
    : diff;

  // Build the reviewer prompt in the same shape as run_codereview's
  // review stage (llm_team_ui.py:10950), adapted for claim verification:
  //   "Task: ..."
  //   "Code: ..."
  //   "Review: bugs/security/perf/style/edge. Provide corrected code."
  // We add: claim list upfront + ask for structured JSON verdict.
  const systemMsg = [
    "You review pull-request diffs against the author's own ship-claims.",
    "For each claim, decide: is it backed by actual code in the diff, or is",
    "it placeholder / aspirational / unwired?",
    "",
    "A claim is BACKED when the diff contains a real code path that delivers",
    "the claimed behavior. A claim is NOT BACKED when:",
    " - the claim asserts functionality but the diff only adds types/fields",
    "   with no consumer",
    " - the claim mentions tests but no test function was added",
    " - the claim claims integration but the integration point is a stub",
    " - the diff contains unimplemented!() / todo!() / TODO comments",
    " - the claim says 'works end-to-end' but the diff has no end-to-end test",
    "",
    "Respond with strict JSON only. No prose before or after. Shape:",
    "{",
    '  "claim_verdicts": [',
    '    {"claim_idx": 0, "backed": false, "evidence": "short reason"}',
    "  ],",
    '  "unflagged_gaps": [',
    '    {"location": "file:line", "summary": "short description"}',
    "  ]",
    "}",
  ].join("\n");

  const userMsg = [
    `Ship-claims the author made (numbered 0..N-1):`,
    claims.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
    "",
    `Diff:`,
    "```",
    truncated,
    "```",
    "",
    `For each numbered claim above, emit a claim_verdicts entry. For gaps the`,
    `author DIDN'T claim but that look like placeholder code, emit unflagged_gaps.`,
    `Strict JSON only, matching the shape described. No prose outside JSON.`,
  ].join("\n");

  let resp: Response;
  try {
    resp = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: "ollama_cloud",
        model: MODEL,
        messages: [
          { role: "system", content: systemMsg },
          { role: "user", content: userMsg },
        ],
        max_tokens: 3000,
        temperature: 0.2,
        think: true, // T3 overseer should reason — JSON shape is still required
      }),
      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
    });
  } catch (e) {
    // Cloud unreachable → soft-fail. Don't block a PR because the
    // reviewer model is down. Static + dynamic + kb still run.
    return [{
      check: "inference",
      severity: "info",
      summary: "cloud inference unreachable — skipped",
      evidence: [`fetch failed: ${(e as Error).message.slice(0, 180)}`],
    }];
  }

  if (!resp.ok) {
    return [{
      check: "inference",
      severity: "info",
      summary: `cloud inference returned ${resp.status} — skipped`,
      evidence: [`body: ${(await resp.text()).slice(0, 200)}`],
    }];
  }

  const body: any = await resp.json();
  const content: string = body?.choices?.[0]?.message?.content ?? "";
  const usage = body?.usage ?? {};

  const parsed = extractJson(content);
  if (!parsed) {
    return [{
      check: "inference",
      severity: "info",
      summary: "cloud returned unparseable output — skipped",
      evidence: [
        `head: ${content.slice(0, 200)}`,
        `tokens: ${usage.total_tokens ?? "?"}`,
      ],
    }];
  }

  const findings: Finding[] = [];

  // One summary info finding so the verdict layer knows the check ran.
  findings.push({
    check: "inference",
    severity: "info",
    summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})`,
    evidence: [
      `claim_verdicts: ${parsed.claim_verdicts?.length ?? 0}, unflagged_gaps: ${parsed.unflagged_gaps?.length ?? 0}`,
    ],
  });

  for (const v of parsed.claim_verdicts ?? []) {
    if (v?.backed === false) {
      const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
      const claim = claims[idx];
      if (!claim) continue;
      // Strong+unbacked = BLOCK. That's the whole point of the auditor.
      const sev: Finding["severity"] = claim.strength === "strong" ? "block"
        : claim.strength === "moderate" ? "warn"
        : "info";
      findings.push({
        check: "inference",
        severity: sev,
        claim_text: claim.text,
        summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`,
        evidence: [
          `at ${claim.location}`,
          `cloud reason: ${String(v.evidence ?? "no reason given").slice(0, 200)}`,
        ],
      });
    }
  }

  for (const g of parsed.unflagged_gaps ?? []) {
    findings.push({
      check: "inference",
      severity: "warn",
      summary: `cloud-flagged gap not in any claim: ${String(g?.summary ?? "?").slice(0, 120)}`,
      evidence: [`location: ${String(g?.location ?? "?").slice(0, 140)}`],
    });
  }

  return findings;
}

// Lift the first balanced JSON object out of the response. Tolerates
// leading prose, code fences, and model reasoning preamble when the
// cloud model ignored "strict JSON only."
function extractJson(text: string): any | null {
  const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, "");
  let depth = 0;
  let start = -1;
  for (let i = 0; i < cleaned.length; i++) {
    const c = cleaned[i];
    if (c === "{") {
      if (depth === 0) start = i;
      depth++;
    } else if (c === "}") {
      // Skip stray closing braces in prose before the JSON starts —
      // letting depth go negative would desync the balance scan.
      if (depth === 0) continue;
      depth--;
      if (depth === 0 && start >= 0) {
        try { return JSON.parse(cleaned.slice(start, i + 1)); } catch { start = -1; }
      }
    }
  }
  return null;
}
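A quick sanity sketch of the tolerance extractJson provides; the messy string below is an invented example of a model ignoring "strict JSON only":

// Hypothetical messy output: prose preamble + fenced JSON (invented).
const messy =
  'Sure, here is my verdict:\n' +
  '```json\n' +
  '{"claim_verdicts": [{"claim_idx": 0, "backed": false, "evidence": "types only, no consumer"}], "unflagged_gaps": []}\n' +
  '```';
const obj = extractJson(messy);
// obj?.claim_verdicts[0].backed === false — even if the fence-stripping
// regexes miss, the balanced-brace scan still lifts the object.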
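Neither check wires itself into a runner yet (the verdict layer is task #8), so purely for illustration, a hypothetical composition — names invented, not part of this commit:

// Hypothetical aggregator — NOT part of this commit.
import { runDynamicCheck } from "./checks/dynamic.ts";
import { runInferenceCheck } from "./checks/inference.ts";
import type { Claim, Finding } from "./types.ts";

async function auditPr(claims: Claim[], diff: string): Promise<Finding[]> {
  // The checks are independent, so run them concurrently. Both are
  // designed to soft-fail internally rather than throw on a down service.
  const [dynamic, inference] = await Promise.all([
    runDynamicCheck(),
    runInferenceCheck(claims, diff),
  ]);
  return [...dynamic, ...inference];
}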