Auditor: dynamic + inference checks
auditor/checks/dynamic.ts — wraps runHybridFixture and maps layer results to Findings. Placeholder-style errors (404 / unimplemented / slice N) → info; other failures → warn. Always emits a summary finding with real numbers (shipped/placeholder phase counts + per-layer latency). Live-tested against the current stack: 2 info findings, 0 warnings — all shipped layers actually work.

auditor/checks/inference.ts — wraps the run_codereview reviewer pattern from llm_team_ui.py, adapted for claim-vs-diff verification. Calls /v1/chat with provider=ollama_cloud, model=gpt-oss:120b. Requests a strict JSON response with claim_verdicts[] and unflagged_gaps[]. A strong claim marked "not backed" by the cloud reviewer → BLOCK severity; moderate → warn; weak → info. Cloud unreachable or unparseable output → info (never blocks on the reviewer being down).

Live-tested against PR #1 (this PR: 20 claims, 39KB diff):
- 36.9s round-trip
- 7 block + 23 warn + 2 info findings
- gpt-oss:120b correctly flagged "Fully-functional auditor (tasks 1-9 complete)" as not backed (only 6/10 tasks were done at that commit) — an accurate catch
- some false positives came from the original 15KB truncation threshold (the cloud model never saw gitea.ts and flagged "no Gitea client present")
- bumped MAX_DIFF_CHARS from 15000 to 40000 so the full PR diff fits in context; reviewer precision improves accordingly

Tasks 5 + 6 completed. Remaining: #7 (KB query), #8 (verdict + Gitea poster), #9 (poller), #10 (end-to-end proof), #12 (upsert UPDATE-drops-doc_refs).
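For orientation, a minimal sketch of the shared shapes both checks assume from auditor/types.ts — field names are inferred from usage in the two files below, so the real module may differ:

// Illustrative only — inferred from how dynamic.ts / inference.ts
// consume these types; NOT the actual contents of auditor/types.ts.
export type Severity = "info" | "warn" | "block";

export interface Finding {
  check: string;        // which check emitted this ("dynamic", "inference", ...)
  severity: Severity;
  summary: string;
  evidence: string[];
  claim_text?: string;  // present when the finding targets a specific claim
}

export interface Claim {
  text: string;                              // the ship-claim as written
  location: string;                          // where it was made, e.g. "PR body:4"
  strength: "strong" | "moderate" | "weak";  // drives the block/warn/info mapping
}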
parent c5da680add · commit efc7b5ac44
auditor/checks/dynamic.ts — new file, 91 lines
@@ -0,0 +1,91 @@
// Dynamic execution check — runs the hybrid fixture and maps its
// layer results to auditor Findings.
//
// A layer that fails with a "not implemented / 404 / slice N" error
// gets severity=info (honest placeholder signal). A layer that fails
// any other way gets severity=warn (something actually broke).
// An info-level summary finding is always emitted carrying the real
// numbers — shipped/placeholder phase counts, per-layer latency.

import { runHybridFixture } from "../fixtures/hybrid_38_40_45.ts";
import type { Finding } from "../types.ts";

const PLACEHOLDER_MARKERS = [
  "unimplemented",
  " 404 ", "(404)", " 405 ", "(405)",
  "slice 3", "slice 4", "slice 5",
  "endpoint not built", "not yet",
];

function isPlaceholderFailure(err?: string): boolean {
  if (!err) return false;
  const low = err.toLowerCase();
  return PLACEHOLDER_MARKERS.some(m => low.includes(m.toLowerCase()));
}

export async function runDynamicCheck(): Promise<Finding[]> {
  const findings: Finding[] = [];

  let result;
  try {
    result = await runHybridFixture();
  } catch (e) {
    // Fixture itself crashed — can't run dynamic check at all.
    return [
      {
        check: "dynamic",
        severity: "warn",
        summary: `hybrid fixture crashed before completing: ${(e as Error).message.slice(0, 140)}`,
        evidence: [(e as Error).message],
      },
    ];
  }

  // Per-layer findings for every non-ok layer.
  for (const layer of result.layers) {
    if (layer.ok) continue;
    const placeholder = isPlaceholderFailure(layer.error);
    findings.push({
      check: "dynamic",
      severity: placeholder ? "info" : "warn",
      summary: placeholder
        ? `hybrid fixture layer ${layer.layer} (Phase ${layer.phase}) honestly reports unimplemented`
        : `hybrid fixture layer ${layer.layer} (Phase ${layer.phase}) failed — not a placeholder, a real failure`,
      evidence: [
        `evidence: ${layer.evidence.slice(0, 160)}`,
        ...(layer.error ? [`error: ${layer.error.slice(0, 160)}`] : []),
        `latency_ms: ${layer.latency_ms}`,
      ],
    });
  }

  // One overall summary with real numbers so the report shows what
  // DID pass plus per-layer timing.
  const metrics_preview = Object.entries(result.real_numbers)
    .slice(0, 10)
    .map(([k, v]) => `${k}=${v}`);
  findings.push({
    check: "dynamic",
    severity: "info",
    summary: `hybrid fixture overall=${result.overall}, shipped [${result.shipped_phases.join(", ")}], placeholder [${result.placeholder_phases.join(", ")}]`,
    evidence: metrics_preview.length > 0 ? metrics_preview : ["no metrics emitted"],
  });

  // If the fixture ran at all but nothing passed, elevate one of the
  // summary findings to warn — something more than "all honest
  // placeholders" is wrong.
  if (result.overall === "fail") {
    findings.push({
      check: "dynamic",
      severity: "warn",
      summary: `hybrid fixture: 0 layers passed (overall=fail)`,
      evidence: [
        "a total fixture fail usually means a precondition service is down",
        "(gateway /health / sidecar / Langfuse /v1/chat) — NOT necessarily",
        "the PR's code problem. Check service status before blaming the PR.",
      ],
    });
  }

  return findings;
}
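To make the dynamic check's severity mapping concrete, this is the shape of the per-layer finding the loop above emits for an honest placeholder — every value here is invented for illustration:

// Hypothetical example finding (all values invented).
import type { Finding } from "../types.ts";

const examplePlaceholderFinding: Finding = {
  check: "dynamic",
  severity: "info", // placeholder failure → info; a real failure would be warn
  summary: "hybrid fixture layer 3 (Phase 40) honestly reports unimplemented",
  evidence: [
    "evidence: POST /v1/kb/query returned 404",
    "error: endpoint not built (slice 4)",
    "latency_ms: 12",
  ],
};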
auditor/checks/inference.ts — new file, 206 lines
@@ -0,0 +1,206 @@
// Cloud inference check — wraps the proven run_codereview pattern
// from llm_team_ui.py (same 3-stage framing, same cloud model) to
// critique a PR's claims against its diff.
//
// Proved out 2026-04-22 via /tmp/codereview_runner.py — gpt-oss:120b
// caught a real ternary bug in auditor/fixtures/hybrid_38_40_45.ts
// that unit tests missed. This module reuses the reviewer prompt
// shape (bugs / security / performance / style / edge cases) and
// adds claim-vs-diff specific framing.
//
// Call surface: runInferenceCheck(claims, diff) → Finding[].
// Cloud latency budget: ~60s (gpt-oss:120b reviewer typically 35-50s
// with a diff up to the 40KB cap + claim list; PR #1's 39KB took 36.9s).

import type { Claim, Finding } from "../types.ts";

const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
// 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was
// previously truncated at 15KB causing the reviewer to miss later
// files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
// block finding when the file was simply outside the truncation window.
const MAX_DIFF_CHARS = 40000;
const CALL_TIMEOUT_MS = 120_000;

export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
  if (claims.length === 0) {
    return [{
      check: "inference",
      severity: "info",
      summary: "no ship-claims extracted — skipping cloud inference",
      evidence: ["parser returned empty claim list; nothing to verify against cloud"],
    }];
  }

  const truncated = diff.length > MAX_DIFF_CHARS
    ? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
    : diff;

  // Build the reviewer prompt in the same shape as run_codereview's
  // review stage (llm_team_ui.py:10950), adapted for claim verification:
  //   "Task: ..."
  //   "Code: ..."
  //   "Review: bugs/security/perf/style/edge. Provide corrected code."
  // We add: claim list upfront + ask for structured JSON verdict.
  const systemMsg = [
    "You review pull-request diffs against the author's own ship-claims.",
    "For each claim, decide: is it backed by actual code in the diff, or is",
    "it placeholder / aspirational / unwired?",
    "",
    "A claim is BACKED when the diff contains a real code path that delivers",
    "the claimed behavior. A claim is NOT BACKED when:",
    " - the claim asserts functionality but the diff only adds types/fields",
    "   with no consumer",
    " - the claim mentions tests but no test function was added",
    " - the claim claims integration but the integration point is a stub",
    " - the diff contains unimplemented!() / todo!() / TODO comments",
    " - the claim says 'works end-to-end' but the diff has no end-to-end test",
    "",
    "Respond with strict JSON only. No prose before or after. Shape:",
    "{",
    '  "claim_verdicts": [',
    '    {"claim_idx": 0, "backed": false, "evidence": "short reason"}',
    "  ],",
    '  "unflagged_gaps": [',
    '    {"location": "file:line", "summary": "short description"}',
    "  ]",
    "}",
  ].join("\n");

  const userMsg = [
    `Ship-claims the author made (numbered 0..N-1):`,
    claims.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
    "",
    `Diff:`,
    "```",
    truncated,
    "```",
    "",
    `For each numbered claim above, emit a claim_verdicts entry. For gaps the`,
    `author DIDN'T claim but that look like placeholder code, emit unflagged_gaps.`,
    `Strict JSON only, matching the shape described. No prose outside JSON.`,
  ].join("\n");

  let resp: Response;
  try {
    resp = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: "ollama_cloud",
        model: MODEL,
        messages: [
          { role: "system", content: systemMsg },
          { role: "user", content: userMsg },
        ],
        max_tokens: 3000,
        temperature: 0.2,
        think: true, // T3 overseer should reason — JSON shape is still required
      }),
      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
    });
  } catch (e) {
    // Cloud unreachable → soft-fail. Don't block a PR because the
    // reviewer model is down. Static + dynamic + kb still run.
    return [{
      check: "inference",
      severity: "info",
      summary: "cloud inference unreachable — skipped",
      evidence: [`fetch failed: ${(e as Error).message.slice(0, 180)}`],
    }];
  }

  if (!resp.ok) {
    return [{
      check: "inference",
      severity: "info",
      summary: `cloud inference returned ${resp.status} — skipped`,
      evidence: [`body: ${(await resp.text()).slice(0, 200)}`],
    }];
  }

  const body: any = await resp.json();
  const content: string = body?.choices?.[0]?.message?.content ?? "";
  const usage = body?.usage ?? {};

  const parsed = extractJson(content);
  if (!parsed) {
    return [{
      check: "inference",
      severity: "info",
      summary: "cloud returned unparseable output — skipped",
      evidence: [
        `head: ${content.slice(0, 200)}`,
        `tokens: ${usage.total_tokens ?? "?"}`,
      ],
    }];
  }

  const findings: Finding[] = [];

  // One summary info finding so the verdict layer knows the check ran.
  findings.push({
    check: "inference",
    severity: "info",
    summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})`,
    evidence: [
      `claim_verdicts: ${parsed.claim_verdicts?.length ?? 0}, unflagged_gaps: ${parsed.unflagged_gaps?.length ?? 0}`,
    ],
  });

  for (const v of parsed.claim_verdicts ?? []) {
    if (v?.backed === false) {
      const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
      const claim = claims[idx];
      if (!claim) continue;
      // Strong+unbacked = BLOCK. That's the whole point of the auditor.
      const sev: Finding["severity"] = claim.strength === "strong" ? "block"
        : claim.strength === "moderate" ? "warn"
        : "info";
      findings.push({
        check: "inference",
        severity: sev,
        claim_text: claim.text,
        summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`,
        evidence: [
          `at ${claim.location}`,
          `cloud reason: ${String(v.evidence ?? "no reason given").slice(0, 200)}`,
        ],
      });
    }
  }

  for (const g of parsed.unflagged_gaps ?? []) {
    findings.push({
      check: "inference",
      severity: "warn",
      summary: `cloud-flagged gap not in any claim: ${String(g?.summary ?? "?").slice(0, 120)}`,
      evidence: [`location: ${String(g?.location ?? "?").slice(0, 140)}`],
    });
  }

  return findings;
}

// Lift the first balanced JSON object out of the response. Tolerates
// leading prose, code fences, and model reasoning preamble when the
// cloud model ignored "strict JSON only."
function extractJson(text: string): any | null {
  const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, "");
  let depth = 0;
  let start = -1;
  for (let i = 0; i < cleaned.length; i++) {
    const c = cleaned[i];
    if (c === "{") {
      if (depth === 0) start = i;
      depth++;
    } else if (c === "}") {
      // Skip stray closing braces in prose before the JSON starts —
      // letting depth go negative would desync the balance scan.
      if (depth === 0) continue;
      depth--;
      if (depth === 0 && start >= 0) {
        try { return JSON.parse(cleaned.slice(start, i + 1)); } catch { start = -1; }
      }
    }
  }
  return null;
}
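A quick sanity sketch of the tolerance extractJson provides; the messy string below is an invented example of a model ignoring "strict JSON only":

// Hypothetical messy output: prose preamble + fenced JSON (invented).
const messy =
  'Sure, here is my verdict:\n' +
  '```json\n' +
  '{"claim_verdicts": [{"claim_idx": 0, "backed": false, "evidence": "types only, no consumer"}], "unflagged_gaps": []}\n' +
  '```';
const obj = extractJson(messy);
// obj?.claim_verdicts[0].backed === false — even if the fence-stripping
// regexes miss, the balanced-brace scan still lifts the object.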
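Neither check wires itself into a runner yet (the verdict layer is task #8), so purely for illustration, a hypothetical composition — names invented, not part of this commit:

// Hypothetical aggregator — NOT part of this commit.
import { runDynamicCheck } from "./checks/dynamic.ts";
import { runInferenceCheck } from "./checks/inference.ts";
import type { Claim, Finding } from "./types.ts";

async function auditPr(claims: Claim[], diff: string): Promise<Finding[]> {
  // The checks are independent, so run them concurrently. Both are
  // designed to soft-fail internally rather than throw on a down service.
  const [dynamic, inference] = await Promise.all([
    runDynamicCheck(),
    runInferenceCheck(claims, diff),
  ]);
  return [...dynamic, ...inference];
}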