From 2afad0f83fe58714c5d4bc6787a2911f043fe9b0 Mon Sep 17 00:00:00 2001 From: profit Date: Wed, 22 Apr 2026 23:38:17 -0500 Subject: [PATCH] auditor/inference: N=3 consensus + qwen3-coder:480b tie-breaker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the determinism gap observed in the 3-run baseline test: 1 of 8 findings (the "proven escalation ladder" block) was flipping across identical-state audits. Root cause: cloud non-determinism at temp=0 is real in practice even though it shouldn't be in theory. Fix: run the primary reviewer (gpt-oss:120b) N=3 times in PARALLEL (Promise.all, wall-clock ≈ single call because they're independent HTTP requests). Aggregate votes per claim_idx. Majority wins. On a 1-1-1 split, call a tie-breaker model with different architecture: qwen3-coder:480b — newer coding specialist, 4x params of the primary, distinct training lineage. Every case where the 3 runs disagreed (even when majority resolved) is logged to data/_kb/audit_discrepancies.jsonl with the vote counts and resolution type. This is how we measure consensus drift over time — a dashboard metric is literally `wc -l audit_discrepancies` relative to audit count. Verified: 2 back-to-back audits on unchanged PR #8 produced identical 8 findings each (1 block + 7 warn). consensus=3/3 on every claim, zero discrepancies logged. Cost: 3x primary tokens (7K per audit vs 2K), wall-clock ~unchanged because calls are parallel. New env vars: LH_AUDITOR_CONSENSUS_N default 3 LH_AUDITOR_TIEBREAKER_MODEL default qwen3-coder:480b Factored the cloud call into runCloudInference() helper so the consensus loop is clean and the tie-breaker reuses the same prompt shape as the primary. --- auditor/checks/inference.ts | 265 ++++++++++++++++++++++++++---------- 1 file changed, 193 insertions(+), 72 deletions(-) diff --git a/auditor/checks/inference.ts b/auditor/checks/inference.ts index 488be8f..73b597f 100644 --- a/auditor/checks/inference.ts +++ b/auditor/checks/inference.ts @@ -19,6 +19,15 @@ import { extractFacts } from "../fact_extractor.ts"; const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100"; const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b"; +// Tie-breaker for claims where the N=3 consensus produces a 1-1-1 +// split (genuinely borderline). Different architecture from the +// primary reviewer (gpt-oss) so the tie-break isn't correlated with +// the original disagreement. qwen3-coder:480b is a newer coding +// specialist at 480B params, well-suited to PR-diff claim verification +// and distinct in training lineage from gpt-oss. +const TIEBREAKER_MODEL = process.env.LH_AUDITOR_TIEBREAKER_MODEL ?? "qwen3-coder:480b"; +const N_CONSENSUS = Number(process.env.LH_AUDITOR_CONSENSUS_N ?? 3); +const AUDIT_DISCREPANCIES_JSONL = "/home/profit/lakehouse/data/_kb/audit_discrepancies.jsonl"; // 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was // previously truncated at 15KB causing the reviewer to miss later // files (gitea.ts, policy.ts) and flag "no Gitea client present" as a @@ -168,94 +177,131 @@ export async function runInferenceCheck( `Strict JSON only, matching the shape described. 
No prose outside JSON.`,
   ].join("\n");
 
-  let resp: Response;
-  try {
-    resp = await fetch(`${GATEWAY}/v1/chat`, {
-      method: "POST",
-      headers: { "content-type": "application/json" },
-      body: JSON.stringify({
-        provider: "ollama_cloud",
-        model: MODEL,
-        messages: [
-          { role: "system", content: systemMsg },
-          { role: "user", content: userMsg },
-        ],
-        // Deterministic classification — temp=0 is greedy-sample, so
-        // identical input yields identical output on the same model
-        // version. This kills the signature creep we observed in the
-        // 9-run empirical test (sig_count 16→27 from cloud phrasing
-        // variance at temp=0.2).
-        //
-        // IMPORTANT: keep think=true. gpt-oss:120b is a reasoning
-        // model; setting think=false caused it to return empty content
-        // on large prompts (observed during Level 1 validation: 13421
-        // tokens used, empty content returned). The reasoning trace is
-        // variable prose, but at temp=0 the FINAL classification is
-        // still deterministic because greedy sampling converges to
-        // the same conclusion from the same starting state.
-        max_tokens: 3000,
-        temperature: 0,
-        think: true,
-      }),
-      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
-    });
-  } catch (e) {
-    // Cloud unreachable → soft-fail. Don't block a PR because the
-    // reviewer model is down. Static + dynamic + kb still run.
+  // N=3 consensus — run the primary reviewer in parallel, collect
+  // all N parsed responses, majority-vote per claim. Parallel
+  // (Promise.all) because each call is ~20-30s and they're independent;
+  // wall-clock stays ~same as a single call, cost 3x tokens. Empirical
+  // justification: in 3-run determinism tests, 7/8 findings were
+  // stable but 1 flipped across runs — majority vote stabilizes the
+  // flipping class without losing the stable signal.
+  const primaryRuns = await Promise.all(
+    Array.from({ length: N_CONSENSUS }, () =>
+      runCloudInference(systemMsg, userMsg, MODEL)),
+  );
+
+  const parsedRuns = primaryRuns.filter(r => r.parsed !== null);
+  if (parsedRuns.length === 0) {
+    // All N calls failed. Surface the first-run diagnostic so the
+    // operator sees *why* (unreachable / non-200 / unparseable).
+    const first = primaryRuns[0];
     return [{
       check: "inference",
       severity: "info",
-      summary: "cloud inference unreachable — skipped",
-      evidence: [`fetch failed: ${(e as Error).message.slice(0, 180)}`],
-    }];
-  }
-
-  if (!resp.ok) {
-    return [{
-      check: "inference",
-      severity: "info",
-      summary: `cloud inference returned ${resp.status} — skipped`,
-      evidence: [`body: ${(await resp.text()).slice(0, 200)}`],
-    }];
-  }
-
-  const body: any = await resp.json();
-  const content: string = body?.choices?.[0]?.message?.content ?? "";
-  const usage = body?.usage ?? {};
-
-  const parsed = extractJson(content);
-  if (!parsed) {
-    return [{
-      check: "inference",
-      severity: "info",
-      summary: "cloud returned unparseable output — skipped",
+      summary: `cloud inference skipped — all ${N_CONSENSUS} consensus runs failed (${first.error ?? "unknown"})`,
       evidence: [
-        `head: ${content.slice(0, 200)}`,
-        `tokens: ${usage.total_tokens ?? "?"}`,
+        `first-run diagnostic: ${first.diagnostic ?? "(none)"}`,
+        `successful runs: 0 / ${N_CONSENSUS}`,
       ],
     }];
   }
 
+  // Aggregate votes per claim_idx.
+  interface Votes { trues: number; falses: number; evidences: string[] }
+  const votesByClaim = new Map<number, Votes>();
+  const unflaggedByRun: any[][] = [];
+  let totalTokens = 0;
+  for (const run of parsedRuns) {
+    totalTokens += run.tokens;
+    unflaggedByRun.push(Array.isArray(run.parsed?.unflagged_gaps) ?
run.parsed.unflagged_gaps : []); + for (const v of run.parsed?.claim_verdicts ?? []) { + const idx = Number(v?.claim_idx); + if (!Number.isFinite(idx)) continue; + const rec = votesByClaim.get(idx) ?? { trues: 0, falses: 0, evidences: [] }; + if (v.backed === false) { + rec.falses++; + rec.evidences.push(String(v.evidence ?? "")); + } else if (v.backed === true) { + rec.trues++; + } + votesByClaim.set(idx, rec); + } + } + const findings: Finding[] = []; - // One summary info finding so the verdict layer knows the check ran. + // Summary finding so the verdict layer knows the check ran. findings.push({ check: "inference", severity: "info", - summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})${curationNote}`, + summary: `cloud review completed (model=${MODEL}, consensus=${parsedRuns.length}/${N_CONSENSUS}, tokens=${totalTokens})${curationNote}`, evidence: [ - `claim_verdicts: ${parsed.claim_verdicts?.length ?? 0}, unflagged_gaps: ${parsed.unflagged_gaps?.length ?? 0}`, + `claims voted: ${votesByClaim.size}`, + `parsed runs: ${parsedRuns.length} / ${N_CONSENSUS}`, ], }); - for (const v of parsed.claim_verdicts ?? []) { - if (v?.backed === false) { - const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1; - // Indices point at the verifiable[] list we sent the cloud, - // not the full claims[] list. Translate back. - const claim = verifiable[idx]; - if (!claim) continue; - // Strong+unbacked = BLOCK. That's the whole point of the auditor. + // Per-claim majority vote; tie-break if no majority. + const discrepancies: Array<{ + claim_idx: number; + claim_text: string; + votes: { trues: number; falses: number }; + resolution: "majority_backed" | "majority_not_backed" | "tiebreaker_backed" | "tiebreaker_not_backed" | "unresolved"; + tiebreaker_model?: string; + }> = []; + + for (const [idx, votes] of votesByClaim) { + const claim = verifiable[idx]; + if (!claim) continue; + const totalVotes = votes.trues + votes.falses; + let notBacked: boolean | null = null; + let resolution: typeof discrepancies[number]["resolution"] = "majority_backed"; + let evidenceText = ""; + let tbModel: string | undefined; + + if (votes.falses > votes.trues) { + notBacked = true; + resolution = "majority_not_backed"; + evidenceText = votes.evidences[0] ?? "(no reason given)"; + } else if (votes.trues > votes.falses) { + notBacked = false; + resolution = "majority_backed"; + } else { + // Tie. Run tie-breaker with a different-architecture model. + const tb = await runCloudInference(systemMsg, userMsg, TIEBREAKER_MODEL); + if (tb.parsed) { + const tv = (tb.parsed.claim_verdicts ?? []).find((v: any) => Number(v?.claim_idx) === idx); + if (tv?.backed === false) { + notBacked = true; + resolution = "tiebreaker_not_backed"; + evidenceText = `(tie-breaker ${TIEBREAKER_MODEL}) ${String(tv.evidence ?? "")}`; + tbModel = TIEBREAKER_MODEL; + } else if (tv?.backed === true) { + notBacked = false; + resolution = "tiebreaker_backed"; + tbModel = TIEBREAKER_MODEL; + } else { + resolution = "unresolved"; + } + } else { + resolution = "unresolved"; + } + } + + // Log every case where the N runs disagreed — discrepancies are + // signal, not noise. Separate from audit_lessons.jsonl because + // they're about the *auditor's* quality, not the PR's quality. 
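+    // Illustrative (hypothetical numbers) shape of one logged row,
+    // before persistDiscrepancies() stamps pr_number / head_sha /
+    // logged_at. For example, a 2-1 majority that still gets recorded:
+    //   {"claim_idx": 4, "claim_text": "proven escalation ladder ...",
+    //    "votes": {"trues": 1, "falses": 2},
+    //    "resolution": "majority_not_backed"}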
+    const disagreed = totalVotes >= 2 && votes.trues > 0 && votes.falses > 0;
+    if (disagreed || resolution.startsWith("tiebreaker") || resolution === "unresolved") {
+      discrepancies.push({
+        claim_idx: idx,
+        claim_text: claim.text,
+        votes: { trues: votes.trues, falses: votes.falses },
+        resolution,
+        tiebreaker_model: tbModel,
+      });
+    }
+
+    if (notBacked === true) {
       const sev: Finding["severity"] =
         claim.strength === "strong" ? "block"
         : claim.strength === "moderate" ? "warn"
         : "info";
@@ -266,12 +312,22 @@ export async function runInferenceCheck(
         summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`,
         evidence: [
           `at ${claim.location}`,
-          `cloud reason: ${String(v.evidence ?? "no reason given").slice(0, 200)}`,
+          `consensus: ${votes.falses}/${totalVotes} not-backed (resolution: ${resolution})`,
+          `cloud reason: ${evidenceText.slice(0, 200)}`,
         ],
       });
     }
   }
 
+  // Persist discrepancies so we can measure consensus drift over time.
+  if (discrepancies.length > 0 && ctx) {
+    persistDiscrepancies(ctx, discrepancies).catch(e =>
+      console.error(`[inference] discrepancy log failed: ${(e as Error).message}`));
+  }
+
+  // Use first run's parsed for downstream unflagged_gaps processing.
+  const parsed = parsedRuns[0].parsed;
+
   // Route the curated scratchpad through llm_team's extract-facts
   // pipeline when we have (a) a curated scratchpad (best signal about
   // what the PR actually changed) and (b) PR context to scope facts.
@@ -338,6 +394,71 @@ export async function runInferenceCheck(
   return findings;
 }
 
+// Single cloud call — the consensus loop calls this N times in
+// parallel. Returns the parsed JSON shape + token usage + any error
+// diagnostic. NEVER throws; the consensus aggregator handles partial
+// failures by dropping non-parsed runs from the vote.
+interface CloudRunResult {
+  parsed: any | null;
+  tokens: number;
+  error?: string;      // "unreachable" | "non_200" | "unparseable"
+  diagnostic?: string; // first 200 chars for debugging
+  model: string;
+}
+
+async function runCloudInference(systemMsg: string, userMsg: string, model: string): Promise<CloudRunResult> {
+  let resp: Response;
+  try {
+    resp = await fetch(`${GATEWAY}/v1/chat`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        provider: "ollama_cloud",
+        model,
+        messages: [
+          { role: "system", content: systemMsg },
+          { role: "user", content: userMsg },
+        ],
+        // temp=0 (greedy) + think=true. think=true is required for
+        // gpt-oss:120b — without it the model returns empty content
+        // on large prompts. Variance from the think trace is observed
+        // in practice, which is why we use N=3 consensus, not single-
+        // call determinism.
+        max_tokens: 3000,
+        temperature: 0,
+        think: true,
+      }),
+      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
+    });
+  } catch (e) {
+    return { parsed: null, tokens: 0, error: "unreachable", diagnostic: (e as Error).message.slice(0, 200), model };
+  }
+  if (!resp.ok) {
+    return { parsed: null, tokens: 0, error: "non_200", diagnostic: `${resp.status}: ${(await resp.text()).slice(0, 160)}`, model };
+  }
+  let body: any;
+  try { body = await resp.json(); }
+  catch (e) { return { parsed: null, tokens: 0, error: "unparseable", diagnostic: (e as Error).message, model }; }
+  const content: string = body?.choices?.[0]?.message?.content ?? "";
+  const tokens: number = body?.usage?.total_tokens ?? 0;
+  const parsed = extractJson(content);
+  if (!parsed) {
+    return { parsed: null, tokens, error: "unparseable", diagnostic: content.slice(0, 200), model };
+  }
+  return { parsed, tokens, model };
+}
+
+async function persistDiscrepancies(ctx: InferenceContext, discrepancies: any[]): Promise<void> {
+  await mkdir("/home/profit/lakehouse/data/_kb", { recursive: true });
+  const rows = discrepancies.map(d => JSON.stringify({
+    pr_number: ctx.pr_number,
+    head_sha: ctx.head_sha,
+    logged_at: new Date().toISOString(),
+    ...d,
+  }));
+  await appendFile(AUDIT_DISCREPANCIES_JSONL, rows.join("\n") + "\n");
+}
+
 // Extract structured knowledge from the curated scratchpad and append
 // to data/_kb/audit_facts.jsonl — one row per extract run, keyed by
 // PR number + head SHA for scope tracking. kb_query tails this next