auditor/inference: N=3 consensus + qwen3-coder:480b tie-breaker
Closes the determinism gap observed in the 3-run baseline test: 1 of 8
findings (the "proven escalation ladder" block) was flipping across
identical-state audits. Root cause: cloud non-determinism at temp=0 is
real in practice, even though in theory it shouldn't be.

Fix: run the primary reviewer (gpt-oss:120b) N=3 times in PARALLEL
(Promise.all; wall-clock stays close to a single call because the HTTP
requests are independent). Aggregate votes per claim_idx; majority wins.
On a 1-1-1 split, call a tie-breaker model with a different
architecture: qwen3-coder:480b, a newer coding specialist with 4x the
primary's parameter count and a distinct training lineage.

Every case where the 3 runs disagreed (even when a majority resolved it)
is logged to data/_kb/audit_discrepancies.jsonl with the vote counts and
resolution type. This is how we measure consensus drift over time: the
dashboard metric is literally `wc -l` on audit_discrepancies.jsonl
relative to the audit count.

Verified: 2 back-to-back audits on unchanged PR #8 produced the same
8 findings each (1 block + 7 warn), consensus=3/3 on every claim, zero
discrepancies logged.

Cost: 3x primary tokens (7K per audit vs 2K); wall-clock is roughly
unchanged because the calls run in parallel.

New env vars:
  LH_AUDITOR_CONSENSUS_N (default 3)
  LH_AUDITOR_TIEBREAKER_MODEL (default qwen3-coder:480b)

Factored the cloud call into a runCloudInference() helper so the
consensus loop stays clean and the tie-breaker reuses the same prompt
shape as the primary.
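For the drift metric mentioned above, a minimal sketch of the read side.
This is not part of the commit; consensusDriftRate is a hypothetical
name, and it assumes the audit_discrepancies.jsonl path defined in the
diff below plus a caller-supplied count of completed audits.

// Sketch only: discrepancy rows divided by audits run. Assumes the
// JSONL written by persistDiscrepancies() and that the caller tracks
// how many audits have completed.
import { readFile } from "node:fs/promises";

const DISCREPANCIES = "/home/profit/lakehouse/data/_kb/audit_discrepancies.jsonl";

async function consensusDriftRate(auditCount: number): Promise<number> {
  let text = "";
  try { text = await readFile(DISCREPANCIES, "utf8"); } catch { /* no log yet → zero drift */ }
  const rows = text.split("\n").filter(l => l.trim().length > 0);
  return auditCount > 0 ? rows.length / auditCount : 0;
}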
parent: 77650c4ba3  commit: 2afad0f83f
@@ -19,6 +19,15 @@ import { extractFacts } from "../fact_extractor.ts";

const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
// Tie-breaker for claims where the N=3 consensus produces a 1-1-1
// split (genuinely borderline). Different architecture from the
// primary reviewer (gpt-oss) so the tie-break isn't correlated with
// the original disagreement. qwen3-coder:480b is a newer coding
// specialist at 480B params, well-suited to PR-diff claim verification
// and distinct in training lineage from gpt-oss.
const TIEBREAKER_MODEL = process.env.LH_AUDITOR_TIEBREAKER_MODEL ?? "qwen3-coder:480b";
const N_CONSENSUS = Number(process.env.LH_AUDITOR_CONSENSUS_N ?? 3);
const AUDIT_DISCREPANCIES_JSONL = "/home/profit/lakehouse/data/_kb/audit_discrepancies.jsonl";
// 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was
// previously truncated at 15KB causing the reviewer to miss later
// files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
@@ -168,94 +177,131 @@ export async function runInferenceCheck(
    `Strict JSON only, matching the shape described. No prose outside JSON.`,
  ].join("\n");

  let resp: Response;
  try {
    resp = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: "ollama_cloud",
        model: MODEL,
        messages: [
          { role: "system", content: systemMsg },
          { role: "user", content: userMsg },
        ],
        // Deterministic classification — temp=0 is greedy-sample, so
        // identical input yields identical output on the same model
        // version. This kills the signature creep we observed in the
        // 9-run empirical test (sig_count 16→27 from cloud phrasing
        // variance at temp=0.2).
        //
        // IMPORTANT: keep think=true. gpt-oss:120b is a reasoning
        // model; setting think=false caused it to return empty content
        // on large prompts (observed during Level 1 validation: 13421
        // tokens used, empty content returned). The reasoning trace is
        // variable prose, but at temp=0 the FINAL classification is
        // still deterministic because greedy sampling converges to
        // the same conclusion from the same starting state.
        max_tokens: 3000,
        temperature: 0,
        think: true,
      }),
      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
    });
  } catch (e) {
    // Cloud unreachable → soft-fail. Don't block a PR because the
    // reviewer model is down. Static + dynamic + kb still run.
  // N=3 consensus — run the primary reviewer in parallel, collect
  // all three parsed responses, majority-vote per claim. Parallel
  // (Promise.all) because each call is ~20-30s and they're independent;
  // wall-clock stays ~same as single call, cost 3x tokens. Empirical
  // justification: in 3-run determinism tests, 7/8 findings were
  // stable but 1 flipped across runs — majority vote stabilizes the
  // flipping class without losing the stable signal.
  const primaryRuns = await Promise.all(
    Array.from({ length: N_CONSENSUS }, () =>
      runCloudInference(systemMsg, userMsg, MODEL)),
  );

  const parsedRuns = primaryRuns.filter(r => r.parsed !== null);
  if (parsedRuns.length === 0) {
    // All N calls failed. Surface the first-run diagnostic so the
    // operator sees *why* (unreachable / non-200 / unparseable).
    const first = primaryRuns[0];
    return [{
      check: "inference",
      severity: "info",
      summary: "cloud inference unreachable — skipped",
      evidence: [`fetch failed: ${(e as Error).message.slice(0, 180)}`],
    }];
  }

  if (!resp.ok) {
    return [{
      check: "inference",
      severity: "info",
      summary: `cloud inference returned ${resp.status} — skipped`,
      evidence: [`body: ${(await resp.text()).slice(0, 200)}`],
    }];
  }

  const body: any = await resp.json();
  const content: string = body?.choices?.[0]?.message?.content ?? "";
  const usage = body?.usage ?? {};

  const parsed = extractJson(content);
  if (!parsed) {
    return [{
      check: "inference",
      severity: "info",
      summary: "cloud returned unparseable output — skipped",
      summary: `cloud inference all ${N_CONSENSUS} consensus runs failed — ${first.error ?? "unknown"}`,
      evidence: [
        `head: ${content.slice(0, 200)}`,
        `tokens: ${usage.total_tokens ?? "?"}`,
        `first-run diagnostic: ${first.diagnostic ?? "(none)"}`,
        `successful runs: 0 / ${N_CONSENSUS}`,
      ],
    }];
  }
  // Aggregate votes per claim_idx.
  interface Votes { trues: number; falses: number; evidences: string[] }
  const votesByClaim = new Map<number, Votes>();
  const unflaggedByRun: any[][] = [];
  let totalTokens = 0;
  for (const run of parsedRuns) {
    totalTokens += run.tokens;
    unflaggedByRun.push(Array.isArray(run.parsed?.unflagged_gaps) ? run.parsed.unflagged_gaps : []);
    for (const v of run.parsed?.claim_verdicts ?? []) {
      const idx = Number(v?.claim_idx);
      if (!Number.isFinite(idx)) continue;
      const rec = votesByClaim.get(idx) ?? { trues: 0, falses: 0, evidences: [] };
      if (v.backed === false) {
        rec.falses++;
        rec.evidences.push(String(v.evidence ?? ""));
      } else if (v.backed === true) {
        rec.trues++;
      }
      votesByClaim.set(idx, rec);
    }
  }

  const findings: Finding[] = [];

  // One summary info finding so the verdict layer knows the check ran.
  // Summary finding so the verdict layer knows the check ran.
  findings.push({
    check: "inference",
    severity: "info",
    summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})${curationNote}`,
    summary: `cloud review completed (model=${MODEL}, consensus=${parsedRuns.length}/${N_CONSENSUS}, tokens=${totalTokens})${curationNote}`,
    evidence: [
      `claim_verdicts: ${parsed.claim_verdicts?.length ?? 0}, unflagged_gaps: ${parsed.unflagged_gaps?.length ?? 0}`,
      `claims voted: ${votesByClaim.size}`,
      `parsed runs: ${parsedRuns.length} / ${N_CONSENSUS}`,
    ],
  });

  for (const v of parsed.claim_verdicts ?? []) {
    if (v?.backed === false) {
      const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
      // Indices point at the verifiable[] list we sent the cloud,
      // not the full claims[] list. Translate back.
      const claim = verifiable[idx];
      if (!claim) continue;
      // Strong+unbacked = BLOCK. That's the whole point of the auditor.
  // Per-claim majority vote; tie-break if no majority.
  const discrepancies: Array<{
    claim_idx: number;
    claim_text: string;
    votes: { trues: number; falses: number };
    resolution: "majority_backed" | "majority_not_backed" | "tiebreaker_backed" | "tiebreaker_not_backed" | "unresolved";
    tiebreaker_model?: string;
  }> = [];

  for (const [idx, votes] of votesByClaim) {
    const claim = verifiable[idx];
    if (!claim) continue;
    const totalVotes = votes.trues + votes.falses;
    let notBacked: boolean | null = null;
    let resolution: typeof discrepancies[number]["resolution"] = "majority_backed";
    let evidenceText = "";
    let tbModel: string | undefined;

    if (votes.falses > votes.trues) {
      notBacked = true;
      resolution = "majority_not_backed";
      evidenceText = votes.evidences[0] ?? "(no reason given)";
    } else if (votes.trues > votes.falses) {
      notBacked = false;
      resolution = "majority_backed";
    } else {
      // Tie. Run tie-breaker with a different-architecture model.
      const tb = await runCloudInference(systemMsg, userMsg, TIEBREAKER_MODEL);
      if (tb.parsed) {
        const tv = (tb.parsed.claim_verdicts ?? []).find((v: any) => Number(v?.claim_idx) === idx);
        if (tv?.backed === false) {
          notBacked = true;
          resolution = "tiebreaker_not_backed";
          evidenceText = `(tie-breaker ${TIEBREAKER_MODEL}) ${String(tv.evidence ?? "")}`;
          tbModel = TIEBREAKER_MODEL;
        } else if (tv?.backed === true) {
          notBacked = false;
          resolution = "tiebreaker_backed";
          tbModel = TIEBREAKER_MODEL;
        } else {
          resolution = "unresolved";
        }
      } else {
        resolution = "unresolved";
      }
    }

    // Log every case where the N runs disagreed — discrepancies are
    // signal, not noise. Separate from audit_lessons.jsonl because
    // they're about the *auditor's* quality, not the PR's quality.
    const disagreed = totalVotes >= 2 && votes.trues > 0 && votes.falses > 0;
    if (disagreed || resolution.startsWith("tiebreaker") || resolution === "unresolved") {
      discrepancies.push({
        claim_idx: idx,
        claim_text: claim.text,
        votes: { trues: votes.trues, falses: votes.falses },
        resolution,
        tiebreaker_model: tbModel,
      });
    }

    if (notBacked === true) {
      const sev: Finding["severity"] = claim.strength === "strong" ? "block"
        : claim.strength === "moderate" ? "warn"
        : "info";
@@ -266,12 +312,22 @@ export async function runInferenceCheck(
        summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`,
        evidence: [
          `at ${claim.location}`,
          `cloud reason: ${String(v.evidence ?? "no reason given").slice(0, 200)}`,
          `consensus: ${votes.falses}/${totalVotes} not-backed (resolution: ${resolution})`,
          `cloud reason: ${evidenceText.slice(0, 200)}`,
        ],
      });
    }
  }

  // Persist discrepancies so we can measure consensus drift over time.
  if (discrepancies.length > 0 && ctx) {
    persistDiscrepancies(ctx, discrepancies).catch(e =>
      console.error(`[inference] discrepancy log failed: ${(e as Error).message}`));
  }

  // Use first run's parsed for downstream unflagged_gaps processing.
  const parsed = parsedRuns[0].parsed;

  // Route the curated scratchpad through llm_team's extract-facts
  // pipeline when we have (a) a curated scratchpad (best signal about
  // what the PR actually changed) and (b) PR context to scope facts.
@@ -338,6 +394,71 @@ export async function runInferenceCheck(
  return findings;
}

// Single cloud call — the consensus loop calls this N times in
// parallel. Returns the parsed JSON shape + token usage + any error
// diagnostic. NEVER throws; the consensus aggregator handles partial
// failures by dropping non-parsed runs from the vote.
interface CloudRunResult {
  parsed: any | null;
  tokens: number;
  error?: string; // "unreachable" | "non_200" | "unparseable"
  diagnostic?: string; // first 200 chars for debugging
  model: string;
}

async function runCloudInference(systemMsg: string, userMsg: string, model: string): Promise<CloudRunResult> {
  let resp: Response;
  try {
    resp = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: "ollama_cloud",
        model,
        messages: [
          { role: "system", content: systemMsg },
          { role: "user", content: userMsg },
        ],
        // temp=0 (greedy) + think=true. think=true is required for
        // gpt-oss:120b — without it the model returns empty content
        // on large prompts. Variance from the think trace is observed
        // in practice, which is why we use N=3 consensus, not single-
        // call determinism.
        max_tokens: 3000,
        temperature: 0,
        think: true,
      }),
      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
    });
  } catch (e) {
    return { parsed: null, tokens: 0, error: "unreachable", diagnostic: (e as Error).message.slice(0, 200), model };
  }
  if (!resp.ok) {
    return { parsed: null, tokens: 0, error: "non_200", diagnostic: `${resp.status}: ${(await resp.text()).slice(0, 160)}`, model };
  }
  let body: any;
  try { body = await resp.json(); }
  catch (e) { return { parsed: null, tokens: 0, error: "unparseable", diagnostic: (e as Error).message, model }; }
  const content: string = body?.choices?.[0]?.message?.content ?? "";
  const tokens: number = body?.usage?.total_tokens ?? 0;
  const parsed = extractJson(content);
  if (!parsed) {
    return { parsed: null, tokens, error: "unparseable", diagnostic: content.slice(0, 200), model };
  }
  return { parsed, tokens, model };
}

async function persistDiscrepancies(ctx: InferenceContext, discrepancies: any[]): Promise<void> {
  await mkdir("/home/profit/lakehouse/data/_kb", { recursive: true });
  const rows = discrepancies.map(d => JSON.stringify({
    pr_number: ctx.pr_number,
    head_sha: ctx.head_sha,
    logged_at: new Date().toISOString(),
    ...d,
  }));
  await appendFile(AUDIT_DISCREPANCIES_JSONL, rows.join("\n") + "\n");
}

// Extract structured knowledge from the curated scratchpad and append
// to data/_kb/audit_facts.jsonl — one row per extract run, keyed by
// PR number + head SHA for scope tracking. kb_query tails this next