auditor: fix two false-positive classes from cloud inference
Some checks failed
lakehouse/auditor 1 blocking issue: cloud: claim not backed — "the proven escalation ladder with learning context, collects"

Observed on PR #8 audit (de11ac4): 7 warn findings, all from the
cloud inference check. Investigation showed two distinct bug classes
that weren't "ship bad code", they were "auditor misreads the diff":

1. Cloud flagged "X not defined in this diff / missing implementation"
   for symbols like `tailJsonl` and `stubFinding` that ARE defined —
   just not in the added lines of this diff. Fix: extract candidate
   symbols from the cloud's gap summary, grep the repo for their
   definitions (function/const/let/def/class/struct/enum/trait/fn).
   If every named symbol resolves, drop the finding; if some do,
   demote to info with the resolution in evidence.

2. Cloud flagged runtime metrics like "58 cloud calls, 306s
   end-to-end" as unbacked claims. These are empirical outputs
   from running the test, not things a static diff can prove.
   Fix: claim_parser now has an `empirical` strength class
   matching iteration counts, cloud-call counts, duration metrics,
   attempt counts, tier-count phrases. Inference drops empirical
   claims from its cloud prompt (verifiable[] subset only) and
   claim-index mapping uses verifiable[] so cloud responses still
   line up.

Added `claims_empirical` to audit metrics so the verdict is
introspectable: how many claims WERE runtime-only vs how many
are diff-verifiable?

Verified: unit tests confirm empirical classification on 5
sample commit messages; symbol resolver found both false-positive
symbols (tailJsonl + stubFinding) and correctly skipped a known-
fake symbol.
This commit is contained in:
profit 2026-04-22 21:40:03 -05:00
parent de11ac4018
commit f4be27a879
4 changed files with 159 additions and 7 deletions

View File

@ -77,6 +77,7 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
claims_strong: claims.filter(c => c.strength === "strong").length,
claims_moderate: claims.filter(c => c.strength === "moderate").length,
claims_weak: claims.filter(c => c.strength === "weak").length,
claims_empirical: claims.filter(c => c.strength === "empirical").length,
claims_total: claims.length,
diff_bytes: diff.length,
};

View File

@ -13,6 +13,8 @@
// with a 15KB diff + claim list).
import type { Claim, Finding } from "../types.ts";
import { Glob } from "bun";
import { readFile } from "node:fs/promises";
const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
@ -22,6 +24,7 @@ const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
// block finding when the file was simply outside the truncation window.
const MAX_DIFF_CHARS = 40000;
const CALL_TIMEOUT_MS = 120_000;
const REPO_ROOT = "/home/profit/lakehouse";
export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
if (claims.length === 0) {
@ -33,6 +36,21 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
}];
}
// Empirical claims (runtime metrics / observed outcomes) can't be
// verified from the diff. Drop them from the cloud prompt so the
// reviewer doesn't chase ghosts. A future `runtime_evidence` check
// can validate these against data/_kb/*/summary.json outputs.
const verifiable = claims.filter(c => c.strength !== "empirical");
const empiricalCount = claims.length - verifiable.length;
if (verifiable.length === 0) {
return [{
check: "inference",
severity: "info",
summary: `all ${claims.length} claims are empirical (runtime metrics) — skipping cloud inference`,
evidence: [`empirical claims can't be verified from a static diff; needs runtime-evidence check`],
}];
}
const truncated = diff.length > MAX_DIFF_CHARS
? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
: diff;
@ -70,7 +88,7 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
const userMsg = [
`Ship-claims the author made (numbered 0..N-1):`,
claims.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
verifiable.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
"",
`Diff:`,
"```",
@ -152,7 +170,9 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
for (const v of parsed.claim_verdicts ?? []) {
if (v?.backed === false) {
const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
const claim = claims[idx];
// Indices point at the verifiable[] list we sent the cloud,
// not the full claims[] list. Translate back.
const claim = verifiable[idx];
if (!claim) continue;
// Strong+unbacked = BLOCK. That's the whole point of the auditor.
const sev: Finding["severity"] = claim.strength === "strong" ? "block"
@ -172,17 +192,109 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
}
for (const g of parsed.unflagged_gaps ?? []) {
const summary = String(g?.summary ?? "?");
const location = String(g?.location ?? "?");
// False-positive guard — when the cloud says "X not defined in this
// diff" or "missing implementation of X", the cloud may just mean
// "X is not in the added lines," not "X doesn't exist in the repo."
// Extract candidate symbol names and grep the repo. If any symbol
// is defined elsewhere, drop the finding — it's a known-symbol
// reference, not a placeholder.
if (/not\s+defined|missing\s+implementation|never\s+referenced\s+or\s+integrated/i.test(summary)) {
const symbols = extractSymbols(summary);
if (symbols.length > 0) {
const resolved = await symbolsExistInRepo(symbols);
if (resolved.length === symbols.length) {
// Every named symbol exists somewhere in the repo — silent drop.
continue;
}
if (resolved.length > 0) {
// Partially resolved — demote to info with a note.
findings.push({
check: "inference",
severity: "info",
summary: `cloud gap partially resolved by repo grep: ${summary.slice(0, 120)}`,
evidence: [
`location: ${location.slice(0, 140)}`,
`resolved via grep: ${resolved.join(",")}`,
`unresolved: ${symbols.filter(s => !resolved.includes(s)).join(",")}`,
],
});
continue;
}
}
}
findings.push({
check: "inference",
severity: "warn",
summary: `cloud-flagged gap not in any claim: ${String(g?.summary ?? "?").slice(0, 120)}`,
evidence: [`location: ${String(g?.location ?? "?").slice(0, 140)}`],
summary: `cloud-flagged gap not in any claim: ${summary.slice(0, 120)}`,
evidence: [`location: ${location.slice(0, 140)}`],
});
}
return findings;
}
// Pull out plausible code-symbol names from a summary string.
// Candidates come from three shapes:
//   - identifier wrapped in backticks: `foo_bar`
//   - identifier followed by parens (a call site): foo_bar()
//   - CamelCase-looking words (3+ chars, leading uppercase)
// A stopword list screens out ordinary English words that the
// capitalized-word pattern would otherwise pick up by accident.
const STOPWORDS = new Set([
  "not","the","and","for","this","that","with","but","are","was","has",
  "have","been","any","missing","implementation","diff","defined","never",
  "referenced","integrated","flow","code","file","some","only","when",
]);

function extractSymbols(text: string): string[] {
  const candidatePatterns: RegExp[] = [
    /`([A-Za-z_][A-Za-z0-9_]{2,})`/g,        // `backticked` symbols
    /\b([A-Za-z_][A-Za-z0-9_]{2,})\s*\(/g,   // foo() / foo_bar() call sites
    /\b([A-Z][A-Za-z0-9]{2,})\b/g,           // CamelCase-looking types
  ];
  // Set preserves insertion order, so dedup here keeps the same
  // backticks-first ordering a single pass per pattern produces.
  const seen = new Set<string>();
  for (const pattern of candidatePatterns) {
    for (const hit of text.matchAll(pattern)) seen.add(hit[1]);
  }
  return [...seen].filter(candidate => !STOPWORDS.has(candidate.toLowerCase()));
}
// Scan the repo for at least one definition of each symbol. Uses Bun's
// Glob to walk TS/Rust/Python/JS sources; ignores node_modules, data/,
// dist/, and target/.
//
// Returns the subset of `symbols` for which a definition-like line
// (`function sym`, `const sym`, `def sym`, `class sym`, `fn sym`, ...)
// was found somewhere under REPO_ROOT.
async function symbolsExistInRepo(symbols: string[]): Promise<string[]> {
  const patterns = ["**/*.ts", "**/*.tsx", "**/*.rs", "**/*.py", "**/*.js"];
  const skip = (p: string) =>
    p.includes("/node_modules/") || p.startsWith("data/") || p.includes("/target/") || p.startsWith("dist/");

  // Compile each symbol's definition regex ONCE up front instead of
  // rebuilding it for every (file, symbol) pair — the previous inner-loop
  // construction cost O(files x symbols) RegExp compiles per audit.
  // Definition heuristics: `function sym`, `fn sym`, `const sym`,
  // `let sym`, `def sym`, `class sym`, `struct sym`, `enum sym`,
  // `trait sym`, `async function sym`, `pub (async )?fn sym`.
  const pending = new Map<string, RegExp>();
  for (const sym of symbols) {
    pending.set(
      sym,
      new RegExp(
        `\\b(function|async\\s+function|const|let|var|def|class|struct|enum|trait|impl|type|interface|fn|pub\\s+(async\\s+)?fn)\\s+${escapeRe(sym)}\\b`
      )
    );
  }

  const resolved = new Set<string>();
  outer: for (const pat of patterns) {
    if (pending.size === 0) break;
    const glob = new Glob(pat);
    for await (const f of glob.scan({ cwd: REPO_ROOT, onlyFiles: true })) {
      // Stop reading files as soon as every symbol has resolved. The
      // previous version only checked between glob patterns, so it kept
      // reading the rest of the tree after the last symbol was found.
      if (pending.size === 0) break outer;
      if (skip(f)) continue;
      let content: string;
      try { content = await readFile(`${REPO_ROOT}/${f}`, "utf8"); } catch { continue; }
      for (const [sym, re] of pending) {
        if (re.test(content)) {
          resolved.add(sym);
          pending.delete(sym); // safe: Map iteration tolerates deleting the current entry
        }
      }
    }
  }
  return Array.from(resolved);
}
// Backslash-escape every regex metacharacter in `s` so it can be embedded
// in a RegExp source string and match literally.
function escapeRe(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, (ch) => `\\${ch}`);
}
// Lift the first balanced JSON object out of the response. Tolerates
// leading prose, code fences, and model reasoning preamble when the
// cloud model ignored "strict JSON only."

View File

@ -49,6 +49,25 @@ const WEAK_PATTERNS: RegExp[] = [
/\bprobably\b/i,
];
// Empirical claims: runtime measurements / observed outcomes that can't
// be verified from a diff (only from the actual run that produced
// them). Example: "6/6 iterations complete, 58 cloud calls, 306s
// end-to-end" — true, but only the test's own summary.json can
// confirm it. Classifying as empirical lets the inference check skip
// diff-verification and saves the ladder for falsifiable claims.
//
// A line matching ANY pattern below is classified empirical; matching
// is case-insensitive throughout.
const EMPIRICAL_PATTERNS: RegExp[] = [
  // Iteration / attempt counts: "6/6 iterations", "attempt 5", "accepted on attempt 3"
  /\b\d+\s*\/\s*\d+\s+(iterations?|attempts?|cycles?|runs?|shards?)\b/i,
  /\b(accepted|resolved|converged)\s+on\s+attempt\s+\d+\b/i,
  // Runtime metrics: "58 cloud calls", "306s end-to-end", "245s total", "5931 chars"
  // NOTE(review): the bare `N calls` form (the `cloud` qualifier is optional)
  // also matches prose like "fixed 3 calls to foo" — confirm that's acceptable
  // before widening further.
  /\b\d+\s+(cloud\s+)?calls?\b/i,
  /\b\d+\s*(ms|s|seconds?|minutes?|m)\s+(end[- ]to[- ]end|total|elapsed|duration)\b/i,
  // Char counts only when paired with an outcome verb, to avoid flagging
  // plain size mentions.
  /\b\d+\s+chars?\b.*\b(accepted|generated|produced)\b/i,
  // "escalated through N tiers", "N distinct models"
  /\bescalated\s+through\s+\d+\b/i,
  /\b\d+\s+distinct\s+(model|tier)s?\b/i,
];
export interface ParsedClaims {
claims: Claim[];
commits_scanned: number;
@ -77,8 +96,21 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
const line = lines[i];
if (line.length < 3) continue;
// Strong patterns first — if a line matches strong, it's strong,
// don't double-count as moderate.
// Empirical match wins over everything else — if a line ALSO
// contains a moderate word like "complete", we still want to
// classify it as empirical so the inference check doesn't ask
// the cloud to prove "58 cloud calls" from the diff. Order:
// empirical → strong → moderate → weak.
const empirical = firstMatch(line, EMPIRICAL_PATTERNS);
if (empirical) {
out.push({
text: line.trim().slice(0, 200),
commit_sha,
location: `${location_prefix}:${i + 1}`,
strength: "empirical",
});
continue;
}
const strong = firstMatch(line, STRONG_PATTERNS);
if (strong) {
out.push({

View File

@ -18,7 +18,14 @@ export interface Claim {
// Heuristic rating of how strong the claim is. "green+tested"
// is strong; "should work" is weak. Drives sensitivity — stronger
// claims get harder-blocked on weak evidence.
strength: "weak" | "moderate" | "strong";
//
// "empirical" is a separate class: runtime measurements like
// "N cloud calls" / "306s end-to-end" / "accepted on attempt N".
// These cannot be verified from a static diff — only from the test
// output that produced them. Inference skips diff-verification for
// empirical claims; they become info-level context unless a future
// runtime_evidence check contradicts them.
strength: "weak" | "moderate" | "strong" | "empirical";
}
export interface Finding {