From f4be27a87992cdb8f8d5731d753b7913c7145111 Mon Sep 17 00:00:00 2001 From: profit Date: Wed, 22 Apr 2026 21:40:03 -0500 Subject: [PATCH] auditor: fix two false-positive classes from cloud inference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Observed on PR #8 audit (de11ac4): 7 warn findings, all from the cloud inference check. Investigation showed two distinct bug classes that weren't "ship bad code", they were "auditor misreads the diff": 1. Cloud flagged "X not defined in this diff / missing implementation" for symbols like `tailJsonl` and `stubFinding` that ARE defined — just not in the added lines of this diff. Fix: extract candidate symbols from the cloud's gap summary, grep the repo for their definitions (function/const/let/def/class/struct/enum/trait/fn). If every named symbol resolves, drop the finding; if only some resolve, demote to info with the resolution in evidence. 2. Cloud flagged runtime metrics like "58 cloud calls, 306s end-to-end" as unbacked claims. These are empirical outputs from running the test, not things a static diff can prove. Fix: claim_parser now has an `empirical` strength class matching iteration counts, cloud-call counts, duration metrics, attempt counts, tier-count phrases. Inference drops empirical claims from its cloud prompt (verifiable[] subset only) and claim-index mapping uses verifiable[] so cloud responses still line up. Added `claims_empirical` to audit metrics so the verdict is introspectable: how many claims WERE runtime-only vs how many are diff-verifiable? Verified: unit tests confirm empirical classification on 5 sample commit messages; symbol resolver found both false-positive symbols (tailJsonl + stubFinding) and correctly skipped a known-fake symbol.
--- auditor/audit.ts | 1 + auditor/checks/inference.ts | 120 ++++++++++++++++++++++++++++++++++-- auditor/claim_parser.ts | 36 ++++++++++- auditor/types.ts | 9 ++- 4 files changed, 159 insertions(+), 7 deletions(-) diff --git a/auditor/audit.ts b/auditor/audit.ts index 8fb0aee..18bf732 100644 --- a/auditor/audit.ts +++ b/auditor/audit.ts @@ -77,6 +77,7 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise< claims_strong: claims.filter(c => c.strength === "strong").length, claims_moderate: claims.filter(c => c.strength === "moderate").length, claims_weak: claims.filter(c => c.strength === "weak").length, + claims_empirical: claims.filter(c => c.strength === "empirical").length, claims_total: claims.length, diff_bytes: diff.length, }; diff --git a/auditor/checks/inference.ts b/auditor/checks/inference.ts index 6c121ec..d24bad2 100644 --- a/auditor/checks/inference.ts +++ b/auditor/checks/inference.ts @@ -13,6 +13,8 @@ // with a 15KB diff + claim list). import type { Claim, Finding } from "../types.ts"; +import { Glob } from "bun"; +import { readFile } from "node:fs/promises"; const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100"; const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b"; @@ -22,6 +24,7 @@ const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b"; // block finding when the file was simply outside the truncation window. const MAX_DIFF_CHARS = 40000; const CALL_TIMEOUT_MS = 120_000; +const REPO_ROOT = "/home/profit/lakehouse"; export async function runInferenceCheck(claims: Claim[], diff: string): Promise { if (claims.length === 0) { @@ -33,6 +36,21 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise< }]; } + // Empirical claims (runtime metrics / observed outcomes) can't be + // verified from the diff. Drop them from the cloud prompt so the + // reviewer doesn't chase ghosts. 
A future `runtime_evidence` check + // can validate these against data/_kb/*/summary.json outputs. + const verifiable = claims.filter(c => c.strength !== "empirical"); + const empiricalCount = claims.length - verifiable.length; + if (verifiable.length === 0) { + return [{ + check: "inference", + severity: "info", + summary: `all ${claims.length} claims are empirical (runtime metrics) — skipping cloud inference`, + evidence: [`empirical claims can't be verified from a static diff; needs runtime-evidence check`], + }]; + } + const truncated = diff.length > MAX_DIFF_CHARS ? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]` : diff; @@ -70,7 +88,7 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise< const userMsg = [ `Ship-claims the author made (numbered 0..N-1):`, - claims.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"), + verifiable.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"), "", `Diff:`, "```", @@ -152,7 +170,9 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise< for (const v of parsed.claim_verdicts ?? []) { if (v?.backed === false) { const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1; - const claim = claims[idx]; + // Indices point at the verifiable[] list we sent the cloud, + // not the full claims[] list. Translate back. + const claim = verifiable[idx]; if (!claim) continue; // Strong+unbacked = BLOCK. That's the whole point of the auditor. const sev: Finding["severity"] = claim.strength === "strong" ? "block" @@ -172,17 +192,109 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise< } for (const g of parsed.unflagged_gaps ?? []) { + const summary = String(g?.summary ?? "?"); + const location = String(g?.location ?? 
"?"); + // False-positive guard — when the cloud says "X not defined in this + // diff" or "missing implementation of X", the cloud may just mean + // "X is not in the added lines," not "X doesn't exist in the repo." + // Extract candidate symbol names and grep the repo. If any symbol + // is defined elsewhere, drop the finding — it's a known-symbol + // reference, not a placeholder. + if (/not\s+defined|missing\s+implementation|never\s+referenced\s+or\s+integrated/i.test(summary)) { + const symbols = extractSymbols(summary); + if (symbols.length > 0) { + const resolved = await symbolsExistInRepo(symbols); + if (resolved.length === symbols.length) { + // Every named symbol exists somewhere in the repo — silent drop. + continue; + } + if (resolved.length > 0) { + // Partially resolved — demote to info with a note. + findings.push({ + check: "inference", + severity: "info", + summary: `cloud gap partially resolved by repo grep: ${summary.slice(0, 120)}`, + evidence: [ + `location: ${location.slice(0, 140)}`, + `resolved via grep: ${resolved.join(",")}`, + `unresolved: ${symbols.filter(s => !resolved.includes(s)).join(",")}`, + ], + }); + continue; + } + } + } findings.push({ check: "inference", severity: "warn", - summary: `cloud-flagged gap not in any claim: ${String(g?.summary ?? "?").slice(0, 120)}`, - evidence: [`location: ${String(g?.location ?? "?").slice(0, 140)}`], + summary: `cloud-flagged gap not in any claim: ${summary.slice(0, 120)}`, + evidence: [`location: ${location.slice(0, 140)}`], }); } return findings; } +// Pull out plausible code-symbol names from a summary string. +// Matches: +// - identifier with backticks: `foo_bar` +// - identifier followed by parens: foo_bar() +// - CamelCase types +// - snake_case_functions +// Filters out common English words that could be matched accidentally. 
+const STOPWORDS = new Set([ + "not","the","and","for","this","that","with","but","are","was","has", + "have","been","any","missing","implementation","diff","defined","never", + "referenced","integrated","flow","code","file","some","only","when", +]); +function extractSymbols(text: string): string[] { + const out = new Set(); + // `backticked` symbols + for (const m of text.matchAll(/`([A-Za-z_][A-Za-z0-9_]{2,})`/g)) out.add(m[1]); + // foo() or foo_bar() calls + for (const m of text.matchAll(/\b([A-Za-z_][A-Za-z0-9_]{2,})\s*\(/g)) out.add(m[1]); + // CamelCase types (3+ chars, must start with uppercase) + for (const m of text.matchAll(/\b([A-Z][A-Za-z0-9]{2,})\b/g)) out.add(m[1]); + return Array.from(out).filter(s => !STOPWORDS.has(s.toLowerCase())); +} + +// Scan the repo for at least one definition of each symbol. Uses Bun's +// Glob to walk TS/Rust/Python/JS sources; ignores node_modules, data/, +// and target/. +async function symbolsExistInRepo(symbols: string[]): Promise { + const patterns = ["**/*.ts", "**/*.tsx", "**/*.rs", "**/*.py", "**/*.js"]; + const skip = (p: string) => p.includes("/node_modules/") || p.startsWith("data/") || p.includes("/target/") || p.startsWith("dist/"); + const resolved = new Set(); + const toFind = new Set(symbols); + for (const pat of patterns) { + if (toFind.size === 0) break; + const glob = new Glob(pat); + for await (const f of glob.scan({ cwd: REPO_ROOT, onlyFiles: true })) { + if (skip(f)) continue; + let content: string; + try { content = await readFile(`${REPO_ROOT}/${f}`, "utf8"); } catch { continue; } + for (const sym of Array.from(toFind)) { + // Definition heuristics: `function sym`, `fn sym`, `const sym`, + // `let sym`, `def sym`, `class sym`, `struct sym`, `enum sym`, + // `trait sym`, `async function sym`, `pub (async )?fn sym`. 
+ const re = new RegExp( + `\\b(function|async\\s+function|const|let|var|def|class|struct|enum|trait|impl|type|interface|fn|pub\\s+(async\\s+)?fn)\\s+${escapeRe(sym)}\\b` + ); + if (re.test(content)) { + resolved.add(sym); + toFind.delete(sym); + if (toFind.size === 0) break; + } + } + } + } + return Array.from(resolved); +} + +function escapeRe(s: string): string { + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); +} + // Lift the first balanced JSON object out of the response. Tolerates // leading prose, code fences, and model reasoning preamble when the // cloud model ignored "strict JSON only." diff --git a/auditor/claim_parser.ts b/auditor/claim_parser.ts index 46b65a7..0b00663 100644 --- a/auditor/claim_parser.ts +++ b/auditor/claim_parser.ts @@ -49,6 +49,25 @@ const WEAK_PATTERNS: RegExp[] = [ /\bprobably\b/i, ]; +// Empirical claims: runtime measurements / observed outcomes that can't +// be verified from a diff (only from the actual run that produced +// them). Example: "6/6 iterations complete, 58 cloud calls, 306s +// end-to-end" — true, but only the test's own summary.json can +// confirm it. Classifying as empirical lets the inference check skip +// diff-verification and saves the ladder for falsifiable claims. 
+const EMPIRICAL_PATTERNS: RegExp[] = [ + // Iteration / attempt counts: "6/6 iterations", "attempt 5", "accepted on attempt 3" + /\b\d+\s*\/\s*\d+\s+(iterations?|attempts?|cycles?|runs?|shards?)\b/i, + /\b(accepted|resolved|converged)\s+on\s+attempt\s+\d+\b/i, + // Runtime metrics: "58 cloud calls", "306s end-to-end", "245s total", "5931 chars" + /\b\d+\s+(cloud\s+)?calls?\b/i, + /\b\d+\s*(ms|s|seconds?|minutes?|m)\s+(end[- ]to[- ]end|total|elapsed|duration)\b/i, + /\b\d+\s+chars?\b.*\b(accepted|generated|produced)\b/i, + // "escalated through N tiers", "N distinct models" + /\bescalated\s+through\s+\d+\b/i, + /\b\d+\s+distinct\s+(model|tier)s?\b/i, +]; + export interface ParsedClaims { claims: Claim[]; commits_scanned: number; @@ -77,8 +96,21 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out const line = lines[i]; if (line.length < 3) continue; - // Strong patterns first — if a line matches strong, it's strong, - // don't double-count as moderate. + // Empirical match wins over everything else — if a line ALSO + // contains a moderate word like "complete", we still want to + // classify it as empirical so the inference check doesn't ask + // the cloud to prove "58 cloud calls" from the diff. Order: + // empirical → strong → moderate → weak. + const empirical = firstMatch(line, EMPIRICAL_PATTERNS); + if (empirical) { + out.push({ + text: line.trim().slice(0, 200), + commit_sha, + location: `${location_prefix}:${i + 1}`, + strength: "empirical", + }); + continue; + } const strong = firstMatch(line, STRONG_PATTERNS); if (strong) { out.push({ diff --git a/auditor/types.ts b/auditor/types.ts index 5ab0360..9ce7609 100644 --- a/auditor/types.ts +++ b/auditor/types.ts @@ -18,7 +18,14 @@ export interface Claim { // Heuristic rating of how strong the claim is. "green+tested" // is strong; "should work" is weak. Drives sensitivity — stronger // claims get harder-blocked on weak evidence. 
- strength: "weak" | "moderate" | "strong"; + // + // "empirical" is a separate class: runtime measurements like + // "N cloud calls" / "306s end-to-end" / "accepted on attempt N". + // These cannot be verified from a static diff — only from the test + // output that produced them. Inference skips diff-verification for + // empirical claims; they become info-level context unless a future + // runtime_evidence check contradicts them. + strength: "weak" | "moderate" | "strong" | "empirical"; } export interface Finding {