auditor: fix two false-positive classes from cloud inference
Some checks failed
lakehouse/auditor 1 blocking issue: cloud: claim not backed — "the proven escalation ladder with learning context, collects"

Observed on PR #8 audit (de11ac4): 7 warn findings, all from the
cloud inference check. Investigation showed two distinct bug classes
that weren't "ship bad code", they were "auditor misreads the diff":

1. Cloud flagged "X not defined in this diff / missing implementation"
   for symbols like `tailJsonl` and `stubFinding` that ARE defined —
   just not in the added lines of this diff. Fix: extract candidate
   symbols from the cloud's gap summary, grep the repo for their
   definitions (function/const/let/def/class/struct/enum/trait/fn).
   If every named symbol resolves, drop the finding; if some do,
   demote to info with the resolution in evidence.

2. Cloud flagged runtime metrics like "58 cloud calls, 306s
   end-to-end" as unbacked claims. These are empirical outputs
   from running the test, not things a static diff can prove.
   Fix: claim_parser now has an `empirical` strength class
   matching iteration counts, cloud-call counts, duration metrics,
   attempt counts, tier-count phrases. Inference drops empirical
   claims from its cloud prompt (verifiable[] subset only) and
   claim-index mapping uses verifiable[] so cloud responses still
   line up.

Added `claims_empirical` to audit metrics so the verdict is
introspectable: how many claims WERE runtime-only vs how many
are diff-verifiable?

Verified: unit tests confirm empirical classification on 5
sample commit messages; symbol resolver found both false-positive
symbols (tailJsonl + stubFinding) and correctly skipped a known-
fake symbol.
This commit is contained in:
profit 2026-04-22 21:40:03 -05:00
parent de11ac4018
commit f4be27a879
4 changed files with 159 additions and 7 deletions

View File

@ -77,6 +77,7 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
claims_strong: claims.filter(c => c.strength === "strong").length,
claims_moderate: claims.filter(c => c.strength === "moderate").length,
claims_weak: claims.filter(c => c.strength === "weak").length,
claims_empirical: claims.filter(c => c.strength === "empirical").length,
claims_total: claims.length,
diff_bytes: diff.length,
};

View File

@ -13,6 +13,8 @@
// with a 15KB diff + claim list).
import type { Claim, Finding } from "../types.ts";
import { Glob } from "bun";
import { readFile } from "node:fs/promises";
const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
@ -22,6 +24,7 @@ const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
// block finding when the file was simply outside the truncation window.
const MAX_DIFF_CHARS = 40000;
const CALL_TIMEOUT_MS = 120_000;
const REPO_ROOT = "/home/profit/lakehouse";
export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
if (claims.length === 0) {
@ -33,6 +36,21 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
}];
}
// Empirical claims (runtime metrics / observed outcomes) can't be
// verified from the diff. Drop them from the cloud prompt so the
// reviewer doesn't chase ghosts. A future `runtime_evidence` check
// can validate these against data/_kb/*/summary.json outputs.
const verifiable = claims.filter(c => c.strength !== "empirical");
const empiricalCount = claims.length - verifiable.length;
if (verifiable.length === 0) {
return [{
check: "inference",
severity: "info",
summary: `all ${claims.length} claims are empirical (runtime metrics) — skipping cloud inference`,
evidence: [`empirical claims can't be verified from a static diff; needs runtime-evidence check`],
}];
}
const truncated = diff.length > MAX_DIFF_CHARS
? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
: diff;
@ -70,7 +88,7 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
const userMsg = [
`Ship-claims the author made (numbered 0..N-1):`,
claims.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
verifiable.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
"",
`Diff:`,
"```",
@ -152,7 +170,9 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
for (const v of parsed.claim_verdicts ?? []) {
if (v?.backed === false) {
const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
const claim = claims[idx];
// Indices point at the verifiable[] list we sent the cloud,
// not the full claims[] list. Translate back.
const claim = verifiable[idx];
if (!claim) continue;
// Strong+unbacked = BLOCK. That's the whole point of the auditor.
const sev: Finding["severity"] = claim.strength === "strong" ? "block"
@ -172,17 +192,109 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
}
for (const g of parsed.unflagged_gaps ?? []) {
const summary = String(g?.summary ?? "?");
const location = String(g?.location ?? "?");
// False-positive guard — when the cloud says "X not defined in this
// diff" or "missing implementation of X", the cloud may just mean
// "X is not in the added lines," not "X doesn't exist in the repo."
// Extract candidate symbol names and grep the repo. If any symbol
// is defined elsewhere, drop the finding — it's a known-symbol
// reference, not a placeholder.
if (/not\s+defined|missing\s+implementation|never\s+referenced\s+or\s+integrated/i.test(summary)) {
const symbols = extractSymbols(summary);
if (symbols.length > 0) {
const resolved = await symbolsExistInRepo(symbols);
if (resolved.length === symbols.length) {
// Every named symbol exists somewhere in the repo — silent drop.
continue;
}
if (resolved.length > 0) {
// Partially resolved — demote to info with a note.
findings.push({
check: "inference",
severity: "info",
summary: `cloud gap partially resolved by repo grep: ${summary.slice(0, 120)}`,
evidence: [
`location: ${location.slice(0, 140)}`,
`resolved via grep: ${resolved.join(",")}`,
`unresolved: ${symbols.filter(s => !resolved.includes(s)).join(",")}`,
],
});
continue;
}
}
}
findings.push({
check: "inference",
severity: "warn",
summary: `cloud-flagged gap not in any claim: ${String(g?.summary ?? "?").slice(0, 120)}`,
evidence: [`location: ${String(g?.location ?? "?").slice(0, 140)}`],
summary: `cloud-flagged gap not in any claim: ${summary.slice(0, 120)}`,
evidence: [`location: ${location.slice(0, 140)}`],
});
}
return findings;
}
// Pull out plausible code-symbol names from a summary string.
// Candidates come from three shapes:
//   - identifier wrapped in backticks: `foo_bar`
//   - identifier followed by parens (a call site): foo_bar()
//   - CamelCase-looking words (3+ chars, leading uppercase)
// A stopword list screens out ordinary English words that the
// capitalized-word pattern would otherwise pick up by accident.
const STOPWORDS = new Set([
  "not","the","and","for","this","that","with","but","are","was","has",
  "have","been","any","missing","implementation","diff","defined","never",
  "referenced","integrated","flow","code","file","some","only","when",
]);

function extractSymbols(text: string): string[] {
  const candidatePatterns: RegExp[] = [
    /`([A-Za-z_][A-Za-z0-9_]{2,})`/g,        // `backticked` symbols
    /\b([A-Za-z_][A-Za-z0-9_]{2,})\s*\(/g,   // foo() / foo_bar() call sites
    /\b([A-Z][A-Za-z0-9]{2,})\b/g,           // CamelCase-looking types
  ];
  // Set preserves insertion order, so dedup here keeps the same
  // backticks-first ordering a single pass per pattern produces.
  const seen = new Set<string>();
  for (const pattern of candidatePatterns) {
    for (const hit of text.matchAll(pattern)) seen.add(hit[1]);
  }
  return [...seen].filter(candidate => !STOPWORDS.has(candidate.toLowerCase()));
}
// Scan the repo for at least one definition of each symbol. Uses Bun's
// Glob to walk TS/Rust/Python/JS sources; ignores node_modules, data/,
// dist/, and target/.
//
// Returns the subset of `symbols` for which a definition-like line
// (`function sym`, `const sym`, `def sym`, `class sym`, `fn sym`, ...)
// was found somewhere under REPO_ROOT.
async function symbolsExistInRepo(symbols: string[]): Promise<string[]> {
  const patterns = ["**/*.ts", "**/*.tsx", "**/*.rs", "**/*.py", "**/*.js"];
  const skip = (p: string) =>
    p.includes("/node_modules/") || p.startsWith("data/") || p.includes("/target/") || p.startsWith("dist/");

  // Compile each symbol's definition regex ONCE up front instead of
  // rebuilding it for every (file, symbol) pair — the previous inner-loop
  // construction cost O(files x symbols) RegExp compiles per audit.
  // Definition heuristics: `function sym`, `fn sym`, `const sym`,
  // `let sym`, `def sym`, `class sym`, `struct sym`, `enum sym`,
  // `trait sym`, `async function sym`, `pub (async )?fn sym`.
  const pending = new Map<string, RegExp>();
  for (const sym of symbols) {
    pending.set(
      sym,
      new RegExp(
        `\\b(function|async\\s+function|const|let|var|def|class|struct|enum|trait|impl|type|interface|fn|pub\\s+(async\\s+)?fn)\\s+${escapeRe(sym)}\\b`
      )
    );
  }

  const resolved = new Set<string>();
  outer: for (const pat of patterns) {
    if (pending.size === 0) break;
    const glob = new Glob(pat);
    for await (const f of glob.scan({ cwd: REPO_ROOT, onlyFiles: true })) {
      // Stop reading files as soon as every symbol has resolved. The
      // previous version only checked between glob patterns, so it kept
      // reading the rest of the tree after the last symbol was found.
      if (pending.size === 0) break outer;
      if (skip(f)) continue;
      let content: string;
      try { content = await readFile(`${REPO_ROOT}/${f}`, "utf8"); } catch { continue; }
      for (const [sym, re] of pending) {
        if (re.test(content)) {
          resolved.add(sym);
          pending.delete(sym); // safe: Map iteration tolerates deleting the current entry
        }
      }
    }
  }
  return Array.from(resolved);
}
// Backslash-escape every regex metacharacter in `s` so it can be embedded
// in a RegExp source string and match literally.
function escapeRe(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, (ch) => `\\${ch}`);
}
// Lift the first balanced JSON object out of the response. Tolerates
// leading prose, code fences, and model reasoning preamble when the
// cloud model ignored "strict JSON only."

View File

@ -49,6 +49,25 @@ const WEAK_PATTERNS: RegExp[] = [
/\bprobably\b/i,
];
// Empirical claims: runtime measurements / observed outcomes that can't
// be verified from a diff (only from the actual run that produced
// them). Example: "6/6 iterations complete, 58 cloud calls, 306s
// end-to-end" — true, but only the test's own summary.json can
// confirm it. Classifying as empirical lets the inference check skip
// diff-verification and saves the ladder for falsifiable claims.
//
// A line matching ANY pattern below is classified empirical; matching
// is case-insensitive throughout.
const EMPIRICAL_PATTERNS: RegExp[] = [
  // Iteration / attempt counts: "6/6 iterations", "attempt 5", "accepted on attempt 3"
  /\b\d+\s*\/\s*\d+\s+(iterations?|attempts?|cycles?|runs?|shards?)\b/i,
  /\b(accepted|resolved|converged)\s+on\s+attempt\s+\d+\b/i,
  // Runtime metrics: "58 cloud calls", "306s end-to-end", "245s total", "5931 chars"
  // NOTE(review): the bare `N calls` form (the `cloud` qualifier is optional)
  // also matches prose like "fixed 3 calls to foo" — confirm that's acceptable
  // before widening further.
  /\b\d+\s+(cloud\s+)?calls?\b/i,
  /\b\d+\s*(ms|s|seconds?|minutes?|m)\s+(end[- ]to[- ]end|total|elapsed|duration)\b/i,
  // Char counts only when paired with an outcome verb, to avoid flagging
  // plain size mentions.
  /\b\d+\s+chars?\b.*\b(accepted|generated|produced)\b/i,
  // "escalated through N tiers", "N distinct models"
  /\bescalated\s+through\s+\d+\b/i,
  /\b\d+\s+distinct\s+(model|tier)s?\b/i,
];
export interface ParsedClaims {
claims: Claim[];
commits_scanned: number;
@ -77,8 +96,21 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
const line = lines[i];
if (line.length < 3) continue;
// Strong patterns first — if a line matches strong, it's strong,
// don't double-count as moderate.
// Empirical match wins over everything else — if a line ALSO
// contains a moderate word like "complete", we still want to
// classify it as empirical so the inference check doesn't ask
// the cloud to prove "58 cloud calls" from the diff. Order:
// empirical → strong → moderate → weak.
const empirical = firstMatch(line, EMPIRICAL_PATTERNS);
if (empirical) {
out.push({
text: line.trim().slice(0, 200),
commit_sha,
location: `${location_prefix}:${i + 1}`,
strength: "empirical",
});
continue;
}
const strong = firstMatch(line, STRONG_PATTERNS);
if (strong) {
out.push({

View File

@ -18,7 +18,14 @@ export interface Claim {
// Heuristic rating of how strong the claim is. "green+tested"
// is strong; "should work" is weak. Drives sensitivity — stronger
// claims get harder-blocked on weak evidence.
strength: "weak" | "moderate" | "strong";
//
// "empirical" is a separate class: runtime measurements like
// "N cloud calls" / "306s end-to-end" / "accepted on attempt N".
// These cannot be verified from a static diff — only from the test
// output that produced them. Inference skips diff-verification for
// empirical claims; they become info-level context unless a future
// runtime_evidence check contradicts them.
strength: "weak" | "moderate" | "strong" | "empirical";
}
export interface Finding {