auditor: fix two false-positive classes from cloud inference
Some checks failed
lakehouse/auditor 1 blocking issue: cloud: claim not backed — "the proven escalation ladder with learning context, collects"
Observed on PR #8 audit (de11ac4): 7 warn findings, all from the cloud inference check. Investigation showed two distinct bug classes that weren't "ship bad code" but "auditor misreads the diff":

1. Cloud flagged "X not defined in this diff / missing implementation" for symbols like `tailJsonl` and `stubFinding` that ARE defined — just not in the added lines of this diff. Fix: extract candidate symbols from the cloud's gap summary, then grep the repo for their definitions (function/const/let/def/class/struct/enum/trait/fn). If every named symbol resolves, drop the finding; if only some do, demote to info with the resolution in evidence.

2. Cloud flagged runtime metrics like "58 cloud calls, 306s end-to-end" as unbacked claims. These are empirical outputs from running the test, not things a static diff can prove. Fix: claim_parser now has an `empirical` strength class matching iteration counts, cloud-call counts, duration metrics, attempt counts, and tier-count phrases. Inference drops empirical claims from its cloud prompt (sending the verifiable[] subset only), and claim-index mapping uses verifiable[] so cloud responses still line up. Added `claims_empirical` to the audit metrics so the verdict is introspectable: how many claims were runtime-only vs. diff-verifiable?

Verified: unit tests confirm empirical classification on 5 sample commit messages; the symbol resolver found both false-positive symbols (tailJsonl + stubFinding) and correctly skipped a known-fake symbol.
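To illustrate fix 2, a minimal standalone sketch (the sample lines and the classify() helper are illustrative only; the two regexes are copied from the EMPIRICAL_PATTERNS added below):

    // Two of the EMPIRICAL_PATTERNS from claim_parser, applied to sample lines.
    const EMPIRICAL: RegExp[] = [
      /\b\d+\s*\/\s*\d+\s+(iterations?|attempts?|cycles?|runs?|shards?)\b/i,
      /\b\d+\s+(cloud\s+)?calls?\b/i,
    ];
    const classify = (line: string): string =>
      EMPIRICAL.some(re => re.test(line)) ? "empirical" : "needs-diff-verification";

    console.log(classify("6/6 iterations complete, 58 cloud calls")); // empirical
    console.log(classify("all unit tests pass on this branch"));      // needs-diff-verification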
This commit is contained in: commit f4be27a879 (parent de11ac4018)
@@ -77,6 +77,7 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
     claims_strong: claims.filter(c => c.strength === "strong").length,
     claims_moderate: claims.filter(c => c.strength === "moderate").length,
     claims_weak: claims.filter(c => c.strength === "weak").length,
+    claims_empirical: claims.filter(c => c.strength === "empirical").length,
     claims_total: claims.length,
     diff_bytes: diff.length,
   };

@@ -13,6 +13,8 @@
 // with a 15KB diff + claim list).

 import type { Claim, Finding } from "../types.ts";
+import { Glob } from "bun";
+import { readFile } from "node:fs/promises";

 const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
 const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
@@ -22,6 +24,7 @@ const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
 // block finding when the file was simply outside the truncation window.
 const MAX_DIFF_CHARS = 40000;
 const CALL_TIMEOUT_MS = 120_000;
+const REPO_ROOT = "/home/profit/lakehouse";

 export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
   if (claims.length === 0) {
@@ -33,6 +36,21 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
     }];
   }

+  // Empirical claims (runtime metrics / observed outcomes) can't be
+  // verified from the diff. Drop them from the cloud prompt so the
+  // reviewer doesn't chase ghosts. A future `runtime_evidence` check
+  // can validate these against data/_kb/*/summary.json outputs.
+  const verifiable = claims.filter(c => c.strength !== "empirical");
+  const empiricalCount = claims.length - verifiable.length;
+  if (verifiable.length === 0) {
+    return [{
+      check: "inference",
+      severity: "info",
+      summary: `all ${claims.length} claims are empirical (runtime metrics) — skipping cloud inference`,
+      evidence: [`empirical claims can't be verified from a static diff; needs runtime-evidence check`],
+    }];
+  }
+
   const truncated = diff.length > MAX_DIFF_CHARS
     ? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
     : diff;
@@ -70,7 +88,7 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<

   const userMsg = [
     `Ship-claims the author made (numbered 0..N-1):`,
-    claims.map((c, i) => `  ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
+    verifiable.map((c, i) => `  ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
     "",
     `Diff:`,
     "```",
@@ -152,7 +170,9 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
   for (const v of parsed.claim_verdicts ?? []) {
     if (v?.backed === false) {
       const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
-      const claim = claims[idx];
+      // Indices point at the verifiable[] list we sent the cloud,
+      // not the full claims[] list. Translate back.
+      const claim = verifiable[idx];
       if (!claim) continue;
       // Strong+unbacked = BLOCK. That's the whole point of the auditor.
       const sev: Finding["severity"] = claim.strength === "strong" ? "block"
@@ -172,17 +192,109 @@ export async function runInferenceCheck(claims: Claim[], diff: string): Promise<
   }

   for (const g of parsed.unflagged_gaps ?? []) {
+    const summary = String(g?.summary ?? "?");
+    const location = String(g?.location ?? "?");
+    // False-positive guard — when the cloud says "X not defined in this
+    // diff" or "missing implementation of X", the cloud may just mean
+    // "X is not in the added lines," not "X doesn't exist in the repo."
+    // Extract candidate symbol names and grep the repo. If any symbol
+    // is defined elsewhere, drop the finding — it's a known-symbol
+    // reference, not a placeholder.
+    if (/not\s+defined|missing\s+implementation|never\s+referenced\s+or\s+integrated/i.test(summary)) {
+      const symbols = extractSymbols(summary);
+      if (symbols.length > 0) {
+        const resolved = await symbolsExistInRepo(symbols);
+        if (resolved.length === symbols.length) {
+          // Every named symbol exists somewhere in the repo — silent drop.
+          continue;
+        }
+        if (resolved.length > 0) {
+          // Partially resolved — demote to info with a note.
+          findings.push({
+            check: "inference",
+            severity: "info",
+            summary: `cloud gap partially resolved by repo grep: ${summary.slice(0, 120)}`,
+            evidence: [
+              `location: ${location.slice(0, 140)}`,
+              `resolved via grep: ${resolved.join(",")}`,
+              `unresolved: ${symbols.filter(s => !resolved.includes(s)).join(",")}`,
+            ],
+          });
+          continue;
+        }
+      }
+    }
     findings.push({
       check: "inference",
       severity: "warn",
-      summary: `cloud-flagged gap not in any claim: ${String(g?.summary ?? "?").slice(0, 120)}`,
-      evidence: [`location: ${String(g?.location ?? "?").slice(0, 140)}`],
+      summary: `cloud-flagged gap not in any claim: ${summary.slice(0, 120)}`,
+      evidence: [`location: ${location.slice(0, 140)}`],
     });
   }

   return findings;
 }

+// Pull out plausible code-symbol names from a summary string.
+// Matches:
+//   - identifier with backticks: `foo_bar`
+//   - identifier followed by parens: foo_bar()
+//   - CamelCase types
+//   - snake_case_functions
+// Filters out common English words that could be matched accidentally.
+const STOPWORDS = new Set([
+  "not","the","and","for","this","that","with","but","are","was","has",
+  "have","been","any","missing","implementation","diff","defined","never",
+  "referenced","integrated","flow","code","file","some","only","when",
+]);
+
+function extractSymbols(text: string): string[] {
+  const out = new Set<string>();
+  // `backticked` symbols
+  for (const m of text.matchAll(/`([A-Za-z_][A-Za-z0-9_]{2,})`/g)) out.add(m[1]);
+  // foo() or foo_bar() calls
+  for (const m of text.matchAll(/\b([A-Za-z_][A-Za-z0-9_]{2,})\s*\(/g)) out.add(m[1]);
+  // CamelCase types (3+ chars, must start with uppercase)
+  for (const m of text.matchAll(/\b([A-Z][A-Za-z0-9]{2,})\b/g)) out.add(m[1]);
+  return Array.from(out).filter(s => !STOPWORDS.has(s.toLowerCase()));
+}
+
+// Scan the repo for at least one definition of each symbol. Uses Bun's
+// Glob to walk TS/Rust/Python/JS sources; ignores node_modules, data/,
+// and target/.
+async function symbolsExistInRepo(symbols: string[]): Promise<string[]> {
+  const patterns = ["**/*.ts", "**/*.tsx", "**/*.rs", "**/*.py", "**/*.js"];
+  const skip = (p: string) => p.includes("/node_modules/") || p.startsWith("data/") || p.includes("/target/") || p.startsWith("dist/");
+  const resolved = new Set<string>();
+  const toFind = new Set(symbols);
+  for (const pat of patterns) {
+    if (toFind.size === 0) break;
+    const glob = new Glob(pat);
+    for await (const f of glob.scan({ cwd: REPO_ROOT, onlyFiles: true })) {
+      if (skip(f)) continue;
+      let content: string;
+      try { content = await readFile(`${REPO_ROOT}/${f}`, "utf8"); } catch { continue; }
+      for (const sym of Array.from(toFind)) {
+        // Definition heuristics: `function sym`, `fn sym`, `const sym`,
+        // `let sym`, `def sym`, `class sym`, `struct sym`, `enum sym`,
+        // `trait sym`, `async function sym`, `pub (async )?fn sym`.
+        const re = new RegExp(
+          `\\b(function|async\\s+function|const|let|var|def|class|struct|enum|trait|impl|type|interface|fn|pub\\s+(async\\s+)?fn)\\s+${escapeRe(sym)}\\b`
+        );
+        if (re.test(content)) {
+          resolved.add(sym);
+          toFind.delete(sym);
+          if (toFind.size === 0) break;
+        }
+      }
+    }
+  }
+  return Array.from(resolved);
+}
+
+function escapeRe(s: string): string {
+  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+}
+
 // Lift the first balanced JSON object out of the response. Tolerates
 // leading prose, code fences, and model reasoning preamble when the
 // cloud model ignored "strict JSON only."
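A usage sketch of the guard above (hypothetical driver; assumes extractSymbols and symbolsExistInRepo were exported for a test, which this diff does not do):

    // Feed the resolver the kind of gap summary the cloud produced on PR #8.
    const summary = "missing implementation of `tailJsonl`; never referenced or integrated";
    const symbols = extractSymbols(summary);            // ["tailJsonl"]
    const resolved = await symbolsExistInRepo(symbols);
    // tailJsonl is defined elsewhere in the repo, so resolved.length ===
    // symbols.length and the warn finding is silently dropped.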
@@ -49,6 +49,25 @@ const WEAK_PATTERNS: RegExp[] = [
   /\bprobably\b/i,
 ];

+// Empirical claims: runtime measurements / observed outcomes that can't
+// be verified from a diff (only from the actual run that produced
+// them). Example: "6/6 iterations complete, 58 cloud calls, 306s
+// end-to-end" — true, but only the test's own summary.json can
+// confirm it. Classifying as empirical lets the inference check skip
+// diff-verification and saves the ladder for falsifiable claims.
+const EMPIRICAL_PATTERNS: RegExp[] = [
+  // Iteration / attempt counts: "6/6 iterations", "attempt 5", "accepted on attempt 3"
+  /\b\d+\s*\/\s*\d+\s+(iterations?|attempts?|cycles?|runs?|shards?)\b/i,
+  /\b(accepted|resolved|converged)\s+on\s+attempt\s+\d+\b/i,
+  // Runtime metrics: "58 cloud calls", "306s end-to-end", "245s total", "5931 chars"
+  /\b\d+\s+(cloud\s+)?calls?\b/i,
+  /\b\d+\s*(ms|s|seconds?|minutes?|m)\s+(end[- ]to[- ]end|total|elapsed|duration)\b/i,
+  /\b\d+\s+chars?\b.*\b(accepted|generated|produced)\b/i,
+  // "escalated through N tiers", "N distinct models"
+  /\bescalated\s+through\s+\d+\b/i,
+  /\b\d+\s+distinct\s+(model|tier)s?\b/i,
+];
+
 export interface ParsedClaims {
   claims: Claim[];
   commits_scanned: number;
@@ -77,8 +96,21 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
     const line = lines[i];
     if (line.length < 3) continue;

-    // Strong patterns first — if a line matches strong, it's strong,
-    // don't double-count as moderate.
+    // Empirical match wins over everything else — if a line ALSO
+    // contains a moderate word like "complete", we still want to
+    // classify it as empirical so the inference check doesn't ask
+    // the cloud to prove "58 cloud calls" from the diff. Order:
+    // empirical → strong → moderate → weak.
+    const empirical = firstMatch(line, EMPIRICAL_PATTERNS);
+    if (empirical) {
+      out.push({
+        text: line.trim().slice(0, 200),
+        commit_sha,
+        location: `${location_prefix}:${i + 1}`,
+        strength: "empirical",
+      });
+      continue;
+    }
     const strong = firstMatch(line, STRONG_PATTERNS);
     if (strong) {
       out.push({
@@ -18,7 +18,14 @@ export interface Claim {
   // Heuristic rating of how strong the claim is. "green+tested"
   // is strong; "should work" is weak. Drives sensitivity — stronger
   // claims get harder-blocked on weak evidence.
-  strength: "weak" | "moderate" | "strong";
+  //
+  // "empirical" is a separate class: runtime measurements like
+  // "N cloud calls" / "306s end-to-end" / "accepted on attempt N".
+  // These cannot be verified from a static diff — only from the test
+  // output that produced them. Inference skips diff-verification for
+  // empirical claims; they become info-level context unless a future
+  // runtime_evidence check contradicts them.
+  strength: "weak" | "moderate" | "strong" | "empirical";
 }

 export interface Finding {