lakehouse/auditor/claim_parser.ts
profit f4be27a879
Some checks failed
lakehouse/auditor 1 blocking issue: cloud: claim not backed — "the proven escalation ladder with learning context, collects"
auditor: fix two false-positive classes from cloud inference
Observed on PR #8 audit (de11ac4): 7 warn findings, all from the
cloud inference check. Investigation showed two distinct bug classes
that weren't "ship bad code"; they were "auditor misreads the diff":

1. Cloud flagged "X not defined in this diff / missing implementation"
   for symbols like `tailJsonl` and `stubFinding` that ARE defined —
   just not in the added lines of this diff. Fix: extract candidate
   symbols from the cloud's gap summary, grep the repo for their
   definitions (function/const/let/def/class/struct/enum/trait/fn).
   If every named symbol resolves, drop the finding; if only some
   do, demote to info with the resolution in evidence.

2. Cloud flagged runtime metrics like "58 cloud calls, 306s
   end-to-end" as unbacked claims. These are empirical outputs
   from running the test, not things a static diff can prove.
   Fix: claim_parser now has an `empirical` strength class
   matching iteration counts, cloud-call counts, duration metrics,
   attempt counts, tier-count phrases. Inference drops empirical
   claims from its cloud prompt (verifiable[] subset only) and
   claim-index mapping uses verifiable[] so cloud responses still
   line up.

Added `claims_empirical` to audit metrics so the verdict is
introspectable: how many claims WERE runtime-only vs how many
are diff-verifiable?

Verified: unit tests confirm empirical classification on 5
sample commit messages; symbol resolver found both false-positive
symbols (tailJsonl + stubFinding) and correctly skipped a known-
fake symbol.
2026-04-22 21:40:03 -05:00

152 lines
5.0 KiB
TypeScript

// Claim parser — reads commit messages + PR body, extracts ship-claims.
//
// A "ship-claim" is any phrase that asserts functionality is working,
// tested, complete, or landed. These are the assertions the downstream
// checks (static/dynamic/inference/kb) try to falsify.
//
// Heuristic approach (regex + strength grading) — intentionally NOT
// using an LLM here. Reason: the inference check already asks a cloud
// model "does this match the claim?". The parser's job is to surface
// the claim substrates, not judge them. Over-engineering the parser
// risks false-negatives when the cloud model was going to catch it
// anyway.
import type { Claim, PrSnapshot } from "./types.ts";
// Strong claims: explicit end-to-end + verification vocabulary.
// A strong claim asserts the change was actually exercised, not just
// written; these get the most aggressive downstream falsification.
// All patterns are whole-word, case-insensitive.
const STRONG_PATTERNS: RegExp[] = [
// "verified end-to-end", "verified live", "verified in production", "verified against …"
/\bverified\s+(end[- ]to[- ]end|live|in\s+production|against)\b/i,
// "tested live", "tested end-to-end", "tested against …", "tested with …"
/\btested\s+(live|end[- ]to[- ]end|against|with)\b/i,
// "works end-to-end" / "works live" / "works in production" —
// bare "works" with no qualifier is only a moderate claim.
/\bworks\s+(end[- ]to[- ]end|live|in\s+production)\b/i,
/\bproduction[- ]ready\b/i,
/\bfully\s+(functional|wired|working)\b/i,
// "phase 2 shipped", "phase 3.1 complete", "phase 4 landed", …
/\bphase\s+\d+(\.\d+)?\s+(shipped|complete|done|landed)\b/i,
/\bground\s+truth\b/i,
/\bproven\b/i,
];
// Moderate claims: asserted completion or pass but without the strong
// verification qualifier. All patterns are whole-word, case-insensitive.
const MODERATE_PATTERNS: RegExp[] = [
  /\bshipped\b/i,
  /\blanded\b/i,
  /\bgreen\b/i,
  // "tests pass", "test passes", "tests passed", plus standalone
  // "passing"/"passed". Bare "pass" WITHOUT a "test(s)" prefix is
  // deliberately not matched — "pass the config through" is not a
  // completion claim. (The previous pattern required -ing/-ed even
  // after "tests ", so the very common "all tests pass" was missed.)
  /\b(tests?\s+pass(es|ing|ed)?|pass(ing|ed))\b/i,
  /\bcomplet(e|ed)\b/i,
  /\bdone\b/i,
  /\bwired\b/i,
  /\bfixed\b/i,
  /\bworks\b/i,
];
// Weak claims: aspirational or hedged. Usually low-risk but recorded
// for completeness — downstream checks may still surface them.
// All patterns are whole-word, case-insensitive.
const WEAK_PATTERNS: RegExp[] = [
/\bshould\s+work\b/i,
/\bexpected\s+to\b/i,
/\bintended\s+to\b/i,
// future-tense assertions: "will work", "will handle", "will support"
/\bwill\s+(work|handle|support)\b/i,
/\bprobably\b/i,
];
// Empirical claims: runtime measurements / observed outcomes that can't
// be verified from a diff (only from the actual run that produced
// them). Example: "6/6 iterations complete, 58 cloud calls, 306s
// end-to-end" — true, but only the test's own summary.json can
// confirm it. Classifying as empirical lets the inference check skip
// diff-verification and saves the ladder for falsifiable claims.
const EMPIRICAL_PATTERNS: RegExp[] = [
// Iteration / attempt counts: "6/6 iterations", "attempt 5", "accepted on attempt 3"
/\b\d+\s*\/\s*\d+\s+(iterations?|attempts?|cycles?|runs?|shards?)\b/i,
/\b(accepted|resolved|converged)\s+on\s+attempt\s+\d+\b/i,
// Runtime metrics: "58 cloud calls", "306s end-to-end", "245s total", "5931 chars"
// NOTE: "cloud" is optional, so a bare "58 calls" also classifies as empirical.
/\b\d+\s+(cloud\s+)?calls?\b/i,
// duration + a qualifier word; bare "306s" alone is NOT enough
/\b\d+\s*(ms|s|seconds?|minutes?|m)\s+(end[- ]to[- ]end|total|elapsed|duration)\b/i,
// char counts only qualify when paired with an outcome verb on the same line
/\b\d+\s+chars?\b.*\b(accepted|generated|produced)\b/i,
// "escalated through N tiers", "N distinct models"
/\bescalated\s+through\s+\d+\b/i,
/\b\d+\s+distinct\s+(model|tier)s?\b/i,
];
/** Result of one claim-extraction pass over a PR snapshot. */
export interface ParsedClaims {
// All claims found, in scan order: PR body first, then each commit.
claims: Claim[];
// Total commits inspected — includes commits whose message was empty
// and contributed no claims.
commits_scanned: number;
}
/**
 * Extract ship-claims from a PR snapshot: the PR body and every
 * non-empty commit message are scanned line-by-line.
 *
 * @param pr Snapshot of the PR (body, head sha, commit list).
 * @returns Claims in scan order plus the number of commits inspected.
 */
export function parseClaims(pr: PrSnapshot): ParsedClaims {
  const collected: Claim[] = [];

  // PR body first — matches are located as "pr_body:N".
  if (pr.body) {
    scanText(pr.body, "pr_body", pr.head_sha, collected);
  }

  // Then each commit message, located by its short sha.
  for (const { message, sha } of pr.commits) {
    if (message) {
      scanText(message, `commit:${sha.slice(0, 8)}`, sha, collected);
    }
  }

  return { claims: collected, commits_scanned: pr.commits.length };
}
// Classification tiers in precedence order. An empirical match wins
// over everything else — if a line ALSO contains a moderate word like
// "complete" (e.g. "6/6 iterations complete, 58 cloud calls"), we
// still classify it empirical so the inference check doesn't ask the
// cloud to prove "58 cloud calls" from the diff.
// Order: empirical → strong → moderate → weak.
const STRENGTH_TIERS: Array<{ strength: Claim["strength"]; patterns: RegExp[] }> = [
  { strength: "empirical", patterns: EMPIRICAL_PATTERNS },
  { strength: "strong", patterns: STRONG_PATTERNS },
  { strength: "moderate", patterns: MODERATE_PATTERNS },
  { strength: "weak", patterns: WEAK_PATTERNS },
];

/**
 * Scan one text blob line-by-line, classify each matching line into
 * the first tier whose pattern list hits, and append it to `out`.
 *
 * @param text            Raw PR body or commit message.
 * @param location_prefix E.g. "pr_body" or "commit:abcd1234"; claims
 *                        are located as `${location_prefix}:<1-based line>`.
 * @param commit_sha      Sha recorded on every claim produced here.
 * @param out             Accumulator — claims are pushed, not returned.
 */
function scanText(text: string, location_prefix: string, commit_sha: string, out: Claim[]): void {
  const lines = text.split(/\r?\n/);
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    // Trivially short lines ("", "-", "ok") are never claim material.
    if (line.length < 3) continue;
    for (const tier of STRENGTH_TIERS) {
      if (firstMatch(line, tier.patterns)) {
        out.push({
          // Cap stored text so one pathological line can't bloat the audit record.
          text: line.trim().slice(0, 200),
          commit_sha,
          location: `${location_prefix}:${i + 1}`,
          strength: tier.strength,
        });
        break; // first matching tier wins; one claim per line
      }
    }
  }
}
/**
 * Return the first pattern in `patterns` that matches `text`, or null
 * when none do. (None of the pattern lists use the `g` flag, so
 * `test` here is stateless.)
 */
function firstMatch(text: string, patterns: RegExp[]): RegExp | null {
  return patterns.find((pattern) => pattern.test(text)) ?? null;
}