claim_parser: skip quoted patterns + tighten PR regex
Some checks failed
lakehouse/auditor 7 warnings — see review

Two fixes observed in test sweep on b25e368:

1. The "Phase 45 shipped" quoted test example in a commit message
   body was triggering STRONG_PATTERNS despite being inside quotes —
   produced a block finding that flipped 1/0/1 across 3 back-to-back
   audits. Same bug class as auditor/checks/static.ts (fixed earlier):
   rubric files quote pattern examples, parser can't distinguish.

   Fix: firstUnquotedMatch() replaces firstMatch() at the scanText call
   sites; it uses isInsideQuotedString() to check whether the regex's
   match position falls inside double / single / backtick quotes on the
   line. Mirrors static.ts exactly.

2. A regex misfire: `(?:PR|commit|prior|...)` in history/proof
   patterns was matching "verified ... in production" because `PR`
   (2 chars) matched the first 2 chars of "production" before the
   `\s*#?\w*` tail absorbed the rest. Tightened to require a digit
   after PR (`PR\s*#?\d+`) and commit to require a hex hash.

Verified: 3 back-to-back audit_one runs before this fix showed the
Phase 45 block flipping 1/0/1; after these fixes, unit tests confirm
quoted examples skip correctly AND real claims ("Phase 45 shipped",
"verified end-to-end against production", "Verified end-to-end on
PR #8") still classify correctly.
This commit is contained in:
profit 2026-04-23 00:18:58 -05:00
parent b25e36881c
commit 2a97fd7237

View File

@ -78,8 +78,11 @@ const EMPIRICAL_PATTERNS: RegExp[] = [
// ─── History / proof references ───
// "verified on PR #8", "verified end-to-end on PR 8", "tested against PR #4"
/\bverified\s+(?:end[- ]to[- ]end\s+)?(?:on|against|in)\s+(?:PR|commit|prior|the\s+\w+\s+audit)\s*#?\w*/i,
/\btested\s+(?:against|in|on)\s+(?:PR|commit|prior)\s*#?\w*/i,
// Require PR#N / commit-hash / "prior <word>" to avoid matching
// "verified ... in production" (PR without \b-ish anchor previously
// consumed "pr" of "production").
/\bverified\s+(?:end[- ]to[- ]end\s+)?(?:on|against|in)\s+(?:PR\s*#?\d+|commit\s+[0-9a-f]{6,}|prior\s+\w+|the\s+\w+\s+audit)\b/i,
/\btested\s+(?:against|in|on)\s+(?:PR\s*#?\d+|commit\s+[0-9a-f]{6,}|prior\s+\w+)\b/i,
// Direct PR/commit references: "PR #8", "on PR 9", "from commit abc123"
/\b(?:on|from|in|via|per)\s+PR\s*#?\d+\b/i,
/\b(?:from|in|per|against)\s+commit\s+[0-9a-f]{6,}/i,
@ -131,7 +134,7 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
// classify it as empirical so the inference check doesn't ask
// the cloud to prove "58 cloud calls" from the diff. Order:
// empirical → strong → moderate → weak.
const empirical = firstMatch(line, EMPIRICAL_PATTERNS);
const empirical = firstUnquotedMatch(line, EMPIRICAL_PATTERNS);
if (empirical) {
out.push({
text: line.trim().slice(0, 200),
@ -141,7 +144,7 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
});
continue;
}
const strong = firstMatch(line, STRONG_PATTERNS);
const strong = firstUnquotedMatch(line, STRONG_PATTERNS);
if (strong) {
out.push({
text: line.trim().slice(0, 200),
@ -151,7 +154,7 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
});
continue;
}
const moderate = firstMatch(line, MODERATE_PATTERNS);
const moderate = firstUnquotedMatch(line, MODERATE_PATTERNS);
if (moderate) {
out.push({
text: line.trim().slice(0, 200),
@ -161,7 +164,7 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
});
continue;
}
const weak = firstMatch(line, WEAK_PATTERNS);
const weak = firstUnquotedMatch(line, WEAK_PATTERNS);
if (weak) {
out.push({
text: line.trim().slice(0, 200),
@ -173,9 +176,35 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
}
}
function firstMatch(text: string, patterns: RegExp[]): RegExp | null {
/**
 * Returns the first pattern in `patterns` that matches `text` at a position
 * that is NOT inside a quoted string on the line, or `null` when no pattern
 * has such a match.
 *
 * Why: authors quote pattern examples in commit message bodies (e.g.
 * `"Phase 45 shipped"` as a test example); without this guard those quoted
 * references get flagged as fresh ship-claims. Only matches that start inside
 * quotes are skipped; real (unquoted) uses of the same vocabulary still
 * classify. The original comment notes the same guard exists in
 * auditor/checks/static.ts.
 *
 * Every occurrence of a pattern is examined, so a quoted example earlier on
 * the line does not hide a genuine unquoted claim later on the same line.
 *
 * @param text     A single line of text to classify.
 * @param patterns Candidate patterns, tried in array order.
 * @returns The first pattern with an unquoted match, or `null`.
 */
function firstUnquotedMatch(text: string, patterns: RegExp[]): RegExp | null {
  for (const p of patterns) {
    // `search` ignores the `g` flag and restores `lastIndex`, so it is safe
    // to use on shared module-level regexes.
    const first = text.search(p);
    if (first === -1) continue;
    if (!isInsideQuotedString(text, first)) return p;

    // Rare path: the first hit is quoted. Walk the remaining occurrences with
    // a global clone so the caller's regex state is never mutated.
    const flags = p.flags.includes("g") ? p.flags : p.flags + "g";
    const scanner = new RegExp(p.source, flags);
    let m: RegExpExecArray | null;
    while ((m = scanner.exec(text)) !== null) {
      if (!isInsideQuotedString(text, m.index)) return p;
      // Guard against an infinite loop on zero-length matches.
      if (m[0].length === 0) scanner.lastIndex++;
    }
  }
  return null;
}
/**
 * Reports whether position `pos` on `line` falls inside a double-, single-,
 * or backtick-quoted span.
 *
 * Walks left→right over the characters before `pos`, opening a span on an
 * unescaped quote character and closing it on the next unescaped quote of the
 * same kind; other quote characters inside an open span are ignored. Good
 * enough for single-line claims; multi-line strings aren't parsed.
 *
 * A backslash escapes exactly the next character, so in `\\"` the quote is
 * real (the backslash itself was escaped) while in `\"` it is skipped.
 *
 * @param line The full line being inspected.
 * @param pos  Index into `line` (e.g. a regex match index) to test.
 * @returns `true` when an unclosed quote precedes `pos`.
 */
function isInsideQuotedString(line: string, pos: number): boolean {
  let open: string | null = null; // quote char of the currently open span
  let escaped = false; // true when the previous char was an unescaped backslash
  const end = Math.min(pos, line.length);
  for (let i = 0; i < end; i++) {
    const c = line[i];
    if (escaped) {
      // This character is consumed by the preceding backslash.
      escaped = false;
      continue;
    }
    if (c === "\\") {
      escaped = true;
      continue;
    }
    if (c === '"' || c === "'" || c === "`") {
      if (open === null) open = c;
      else if (open === c) open = null;
    }
  }
  return open !== null;
}