diff --git a/auditor/claim_parser.ts b/auditor/claim_parser.ts index 8f05fa5..b17efbd 100644 --- a/auditor/claim_parser.ts +++ b/auditor/claim_parser.ts @@ -78,8 +78,11 @@ const EMPIRICAL_PATTERNS: RegExp[] = [ // ─── History / proof references ─── // "verified on PR #8", "verified end-to-end on PR 8", "tested against PR #4" - /\bverified\s+(?:end[- ]to[- ]end\s+)?(?:on|against|in)\s+(?:PR|commit|prior|the\s+\w+\s+audit)\s*#?\w*/i, - /\btested\s+(?:against|in|on)\s+(?:PR|commit|prior)\s*#?\w*/i, + // Require PR#N / commit-hash / "prior " to avoid matching + // "verified ... in production" (PR without \b-ish anchor previously + // consumed "pr" of "production"). + /\bverified\s+(?:end[- ]to[- ]end\s+)?(?:on|against|in)\s+(?:PR\s*#?\d+|commit\s+[0-9a-f]{6,}|prior\s+\w+|the\s+\w+\s+audit)\b/i, + /\btested\s+(?:against|in|on)\s+(?:PR\s*#?\d+|commit\s+[0-9a-f]{6,}|prior\s+\w+)\b/i, // Direct PR/commit references: "PR #8", "on PR 9", "from commit abc123" /\b(?:on|from|in|via|per)\s+PR\s*#?\d+\b/i, /\b(?:from|in|per|against)\s+commit\s+[0-9a-f]{6,}/i, @@ -131,7 +134,7 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out // classify it as empirical so the inference check doesn't ask // the cloud to prove "58 cloud calls" from the diff. Order: // empirical → strong → moderate → weak. 
- const empirical = firstMatch(line, EMPIRICAL_PATTERNS); + const empirical = firstUnquotedMatch(line, EMPIRICAL_PATTERNS); if (empirical) { out.push({ text: line.trim().slice(0, 200), @@ -141,7 +144,7 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out }); continue; } - const strong = firstMatch(line, STRONG_PATTERNS); + const strong = firstUnquotedMatch(line, STRONG_PATTERNS); if (strong) { out.push({ text: line.trim().slice(0, 200), @@ -151,7 +154,7 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out }); continue; } - const moderate = firstMatch(line, MODERATE_PATTERNS); + const moderate = firstUnquotedMatch(line, MODERATE_PATTERNS); if (moderate) { out.push({ text: line.trim().slice(0, 200), @@ -161,7 +164,7 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out }); continue; } - const weak = firstMatch(line, WEAK_PATTERNS); + const weak = firstUnquotedMatch(line, WEAK_PATTERNS); if (weak) { out.push({ text: line.trim().slice(0, 200), @@ -173,9 +176,35 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out } } -function firstMatch(text: string, patterns: RegExp[]): RegExp | null { +// Match a pattern only when its match position is NOT inside a quoted +// string on the line. Mirrors the same guard in auditor/checks/static.ts +// — the two files have the same false-positive class: PR authors +// quote pattern examples in commit message bodies (e.g. `"Phase 45 +// shipped"` as a test example) and without this guard those quoted +// references get flagged as fresh ship-claims. Only skips when the +// match itself falls inside quotes; real (unquoted) uses of the same +// vocabulary still classify correctly. 
/**
 * Returns the first pattern in `patterns` that has at least one occurrence
 * in `text` lying outside a quoted span, or `null` when no pattern does.
 *
 * Every occurrence of a pattern is examined, not just the leftmost one, so
 * a line such as `"Phase 45 shipped" is an example; Phase 46 shipped` still
 * classifies on the second, unquoted use.
 *
 * @param text     A single line of text to scan.
 * @param patterns Candidate patterns, tried in array order.
 * @returns The first pattern with an unquoted occurrence, else `null`.
 */
function firstUnquotedMatch(text: string, patterns: RegExp[]): RegExp | null {
  for (const p of patterns) {
    // Scan with a global clone: it lets us walk every occurrence, it keeps
    // the caller's RegExp (and its lastIndex) untouched, and it makes
    // patterns that already carry the `g` flag behave like the others.
    const flags = p.flags.includes("g") ? p.flags : p.flags + "g";
    const scanner = new RegExp(p.source, flags);
    let m: RegExpExecArray | null;
    while ((m = scanner.exec(text)) !== null) {
      if (!isInsideQuotedString(text, m.index)) return p;
      // A zero-length match does not advance lastIndex on its own; bump it
      // manually so the loop always terminates.
      if (m[0].length === 0) scanner.lastIndex++;
    }
  }
  return null;
}

// Single ASCII word character; used to tell an apostrophe ("it's") apart
// from a single-quote delimiter. Hoisted so it is not rebuilt per call.
const QUOTE_WORD_CHAR = /\w/;

/**
 * Reports whether position `pos` of `line` falls inside a quoted span
 * opened by `"`, `'` or a backtick earlier on the same line.
 *
 * Walks left to right, toggling the in-quote state on each delimiter:
 * - A backslash escapes the character that follows it, so `\"` does not
 *   toggle while `\\"` (escaped backslash, then a real quote) does.
 * - A `'` only opens a span when it is not preceded by a word character,
 *   and only closes one when it is not followed by a word character, so
 *   apostrophes in prose ("don't", "it's") are not treated as delimiters.
 *
 * Single-line heuristic only: multi-line strings are not parsed, and an
 * unterminated quote marks the rest of the line as quoted.
 *
 * @param line The full line being inspected.
 * @param pos  Zero-based index of the match start within `line`.
 * @returns `true` when `pos` is inside an open quoted span.
 */
function isInsideQuotedString(line: string, pos: number): boolean {
  let inDouble = false;
  let inSingle = false;
  let inBacktick = false;
  const end = Math.min(pos, line.length);
  for (let i = 0; i < end; i++) {
    const c = line.charAt(i);
    if (c === "\\") {
      // Skip the escaped character so it can neither toggle a quote nor
      // act as an escape for the character after it.
      i++;
      continue;
    }
    if (c === '"' && !inSingle && !inBacktick) {
      inDouble = !inDouble;
    } else if (c === "'" && !inDouble && !inBacktick) {
      if (inSingle) {
        // Closing quote must not be glued to a following word character.
        if (!QUOTE_WORD_CHAR.test(line.charAt(i + 1))) inSingle = false;
      } else if (!QUOTE_WORD_CHAR.test(line.charAt(i - 1))) {
        // Opening quote must not be glued to a preceding word character;
        // charAt(-1) is "" at the start of the line, which counts as open.
        inSingle = true;
      }
    } else if (c === "`" && !inDouble && !inSingle) {
      inBacktick = !inBacktick;
    }
  }
  return inDouble || inSingle || inBacktick;
}