claim_parser: skip quoted patterns + tighten PR regex
Some checks failed
lakehouse/auditor 7 warnings — see review
Some checks failed
lakehouse/auditor 7 warnings — see review
Two fixes observed in test sweep on b25e368:
1. The "Phase 45 shipped" quoted test example in a commit message
body was triggering STRONG_PATTERNS despite being inside quotes —
produced a block finding that flipped 1/0/1 across 3 back-to-back
audits. Same bug class as auditor/checks/static.ts (fixed earlier):
rubric files quote pattern examples, parser can't distinguish.
Fix: firstUnquotedMatch() replaces firstMatch(); uses isInsideQuotedString()
to check whether the regex's match position falls inside double /
single / backtick quotes on the line. Mirrors static.ts exactly.
2. A regex misfire: `(?:PR|commit|prior|...)` in history/proof
patterns was matching "verified ... in production" because `PR`
(2 chars) matched the first 2 chars of "production" before the
`\s*#?\w*` tail absorbed the rest. Tightened to require a digit
after PR (`PR\s*#?\d+`) and commit to require a hex hash.
Verified: 3 back-to-back audit_one runs before this fix showed the
Phase 45 block flipping 1/0/1; after these fixes, unit tests confirm
quoted examples skip correctly AND real claims ("Phase 45 shipped",
"verified end-to-end against production", "Verified end-to-end on
PR #8") still classify correctly.
This commit is contained in:
parent
b25e36881c
commit
2a97fd7237
@ -78,8 +78,11 @@ const EMPIRICAL_PATTERNS: RegExp[] = [
|
|||||||
|
|
||||||
// ─── History / proof references ───
|
// ─── History / proof references ───
|
||||||
// "verified on PR #8", "verified end-to-end on PR 8", "tested against PR #4"
|
// "verified on PR #8", "verified end-to-end on PR 8", "tested against PR #4"
|
||||||
/\bverified\s+(?:end[- ]to[- ]end\s+)?(?:on|against|in)\s+(?:PR|commit|prior|the\s+\w+\s+audit)\s*#?\w*/i,
|
// Require PR#N / commit-hash / "prior <word>" to avoid matching
|
||||||
/\btested\s+(?:against|in|on)\s+(?:PR|commit|prior)\s*#?\w*/i,
|
// "verified ... in production" (PR without \b-ish anchor previously
|
||||||
|
// consumed "pr" of "production").
|
||||||
|
/\bverified\s+(?:end[- ]to[- ]end\s+)?(?:on|against|in)\s+(?:PR\s*#?\d+|commit\s+[0-9a-f]{6,}|prior\s+\w+|the\s+\w+\s+audit)\b/i,
|
||||||
|
/\btested\s+(?:against|in|on)\s+(?:PR\s*#?\d+|commit\s+[0-9a-f]{6,}|prior\s+\w+)\b/i,
|
||||||
// Direct PR/commit references: "PR #8", "on PR 9", "from commit abc123"
|
// Direct PR/commit references: "PR #8", "on PR 9", "from commit abc123"
|
||||||
/\b(?:on|from|in|via|per)\s+PR\s*#?\d+\b/i,
|
/\b(?:on|from|in|via|per)\s+PR\s*#?\d+\b/i,
|
||||||
/\b(?:from|in|per|against)\s+commit\s+[0-9a-f]{6,}/i,
|
/\b(?:from|in|per|against)\s+commit\s+[0-9a-f]{6,}/i,
|
||||||
@ -131,7 +134,7 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
|
|||||||
// classify it as empirical so the inference check doesn't ask
|
// classify it as empirical so the inference check doesn't ask
|
||||||
// the cloud to prove "58 cloud calls" from the diff. Order:
|
// the cloud to prove "58 cloud calls" from the diff. Order:
|
||||||
// empirical → strong → moderate → weak.
|
// empirical → strong → moderate → weak.
|
||||||
const empirical = firstMatch(line, EMPIRICAL_PATTERNS);
|
const empirical = firstUnquotedMatch(line, EMPIRICAL_PATTERNS);
|
||||||
if (empirical) {
|
if (empirical) {
|
||||||
out.push({
|
out.push({
|
||||||
text: line.trim().slice(0, 200),
|
text: line.trim().slice(0, 200),
|
||||||
@ -141,7 +144,7 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
|
|||||||
});
|
});
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
const strong = firstMatch(line, STRONG_PATTERNS);
|
const strong = firstUnquotedMatch(line, STRONG_PATTERNS);
|
||||||
if (strong) {
|
if (strong) {
|
||||||
out.push({
|
out.push({
|
||||||
text: line.trim().slice(0, 200),
|
text: line.trim().slice(0, 200),
|
||||||
@ -151,7 +154,7 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
|
|||||||
});
|
});
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
const moderate = firstMatch(line, MODERATE_PATTERNS);
|
const moderate = firstUnquotedMatch(line, MODERATE_PATTERNS);
|
||||||
if (moderate) {
|
if (moderate) {
|
||||||
out.push({
|
out.push({
|
||||||
text: line.trim().slice(0, 200),
|
text: line.trim().slice(0, 200),
|
||||||
@ -161,7 +164,7 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
|
|||||||
});
|
});
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
const weak = firstMatch(line, WEAK_PATTERNS);
|
const weak = firstUnquotedMatch(line, WEAK_PATTERNS);
|
||||||
if (weak) {
|
if (weak) {
|
||||||
out.push({
|
out.push({
|
||||||
text: line.trim().slice(0, 200),
|
text: line.trim().slice(0, 200),
|
||||||
@ -173,9 +176,35 @@ function scanText(text: string, location_prefix: string, commit_sha: string, out
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return the first pattern that has at least one match starting OUTSIDE a
// quoted string on the line, or null when no pattern qualifies.
//
// Mirrors the guard in auditor/checks/static.ts — the two files share the
// same false-positive class: PR authors quote pattern examples in commit
// message bodies (e.g. `"Phase 45 shipped"` as a test example) and without
// this guard those quoted references get flagged as fresh ship-claims.
//
// Every occurrence of a pattern is inspected, not just the first one, so a
// line that contains a quoted example AND a real (unquoted) use of the same
// vocabulary still classifies.
//
// @param text     A single line of text to classify.
// @param patterns Candidate patterns, checked in array order.
// @returns        The original RegExp object from `patterns` (not a clone),
//                 or null if every match of every pattern is quoted.
function firstUnquotedMatch(text: string, patterns: RegExp[]): RegExp | null {
  for (const p of patterns) {
    // matchAll() requires the global flag. Scan with a clone so the caller's
    // RegExp (and its lastIndex) is never mutated, and so patterns that
    // already carry `g` are handled instead of being silently skipped.
    const flags = p.flags.includes("g") ? p.flags : p.flags + "g";
    const scanner = new RegExp(p.source, flags);

    for (const m of text.matchAll(scanner)) {
      if (typeof m.index !== "number") continue;
      if (!isInsideQuotedString(text, m.index)) return p;
    }
  }
  return null;
}
|
||||||
|
|
||||||
|
// Reports whether character index `pos` of `line` lies inside an open
// double-quoted, single-quoted, or backtick-quoted span.
//
// Walks left→right over line[0..pos) toggling in-quote state on each
// unescaped quote character. A quote of one kind is ignored while a span of
// another kind is open. Good enough for single-line claims; multi-line
// strings aren't parsed.
//
// Two refinements over a plain toggle:
//   * A quote counts as escaped only when preceded by an ODD number of
//     backslashes, so `"C:\\"` closes correctly.
//   * An apostrophe glued to a preceding word character ("doesn't",
//     "authors'") never OPENS a span, and an apostrophe flanked by word
//     characters on both sides never CLOSES one. Otherwise ordinary prose
//     contractions would hide every claim that follows them on the line.
//
// NOTE(review): the sibling guard in auditor/checks/static.ts is described
// as identical to this one — confirm whether it needs the same handling.
//
// @param line The full line being scanned.
// @param pos  Index of the match start; only characters before it are read.
// @returns    true when a quoted span is still open at `pos`.
function isInsideQuotedString(line: string, pos: number): boolean {
  const wordChar = /\w/;
  const limit = Math.min(pos, line.length);

  let inDouble = false;
  let inSingle = false;
  let inBacktick = false;
  let backslashRun = 0; // consecutive backslashes immediately before index i

  for (let i = 0; i < limit; i++) {
    const c = line.charAt(i);

    if (c === "\\") {
      backslashRun++;
      continue;
    }
    const escaped = backslashRun % 2 === 1;
    backslashRun = 0;
    if (escaped) continue;

    if (c === '"') {
      if (!inSingle && !inBacktick) inDouble = !inDouble;
    } else if (c === "`") {
      if (!inDouble && !inSingle) inBacktick = !inBacktick;
    } else if (c === "'") {
      if (inDouble || inBacktick) continue;
      const afterWord = i > 0 && wordChar.test(line.charAt(i - 1));
      if (inSingle) {
        // Intra-word apostrophe inside a quoted span ('it's') is not a closer.
        const beforeWord = wordChar.test(line.charAt(i + 1));
        if (!(afterWord && beforeWord)) inSingle = false;
      } else if (!afterWord) {
        inSingle = true;
      }
    }
  }

  return inDouble || inSingle || inBacktick;
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user