Bundles PR #9's work for the audit pipeline: - N=3 consensus on cloud inference (gpt-oss:120b parallel) with qwen3-coder:480b tie-breaker - audit_discrepancies.jsonl logs N-run disagreements - scrum_master reviews route through llm_team fact extraction; source="scrum_review" - Verifier-gated persistence: drops INCORRECT, keeps UNVERIFIABLE/UNCHECKED; schema_version:2 - scrum_master_reviewed flag on accepted reviews - auditor/kb_stats.ts: on-demand observability script - claim_parser history/proof pattern class (verified-on-PR, was-flipping, the-proven-X) - claim_parser quoted-string guard (mirrors static.ts fix) - fact_extractor project context injection via docs/AUDITOR_CONTEXT.md - Fixed verifier-verdict parser to handle multiple gemma2 output formats Empirical: 3-run determinism test on unchanged PR #9 SHA showed 7/7 warn findings stable; block count oscillation eliminated; llm_team quality scores 8-9 on context-injected extract runs. See PR #9 for full run-by-run commit history.
211 lines
8.4 KiB
TypeScript
// Claim parser — reads commit messages + PR body, extracts ship-claims.
//
// A "ship-claim" is any phrase that asserts functionality is working,
// tested, complete, or landed. These are the assertions the downstream
// checks (static/dynamic/inference/kb) try to falsify.
//
// Heuristic approach (regex + strength grading) — intentionally NOT
// using an LLM here. Reason: the inference check already asks a cloud
// model "does this match the claim?". The parser's job is to surface
// the claim substrates, not judge them. Over-engineering the parser
// risks false negatives when the cloud model was going to catch it
// anyway.
|
import type { Claim, PrSnapshot } from "./types.ts";
|
|
|
|
// Strong claims: explicit end-to-end + verification vocabulary
|
|
const STRONG_PATTERNS: RegExp[] = [
|
|
/\bverified\s+(end[- ]to[- ]end|live|in\s+production|against)\b/i,
|
|
/\btested\s+(live|end[- ]to[- ]end|against|with)\b/i,
|
|
/\bworks\s+(end[- ]to[- ]end|live|in\s+production)\b/i,
|
|
/\bproduction[- ]ready\b/i,
|
|
/\bfully\s+(functional|wired|working)\b/i,
|
|
/\bphase\s+\d+(\.\d+)?\s+(shipped|complete|done|landed)\b/i,
|
|
/\bground\s+truth\b/i,
|
|
/\bproven\b/i,
|
|
];
|
|
|
|
// Moderate claims: asserted completion or pass but without the strong
|
|
// verification qualifier.
|
|
const MODERATE_PATTERNS: RegExp[] = [
|
|
/\bshipped\b/i,
|
|
/\blanded\b/i,
|
|
/\bgreen\b/i,
|
|
/\b(tests?\s+)?pass(ing|ed)\b/i,
|
|
/\bcomplet(e|ed)\b/i,
|
|
/\bdone\b/i,
|
|
/\bwired\b/i,
|
|
/\bfixed\b/i,
|
|
/\bworks\b/i,
|
|
];
|
|
|
|
// Weak claims: aspirational or hedged. Usually low-risk but recorded
|
|
// for completeness.
|
|
const WEAK_PATTERNS: RegExp[] = [
|
|
/\bshould\s+work\b/i,
|
|
/\bexpected\s+to\b/i,
|
|
/\bintended\s+to\b/i,
|
|
/\bwill\s+(work|handle|support)\b/i,
|
|
/\bprobably\b/i,
|
|
];
|
|
|
|
// Empirical claims: runtime measurements / observed outcomes that can't
|
|
// be verified from a diff (only from the actual run that produced
|
|
// them). Classifying as empirical lets the inference check skip
|
|
// diff-verification and saves the ladder for falsifiable claims.
|
|
//
|
|
// Two classes share this bucket because they share the skip discipline:
|
|
//
|
|
// 1. Runtime metrics — "58 cloud calls", "306s end-to-end"
|
|
// 2. History/proof refs — "verified on PR #8", "was flipping across runs"
|
|
//
|
|
// Both are assertions about state outside the current diff. The cloud
|
|
// would flag them as "not backed" — but that's a false positive: the
|
|
// proof lives in the referenced run, prior commit, or test output, not
|
|
// in the added lines the cloud is reading.
|
|
const EMPIRICAL_PATTERNS: RegExp[] = [
|
|
// ─── Runtime metrics ───
|
|
// Iteration / attempt counts: "6/6 iterations", "attempt 5", "accepted on attempt 3"
|
|
/\b\d+\s*\/\s*\d+\s+(iterations?|attempts?|cycles?|runs?|shards?)\b/i,
|
|
/\b(accepted|resolved|converged)\s+on\s+attempt\s+\d+\b/i,
|
|
// Runtime metrics: "58 cloud calls", "306s end-to-end", "245s total", "5931 chars"
|
|
/\b\d+\s+(cloud\s+)?calls?\b/i,
|
|
/\b\d+\s*(ms|s|seconds?|minutes?|m)\s+(end[- ]to[- ]end|total|elapsed|duration)\b/i,
|
|
/\b\d+\s+chars?\b.*\b(accepted|generated|produced)\b/i,
|
|
// "escalated through N tiers", "N distinct models"
|
|
/\bescalated\s+through\s+\d+\b/i,
|
|
/\b\d+\s+distinct\s+(model|tier)s?\b/i,
|
|
|
|
// ─── History / proof references ───
|
|
// "verified on PR #8", "verified end-to-end on PR 8", "tested against PR #4"
|
|
// Require PR#N / commit-hash / "prior <word>" to avoid matching
|
|
// "verified ... in production" (PR without \b-ish anchor previously
|
|
// consumed "pr" of "production").
|
|
/\bverified\s+(?:end[- ]to[- ]end\s+)?(?:on|against|in)\s+(?:PR\s*#?\d+|commit\s+[0-9a-f]{6,}|prior\s+\w+|the\s+\w+\s+audit)\b/i,
|
|
/\btested\s+(?:against|in|on)\s+(?:PR\s*#?\d+|commit\s+[0-9a-f]{6,}|prior\s+\w+)\b/i,
|
|
// Direct PR/commit references: "PR #8", "on PR 9", "from commit abc123"
|
|
/\b(?:on|from|in|via|per)\s+PR\s*#?\d+\b/i,
|
|
/\b(?:from|in|per|against)\s+commit\s+[0-9a-f]{6,}/i,
|
|
// Observational descriptions of prior behavior: "was flipping", "was X before", "previously observed"
|
|
/\b(?:was|were)\s+(?:flipping|drifting|inconsistent|non[- ]deterministic|creeping)\b/i,
|
|
/\bpreviously\s+(?:observed|flagged|reported|seen|landed)\b/i,
|
|
/\bused\s+to\s+(?:flip|fail|flag|reject|block)\b/i,
|
|
/\bobserved\s+(?:in|during|on|across)\s+(?:PR|prior|\d+\s+(?:runs?|audits?))/i,
|
|
// "flipping/drifting across N runs" — historical variance description
|
|
/\b(?:flipping|drifting|varying|oscillating)\s+across\s+(?:\d+\s+)?(?:runs?|audits?|iterations?)\b/i,
|
|
// "the proven X" referring to prior work (proven is a STRONG pattern
|
|
// but in context "the proven FOO" is usually a historical reference,
|
|
// not a fresh claim). We catch it here so the empirical skip wins.
|
|
/\bthe\s+proven\s+(?:escalation\s+ladder|pipeline|flow|loop|tier|path)/i,
|
|
// "from the 9-run test", "across the 5-run validation"
|
|
/\b(?:from|across|in|during)\s+the\s+\d+[- ]run\s+(?:test|validation|probe|experiment)/i,
|
|
];
|
|
|
|
export interface ParsedClaims {
|
|
claims: Claim[];
|
|
commits_scanned: number;
|
|
}
|
|
|
|
export function parseClaims(pr: PrSnapshot): ParsedClaims {
|
|
const claims: Claim[] = [];
|
|
|
|
// PR body — every matching line becomes a claim at location "pr_body:N"
|
|
if (pr.body) {
|
|
scanText(pr.body, "pr_body", pr.head_sha, claims);
|
|
}
|
|
|
|
// Each commit message gets its own scan.
|
|
for (const c of pr.commits) {
|
|
if (!c.message) continue;
|
|
scanText(c.message, `commit:${c.sha.slice(0, 8)}`, c.sha, claims);
|
|
}
|
|
|
|
return { claims, commits_scanned: pr.commits.length };
|
|
}
|
|
|
|
function scanText(text: string, location_prefix: string, commit_sha: string, out: Claim[]): void {
|
|
const lines = text.split(/\r?\n/);
|
|
for (let i = 0; i < lines.length; i++) {
|
|
const line = lines[i];
|
|
if (line.length < 3) continue;
|
|
|
|
// Empirical match wins over everything else — if a line ALSO
|
|
// contains a moderate word like "complete", we still want to
|
|
// classify it as empirical so the inference check doesn't ask
|
|
// the cloud to prove "58 cloud calls" from the diff. Order:
|
|
// empirical → strong → moderate → weak.
|
|
const empirical = firstUnquotedMatch(line, EMPIRICAL_PATTERNS);
|
|
if (empirical) {
|
|
out.push({
|
|
text: line.trim().slice(0, 200),
|
|
commit_sha,
|
|
location: `${location_prefix}:${i + 1}`,
|
|
strength: "empirical",
|
|
});
|
|
continue;
|
|
}
|
|
const strong = firstUnquotedMatch(line, STRONG_PATTERNS);
|
|
if (strong) {
|
|
out.push({
|
|
text: line.trim().slice(0, 200),
|
|
commit_sha,
|
|
location: `${location_prefix}:${i + 1}`,
|
|
strength: "strong",
|
|
});
|
|
continue;
|
|
}
|
|
const moderate = firstUnquotedMatch(line, MODERATE_PATTERNS);
|
|
if (moderate) {
|
|
out.push({
|
|
text: line.trim().slice(0, 200),
|
|
commit_sha,
|
|
location: `${location_prefix}:${i + 1}`,
|
|
strength: "moderate",
|
|
});
|
|
continue;
|
|
}
|
|
const weak = firstUnquotedMatch(line, WEAK_PATTERNS);
|
|
if (weak) {
|
|
out.push({
|
|
text: line.trim().slice(0, 200),
|
|
commit_sha,
|
|
location: `${location_prefix}:${i + 1}`,
|
|
strength: "weak",
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
// Match a pattern only when its match position is NOT inside a quoted
|
|
// string on the line. Mirrors the same guard in auditor/checks/static.ts
|
|
// — the two files have the same false-positive class: PR authors
|
|
// quote pattern examples in commit message bodies (e.g. `"Phase 45
|
|
// shipped"` as a test example) and without this guard those quoted
|
|
// references get flagged as fresh ship-claims. Only skips when the
|
|
// match itself falls inside quotes; real (unquoted) uses of the same
|
|
// vocabulary still classify correctly.
|
|
function firstUnquotedMatch(text: string, patterns: RegExp[]): RegExp | null {
|
|
for (const p of patterns) {
|
|
const m = text.match(p);
|
|
if (!m || typeof m.index !== "number") continue;
|
|
if (isInsideQuotedString(text, m.index)) continue;
|
|
return p;
|
|
}
|
|
return null;
|
|
}
|
|
|
|
// Walks left→right toggling in-quote state on each unescaped quote.
|
|
// Good enough for single-line claims; multi-line strings aren't parsed.
|
|
function isInsideQuotedString(line: string, pos: number): boolean {
|
|
let inDouble = false, inSingle = false, inBacktick = false;
|
|
for (let i = 0; i < pos; i++) {
|
|
const c = line[i];
|
|
const esc = i > 0 && line[i - 1] === "\\";
|
|
if (esc) continue;
|
|
if (c === '"' && !inSingle && !inBacktick) inDouble = !inDouble;
|
|
else if (c === "'" && !inDouble && !inBacktick) inSingle = !inSingle;
|
|
else if (c === "`" && !inDouble && !inSingle) inBacktick = !inBacktick;
|
|
}
|
|
return inDouble || inSingle || inBacktick;
|
|
}
|