diff --git a/auditor/checks/inference.ts b/auditor/checks/inference.ts index 4a83745..66e8c85 100644 --- a/auditor/checks/inference.ts +++ b/auditor/checks/inference.ts @@ -18,15 +18,32 @@ import { readFile, mkdir, appendFile } from "node:fs/promises"; import { extractFacts } from "../fact_extractor.ts"; const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100"; -const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b"; -// Tie-breaker for claims where the N=3 consensus produces a 1-1-1 -// split (genuinely borderline). Different architecture from the -// primary reviewer (gpt-oss) so the tie-break isn't correlated with -// the original disagreement. qwen3-coder:480b is a newer coding -// specialist at 480B params, well-suited to PR-diff claim verification -// and distinct in training lineage from gpt-oss. -const TIEBREAKER_MODEL = process.env.LH_AUDITOR_TIEBREAKER_MODEL ?? "qwen3-coder:480b"; +// Rebuild 2026-04-26: route claim verification through /v1/mode/execute +// (task_class=pr_audit) so we get pathway memory + lakehouse_answers_v1 +// + JSON-shaped framing molded into ONE prompt. The hand-rolled +// systemMsg/userMsg path was reinventing the mode runner badly. +// +// 2026-04-27 update: original default kimi-k2:1t hit a sustained +// upstream outage on Ollama Cloud (consistent 500 ISE across hours of +// retries — verified with trivial 8-token probes). Swapped default to +// deepseek-v3.1:671b which is proven working end-to-end through the +// pr_audit mode runner during Phase 5 distillation acceptance testing. +// kimi-k2:1t can be re-selected via LH_AUDITOR_REVIEW_MODEL env when +// the upstream returns. Tie-breaker stays grok-4.1-fast (different +// vendor lineage so consensus + tie-break won't fail-correlate). +const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "deepseek-v3.1:671b"; +const TIEBREAKER_MODEL = process.env.LH_AUDITOR_TIEBREAKER_MODEL ?? 
"x-ai/grok-4.1-fast"; +// SHARD_MODEL retained for the legacy callCloud path (still used by +// runCloudInference's diagnostic mode), but no longer fired by the +// main inference flow — tree-split was retired 2026-04-27 in favor of +// the mode runner's matrix retrieval against lakehouse_answers_v1. +const SHARD_MODEL = process.env.LH_AUDITOR_SHARD_MODEL ?? "qwen3-coder:480b"; const N_CONSENSUS = Number(process.env.LH_AUDITOR_CONSENSUS_N ?? 3); +// Bounded parallelism on the tree-split shard loop. Old behavior was +// fully serial ("keep gateway load bounded") which made huge PRs take +// 5+ minutes of curation alone. 6 in flight keeps gateway busy without +// thrashing it; tunable via env. +const SHARD_CONCURRENCY = Number(process.env.LH_AUDITOR_SHARD_CONCURRENCY ?? 6); const AUDIT_DISCREPANCIES_JSONL = "/home/profit/lakehouse/data/_kb/audit_discrepancies.jsonl"; // 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was // previously truncated at 15KB causing the reviewer to miss later @@ -48,6 +65,11 @@ const MAX_DIFF_CHARS = 40000; const CURATION_THRESHOLD = 30000; const DIFF_SHARD_SIZE = 4500; const CALL_TIMEOUT_MS = 120_000; +// Mode runner can take longer than a raw /v1/chat call because it does +// pathway-fingerprint lookup + matrix retrieval + relevance filter +// before the LLM call. Budget extra time so we don't trip on a slow +// answers-corpus search. +const MODE_RUNNER_TIMEOUT_MS = 240_000; const REPO_ROOT = "/home/profit/lakehouse"; export interface InferenceContext { @@ -86,26 +108,23 @@ export async function runInferenceCheck( }]; } - // Diff source for the cloud prompt — either the raw diff (small - // enough to fit), or a tree-split scratchpad (curation layer). We - // prefer curation to truncation: truncation silently drops files - // past the window; curation summarizes them so the cloud still sees - // what changed, just densified. 
- let diffForPrompt: string; - let curationNote = ""; - if (diff.length > CURATION_THRESHOLD) { - const ts = await treeSplitDiff(diff, verifiable); - diffForPrompt = ts.scratchpad; - curationNote = ` (curated: ${diff.length} chars → ${ts.shards} shards → scratchpad ${ts.scratchpad.length} chars)`; - } else { - diffForPrompt = diff; - } - // Belt-and-suspenders truncation — even a tree-split scratchpad - // shouldn't exceed MAX_DIFF_CHARS in practice, but guard anyway so - // pathological inputs can't burst the prompt. - const truncated = diffForPrompt.length > MAX_DIFF_CHARS - ? diffForPrompt.slice(0, MAX_DIFF_CHARS) + `\n...[${diffForPrompt.length - MAX_DIFF_CHARS} more chars truncated]` - : diffForPrompt; + // 2026-04-27 architecture simplification: dropped the tree-split + // scratchpad layer. Rationale: the mode runner's pr_audit pipeline + // pulls from lakehouse_answers_v1 (gold-standard prior audits) + + // lakehouse_arch_v1 + lakehouse_symbols_v1 via matrix retrieval. That + // corpus IS the cross-PR context the tree-split was synthesizing + // from scratch on every audit run. With the distillation substrate + // shipped (commits 27b1d27..1b433a9), per-shard fact extraction is + // redundant — and gpt-oss:120b at 168 calls/audit was the dominant + // cost. Now: truncate diff to MAX_DIFF_CHARS, hand straight to the + // mode runner, let retrieval supply context. ONE strong-model call + // per consensus rep × N=3 reps = 3 calls total per audit. + const truncated = diff.length > MAX_DIFF_CHARS + ? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated — the pr_audit mode runner has matrix retrieval against lakehouse_answers_v1 + arch + symbols for cross-PR context]` + : diff; + const curationNote = diff.length > MAX_DIFF_CHARS + ? 
` (truncated ${diff.length}→${MAX_DIFF_CHARS} chars; matrix retrieval supplies cross-PR context)` + : ""; // Build the reviewer prompt in the same shape as run_codereview's // review stage (llm_team_ui.py:10950), adapted for claim verification: @@ -114,79 +133,20 @@ export async function runInferenceCheck( // "Review: bugs/security/perf/style/edge. Provide corrected code." // We add: claim list upfront + ask for structured JSON verdict. // - // When the diff was curated (tree-split scratchpad), we add an - // explicit anti-false-positive instruction: the scratchpad is a - // distillation, not the full source, so absence-from-scratchpad is - // NOT evidence of absence-from-diff. Mirrors the fix we made in - // scrum_master's review prompt for the same class of error. + // Curation flag is now just a truncation flag — when the diff was + // cut, tell the reviewer it didn't see the full picture so it doesn't + // confidently mark a claim NOT BACKED based on absence in the + // (potentially incomplete) input. const isCurated = curationNote.length > 0; - const curationGuard = isCurated - ? [ - "", - "CRITICAL: the 'Diff' below is a curated multi-shard scratchpad,", - "NOT the full raw diff. The scratchpad distills each shard down", - "to facts useful for claim verification and drops the rest.", - "DO NOT flag a function/field/feature as 'missing' or 'not", - "implemented' based solely on its absence from the scratchpad —", - "absence in a distillation is NOT evidence of absence in the", - "actual diff. Only judge a claim NOT BACKED when the scratchpad", - "DIRECTLY contradicts it (e.g. 
scratchpad shows the function was", - "added empty, or shows the claimed code path is a stub).", - "Skip the unflagged_gaps section entirely when operating on a", - "curated scratchpad — you can't reliably detect gaps from a", - "distillation, and false positives there are worse than misses.", - ].join("\n") - : ""; - const systemMsg = [ - "You review pull-request diffs against the author's own ship-claims.", - "For each claim, decide: is it backed by actual code in the diff, or is", - "it placeholder / aspirational / unwired?", - "", - "A claim is BACKED when the diff contains a real code path that delivers", - "the claimed behavior. A claim is NOT BACKED when:", - " - the claim asserts functionality but the diff only adds types/fields", - " with no consumer", - " - the claim mentions tests but no test function was added", - " - the claim claims integration but the integration point is a stub", - " - the diff contains unimplemented!() / todo!() / TODO comments", - " - the claim says 'works end-to-end' but the diff has no end-to-end test", - curationGuard, - "", - "Respond with strict JSON only. No prose before or after. Shape:", - "{", - ' "claim_verdicts": [', - ' {"claim_idx": 0, "backed": false, "evidence": "short reason"}', - " ],", - ' "unflagged_gaps": [', - ' {"location": "file:line", "summary": "short description"}', - " ]", - "}", - ].join("\n"); + const prNumber = ctx?.pr_number ?? 0; - const userMsg = [ - `Ship-claims the author made (numbered 0..N-1):`, - verifiable.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"), - "", - `Diff:`, - "```", - truncated, - "```", - "", - `For each numbered claim above, emit a claim_verdicts entry. For gaps the`, - `author DIDN'T claim but that look like placeholder code, emit unflagged_gaps.`, - `Strict JSON only, matching the shape described. 
No prose outside JSON.`, - ].join("\n"); - - // N=3 consensus — run the primary reviewer in parallel, collect - // all three parsed responses, majority-vote per claim. Parallel - // (Promise.all) because each call is ~20-30s and they're independent; - // wall-clock stays ~same as single call, cost 3x tokens. Empirical - // justification: in 3-run determinism tests, 7/8 findings were - // stable but 1 flipped across runs — majority vote stabilizes the - // flipping class without losing the stable signal. + // N=3 consensus — fire the mode runner three times in parallel. + // Each /v1/mode/execute call composes pathway memory + answers corpus + // + JSON-shaped pr_audit framing internally, so the auditor's only + // job here is to vote-aggregate. Wall-clock ~= single call. const primaryRuns = await Promise.all( Array.from({ length: N_CONSENSUS }, () => - runCloudInference(systemMsg, userMsg, MODEL)), + runModeRunnerInference(truncated, verifiable, prNumber, isCurated, MODEL)), ); const parsedRuns = primaryRuns.filter(r => r.parsed !== null); @@ -209,9 +169,15 @@ export async function runInferenceCheck( interface Votes { trues: number; falses: number; evidences: string[] } const votesByClaim = new Map(); const unflaggedByRun: any[][] = []; - let totalTokens = 0; + let totalLatencyMs = 0; + let totalEnrichedChars = 0; + let bugFingerprintsSeen = 0; + let matrixKeptSeen = 0; for (const run of parsedRuns) { - totalTokens += run.tokens; + totalLatencyMs += run.latency_ms ?? 0; + totalEnrichedChars += run.enriched_chars ?? 0; + bugFingerprintsSeen = Math.max(bugFingerprintsSeen, run.bug_fingerprints ?? 0); + matrixKeptSeen = Math.max(matrixKeptSeen, run.matrix_kept ?? 0); unflaggedByRun.push(Array.isArray(run.parsed?.unflagged_gaps) ? run.parsed.unflagged_gaps : []); for (const v of run.parsed?.claim_verdicts ?? 
[]) { const idx = Number(v?.claim_idx); @@ -233,10 +199,11 @@ export async function runInferenceCheck( findings.push({ check: "inference", severity: "info", - summary: `cloud review completed (model=${MODEL}, consensus=${parsedRuns.length}/${N_CONSENSUS}, tokens=${totalTokens})${curationNote}`, + summary: `pr_audit mode runner completed (model=${MODEL}, consensus=${parsedRuns.length}/${N_CONSENSUS}, ${totalLatencyMs}ms total)${curationNote}`, evidence: [ `claims voted: ${votesByClaim.size}`, `parsed runs: ${parsedRuns.length} / ${N_CONSENSUS}`, + `enrichment: ${bugFingerprintsSeen} bug fingerprints, ${matrixKeptSeen} answers-corpus chunks, prompt avg ${Math.round(totalEnrichedChars / Math.max(parsedRuns.length, 1))} chars`, ], }); @@ -266,8 +233,9 @@ export async function runInferenceCheck( notBacked = false; resolution = "majority_backed"; } else { - // Tie. Run tie-breaker with a different-architecture model. - const tb = await runCloudInference(systemMsg, userMsg, TIEBREAKER_MODEL); + // Tie. Run tie-breaker with a different-architecture model + // through the same mode runner so framing/enrichment match. + const tb = await runModeRunnerInference(truncated, verifiable, prNumber, isCurated, TIEBREAKER_MODEL); if (tb.parsed) { const tv = (tb.parsed.claim_verdicts ?? []).find((v: any) => Number(v?.claim_idx) === idx); if (tv?.backed === false) { @@ -335,9 +303,13 @@ export async function runInferenceCheck( // don't exit before extraction lands; the systemd poller has plenty // of headroom (90s cycle vs ~15s extraction). A failure inside // extractAndPersistFacts is caught + logged but never throws. + // Post-2026-04-27: extraction now runs against the truncated diff + // (no scratchpad to extract from since tree-split was retired). + // Fact extraction is still useful for surfacing entities/symbols + // into audit_facts.jsonl even from truncated input. 
if (isCurated && ctx && process.env.LH_AUDITOR_SKIP_EXTRACT !== "1") { try { - await extractAndPersistFacts(diffForPrompt, ctx); + await extractAndPersistFacts(truncated, ctx); } catch (e) { console.error(`[inference] fact extraction failed: ${(e as Error).message}`); } @@ -394,19 +366,102 @@ export async function runInferenceCheck( return findings; } -// Single cloud call — the consensus loop calls this N times in -// parallel. Returns the parsed JSON shape + token usage + any error -// diagnostic. NEVER throws; the consensus aggregator handles partial -// failures by dropping non-parsed runs from the vote. +// Single mode-runner call — consensus + tie-breaker dispatch through +// here. Returns parsed JSON shape + telemetry from /v1/mode/execute +// (latency, enrichment metrics) + any error diagnostic. NEVER throws. +// The consensus aggregator handles partial failures by dropping +// non-parsed runs from the vote. interface CloudRunResult { parsed: any | null; - tokens: number; + latency_ms: number; + enriched_chars: number; + bug_fingerprints: number; + matrix_kept: number; error?: string; // "unreachable" | "non_200" | "unparseable" diagnostic?: string; // first 200 chars for debugging model: string; } -async function runCloudInference(systemMsg: string, userMsg: string, model: string): Promise<CloudRunResult> { +async function runModeRunnerInference( + diffOrScratchpad: string, + claims: Claim[], + prNumber: number, + isCurated: boolean, + model: string, +): Promise<CloudRunResult> { + // user_question carries the claim list + the truncation note (if any). + // pr_audit's framing (mode.rs FRAMING_PR_AUDIT) holds the JSON shape + + // strict-output rules so we don't repeat them here. + const claimDigest = claims + .map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`) + .join("\n"); + const curationNote = isCurated + ? "\n\nNOTE: the FILE below is the raw diff TRUNCATED to fit the prompt, not the complete diff. 
Absence in the visible portion is NOT evidence of absence in the actual diff. Only mark backed=false on direct contradiction (e.g. the visible diff shows the function is empty / a stub). Skip unflagged_gaps entirely when the diff is truncated." + : ""; + const userQuestion = [ + "Verify each ship-claim against the diff.", + "", + "Ship-claims (numbered 0..N-1):", + claimDigest, + curationNote, + "", + "Every claim above must produce exactly one claim_verdicts entry. Output strict JSON only — no prose outside the JSON object.", + ].join("\n"); + + let resp: Response; + try { + resp = await fetch(`${GATEWAY}/v1/mode/execute`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + task_class: "pr_audit", + file_path: `pr-${prNumber}.diff`, + file_content: diffOrScratchpad, + user_question: userQuestion, + force_model: model, + force_temperature: 0, + }), + signal: AbortSignal.timeout(MODE_RUNNER_TIMEOUT_MS), + }); + } catch (e) { + return { + parsed: null, latency_ms: 0, enriched_chars: 0, bug_fingerprints: 0, matrix_kept: 0, + error: "unreachable", diagnostic: (e as Error).message.slice(0, 200), model, + }; + } + if (!resp.ok) { + return { + parsed: null, latency_ms: 0, enriched_chars: 0, bug_fingerprints: 0, matrix_kept: 0, + error: "non_200", diagnostic: `${resp.status}: ${(await resp.text().catch(() => "")).slice(0, 160)}`, model, + }; + } + let body: any; + try { body = await resp.json(); } + catch (e) { + return { + parsed: null, latency_ms: 0, enriched_chars: 0, bug_fingerprints: 0, matrix_kept: 0, + error: "unparseable", diagnostic: (e as Error).message, model, + }; + } + const content: string = body?.response ?? ""; + const parsed = extractJson(content); + return { + parsed, + latency_ms: body?.latency_ms ?? 0, + enriched_chars: body?.enriched_prompt_chars ?? 0, + bug_fingerprints: body?.sources?.bug_fingerprints_count ?? 0, + matrix_kept: body?.sources?.matrix_chunks_kept ?? 0, + error: parsed ? 
undefined : "unparseable", + diagnostic: parsed ? undefined : content.slice(0, 200), + model, + }; +} + +// Legacy direct /v1/chat caller — kept for callers outside the +// pr_audit pipeline. Currently unused after the 2026-04-26 mode-runner +// rebuild; preserved so we can A/B against the mode runner if a +// regression surfaces. +async function runCloudInference(systemMsg: string, userMsg: string, model: string): Promise<{ parsed: any | null; tokens: number; error?: string; diagnostic?: string; model: string }> { let resp: Response; try { resp = await fetch(`${GATEWAY}/v1/chat`, { @@ -419,11 +474,6 @@ async function runCloudInference(systemMsg: string, userMsg: string, model: stri { role: "system", content: systemMsg }, { role: "user", content: userMsg }, ], - // temp=0 (greedy) + think=true. think=true is required for - // gpt-oss:120b — without it the model returns empty content - // on large prompts. Variance from the think trace is observed - // in practice, which is why we use N=3 consensus, not single- - // call determinism. max_tokens: 3000, temperature: 0, think: true, @@ -497,15 +547,17 @@ async function extractAndPersistFacts(scratchpad: string, ctx: InferenceContext) // (function signatures, struct fields, deletions, new files), drops // everything else. Merges into a compact scratchpad. // -// Cost: N cloud calls for the shard summaries + 1 cloud call for the -// final verification = N+1 calls instead of 1. Mitigation: shards run -// serially (not parallel) to keep gateway load bounded; summary calls -// use max_tokens=400 so they're fast (~2s each on gpt-oss:120b). +// Cost: N cloud calls for shard summaries + the final verification. +// Pre-2026-04-26 the shard loop ran serially "to keep gateway load +// bounded" — turned out to be a bottleneck on PRs with 50+ shards +// (5+ minutes of curation). Now bounded-parallel via +// SHARD_CONCURRENCY: in-flight ≤ N at any time, gateway stays calm, +// wall-clock drops 4-6×. 
// -// Determinism: each shard summary call uses temp=0 + think=true (same -// as the top-level inference call), so identical input yields -// identical scratchpad. The final verification call then sees a -// stable scratchpad, giving stable verdicts. +// Determinism: each shard summary call uses temp=0 + think=false +// (same as before), so identical input yields identical scratchpad. +// Order is preserved by indexed-write into a fixed-length array +// before string-join, so concurrency doesn't shuffle the scratchpad. async function treeSplitDiff( fullDiff: string, claims: Claim[], @@ -521,27 +573,42 @@ async function treeSplitDiff( `${i}. [${c.strength}] "${c.text.slice(0, 100)}"` ).join("\n"); + const buildPrompt = (si: number, shard: { from: number; to: number; text: string }): string => [ + `You are summarizing shard ${si + 1}/${shards.length} (chars ${shard.from}..${shard.to}) of a PR diff.`, + `The downstream task will verify these ship-claims against the full-PR summary. Extract ONLY facts that could confirm or refute these claims:`, + "", + claimDigest, + "", + "Extract: new function/method signatures, struct fields, deletions, new files, wiring (function X calls Y), absence-of-implementation markers, TODO comments on added lines.", + "Skip: comment-only edits, whitespace, import reordering, unrelated cosmetic changes.", + "", + "─────── shard diff ───────", + shard.text, + "─────── end shard ───────", + "", + "Output: up to 180 words of facts in bullet form. No prose preamble, no claim verdicts (that's for the downstream step).", + ].join("\n"); + + // Pre-allocate so we can write back at the original index from + // out-of-order completion. 
+ const summaries: string[] = new Array(shards.length).fill(""); + let nextIdx = 0; + async function worker() { + while (true) { + const myIdx = nextIdx++; + if (myIdx >= shards.length) return; + const r = await callCloud(buildPrompt(myIdx, shards[myIdx]), 400); + summaries[myIdx] = r.content; + } + } + const concurrency = Math.max(1, Math.min(SHARD_CONCURRENCY, shards.length)); + await Promise.all(Array.from({ length: concurrency }, worker)); + let scratchpad = ""; for (const [si, shard] of shards.entries()) { - const prompt = [ - `You are summarizing shard ${si + 1}/${shards.length} (chars ${shard.from}..${shard.to}) of a PR diff.`, - `The downstream task will verify these ship-claims against the full-PR summary. Extract ONLY facts that could confirm or refute these claims:`, - "", - claimDigest, - "", - "Extract: new function/method signatures, struct fields, deletions, new files, wiring (function X calls Y), absence-of-implementation markers, TODO comments on added lines.", - "Skip: comment-only edits, whitespace, import reordering, unrelated cosmetic changes.", - "", - "─────── shard diff ───────", - shard.text, - "─────── end shard ───────", - "", - "Output: up to 180 words of facts in bullet form. 
No prose preamble, no claim verdicts (that's for the downstream step).", - ].join("\n"); - - const r = await callCloud(prompt, 400); - if (r.content) { - scratchpad += `\n--- shard ${si + 1} (chars ${shard.from}..${shard.to}) ---\n${r.content.trim()}\n`; + const summary = summaries[si]; + if (summary) { + scratchpad += `\n--- shard ${si + 1} (chars ${shard.from}..${shard.to}) ---\n${summary.trim()}\n`; } } return { scratchpad: scratchpad.trim(), shards: shards.length }; @@ -563,7 +630,7 @@ async function callCloud(prompt: string, maxTokens: number): Promise<{ content: headers: { "content-type": "application/json" }, body: JSON.stringify({ provider: "ollama_cloud", - model: MODEL, + model: SHARD_MODEL, messages: [{ role: "user", content: prompt }], max_tokens: maxTokens, temperature: 0, diff --git a/auditor/checks/static.ts b/auditor/checks/static.ts index d05591b..a677ae1 100644 --- a/auditor/checks/static.ts +++ b/auditor/checks/static.ts @@ -54,49 +54,79 @@ export function runStaticCheck(diff: string): Finding[] { const isAuditorCheckerFile = path.startsWith("auditor/checks/") || path.startsWith("auditor/fixtures/"); + // Track multi-line backtick-template state across the file. Walks + // all post-merge lines (context + added, skipping removed lines) + // in order and keeps `inMultilineBacktick` flipping on each + // unescaped backtick. Pre-2026-04-26 the per-line walk in + // isInsideQuotedString missed `todo!()` matches inside docstring + // template literals because the opening backtick lived on a + // line above the match. Now we OR the file-level state into the + // per-line check. 
+ let inMultilineBacktick = false; + for (let idx = 0; idx < lines.length; idx++) { const line = lines[idx]; - if (!line.startsWith("+") || line.startsWith("+++")) continue; - const added = line.slice(1); - if (!isAuditorCheckerFile) { - for (const { re, why } of BLOCK_PATTERNS) { - const m = added.match(re); - if (m && typeof m.index === "number") { - // Skip if the match sits inside a quoted string literal — - // this is how rubric files (tests/real-world/*, prompt - // templates) legitimately reference the patterns they - // guard against, without actually executing them. - if (isInsideQuotedString(added, m.index)) continue; + // Diff bookkeeping lines and removed lines don't contribute to + // the post-merge file's string state. + if (line.startsWith("+++") || line.startsWith("---") || + line.startsWith("@@") || line.startsWith("\\ No newline")) continue; + if (line.startsWith("-")) continue; + + const isAdded = line.startsWith("+"); + // Strip the diff prefix (' ' for context, '+' for added). + const body = (isAdded || line.startsWith(" ")) ? line.slice(1) : line; + + if (isAdded) { + const added = body; + + if (!isAuditorCheckerFile) { + for (const { re, why } of BLOCK_PATTERNS) { + const m = added.match(re); + if (m && typeof m.index === "number") { + // Skip if the match sits inside a quoted string literal — + // this is how rubric files (tests/real-world/*, prompt + // templates) legitimately reference the patterns they + // guard against, without actually executing them. 
+ if (inMultilineBacktick || isInsideQuotedString(added, m.index)) continue; + findings.push({ + check: "static", + severity: "block", + summary: `${why} in ${path}`, + evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`], + }); + } + } + } + for (const { re, why } of WARN_COMMENT_PATTERNS) { + if (re.test(line)) { findings.push({ check: "static", - severity: "block", + severity: "warn", + summary: `${why} in ${path}`, + evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`], + }); + } + } + for (const { re, why } of INFO_HARDCODED_PATTERNS) { + if (re.test(added)) { + findings.push({ + check: "static", + severity: "info", summary: `${why} in ${path}`, evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`], }); } } } - for (const { re, why } of WARN_COMMENT_PATTERNS) { - if (re.test(line)) { - findings.push({ - check: "static", - severity: "warn", - summary: `${why} in ${path}`, - evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`], - }); - } - } - for (const { re, why } of INFO_HARDCODED_PATTERNS) { - if (re.test(added)) { - findings.push({ - check: "static", - severity: "info", - summary: `${why} in ${path}`, - evidence: [`${path}:+${idx + 1}: ${added.trim().slice(0, 160)}`], - }); - } - } + + // Update file-level multi-line backtick state by walking THIS + // line's unescaped backticks. Both context and added lines + // contribute (they're both in the post-merge file). Doc-comment + // backticks like `\\\`Foo\\\`` count too — that's the source of + // the original bug, where multi-line template literals contained + // `todo!()` references. + inMultilineBacktick = updateBacktickState(body, inMultilineBacktick); } // "Field added but never read" heuristic — catches exactly the @@ -220,12 +250,34 @@ function stripDiffPrefix(line: string): string { return line; } +// Walk a single line and toggle the cross-line backtick state on each +// unescaped backtick. 
Single-quote and double-quote runs are line- +// bounded in JS/TS/Rust by language rules (string literals don't span +// newlines without explicit `\` continuation), so we only track +// backticks across lines. Returns the new state for the next line. +function updateBacktickState(line: string, inBacktick: boolean): boolean { + let state = inBacktick; + let inDouble = false; + let inSingle = false; + for (let i = 0; i < line.length; i++) { + const c = line[i]; + // A backslash escapes the next char; consume the pair so an escaped backslash ("\\") cannot mask a closing quote. + if (c === "\\") { i++; continue; } + // Inside a multi-line backtick template, single/double quotes + // don't open new strings — they're literal characters of the + // template. Same applies the other way around. + if (c === '"' && !inSingle && !state) inDouble = !inDouble; + else if (c === "'" && !inDouble && !state) inSingle = !inSingle; + else if (c === "`" && !inDouble && !inSingle) state = !state; + } + return state; +} + // True if `pos` falls inside a double- or single-quoted string on this // line (backtick template literals too). Walks left→right toggling the -// "in quote" state on each unescaped quote. Good enough for single- -// line matches; multi-line strings aren't parsed (they're extremely -// rare in the patterns we're blocking on, and would require a proper -// tokenizer to handle correctly). +// "in quote" state on each unescaped quote. Per-line only — the file- +// level walk in runStaticCheck handles multi-line backtick templates +// via updateBacktickState. 
function isInsideQuotedString(line: string, pos: number): boolean { let inDouble = false, inSingle = false, inBacktick = false; for (let i = 0; i < pos; i++) { diff --git a/config/modes.toml b/config/modes.toml index e3266fd..169b4d2 100644 --- a/config/modes.toml +++ b/config/modes.toml @@ -61,6 +61,23 @@ fallback_modes = ["validator"] default_model = "gpt-oss:120b" matrix_corpus = "distilled_factual_v20260423095819" +[[task_class]] +name = "pr_audit" +# Auditor's claim-vs-diff verification mode (2026-04-26 rebuild). +# Replaces the auditor's hand-rolled inference check with the mode-runner +# composer: pathway memory (PR-level patterns) + lakehouse_answers_v1 +# corpus (prior accepted reviews + observer escalations) + adversarial +# JSON-shaped framing. Original default was paid Ollama Cloud kimi-k2:1t +# (now deepseek-v3.1:671b, see below); tie-breaker via auditor-side env override. +preferred_mode = "pr_audit" +fallback_modes = ["consensus", "ladder"] +# kimi-k2:1t broken upstream 2026-04-27 (Ollama Cloud 500 ISE, multi-hour +# sustained outage verified by repeated probes). deepseek-v3.1:671b is +# the drop-in substitute — proven working end-to-end through pr_audit +# during Phase 5 distillation acceptance testing. +default_model = "deepseek-v3.1:671b" +matrix_corpus = "lakehouse_answers_v1" + # Fallback when task_class isn't in the table — useful for ad-hoc calls # during development that don't yet have a mapped mode. 
[default] diff --git a/crates/gateway/src/v1/mode.rs b/crates/gateway/src/v1/mode.rs index 3ca5db0..29ead20 100644 --- a/crates/gateway/src/v1/mode.rs +++ b/crates/gateway/src/v1/mode.rs @@ -52,6 +52,7 @@ const VALID_MODES: &[&str] = &[ "codereview_matrix_only", // file + matrix only (no pathway) "codereview_playbook_only", // pathway only, NO file content (lossy ceiling) "staffing_inference_lakehouse", // staffing-domain composer (Pass 4) + "pr_audit", // PR-wide claim-vs-diff verifier (auditor) ]; /// Whether a mode is handled natively in this gateway vs proxied to @@ -65,6 +66,7 @@ fn is_native_mode(mode: &str) -> bool { | "codereview_matrix_only" | "codereview_playbook_only" | "staffing_inference_lakehouse" + | "pr_audit" ) } @@ -85,6 +87,7 @@ pub enum ReviewerFraming { Adversarial, // forensic, ranked findings + verdict (lakehouse default) Generic, // "review this" — no codebase priors (null baseline) Staffing, // staffing-domain coordinator framing (Pass 4) + PrAudit, // PR-wide claim verification — JSON-shaped {claim_verdicts} } fn flags_for_mode(mode: &str) -> EnrichmentFlags { @@ -129,6 +132,21 @@ fn flags_for_mode(mode: &str) -> EnrichmentFlags { use_relevance_filter: true, framing: ReviewerFraming::Staffing, }, + "pr_audit" => EnrichmentFlags { + // PR-wide claim verification. file_content = the diff text + // (truncated by the auditor for huge PRs — the tree-split + // scratchpad was retired 2026-04-27). bug_fingerprints surface + // prior PR-level patterns. matrix corpus pulls + // lakehouse_answers_v1 — prior accepted scrum reviews + + // observer escalations — so the reviewer sees how similar + // claims were resolved before. relevance filter on to drop + // adjacency pollution from the answer corpus. 
+ include_file_content: true, + include_bug_fingerprints: true, + include_matrix_chunks: true, + use_relevance_filter: true, + framing: ReviewerFraming::PrAudit, + }, // Default (codereview_lakehouse): everything on. 
_ => EnrichmentFlags { include_file_content: true, @@ -510,11 +528,28 @@ fill citations from the playbook, (3) risks (double-booking, eligibility gaps, g with severity + confidence percent, (4) playbook reference IDs you used. Be precise — only \ recommend candidates whose names appear in the matrix data; do NOT fabricate workers."; +const FRAMING_PR_AUDIT: &str = "You are an adversarial PR claim verifier for the Lakehouse \ +codebase (Rust + DataFusion + Parquet + object storage). Caller passes ship-claims from a PR \ +description and the unified diff (or a curated scratchpad of it for huge PRs). Your job: for \ +each claim, decide whether the diff actually backs it. Be ruthless — claim-diff divergence \ +is the failure mode this auditor exists to prevent. Output ONLY a single JSON object with \ +this exact shape:\n\ +{\n\ + \"claim_verdicts\": [\n\ + {\"claim_idx\": , \"backed\": , \"evidence\": \"\"}\n\ + ],\n\ + \"unflagged_gaps\": [\"\"]\n\ +}\n\ +No markdown, no preamble, no explanation outside the JSON. Every input claim must appear in \ +claim_verdicts exactly once. Lean toward backed=false when in doubt — false positives waste \ +human time but false negatives ship broken claims."; + fn framing_text(f: ReviewerFraming) -> &'static str { match f { ReviewerFraming::Adversarial => FRAMING_ADVERSARIAL, ReviewerFraming::Generic => FRAMING_GENERIC, ReviewerFraming::Staffing => FRAMING_STAFFING, + ReviewerFraming::PrAudit => FRAMING_PR_AUDIT, } }