Some checks failed
lakehouse/auditor 1 blocking issue: cloud: claim not backed — "the proven escalation ladder with learning context, collects"
Phase 1 — definition-layer over append-only JSONL scratchpads.
auditor/kb_index.ts is the single shared aggregator:
aggregate<T>(jsonlPath, { keyFn, scopeFn, checkFn, tailLimit })
→ Map<signature, {count, distinct_scopes, confidence,
first_seen, last_seen, representative_summary, ...}>
ratingSeverity(agg) — confidence × count severity policy shared
across all KB readers. Kills the "same unfixed PR inflates its
own recurrence score" failure mode by design: confidence =
distinct_scopes/count, so same-scope noise stays below the 0.3
escalation threshold no matter how many times it repeats.
checkAuditLessons now routes through aggregate + ratingSeverity.
Net effect: the recurrence detector's bespoke Map/Set bookkeeping is
gone; same behavior, shared discipline, reusable by scrum/observer.
Also: symbolsExistInRepo now skips files >500KB so the audit can't
get stuck slurping a fixture.
Phase 2 — nine-consecutive audit runner.
tests/real-world/nine_consecutive_audits.ts pushes 9 empty commits,
waits for each verdict, captures the audit_lessons aggregate state
after each run, reports:
- sig_count trajectory (should stabilize, not grow linearly)
- max_count trajectory (same-signature repeat rate)
- max_confidence trajectory (must stay LOW on same-PR noise)
- verdict_stable across runs (must NOT oscillate)
This is the empirical proof that the KB compounds favorably:
noise doesn't escalate itself, and signal stays distinguishable.
Unit-tested both failure modes: same-PR × 9 repeats = conf=0.11
(info); cross-PR × 5 distinct = conf=1.00 (block). The rating
function correctly discriminates.
323 lines
12 KiB
TypeScript
// Cloud inference check — wraps the proven run_codereview pattern
// from llm_team_ui.py (same 3-stage framing, same cloud model) to
// critique a PR's claims against its diff.
//
// Proved out 2026-04-22 via /tmp/codereview_runner.py — gpt-oss:120b
// caught a real ternary bug in auditor/fixtures/hybrid_38_40_45.ts
// that unit tests missed. This module reuses the reviewer prompt
// shape (bugs / security / performance / style / edge cases) and
// adds claim-vs-diff specific framing.
//
// Call surface: runInferenceCheck(claims, diff) → Finding[].
// Cloud latency budget: ~60s (gpt-oss:120b reviewer typically 35-50s
// with a 15KB diff + claim list).

import type { Claim, Finding } from "../types.ts";

import { Glob } from "bun";
import { readFile, stat } from "node:fs/promises";

const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
|
|
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "gpt-oss:120b";
|
|
// 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was
|
|
// previously truncated at 15KB causing the reviewer to miss later
|
|
// files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
|
|
// block finding when the file was simply outside the truncation window.
|
|
const MAX_DIFF_CHARS = 40000;
|
|
const CALL_TIMEOUT_MS = 120_000;
|
|
const REPO_ROOT = "/home/profit/lakehouse";
|
|
|
|
export async function runInferenceCheck(claims: Claim[], diff: string): Promise<Finding[]> {
|
|
if (claims.length === 0) {
|
|
return [{
|
|
check: "inference",
|
|
severity: "info",
|
|
summary: "no ship-claims extracted — skipping cloud inference",
|
|
evidence: ["parser returned empty claim list; nothing to verify against cloud"],
|
|
}];
|
|
}
|
|
|
|
// Empirical claims (runtime metrics / observed outcomes) can't be
|
|
// verified from the diff. Drop them from the cloud prompt so the
|
|
// reviewer doesn't chase ghosts. A future `runtime_evidence` check
|
|
// can validate these against data/_kb/*/summary.json outputs.
|
|
const verifiable = claims.filter(c => c.strength !== "empirical");
|
|
const empiricalCount = claims.length - verifiable.length;
|
|
if (verifiable.length === 0) {
|
|
return [{
|
|
check: "inference",
|
|
severity: "info",
|
|
summary: `all ${claims.length} claims are empirical (runtime metrics) — skipping cloud inference`,
|
|
evidence: [`empirical claims can't be verified from a static diff; needs runtime-evidence check`],
|
|
}];
|
|
}
|
|
|
|
const truncated = diff.length > MAX_DIFF_CHARS
|
|
? diff.slice(0, MAX_DIFF_CHARS) + `\n...[${diff.length - MAX_DIFF_CHARS} more chars truncated]`
|
|
: diff;
|
|
|
|
// Build the reviewer prompt in the same shape as run_codereview's
|
|
// review stage (llm_team_ui.py:10950), adapted for claim verification:
|
|
// "Task: ..."
|
|
// "Code: ..."
|
|
// "Review: bugs/security/perf/style/edge. Provide corrected code."
|
|
// We add: claim list upfront + ask for structured JSON verdict.
|
|
const systemMsg = [
|
|
"You review pull-request diffs against the author's own ship-claims.",
|
|
"For each claim, decide: is it backed by actual code in the diff, or is",
|
|
"it placeholder / aspirational / unwired?",
|
|
"",
|
|
"A claim is BACKED when the diff contains a real code path that delivers",
|
|
"the claimed behavior. A claim is NOT BACKED when:",
|
|
" - the claim asserts functionality but the diff only adds types/fields",
|
|
" with no consumer",
|
|
" - the claim mentions tests but no test function was added",
|
|
" - the claim claims integration but the integration point is a stub",
|
|
" - the diff contains unimplemented!() / todo!() / TODO comments",
|
|
" - the claim says 'works end-to-end' but the diff has no end-to-end test",
|
|
"",
|
|
"Respond with strict JSON only. No prose before or after. Shape:",
|
|
"{",
|
|
' "claim_verdicts": [',
|
|
' {"claim_idx": 0, "backed": false, "evidence": "short reason"}',
|
|
" ],",
|
|
' "unflagged_gaps": [',
|
|
' {"location": "file:line", "summary": "short description"}',
|
|
" ]",
|
|
"}",
|
|
].join("\n");
|
|
|
|
const userMsg = [
|
|
`Ship-claims the author made (numbered 0..N-1):`,
|
|
verifiable.map((c, i) => ` ${i}. [${c.strength}] "${c.text}" at ${c.location}`).join("\n"),
|
|
"",
|
|
`Diff:`,
|
|
"```",
|
|
truncated,
|
|
"```",
|
|
"",
|
|
`For each numbered claim above, emit a claim_verdicts entry. For gaps the`,
|
|
`author DIDN'T claim but that look like placeholder code, emit unflagged_gaps.`,
|
|
`Strict JSON only, matching the shape described. No prose outside JSON.`,
|
|
].join("\n");
|
|
|
|
let resp: Response;
|
|
try {
|
|
resp = await fetch(`${GATEWAY}/v1/chat`, {
|
|
method: "POST",
|
|
headers: { "content-type": "application/json" },
|
|
body: JSON.stringify({
|
|
provider: "ollama_cloud",
|
|
model: MODEL,
|
|
messages: [
|
|
{ role: "system", content: systemMsg },
|
|
{ role: "user", content: userMsg },
|
|
],
|
|
max_tokens: 3000,
|
|
temperature: 0.2,
|
|
think: true, // T3 overseer should reason — JSON shape is still required
|
|
}),
|
|
signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
|
|
});
|
|
} catch (e) {
|
|
// Cloud unreachable → soft-fail. Don't block a PR because the
|
|
// reviewer model is down. Static + dynamic + kb still run.
|
|
return [{
|
|
check: "inference",
|
|
severity: "info",
|
|
summary: "cloud inference unreachable — skipped",
|
|
evidence: [`fetch failed: ${(e as Error).message.slice(0, 180)}`],
|
|
}];
|
|
}
|
|
|
|
if (!resp.ok) {
|
|
return [{
|
|
check: "inference",
|
|
severity: "info",
|
|
summary: `cloud inference returned ${resp.status} — skipped`,
|
|
evidence: [`body: ${(await resp.text()).slice(0, 200)}`],
|
|
}];
|
|
}
|
|
|
|
const body: any = await resp.json();
|
|
const content: string = body?.choices?.[0]?.message?.content ?? "";
|
|
const usage = body?.usage ?? {};
|
|
|
|
const parsed = extractJson(content);
|
|
if (!parsed) {
|
|
return [{
|
|
check: "inference",
|
|
severity: "info",
|
|
summary: "cloud returned unparseable output — skipped",
|
|
evidence: [
|
|
`head: ${content.slice(0, 200)}`,
|
|
`tokens: ${usage.total_tokens ?? "?"}`,
|
|
],
|
|
}];
|
|
}
|
|
|
|
const findings: Finding[] = [];
|
|
|
|
// One summary info finding so the verdict layer knows the check ran.
|
|
findings.push({
|
|
check: "inference",
|
|
severity: "info",
|
|
summary: `cloud review completed (model=${MODEL}, tokens=${usage.total_tokens ?? "?"})`,
|
|
evidence: [
|
|
`claim_verdicts: ${parsed.claim_verdicts?.length ?? 0}, unflagged_gaps: ${parsed.unflagged_gaps?.length ?? 0}`,
|
|
],
|
|
});
|
|
|
|
for (const v of parsed.claim_verdicts ?? []) {
|
|
if (v?.backed === false) {
|
|
const idx = typeof v.claim_idx === "number" ? v.claim_idx : -1;
|
|
// Indices point at the verifiable[] list we sent the cloud,
|
|
// not the full claims[] list. Translate back.
|
|
const claim = verifiable[idx];
|
|
if (!claim) continue;
|
|
// Strong+unbacked = BLOCK. That's the whole point of the auditor.
|
|
const sev: Finding["severity"] = claim.strength === "strong" ? "block"
|
|
: claim.strength === "moderate" ? "warn"
|
|
: "info";
|
|
findings.push({
|
|
check: "inference",
|
|
severity: sev,
|
|
claim_text: claim.text,
|
|
summary: `cloud: claim not backed — "${claim.text.slice(0, 100)}"`,
|
|
evidence: [
|
|
`at ${claim.location}`,
|
|
`cloud reason: ${String(v.evidence ?? "no reason given").slice(0, 200)}`,
|
|
],
|
|
});
|
|
}
|
|
}
|
|
|
|
for (const g of parsed.unflagged_gaps ?? []) {
|
|
const summary = String(g?.summary ?? "?");
|
|
const location = String(g?.location ?? "?");
|
|
// False-positive guard — when the cloud says "X not defined in this
|
|
// diff" or "missing implementation of X", the cloud may just mean
|
|
// "X is not in the added lines," not "X doesn't exist in the repo."
|
|
// Extract candidate symbol names and grep the repo. If any symbol
|
|
// is defined elsewhere, drop the finding — it's a known-symbol
|
|
// reference, not a placeholder.
|
|
if (/not\s+defined|missing\s+implementation|never\s+referenced\s+or\s+integrated/i.test(summary)) {
|
|
const symbols = extractSymbols(summary);
|
|
if (symbols.length > 0) {
|
|
const resolved = await symbolsExistInRepo(symbols);
|
|
if (resolved.length === symbols.length) {
|
|
// Every named symbol exists somewhere in the repo — silent drop.
|
|
continue;
|
|
}
|
|
if (resolved.length > 0) {
|
|
// Partially resolved — demote to info with a note.
|
|
findings.push({
|
|
check: "inference",
|
|
severity: "info",
|
|
summary: `cloud gap partially resolved by repo grep: ${summary.slice(0, 120)}`,
|
|
evidence: [
|
|
`location: ${location.slice(0, 140)}`,
|
|
`resolved via grep: ${resolved.join(",")}`,
|
|
`unresolved: ${symbols.filter(s => !resolved.includes(s)).join(",")}`,
|
|
],
|
|
});
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
findings.push({
|
|
check: "inference",
|
|
severity: "warn",
|
|
summary: `cloud-flagged gap not in any claim: ${summary.slice(0, 120)}`,
|
|
evidence: [`location: ${location.slice(0, 140)}`],
|
|
});
|
|
}
|
|
|
|
return findings;
|
|
}
|
|
|
|
// Pull out plausible code-symbol names from a summary string.
|
|
// Matches:
|
|
// - identifier with backticks: `foo_bar`
|
|
// - identifier followed by parens: foo_bar()
|
|
// - CamelCase types
|
|
// - snake_case_functions
|
|
// Filters out common English words that could be matched accidentally.
|
|
const STOPWORDS = new Set([
|
|
"not","the","and","for","this","that","with","but","are","was","has",
|
|
"have","been","any","missing","implementation","diff","defined","never",
|
|
"referenced","integrated","flow","code","file","some","only","when",
|
|
]);
|
|
function extractSymbols(text: string): string[] {
|
|
const out = new Set<string>();
|
|
// `backticked` symbols
|
|
for (const m of text.matchAll(/`([A-Za-z_][A-Za-z0-9_]{2,})`/g)) out.add(m[1]);
|
|
// foo() or foo_bar() calls
|
|
for (const m of text.matchAll(/\b([A-Za-z_][A-Za-z0-9_]{2,})\s*\(/g)) out.add(m[1]);
|
|
// CamelCase types (3+ chars, must start with uppercase)
|
|
for (const m of text.matchAll(/\b([A-Z][A-Za-z0-9]{2,})\b/g)) out.add(m[1]);
|
|
return Array.from(out).filter(s => !STOPWORDS.has(s.toLowerCase()));
|
|
}
|
|
|
|
// Scan the repo for at least one definition of each symbol. Uses Bun's
|
|
// Glob to walk TS/Rust/Python/JS sources; ignores node_modules, data/,
|
|
// and target/. Skips files > 500KB — those are fixtures/snapshots that
|
|
// won't contain a definition line and slurping them slows the audit.
|
|
async function symbolsExistInRepo(symbols: string[]): Promise<string[]> {
|
|
const patterns = ["**/*.ts", "**/*.tsx", "**/*.rs", "**/*.py", "**/*.js"];
|
|
const skip = (p: string) => p.includes("/node_modules/") || p.startsWith("data/") || p.includes("/target/") || p.startsWith("dist/");
|
|
const MAX_FILE_BYTES = 500_000;
|
|
const { stat } = await import("node:fs/promises");
|
|
const resolved = new Set<string>();
|
|
const toFind = new Set(symbols);
|
|
for (const pat of patterns) {
|
|
if (toFind.size === 0) break;
|
|
const glob = new Glob(pat);
|
|
for await (const f of glob.scan({ cwd: REPO_ROOT, onlyFiles: true })) {
|
|
if (skip(f)) continue;
|
|
try { const s = await stat(`${REPO_ROOT}/${f}`); if (s.size > MAX_FILE_BYTES) continue; } catch { continue; }
|
|
let content: string;
|
|
try { content = await readFile(`${REPO_ROOT}/${f}`, "utf8"); } catch { continue; }
|
|
for (const sym of Array.from(toFind)) {
|
|
// Definition heuristics: `function sym`, `fn sym`, `const sym`,
|
|
// `let sym`, `def sym`, `class sym`, `struct sym`, `enum sym`,
|
|
// `trait sym`, `async function sym`, `pub (async )?fn sym`.
|
|
const re = new RegExp(
|
|
`\\b(function|async\\s+function|const|let|var|def|class|struct|enum|trait|impl|type|interface|fn|pub\\s+(async\\s+)?fn)\\s+${escapeRe(sym)}\\b`
|
|
);
|
|
if (re.test(content)) {
|
|
resolved.add(sym);
|
|
toFind.delete(sym);
|
|
if (toFind.size === 0) break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return Array.from(resolved);
|
|
}
|
|
|
|
function escapeRe(s: string): string {
|
|
return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
}
|
|
|
|
// Lift the first balanced JSON object out of the response. Tolerates
|
|
// leading prose, code fences, and model reasoning preamble when the
|
|
// cloud model ignored "strict JSON only."
|
|
function extractJson(text: string): any | null {
|
|
const cleaned = text.replace(/^```(?:json)?\s*/im, "").replace(/```\s*$/im, "");
|
|
let depth = 0;
|
|
let start = -1;
|
|
for (let i = 0; i < cleaned.length; i++) {
|
|
const c = cleaned[i];
|
|
if (c === "{") {
|
|
if (depth === 0) start = i;
|
|
depth++;
|
|
} else if (c === "}") {
|
|
depth--;
|
|
if (depth === 0 && start >= 0) {
|
|
try { return JSON.parse(cleaned.slice(start, i + 1)); } catch { start = -1; }
|
|
}
|
|
}
|
|
}
|
|
return null;
|
|
}
|