From 19a65b87e39cfc9f2126293771fea572970173ad Mon Sep 17 00:00:00 2001
From: root
Date: Mon, 27 Apr 2026 07:20:03 -0500
Subject: [PATCH] auditor: 3 fixes from Opus self-audit on 454da15 +
 tree-split deletion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The post-fix audit on commit 454da15 produced a fresh BLOCK and
re-flagged the dead tree-split as still dead. This commit lands the
BLOCK fix and the deletion.

LANDED:

1. kimi_architect.ts:113 BLOCK — MAX_TOKENS=128_000 exceeds Anthropic
   Opus 4.x's 32K output cap. Worked silently (Anthropic clamps
   server-side) but was technically invalid. Replaced single-default
   with `maxTokensFor(model)` returning per-model caps:
     claude-opus-*    -> 32_000   (Opus extended-output)
     claude-haiku-*   -> 8_192    (Haiku/Sonnet default)
     claude-sonnet-*  -> 8_192
     kimi-*           -> 128_000  (reasoning_content needs headroom)
     gpt-5*/o-series  -> 32_000
     default          -> 16_000   (conservative)
   LH_AUDITOR_KIMI_MAX_TOKENS env override still works (forces value
   regardless of model).

2. inference.ts dead-code removal — Opus flagged tree-split as still
   dead post-2026-04-27 mode-runner rebuild. Removed 156 lines:
     runCloudInference  (lines 464-503)  legacy /v1/chat caller
     treeSplitDiff      (lines 547-619)  shard-and-summarize fn
     callCloud          (lines 621-651)  helper for treeSplitDiff
     SHARD_MODEL         const qwen3-coder:480b
     SHARD_CONCURRENCY   const 6
     DIFF_SHARD_SIZE     const 4500
     CURATION_THRESHOLD  const 30000
   No live callers — verified by grep before deletion. The mode runner's
   matrix retrieval against lakehouse_answers_v1 supplies the cross-PR
   context that tree-split was synthesizing from scratch.

3. inference.ts:38-49 stale comment about "curate via tree-split"
   replaced with current "matrix retrieval supplies cross-PR context"
   semantics. Block was already physically gone but the comment
   describing it remained, contradicting the actual code path.

SKIPPED (defensible / minor):

- WARN: outage sentinel TTL refresh on continued failure — intentional
  (refresh keeps cache valid while upstream is still down)
- WARN: enrichment counts use Math.max — defensible (consensus
  enrichment IS the max of the three runs)
- WARN: parseFindings regex eats severity into rationale on
  multi-paragraph inputs — minor, hasn't affected grounding rate
- WARN: selectModel uses pre-truncation diff.length — defensible
  (promotion is "is this audit worth Opus", not "what does the model
  see")
- INFO×3: static.ts state reset, parentStruct walk bound, appendMetrics
  0-finding rows — all defensible per current intent

Verification:
  bun build auditor/checks/{inference,kimi_architect}.ts   compiles
  systemctl restart lakehouse-auditor.service              active

Net: -184 lines, +29 lines (155 net deletion).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 auditor/checks/inference.ts      | 179 ++-----------------------------
 auditor/checks/kimi_architect.ts |  34 +++---
 2 files changed, 29 insertions(+), 184 deletions(-)

diff --git a/auditor/checks/inference.ts b/auditor/checks/inference.ts
index 6a59899..f3f0ec3 100644
--- a/auditor/checks/inference.ts
+++ b/auditor/checks/inference.ts
@@ -33,37 +33,16 @@ const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
 // vendor lineage so consensus + tie-break won't fail-correlate).
 const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "deepseek-v3.1:671b";
 const TIEBREAKER_MODEL = process.env.LH_AUDITOR_TIEBREAKER_MODEL ?? "x-ai/grok-4.1-fast";
"x-ai/grok-4.1-fast"; -// SHARD_MODEL retained for the legacy callCloud path (still used by -// runCloudInference's diagnostic mode), but no longer fired by the -// main inference flow — tree-split was retired 2026-04-27 in favor of -// the mode runner's matrix retrieval against lakehouse_answers_v1. -const SHARD_MODEL = process.env.LH_AUDITOR_SHARD_MODEL ?? "qwen3-coder:480b"; const N_CONSENSUS = Number(process.env.LH_AUDITOR_CONSENSUS_N ?? 3); -// Bounded parallelism on the tree-split shard loop. Old behavior was -// fully serial ("keep gateway load bounded") which made huge PRs take -// 5+ minutes of curation alone. 6 in flight keeps gateway busy without -// thrashing it; tunable via env. -const SHARD_CONCURRENCY = Number(process.env.LH_AUDITOR_SHARD_CONCURRENCY ?? 6); const AUDIT_DISCREPANCIES_JSONL = "/home/profit/lakehouse/data/_kb/audit_discrepancies.jsonl"; -// 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was -// previously truncated at 15KB causing the reviewer to miss later -// files (gitea.ts, policy.ts) and flag "no Gitea client present" as a -// block finding when the file was simply outside the truncation window. -// -// Above this threshold we curate via tree-split rather than truncate, -// following the scrum_master pattern: shard the diff, summarize each -// shard against the claim-verification task, merge into a compact -// scratchpad, then ask the cloud to verify claims against the -// scratchpad. This gives the cloud full-PR fidelity without bursting -// its context window (observed failure mode: empty response or -// unparseable output when prompt exceeds model's comfortable range). +// 40KB comfortably fits the consensus models' context windows +// (deepseek-v3.1 64K, gpt-oss-120b 128K). When the raw PR diff +// exceeds this, we truncate and signal it via curationNote — the +// pr_audit mode runner's matrix retrieval (lakehouse_answers_v1 + +// arch + symbols) supplies the cross-PR context that tree-split +// used to synthesize from scratch. Tree-split itself was retired +// 2026-04-27 (see commit deleting treeSplitDiff/callCloud/SHARD_*). const MAX_DIFF_CHARS = 40000; -// Tree-split kicks in above this. 30KB is below MAX_DIFF_CHARS so we -// curate BEFORE truncation would happen — never lose signal to a hard -// cut. Shard size is chosen so ~10 shards cover PR #8-size diffs in a -// reasonable round-trip budget. -const CURATION_THRESHOLD = 30000; -const DIFF_SHARD_SIZE = 4500; const CALL_TIMEOUT_MS = 120_000; // Mode runner can take longer than a raw /v1/chat call because it does // pathway-fingerprint lookup + matrix retrieval + relevance filter @@ -461,46 +440,6 @@ async function runModeRunnerInference( }; } -// Legacy direct /v1/chat caller — kept for callers outside the -// pr_audit pipeline. Currently unused after the 2026-04-26 mode-runner -// rebuild; preserved so we can A/B against the mode runner if a -// regression surfaces. 
-async function runCloudInference(systemMsg: string, userMsg: string, model: string): Promise<{ parsed: any | null; tokens: number; error?: string; diagnostic?: string; model: string }> {
-  let resp: Response;
-  try {
-    resp = await fetch(`${GATEWAY}/v1/chat`, {
-      method: "POST",
-      headers: { "content-type": "application/json" },
-      body: JSON.stringify({
-        provider: "ollama_cloud",
-        model,
-        messages: [
-          { role: "system", content: systemMsg },
-          { role: "user", content: userMsg },
-        ],
-        max_tokens: 3000,
-        temperature: 0,
-        think: true,
-      }),
-      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
-    });
-  } catch (e) {
-    return { parsed: null, tokens: 0, error: "unreachable", diagnostic: (e as Error).message.slice(0, 200), model };
-  }
-  if (!resp.ok) {
-    return { parsed: null, tokens: 0, error: "non_200", diagnostic: `${resp.status}: ${(await resp.text()).slice(0, 160)}`, model };
-  }
-  let body: any;
-  try { body = await resp.json(); }
-  catch (e) { return { parsed: null, tokens: 0, error: "unparseable", diagnostic: (e as Error).message, model }; }
-  const content: string = body?.choices?.[0]?.message?.content ?? "";
-  const tokens: number = body?.usage?.total_tokens ?? 0;
-  const parsed = extractJson(content);
-  if (!parsed) {
-    return { parsed: null, tokens, error: "unparseable", diagnostic: content.slice(0, 200), model };
-  }
-  return { parsed, tokens, model };
-}
 
 async function persistDiscrepancies(ctx: InferenceContext, discrepancies: any[]): Promise<void> {
   await mkdir("/home/profit/lakehouse/data/_kb", { recursive: true });
@@ -544,111 +483,7 @@ async function extractAndPersistFacts(scratchpad: string, ctx: InferenceContext)
   await appendFile(AUDIT_FACTS_JSONL, JSON.stringify(row) + "\n");
 }
 
-// Curation via tree-split — ports the scrum_master pattern into the
-// inference check. Shards the raw diff into DIFF_SHARD_SIZE chunks,
-// summarizes each shard *against the claim-verification task* so the
-// summary preserves exactly what the cloud needs to judge claims
-// (function signatures, struct fields, deletions, new files), drops
-// everything else. Merges into a compact scratchpad.
-//
-// Cost: N cloud calls for shard summaries + the final verification.
-// Pre-2026-04-26 the shard loop ran serially "to keep gateway load
-// bounded" — turned out to be a bottleneck on PRs with 50+ shards
-// (5+ minutes of curation). Now bounded-parallel via
-// SHARD_CONCURRENCY: in-flight ≤ N at any time, gateway stays calm,
-// wall-clock drops 4-6×.
-//
-// Determinism: each shard summary call uses temp=0 + think=false
-// (same as before), so identical input yields identical scratchpad.
-// Order is preserved by indexed-write into a fixed-length array
-// before string-join, so concurrency doesn't shuffle the scratchpad.
-async function treeSplitDiff(
-  fullDiff: string,
-  claims: Claim[],
-): Promise<{ scratchpad: string; shards: number }> {
-  const shards: Array<{ from: number; to: number; text: string }> = [];
-  for (let i = 0; i < fullDiff.length; i += DIFF_SHARD_SIZE) {
-    const end = Math.min(i + DIFF_SHARD_SIZE, fullDiff.length);
-    shards.push({ from: i, to: end, text: fullDiff.slice(i, end) });
-  }
-  // Curate the claim list into a short form the summary prompt can
-  // use to bias extraction toward relevant facts.
-  const claimDigest = claims.map((c, i) =>
-    `${i}. [${c.strength}] "${c.text.slice(0, 100)}"`
-  ).join("\n");
-  const buildPrompt = (si: number, shard: { from: number; to: number; text: string }): string => [
-    `You are summarizing shard ${si + 1}/${shards.length} (chars ${shard.from}..${shard.to}) of a PR diff.`,
-    `The downstream task will verify these ship-claims against the full-PR summary. Extract ONLY facts that could confirm or refute these claims:`,
-    "",
-    claimDigest,
-    "",
-    "Extract: new function/method signatures, struct fields, deletions, new files, wiring (function X calls Y), absence-of-implementation markers, TODO comments on added lines.",
-    "Skip: comment-only edits, whitespace, import reordering, unrelated cosmetic changes.",
-    "",
-    "─────── shard diff ───────",
-    shard.text,
-    "─────── end shard ───────",
-    "",
-    "Output: up to 180 words of facts in bullet form. No prose preamble, no claim verdicts (that's for the downstream step).",
-  ].join("\n");
-
-  // Pre-allocate so we can write back at the original index from
-  // out-of-order completion.
-  const summaries: string[] = new Array(shards.length).fill("");
-  let nextIdx = 0;
-  async function worker() {
-    while (true) {
-      const myIdx = nextIdx++;
-      if (myIdx >= shards.length) return;
-      const r = await callCloud(buildPrompt(myIdx, shards[myIdx]), 400);
-      summaries[myIdx] = r.content;
-    }
-  }
-  const concurrency = Math.max(1, Math.min(SHARD_CONCURRENCY, shards.length));
-  await Promise.all(Array.from({ length: concurrency }, worker));
-
-  let scratchpad = "";
-  for (const [si, shard] of shards.entries()) {
-    const summary = summaries[si];
-    if (summary) {
-      scratchpad += `\n--- shard ${si + 1} (chars ${shard.from}..${shard.to}) ---\n${summary.trim()}\n`;
-    }
-  }
-  return { scratchpad: scratchpad.trim(), shards: shards.length };
-}
-
-// Minimal cloud caller used only by treeSplitDiff — same gateway +
-// model as the top-level call, but think=false. Shards are small
-// (≤DIFF_SHARD_SIZE ~4500 chars) and the task is pure fact
-// extraction, not reasoning. think=true on the shards introduced
-// variance in reasoning traces that compounded across 23 calls into
-// a non-deterministic scratchpad (observed during curation
-// validation: same-SHA runs produced 5/7/8 final findings).
-// think=false on small prompts is stable — only breaks at the main
-// call's 10K+ prompt size, which keeps think=true.
-async function callCloud(prompt: string, maxTokens: number): Promise<{ content: string }> {
-  try {
-    const r = await fetch(`${GATEWAY}/v1/chat`, {
-      method: "POST",
-      headers: { "content-type": "application/json" },
-      body: JSON.stringify({
-        provider: "ollama_cloud",
-        model: SHARD_MODEL,
-        messages: [{ role: "user", content: prompt }],
-        max_tokens: maxTokens,
-        temperature: 0,
-        think: false,
-      }),
-      signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
-    });
-    if (!r.ok) return { content: "" };
-    const j: any = await r.json();
-    return { content: j?.choices?.[0]?.message?.content ?? "" };
-  } catch {
-    return { content: "" };
-  }
-}
 
 // Pull out plausible code-symbol names from a summary string.
 // Matches:
diff --git a/auditor/checks/kimi_architect.ts b/auditor/checks/kimi_architect.ts
index 3d06ad6..be08c1f 100644
--- a/auditor/checks/kimi_architect.ts
+++ b/auditor/checks/kimi_architect.ts
@@ -69,18 +69,28 @@ function selectModel(diffLen: number): { provider: string; model: string; promot
   }
   return { provider: KIMI_PROVIDER, model: KIMI_MODEL, promoted: false };
 }
-// 128K — Kimi K2.6 emits reasoning_content that counts against this
-// budget but isn't surfaced in the OpenAI-shape `content` field.
-// Capping low silently produces empty content with finish_reason=length
-// when reasoning consumes the budget. 128K leaves ample room for both
-// reasoning and visible findings on any audit prompt we throw at it.
-// Override via LH_AUDITOR_KIMI_MAX_TOKENS only if you want to cap cost.
+// Model-aware max_tokens. Different upstream APIs cap at different
+// limits and reject requests that exceed them:
+//   - Anthropic Opus 4.x: 32K output (with extended-output header)
+//   - Anthropic Haiku 4.5: 8K output
+//   - Kimi K2.6 (reasoning): 128K — needs headroom because
+//     reasoning_content counts against the budget
+//   - Default: 16K, conservative middle ground
 //
-// Bug fix 2026-04-27 (caught by Kimi's own self-audit): empty env var
-// like LH_AUDITOR_KIMI_MAX_TOKENS="" used to parse via Number("") → 0
-// because `??` only catches null/undefined. Use `||` so empty string,
-// 0, or NaN all fall back to the default.
-const MAX_TOKENS = Number(process.env.LH_AUDITOR_KIMI_MAX_TOKENS) || 128_000;
+// 2026-04-27 BLOCK from Opus self-audit: the prior single-default of
+// 128K worked silently (Anthropic clamps server-side) but was
+// technically invalid. Per-model caps make it explicit. Override via
+// LH_AUDITOR_KIMI_MAX_TOKENS to force a value (also fixes the empty-
+// env Number("") -> 0 trap by using `||` not `??`).
+const MAX_TOKENS_OVERRIDE = Number(process.env.LH_AUDITOR_KIMI_MAX_TOKENS) || 0;
+function maxTokensFor(model: string): number {
+  if (MAX_TOKENS_OVERRIDE > 0) return MAX_TOKENS_OVERRIDE;
+  if (model.startsWith("claude-opus")) return 32_000;
+  if (model.startsWith("claude-haiku") || model.startsWith("claude-sonnet")) return 8_192;
+  if (model.startsWith("kimi-")) return 128_000;
+  if (model.startsWith("gpt-5") || model.startsWith("o1") || model.startsWith("o3") || model.startsWith("o4")) return 32_000;
+  return 16_000;
+}
 
 export interface KimiArchitectContext {
   pr_number: number;
@@ -240,7 +250,7 @@ async function callKimi(prompt: string, provider: string, model: string): Promis
     provider,
     model,
     messages: [{ role: "user", content: prompt }],
-    max_tokens: MAX_TOKENS,
+    max_tokens: maxTokensFor(model),
     temperature: 0.2,
   });
   // curl via Bun.spawn — bypasses Bun fetch's ~300s intrinsic ceiling.