auditor: 3 fixes from Opus self-audit on 454da15 + tree-split deletion
Some checks failed
lakehouse/auditor 14 blocking issues: cloud: claim not backed — "Verified end-to-end:"
Some checks failed
lakehouse/auditor 14 blocking issues: cloud: claim not backed — "Verified end-to-end:"
The post-fix audit on commit 454da15 produced a fresh BLOCK and
re-flagged the dead tree-split as still dead. This commit lands the
BLOCK fix and the deletion.
LANDED:
1. kimi_architect.ts:113 BLOCK — MAX_TOKENS=128_000 exceeds Anthropic
Opus 4.x's 32K output cap. It worked silently (Anthropic clamps
server-side) but was technically invalid. Replaced the single default
with `maxTokensFor(model)`, which returns per-model caps:
claude-opus-* -> 32_000 (Opus extended-output)
claude-haiku-* -> 8_192 (Haiku/Sonnet default)
claude-sonnet-* -> 8_192
kimi-* -> 128_000 (reasoning_content needs headroom)
gpt-5*/o-series -> 32_000
default -> 16_000 (conservative)
LH_AUDITOR_KIMI_MAX_TOKENS env override still works (forces value
regardless of model).
2. inference.ts dead-code removal — Opus flagged tree-split as still
dead post-2026-04-27 mode-runner rebuild. Removed 156 lines:
runCloudInference (lines 464-503) — legacy /v1/chat caller
treeSplitDiff (lines 547-619) — shard-and-summarize fn
callCloud (lines 621-651) — helper for treeSplitDiff
SHARD_MODEL const — qwen3-coder:480b
SHARD_CONCURRENCY const — 6
DIFF_SHARD_SIZE const — 4500
CURATION_THRESHOLD const — 30000
No live callers — verified by grep before deletion. The mode
runner's matrix retrieval against lakehouse_answers_v1 supplies
the cross-PR context that tree-split was synthesizing from scratch.
3. inference.ts:38-49 stale comment about "curate via tree-split"
replaced with current "matrix retrieval supplies cross-PR context"
semantics. Block was already physically gone but the comment
describing it remained, contradicting the actual code path.
SKIPPED (defensible / minor):
- WARN: outage sentinel TTL refresh on continued failure — intentional
(refresh keeps cache valid while upstream is still down)
- WARN: enrichment counts use Math.max — defensible (consensus
enrichment IS the max of the three runs)
- WARN: parseFindings regex eats severity into rationale on multi-
paragraph inputs — minor, hasn't affected grounding rate
- WARN: selectModel uses pre-truncation diff.length — defensible
(promotion is "is this audit worth Opus", not "what does the model
see")
- INFO×3: static.ts state reset, parentStruct walk bound,
appendMetrics 0-finding rows — all defensible per current intent
Verification:
bun build auditor/checks/{inference,kimi_architect}.ts -> compiles
systemctl restart lakehouse-auditor.service -> active
Net: -184 lines, +29 lines (net deletion of 155 lines).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
454da15301
commit
19a65b87e3
@ -33,37 +33,16 @@ const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
|
|||||||
// vendor lineage so consensus + tie-break won't fail-correlate).
|
// vendor lineage so consensus + tie-break won't fail-correlate).
|
||||||
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "deepseek-v3.1:671b";
|
const MODEL = process.env.LH_AUDITOR_REVIEW_MODEL ?? "deepseek-v3.1:671b";
|
||||||
const TIEBREAKER_MODEL = process.env.LH_AUDITOR_TIEBREAKER_MODEL ?? "x-ai/grok-4.1-fast";
|
const TIEBREAKER_MODEL = process.env.LH_AUDITOR_TIEBREAKER_MODEL ?? "x-ai/grok-4.1-fast";
|
||||||
// SHARD_MODEL retained for the legacy callCloud path (still used by
|
|
||||||
// runCloudInference's diagnostic mode), but no longer fired by the
|
|
||||||
// main inference flow — tree-split was retired 2026-04-27 in favor of
|
|
||||||
// the mode runner's matrix retrieval against lakehouse_answers_v1.
|
|
||||||
const SHARD_MODEL = process.env.LH_AUDITOR_SHARD_MODEL ?? "qwen3-coder:480b";
|
|
||||||
const N_CONSENSUS = Number(process.env.LH_AUDITOR_CONSENSUS_N ?? 3);
|
const N_CONSENSUS = Number(process.env.LH_AUDITOR_CONSENSUS_N ?? 3);
|
||||||
// Bounded parallelism on the tree-split shard loop. Old behavior was
|
|
||||||
// fully serial ("keep gateway load bounded") which made huge PRs take
|
|
||||||
// 5+ minutes of curation alone. 6 in flight keeps gateway busy without
|
|
||||||
// thrashing it; tunable via env.
|
|
||||||
const SHARD_CONCURRENCY = Number(process.env.LH_AUDITOR_SHARD_CONCURRENCY ?? 6);
|
|
||||||
const AUDIT_DISCREPANCIES_JSONL = "/home/profit/lakehouse/data/_kb/audit_discrepancies.jsonl";
|
const AUDIT_DISCREPANCIES_JSONL = "/home/profit/lakehouse/data/_kb/audit_discrepancies.jsonl";
|
||||||
// 40KB comfortably fits gpt-oss:120b's context. PR #1 (~39KB) was
|
// 40KB comfortably fits the consensus models' context windows
|
||||||
// previously truncated at 15KB causing the reviewer to miss later
|
// (deepseek-v3.1 64K, gpt-oss:120b 128K). When the raw PR diff
|
||||||
// files (gitea.ts, policy.ts) and flag "no Gitea client present" as a
|
// exceeds this, we truncate and signal it via curationNote — the
|
||||||
// block finding when the file was simply outside the truncation window.
|
// pr_audit mode runner's matrix retrieval (lakehouse_answers_v1 +
|
||||||
//
|
// arch + symbols) supplies the cross-PR context that tree-split
|
||||||
// Above this threshold we curate via tree-split rather than truncate,
|
// used to synthesize from scratch. Tree-split itself was retired
|
||||||
// following the scrum_master pattern: shard the diff, summarize each
|
// 2026-04-27 (see commit deleting treeSplitDiff/callCloud/SHARD_*).
|
||||||
// shard against the claim-verification task, merge into a compact
|
|
||||||
// scratchpad, then ask the cloud to verify claims against the
|
|
||||||
// scratchpad. This gives the cloud full-PR fidelity without bursting
|
|
||||||
// its context window (observed failure mode: empty response or
|
|
||||||
// unparseable output when prompt exceeds model's comfortable range).
|
|
||||||
const MAX_DIFF_CHARS = 40000;
|
const MAX_DIFF_CHARS = 40000;
|
||||||
// Tree-split kicks in above this. 30KB is below MAX_DIFF_CHARS so we
|
|
||||||
// curate BEFORE truncation would happen — never lose signal to a hard
|
|
||||||
// cut. Shard size is chosen so ~10 shards cover PR #8-size diffs in a
|
|
||||||
// reasonable round-trip budget.
|
|
||||||
const CURATION_THRESHOLD = 30000;
|
|
||||||
const DIFF_SHARD_SIZE = 4500;
|
|
||||||
const CALL_TIMEOUT_MS = 120_000;
|
const CALL_TIMEOUT_MS = 120_000;
|
||||||
// Mode runner can take longer than a raw /v1/chat call because it does
|
// Mode runner can take longer than a raw /v1/chat call because it does
|
||||||
// pathway-fingerprint lookup + matrix retrieval + relevance filter
|
// pathway-fingerprint lookup + matrix retrieval + relevance filter
|
||||||
@ -461,46 +440,6 @@ async function runModeRunnerInference(
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Legacy direct /v1/chat caller — kept for callers outside the
|
|
||||||
// pr_audit pipeline. Currently unused after the 2026-04-26 mode-runner
|
|
||||||
// rebuild; preserved so we can A/B against the mode runner if a
|
|
||||||
// regression surfaces.
|
|
||||||
async function runCloudInference(systemMsg: string, userMsg: string, model: string): Promise<{ parsed: any | null; tokens: number; error?: string; diagnostic?: string; model: string }> {
|
|
||||||
let resp: Response;
|
|
||||||
try {
|
|
||||||
resp = await fetch(`${GATEWAY}/v1/chat`, {
|
|
||||||
method: "POST",
|
|
||||||
headers: { "content-type": "application/json" },
|
|
||||||
body: JSON.stringify({
|
|
||||||
provider: "ollama_cloud",
|
|
||||||
model,
|
|
||||||
messages: [
|
|
||||||
{ role: "system", content: systemMsg },
|
|
||||||
{ role: "user", content: userMsg },
|
|
||||||
],
|
|
||||||
max_tokens: 3000,
|
|
||||||
temperature: 0,
|
|
||||||
think: true,
|
|
||||||
}),
|
|
||||||
signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
|
|
||||||
});
|
|
||||||
} catch (e) {
|
|
||||||
return { parsed: null, tokens: 0, error: "unreachable", diagnostic: (e as Error).message.slice(0, 200), model };
|
|
||||||
}
|
|
||||||
if (!resp.ok) {
|
|
||||||
return { parsed: null, tokens: 0, error: "non_200", diagnostic: `${resp.status}: ${(await resp.text()).slice(0, 160)}`, model };
|
|
||||||
}
|
|
||||||
let body: any;
|
|
||||||
try { body = await resp.json(); }
|
|
||||||
catch (e) { return { parsed: null, tokens: 0, error: "unparseable", diagnostic: (e as Error).message, model }; }
|
|
||||||
const content: string = body?.choices?.[0]?.message?.content ?? "";
|
|
||||||
const tokens: number = body?.usage?.total_tokens ?? 0;
|
|
||||||
const parsed = extractJson(content);
|
|
||||||
if (!parsed) {
|
|
||||||
return { parsed: null, tokens, error: "unparseable", diagnostic: content.slice(0, 200), model };
|
|
||||||
}
|
|
||||||
return { parsed, tokens, model };
|
|
||||||
}
|
|
||||||
|
|
||||||
async function persistDiscrepancies(ctx: InferenceContext, discrepancies: any[]): Promise<void> {
|
async function persistDiscrepancies(ctx: InferenceContext, discrepancies: any[]): Promise<void> {
|
||||||
await mkdir("/home/profit/lakehouse/data/_kb", { recursive: true });
|
await mkdir("/home/profit/lakehouse/data/_kb", { recursive: true });
|
||||||
@ -544,111 +483,7 @@ async function extractAndPersistFacts(scratchpad: string, ctx: InferenceContext)
|
|||||||
await appendFile(AUDIT_FACTS_JSONL, JSON.stringify(row) + "\n");
|
await appendFile(AUDIT_FACTS_JSONL, JSON.stringify(row) + "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Curation via tree-split — ports the scrum_master pattern into the
|
|
||||||
// inference check. Shards the raw diff into DIFF_SHARD_SIZE chunks,
|
|
||||||
// summarizes each shard *against the claim-verification task* so the
|
|
||||||
// summary preserves exactly what the cloud needs to judge claims
|
|
||||||
// (function signatures, struct fields, deletions, new files), drops
|
|
||||||
// everything else. Merges into a compact scratchpad.
|
|
||||||
//
|
|
||||||
// Cost: N cloud calls for shard summaries + the final verification.
|
|
||||||
// Pre-2026-04-26 the shard loop ran serially "to keep gateway load
|
|
||||||
// bounded" — turned out to be a bottleneck on PRs with 50+ shards
|
|
||||||
// (5+ minutes of curation). Now bounded-parallel via
|
|
||||||
// SHARD_CONCURRENCY: in-flight ≤ N at any time, gateway stays calm,
|
|
||||||
// wall-clock drops 4-6×.
|
|
||||||
//
|
|
||||||
// Determinism: each shard summary call uses temp=0 + think=false
|
|
||||||
// (same as before), so identical input yields identical scratchpad.
|
|
||||||
// Order is preserved by indexed-write into a fixed-length array
|
|
||||||
// before string-join, so concurrency doesn't shuffle the scratchpad.
|
|
||||||
async function treeSplitDiff(
|
|
||||||
fullDiff: string,
|
|
||||||
claims: Claim[],
|
|
||||||
): Promise<{ scratchpad: string; shards: number }> {
|
|
||||||
const shards: Array<{ from: number; to: number; text: string }> = [];
|
|
||||||
for (let i = 0; i < fullDiff.length; i += DIFF_SHARD_SIZE) {
|
|
||||||
const end = Math.min(i + DIFF_SHARD_SIZE, fullDiff.length);
|
|
||||||
shards.push({ from: i, to: end, text: fullDiff.slice(i, end) });
|
|
||||||
}
|
|
||||||
// Curate the claim list into a short form the summary prompt can
|
|
||||||
// use to bias extraction toward relevant facts.
|
|
||||||
const claimDigest = claims.map((c, i) =>
|
|
||||||
`${i}. [${c.strength}] "${c.text.slice(0, 100)}"`
|
|
||||||
).join("\n");
|
|
||||||
|
|
||||||
const buildPrompt = (si: number, shard: { from: number; to: number; text: string }): string => [
|
|
||||||
`You are summarizing shard ${si + 1}/${shards.length} (chars ${shard.from}..${shard.to}) of a PR diff.`,
|
|
||||||
`The downstream task will verify these ship-claims against the full-PR summary. Extract ONLY facts that could confirm or refute these claims:`,
|
|
||||||
"",
|
|
||||||
claimDigest,
|
|
||||||
"",
|
|
||||||
"Extract: new function/method signatures, struct fields, deletions, new files, wiring (function X calls Y), absence-of-implementation markers, TODO comments on added lines.",
|
|
||||||
"Skip: comment-only edits, whitespace, import reordering, unrelated cosmetic changes.",
|
|
||||||
"",
|
|
||||||
"─────── shard diff ───────",
|
|
||||||
shard.text,
|
|
||||||
"─────── end shard ───────",
|
|
||||||
"",
|
|
||||||
"Output: up to 180 words of facts in bullet form. No prose preamble, no claim verdicts (that's for the downstream step).",
|
|
||||||
].join("\n");
|
|
||||||
|
|
||||||
// Pre-allocate so we can write back at the original index from
|
|
||||||
// out-of-order completion.
|
|
||||||
const summaries: string[] = new Array(shards.length).fill("");
|
|
||||||
let nextIdx = 0;
|
|
||||||
async function worker() {
|
|
||||||
while (true) {
|
|
||||||
const myIdx = nextIdx++;
|
|
||||||
if (myIdx >= shards.length) return;
|
|
||||||
const r = await callCloud(buildPrompt(myIdx, shards[myIdx]), 400);
|
|
||||||
summaries[myIdx] = r.content;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const concurrency = Math.max(1, Math.min(SHARD_CONCURRENCY, shards.length));
|
|
||||||
await Promise.all(Array.from({ length: concurrency }, worker));
|
|
||||||
|
|
||||||
let scratchpad = "";
|
|
||||||
for (const [si, shard] of shards.entries()) {
|
|
||||||
const summary = summaries[si];
|
|
||||||
if (summary) {
|
|
||||||
scratchpad += `\n--- shard ${si + 1} (chars ${shard.from}..${shard.to}) ---\n${summary.trim()}\n`;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return { scratchpad: scratchpad.trim(), shards: shards.length };
|
|
||||||
}
|
|
||||||
|
|
||||||
// Minimal cloud caller used only by treeSplitDiff — same gateway +
|
|
||||||
// model as the top-level call, but think=false. Shards are small
|
|
||||||
// (≤DIFF_SHARD_SIZE ~4500 chars) and the task is pure fact
|
|
||||||
// extraction, not reasoning. think=true on the shards introduced
|
|
||||||
// variance in reasoning traces that compounded across 23 calls into
|
|
||||||
// a non-deterministic scratchpad (observed during curation
|
|
||||||
// validation: same-SHA runs produced 5/7/8 final findings).
|
|
||||||
// think=false on small prompts is stable — only breaks at the main
|
|
||||||
// call's 10K+ prompt size, which keeps think=true.
|
|
||||||
async function callCloud(prompt: string, maxTokens: number): Promise<{ content: string }> {
|
|
||||||
try {
|
|
||||||
const r = await fetch(`${GATEWAY}/v1/chat`, {
|
|
||||||
method: "POST",
|
|
||||||
headers: { "content-type": "application/json" },
|
|
||||||
body: JSON.stringify({
|
|
||||||
provider: "ollama_cloud",
|
|
||||||
model: SHARD_MODEL,
|
|
||||||
messages: [{ role: "user", content: prompt }],
|
|
||||||
max_tokens: maxTokens,
|
|
||||||
temperature: 0,
|
|
||||||
think: false,
|
|
||||||
}),
|
|
||||||
signal: AbortSignal.timeout(CALL_TIMEOUT_MS),
|
|
||||||
});
|
|
||||||
if (!r.ok) return { content: "" };
|
|
||||||
const j: any = await r.json();
|
|
||||||
return { content: j?.choices?.[0]?.message?.content ?? "" };
|
|
||||||
} catch {
|
|
||||||
return { content: "" };
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Pull out plausible code-symbol names from a summary string.
|
// Pull out plausible code-symbol names from a summary string.
|
||||||
// Matches:
|
// Matches:
|
||||||
|
|||||||
@ -69,18 +69,28 @@ function selectModel(diffLen: number): { provider: string; model: string; promot
|
|||||||
}
|
}
|
||||||
return { provider: KIMI_PROVIDER, model: KIMI_MODEL, promoted: false };
|
return { provider: KIMI_PROVIDER, model: KIMI_MODEL, promoted: false };
|
||||||
}
|
}
|
||||||
// 128K — Kimi K2.6 emits reasoning_content that counts against this
|
// Model-aware max_tokens. Different upstream APIs cap at different
|
||||||
// budget but isn't surfaced in the OpenAI-shape `content` field.
|
// limits and clamp or reject requests that exceed them:
|
||||||
// Capping low silently produces empty content with finish_reason=length
|
// - Anthropic Opus 4.x: 32K output (with extended-output header)
|
||||||
// when reasoning consumes the budget. 128K leaves ample room for both
|
// - Anthropic Haiku 4.5 / Sonnet: 8K output
|
||||||
// reasoning and visible findings on any audit prompt we throw at it.
|
// - Kimi K2.6 (reasoning): 128K — needs headroom because
|
||||||
// Override via LH_AUDITOR_KIMI_MAX_TOKENS only if you want to cap cost.
|
// reasoning_content counts against the budget
|
||||||
|
// - Default: 16K, conservative middle ground
|
||||||
//
|
//
|
||||||
// Bug fix 2026-04-27 (caught by Kimi's own self-audit): empty env var
|
// 2026-04-27 BLOCK from Opus self-audit: the prior single-default of
|
||||||
// like LH_AUDITOR_KIMI_MAX_TOKENS="" used to parse via Number("") → 0
|
// 128K worked silently (Anthropic clamps server-side) but was
|
||||||
// because `??` only catches null/undefined. Use `||` so empty string,
|
// technically invalid. Per-model caps make it explicit. Override via
|
||||||
// 0, or NaN all fall back to the default.
|
// LH_AUDITOR_KIMI_MAX_TOKENS to force a value (also fixes the empty-
|
||||||
const MAX_TOKENS = Number(process.env.LH_AUDITOR_KIMI_MAX_TOKENS) || 128_000;
|
// env Number("") -> 0 trap by using `||` not `??`).
|
||||||
|
const MAX_TOKENS_OVERRIDE = Number(process.env.LH_AUDITOR_KIMI_MAX_TOKENS) || 0;
|
||||||
|
function maxTokensFor(model: string): number {
|
||||||
|
if (MAX_TOKENS_OVERRIDE > 0) return MAX_TOKENS_OVERRIDE;
|
||||||
|
if (model.startsWith("claude-opus")) return 32_000;
|
||||||
|
if (model.startsWith("claude-haiku") || model.startsWith("claude-sonnet")) return 8_192;
|
||||||
|
if (model.startsWith("kimi-")) return 128_000;
|
||||||
|
if (model.startsWith("gpt-5") || model.startsWith("o1") || model.startsWith("o3") || model.startsWith("o4")) return 32_000;
|
||||||
|
return 16_000;
|
||||||
|
}
|
||||||
|
|
||||||
export interface KimiArchitectContext {
|
export interface KimiArchitectContext {
|
||||||
pr_number: number;
|
pr_number: number;
|
||||||
@ -240,7 +250,7 @@ async function callKimi(prompt: string, provider: string, model: string): Promis
|
|||||||
provider,
|
provider,
|
||||||
model,
|
model,
|
||||||
messages: [{ role: "user", content: prompt }],
|
messages: [{ role: "user", content: prompt }],
|
||||||
max_tokens: MAX_TOKENS,
|
max_tokens: maxTokensFor(model),
|
||||||
temperature: 0.2,
|
temperature: 0.2,
|
||||||
});
|
});
|
||||||
// curl via Bun.spawn — bypasses Bun fetch's ~300s intrinsic ceiling.
|
// curl via Bun.spawn — bypasses Bun fetch's ~300s intrinsic ceiling.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user