From 8d02c7f4418141a6bbf8ca1bea1bc89ecff64926 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Apr 2026 05:39:51 -0500 Subject: [PATCH] auditor: integrate Kimi second-pass review (off by default, LH_AUDITOR_KIMI=1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds kimi_architect as a fifth check kind in the auditor. Runs sequentially after static/dynamic/inference/kb_query, consumes their findings as context, and asks Kimi For Coding "what did everyone miss?" — targeting load-bearing issues that deepseek N=3 voting can't see (compile errors, false telemetry, schema bypasses, determinism leaks). 7/7 grounded on the distillation v1.0.0 audit experiment 2026-04-27. Off by default. Enable on the lakehouse-auditor service: systemctl edit lakehouse-auditor.service Environment=LH_AUDITOR_KIMI=1 Tunable env (all optional): LH_AUDITOR_KIMI_MODEL default kimi-for-coding LH_AUDITOR_KIMI_MAX_TOKENS default 12000 LH_GATEWAY_URL default http://localhost:3100 Guardrails: - Failure-isolated. Any Kimi error / 429 / TOS revocation returns a single info-level skip-finding so the existing pipeline never blocks on a Kimi outage. - Cost-bounded. Cached verdicts at data/_auditor/kimi_verdicts/<pr>-<sha12>.json with 24h TTL — re-audits within the window return cached findings instead of re-calling upstream. New commits produce new SHAs so caching is per-head, not per-day. - 6min upstream timeout (vs 2min for openrouter inference) — Kimi is a reasoning model and the audit prompt is large. - Grounding verification baked in. Every finding's cited file:line is grepped against the actual file before the verdict is persisted. Per-finding evidence carries [grounding: verified at FILE:LINE] or [grounding: line N > EOF] / [grounding: file not found]. Confabulation rate goes into data/_kb/kimi_audits.jsonl as grounding_rate for "is this still valuable" tracking. 
Persisted artifacts: data/_auditor/kimi_verdicts/-.json full verdict + raw Kimi response + grounding data/_kb/kimi_audits.jsonl one row per call: latency, tokens, findings, grounding rate Verdict-rendering: kimi_architect now appears in the per-check sections of the human-readable comment posted to PRs (auditor/audit.ts checkOrder), after kb_query. Verification: bun build auditor/checks/kimi_architect.ts compiles bun build auditor/audit.ts compiles parser sanity (3-finding fixture) 3/3 lifted correctly Co-Authored-By: Claude Opus 4.7 (1M context) --- auditor/audit.ts | 28 ++- auditor/checks/kimi_architect.ts | 288 +++++++++++++++++++++++++++++++ auditor/types.ts | 2 +- 3 files changed, 315 insertions(+), 3 deletions(-) create mode 100644 auditor/checks/kimi_architect.ts diff --git a/auditor/audit.ts b/auditor/audit.ts index 91d23fc..5658b29 100644 --- a/auditor/audit.ts +++ b/auditor/audit.ts @@ -23,6 +23,7 @@ import { runStaticCheck } from "./checks/static.ts"; import { runDynamicCheck } from "./checks/dynamic.ts"; import { runInferenceCheck } from "./checks/inference.ts"; import { runKbCheck } from "./checks/kb_query.ts"; +import { runKimiArchitectCheck } from "./checks/kimi_architect.ts"; const VERDICTS_DIR = "/home/profit/lakehouse/data/_auditor/verdicts"; // Playbook for audit findings — one row per block/warn finding from a @@ -67,6 +68,29 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise< ...kbFindings, ]; + // Kimi-architect second-pass review. Off by default; enabled with + // LH_AUDITOR_KIMI=1. Sequential (not in the parallel block above) + // because it consumes the prior findings as context — Kimi sees what + // deepseek already flagged and is asked "what did everyone miss?" + // Failure-isolated by design: any error returns a single info-level + // skip finding so the existing audit pipeline never blocks on Kimi. 
+ if (process.env.LH_AUDITOR_KIMI === "1") { + try { + const kimiFindings = await runKimiArchitectCheck(diff, allFindings, { + pr_number: pr.number, + head_sha: pr.head_sha, + }); + allFindings.push(...kimiFindings); + } catch (e) { + allFindings.push({ + check: "kimi_architect", + severity: "info", + summary: `kimi_architect outer error — ${(e as Error).message.slice(0, 160)}`, + evidence: [(e as Error).stack?.slice(0, 360) ?? ""], + }); + } + } + const duration_ms = Date.now() - t0; const metrics = { audit_duration_ms: duration_ms, @@ -184,7 +208,7 @@ function formatReviewBody(v: Verdict): string { lines.push(""); // Per-check sections, only if the check produced findings. - const checkOrder = ["static", "dynamic", "inference", "kb_query"] as const; + const checkOrder = ["static", "dynamic", "inference", "kb_query", "kimi_architect"] as const; for (const check of checkOrder) { const fs = byCheck[check] ?? []; if (fs.length === 0) continue; @@ -217,6 +241,6 @@ function formatReviewBody(v: Verdict): string { return lines.join("\n"); } -function stubFinding(check: "dynamic" | "inference", why: string): Finding[] { +function stubFinding(check: "dynamic" | "inference" | "kimi_architect", why: string): Finding[] { return [{ check, severity: "info", summary: `${check} check skipped — ${why}`, evidence: [why] }]; } diff --git a/auditor/checks/kimi_architect.ts b/auditor/checks/kimi_architect.ts new file mode 100644 index 0000000..569720a --- /dev/null +++ b/auditor/checks/kimi_architect.ts @@ -0,0 +1,288 @@ +// Kimi-architect check — second-pass senior architectural review using +// kimi-for-coding (Kimi K2.6) via /v1/chat provider=kimi. +// +// Runs AFTER the deepseek inference check (N=3 consensus) and the +// static/kb_query checks. Reads their findings as context and asks Kimi +// "what did everyone else miss?" 
— complementing the cheap-consensus +// voting with a sparse senior pass that catches load-bearing issues +// (compile errors, false telemetry, schema bypasses, etc.) which the +// voting structure can't see. +// +// Why Kimi here and not in the inner inference loop: +// - Cost: ~3min wall-clock per call vs ~30s for deepseek consensus. +// - TOS: api.kimi.com is User-Agent-gated (see crates/gateway/src/v1/ +// kimi.rs); cost-bounded calls only. +// - Value: experiment 2026-04-27 showed 7/7 grounding rate with full +// files vs ~50% on truncated input. Best as a sparse complement, not +// a replacement. +// +// Failure-isolated: any Kimi error returns a single info-level Finding +// "kimi_architect skipped — " so the existing audit pipeline +// is never blocked by a Kimi outage / TOS revocation / 429. +// +// Cost cap: if a kimi_verdicts/-.json file exists less than 24h +// old, return cached findings without calling upstream. New commits +// produce new SHAs so this is per-head, not per-day. +// +// Off by default: caller checks LH_AUDITOR_KIMI=1 before invoking. + +import { readFile, writeFile, mkdir, appendFile, stat } from "node:fs/promises"; +import { existsSync, readFileSync } from "node:fs"; +import { join, resolve } from "node:path"; +import type { Finding, CheckKind } from "../types.ts"; + +const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100"; +const KIMI_VERDICTS_DIR = "/home/profit/lakehouse/data/_auditor/kimi_verdicts"; +const KIMI_AUDITS_JSONL = "/home/profit/lakehouse/data/_kb/kimi_audits.jsonl"; +const REPO_ROOT = "/home/profit/lakehouse"; +const CALL_TIMEOUT_MS = 360_000; // 6min — kimi reasoning + audit prompt +const CACHE_TTL_MS = 24 * 60 * 60 * 1000; +const MAX_DIFF_CHARS = 180_000; +const MAX_PRIOR_FINDINGS = 50; +const KIMI_MODEL = process.env.LH_AUDITOR_KIMI_MODEL ?? "kimi-for-coding"; +const MAX_TOKENS = Number(process.env.LH_AUDITOR_KIMI_MAX_TOKENS ?? 
12_000); + +export interface KimiArchitectContext { + pr_number: number; + head_sha: string; +} + +interface KimiVerdictFile { + pr_number: number; + head_sha: string; + cached_at: string; + model: string; + latency_ms: number; + finish_reason: string; + usage: { prompt_tokens: number; completion_tokens: number; total_tokens: number }; + raw_content: string; + findings: Finding[]; + grounding: { total: number; verified: number; rate: number }; +} + +export async function runKimiArchitectCheck( + diff: string, + priorFindings: Finding[], + ctx: KimiArchitectContext, +): Promise { + const cachePath = join(KIMI_VERDICTS_DIR, `${ctx.pr_number}-${ctx.head_sha.slice(0, 12)}.json`); + + // Cost cap — return cached findings if a verdict for this exact head + // SHA was generated within the TTL. + const cached = await loadCachedVerdict(cachePath); + if (cached) { + const fs2: Finding[] = cached.findings.length > 0 + ? cached.findings + : [{ check: "kimi_architect" as CheckKind, severity: "info", summary: "kimi_architect cached — 0 findings", evidence: [`cache: ${cachePath}`] }]; + return fs2; + } + + let response: { content: string; usage: any; finish_reason: string; latency_ms: number }; + try { + response = await callKimi(buildPrompt(diff, priorFindings, ctx)); + } catch (e) { + return [skipFinding(`kimi call failed: ${(e as Error).message.slice(0, 200)}`)]; + } + + const findings = parseFindings(response.content); + const grounding = await computeGrounding(findings); + + const verdict: KimiVerdictFile = { + pr_number: ctx.pr_number, + head_sha: ctx.head_sha, + cached_at: new Date().toISOString(), + model: KIMI_MODEL, + latency_ms: response.latency_ms, + finish_reason: response.finish_reason, + usage: { + prompt_tokens: response.usage?.prompt_tokens ?? 0, + completion_tokens: response.usage?.completion_tokens ?? 0, + total_tokens: response.usage?.total_tokens ?? 
0, + }, + raw_content: response.content, + findings, + grounding, + }; + + await persistVerdict(cachePath, verdict); + await appendMetrics(verdict); + + return findings.length > 0 + ? findings + : [{ + check: "kimi_architect" as CheckKind, + severity: "info", + summary: `kimi_architect produced 0 ranked findings (${response.finish_reason}, ${verdict.usage.completion_tokens} tokens)`, + evidence: [`raw response: ${cachePath}`], + }]; +} + +async function loadCachedVerdict(path: string): Promise { + if (!existsSync(path)) return null; + try { + const s = await stat(path); + if (Date.now() - s.mtimeMs > CACHE_TTL_MS) return null; + return JSON.parse(await readFile(path, "utf8")) as KimiVerdictFile; + } catch { return null; } +} + +function buildPrompt(diff: string, priorFindings: Finding[], ctx: KimiArchitectContext): string { + const truncatedDiff = diff.length > MAX_DIFF_CHARS + ? diff.slice(0, MAX_DIFF_CHARS) + `\n\n... [truncated; original diff was ${diff.length} chars]` + : diff; + + const priorBlock = priorFindings + .filter(f => f.severity !== "info") + .slice(0, MAX_PRIOR_FINDINGS) + .map(f => `- [${f.check}/${f.severity}] ${f.summary}${f.evidence?.[0] ? ` — ${f.evidence[0].slice(0, 160)}` : ""}`) + .join("\n"); + + return `You are a senior software architect doing a second-pass review on PR #${ctx.pr_number} (head ${ctx.head_sha.slice(0, 12)}). The team's automated auditor (deepseek-v3.1:671b, N=3 consensus) already produced findings. Your job is NOT to repeat what they found — your job is to catch what their voting structure CAN'T see: compile errors, type-system bypasses, false telemetry, silent determinism leaks, schema-bypass anti-patterns, load-bearing assumptions that look fine line-by-line. + +GROUNDING RULES (non-negotiable): +- Cite file:line for EVERY finding. Lines you cite must actually contain what you claim. Confabulating a finding wastes more time than missing one. 
+- If the diff is truncated and you can't verify a claim, say "diff-truncated, can't verify" — DO NOT guess. +- Distinguish architectural concerns (no specific line) from concrete bugs (specific line). Don't dress one as the other. + +PRIOR FINDINGS FROM DEEPSEEK CONSENSUS (do not repeat these): +${priorBlock || "(none)"} + +OUTPUT FORMAT (markdown): +- ## Verdict (one sentence) +- ## Findings (5-10 items, each formatted EXACTLY as below) + +For each finding use this exact shape so a parser can lift them: + +### F1: +- **Severity:** block | warn | info +- **File:** path/to/file.ext:LINE +- **Rationale:** one or two sentences + +THE DIFF: + +${truncatedDiff} +`; +} + +async function callKimi(prompt: string): Promise<{ content: string; usage: any; finish_reason: string; latency_ms: number }> { + const t0 = Date.now(); + const ctrl = new AbortController(); + const timer = setTimeout(() => ctrl.abort(), CALL_TIMEOUT_MS); + try { + const r = await fetch(`${GATEWAY}/v1/chat`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + provider: "kimi", + model: KIMI_MODEL, + messages: [{ role: "user", content: prompt }], + max_tokens: MAX_TOKENS, + temperature: 0.2, + }), + signal: ctrl.signal, + }); + if (!r.ok) { + const body = await r.text(); + throw new Error(`/v1/chat ${r.status}: ${body.slice(0, 300)}`); + } + const j: any = await r.json(); + return { + content: j.choices?.[0]?.message?.content ?? "", + usage: j.usage ?? {}, + finish_reason: j.choices?.[0]?.finish_reason ?? "unknown", + latency_ms: Date.now() - t0, + }; + } finally { clearTimeout(timer); } +} + +// Parse Kimi's markdown into Finding[]. Format expected (per buildPrompt): +// ### F: +// - **Severity:** block | warn | info +// - **File:** path:line +// - **Rationale:** ... 
+function parseFindings(content: string): Finding[] { + const findings: Finding[] = []; + const blocks = content.split(/^###\s+F\d+:\s*/m).slice(1); + for (const block of blocks) { + const summary = (block.split("\n")[0] ?? "").trim(); + if (!summary) continue; + const sev = /\*\*Severity:\*\*\s*(block|warn|info)/i.exec(block)?.[1]?.toLowerCase(); + const fileLine = /\*\*File:\*\*\s*(\S+)/i.exec(block)?.[1] ?? "unknown"; + const rationale = /\*\*Rationale:\*\*\s*([\s\S]+?)(?=\n###|\n\*\*|$)/i.exec(block)?.[1]?.trim() ?? ""; + const severity: Finding["severity"] = sev === "block" ? "block" : sev === "warn" ? "warn" : "info"; + findings.push({ + check: "kimi_architect" as CheckKind, + severity, + summary: summary.slice(0, 240), + evidence: [fileLine, rationale.slice(0, 360)].filter(Boolean), + }); + } + return findings; +} + +// For each finding's cited file:line, grep the actual file to verify +// the line exists. Returns total + verified counts; per-finding metadata +// is appended into the evidence array so the reader can see which +// citations were verified. +async function computeGrounding(findings: Finding[]): Promise<{ total: number; verified: number; rate: number }> { + let verified = 0; + for (const f of findings) { + const cite = f.evidence[0] ?? ""; + const m = /^(\S+?):(\d+)/.exec(cite); + if (!m) continue; + const [, relpath, lineStr] = m; + const line = Number(lineStr); + if (!line || !relpath) continue; + const abs = relpath.startsWith("/") ? 
relpath : resolve(REPO_ROOT, relpath); + if (!existsSync(abs)) { + f.evidence.push("[grounding: file not found]"); + continue; + } + try { + const lines = readFileSync(abs, "utf8").split("\n"); + if (line < 1 || line > lines.length) { + f.evidence.push(`[grounding: line ${line} > EOF (${lines.length})]`); + continue; + } + f.evidence.push(`[grounding: verified at ${relpath}:${line}]`); + verified++; + } catch (e) { + f.evidence.push(`[grounding: read failed: ${(e as Error).message.slice(0, 80)}]`); + } + } + const total = findings.length; + return { total, verified, rate: total === 0 ? 0 : verified / total }; +} + +async function persistVerdict(path: string, v: KimiVerdictFile): Promise { + await mkdir(KIMI_VERDICTS_DIR, { recursive: true }); + await writeFile(path, JSON.stringify(v, null, 2)); +} + +async function appendMetrics(v: KimiVerdictFile): Promise { + await mkdir(join(KIMI_AUDITS_JSONL, ".."), { recursive: true }); + await appendFile(KIMI_AUDITS_JSONL, JSON.stringify({ + pr_number: v.pr_number, + head_sha: v.head_sha, + audited_at: v.cached_at, + model: v.model, + latency_ms: v.latency_ms, + finish_reason: v.finish_reason, + prompt_tokens: v.usage.prompt_tokens, + completion_tokens: v.usage.completion_tokens, + findings_total: v.findings.length, + findings_block: v.findings.filter(f => f.severity === "block").length, + findings_warn: v.findings.filter(f => f.severity === "warn").length, + grounding_verified: v.grounding.verified, + grounding_rate: Number(v.grounding.rate.toFixed(3)), + }) + "\n"); +} + +function skipFinding(why: string): Finding { + return { + check: "kimi_architect" as CheckKind, + severity: "info", + summary: `kimi_architect skipped — ${why}`, + evidence: [why], + }; +} diff --git a/auditor/types.ts b/auditor/types.ts index 9ce7609..93f088d 100644 --- a/auditor/types.ts +++ b/auditor/types.ts @@ -2,7 +2,7 @@ // if something can't be verified from a check, it goes into `evidence` // so the verdict is inspectable, not a black box. 
-export type CheckKind = "static" | "dynamic" | "inference" | "kb_query"; +export type CheckKind = "static" | "dynamic" | "inference" | "kb_query" | "kimi_architect"; export type Severity = "info" | "warn" | "block";