auditor: integrate Kimi second-pass review (off by default, LH_AUDITOR_KIMI=1)

Adds kimi_architect as a fifth check kind in the auditor. Runs sequentially after static/dynamic/inference/kb_query, consumes their findings as context, and asks Kimi For Coding "what did everyone miss?" — targeting load-bearing issues that deepseek N=3 voting can't see (compile errors, false telemetry, schema bypasses, determinism leaks). 7/7 grounded on the distillation v1.0.0 audit experiment 2026-04-27. Off by default. Enable on the lakehouse-auditor service: systemctl edit lakehouse-auditor.service Environment=LH_AUDITOR_KIMI=1 Tunable env (all optional): LH_AUDITOR_KIMI_MODEL default kimi-for-coding LH_AUDITOR_KIMI_MAX_TOKENS default 12000 LH_GATEWAY_URL default http://localhost:3100 Guardrails: - Failure-isolated. Any Kimi error / 429 / TOS revocation returns a single info-level skip-finding so the existing pipeline never blocks on a Kimi outage. - Cost-bounded. Cached verdicts at data/_auditor/kimi_verdicts/<pr>-<sha>.json with 24h TTL — re-audits within the window return cached findings instead of re-calling upstream. New commits produce new SHAs so caching is per-head, not per-day. - 6min upstream timeout (vs 2min for openrouter inference) — Kimi is a reasoning model and the audit prompt is large. - Grounding verification baked in. Every finding's cited file:line is grepped against the actual file before the verdict is persisted. Per-finding evidence carries [grounding: verified at FILE:LINE] or [grounding: line N > EOF] / [grounding: file not found]. Confabulation rate goes into data/_kb/kimi_audits.jsonl as grounding_rate for "is this still valuable" tracking. Persisted artifacts: data/_auditor/kimi_verdicts/<pr>-<sha>.json full verdict + raw Kimi response + grounding data/_kb/kimi_audits.jsonl one row per call: latency, tokens, findings, grounding rate Verdict-rendering: kimi_architect now appears in the per-check sections of the human-readable comment posted to PRs (auditor/audit.ts checkOrder), after kb_query. 
Verification: bun build auditor/checks/kimi_architect.ts compiles bun build auditor/audit.ts compiles parser sanity (3-finding fixture) 3/3 lifted correctly Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 05:39:51 -05:00 · 2026-04-27 05:39:51 -05:00 · 8d02c7f441
commit 8d02c7f441
parent 643dd2d520
3 changed files with 315 additions and 3 deletions
--- a/auditor/audit.ts
+++ b/auditor/audit.ts
@ -23,6 +23,7 @@ import { runStaticCheck } from "./checks/static.ts";
 import { runDynamicCheck } from "./checks/dynamic.ts";
 import { runInferenceCheck } from "./checks/inference.ts";
 import { runKbCheck } from "./checks/kb_query.ts";
+import { runKimiArchitectCheck } from "./checks/kimi_architect.ts";

 const VERDICTS_DIR = "/home/profit/lakehouse/data/_auditor/verdicts";
 // Playbook for audit findings — one row per block/warn finding from a
@ -67,6 +68,29 @@ export async function auditPr(pr: PrSnapshot, opts: AuditOptions = {}): Promise<
    ...kbFindings,
  ];

+  // Kimi-architect second-pass review. Off by default; enabled with
+  // LH_AUDITOR_KIMI=1. Sequential (not in the parallel block above)
+  // because it consumes the prior findings as context — Kimi sees what
+  // deepseek already flagged and is asked "what did everyone miss?"
+  // Failure-isolated by design: any error returns a single info-level
+  // skip finding so the existing audit pipeline never blocks on Kimi.
+  if (process.env.LH_AUDITOR_KIMI === "1") {
+    try {
+      const kimiFindings = await runKimiArchitectCheck(diff, allFindings, {
+        pr_number: pr.number,
+        head_sha: pr.head_sha,
+      });
+      allFindings.push(...kimiFindings);
+    } catch (e) {
+      allFindings.push({
+        check: "kimi_architect",
+        severity: "info",
+        summary: `kimi_architect outer error — ${(e as Error).message.slice(0, 160)}`,
+        evidence: [(e as Error).stack?.slice(0, 360) ?? ""],
+      });
+    }
+  }
+
  const duration_ms = Date.now() - t0;
  const metrics = {
    audit_duration_ms: duration_ms,
@ -184,7 +208,7 @@ function formatReviewBody(v: Verdict): string {
  lines.push("");

  // Per-check sections, only if the check produced findings.
-  const checkOrder = ["static", "dynamic", "inference", "kb_query"] as const;
+  const checkOrder = ["static", "dynamic", "inference", "kb_query", "kimi_architect"] as const;
  for (const check of checkOrder) {
    const fs = byCheck[check] ?? [];
    if (fs.length === 0) continue;
@ -217,6 +241,6 @@ function formatReviewBody(v: Verdict): string {
  return lines.join("\n");
 }

-function stubFinding(check: "dynamic" | "inference", why: string): Finding[] {
+function stubFinding(check: "dynamic" | "inference" | "kimi_architect", why: string): Finding[] {
  return [{ check, severity: "info", summary: `${check} check skipped — ${why}`, evidence: [why] }];
 }
--- a/auditor/checks/kimi_architect.ts
+++ b/auditor/checks/kimi_architect.ts
@ -0,0 +1,288 @@
+// Kimi-architect check — second-pass senior architectural review using
+// kimi-for-coding (Kimi K2.6) via /v1/chat provider=kimi.
+//
+// Runs AFTER the deepseek inference check (N=3 consensus) and the
+// static/kb_query checks. Reads their findings as context and asks Kimi
+// "what did everyone else miss?" — complementing the cheap-consensus
+// voting with a sparse senior pass that catches load-bearing issues
+// (compile errors, false telemetry, schema bypasses, etc.) which the
+// voting structure can't see.
+//
+// Why Kimi here and not in the inner inference loop:
+// - Cost: ~3min wall-clock per call vs ~30s for deepseek consensus.
+// - TOS: api.kimi.com is User-Agent-gated (see crates/gateway/src/v1/
+//   kimi.rs); cost-bounded calls only.
+// - Value: experiment 2026-04-27 showed 7/7 grounding rate with full
+//   files vs ~50% on truncated input. Best as a sparse complement, not
+//   a replacement.
+//
+// Failure-isolated: any Kimi error returns a single info-level Finding
+// "kimi_architect skipped — <reason>" so the existing audit pipeline
+// is never blocked by a Kimi outage / TOS revocation / 429.
+//
+// Cost cap: if a kimi_verdicts/<pr>-<sha>.json file exists less than 24h
+// old, return cached findings without calling upstream. New commits
+// produce new SHAs so this is per-head, not per-day.
+//
+// Off by default: caller checks LH_AUDITOR_KIMI=1 before invoking.
+
+import { readFile, writeFile, mkdir, appendFile, stat } from "node:fs/promises";
+import { existsSync, readFileSync } from "node:fs";
+import { join, resolve } from "node:path";
+import type { Finding, CheckKind } from "../types.ts";
+
+// Gateway base URL; the chat endpoint is reached at `${GATEWAY}/v1/chat`.
+const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
+// Directory holding one cached verdict file per (PR, head SHA).
+const KIMI_VERDICTS_DIR = "/home/profit/lakehouse/data/_auditor/kimi_verdicts";
+// Append-only metrics log: one JSON row per upstream call.
+const KIMI_AUDITS_JSONL = "/home/profit/lakehouse/data/_kb/kimi_audits.jsonl";
+// Root against which relative file citations are resolved for grounding.
+const REPO_ROOT = "/home/profit/lakehouse";
+const CALL_TIMEOUT_MS = 360_000; // 6min — kimi reasoning + audit prompt
+const CACHE_TTL_MS = 24 * 60 * 60 * 1000;
+// Diffs longer than this are cut before being embedded in the prompt.
+const MAX_DIFF_CHARS = 180_000;
+// Upper bound on prior (non-info) findings listed in the prompt.
+const MAX_PRIOR_FINDINGS = 50;
+const KIMI_MODEL = process.env.LH_AUDITOR_KIMI_MODEL ?? "kimi-for-coding";
+// Completion budget sent upstream as max_tokens. An unset, empty,
+// non-numeric, or non-positive LH_AUDITOR_KIMI_MAX_TOKENS falls back to the
+// default: Number("") is 0 and NaN serializes to null in JSON, either of
+// which would make the request invalid.
+const DEFAULT_MAX_TOKENS = 12_000;
+const requestedMaxTokens = Number(process.env.LH_AUDITOR_KIMI_MAX_TOKENS);
+const MAX_TOKENS = Number.isFinite(requestedMaxTokens) && requestedMaxTokens >= 1
+  ? Math.floor(requestedMaxTokens)
+  : DEFAULT_MAX_TOKENS;
+
+// Identity of the PR head being reviewed. Both fields are interpolated into
+// the prompt and into the cache file name.
+export interface KimiArchitectContext {
+  // PR number; first component of the cache file name.
+  pr_number: number;
+  // Head commit SHA; only its first 12 characters are used in the cache
+  // file name and the prompt, the full value is stored in the verdict.
+  head_sha: string;
+}
+
+// On-disk shape of a cached verdict (one JSON file per PR head). Also the
+// source record from which the metrics row is derived.
+interface KimiVerdictFile {
+  pr_number: number;
+  head_sha: string;
+  // ISO-8601 timestamp taken when the verdict object was built.
+  cached_at: string;
+  // Model name that was requested from the gateway.
+  model: string;
+  // Wall-clock duration of the upstream call, in milliseconds.
+  latency_ms: number;
+  // finish_reason of the first choice, or "unknown" when absent.
+  finish_reason: string;
+  // Token counts from the response; each defaults to 0 when not reported.
+  usage: { prompt_tokens: number; completion_tokens: number; total_tokens: number };
+  // Unparsed text content of the first choice.
+  raw_content: string;
+  // Findings parsed from raw_content, including grounding annotations.
+  findings: Finding[];
+  // total = number of findings, verified = citations that passed the
+  // grounding check, rate = verified / total (0 when total is 0).
+  grounding: { total: number; verified: number; rate: number };
+}
+
+// Run the Kimi second-pass review for one PR head.
+//
+// Parameters:
+//   diff           unified diff of the PR (truncated inside the prompt if large)
+//   priorFindings  findings from the earlier checks, given to Kimi as context
+//   ctx            PR number + head SHA; also keys the on-disk cache
+//
+// Returns the parsed findings, or a single info-level finding when the
+// result is empty, was skipped, or came from an empty cached verdict.
+//
+// Never throws for upstream, parsing, or persistence problems: each stage
+// is isolated so a Kimi outage or a disk error cannot discard findings or
+// block the surrounding audit.
+export async function runKimiArchitectCheck(
+  diff: string,
+  priorFindings: Finding[],
+  ctx: KimiArchitectContext,
+): Promise<Finding[]> {
+  const cachePath = join(KIMI_VERDICTS_DIR, `${ctx.pr_number}-${ctx.head_sha.slice(0, 12)}.json`);
+  const describe = (e: unknown): string => (e instanceof Error ? e.message : String(e));
+
+  // Cost cap — return cached findings if a verdict for this exact head
+  // SHA was generated within the TTL. A cache file without a findings
+  // array is treated as a miss rather than dereferenced.
+  const cached = await loadCachedVerdict(cachePath);
+  if (cached && Array.isArray(cached.findings)) {
+    return cached.findings.length > 0
+      ? cached.findings
+      : [{ check: "kimi_architect" as CheckKind, severity: "info", summary: "kimi_architect cached — 0 findings", evidence: [`cache: ${cachePath}`] }];
+  }
+
+  let response: { content: string; usage: any; finish_reason: string; latency_ms: number };
+  try {
+    response = await callKimi(buildPrompt(diff, priorFindings, ctx));
+  } catch (e) {
+    return [skipFinding(`kimi call failed: ${describe(e).slice(0, 200)}`)];
+  }
+
+  // Parsing and grounding touch model output and the filesystem; a failure
+  // here is reported as a skip instead of escaping to the caller.
+  let findings: Finding[];
+  let grounding: { total: number; verified: number; rate: number };
+  try {
+    findings = parseFindings(response.content);
+    grounding = await computeGrounding(findings);
+  } catch (e) {
+    return [skipFinding(`kimi response processing failed: ${describe(e).slice(0, 200)}`)];
+  }
+
+  const verdict: KimiVerdictFile = {
+    pr_number: ctx.pr_number,
+    head_sha: ctx.head_sha,
+    cached_at: new Date().toISOString(),
+    model: KIMI_MODEL,
+    latency_ms: response.latency_ms,
+    finish_reason: response.finish_reason,
+    usage: {
+      prompt_tokens: response.usage?.prompt_tokens ?? 0,
+      completion_tokens: response.usage?.completion_tokens ?? 0,
+      total_tokens: response.usage?.total_tokens ?? 0,
+    },
+    raw_content: response.content,
+    findings,
+    grounding,
+  };
+
+  // Persistence is best-effort. The two writes are attempted independently
+  // so a cache-write failure does not also lose the metrics row, and neither
+  // failure drops the findings that were already paid for.
+  const persistErrors: string[] = [];
+  const writes: Array<[string, () => Promise<void>]> = [
+    ["verdict cache", () => persistVerdict(cachePath, verdict)],
+    ["metrics row", () => appendMetrics(verdict)],
+  ];
+  for (const [label, write] of writes) {
+    try {
+      await write();
+    } catch (e) {
+      persistErrors.push(`${label}: ${describe(e).slice(0, 200)}`);
+    }
+  }
+
+  const result: Finding[] = findings.length > 0
+    ? [...findings]
+    : [{
+        check: "kimi_architect" as CheckKind,
+        severity: "info",
+        summary: `kimi_architect produced 0 ranked findings (${response.finish_reason}, ${verdict.usage.completion_tokens} tokens)`,
+        evidence: [`raw response: ${cachePath}`],
+      }];
+  if (persistErrors.length > 0) {
+    result.push({
+      check: "kimi_architect" as CheckKind,
+      severity: "info",
+      summary: "kimi_architect persistence failed — verdict may not be cached",
+      evidence: persistErrors,
+    });
+  }
+  return result;
+}
+
+// Load a cached verdict if one exists, is younger than CACHE_TTL_MS (by
+// file mtime), and has the minimum shape callers rely on.
+//
+// Returns null on any miss: missing file, expired file, unreadable file,
+// invalid JSON, or JSON that is not an object with a `findings` array.
+// Never throws.
+async function loadCachedVerdict(path: string): Promise<KimiVerdictFile | null> {
+  try {
+    // stat() rejects with ENOENT for a missing file, which the catch below
+    // turns into a miss — no separate existence probe is needed.
+    const s = await stat(path);
+    if (Date.now() - s.mtimeMs > CACHE_TTL_MS) return null;
+    const parsed: unknown = JSON.parse(await readFile(path, "utf8"));
+    if (typeof parsed !== "object" || parsed === null) return null;
+    // A truncated or hand-edited file must not reach callers that
+    // dereference findings.
+    if (!Array.isArray((parsed as { findings?: unknown }).findings)) return null;
+    return parsed as KimiVerdictFile;
+  } catch { return null; }
+}
+
+// Assemble the single user message sent to Kimi: role framing, grounding
+// rules, the earlier checks' non-info findings (capped), the required
+// output shape, and the diff itself (cut at MAX_DIFF_CHARS with a marker).
+function buildPrompt(diff: string, priorFindings: Finding[], ctx: KimiArchitectContext): string {
+  let truncatedDiff = diff;
+  if (diff.length > MAX_DIFF_CHARS) {
+    truncatedDiff = diff.slice(0, MAX_DIFF_CHARS) + `\n\n... [truncated; original diff was ${diff.length} chars]`;
+  }
+
+  // Info-level findings are noise for this purpose; keep the first
+  // MAX_PRIOR_FINDINGS of the rest, one bullet each.
+  const bullets: string[] = [];
+  for (const prior of priorFindings) {
+    if (prior.severity === "info") continue;
+    if (bullets.length >= MAX_PRIOR_FINDINGS) break;
+    const firstEvidence = prior.evidence?.[0];
+    const suffix = firstEvidence ? ` — ${firstEvidence.slice(0, 160)}` : "";
+    bullets.push(`- [${prior.check}/${prior.severity}] ${prior.summary}${suffix}`);
+  }
+  const priorBlock = bullets.join("\n");
+
+  return `You are a senior software architect doing a second-pass review on PR #${ctx.pr_number} (head ${ctx.head_sha.slice(0, 12)}). The team's automated auditor (deepseek-v3.1:671b, N=3 consensus) already produced findings. Your job is NOT to repeat what they found — your job is to catch what their voting structure CAN'T see: compile errors, type-system bypasses, false telemetry, silent determinism leaks, schema-bypass anti-patterns, load-bearing assumptions that look fine line-by-line.
+
+GROUNDING RULES (non-negotiable):
+- Cite file:line for EVERY finding. Lines you cite must actually contain what you claim. Confabulating a finding wastes more time than missing one.
+- If the diff is truncated and you can't verify a claim, say "diff-truncated, can't verify" — DO NOT guess.
+- Distinguish architectural concerns (no specific line) from concrete bugs (specific line). Don't dress one as the other.
+
+PRIOR FINDINGS FROM DEEPSEEK CONSENSUS (do not repeat these):
+${priorBlock || "(none)"}
+
+OUTPUT FORMAT (markdown):
+- ## Verdict (one sentence)
+- ## Findings (5-10 items, each formatted EXACTLY as below)
+
+For each finding use this exact shape so a parser can lift them:
+
+### F1: <one-line summary>
+- **Severity:** block | warn | info
+- **File:** path/to/file.ext:LINE
+- **Rationale:** one or two sentences
+
+THE DIFF:
+
+${truncatedDiff}
+`;
+}
+
+// POST the prompt to the gateway's /v1/chat endpoint with provider "kimi"
+// and return the first choice's text plus usage, finish reason and measured
+// latency. The request is aborted after CALL_TIMEOUT_MS. Throws on a
+// non-2xx response (status + first 300 chars of the body), on abort, or on
+// a network / JSON error.
+async function callKimi(prompt: string): Promise<{ content: string; usage: any; finish_reason: string; latency_ms: number }> {
+  const startedAt = Date.now();
+  const aborter = new AbortController();
+  const timeoutHandle = setTimeout(() => aborter.abort(), CALL_TIMEOUT_MS);
+  try {
+    const payload = {
+      provider: "kimi",
+      model: KIMI_MODEL,
+      messages: [{ role: "user", content: prompt }],
+      max_tokens: MAX_TOKENS,
+      temperature: 0.2,
+    };
+    const res = await fetch(`${GATEWAY}/v1/chat`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify(payload),
+      signal: aborter.signal,
+    });
+    if (!res.ok) {
+      const errText = await res.text();
+      throw new Error(`/v1/chat ${res.status}: ${errText.slice(0, 300)}`);
+    }
+    const parsed: any = await res.json();
+    const firstChoice = parsed.choices?.[0];
+    return {
+      content: firstChoice?.message?.content ?? "",
+      usage: parsed.usage ?? {},
+      finish_reason: firstChoice?.finish_reason ?? "unknown",
+      latency_ms: Date.now() - startedAt,
+    };
+  } finally {
+    // Always release the timer so a fast response does not leave it pending.
+    clearTimeout(timeoutHandle);
+  }
+}
+
+// Parse Kimi's markdown into Finding[]. Format expected (per buildPrompt):
+//   ### F<N>: <summary>
+//   - **Severity:** block | warn | info
+//   - **File:** path:line
+//   - **Rationale:** ...
+//
+// Tolerances for common model formatting drift:
+//   - the colon may sit inside or outside the bold markers
+//     (`**File:**` or `**File**:`);
+//   - the cited path may be wrapped in backticks, quotes or brackets and
+//     may carry trailing punctuation; these are stripped so the citation
+//     can be resolved on disk.
+//
+// Text before the first "### F<N>:" heading is ignored. A block with an
+// empty summary line is skipped. Missing or unrecognised severity maps to
+// "info"; a missing file citation becomes "unknown". Summary is capped at
+// 240 chars and rationale at 360.
+function parseFindings(content: string): Finding[] {
+  const findings: Finding[] = [];
+  const blocks = content.split(/^###\s+F\d+:\s*/m).slice(1);
+  for (const block of blocks) {
+    const summary = (block.split("\n")[0] ?? "").trim();
+    if (!summary) continue;
+    const sev = /\*\*Severity:?\*\*:?\s*(block|warn|info)/i.exec(block)?.[1]?.toLowerCase();
+    const rawCite = /\*\*File:?\*\*:?\s*(\S+)/i.exec(block)?.[1] ?? "unknown";
+    // Strip markdown/quote wrappers such as `path:12` or "path:12",
+    // which would otherwise become part of the path.
+    const fileLine = rawCite
+      .replace(/^[`"'(\[<]+/, "")
+      .replace(/[`"')\]>,.;]+$/, "") || "unknown";
+    const rationale = /\*\*Rationale:?\*\*:?\s*([\s\S]+?)(?=\n###|\n\*\*|$)/i.exec(block)?.[1]?.trim() ?? "";
+    const severity: Finding["severity"] = sev === "block" ? "block" : sev === "warn" ? "warn" : "info";
+    findings.push({
+      check: "kimi_architect" as CheckKind,
+      severity,
+      summary: summary.slice(0, 240),
+      evidence: [fileLine, rationale.slice(0, 360)].filter(Boolean),
+    });
+  }
+  return findings;
+}
+
+// For each finding's cited file:line, check that the file exists inside
+// the repo checkout and that the cited line number falls within it. This
+// is an existence check only — the line's content is not compared with the
+// finding's claim.
+//
+// Side effect: appends one "[grounding: ...]" marker to the evidence array
+// of every finding whose first evidence entry parses as path:line, so the
+// reader can see which citations were verified. Findings without a
+// parseable citation are left untouched but still count towards `total`.
+//
+// Returns total (all findings), verified (citations that passed), and
+// rate = verified / total (0 when there are no findings).
+async function computeGrounding(findings: Finding[]): Promise<{ total: number; verified: number; rate: number }> {
+  let verified = 0;
+  for (const f of findings) {
+    const cite = f.evidence[0] ?? "";
+    const m = /^(\S+?):(\d+)/.exec(cite);
+    if (!m) continue;
+    const [, relpath, lineStr] = m;
+    const line = Number(lineStr);
+    if (!line || !relpath) continue;
+    // resolve() keeps an absolute relpath as-is and collapses "..", so one
+    // containment test covers both escape routes. The path comes from model
+    // output and must not be allowed to probe files outside the checkout.
+    const abs = resolve(REPO_ROOT, relpath);
+    if (abs !== REPO_ROOT && !abs.startsWith(`${REPO_ROOT}/`)) {
+      f.evidence.push("[grounding: path outside repo]");
+      continue;
+    }
+    if (!existsSync(abs)) {
+      f.evidence.push("[grounding: file not found]");
+      continue;
+    }
+    try {
+      const text = await readFile(abs, "utf8");
+      // split("\n") yields one extra empty element for a file that ends
+      // with a newline; don't count it as a line.
+      const pieces = text.split("\n").length;
+      const lineCount = text.length === 0 ? 0 : text.endsWith("\n") ? pieces - 1 : pieces;
+      if (line > lineCount) {
+        f.evidence.push(`[grounding: line ${line} > EOF (${lineCount})]`);
+        continue;
+      }
+      f.evidence.push(`[grounding: verified at ${relpath}:${line}]`);
+      verified++;
+    } catch (e) {
+      // e.g. the path is a directory or is not readable.
+      f.evidence.push(`[grounding: read failed: ${(e as Error).message.slice(0, 80)}]`);
+    }
+  }
+  const total = findings.length;
+  return { total, verified, rate: total === 0 ? 0 : verified / total };
+}
+
+// Write the verdict as pretty-printed JSON (2-space indent) to `path`,
+// creating the verdicts directory first if it does not exist.
+async function persistVerdict(path: string, v: KimiVerdictFile): Promise<void> {
+  await mkdir(KIMI_VERDICTS_DIR, { recursive: true });
+  const serialized = JSON.stringify(v, null, 2);
+  await writeFile(path, serialized);
+}
+
+// Append one JSON line summarising this verdict (identity, model, latency,
+// token counts, finding counts by severity, grounding stats) to the audits
+// log, creating its parent directory if needed.
+async function appendMetrics(v: KimiVerdictFile): Promise<void> {
+  const parentDir = join(KIMI_AUDITS_JSONL, "..");
+  await mkdir(parentDir, { recursive: true });
+
+  // Tally block/warn findings in a single pass.
+  let blockCount = 0;
+  let warnCount = 0;
+  for (const finding of v.findings) {
+    if (finding.severity === "block") blockCount += 1;
+    else if (finding.severity === "warn") warnCount += 1;
+  }
+
+  const row = {
+    pr_number: v.pr_number,
+    head_sha: v.head_sha,
+    audited_at: v.cached_at,
+    model: v.model,
+    latency_ms: v.latency_ms,
+    finish_reason: v.finish_reason,
+    prompt_tokens: v.usage.prompt_tokens,
+    completion_tokens: v.usage.completion_tokens,
+    findings_total: v.findings.length,
+    findings_block: blockCount,
+    findings_warn: warnCount,
+    grounding_verified: v.grounding.verified,
+    // Rounded to three decimals to keep the log compact.
+    grounding_rate: Number(v.grounding.rate.toFixed(3)),
+  };
+  await appendFile(KIMI_AUDITS_JSONL, JSON.stringify(row) + "\n");
+}
+
+// Build the single info-level finding reported when the Kimi pass is
+// skipped; `why` appears in both the summary and the evidence.
+function skipFinding(why: string): Finding {
+  const check = "kimi_architect" as CheckKind;
+  const summary = `kimi_architect skipped — ${why}`;
+  return { check, severity: "info", summary, evidence: [why] };
+}
--- a/auditor/types.ts
+++ b/auditor/types.ts
@ -2,7 +2,7 @@
 // if something can't be verified from a check, it goes into `evidence`
 // so the verdict is inspectable, not a black box.

-export type CheckKind = "static" | "dynamic" | "inference" | "kb_query";
+export type CheckKind = "static" | "dynamic" | "inference" | "kb_query" | "kimi_architect";

 export type Severity = "info" | "warn" | "block";