auditor: route kimi_architect through ollama_cloud/kimi-k2.6 (TOS-clean primary)
Four changes:
1. Default provider now ollama_cloud/kimi-k2.6 (env-overridable via
LH_AUDITOR_KIMI_PROVIDER + LH_AUDITOR_KIMI_MODEL). Ollama Cloud Pro
exposes kimi-k2.6 legitimately, so we no longer need the User-Agent-
spoof path through api.kimi.com. Smoke test 2026-04-27:
  api.kimi.com  368s   8 findings   8/8 grounded
  ollama_cloud   54s  10 findings  10/10 grounded
The kimi.rs adapter (provider=kimi) stays wired as a fallback when
Ollama Cloud is upstream-broken.
2. Switched HTTP transport from Bun's native fetch to curl via Bun.spawn.
Bun fetch has an undocumented ~300s ceiling that AbortController +
setTimeout cannot override; curl honors -m for end-to-end max
transfer time without a hard intrinsic limit. Required for Kimi's
reasoning-heavy responses on big audit prompts.
3. Bug fix Kimi caught in this very file (turtles all the way down):
Number(process.env.LH_AUDITOR_KIMI_MAX_TOKENS ?? 128_000) yields 0
when the env var is set to an empty string, because `??` only catches
null/undefined. Switched to Number(env) || 128_000 so empty string, 0,
and NaN all fall back (see the sketch after this list). The same
pattern probably exists in other files; future audit pass.
4. Bumped MAX_TOKENS default 12K -> 128K. Kimi K2.6's reasoning_content
counts against this budget but isn't surfaced in OpenAI-shape content;
12K silently produced finish_reason=length with empty content when
reasoning consumed the budget.
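For reference, a minimal sketch of the parsing pitfall from change 3
(plain JS/Bun number coercion, no project code assumed):

    // `??` only falls back on null/undefined, so an empty env var slips
    // through and Number("") coerces to 0:
    const raw = "";                         // e.g. LH_AUDITOR_KIMI_MAX_TOKENS=""
    const broken = Number(raw ?? 128_000);  // -> 0
    const fixed = Number(raw) || 128_000;   // "" -> 0 -> falls back to 128_000
    // `||` also catches an explicit "0" and non-numeric values (NaN).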
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
parent 8d02c7f441
commit 3eaac413e6
@@ -35,12 +35,33 @@ const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
 const KIMI_VERDICTS_DIR = "/home/profit/lakehouse/data/_auditor/kimi_verdicts";
 const KIMI_AUDITS_JSONL = "/home/profit/lakehouse/data/_kb/kimi_audits.jsonl";
 const REPO_ROOT = "/home/profit/lakehouse";
-const CALL_TIMEOUT_MS = 360_000; // 6min — kimi reasoning + audit prompt
+// 15 min budget. Bun's fetch has an intrinsic ~300s limit that our
+// AbortController + setTimeout combo could not override; we use curl
+// via Bun.spawn instead (callKimi below). Curl honors -m for max
+// transfer time without a hard intrinsic ceiling.
+const CALL_TIMEOUT_MS = 900_000;
 const CACHE_TTL_MS = 24 * 60 * 60 * 1000;
 const MAX_DIFF_CHARS = 180_000;
 const MAX_PRIOR_FINDINGS = 50;
-const KIMI_MODEL = process.env.LH_AUDITOR_KIMI_MODEL ?? "kimi-for-coding";
-const MAX_TOKENS = Number(process.env.LH_AUDITOR_KIMI_MAX_TOKENS ?? 12_000);
+// Default provider/model = ollama_cloud/kimi-k2.6. Pre-2026-04-27 we
+// went direct to api.kimi.com, but Ollama Cloud Pro now exposes the
+// same model legitimately, so we route there to avoid User-Agent
+// gating. The api.kimi.com path (provider=kimi) remains wired in the
+// gateway as a fallback for when Ollama Cloud is upstream-broken.
+const KIMI_PROVIDER = process.env.LH_AUDITOR_KIMI_PROVIDER ?? "ollama_cloud";
+const KIMI_MODEL = process.env.LH_AUDITOR_KIMI_MODEL ?? "kimi-k2.6";
+// 128K — Kimi K2.6 emits reasoning_content that counts against this
+// budget but isn't surfaced in the OpenAI-shape `content` field.
+// Capping low silently produces empty content with finish_reason=length
+// when reasoning consumes the budget. 128K leaves ample room for both
+// reasoning and visible findings on any audit prompt we throw at it.
+// Override via LH_AUDITOR_KIMI_MAX_TOKENS only if you want to cap cost.
+//
+// Bug fix 2026-04-27 (caught by Kimi's own self-audit): empty env var
+// like LH_AUDITOR_KIMI_MAX_TOKENS="" used to parse via Number("") → 0
+// because `??` only catches null/undefined. Use `||` so empty string,
+// 0, or NaN all fall back to the default.
+const MAX_TOKENS = Number(process.env.LH_AUDITOR_KIMI_MAX_TOKENS) || 128_000;
 
 export interface KimiArchitectContext {
   pr_number: number;
@@ -166,33 +187,52 @@ ${truncatedDiff}
 
 async function callKimi(prompt: string): Promise<{ content: string; usage: any; finish_reason: string; latency_ms: number }> {
   const t0 = Date.now();
-  const ctrl = new AbortController();
-  const timer = setTimeout(() => ctrl.abort(), CALL_TIMEOUT_MS);
-  try {
-    const r = await fetch(`${GATEWAY}/v1/chat`, {
-      method: "POST",
-      headers: { "content-type": "application/json" },
-      body: JSON.stringify({
-        provider: "kimi",
-        model: KIMI_MODEL,
-        messages: [{ role: "user", content: prompt }],
-        max_tokens: MAX_TOKENS,
-        temperature: 0.2,
-      }),
-      signal: ctrl.signal,
-    });
-    if (!r.ok) {
-      const body = await r.text();
-      throw new Error(`/v1/chat ${r.status}: ${body.slice(0, 300)}`);
-    }
-    const j: any = await r.json();
-    return {
-      content: j.choices?.[0]?.message?.content ?? "",
-      usage: j.usage ?? {},
-      finish_reason: j.choices?.[0]?.finish_reason ?? "unknown",
-      latency_ms: Date.now() - t0,
-    };
-  } finally { clearTimeout(timer); }
+  const body = JSON.stringify({
+    provider: KIMI_PROVIDER,
+    model: KIMI_MODEL,
+    messages: [{ role: "user", content: prompt }],
+    max_tokens: MAX_TOKENS,
+    temperature: 0.2,
+  });
+  // curl via Bun.spawn — bypasses Bun fetch's ~300s intrinsic ceiling.
+  // -m sets the max transfer time honored end-to-end. Body is piped via
+  // stdin to avoid argv length limits on big audit prompts (~50K+ tokens).
+  const proc = Bun.spawn({
+    cmd: [
+      "curl", "-sS", "-X", "POST",
+      "-m", String(Math.ceil(CALL_TIMEOUT_MS / 1000)),
+      "-H", "content-type: application/json",
+      "--data-binary", "@-",
+      `${GATEWAY}/v1/chat`,
+    ],
+    stdin: "pipe",
+    stdout: "pipe",
+    stderr: "pipe",
+  });
+  proc.stdin.write(body);
+  await proc.stdin.end();
+  const [stdout, stderr, exitCode] = await Promise.all([
+    new Response(proc.stdout).text(),
+    new Response(proc.stderr).text(),
+    proc.exited,
+  ]);
+  if (exitCode !== 0) {
+    throw new Error(`curl exit ${exitCode}: ${stderr.slice(0, 300)}`);
+  }
+  let j: any;
+  try { j = JSON.parse(stdout); }
+  catch (e) {
+    throw new Error(`bad response (${stdout.length} bytes): ${stdout.slice(0, 300)}`);
+  }
+  if (j.error || !j.choices) {
+    throw new Error(`gateway error: ${JSON.stringify(j).slice(0, 300)}`);
+  }
+  return {
+    content: j.choices?.[0]?.message?.content ?? "",
+    usage: j.usage ?? {},
+    finish_reason: j.choices?.[0]?.finish_reason ?? "unknown",
+    latency_ms: Date.now() - t0,
+  };
 }
 
 // Parse Kimi's markdown into Finding[]. Format expected (per buildPrompt):
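For illustration only, a caller-side guard for the truncation mode
described in change 4, assuming the { content, finish_reason } shape
callKimi returns above; the helper and its name are a sketch, not part
of this commit:

    // If the model spends the whole max_tokens budget on reasoning_content,
    // the visible content comes back empty with finish_reason === "length".
    function assertVisibleReply(r: { content: string; finish_reason: string }): string {
      if (r.finish_reason === "length" && r.content.trim() === "") {
        throw new Error("kimi reply truncated: raise LH_AUDITOR_KIMI_MAX_TOKENS");
      }
      return r.content;
    }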