auditor: route kimi_architect through ollama_cloud/kimi-k2.6 (TOS-clean primary)
Four changes:
1. Default provider now ollama_cloud/kimi-k2.6 (env-overridable via
LH_AUDITOR_KIMI_PROVIDER + LH_AUDITOR_KIMI_MODEL). Ollama Cloud Pro
exposes kimi-k2.6 legitimately, so we no longer need the User-Agent-
spoof path through api.kimi.com. Smoke test 2026-04-27:
api.kimi.com 368s 8 findings 8/8 grounded
ollama_cloud 54s 10 findings 10/10 grounded
The kimi.rs adapter (provider=kimi) stays wired as a fallback when
Ollama Cloud is upstream-broken.
2. Switch HTTP transport from Bun's native fetch to curl via Bun.spawn.
Bun fetch has an undocumented ~300s ceiling that AbortController +
setTimeout cannot override; curl honors -m for end-to-end max
transfer time without a hard intrinsic limit. Required for Kimi's
reasoning-heavy responses on big audit prompts.
3. Bug fix Kimi caught in this very file (turtles all the way down):
Number(process.env.LH_AUDITOR_KIMI_MAX_TOKENS ?? 128_000) yields 0
when env is set to empty string — `??` only catches null/undefined.
Switched to Number(env) || 128_000 so empty/0/NaN all fall back.
Same pattern probably exists in other files; future audit pass.
4. Bumped MAX_TOKENS default 12K -> 128K. Kimi K2.6's reasoning_content
counts against this budget but isn't surfaced in OpenAI-shape content;
12K silently produced finish_reason=length with empty content when
reasoning consumed the budget.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8d02c7f441
commit
3eaac413e6
@ -35,12 +35,33 @@ const GATEWAY = process.env.LH_GATEWAY_URL ?? "http://localhost:3100";
|
||||
const KIMI_VERDICTS_DIR = "/home/profit/lakehouse/data/_auditor/kimi_verdicts";
|
||||
const KIMI_AUDITS_JSONL = "/home/profit/lakehouse/data/_kb/kimi_audits.jsonl";
|
||||
const REPO_ROOT = "/home/profit/lakehouse";
|
||||
const CALL_TIMEOUT_MS = 360_000; // 6min — kimi reasoning + audit prompt
|
||||
// 15 min budget. Bun's fetch has an intrinsic ~300s limit that our
|
||||
// AbortController + setTimeout combo could not override; we use curl
|
||||
// via Bun.spawn instead (callKimi below). Curl honors -m for max
|
||||
// transfer time without a hard intrinsic ceiling.
|
||||
const CALL_TIMEOUT_MS = 900_000;
|
||||
const CACHE_TTL_MS = 24 * 60 * 60 * 1000;
|
||||
const MAX_DIFF_CHARS = 180_000;
|
||||
const MAX_PRIOR_FINDINGS = 50;
|
||||
const KIMI_MODEL = process.env.LH_AUDITOR_KIMI_MODEL ?? "kimi-for-coding";
|
||||
const MAX_TOKENS = Number(process.env.LH_AUDITOR_KIMI_MAX_TOKENS ?? 12_000);
|
||||
// Default provider/model = ollama_cloud/kimi-k2.6. Pre-2026-04-27 we
|
||||
// went direct to api.kimi.com, but Ollama Cloud Pro now exposes the
|
||||
// same model legitimately, so we route there to avoid User-Agent
|
||||
// gating. The api.kimi.com path (provider=kimi) remains wired in the
|
||||
// gateway as a fallback for when Ollama Cloud is upstream-broken.
|
||||
const KIMI_PROVIDER = process.env.LH_AUDITOR_KIMI_PROVIDER ?? "ollama_cloud";
|
||||
const KIMI_MODEL = process.env.LH_AUDITOR_KIMI_MODEL ?? "kimi-k2.6";
|
||||
// 128K — Kimi K2.6 emits reasoning_content that counts against this
|
||||
// budget but isn't surfaced in the OpenAI-shape `content` field.
|
||||
// Capping low silently produces empty content with finish_reason=length
|
||||
// when reasoning consumes the budget. 128K leaves ample room for both
|
||||
// reasoning and visible findings on any audit prompt we throw at it.
|
||||
// Override via LH_AUDITOR_KIMI_MAX_TOKENS only if you want to cap cost.
|
||||
//
|
||||
// Bug fix 2026-04-27 (caught by Kimi's own self-audit): empty env var
|
||||
// like LH_AUDITOR_KIMI_MAX_TOKENS="" used to parse via Number("") → 0
|
||||
// because `??` only catches null/undefined. Use `||` so empty string,
|
||||
// 0, or NaN all fall back to the default.
|
||||
const MAX_TOKENS = Number(process.env.LH_AUDITOR_KIMI_MAX_TOKENS) || 128_000;
|
||||
|
||||
export interface KimiArchitectContext {
|
||||
pr_number: number;
|
||||
@ -166,33 +187,52 @@ ${truncatedDiff}
|
||||
|
||||
async function callKimi(prompt: string): Promise<{ content: string; usage: any; finish_reason: string; latency_ms: number }> {
|
||||
const t0 = Date.now();
|
||||
const ctrl = new AbortController();
|
||||
const timer = setTimeout(() => ctrl.abort(), CALL_TIMEOUT_MS);
|
||||
try {
|
||||
const r = await fetch(`${GATEWAY}/v1/chat`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
provider: "kimi",
|
||||
model: KIMI_MODEL,
|
||||
messages: [{ role: "user", content: prompt }],
|
||||
max_tokens: MAX_TOKENS,
|
||||
temperature: 0.2,
|
||||
}),
|
||||
signal: ctrl.signal,
|
||||
});
|
||||
if (!r.ok) {
|
||||
const body = await r.text();
|
||||
throw new Error(`/v1/chat ${r.status}: ${body.slice(0, 300)}`);
|
||||
}
|
||||
const j: any = await r.json();
|
||||
return {
|
||||
content: j.choices?.[0]?.message?.content ?? "",
|
||||
usage: j.usage ?? {},
|
||||
finish_reason: j.choices?.[0]?.finish_reason ?? "unknown",
|
||||
latency_ms: Date.now() - t0,
|
||||
};
|
||||
} finally { clearTimeout(timer); }
|
||||
const body = JSON.stringify({
|
||||
provider: KIMI_PROVIDER,
|
||||
model: KIMI_MODEL,
|
||||
messages: [{ role: "user", content: prompt }],
|
||||
max_tokens: MAX_TOKENS,
|
||||
temperature: 0.2,
|
||||
});
|
||||
// curl via Bun.spawn — bypasses Bun fetch's ~300s intrinsic ceiling.
|
||||
// -m sets the max transfer time honored end-to-end. Body is piped via
|
||||
// stdin to avoid argv length limits on big audit prompts (~50K+ tokens).
|
||||
const proc = Bun.spawn({
|
||||
cmd: [
|
||||
"curl", "-sS", "-X", "POST",
|
||||
"-m", String(Math.ceil(CALL_TIMEOUT_MS / 1000)),
|
||||
"-H", "content-type: application/json",
|
||||
"--data-binary", "@-",
|
||||
`${GATEWAY}/v1/chat`,
|
||||
],
|
||||
stdin: "pipe",
|
||||
stdout: "pipe",
|
||||
stderr: "pipe",
|
||||
});
|
||||
proc.stdin.write(body);
|
||||
await proc.stdin.end();
|
||||
const [stdout, stderr, exitCode] = await Promise.all([
|
||||
new Response(proc.stdout).text(),
|
||||
new Response(proc.stderr).text(),
|
||||
proc.exited,
|
||||
]);
|
||||
if (exitCode !== 0) {
|
||||
throw new Error(`curl exit ${exitCode}: ${stderr.slice(0, 300)}`);
|
||||
}
|
||||
let j: any;
|
||||
try { j = JSON.parse(stdout); }
|
||||
catch (e) {
|
||||
throw new Error(`bad response (${stdout.length} bytes): ${stdout.slice(0, 300)}`);
|
||||
}
|
||||
if (j.error || !j.choices) {
|
||||
throw new Error(`gateway error: ${JSON.stringify(j).slice(0, 300)}`);
|
||||
}
|
||||
return {
|
||||
content: j.choices?.[0]?.message?.content ?? "",
|
||||
usage: j.usage ?? {},
|
||||
finish_reason: j.choices?.[0]?.finish_reason ?? "unknown",
|
||||
latency_ms: Date.now() - t0,
|
||||
};
|
||||
}
|
||||
|
||||
// Parse Kimi's markdown into Finding[]. Format expected (per buildPrompt):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user