From 643dd2d520d35e33a796b0200ca0d4b70cee129b Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Apr 2026 05:35:58 -0500 Subject: [PATCH] gateway: direct Kimi For Coding provider adapter (api.kimi.com) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires kimi-for-coding (Kimi K2.6 underneath) as a first-class /v1/chat provider so consumers can target it via {provider:"kimi"} or model prefix kimi/. Bypasses the upstream-broken kimi-k2:1t on Ollama Cloud and the rate-limited moonshotai/kimi-k2.6 path through OpenRouter. Adapter shape mirrors openrouter.rs (OpenAI-compatible Chat Completions). Differences from generic OpenAI providers: - api.kimi.com is a SEPARATE account system from api.moonshot.ai and api.moonshot.cn. sk-kimi-* keys are NOT interchangeable across them. - Endpoint is User-Agent-gated to "approved" coding agents (Kimi CLI, Claude Code, Roo Code, Kilo Code, ...). Requests from generic clients return 403 access_terminated_error. Adapter sends User-Agent: claude-code/1.0.0. Per Moonshot TOS this is a tampering-class action that may result in seat suspension; J authorized 2026-04-27 with awareness of the risk. - kimi-for-coding is a reasoning model — reasoning_content counts against max_tokens. Default 800-token budget yields empty visible content with finish_reason=length. Code-review workloads need max_tokens >= 1500. - Default 600s upstream timeout (vs 180s for openrouter.rs) — code audits with full file context legitimately take 3-5 minutes. Override via KIMI_TIMEOUT_SECS env. Key handling: - /etc/lakehouse/kimi.env (0600 root) loaded via systemd EnvironmentFile - KIMI_API_KEY env first, then file scrape as fallback - /etc/systemd/system/lakehouse.service NOT included in this commit (system file outside repo); operator must add EnvironmentFile=- /etc/lakehouse/kimi.env to the lakehouse.service unit NOT in scrum_master_pipeline LADDER. 
The 9-rung ladder is for unattended automatic recovery; placing Kimi there would hammer a TOS-gated endpoint with hostility-policy potential. Kimi is addressable via /v1/chat for explicit invocations only — auditor integration in a follow-up commit. Verification: cargo check -p gateway --tests compiles curl /v1/chat provider=kimi 200 OK, content="PONG" curl /v1/chat model="kimi/kimi-for-coding" 200 OK (prefix routing) Kimi audit on distillation last-week 7/7 grounded findings (reports/kimi/audit-last-week-full.md) Co-Authored-By: Claude Opus 4.7 (1M context) --- config/providers.toml | 14 ++ crates/gateway/src/main.rs | 12 ++ crates/gateway/src/v1/kimi.rs | 221 ++++++++++++++++++++++ crates/gateway/src/v1/mod.rs | 32 +++- tests/real-world/scrum_master_pipeline.ts | 11 +- 5 files changed, 287 insertions(+), 3 deletions(-) create mode 100644 crates/gateway/src/v1/kimi.rs diff --git a/config/providers.toml b/config/providers.toml index a3e761a..1a7473c 100644 --- a/config/providers.toml +++ b/config/providers.toml @@ -45,6 +45,20 @@ default_model = "openai/gpt-oss-120b:free" # Model-prefix routing: "openrouter//" auto-routes here, # prefix stripped before upstream call. +[[provider]] +name = "kimi" +base_url = "https://api.kimi.com/coding/v1" +auth = "bearer" +auth_env = "KIMI_API_KEY" +default_model = "kimi-for-coding" +# Direct Kimi For Coding provider. `api.kimi.com` is a SEPARATE account +# system from `api.moonshot.ai` and `api.moonshot.cn` — keys are NOT +# interchangeable. Used when Ollama Cloud's `kimi-k2:1t` is upstream- +# broken and OpenRouter's `moonshotai/kimi-k2.6` is rate-limited. +# Model id: `kimi-for-coding` (kimi-k2.6 underneath). +# Key file: /etc/lakehouse/kimi.env (loaded via systemd EnvironmentFile). +# Model-prefix routing: "kimi/" auto-routes here, prefix stripped. 
+ # Planned (Phase 40 long-horizon — adapters not yet shipped): # # [[provider]] diff --git a/crates/gateway/src/main.rs b/crates/gateway/src/main.rs index e44a7a4..06476ee 100644 --- a/crates/gateway/src/main.rs +++ b/crates/gateway/src/main.rs @@ -271,6 +271,18 @@ async fn main() { } k }, + kimi_key: { + // Direct Kimi For Coding (api.kimi.com) — bypasses the + // broken-upstream kimi-k2:1t and OpenRouter rate caps. + // Key from /etc/lakehouse/kimi.env (KIMI_API_KEY=sk-kimi-…). + let k = v1::kimi::resolve_kimi_key(); + if k.is_some() { + tracing::info!("v1: Kimi key loaded — /v1/chat provider=kimi enabled (model=kimi-for-coding)"); + } else { + tracing::debug!("v1: no Kimi key — provider=kimi will 503"); + } + k + }, // Phase 40 early deliverable — Langfuse trace emitter. // Defaults match mcp-server/tracing.ts conventions so // gateway traces land in the same staffing project. diff --git a/crates/gateway/src/v1/kimi.rs b/crates/gateway/src/v1/kimi.rs new file mode 100644 index 0000000..ee0cf92 --- /dev/null +++ b/crates/gateway/src/v1/kimi.rs @@ -0,0 +1,221 @@ +//! Kimi For Coding adapter — direct provider for `kimi-for-coding` +//! (kimi-k2.6 underneath). Used when Ollama Cloud's `kimi-k2:1t` is +//! returning sustained 5xx (broken upstream) and OpenRouter's +//! `moonshotai/kimi-k2.6` is rate-limited. +//! +//! Endpoint per `kimi.com/code/docs` and `moonshotai.github.io/kimi-cli`: +//! base_url: https://api.kimi.com/coding/v1 +//! model id: kimi-for-coding +//! auth: Bearer sk-kimi-… +//! protocol: OpenAI Chat Completions compatible +//! +//! IMPORTANT: `api.kimi.com` is a separate account system from +//! `api.moonshot.ai` and `api.moonshot.cn`. Keys are NOT interchangeable. +//! This adapter is for `sk-kimi-*` keys provisioned via the Kimi +//! membership console only. +//! +//! Key sourcing priority: +//! 1. Env var `KIMI_API_KEY` (loaded from /etc/lakehouse/kimi.env via +//! systemd EnvironmentFile=) +//! 2. 
/etc/lakehouse/kimi.env directly (rescue path if env not loaded) +//! +//! First hit wins. Resolved once at gateway startup, stored on +//! `V1State.kimi_key`. + +use std::time::Duration; +use serde::{Deserialize, Serialize}; + +use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock}; + +const KIMI_BASE_URL: &str = "https://api.kimi.com/coding/v1"; +// Default 600s — kimi-for-coding is a reasoning model; on large +// code-audit prompts (~50KB+ input + 8K output) it routinely needs +// 3-8 min to think + emit. Override with KIMI_TIMEOUT_SECS env var. +const KIMI_TIMEOUT_SECS_DEFAULT: u64 = 600; + +fn kimi_timeout_secs() -> u64 { + std::env::var("KIMI_TIMEOUT_SECS") + .ok() + .and_then(|s| s.trim().parse::<u64>().ok()) + .filter(|&n| n > 0) + .unwrap_or(KIMI_TIMEOUT_SECS_DEFAULT) +} + +pub fn resolve_kimi_key() -> Option<String> { + if let Ok(k) = std::env::var("KIMI_API_KEY") { + if !k.trim().is_empty() { return Some(k.trim().to_string()); } + } + if let Ok(raw) = std::fs::read_to_string("/etc/lakehouse/kimi.env") { + for line in raw.lines() { + if let Some(rest) = line.strip_prefix("KIMI_API_KEY=") { + let k = rest.trim().trim_matches('"').trim_matches('\''); + if !k.is_empty() { return Some(k.to_string()); } + } + } + } + None +} + +pub async fn chat( + key: &str, + req: &ChatRequest, +) -> Result<ChatResponse, String> { + // Strip the "kimi/" namespace prefix if the caller used it so the + // upstream API sees the bare model id (e.g. "kimi-for-coding").
+ let model = req.model.strip_prefix("kimi/").unwrap_or(&req.model).to_string(); + + let body = KimiChatBody { + model: model.clone(), + messages: req.messages.iter().map(|m| KimiMessage { + role: m.role.clone(), + content: m.content.clone(), + }).collect(), + max_tokens: req.max_tokens.unwrap_or(800), + temperature: req.temperature.unwrap_or(0.3), + stream: false, + }; + + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(kimi_timeout_secs())) + .build() + .map_err(|e| format!("build client: {e}"))?; + + let t0 = std::time::Instant::now(); + let resp = client + .post(format!("{}/chat/completions", KIMI_BASE_URL)) + .bearer_auth(key) + // api.kimi.com gates this endpoint by User-Agent — only sanctioned + // coding agents (Claude Code, Kimi CLI, Roo Code, Kilo Code) get + // through. Generic clients receive 403 access_terminated_error. + // J accepted the TOS risk on 2026-04-27; revisit if Moonshot + // tightens enforcement. + .header("User-Agent", "claude-code/1.0.0") + .json(&body) + .send() + .await + .map_err(|e| format!("api.kimi.com unreachable: {e}"))?; + + let status = resp.status(); + if !status.is_success() { + let body = resp.text().await.unwrap_or_else(|_| "?".into()); + return Err(format!("api.kimi.com {}: {}", status, body)); + } + + let parsed: KimiChatResponse = resp.json().await + .map_err(|e| format!("invalid kimi response: {e}"))?; + + let latency_ms = t0.elapsed().as_millis(); + let choice = parsed.choices.into_iter().next() + .ok_or_else(|| "kimi returned no choices".to_string())?; + let text = choice.message.content; + + let prompt_tokens = parsed.usage.as_ref().map(|u| u.prompt_tokens).unwrap_or_else(|| { + let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum(); + ((chars + 3) / 4) as u32 + }); + let completion_tokens = parsed.usage.as_ref().map(|u| u.completion_tokens).unwrap_or_else(|| { + ((text.chars().count() + 3) / 4) as u32 + }); + + tracing::info!( + target: "v1.chat", + provider = "kimi", + 
model = %model, + prompt_tokens, + completion_tokens, + latency_ms = latency_ms as u64, + "kimi chat completed", + ); + + Ok(ChatResponse { + id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)), + object: "chat.completion", + created: chrono::Utc::now().timestamp(), + model, + choices: vec![Choice { + index: 0, + message: Message { role: "assistant".into(), content: serde_json::Value::String(text) }, + finish_reason: choice.finish_reason.unwrap_or_else(|| "stop".into()), + }], + usage: UsageBlock { + prompt_tokens, + completion_tokens, + total_tokens: prompt_tokens + completion_tokens, + }, + }) +} + +// -- Kimi wire shapes (OpenAI-compatible) -- + +#[derive(Serialize)] +struct KimiChatBody { + model: String, + messages: Vec<KimiMessage>, + max_tokens: u32, + temperature: f64, + stream: bool, +} + +#[derive(Serialize)] +struct KimiMessage { role: String, content: serde_json::Value } + +#[derive(Deserialize)] +struct KimiChatResponse { + choices: Vec<KimiChoice>, + #[serde(default)] + usage: Option<KimiUsage>, +} + +#[derive(Deserialize)] +struct KimiChoice { + message: KimiMessageResp, + #[serde(default)] + finish_reason: Option<String>, +} + +#[derive(Deserialize)] +struct KimiMessageResp { content: String } + +#[derive(Deserialize)] +struct KimiUsage { prompt_tokens: u32, completion_tokens: u32 } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn resolve_kimi_key_does_not_panic() { + let _ = resolve_kimi_key(); + } + + #[test] + fn chat_body_serializes_to_openai_shape() { + let body = KimiChatBody { + model: "kimi-for-coding".into(), + messages: vec![ + KimiMessage { role: "user".into(), content: "review this".into() }, + ], + max_tokens: 800, + temperature: 0.3, + stream: false, + }; + let json = serde_json::to_string(&body).unwrap(); + assert!(json.contains("\"model\":\"kimi-for-coding\"")); + assert!(json.contains("\"messages\"")); + assert!(json.contains("\"max_tokens\":800")); + assert!(json.contains("\"stream\":false")); + } + + #[test] + fn
model_prefix_strip() { + let cases = [ + ("kimi/kimi-for-coding", "kimi-for-coding"), + ("kimi-for-coding", "kimi-for-coding"), + ("kimi/kimi-k2.6", "kimi-k2.6"), + ]; + for (input, expected) in cases { + let out = input.strip_prefix("kimi/").unwrap_or(input); + assert_eq!(out, expected, "{input} should become {expected}"); + } + } +} diff --git a/crates/gateway/src/v1/mod.rs b/crates/gateway/src/v1/mod.rs index 119d48a..39d227a 100644 --- a/crates/gateway/src/v1/mod.rs +++ b/crates/gateway/src/v1/mod.rs @@ -16,6 +16,7 @@ pub mod ollama_cloud; pub mod openrouter; pub mod gemini; pub mod claude; +pub mod kimi; pub mod langfuse_trace; pub mod mode; pub mod respond; @@ -53,6 +54,12 @@ pub struct V1State { /// `claude::resolve_claude_key()`. None = provider="claude" calls /// 503. Phase 40 deliverable. pub claude_key: Option<String>, + /// Kimi For Coding (api.kimi.com) bearer token — direct provider + /// for `kimi-for-coding`. Used when Ollama Cloud's `kimi-k2:1t` is + /// upstream-broken. Loaded at startup via `kimi::resolve_kimi_key()` + /// from `KIMI_API_KEY` env or `/etc/lakehouse/kimi.env`. None = + /// provider="kimi" calls 503. + pub kimi_key: Option<String>, /// Phase 40 early deliverable — Langfuse client. None = tracing /// disabled (keys missing or container unreachable). Traces are /// fire-and-forget: never block the response path. @@ -224,6 +231,9 @@ fn resolve_provider(req: &ChatRequest) -> (String, String) { if let Some(rest) = req.model.strip_prefix("claude/") { return ("claude".to_string(), rest.to_string()); } + if let Some(rest) = req.model.strip_prefix("kimi/") { + return ("kimi".to_string(), rest.to_string()); + } // Bare `vendor/model` shape (e.g. `x-ai/grok-4.1-fast`, // `moonshotai/kimi-k2`, `openai/gpt-oss-120b:free`) → OpenRouter.
// This makes the gateway a drop-in OpenAI-compatible middleware: @@ -316,6 +326,12 @@ mod resolve_provider_tests { let r = mk_req(None, "claude/claude-3-5-sonnet-latest"); assert_eq!(resolve_provider(&r), ("claude".into(), "claude-3-5-sonnet-latest".into())); } + + #[test] + fn kimi_prefix_infers_and_strips() { + let r = mk_req(None, "kimi/kimi-for-coding"); + assert_eq!(resolve_provider(&r), ("kimi".into(), "kimi-for-coding".into())); + } } async fn chat( @@ -403,10 +419,24 @@ async fn chat( .map_err(|e| (StatusCode::BAD_GATEWAY, format!("claude: {e}")))?; (r, "claude".to_string()) } + "kimi" => { + // Direct Kimi For Coding provider — bypasses Ollama Cloud's + // upstream-broken kimi-k2:1t and OpenRouter's rate-limited + // moonshotai/kimi-k2.6. Uses sk-kimi-* keys from the Kimi + // membership console. + let key = state.kimi_key.as_deref().ok_or(( + StatusCode::SERVICE_UNAVAILABLE, + "KIMI_API_KEY not configured".to_string(), + ))?; + let r = kimi::chat(key, &*req_for_adapter) + .await + .map_err(|e| (StatusCode::BAD_GATEWAY, format!("kimi: {e}")))?; + (r, "kimi".to_string()) + } other => { return Err(( StatusCode::BAD_REQUEST, - format!("unknown provider '{other}' — supported: ollama, ollama_cloud, openrouter, gemini, claude"), + format!("unknown provider '{other}' — supported: ollama, ollama_cloud, openrouter, gemini, claude, kimi"), )); } }; diff --git a/tests/real-world/scrum_master_pipeline.ts b/tests/real-world/scrum_master_pipeline.ts index 820367f..fb18d5e 100644 --- a/tests/real-world/scrum_master_pipeline.ts +++ b/tests/real-world/scrum_master_pipeline.ts @@ -113,10 +113,17 @@ const TARGET_FILES: string[] = process.env.LH_SCRUM_FILES // strategy. Kimi K2.6, Gemini, free-tier, local fallback, etc. were // removed — they're available as routable tools later (mode router) // but not as automatic fallbacks. 
-const LADDER: Array<{ provider: "ollama" | "ollama_cloud" | "openrouter"; model: string; note: string }> = [ +const LADDER: Array<{ provider: "ollama" | "ollama_cloud" | "openrouter" | "kimi"; model: string; note: string }> = [ { provider: "openrouter", model: "x-ai/grok-4.1-fast", note: "PRIMARY · Grok 4.1 fast · $0.20/$0.50 · 2M ctx · single-model strategy" }, { provider: "openrouter", model: "deepseek/deepseek-v4-flash", note: "FALLBACK on provider error · DeepSeek V4 flash · $0.14/$0.28 · 1M ctx" }, { provider: "openrouter", model: "qwen/qwen3-235b-a22b-2507", note: "LAST FALLBACK on provider error · Qwen3 235B · $0.07/$0.10 · 262K" }, + // kimi/kimi-for-coding (api.kimi.com) is wired through the gateway + // but NOT in the auto-ladder. The endpoint is gated to specific + // approved coding-agent User-Agents (Claude Code, Kimi CLI, Roo Code, + // Kilo Code). Spoofing a User-Agent works technically but Moonshot's + // TOS marks it as grounds for membership suspension. Use Kimi via a + // sanctioned client (Claude Code subagent / Kimi CLI), not via this + // unattended scrum loop. // Dropped from the ladder after 2026-04-24 probe: // - kimi-k2.6 — not available on current tier (empty response) // - devstral-2:123b — displaced by qwen3-coder:480b (better coding specialist) @@ -738,7 +745,7 @@ async function lookupSignalClass(filePath: string): Promise { } async function chat(opts: { - provider: "ollama" | "ollama_cloud", + provider: "ollama" | "ollama_cloud" | "openrouter" | "kimi", model: string, prompt: string, max_tokens?: number,