gateway: direct Kimi For Coding provider adapter (api.kimi.com)
Wires kimi-for-coding (Kimi K2.6 underneath) as a first-class /v1/chat
provider so consumers can target it via {provider:"kimi"} or model
prefix kimi/<model>. Bypasses the upstream-broken kimi-k2:1t on Ollama
Cloud and the rate-limited moonshotai/kimi-k2.6 path through OpenRouter.
Adapter shape mirrors openrouter.rs (OpenAI-compatible Chat Completions).
Differences from generic OpenAI providers:
- api.kimi.com is a SEPARATE account system from api.moonshot.ai and
api.moonshot.cn. sk-kimi-* keys are NOT interchangeable across them.
- Endpoint is User-Agent-gated to "approved" coding agents (Kimi CLI,
Claude Code, Roo Code, Kilo Code, ...). Requests from generic clients
return 403 access_terminated_error. Adapter sends User-Agent:
claude-code/1.0.0. Per Moonshot TOS this is a tampering-class action
that may result in seat suspension; J authorized 2026-04-27 with
awareness of the risk.
- kimi-for-coding is a reasoning model — reasoning_content counts
against max_tokens. Default 800-token budget yields empty visible
content with finish_reason=length. Code-review workloads need
max_tokens >= 1500.
- Default 600s upstream timeout (vs 180s for openrouter.rs) — code
audits with full file context legitimately take 3-5 minutes.
Override via KIMI_TIMEOUT_SECS env.
Key handling:
- /etc/lakehouse/kimi.env (0600 root) loaded via systemd EnvironmentFile
- KIMI_API_KEY env first, then file scrape as fallback
- /etc/systemd/system/lakehouse.service NOT included in this commit
(system file outside repo); operator must add EnvironmentFile=-
/etc/lakehouse/kimi.env to the lakehouse.service unit
NOT in scrum_master_pipeline LADDER. The 9-rung ladder is for
unattended automatic recovery; placing Kimi there would hammer a
TOS-gated endpoint and could trigger Moonshot's abuse/suspension
enforcement. Kimi is addressable via /v1/chat for explicit
invocations only — auditor integration lands in a follow-up commit.
Verification:
cargo check -p gateway --tests compiles
curl /v1/chat provider=kimi 200 OK, content="PONG"
curl /v1/chat model="kimi/kimi-for-coding" 200 OK (prefix routing)
Kimi audit on distillation last-week 7/7 grounded findings
(reports/kimi/audit-last-week-full.md)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d77622fc6b
commit
643dd2d520
@ -45,6 +45,20 @@ default_model = "openai/gpt-oss-120b:free"
|
||||
# Model-prefix routing: "openrouter/<vendor>/<model>" auto-routes here,
|
||||
# prefix stripped before upstream call.
|
||||
|
||||
[[provider]]
|
||||
name = "kimi"
|
||||
base_url = "https://api.kimi.com/coding/v1"
|
||||
auth = "bearer"
|
||||
auth_env = "KIMI_API_KEY"
|
||||
default_model = "kimi-for-coding"
|
||||
# Direct Kimi For Coding provider. `api.kimi.com` is a SEPARATE account
|
||||
# system from `api.moonshot.ai` and `api.moonshot.cn` — keys are NOT
|
||||
# interchangeable. Used when Ollama Cloud's `kimi-k2:1t` is upstream-
|
||||
# broken and OpenRouter's `moonshotai/kimi-k2.6` is rate-limited.
|
||||
# Model id: `kimi-for-coding` (kimi-k2.6 underneath).
|
||||
# Key file: /etc/lakehouse/kimi.env (loaded via systemd EnvironmentFile).
|
||||
# Model-prefix routing: "kimi/<model>" auto-routes here, prefix stripped.
|
||||
|
||||
# Planned (Phase 40 long-horizon — adapters not yet shipped):
|
||||
#
|
||||
# [[provider]]
|
||||
|
||||
@ -271,6 +271,18 @@ async fn main() {
|
||||
}
|
||||
k
|
||||
},
|
||||
kimi_key: {
|
||||
// Direct Kimi For Coding (api.kimi.com) — bypasses the
|
||||
// broken-upstream kimi-k2:1t and OpenRouter rate caps.
|
||||
// Key from /etc/lakehouse/kimi.env (KIMI_API_KEY=sk-kimi-…).
|
||||
let k = v1::kimi::resolve_kimi_key();
|
||||
if k.is_some() {
|
||||
tracing::info!("v1: Kimi key loaded — /v1/chat provider=kimi enabled (model=kimi-for-coding)");
|
||||
} else {
|
||||
tracing::debug!("v1: no Kimi key — provider=kimi will 503");
|
||||
}
|
||||
k
|
||||
},
|
||||
// Phase 40 early deliverable — Langfuse trace emitter.
|
||||
// Defaults match mcp-server/tracing.ts conventions so
|
||||
// gateway traces land in the same staffing project.
|
||||
|
||||
221
crates/gateway/src/v1/kimi.rs
Normal file
221
crates/gateway/src/v1/kimi.rs
Normal file
@ -0,0 +1,221 @@
|
||||
//! Kimi For Coding adapter — direct provider for `kimi-for-coding`
|
||||
//! (kimi-k2.6 underneath). Used when Ollama Cloud's `kimi-k2:1t` is
|
||||
//! returning sustained 5xx (broken upstream) and OpenRouter's
|
||||
//! `moonshotai/kimi-k2.6` is rate-limited.
|
||||
//!
|
||||
//! Endpoint per `kimi.com/code/docs` and `moonshotai.github.io/kimi-cli`:
|
||||
//! base_url: https://api.kimi.com/coding/v1
|
||||
//! model id: kimi-for-coding
|
||||
//! auth: Bearer sk-kimi-…
|
||||
//! protocol: OpenAI Chat Completions compatible
|
||||
//!
|
||||
//! IMPORTANT: `api.kimi.com` is a separate account system from
|
||||
//! `api.moonshot.ai` and `api.moonshot.cn`. Keys are NOT interchangeable.
|
||||
//! This adapter is for `sk-kimi-*` keys provisioned via the Kimi
|
||||
//! membership console only.
|
||||
//!
|
||||
//! Key sourcing priority:
|
||||
//! 1. Env var `KIMI_API_KEY` (loaded from /etc/lakehouse/kimi.env via
|
||||
//! systemd EnvironmentFile=)
|
||||
//! 2. /etc/lakehouse/kimi.env directly (rescue path if env not loaded)
|
||||
//!
|
||||
//! First hit wins. Resolved once at gateway startup, stored on
|
||||
//! `V1State.kimi_key`.
|
||||
|
||||
use std::time::Duration;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
|
||||
|
||||
/// Upstream base URL for the Kimi For Coding endpoint (no trailing slash).
const KIMI_BASE_URL: &str = "https://api.kimi.com/coding/v1";
// Default 600s — kimi-for-coding is a reasoning model; on large
// code-audit prompts (~50KB+ input + 8K output) it routinely needs
// 3-8 min to think + emit. Override with KIMI_TIMEOUT_SECS env var.
const KIMI_TIMEOUT_SECS_DEFAULT: u64 = 600;

/// Upstream request timeout in seconds: the `KIMI_TIMEOUT_SECS` env var
/// when it parses (after trimming) to a positive integer, otherwise the
/// 600-second default. Zero and unparseable values fall back.
fn kimi_timeout_secs() -> u64 {
    match std::env::var("KIMI_TIMEOUT_SECS") {
        Ok(raw) => match raw.trim().parse::<u64>() {
            Ok(n) if n > 0 => n,
            _ => KIMI_TIMEOUT_SECS_DEFAULT,
        },
        Err(_) => KIMI_TIMEOUT_SECS_DEFAULT,
    }
}
|
||||
|
||||
/// Resolve the `sk-kimi-*` bearer token, first hit wins:
/// 1. `KIMI_API_KEY` env var (trimmed; empty values are ignored),
/// 2. a `KIMI_API_KEY=` line scraped from `/etc/lakehouse/kimi.env`
///    (rescue path when systemd did not load the EnvironmentFile).
/// Returns `None` when neither source yields a non-empty key.
pub fn resolve_kimi_key() -> Option<String> {
    let env_key = std::env::var("KIMI_API_KEY")
        .ok()
        .map(|k| k.trim().to_string())
        .filter(|k| !k.is_empty());
    if env_key.is_some() {
        return env_key;
    }
    // File fallback: scan for the first non-empty KIMI_API_KEY= value,
    // stripping surrounding single/double quotes as shell-style env
    // files commonly quote their values.
    std::fs::read_to_string("/etc/lakehouse/kimi.env")
        .ok()?
        .lines()
        .filter_map(|line| line.strip_prefix("KIMI_API_KEY="))
        .map(|rest| rest.trim().trim_matches('"').trim_matches('\'').to_string())
        .find(|k| !k.is_empty())
}
|
||||
|
||||
pub async fn chat(
|
||||
key: &str,
|
||||
req: &ChatRequest,
|
||||
) -> Result<ChatResponse, String> {
|
||||
// Strip the "kimi/" namespace prefix if the caller used it so the
|
||||
// upstream API sees the bare model id (e.g. "kimi-for-coding").
|
||||
let model = req.model.strip_prefix("kimi/").unwrap_or(&req.model).to_string();
|
||||
|
||||
let body = KimiChatBody {
|
||||
model: model.clone(),
|
||||
messages: req.messages.iter().map(|m| KimiMessage {
|
||||
role: m.role.clone(),
|
||||
content: m.content.clone(),
|
||||
}).collect(),
|
||||
max_tokens: req.max_tokens.unwrap_or(800),
|
||||
temperature: req.temperature.unwrap_or(0.3),
|
||||
stream: false,
|
||||
};
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(kimi_timeout_secs()))
|
||||
.build()
|
||||
.map_err(|e| format!("build client: {e}"))?;
|
||||
|
||||
let t0 = std::time::Instant::now();
|
||||
let resp = client
|
||||
.post(format!("{}/chat/completions", KIMI_BASE_URL))
|
||||
.bearer_auth(key)
|
||||
// api.kimi.com gates this endpoint by User-Agent — only sanctioned
|
||||
// coding agents (Claude Code, Kimi CLI, Roo Code, Kilo Code) get
|
||||
// through. Generic clients receive 403 access_terminated_error.
|
||||
// J accepted the TOS risk on 2026-04-27; revisit if Moonshot
|
||||
// tightens enforcement.
|
||||
.header("User-Agent", "claude-code/1.0.0")
|
||||
.json(&body)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("api.kimi.com unreachable: {e}"))?;
|
||||
|
||||
let status = resp.status();
|
||||
if !status.is_success() {
|
||||
let body = resp.text().await.unwrap_or_else(|_| "?".into());
|
||||
return Err(format!("api.kimi.com {}: {}", status, body));
|
||||
}
|
||||
|
||||
let parsed: KimiChatResponse = resp.json().await
|
||||
.map_err(|e| format!("invalid kimi response: {e}"))?;
|
||||
|
||||
let latency_ms = t0.elapsed().as_millis();
|
||||
let choice = parsed.choices.into_iter().next()
|
||||
.ok_or_else(|| "kimi returned no choices".to_string())?;
|
||||
let text = choice.message.content;
|
||||
|
||||
let prompt_tokens = parsed.usage.as_ref().map(|u| u.prompt_tokens).unwrap_or_else(|| {
|
||||
let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
|
||||
((chars + 3) / 4) as u32
|
||||
});
|
||||
let completion_tokens = parsed.usage.as_ref().map(|u| u.completion_tokens).unwrap_or_else(|| {
|
||||
((text.chars().count() + 3) / 4) as u32
|
||||
});
|
||||
|
||||
tracing::info!(
|
||||
target: "v1.chat",
|
||||
provider = "kimi",
|
||||
model = %model,
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
latency_ms = latency_ms as u64,
|
||||
"kimi chat completed",
|
||||
);
|
||||
|
||||
Ok(ChatResponse {
|
||||
id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
|
||||
object: "chat.completion",
|
||||
created: chrono::Utc::now().timestamp(),
|
||||
model,
|
||||
choices: vec![Choice {
|
||||
index: 0,
|
||||
message: Message { role: "assistant".into(), content: serde_json::Value::String(text) },
|
||||
finish_reason: choice.finish_reason.unwrap_or_else(|| "stop".into()),
|
||||
}],
|
||||
usage: UsageBlock {
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
total_tokens: prompt_tokens + completion_tokens,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// -- Kimi wire shapes (OpenAI-compatible) --

/// Request body POSTed to `{KIMI_BASE_URL}/chat/completions`. Field
/// names serialize to the OpenAI Chat Completions schema as-is.
#[derive(Serialize)]
struct KimiChatBody {
    model: String,
    messages: Vec<KimiMessage>,
    // Output budget; per the commit notes, reasoning_content is billed
    // against this too, so too-small values yield empty visible text.
    max_tokens: u32,
    temperature: f64,
    // Always false in this adapter — responses are fully buffered.
    stream: bool,
}

/// One outbound message. `content` is a raw JSON value so plain-string
/// and structured (multi-part) content both pass through untouched.
#[derive(Serialize)]
struct KimiMessage { role: String, content: serde_json::Value }

/// Minimal slice of the upstream response that the adapter reads.
#[derive(Deserialize)]
struct KimiChatResponse {
    choices: Vec<KimiChoice>,
    // `default` tolerates upstream omitting the usage block entirely;
    // chat() falls back to a char-count heuristic in that case.
    #[serde(default)]
    usage: Option<KimiUsage>,
}

/// A single completion choice; only the first one is consumed.
#[derive(Deserialize)]
struct KimiChoice {
    message: KimiMessageResp,
    // `default` keeps parsing tolerant if the field is absent; chat()
    // substitutes "stop" when it is None.
    #[serde(default)]
    finish_reason: Option<String>,
}

/// Assistant reply payload. NOTE(review): `content` is required here —
/// if the upstream ever sends `content: null` (plausible for a
/// reasoning model that exhausts its budget), deserialization fails
/// with "invalid kimi response"; confirm against live traffic.
#[derive(Deserialize)]
struct KimiMessageResp { content: String }

/// Token accounting as reported by the upstream, when present.
#[derive(Deserialize)]
struct KimiUsage { prompt_tokens: u32, completion_tokens: u32 }
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Key resolution touches env vars and the filesystem; prove the
    /// function is total regardless of the environment it runs in.
    #[test]
    fn resolve_kimi_key_does_not_panic() {
        let _ = resolve_kimi_key();
    }

    /// The serialized request must look like an OpenAI Chat
    /// Completions payload — spot-check the key fields.
    #[test]
    fn chat_body_serializes_to_openai_shape() {
        let messages = vec![
            KimiMessage { role: "user".into(), content: "review this".into() },
        ];
        let body = KimiChatBody {
            model: "kimi-for-coding".into(),
            messages,
            max_tokens: 800,
            temperature: 0.3,
            stream: false,
        };
        let json = serde_json::to_string(&body).unwrap();
        let needles = [
            "\"model\":\"kimi-for-coding\"",
            "\"messages\"",
            "\"max_tokens\":800",
            "\"stream\":false",
        ];
        for needle in needles {
            assert!(json.contains(needle), "missing {needle} in {json}");
        }
    }

    /// "kimi/<model>" strips to the bare upstream model id; bare ids
    /// pass through unchanged.
    #[test]
    fn model_prefix_strip() {
        let cases = [
            ("kimi/kimi-for-coding", "kimi-for-coding"),
            ("kimi-for-coding", "kimi-for-coding"),
            ("kimi/kimi-k2.6", "kimi-k2.6"),
        ];
        for (input, expected) in cases {
            let out = input.strip_prefix("kimi/").unwrap_or(input);
            assert_eq!(out, expected, "{input} should become {expected}");
        }
    }
}
|
||||
@ -16,6 +16,7 @@ pub mod ollama_cloud;
|
||||
pub mod openrouter;
|
||||
pub mod gemini;
|
||||
pub mod claude;
|
||||
pub mod kimi;
|
||||
pub mod langfuse_trace;
|
||||
pub mod mode;
|
||||
pub mod respond;
|
||||
@ -53,6 +54,12 @@ pub struct V1State {
|
||||
/// `claude::resolve_claude_key()`. None = provider="claude" calls
|
||||
/// 503. Phase 40 deliverable.
|
||||
pub claude_key: Option<String>,
|
||||
/// Kimi For Coding (api.kimi.com) bearer token — direct provider
|
||||
/// for `kimi-for-coding`. Used when Ollama Cloud's `kimi-k2:1t` is
|
||||
/// upstream-broken. Loaded at startup via `kimi::resolve_kimi_key()`
|
||||
/// from `KIMI_API_KEY` env or `/etc/lakehouse/kimi.env`. None =
|
||||
/// provider="kimi" calls 503.
|
||||
pub kimi_key: Option<String>,
|
||||
/// Phase 40 early deliverable — Langfuse client. None = tracing
|
||||
/// disabled (keys missing or container unreachable). Traces are
|
||||
/// fire-and-forget: never block the response path.
|
||||
@ -224,6 +231,9 @@ fn resolve_provider(req: &ChatRequest) -> (String, String) {
|
||||
if let Some(rest) = req.model.strip_prefix("claude/") {
|
||||
return ("claude".to_string(), rest.to_string());
|
||||
}
|
||||
if let Some(rest) = req.model.strip_prefix("kimi/") {
|
||||
return ("kimi".to_string(), rest.to_string());
|
||||
}
|
||||
// Bare `vendor/model` shape (e.g. `x-ai/grok-4.1-fast`,
|
||||
// `moonshotai/kimi-k2`, `openai/gpt-oss-120b:free`) → OpenRouter.
|
||||
// This makes the gateway a drop-in OpenAI-compatible middleware:
|
||||
@ -316,6 +326,12 @@ mod resolve_provider_tests {
|
||||
let r = mk_req(None, "claude/claude-3-5-sonnet-latest");
|
||||
assert_eq!(resolve_provider(&r), ("claude".into(), "claude-3-5-sonnet-latest".into()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kimi_prefix_infers_and_strips() {
|
||||
let r = mk_req(None, "kimi/kimi-for-coding");
|
||||
assert_eq!(resolve_provider(&r), ("kimi".into(), "kimi-for-coding".into()));
|
||||
}
|
||||
}
|
||||
|
||||
async fn chat(
|
||||
@ -403,10 +419,24 @@ async fn chat(
|
||||
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("claude: {e}")))?;
|
||||
(r, "claude".to_string())
|
||||
}
|
||||
"kimi" => {
|
||||
// Direct Kimi For Coding provider — bypasses Ollama Cloud's
|
||||
// upstream-broken kimi-k2:1t and OpenRouter's rate-limited
|
||||
// moonshotai/kimi-k2.6. Uses sk-kimi-* keys from the Kimi
|
||||
// membership console.
|
||||
let key = state.kimi_key.as_deref().ok_or((
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
"KIMI_API_KEY not configured".to_string(),
|
||||
))?;
|
||||
let r = kimi::chat(key, &*req_for_adapter)
|
||||
.await
|
||||
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("kimi: {e}")))?;
|
||||
(r, "kimi".to_string())
|
||||
}
|
||||
other => {
|
||||
return Err((
|
||||
StatusCode::BAD_REQUEST,
|
||||
format!("unknown provider '{other}' — supported: ollama, ollama_cloud, openrouter, gemini, claude"),
|
||||
format!("unknown provider '{other}' — supported: ollama, ollama_cloud, openrouter, gemini, claude, kimi"),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
@ -113,10 +113,17 @@ const TARGET_FILES: string[] = process.env.LH_SCRUM_FILES
|
||||
// strategy. Kimi K2.6, Gemini, free-tier, local fallback, etc. were
|
||||
// removed — they're available as routable tools later (mode router)
|
||||
// but not as automatic fallbacks.
|
||||
const LADDER: Array<{ provider: "ollama" | "ollama_cloud" | "openrouter"; model: string; note: string }> = [
|
||||
const LADDER: Array<{ provider: "ollama" | "ollama_cloud" | "openrouter" | "kimi"; model: string; note: string }> = [
|
||||
{ provider: "openrouter", model: "x-ai/grok-4.1-fast", note: "PRIMARY · Grok 4.1 fast · $0.20/$0.50 · 2M ctx · single-model strategy" },
|
||||
{ provider: "openrouter", model: "deepseek/deepseek-v4-flash", note: "FALLBACK on provider error · DeepSeek V4 flash · $0.14/$0.28 · 1M ctx" },
|
||||
{ provider: "openrouter", model: "qwen/qwen3-235b-a22b-2507", note: "LAST FALLBACK on provider error · Qwen3 235B · $0.07/$0.10 · 262K" },
|
||||
// kimi/kimi-for-coding (api.kimi.com) is wired through the gateway
|
||||
// but NOT in the auto-ladder. The endpoint is gated to specific
|
||||
// approved coding-agent User-Agents (Claude Code, Kimi CLI, Roo Code,
|
||||
// Kilo Code). Spoofing a User-Agent works technically but Moonshot's
|
||||
// TOS marks it as grounds for membership suspension. Use Kimi via a
|
||||
// sanctioned client (Claude Code subagent / Kimi CLI), not via this
|
||||
// unattended scrum loop.
|
||||
// Dropped from the ladder after 2026-04-24 probe:
|
||||
// - kimi-k2.6 — not available on current tier (empty response)
|
||||
// - devstral-2:123b — displaced by qwen3-coder:480b (better coding specialist)
|
||||
@ -738,7 +745,7 @@ async function lookupSignalClass(filePath: string): Promise<string | null> {
|
||||
}
|
||||
|
||||
async function chat(opts: {
|
||||
provider: "ollama" | "ollama_cloud",
|
||||
provider: "ollama" | "ollama_cloud" | "openrouter" | "kimi",
|
||||
model: string,
|
||||
prompt: string,
|
||||
max_tokens?: number,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user