//! Phase 39 (first slice) — Ollama Cloud adapter. //! //! Direct HTTPS call to `https://ollama.com/api/generate` with Bearer //! auth. Mirrors the `tests/multi-agent/agent.ts::generateCloud` pattern //! exactly (same endpoint, same body shape, same Bearer header), so //! cloud calls from Rust behave identically to the TS hot path. //! //! Key sourcing priority: //! 1. Env var `OLLAMA_CLOUD_KEY` (matches agent.ts convention) //! 2. `/root/llm_team_config.json` → providers.ollama_cloud.api_key //! 3. Env var `OLLAMA_CLOUD_API_KEY` (LLM Team UI convention) //! //! First hit wins. Key is loaded once at gateway startup by //! `resolve_cloud_key()` and stored on `V1State.ollama_cloud_key`. use std::time::Duration; use serde::{Deserialize, Serialize}; use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock}; const CLOUD_BASE_URL: &str = "https://ollama.com"; const CLOUD_TIMEOUT_SECS: u64 = 180; /// Read the Ollama Cloud key from the three sanctioned sources. Returns /// None if none is set — callers must 503 rather than attempt a call. pub fn resolve_cloud_key() -> Option { if let Ok(k) = std::env::var("OLLAMA_CLOUD_KEY") { if !k.trim().is_empty() { return Some(k.trim().to_string()); } } if let Ok(raw) = std::fs::read_to_string("/root/llm_team_config.json") { if let Ok(v) = serde_json::from_str::(&raw) { if let Some(k) = v.pointer("/providers/ollama_cloud/api_key").and_then(|x| x.as_str()) { if !k.trim().is_empty() { return Some(k.trim().to_string()); } } } } if let Ok(k) = std::env::var("OLLAMA_CLOUD_API_KEY") { if !k.trim().is_empty() { return Some(k.trim().to_string()); } } None } pub async fn chat( key: &str, req: &ChatRequest, ) -> Result { let (system, prompt) = super::ollama::flatten_messages_public(&req.messages); let body = CloudGenerateBody { model: req.model.clone(), prompt, system: if system.is_empty() { None } else { Some(system) }, stream: false, think: Some(req.think.unwrap_or(false)), options: CloudOptions { // Thinking cloud models need headroom — floor 400 to give // qwen3.5:397b / gpt-oss:120b reasoning room. Matches // agent.ts `Math.max(opts.max_tokens ?? 800, 400)` policy. num_predict: std::cmp::max(req.max_tokens.unwrap_or(800), 400), temperature: req.temperature.unwrap_or(0.3), }, }; let client = reqwest::Client::builder() .timeout(Duration::from_secs(CLOUD_TIMEOUT_SECS)) .build() .map_err(|e| format!("build client: {e}"))?; let t0 = std::time::Instant::now(); let resp = client .post(format!("{}/api/generate", CLOUD_BASE_URL)) .bearer_auth(key) .json(&body) .send() .await .map_err(|e| format!("ollama.com unreachable: {e}"))?; let status = resp.status(); if !status.is_success() { let body = resp.text().await.unwrap_or_else(|_| "?".into()); return Err(format!("ollama.com {}: {}", status, body)); } let parsed: CloudGenerateResponse = resp.json().await .map_err(|e| format!("invalid cloud response: {e}"))?; let latency_ms = t0.elapsed().as_millis(); let text = parsed.response.unwrap_or_default(); let prompt_tokens = parsed.prompt_eval_count.unwrap_or_else(|| { let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum(); ((chars + 3) / 4) as u32 }); let completion_tokens = parsed.eval_count.unwrap_or_else(|| { ((text.chars().count() + 3) / 4) as u32 }); tracing::info!( target: "v1.chat", provider = "ollama_cloud", model = %req.model, prompt_tokens, completion_tokens, latency_ms = latency_ms as u64, "cloud chat completed", ); Ok(ChatResponse { id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)), object: "chat.completion", created: chrono::Utc::now().timestamp(), model: parsed.model.unwrap_or_else(|| req.model.clone()), choices: vec![Choice { index: 0, message: Message { role: "assistant".into(), content: text }, finish_reason: "stop".into(), }], usage: UsageBlock { prompt_tokens, completion_tokens, total_tokens: prompt_tokens + completion_tokens, }, }) } // -- Ollama /api/generate wire shapes -- #[derive(Serialize)] struct CloudGenerateBody { model: String, prompt: String, #[serde(skip_serializing_if = "Option::is_none")] system: Option, stream: bool, #[serde(skip_serializing_if = "Option::is_none")] think: Option, options: CloudOptions, } #[derive(Serialize)] struct CloudOptions { num_predict: u32, temperature: f64, } #[derive(Deserialize)] struct CloudGenerateResponse { #[serde(default)] response: Option, #[serde(default)] model: Option, #[serde(default)] prompt_eval_count: Option, #[serde(default)] eval_count: Option, } #[cfg(test)] mod tests { use super::*; #[test] fn resolve_cloud_key_returns_none_when_no_sources_set() { // Only check shape — we can't reliably unset env vars in a test // that runs alongside others, and the file path is on disk. // If all three sources are empty the function returns None; if // any are set we expect Some. This just smoke-tests the call // doesn't panic. let _ = resolve_cloud_key(); } #[test] fn cloud_body_serializes_compact() { let body = CloudGenerateBody { model: "gpt-oss:120b".into(), prompt: "user: hi\n\nassistant:".into(), system: Some("Be terse.".into()), stream: false, think: Some(false), options: CloudOptions { num_predict: 400, temperature: 0.3 }, }; let json = serde_json::to_string(&body).unwrap(); assert!(json.contains("\"model\":\"gpt-oss:120b\"")); assert!(json.contains("\"stream\":false")); assert!(json.contains("\"num_predict\":400")); assert!(json.contains("\"think\":false")); assert!(json.contains("\"system\":\"Be terse.\"")); } }