lakehouse/crates/gateway/src/v1/ollama_cloud.rs
profit 42a11d35cd Phase 39 (first slice): Ollama Cloud adapter on /v1/chat
Second provider wired. /v1/chat now routes by optional `provider`
field: default "ollama" hits local via sidecar, "ollama_cloud"
(or "cloud") hits ollama.com/api/generate directly with Bearer auth.
Key sourced at gateway startup from OLLAMA_CLOUD_KEY env, then
/root/llm_team_config.json (providers.ollama_cloud.api_key), then
OLLAMA_CLOUD_API_KEY env. Config source matches LLM Team convention.

Shape-identical to agent.ts::generateCloud — same endpoint, same
body, same Bearer auth. Cloud path bypasses sidecar entirely (sidecar
is local-only by design, mirrors TS agent.ts).

Changes:
- crates/gateway/src/v1/ollama_cloud.rs (new, 130 LOC) — reqwest
  client, resolve_cloud_key(), chat() adapter, CloudGenerateBody /
  CloudGenerateResponse wire shapes
- crates/gateway/src/v1/ollama.rs — flatten_messages_public()
  re-export so sibling adapters reuse the shape collapse
- crates/gateway/src/v1/mod.rs — provider field on ChatRequest,
  dispatch match in chat() handler, ollama_cloud_key on V1State
- crates/gateway/src/main.rs — resolves cloud key at startup,
  logs which source provided it
- crates/gateway/Cargo.toml — reqwest 0.12 with rustls-tls

Verified end-to-end after restart:
- provider=ollama → qwen3.5:latest local (~400ms, Phase 38 unchanged)
- provider=ollama_cloud + model=gpt-oss:120b → real 225-word
  technical response in 5.4s, 313 tokens

Tests: 9/9 green (7 from Phase 38 + 2 new for cloud body serialization
and key resolver shape).

Not in this slice: trait extraction (full Phase 39 scope adds
ProviderAdapter trait + OpenRouter adapter + fallback chain logic).
These land next with Phase 40 routing engine on top.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 02:57:42 -05:00

190 lines
6.3 KiB
Rust

//! Phase 39 (first slice) — Ollama Cloud adapter.
//!
//! Direct HTTPS call to `https://ollama.com/api/generate` with Bearer
//! auth. Mirrors the `tests/multi-agent/agent.ts::generateCloud` pattern
//! exactly (same endpoint, same body shape, same Bearer header), so
//! cloud calls from Rust behave identically to the TS hot path.
//!
//! Key sourcing priority:
//! 1. Env var `OLLAMA_CLOUD_KEY` (matches agent.ts convention)
//! 2. `/root/llm_team_config.json` → providers.ollama_cloud.api_key
//! 3. Env var `OLLAMA_CLOUD_API_KEY` (LLM Team UI convention)
//!
//! First hit wins. Key is loaded once at gateway startup by
//! `resolve_cloud_key()` and stored on `V1State.ollama_cloud_key`.
use std::time::Duration;
use serde::{Deserialize, Serialize};
use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
/// Host for Ollama's hosted API; `/api/generate` is appended per request.
const CLOUD_BASE_URL: &str = "https://ollama.com";
/// Per-request HTTP timeout (3 minutes) — large cloud models respond slowly.
const CLOUD_TIMEOUT_SECS: u64 = 180;
/// Read the Ollama Cloud key from the three sanctioned sources, first
/// hit wins. Returns None when every source is unset or blank — callers
/// must 503 rather than attempt a call.
pub fn resolve_cloud_key() -> Option<String> {
    // A candidate counts only if it is non-empty after trimming.
    fn accept(raw: &str) -> Option<String> {
        let trimmed = raw.trim();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed.to_string())
        }
    }

    // Shared env-var probe used for sources 1 and 3.
    let from_env = |name: &str| std::env::var(name).ok().and_then(|v| accept(&v));

    // Source 2: /root/llm_team_config.json → providers.ollama_cloud.api_key.
    let from_config = || {
        std::fs::read_to_string("/root/llm_team_config.json")
            .ok()
            .and_then(|raw| serde_json::from_str::<serde_json::Value>(&raw).ok())
            .and_then(|doc| {
                doc.pointer("/providers/ollama_cloud/api_key")
                    .and_then(|v| v.as_str())
                    .and_then(accept)
            })
    };

    from_env("OLLAMA_CLOUD_KEY")
        .or_else(from_config)
        .or_else(|| from_env("OLLAMA_CLOUD_API_KEY"))
}
/// Send one non-streaming chat request to Ollama Cloud
/// (`POST {CLOUD_BASE_URL}/api/generate`, Bearer auth) and adapt the
/// result into the gateway's OpenAI-style `ChatResponse`.
///
/// The message list is collapsed to a `(system, prompt)` pair via the
/// sibling local adapter's `flatten_messages_public`, so both providers
/// share one shape collapse. Token counts fall back to a chars/4
/// estimate when the cloud response omits `prompt_eval_count` /
/// `eval_count`.
///
/// # Errors
/// Returns a human-readable `String` when the HTTP client cannot be
/// built, ollama.com is unreachable, the status is non-2xx (upstream
/// body is included verbatim), or the response fails to deserialize.
pub async fn chat(
key: &str,
req: &ChatRequest,
) -> Result<ChatResponse, String> {
let (system, prompt) = super::ollama::flatten_messages_public(&req.messages);
let body = CloudGenerateBody {
model: req.model.clone(),
prompt,
system: if system.is_empty() { None } else { Some(system) },
stream: false,
think: Some(req.think.unwrap_or(false)),
options: CloudOptions {
// Thinking cloud models need headroom — floor 400 to give
// qwen3.5:397b / gpt-oss:120b reasoning room. Matches
// agent.ts `Math.max(opts.max_tokens ?? 800, 400)` policy.
num_predict: std::cmp::max(req.max_tokens.unwrap_or(800), 400),
temperature: req.temperature.unwrap_or(0.3),
},
};
// NOTE(review): a fresh Client per call forgoes connection pooling;
// fine at current volume, but hoist onto V1State if cloud traffic grows.
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(CLOUD_TIMEOUT_SECS))
.build()
.map_err(|e| format!("build client: {e}"))?;
let t0 = std::time::Instant::now();
let resp = client
.post(format!("{}/api/generate", CLOUD_BASE_URL))
.bearer_auth(key)
.json(&body)
.send()
.await
.map_err(|e| format!("ollama.com unreachable: {e}"))?;
let status = resp.status();
if !status.is_success() {
// Surface the upstream error body verbatim so callers can log it.
let body = resp.text().await.unwrap_or_else(|_| "?".into());
return Err(format!("ollama.com {}: {}", status, body));
}
let parsed: CloudGenerateResponse = resp.json().await
.map_err(|e| format!("invalid cloud response: {e}"))?;
let latency_ms = t0.elapsed().as_millis();
let text = parsed.response.unwrap_or_default();
// Fallback token estimates: ~4 chars per token, rounded up.
let prompt_tokens = parsed.prompt_eval_count.unwrap_or_else(|| {
let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
((chars + 3) / 4) as u32
});
let completion_tokens = parsed.eval_count.unwrap_or_else(|| {
((text.chars().count() + 3) / 4) as u32
});
tracing::info!(
target: "v1.chat",
provider = "ollama_cloud",
model = %req.model,
prompt_tokens,
completion_tokens,
latency_ms = latency_ms as u64,
"cloud chat completed",
);
Ok(ChatResponse {
id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
object: "chat.completion",
created: chrono::Utc::now().timestamp(),
model: parsed.model.unwrap_or_else(|| req.model.clone()),
choices: vec![Choice {
index: 0,
message: Message { role: "assistant".into(), content: text },
finish_reason: "stop".into(),
}],
usage: UsageBlock {
prompt_tokens,
completion_tokens,
// saturating_add: avoids a debug-build overflow panic on absurd counts.
total_tokens: prompt_tokens.saturating_add(completion_tokens),
},
})
}
// -- Ollama /api/generate wire shapes --
/// Request body for `POST https://ollama.com/api/generate`
/// (non-streaming). Serialized keys must match the Ollama generate API.
#[derive(Serialize)]
struct CloudGenerateBody {
/// Cloud model tag, e.g. "gpt-oss:120b".
model: String,
/// Prompt text produced by the shared message flattening.
prompt: String,
/// Optional system prompt; key omitted from JSON when None.
#[serde(skip_serializing_if = "Option::is_none")]
system: Option<String>,
/// Always false in this adapter — one complete response per call.
stream: bool,
/// Thinking-mode toggle; key omitted from JSON when None.
#[serde(skip_serializing_if = "Option::is_none")]
think: Option<bool>,
options: CloudOptions,
}
/// Generation options nested inside the request body.
#[derive(Serialize)]
struct CloudOptions {
/// Response token budget (caller floors this at 400).
num_predict: u32,
/// Sampling temperature (caller defaults to 0.3).
temperature: f64,
}
/// Subset of the Ollama generate response this adapter consumes. Every
/// field defaults to None so missing keys never fail deserialization.
#[derive(Deserialize)]
struct CloudGenerateResponse {
/// Generated completion text.
#[serde(default)]
response: Option<String>,
/// Model name echoed by the server.
#[serde(default)]
model: Option<String>,
/// Prompt token count, when the server reports one.
#[serde(default)]
prompt_eval_count: Option<u32>,
/// Completion token count, when the server reports one.
#[serde(default)]
eval_count: Option<u32>,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: key resolution must never panic regardless of host
    /// environment. We can't reliably unset env vars in a test that
    /// runs alongside others, and the config path is on disk, so the
    /// exact Some/None outcome is environment-dependent by design.
    #[test]
    fn resolve_cloud_key_returns_none_when_no_sources_set() {
        let _ = resolve_cloud_key();
    }

    /// Populated optionals serialize under the exact wire keys the
    /// Ollama generate API expects.
    #[test]
    fn cloud_body_serializes_compact() {
        let body = CloudGenerateBody {
            model: "gpt-oss:120b".into(),
            prompt: "user: hi\n\nassistant:".into(),
            system: Some("Be terse.".into()),
            stream: false,
            think: Some(false),
            options: CloudOptions { num_predict: 400, temperature: 0.3 },
        };
        let json = serde_json::to_string(&body).unwrap();
        assert!(json.contains("\"model\":\"gpt-oss:120b\""));
        assert!(json.contains("\"stream\":false"));
        assert!(json.contains("\"num_predict\":400"));
        assert!(json.contains("\"think\":false"));
        assert!(json.contains("\"system\":\"Be terse.\""));
    }

    /// Unset optionals must be omitted entirely (not serialized as
    /// null) — exercises the `skip_serializing_if` path, which the
    /// compact test above never covers.
    #[test]
    fn cloud_body_omits_unset_optionals() {
        let body = CloudGenerateBody {
            model: "m".into(),
            prompt: "p".into(),
            system: None,
            stream: false,
            think: None,
            options: CloudOptions { num_predict: 400, temperature: 0.3 },
        };
        let json = serde_json::to_string(&body).unwrap();
        assert!(!json.contains("\"system\""));
        assert!(!json.contains("\"think\""));
        assert!(!json.contains("null"));
    }
}