Second provider wired. /v1/chat now routes by an optional `provider` field:
the default "ollama" hits local via the sidecar; "ollama_cloud" (or "cloud")
hits ollama.com/api/generate directly with Bearer auth.

Key sourced at gateway startup from OLLAMA_CLOUD_KEY env, then
/root/llm_team_config.json (providers.ollama_cloud.api_key), then
OLLAMA_CLOUD_API_KEY env. Config source matches LLM Team convention.
Shape-identical to scenario.ts::generateCloud — same endpoint, same body,
same Bearer auth. Cloud path bypasses the sidecar entirely (sidecar is
local-only by design, mirroring the TS agent.ts).

Changes:
- crates/gateway/src/v1/ollama_cloud.rs (new, 130 LOC) — reqwest client,
  resolve_cloud_key(), chat() adapter, CloudGenerateBody /
  CloudGenerateResponse wire shapes
- crates/gateway/src/v1/ollama.rs — flatten_messages_public() re-export so
  sibling adapters reuse the shape collapse
- crates/gateway/src/v1/mod.rs — provider field on ChatRequest, dispatch
  match in chat() handler, ollama_cloud_key on V1State
- crates/gateway/src/main.rs — resolves the cloud key at startup, logs
  which source provided it
- crates/gateway/Cargo.toml — reqwest 0.12 with rustls-tls

Verified end-to-end after restart:
- provider=ollama → qwen3.5:latest local (~400ms, Phase 38 unchanged)
- provider=ollama_cloud + model=gpt-oss:120b → real 225-word technical
  response in 5.4s, 313 tokens

Tests: 9/9 green (7 from Phase 38 + 2 new, covering cloud body
serialization and key resolver shape).

Not in this slice: trait extraction (full Phase 39 scope adds a
ProviderAdapter trait + OpenRouter adapter + fallback chain logic). These
land next, with the Phase 40 routing engine on top.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
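The mod.rs dispatch itself is not shown in this file; a minimal sketch of
what the commit describes might look like the following. The `provider`
field, the "ollama_cloud"/"cloud" values, and `V1State.ollama_cloud_key`
come from the commit message; the function name, exact signatures, and the
local `ollama::chat` call are assumptions for illustration only.

// Hypothetical dispatch sketch (not the actual mod.rs code).
// Unknown or absent provider falls through to the local sidecar path.
pub async fn dispatch_chat(state: &V1State, req: &ChatRequest) -> Result<ChatResponse, String> {
    match req.provider.as_deref() {
        Some("ollama_cloud") | Some("cloud") => {
            // Per the adapter's contract: 503 rather than attempt a call
            // when no key was resolved at startup.
            let key = state
                .ollama_cloud_key
                .as_deref()
                .ok_or_else(|| "no Ollama Cloud key configured (503)".to_string())?;
            ollama_cloud::chat(key, req).await
        }
        _ => ollama::chat(req).await, // default "ollama": local via sidecar
    }
}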
190 lines
6.3 KiB
Rust
//! Phase 39 (first slice) — Ollama Cloud adapter.
//!
//! Direct HTTPS call to `https://ollama.com/api/generate` with Bearer
//! auth. Mirrors the `tests/multi-agent/agent.ts::generateCloud` pattern
//! exactly (same endpoint, same body shape, same Bearer header), so
//! cloud calls from Rust behave identically to the TS hot path.
//!
//! Key sourcing priority:
//! 1. Env var `OLLAMA_CLOUD_KEY` (matches agent.ts convention)
//! 2. `/root/llm_team_config.json` → providers.ollama_cloud.api_key
//! 3. Env var `OLLAMA_CLOUD_API_KEY` (LLM Team UI convention)
//!
//! First hit wins. Key is loaded once at gateway startup by
//! `resolve_cloud_key()` and stored on `V1State.ollama_cloud_key`.

use std::time::Duration;

use serde::{Deserialize, Serialize};

use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};

const CLOUD_BASE_URL: &str = "https://ollama.com";
const CLOUD_TIMEOUT_SECS: u64 = 180;

/// Read the Ollama Cloud key from the three sanctioned sources. Returns
/// `None` if none is set — callers must 503 rather than attempt a call.
pub fn resolve_cloud_key() -> Option<String> {
    if let Ok(k) = std::env::var("OLLAMA_CLOUD_KEY") {
        if !k.trim().is_empty() { return Some(k.trim().to_string()); }
    }
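    // Illustrative shape of /root/llm_team_config.json that the JSON
    // pointer below expects (minimal; key value elided):
    //   {"providers": {"ollama_cloud": {"api_key": "..."}}}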
    if let Ok(raw) = std::fs::read_to_string("/root/llm_team_config.json") {
        if let Ok(v) = serde_json::from_str::<serde_json::Value>(&raw) {
            if let Some(k) = v.pointer("/providers/ollama_cloud/api_key").and_then(|x| x.as_str()) {
                if !k.trim().is_empty() { return Some(k.trim().to_string()); }
            }
        }
    }
    if let Ok(k) = std::env::var("OLLAMA_CLOUD_API_KEY") {
        if !k.trim().is_empty() { return Some(k.trim().to_string()); }
    }
    None
}
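
// Startup sketch (illustrative; per the commit message, main.rs resolves
// the key once, logs which source provided it, and stores it on V1State):
//
//     let ollama_cloud_key = resolve_cloud_key();
//     if ollama_cloud_key.is_none() {
//         tracing::warn!("no Ollama Cloud key; provider=ollama_cloud will 503");
//     }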

pub async fn chat(
    key: &str,
    req: &ChatRequest,
) -> Result<ChatResponse, String> {
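    // flatten_messages_public (re-exported from the sibling local adapter)
    // collapses the OpenAI-style message list into a (system, prompt) pair
    // for /api/generate; the test fixture below suggests prompts of the
    // shape "user: hi\n\nassistant:" (assumed, not verified here).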
    let (system, prompt) = super::ollama::flatten_messages_public(&req.messages);

    let body = CloudGenerateBody {
        model: req.model.clone(),
        prompt,
        system: if system.is_empty() { None } else { Some(system) },
        stream: false,
        think: Some(req.think.unwrap_or(false)),
        options: CloudOptions {
            // Thinking cloud models need headroom — floor 400 to give
            // qwen3.5:397b / gpt-oss:120b reasoning room. Matches
            // agent.ts `Math.max(opts.max_tokens ?? 800, 400)` policy.
            num_predict: std::cmp::max(req.max_tokens.unwrap_or(800), 400),
            temperature: req.temperature.unwrap_or(0.3),
        },
    };

    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(CLOUD_TIMEOUT_SECS))
        .build()
        .map_err(|e| format!("build client: {e}"))?;

    let t0 = std::time::Instant::now();
    let resp = client
        .post(format!("{}/api/generate", CLOUD_BASE_URL))
        .bearer_auth(key)
        .json(&body)
        .send()
        .await
        .map_err(|e| format!("ollama.com unreachable: {e}"))?;

    let status = resp.status();
    if !status.is_success() {
        let body = resp.text().await.unwrap_or_else(|_| "?".into());
        return Err(format!("ollama.com {}: {}", status, body));
    }

    let parsed: CloudGenerateResponse = resp.json().await
        .map_err(|e| format!("invalid cloud response: {e}"))?;

    let latency_ms = t0.elapsed().as_millis();
    let text = parsed.response.unwrap_or_default();
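    // When the cloud response omits token counts, fall back to a rough
    // ~4-characters-per-token estimate (rounded up).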
    let prompt_tokens = parsed.prompt_eval_count.unwrap_or_else(|| {
        let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
        ((chars + 3) / 4) as u32
    });
    let completion_tokens = parsed.eval_count.unwrap_or_else(|| {
        ((text.chars().count() + 3) / 4) as u32
    });

    tracing::info!(
        target: "v1.chat",
        provider = "ollama_cloud",
        model = %req.model,
        prompt_tokens,
        completion_tokens,
        latency_ms = latency_ms as u64,
        "cloud chat completed",
    );

    Ok(ChatResponse {
        id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
        object: "chat.completion",
        created: chrono::Utc::now().timestamp(),
        model: parsed.model.unwrap_or_else(|| req.model.clone()),
        choices: vec![Choice {
            index: 0,
            message: Message { role: "assistant".into(), content: text },
            finish_reason: "stop".into(),
        }],
        usage: UsageBlock {
            prompt_tokens,
            completion_tokens,
            total_tokens: prompt_tokens + completion_tokens,
        },
    })
}

// -- Ollama /api/generate wire shapes --

#[derive(Serialize)]
struct CloudGenerateBody {
    model: String,
    prompt: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    system: Option<String>,
    stream: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    think: Option<bool>,
    options: CloudOptions,
}

#[derive(Serialize)]
struct CloudOptions {
    num_predict: u32,
    temperature: f64,
}

#[derive(Deserialize)]
struct CloudGenerateResponse {
    #[serde(default)]
    response: Option<String>,
    #[serde(default)]
    model: Option<String>,
    #[serde(default)]
    prompt_eval_count: Option<u32>,
    #[serde(default)]
    eval_count: Option<u32>,
}
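
// Illustrative (assumed) shape of a successful /api/generate reply that
// CloudGenerateResponse accepts — values invented for the example:
//   {"model":"gpt-oss:120b","response":"...","prompt_eval_count":42,"eval_count":313}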

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn resolve_cloud_key_returns_none_when_no_sources_set() {
        // Only check shape — we can't reliably unset env vars in a test
        // that runs alongside others, and the file path is on disk.
        // If all three sources are empty the function returns None; if
        // any are set we expect Some. This just smoke-tests the call
        // doesn't panic.
        let _ = resolve_cloud_key();
    }

    #[test]
    fn cloud_body_serializes_compact() {
        let body = CloudGenerateBody {
            model: "gpt-oss:120b".into(),
            prompt: "user: hi\n\nassistant:".into(),
            system: Some("Be terse.".into()),
            stream: false,
            think: Some(false),
            options: CloudOptions { num_predict: 400, temperature: 0.3 },
        };
        let json = serde_json::to_string(&body).unwrap();
        assert!(json.contains("\"model\":\"gpt-oss:120b\""));
        assert!(json.contains("\"stream\":false"));
        assert!(json.contains("\"num_predict\":400"));
        assert!(json.contains("\"think\":false"));
        assert!(json.contains("\"system\":\"Be terse.\""));
    }
}