Phase 39 (first slice): Ollama Cloud adapter on /v1/chat
Second provider wired. /v1/chat now routes by optional `provider` field: default "ollama" hits local via sidecar, "ollama_cloud" (or "cloud") hits ollama.com/api/generate directly with Bearer auth. Key sourced at gateway startup from OLLAMA_CLOUD_KEY env, then /root/llm_team_config.json (providers.ollama_cloud.api_key), then OLLAMA_CLOUD_API_KEY env. Config source matches LLM Team convention. Shape-identical to scenario.ts::generateCloud — same endpoint, same body, same Bearer auth. Cloud path bypasses sidecar entirely (sidecar is local-only by design, mirrors TS agent.ts). Changes: - crates/gateway/src/v1/ollama_cloud.rs (new, 130 LOC) — reqwest client, resolve_cloud_key(), chat() adapter, CloudGenerateBody / CloudGenerateResponse wire shapes - crates/gateway/src/v1/ollama.rs — flatten_messages_public() re-export so sibling adapters reuse the shape collapse - crates/gateway/src/v1/mod.rs — provider field on ChatRequest, dispatch match in chat() handler, ollama_cloud_key on V1State - crates/gateway/src/main.rs — resolves cloud key at startup, logs which source provided it - crates/gateway/Cargo.toml — reqwest 0.12 with rustls-tls Verified end-to-end after restart: - provider=ollama → qwen3.5:latest local (~400ms, Phase 38 unchanged) - provider=ollama_cloud + model=gpt-oss:120b → real 225-word technical response in 5.4s, 313 tokens Tests: 9/9 green (7 from Phase 38 + 2 new for cloud body serialization and key resolver shape). Not in this slice: trait extraction (full Phase 39 scope adds ProviderAdapter trait + OpenRouter adapter + fallback chain logic). These land next with Phase 40 routing engine on top. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8cbbd0ef70
commit
42a11d35cd
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -4079,6 +4079,7 @@ dependencies = [
|
||||
"opentelemetry_sdk",
|
||||
"proto",
|
||||
"queryd",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"shared",
|
||||
|
||||
@ -28,3 +28,4 @@ opentelemetry-stdout = { workspace = true }
|
||||
tracing-opentelemetry = { workspace = true }
|
||||
arrow = { workspace = true }
|
||||
chrono = { workspace = true }
|
||||
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
|
||||
|
||||
@ -192,6 +192,18 @@ async fn main() {
|
||||
.nest("/v1", v1::router(v1::V1State {
|
||||
ai_client: ai_client.clone(),
|
||||
usage: std::sync::Arc::new(tokio::sync::RwLock::new(v1::Usage::default())),
|
||||
// Phase 39 first slice — Ollama Cloud adapter. Key resolved
|
||||
// from OLLAMA_CLOUD_KEY env, then /root/llm_team_config.json,
|
||||
// then OLLAMA_CLOUD_API_KEY env. None = cloud routes 503.
|
||||
ollama_cloud_key: {
|
||||
let k = v1::ollama_cloud::resolve_cloud_key();
|
||||
if k.is_some() {
|
||||
tracing::info!("v1: Ollama Cloud key loaded — /v1/chat provider=ollama_cloud enabled");
|
||||
} else {
|
||||
tracing::warn!("v1: no Ollama Cloud key in env or /root/llm_team_config.json — cloud routes will 503");
|
||||
}
|
||||
k
|
||||
},
|
||||
}));
|
||||
|
||||
// Auth middleware (if enabled)
|
||||
|
||||
@ -12,6 +12,7 @@
|
||||
//! adding the alias is one line in Phase 39 when it matters.
|
||||
|
||||
pub mod ollama;
|
||||
pub mod ollama_cloud;
|
||||
|
||||
use axum::{
|
||||
Router,
|
||||
@ -29,6 +30,9 @@ use tokio::sync::RwLock;
|
||||
pub struct V1State {
|
||||
pub ai_client: aibridge::client::AiClient,
|
||||
pub usage: Arc<RwLock<Usage>>,
|
||||
/// Ollama Cloud bearer token. Loaded at startup via
|
||||
/// `ollama_cloud::resolve_cloud_key()`. None = cloud routes 503.
|
||||
pub ollama_cloud_key: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Serialize)]
|
||||
@ -76,6 +80,13 @@ pub struct ChatRequest {
|
||||
/// overseer / reasoning-heavy path.
|
||||
#[serde(default)]
|
||||
pub think: Option<bool>,
|
||||
/// Non-OpenAI extension. Selects the provider adapter. Accepted:
|
||||
/// - None / "ollama" / "local" → local Ollama via sidecar (default)
|
||||
/// - "ollama_cloud" / "cloud" → Ollama Cloud direct HTTPS
|
||||
/// Phase 40 adds a routing engine that picks this automatically
|
||||
/// from the model name; Phase 38/39 requires explicit selection.
|
||||
#[serde(default)]
|
||||
pub provider: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
@ -115,9 +126,27 @@ async fn chat(
|
||||
tracing::warn!("/v1/chat: stream=true requested but Phase 38 returns non-streaming");
|
||||
}
|
||||
|
||||
let resp = ollama::chat(&state.ai_client, &req)
|
||||
.await
|
||||
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("provider: {e}")))?;
|
||||
let provider = req.provider.as_deref().unwrap_or("ollama").to_ascii_lowercase();
|
||||
let resp = match provider.as_str() {
|
||||
"ollama" | "local" | "" => ollama::chat(&state.ai_client, &req)
|
||||
.await
|
||||
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("ollama local: {e}")))?,
|
||||
"ollama_cloud" | "cloud" => {
|
||||
let key = state.ollama_cloud_key.as_deref().ok_or((
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
"OLLAMA_CLOUD_KEY not configured — set env or populate /root/llm_team_config.json".to_string(),
|
||||
))?;
|
||||
ollama_cloud::chat(key, &req)
|
||||
.await
|
||||
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("ollama cloud: {e}")))?
|
||||
}
|
||||
other => {
|
||||
return Err((
|
||||
StatusCode::BAD_REQUEST,
|
||||
format!("unknown provider '{other}' — supported: ollama, ollama_cloud"),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
{
|
||||
let mut u = state.usage.write().await;
|
||||
|
||||
@ -74,6 +74,14 @@ pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result<ChatResponse,
|
||||
})
|
||||
}
|
||||
|
||||
/// Public re-export of the flattener so sibling adapters (Ollama Cloud,
|
||||
/// future OpenRouter) can reuse the same shape collapse without
|
||||
/// duplicating the logic. Keeps `(system, prompt)` format consistent
|
||||
/// across providers.
|
||||
pub fn flatten_messages_public(messages: &[Message]) -> (String, String) {
|
||||
flatten_messages(messages)
|
||||
}
|
||||
|
||||
/// Collapse a message array into (system, prompt). Multiple system
|
||||
/// messages concatenate with a newline — matches OpenAI's documented
|
||||
/// behavior. Non-system messages become role-labeled turns.
|
||||
|
||||
189
crates/gateway/src/v1/ollama_cloud.rs
Normal file
189
crates/gateway/src/v1/ollama_cloud.rs
Normal file
@ -0,0 +1,189 @@
|
||||
//! Phase 39 (first slice) — Ollama Cloud adapter.
|
||||
//!
|
||||
//! Direct HTTPS call to `https://ollama.com/api/generate` with Bearer
|
||||
//! auth. Mirrors the `tests/multi-agent/agent.ts::generateCloud` pattern
|
||||
//! exactly (same endpoint, same body shape, same Bearer header), so
|
||||
//! cloud calls from Rust behave identically to the TS hot path.
|
||||
//!
|
||||
//! Key sourcing priority:
|
||||
//! 1. Env var `OLLAMA_CLOUD_KEY` (matches agent.ts convention)
|
||||
//! 2. `/root/llm_team_config.json` → providers.ollama_cloud.api_key
|
||||
//! 3. Env var `OLLAMA_CLOUD_API_KEY` (LLM Team UI convention)
|
||||
//!
|
||||
//! First hit wins. Key is loaded once at gateway startup by
|
||||
//! `resolve_cloud_key()` and stored on `V1State.ollama_cloud_key`.
|
||||
|
||||
use std::time::Duration;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
|
||||
|
||||
const CLOUD_BASE_URL: &str = "https://ollama.com";
|
||||
const CLOUD_TIMEOUT_SECS: u64 = 180;
|
||||
|
||||
/// Read the Ollama Cloud key from the three sanctioned sources. Returns
|
||||
/// None if none is set — callers must 503 rather than attempt a call.
|
||||
pub fn resolve_cloud_key() -> Option<String> {
|
||||
if let Ok(k) = std::env::var("OLLAMA_CLOUD_KEY") {
|
||||
if !k.trim().is_empty() { return Some(k.trim().to_string()); }
|
||||
}
|
||||
if let Ok(raw) = std::fs::read_to_string("/root/llm_team_config.json") {
|
||||
if let Ok(v) = serde_json::from_str::<serde_json::Value>(&raw) {
|
||||
if let Some(k) = v.pointer("/providers/ollama_cloud/api_key").and_then(|x| x.as_str()) {
|
||||
if !k.trim().is_empty() { return Some(k.trim().to_string()); }
|
||||
}
|
||||
}
|
||||
}
|
||||
if let Ok(k) = std::env::var("OLLAMA_CLOUD_API_KEY") {
|
||||
if !k.trim().is_empty() { return Some(k.trim().to_string()); }
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
pub async fn chat(
|
||||
key: &str,
|
||||
req: &ChatRequest,
|
||||
) -> Result<ChatResponse, String> {
|
||||
let (system, prompt) = super::ollama::flatten_messages_public(&req.messages);
|
||||
|
||||
let body = CloudGenerateBody {
|
||||
model: req.model.clone(),
|
||||
prompt,
|
||||
system: if system.is_empty() { None } else { Some(system) },
|
||||
stream: false,
|
||||
think: Some(req.think.unwrap_or(false)),
|
||||
options: CloudOptions {
|
||||
// Thinking cloud models need headroom — floor 400 to give
|
||||
// qwen3.5:397b / gpt-oss:120b reasoning room. Matches
|
||||
// agent.ts `Math.max(opts.max_tokens ?? 800, 400)` policy.
|
||||
num_predict: std::cmp::max(req.max_tokens.unwrap_or(800), 400),
|
||||
temperature: req.temperature.unwrap_or(0.3),
|
||||
},
|
||||
};
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(CLOUD_TIMEOUT_SECS))
|
||||
.build()
|
||||
.map_err(|e| format!("build client: {e}"))?;
|
||||
|
||||
let t0 = std::time::Instant::now();
|
||||
let resp = client
|
||||
.post(format!("{}/api/generate", CLOUD_BASE_URL))
|
||||
.bearer_auth(key)
|
||||
.json(&body)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("ollama.com unreachable: {e}"))?;
|
||||
|
||||
let status = resp.status();
|
||||
if !status.is_success() {
|
||||
let body = resp.text().await.unwrap_or_else(|_| "?".into());
|
||||
return Err(format!("ollama.com {}: {}", status, body));
|
||||
}
|
||||
|
||||
let parsed: CloudGenerateResponse = resp.json().await
|
||||
.map_err(|e| format!("invalid cloud response: {e}"))?;
|
||||
|
||||
let latency_ms = t0.elapsed().as_millis();
|
||||
let text = parsed.response.unwrap_or_default();
|
||||
|
||||
let prompt_tokens = parsed.prompt_eval_count.unwrap_or_else(|| {
|
||||
let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
|
||||
((chars + 3) / 4) as u32
|
||||
});
|
||||
let completion_tokens = parsed.eval_count.unwrap_or_else(|| {
|
||||
((text.chars().count() + 3) / 4) as u32
|
||||
});
|
||||
|
||||
tracing::info!(
|
||||
target: "v1.chat",
|
||||
provider = "ollama_cloud",
|
||||
model = %req.model,
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
latency_ms = latency_ms as u64,
|
||||
"cloud chat completed",
|
||||
);
|
||||
|
||||
Ok(ChatResponse {
|
||||
id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
|
||||
object: "chat.completion",
|
||||
created: chrono::Utc::now().timestamp(),
|
||||
model: parsed.model.unwrap_or_else(|| req.model.clone()),
|
||||
choices: vec![Choice {
|
||||
index: 0,
|
||||
message: Message { role: "assistant".into(), content: text },
|
||||
finish_reason: "stop".into(),
|
||||
}],
|
||||
usage: UsageBlock {
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
total_tokens: prompt_tokens + completion_tokens,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// -- Ollama /api/generate wire shapes --
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct CloudGenerateBody {
|
||||
model: String,
|
||||
prompt: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
system: Option<String>,
|
||||
stream: bool,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
think: Option<bool>,
|
||||
options: CloudOptions,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct CloudOptions {
|
||||
num_predict: u32,
|
||||
temperature: f64,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct CloudGenerateResponse {
|
||||
#[serde(default)]
|
||||
response: Option<String>,
|
||||
#[serde(default)]
|
||||
model: Option<String>,
|
||||
#[serde(default)]
|
||||
prompt_eval_count: Option<u32>,
|
||||
#[serde(default)]
|
||||
eval_count: Option<u32>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn resolve_cloud_key_returns_none_when_no_sources_set() {
|
||||
// Only check shape — we can't reliably unset env vars in a test
|
||||
// that runs alongside others, and the file path is on disk.
|
||||
// If all three sources are empty the function returns None; if
|
||||
// any are set we expect Some. This just smoke-tests the call
|
||||
// doesn't panic.
|
||||
let _ = resolve_cloud_key();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cloud_body_serializes_compact() {
|
||||
let body = CloudGenerateBody {
|
||||
model: "gpt-oss:120b".into(),
|
||||
prompt: "user: hi\n\nassistant:".into(),
|
||||
system: Some("Be terse.".into()),
|
||||
stream: false,
|
||||
think: Some(false),
|
||||
options: CloudOptions { num_predict: 400, temperature: 0.3 },
|
||||
};
|
||||
let json = serde_json::to_string(&body).unwrap();
|
||||
assert!(json.contains("\"model\":\"gpt-oss:120b\""));
|
||||
assert!(json.contains("\"stream\":false"));
|
||||
assert!(json.contains("\"num_predict\":400"));
|
||||
assert!(json.contains("\"think\":false"));
|
||||
assert!(json.contains("\"system\":\"Be terse.\""));
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user