Phase 39 (first slice): Ollama Cloud adapter on /v1/chat

Second provider wired. /v1/chat now routes by optional `provider` field: default "ollama" hits local via sidecar, "ollama_cloud" (or "cloud") hits ollama.com/api/generate directly with Bearer auth. Key sourced at gateway startup from OLLAMA_CLOUD_KEY env, then /root/llm_team_config.json (providers.ollama_cloud.api_key), then OLLAMA_CLOUD_API_KEY env. Config source matches LLM Team convention. Shape-identical to scenario.ts::generateCloud — same endpoint, same body, same Bearer auth. Cloud path bypasses sidecar entirely (sidecar is local-only by design, mirrors TS agent.ts). Changes: - crates/gateway/src/v1/ollama_cloud.rs (new, 130 LOC) — reqwest client, resolve_cloud_key(), chat() adapter, CloudGenerateBody / CloudGenerateResponse wire shapes - crates/gateway/src/v1/ollama.rs — flatten_messages_public() re-export so sibling adapters reuse the shape collapse - crates/gateway/src/v1/mod.rs — provider field on ChatRequest, dispatch match in chat() handler, ollama_cloud_key on V1State - crates/gateway/src/main.rs — resolves cloud key at startup, logs which source provided it - crates/gateway/Cargo.toml — reqwest 0.12 with rustls-tls Verified end-to-end after restart: - provider=ollama → qwen3.5:latest local (~400ms, Phase 38 unchanged) - provider=ollama_cloud + model=gpt-oss:120b → real 225-word technical response in 5.4s, 313 tokens Tests: 9/9 green (7 from Phase 38 + 2 new for cloud body serialization and key resolver shape). Not in this slice: trait extraction (full Phase 39 scope adds ProviderAdapter trait + OpenRouter adapter + fallback chain logic). These land next with Phase 40 routing engine on top. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 02:57:42 -05:00 · 2026-04-22 02:57:42 -05:00 · 42a11d35cd
commit 42a11d35cd
parent 8cbbd0ef70
6 changed files with 243 additions and 3 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -4079,6 +4079,7 @@ dependencies = [
 "opentelemetry_sdk",
 "proto",
 "queryd",
+ "reqwest",
 "serde",
 "serde_json",
 "shared",
--- a/crates/gateway/Cargo.toml
+++ b/crates/gateway/Cargo.toml
@ -28,3 +28,4 @@ opentelemetry-stdout = { workspace = true }
 tracing-opentelemetry = { workspace = true }
 arrow = { workspace = true }
 chrono = { workspace = true }
+reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
--- a/crates/gateway/src/main.rs
+++ b/crates/gateway/src/main.rs
@ -192,6 +192,18 @@ async fn main() {
        .nest("/v1", v1::router(v1::V1State {
            ai_client: ai_client.clone(),
            usage: std::sync::Arc::new(tokio::sync::RwLock::new(v1::Usage::default())),
+            // Phase 39 first slice — Ollama Cloud adapter. Key resolved
+            // from OLLAMA_CLOUD_KEY env, then /root/llm_team_config.json,
+            // then OLLAMA_CLOUD_API_KEY env. None = cloud routes 503.
+            ollama_cloud_key: {
+                let k = v1::ollama_cloud::resolve_cloud_key();
+                if k.is_some() {
+                    tracing::info!("v1: Ollama Cloud key loaded — /v1/chat provider=ollama_cloud enabled");
+                } else {
+                    tracing::warn!("v1: no Ollama Cloud key in env or /root/llm_team_config.json — cloud routes will 503");
+                }
+                k
+            },
        }));

    // Auth middleware (if enabled)
--- a/crates/gateway/src/v1/mod.rs
+++ b/crates/gateway/src/v1/mod.rs
@ -12,6 +12,7 @@
 //! adding the alias is one line in Phase 39 when it matters.

 pub mod ollama;
+pub mod ollama_cloud;

 use axum::{
    Router,
@ -29,6 +30,9 @@ use tokio::sync::RwLock;
 pub struct V1State {
    pub ai_client: aibridge::client::AiClient,
    pub usage: Arc<RwLock<Usage>>,
+    /// Ollama Cloud bearer token. Loaded at startup via
+    /// `ollama_cloud::resolve_cloud_key()`. None = cloud routes 503.
+    pub ollama_cloud_key: Option<String>,
 }

 #[derive(Default, Clone, Serialize)]
@ -76,6 +80,13 @@ pub struct ChatRequest {
    /// overseer / reasoning-heavy path.
    #[serde(default)]
    pub think: Option<bool>,
+    /// Non-OpenAI extension. Selects the provider adapter. Accepted:
+    ///   - None / "ollama" / "local" → local Ollama via sidecar (default)
+    ///   - "ollama_cloud" / "cloud" → Ollama Cloud direct HTTPS
+    /// Phase 40 adds a routing engine that picks this automatically
+    /// from the model name; Phase 38/39 requires explicit selection.
+    #[serde(default)]
+    pub provider: Option<String>,
 }

 #[derive(Serialize)]
@ -115,9 +126,27 @@ async fn chat(
        tracing::warn!("/v1/chat: stream=true requested but Phase 38 returns non-streaming");
    }

-    let resp = ollama::chat(&state.ai_client, &req)
-        .await
-        .map_err(|e| (StatusCode::BAD_GATEWAY, format!("provider: {e}")))?;
+    let provider = req.provider.as_deref().unwrap_or("ollama").to_ascii_lowercase();
+    let resp = match provider.as_str() {
+        "ollama" | "local" | "" => ollama::chat(&state.ai_client, &req)
+            .await
+            .map_err(|e| (StatusCode::BAD_GATEWAY, format!("ollama local: {e}")))?,
+        "ollama_cloud" | "cloud" => {
+            let key = state.ollama_cloud_key.as_deref().ok_or((
+                StatusCode::SERVICE_UNAVAILABLE,
+                "OLLAMA_CLOUD_KEY not configured — set env or populate /root/llm_team_config.json".to_string(),
+            ))?;
+            ollama_cloud::chat(key, &req)
+                .await
+                .map_err(|e| (StatusCode::BAD_GATEWAY, format!("ollama cloud: {e}")))?
+        }
+        other => {
+            return Err((
+                StatusCode::BAD_REQUEST,
+                format!("unknown provider '{other}' — supported: ollama, ollama_cloud"),
+            ));
+        }
+    };

    {
        let mut u = state.usage.write().await;
--- a/crates/gateway/src/v1/ollama.rs
+++ b/crates/gateway/src/v1/ollama.rs
@ -74,6 +74,14 @@ pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result<ChatResponse,
    })
 }

+/// Public re-export of the flattener so sibling adapters (Ollama Cloud,
+/// future OpenRouter) can reuse the same shape collapse without
+/// duplicating the logic. Keeps `(system, prompt)` format consistent
+/// across providers.
+pub fn flatten_messages_public(messages: &[Message]) -> (String, String) {
+    flatten_messages(messages)
+}
+
 /// Collapse a message array into (system, prompt). Multiple system
 /// messages concatenate with a newline — matches OpenAI's documented
 /// behavior. Non-system messages become role-labeled turns.
--- a/crates/gateway/src/v1/ollama_cloud.rs
+++ b/crates/gateway/src/v1/ollama_cloud.rs
@ -0,0 +1,189 @@
+//! Phase 39 (first slice) — Ollama Cloud adapter.
+//!
+//! Direct HTTPS call to `https://ollama.com/api/generate` with Bearer
+//! auth. Mirrors the `tests/multi-agent/agent.ts::generateCloud` pattern
+//! exactly (same endpoint, same body shape, same Bearer header), so
+//! cloud calls from Rust behave identically to the TS hot path.
+//!
+//! Key sourcing priority:
+//!   1. Env var `OLLAMA_CLOUD_KEY` (matches agent.ts convention)
+//!   2. `/root/llm_team_config.json` → providers.ollama_cloud.api_key
+//!   3. Env var `OLLAMA_CLOUD_API_KEY` (LLM Team UI convention)
+//!
+//! First hit wins. Key is loaded once at gateway startup by
+//! `resolve_cloud_key()` and stored on `V1State.ollama_cloud_key`.
+
+use std::time::Duration;
+use serde::{Deserialize, Serialize};
+
+use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
+
+const CLOUD_BASE_URL: &str = "https://ollama.com";
+const CLOUD_TIMEOUT_SECS: u64 = 180;
+
+/// Read the Ollama Cloud key from the three sanctioned sources. Returns
+/// None if none is set — callers must 503 rather than attempt a call.
+pub fn resolve_cloud_key() -> Option<String> {
+    if let Ok(k) = std::env::var("OLLAMA_CLOUD_KEY") {
+        if !k.trim().is_empty() { return Some(k.trim().to_string()); }
+    }
+    if let Ok(raw) = std::fs::read_to_string("/root/llm_team_config.json") {
+        if let Ok(v) = serde_json::from_str::<serde_json::Value>(&raw) {
+            if let Some(k) = v.pointer("/providers/ollama_cloud/api_key").and_then(|x| x.as_str()) {
+                if !k.trim().is_empty() { return Some(k.trim().to_string()); }
+            }
+        }
+    }
+    if let Ok(k) = std::env::var("OLLAMA_CLOUD_API_KEY") {
+        if !k.trim().is_empty() { return Some(k.trim().to_string()); }
+    }
+    None
+}
+
+pub async fn chat(
+    key: &str,
+    req: &ChatRequest,
+) -> Result<ChatResponse, String> {
+    let (system, prompt) = super::ollama::flatten_messages_public(&req.messages);
+
+    let body = CloudGenerateBody {
+        model: req.model.clone(),
+        prompt,
+        system: if system.is_empty() { None } else { Some(system) },
+        stream: false,
+        think: Some(req.think.unwrap_or(false)),
+        options: CloudOptions {
+            // Thinking cloud models need headroom — floor 400 to give
+            // qwen3.5:397b / gpt-oss:120b reasoning room. Matches
+            // agent.ts `Math.max(opts.max_tokens ?? 800, 400)` policy.
+            num_predict: std::cmp::max(req.max_tokens.unwrap_or(800), 400),
+            temperature: req.temperature.unwrap_or(0.3),
+        },
+    };
+
+    let client = reqwest::Client::builder()
+        .timeout(Duration::from_secs(CLOUD_TIMEOUT_SECS))
+        .build()
+        .map_err(|e| format!("build client: {e}"))?;
+
+    let t0 = std::time::Instant::now();
+    let resp = client
+        .post(format!("{}/api/generate", CLOUD_BASE_URL))
+        .bearer_auth(key)
+        .json(&body)
+        .send()
+        .await
+        .map_err(|e| format!("ollama.com unreachable: {e}"))?;
+
+    let status = resp.status();
+    if !status.is_success() {
+        let body = resp.text().await.unwrap_or_else(|_| "?".into());
+        return Err(format!("ollama.com {}: {}", status, body));
+    }
+
+    let parsed: CloudGenerateResponse = resp.json().await
+        .map_err(|e| format!("invalid cloud response: {e}"))?;
+
+    let latency_ms = t0.elapsed().as_millis();
+    let text = parsed.response.unwrap_or_default();
+
+    let prompt_tokens = parsed.prompt_eval_count.unwrap_or_else(|| {
+        let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
+        ((chars + 3) / 4) as u32
+    });
+    let completion_tokens = parsed.eval_count.unwrap_or_else(|| {
+        ((text.chars().count() + 3) / 4) as u32
+    });
+
+    tracing::info!(
+        target: "v1.chat",
+        provider = "ollama_cloud",
+        model = %req.model,
+        prompt_tokens,
+        completion_tokens,
+        latency_ms = latency_ms as u64,
+        "cloud chat completed",
+    );
+
+    Ok(ChatResponse {
+        id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
+        object: "chat.completion",
+        created: chrono::Utc::now().timestamp(),
+        model: parsed.model.unwrap_or_else(|| req.model.clone()),
+        choices: vec![Choice {
+            index: 0,
+            message: Message { role: "assistant".into(), content: text },
+            finish_reason: "stop".into(),
+        }],
+        usage: UsageBlock {
+            prompt_tokens,
+            completion_tokens,
+            total_tokens: prompt_tokens + completion_tokens,
+        },
+    })
+}
+
+// -- Ollama /api/generate wire shapes --
+
+#[derive(Serialize)]
+struct CloudGenerateBody {
+    model: String,
+    prompt: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    system: Option<String>,
+    stream: bool,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    think: Option<bool>,
+    options: CloudOptions,
+}
+
+#[derive(Serialize)]
+struct CloudOptions {
+    num_predict: u32,
+    temperature: f64,
+}
+
+#[derive(Deserialize)]
+struct CloudGenerateResponse {
+    #[serde(default)]
+    response: Option<String>,
+    #[serde(default)]
+    model: Option<String>,
+    #[serde(default)]
+    prompt_eval_count: Option<u32>,
+    #[serde(default)]
+    eval_count: Option<u32>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn resolve_cloud_key_returns_none_when_no_sources_set() {
+        // Only check shape — we can't reliably unset env vars in a test
+        // that runs alongside others, and the file path is on disk.
+        // If all three sources are empty the function returns None; if
+        // any are set we expect Some. This just smoke-tests the call
+        // doesn't panic.
+        let _ = resolve_cloud_key();
+    }
+
+    #[test]
+    fn cloud_body_serializes_compact() {
+        let body = CloudGenerateBody {
+            model: "gpt-oss:120b".into(),
+            prompt: "user: hi\n\nassistant:".into(),
+            system: Some("Be terse.".into()),
+            stream: false,
+            think: Some(false),
+            options: CloudOptions { num_predict: 400, temperature: 0.3 },
+        };
+        let json = serde_json::to_string(&body).unwrap();
+        assert!(json.contains("\"model\":\"gpt-oss:120b\""));
+        assert!(json.contains("\"stream\":false"));
+        assert!(json.contains("\"num_predict\":400"));
+        assert!(json.contains("\"think\":false"));
+        assert!(json.contains("\"system\":\"Be terse.\""));
+    }
+}