lakehouse/crates/gateway/src/v1/ollama_cloud.rs
profit 42a11d35cd Phase 39 (first slice): Ollama Cloud adapter on /v1/chat
Second provider wired. /v1/chat now routes by optional `provider`
field: default "ollama" hits local via sidecar, "ollama_cloud"
(or "cloud") hits ollama.com/api/generate directly with Bearer auth.
Key sourced at gateway startup from OLLAMA_CLOUD_KEY env, then
/root/llm_team_config.json (providers.ollama_cloud.api_key), then
OLLAMA_CLOUD_API_KEY env. Config source matches LLM Team convention.

Shape-identical to agent.ts::generateCloud — same endpoint, same
body, same Bearer auth. Cloud path bypasses sidecar entirely (sidecar
is local-only by design, mirrors TS agent.ts).

Changes:
- crates/gateway/src/v1/ollama_cloud.rs (new, 130 LOC) — reqwest
  client, resolve_cloud_key(), chat() adapter, CloudGenerateBody /
  CloudGenerateResponse wire shapes
- crates/gateway/src/v1/ollama.rs — flatten_messages_public()
  re-export so sibling adapters reuse the shape collapse
- crates/gateway/src/v1/mod.rs — provider field on ChatRequest,
  dispatch match in chat() handler, ollama_cloud_key on V1State
- crates/gateway/src/main.rs — resolves cloud key at startup,
  logs which source provided it
- crates/gateway/Cargo.toml — reqwest 0.12 with rustls-tls

Verified end-to-end after restart:
- provider=ollama → qwen3.5:latest local (~400ms, Phase 38 unchanged)
- provider=ollama_cloud + model=gpt-oss:120b → real 225-word
  technical response in 5.4s, 313 tokens

Tests: 9/9 green (7 from Phase 38 + 2 new for cloud body serialization
and key resolver shape).

Not in this slice: trait extraction (full Phase 39 scope adds
ProviderAdapter trait + OpenRouter adapter + fallback chain logic).
These land next with Phase 40 routing engine on top.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 02:57:42 -05:00

190 lines
6.3 KiB
Rust

//! Phase 39 (first slice) — Ollama Cloud adapter.
//!
//! Direct HTTPS call to `https://ollama.com/api/generate` with Bearer
//! auth. Mirrors the `tests/multi-agent/agent.ts::generateCloud` pattern
//! exactly (same endpoint, same body shape, same Bearer header), so
//! cloud calls from Rust behave identically to the TS hot path.
//!
//! Key sourcing priority:
//! 1. Env var `OLLAMA_CLOUD_KEY` (matches agent.ts convention)
//! 2. `/root/llm_team_config.json` → providers.ollama_cloud.api_key
//! 3. Env var `OLLAMA_CLOUD_API_KEY` (LLM Team UI convention)
//!
//! First hit wins. Key is loaded once at gateway startup by
//! `resolve_cloud_key()` and stored on `V1State.ollama_cloud_key`.
use std::time::Duration;
use serde::{Deserialize, Serialize};
use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
/// Host for Ollama's hosted API; `/api/generate` is appended per request.
const CLOUD_BASE_URL: &str = "https://ollama.com";
/// Per-request HTTP timeout (3 minutes) — large cloud models respond slowly.
const CLOUD_TIMEOUT_SECS: u64 = 180;
/// Read the Ollama Cloud key from the three sanctioned sources, first
/// hit wins. Returns None when every source is unset or blank — callers
/// must 503 rather than attempt a call.
pub fn resolve_cloud_key() -> Option<String> {
    // A candidate counts only if it is non-empty after trimming.
    fn accept(raw: &str) -> Option<String> {
        let trimmed = raw.trim();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed.to_string())
        }
    }

    // Shared env-var probe used for sources 1 and 3.
    let from_env = |name: &str| std::env::var(name).ok().and_then(|v| accept(&v));

    // Source 2: /root/llm_team_config.json → providers.ollama_cloud.api_key.
    let from_config = || {
        std::fs::read_to_string("/root/llm_team_config.json")
            .ok()
            .and_then(|raw| serde_json::from_str::<serde_json::Value>(&raw).ok())
            .and_then(|doc| {
                doc.pointer("/providers/ollama_cloud/api_key")
                    .and_then(|v| v.as_str())
                    .and_then(accept)
            })
    };

    from_env("OLLAMA_CLOUD_KEY")
        .or_else(from_config)
        .or_else(|| from_env("OLLAMA_CLOUD_API_KEY"))
}
/// Send one non-streaming chat request to Ollama Cloud
/// (`POST {CLOUD_BASE_URL}/api/generate`, Bearer auth) and adapt the
/// result into the gateway's OpenAI-style `ChatResponse`.
///
/// The message list is collapsed to a `(system, prompt)` pair via the
/// sibling local adapter's `flatten_messages_public`, so both providers
/// share one shape collapse. Token counts fall back to a chars/4
/// estimate when the cloud response omits `prompt_eval_count` /
/// `eval_count`.
///
/// # Errors
/// Returns a human-readable `String` when the HTTP client cannot be
/// built, ollama.com is unreachable, the status is non-2xx (upstream
/// body is included verbatim), or the response fails to deserialize.
pub async fn chat(
key: &str,
req: &ChatRequest,
) -> Result<ChatResponse, String> {
let (system, prompt) = super::ollama::flatten_messages_public(&req.messages);
let body = CloudGenerateBody {
model: req.model.clone(),
prompt,
system: if system.is_empty() { None } else { Some(system) },
stream: false,
think: Some(req.think.unwrap_or(false)),
options: CloudOptions {
// Thinking cloud models need headroom — floor 400 to give
// qwen3.5:397b / gpt-oss:120b reasoning room. Matches
// agent.ts `Math.max(opts.max_tokens ?? 800, 400)` policy.
num_predict: std::cmp::max(req.max_tokens.unwrap_or(800), 400),
temperature: req.temperature.unwrap_or(0.3),
},
};
// NOTE(review): a fresh Client per call forgoes connection pooling;
// fine at current volume, but hoist onto V1State if cloud traffic grows.
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(CLOUD_TIMEOUT_SECS))
.build()
.map_err(|e| format!("build client: {e}"))?;
let t0 = std::time::Instant::now();
let resp = client
.post(format!("{}/api/generate", CLOUD_BASE_URL))
.bearer_auth(key)
.json(&body)
.send()
.await
.map_err(|e| format!("ollama.com unreachable: {e}"))?;
let status = resp.status();
if !status.is_success() {
// Surface the upstream error body verbatim so callers can log it.
let body = resp.text().await.unwrap_or_else(|_| "?".into());
return Err(format!("ollama.com {}: {}", status, body));
}
let parsed: CloudGenerateResponse = resp.json().await
.map_err(|e| format!("invalid cloud response: {e}"))?;
let latency_ms = t0.elapsed().as_millis();
let text = parsed.response.unwrap_or_default();
// Fallback token estimates: ~4 chars per token, rounded up.
let prompt_tokens = parsed.prompt_eval_count.unwrap_or_else(|| {
let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
((chars + 3) / 4) as u32
});
let completion_tokens = parsed.eval_count.unwrap_or_else(|| {
((text.chars().count() + 3) / 4) as u32
});
tracing::info!(
target: "v1.chat",
provider = "ollama_cloud",
model = %req.model,
prompt_tokens,
completion_tokens,
latency_ms = latency_ms as u64,
"cloud chat completed",
);
Ok(ChatResponse {
id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
object: "chat.completion",
created: chrono::Utc::now().timestamp(),
model: parsed.model.unwrap_or_else(|| req.model.clone()),
choices: vec![Choice {
index: 0,
message: Message { role: "assistant".into(), content: text },
finish_reason: "stop".into(),
}],
usage: UsageBlock {
prompt_tokens,
completion_tokens,
// saturating_add: avoids a debug-build overflow panic on absurd counts.
total_tokens: prompt_tokens.saturating_add(completion_tokens),
},
})
}
// -- Ollama /api/generate wire shapes --
/// Request body for `POST https://ollama.com/api/generate`
/// (non-streaming). Serialized keys must match the Ollama generate API.
#[derive(Serialize)]
struct CloudGenerateBody {
/// Cloud model tag, e.g. "gpt-oss:120b".
model: String,
/// Prompt text produced by the shared message flattening.
prompt: String,
/// Optional system prompt; key omitted from JSON when None.
#[serde(skip_serializing_if = "Option::is_none")]
system: Option<String>,
/// Always false in this adapter — one complete response per call.
stream: bool,
/// Thinking-mode toggle; key omitted from JSON when None.
#[serde(skip_serializing_if = "Option::is_none")]
think: Option<bool>,
options: CloudOptions,
}
/// Generation options nested inside the request body.
#[derive(Serialize)]
struct CloudOptions {
/// Response token budget (caller floors this at 400).
num_predict: u32,
/// Sampling temperature (caller defaults to 0.3).
temperature: f64,
}
/// Subset of the Ollama generate response this adapter consumes. Every
/// field defaults to None so missing keys never fail deserialization.
#[derive(Deserialize)]
struct CloudGenerateResponse {
/// Generated completion text.
#[serde(default)]
response: Option<String>,
/// Model name echoed by the server.
#[serde(default)]
model: Option<String>,
/// Prompt token count, when the server reports one.
#[serde(default)]
prompt_eval_count: Option<u32>,
/// Completion token count, when the server reports one.
#[serde(default)]
eval_count: Option<u32>,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: key resolution must never panic regardless of host
    /// environment. We can't reliably unset env vars in a test that
    /// runs alongside others, and the config path is on disk, so the
    /// exact Some/None outcome is environment-dependent by design.
    #[test]
    fn resolve_cloud_key_returns_none_when_no_sources_set() {
        let _ = resolve_cloud_key();
    }

    /// Populated optionals serialize under the exact wire keys the
    /// Ollama generate API expects.
    #[test]
    fn cloud_body_serializes_compact() {
        let body = CloudGenerateBody {
            model: "gpt-oss:120b".into(),
            prompt: "user: hi\n\nassistant:".into(),
            system: Some("Be terse.".into()),
            stream: false,
            think: Some(false),
            options: CloudOptions { num_predict: 400, temperature: 0.3 },
        };
        let json = serde_json::to_string(&body).unwrap();
        assert!(json.contains("\"model\":\"gpt-oss:120b\""));
        assert!(json.contains("\"stream\":false"));
        assert!(json.contains("\"num_predict\":400"));
        assert!(json.contains("\"think\":false"));
        assert!(json.contains("\"system\":\"Be terse.\""));
    }

    /// Unset optionals must be omitted entirely (not serialized as
    /// null) — exercises the `skip_serializing_if` path, which the
    /// compact test above never covers.
    #[test]
    fn cloud_body_omits_unset_optionals() {
        let body = CloudGenerateBody {
            model: "m".into(),
            prompt: "p".into(),
            system: None,
            stream: false,
            think: None,
            options: CloudOptions { num_predict: 400, temperature: 0.3 },
        };
        let json = serde_json::to_string(&body).unwrap();
        assert!(!json.contains("\"system\""));
        assert!(!json.contains("\"think\""));
        assert!(!json.contains("null"));
    }
}