//! Phase 38 — Universal API skeleton (`/v1/*`).
//!
//! OpenAI-compatible shape on top of the existing aibridge → Ollama
//! path. This is the thin slice: single provider, stateless, no
//! streaming. Phase 39 replaces the direct Ollama call with a
//! `ProviderAdapter` trait dispatch; Phase 40 adds routing + fallback.
//!
//! The shape matches OpenAI's `/v1/chat/completions` closely enough
//! that clients using OpenAI-compatible SDKs can point at us with the
//! URL swap alone. We keep the endpoint path `/v1/chat` (not
//! `/v1/chat/completions`) because our PRD declares the terser form;
//! adding the alias is one line in Phase 39 when it matters.

pub mod ollama;
pub mod ollama_cloud;
pub mod openrouter;
pub mod gemini;
pub mod claude;
pub mod langfuse_trace;
pub mod respond;
pub mod truth;

use axum::{
    Router,
    extract::State,
    http::StatusCode,
    response::IntoResponse,
    routing::{get, post},
    Json,
};
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use tokio::sync::RwLock;

#[derive(Clone)]
pub struct V1State {
    pub ai_client: aibridge::client::AiClient,
    pub usage: Arc<RwLock<Usage>>,
    /// Ollama Cloud bearer token. Loaded at startup via
    /// `ollama_cloud::resolve_cloud_key()`. None = cloud routes 503.
    pub ollama_cloud_key: Option<String>,
    /// OpenRouter bearer token — free-tier rescue rung. Loaded at
    /// startup via `openrouter::resolve_openrouter_key()`. None means
    /// provider="openrouter" calls 503 rather than attempt. Same key
    /// sourcing as LLM Team UI so the two share one API quota.
    pub openrouter_key: Option<String>,
    /// Gemini API key (Google Generative Language). Loaded at startup
    /// via `gemini::resolve_gemini_key()`. None = provider="gemini"
    /// calls 503. Phase 40 deliverable.
    pub gemini_key: Option<String>,
    /// Anthropic Claude API key. Loaded at startup via
    /// `claude::resolve_claude_key()`. None = provider="claude" calls
    /// 503. Phase 40 deliverable.
    pub claude_key: Option<String>,
    /// Phase 40 early deliverable — Langfuse client. None = tracing
    /// disabled (keys missing or container unreachable). Traces are
    /// fire-and-forget: never block the response path.
    pub langfuse: Option<langfuse_trace::LangfuseClient>,
}

#[derive(Default, Clone, Serialize)]
pub struct Usage {
    pub requests: u64,
    pub prompt_tokens: u64,
    pub completion_tokens: u64,
    pub total_tokens: u64,
    #[serde(default)]
    pub by_provider: std::collections::HashMap<String, ProviderUsage>,
}

#[derive(Default, Clone, Serialize)]
pub struct ProviderUsage {
    pub requests: u64,
    pub prompt_tokens: u64,
    pub completion_tokens: u64,
    pub total_tokens: u64,
}

pub fn router(state: V1State) -> Router {
    Router::new()
        .route("/chat", post(chat))
        .route("/respond", post(respond::respond))
        .route("/usage", get(usage))
        .route("/sessions", get(sessions))
        .route("/context", get(truth::context))
        .with_state(state)
}
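// Mounting sketch (illustrative only: the actual wiring lives in the
// binary crate, and `app` plus the `v1` module alias are assumed names;
// only the `/v1` prefix itself is fixed by this module's contract):
//
//     let app = axum::Router::new().nest("/v1", v1::router(state));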
// -- Shared types (OpenAI-compatible) --

#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct Message {
    pub role: String,
    pub content: String,
}

#[derive(Deserialize, Debug, Clone)]
pub struct ChatRequest {
    pub model: String,
    pub messages: Vec<Message>,
    #[serde(default)]
    pub temperature: Option<f32>,
    #[serde(default)]
    pub max_tokens: Option<u32>,
    /// Accepted for shape-compat but ignored in the thin slice —
    /// Phase 38 returns non-streaming even when the client asked for
    /// it. Phase 39+ wires real streaming.
    #[serde(default)]
    pub stream: Option<bool>,
    /// Non-OpenAI extension. Passes through to the provider's thinking
    /// toggle. Default: **false** — hot-path discipline for thinking
    /// models (qwen3.5, qwen3, gpt-oss) that otherwise burn the token
    /// budget on hidden reasoning before visible output starts,
    /// producing empty responses. Set true explicitly when calling an
    /// overseer / reasoning-heavy path.
    #[serde(default)]
    pub think: Option<bool>,
    /// Non-OpenAI extension. Selects the provider adapter. Accepted:
    /// - None / "ollama" / "local" → local Ollama via sidecar (default)
    /// - "ollama_cloud" / "cloud" → Ollama Cloud direct HTTPS
    /// - "openrouter" / "openrouter_free" → OpenRouter (free-tier rescue rung)
    /// - "gemini" → Google Generative Language API
    /// - "claude" / "anthropic" → Anthropic Messages API
    ///
    /// Phase 40 adds a routing engine that picks this automatically
    /// from the model name; Phase 38/39 requires explicit selection.
    #[serde(default)]
    pub provider: Option<String>,
}

#[derive(Serialize)]
pub struct ChatResponse {
    pub id: String,
    pub object: &'static str,
    pub created: i64,
    pub model: String,
    pub choices: Vec<Choice>,
    pub usage: UsageBlock,
}

#[derive(Serialize)]
pub struct Choice {
    pub index: u32,
    pub message: Message,
    pub finish_reason: String,
}

#[derive(Serialize, Deserialize, Clone)]
pub struct UsageBlock {
    pub prompt_tokens: u32,
    pub completion_tokens: u32,
    pub total_tokens: u32,
}

// -- Handlers --

/// Phase 39: resolve (provider, effective_model) from a ChatRequest.
///
/// Explicit `req.provider` wins. If absent, infer from a model-name
/// prefix: "openrouter/..." → openrouter, "cloud/..." → ollama_cloud,
/// "gemini/..." → gemini, "claude/..." → claude (all strip the
/// prefix). Bare names default to "ollama".
///
/// The stripped model is what the upstream adapter expects:
/// OpenRouter's API wants "openai/gpt-4o-mini", not
/// "openrouter/openai/gpt-4o-mini".
fn resolve_provider(req: &ChatRequest) -> (String, String) {
    if let Some(p) = req.provider.as_deref() {
        return (p.to_ascii_lowercase(), req.model.clone());
    }
    if let Some(rest) = req.model.strip_prefix("openrouter/") {
        return ("openrouter".to_string(), rest.to_string());
    }
    if let Some(rest) = req.model.strip_prefix("cloud/") {
        return ("ollama_cloud".to_string(), rest.to_string());
    }
    if let Some(rest) = req.model.strip_prefix("gemini/") {
        return ("gemini".to_string(), rest.to_string());
    }
    if let Some(rest) = req.model.strip_prefix("claude/") {
        return ("claude".to_string(), rest.to_string());
    }
    ("ollama".to_string(), req.model.clone())
}

#[cfg(test)]
mod resolve_provider_tests {
    use super::*;

    fn mk_req(provider: Option<&str>, model: &str) -> ChatRequest {
        ChatRequest {
            model: model.to_string(),
            messages: vec![],
            temperature: None,
            max_tokens: None,
            stream: None,
            think: None,
            provider: provider.map(|s| s.to_string()),
        }
    }

    #[test]
    fn explicit_provider_wins() {
        let r = mk_req(Some("openrouter"), "qwen3.5:latest");
        assert_eq!(resolve_provider(&r), ("openrouter".into(), "qwen3.5:latest".into()));
    }

    #[test]
    fn bare_model_defaults_to_ollama() {
        let r = mk_req(None, "qwen3.5:latest");
        assert_eq!(resolve_provider(&r), ("ollama".into(), "qwen3.5:latest".into()));
    }

    #[test]
    fn openrouter_prefix_infers_and_strips() {
        let r = mk_req(None, "openrouter/openai/gpt-4o-mini");
        assert_eq!(resolve_provider(&r), ("openrouter".into(), "openai/gpt-4o-mini".into()));
    }

    #[test]
    fn cloud_prefix_infers_and_strips() {
        let r = mk_req(None, "cloud/kimi-k2:1t");
        assert_eq!(resolve_provider(&r), ("ollama_cloud".into(), "kimi-k2:1t".into()));
    }
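    // Added check; follows directly from the fall-through above. A
    // slash in the model name is not enough to reroute: only the known
    // prefixes are inferred, anything else stays local. The vendor
    // name below is a placeholder.
    #[test]
    fn unknown_prefix_defaults_to_ollama() {
        let r = mk_req(None, "somevendor/some-model");
        assert_eq!(resolve_provider(&r), ("ollama".into(), "somevendor/some-model".into()));
    }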
let r = mk_req(Some("openrouter"), "openrouter/openai/gpt-4o-mini"); assert_eq!(resolve_provider(&r), ("openrouter".into(), "openrouter/openai/gpt-4o-mini".into())); } #[test] fn gemini_prefix_infers_and_strips() { let r = mk_req(None, "gemini/gemini-2.0-flash"); assert_eq!(resolve_provider(&r), ("gemini".into(), "gemini-2.0-flash".into())); } #[test] fn claude_prefix_infers_and_strips() { let r = mk_req(None, "claude/claude-3-5-sonnet-latest"); assert_eq!(resolve_provider(&r), ("claude".into(), "claude-3-5-sonnet-latest".into())); } } async fn chat( State(state): State, Json(req): Json, ) -> Result, (StatusCode, String)> { if req.messages.is_empty() { return Err((StatusCode::BAD_REQUEST, "messages must be non-empty".into())); } if req.stream.unwrap_or(false) { tracing::warn!("/v1/chat: stream=true requested but Phase 38 returns non-streaming"); } // Provider resolution: explicit `req.provider` wins; otherwise // infer from a model-name prefix. Phase 39 PRD gate example: // `model: "openrouter/openai/gpt-4o-mini"` → provider "openrouter", // adapter gets the stripped "openai/gpt-4o-mini". let (provider, effective_model) = resolve_provider(&req); let start_time = chrono::Utc::now(); let start_instant = std::time::Instant::now(); // If we stripped a prefix, clone req with the effective model so // the adapter sees what the upstream provider expects (OpenRouter // wants "openai/gpt-4o-mini", not "openrouter/openai/gpt-4o-mini"). let req_for_adapter: std::borrow::Cow<'_, ChatRequest> = if effective_model == req.model { std::borrow::Cow::Borrowed(&req) } else { let mut cloned = req.clone(); cloned.model = effective_model.clone(); std::borrow::Cow::Owned(cloned) }; let (resp, used_provider) = match provider.as_str() { "ollama" | "local" | "" => { let r = ollama::chat(&state.ai_client, &*req_for_adapter) .await .map_err(|e| (StatusCode::BAD_GATEWAY, format!("ollama local: {e}")))?; (r, "ollama".to_string()) } "ollama_cloud" | "cloud" => { let key = state.ollama_cloud_key.as_deref().ok_or(( StatusCode::SERVICE_UNAVAILABLE, "OLLAMA_CLOUD_KEY not configured".to_string(), ))?; let r = ollama_cloud::chat(key, &*req_for_adapter) .await .map_err(|e| (StatusCode::BAD_GATEWAY, format!("ollama cloud: {e}")))?; (r, "ollama_cloud".to_string()) } "openrouter" | "openrouter_free" => { // Free-tier rescue rung. Added 2026-04-24 after iter 5 // repeated Ollama Cloud 502s on kimi-k2:1t — OpenRouter // gives a different provider backbone as fallback. let key = state.openrouter_key.as_deref().ok_or(( StatusCode::SERVICE_UNAVAILABLE, "OPENROUTER_API_KEY not configured".to_string(), ))?; let r = openrouter::chat(key, &*req_for_adapter) .await .map_err(|e| (StatusCode::BAD_GATEWAY, format!("openrouter: {e}")))?; (r, "openrouter".to_string()) } "gemini" => { // Phase 40 provider adapter. Google Generative Language // API via query-string key auth (not bearer). let key = state.gemini_key.as_deref().ok_or(( StatusCode::SERVICE_UNAVAILABLE, "GEMINI_API_KEY not configured".to_string(), ))?; let r = gemini::chat(key, &*req_for_adapter) .await .map_err(|e| (StatusCode::BAD_GATEWAY, format!("gemini: {e}")))?; (r, "gemini".to_string()) } "claude" | "anthropic" => { // Phase 40 provider adapter. Anthropic Messages API via // x-api-key header + anthropic-version:2023-06-01. 
async fn chat(
    State(state): State<V1State>,
    Json(req): Json<ChatRequest>,
) -> Result<Json<ChatResponse>, (StatusCode, String)> {
    if req.messages.is_empty() {
        return Err((StatusCode::BAD_REQUEST, "messages must be non-empty".into()));
    }
    if req.stream.unwrap_or(false) {
        tracing::warn!("/v1/chat: stream=true requested but Phase 38 returns non-streaming");
    }

    // Provider resolution: explicit `req.provider` wins; otherwise
    // infer from a model-name prefix. Phase 39 PRD gate example:
    // `model: "openrouter/openai/gpt-4o-mini"` → provider "openrouter",
    // adapter gets the stripped "openai/gpt-4o-mini".
    let (provider, effective_model) = resolve_provider(&req);

    let start_time = chrono::Utc::now();
    let start_instant = std::time::Instant::now();

    // If we stripped a prefix, clone req with the effective model so
    // the adapter sees what the upstream provider expects (OpenRouter
    // wants "openai/gpt-4o-mini", not "openrouter/openai/gpt-4o-mini").
    let req_for_adapter: std::borrow::Cow<'_, ChatRequest> = if effective_model == req.model {
        std::borrow::Cow::Borrowed(&req)
    } else {
        let mut cloned = req.clone();
        cloned.model = effective_model.clone();
        std::borrow::Cow::Owned(cloned)
    };

    let (resp, used_provider) = match provider.as_str() {
        "ollama" | "local" | "" => {
            let r = ollama::chat(&state.ai_client, &*req_for_adapter)
                .await
                .map_err(|e| (StatusCode::BAD_GATEWAY, format!("ollama local: {e}")))?;
            (r, "ollama".to_string())
        }
        "ollama_cloud" | "cloud" => {
            let key = state.ollama_cloud_key.as_deref().ok_or((
                StatusCode::SERVICE_UNAVAILABLE,
                "OLLAMA_CLOUD_KEY not configured".to_string(),
            ))?;
            let r = ollama_cloud::chat(key, &*req_for_adapter)
                .await
                .map_err(|e| (StatusCode::BAD_GATEWAY, format!("ollama cloud: {e}")))?;
            (r, "ollama_cloud".to_string())
        }
        "openrouter" | "openrouter_free" => {
            // Free-tier rescue rung. Added 2026-04-24 after iter 5
            // repeated Ollama Cloud 502s on kimi-k2:1t — OpenRouter
            // gives a different provider backbone as fallback.
            let key = state.openrouter_key.as_deref().ok_or((
                StatusCode::SERVICE_UNAVAILABLE,
                "OPENROUTER_API_KEY not configured".to_string(),
            ))?;
            let r = openrouter::chat(key, &*req_for_adapter)
                .await
                .map_err(|e| (StatusCode::BAD_GATEWAY, format!("openrouter: {e}")))?;
            (r, "openrouter".to_string())
        }
        "gemini" => {
            // Phase 40 provider adapter. Google Generative Language
            // API via query-string key auth (not bearer).
            let key = state.gemini_key.as_deref().ok_or((
                StatusCode::SERVICE_UNAVAILABLE,
                "GEMINI_API_KEY not configured".to_string(),
            ))?;
            let r = gemini::chat(key, &*req_for_adapter)
                .await
                .map_err(|e| (StatusCode::BAD_GATEWAY, format!("gemini: {e}")))?;
            (r, "gemini".to_string())
        }
        "claude" | "anthropic" => {
            // Phase 40 provider adapter. Anthropic Messages API via
            // x-api-key header + anthropic-version:2023-06-01.
            let key = state.claude_key.as_deref().ok_or((
                StatusCode::SERVICE_UNAVAILABLE,
                "ANTHROPIC_API_KEY not configured".to_string(),
            ))?;
            let r = claude::chat(key, &*req_for_adapter)
                .await
                .map_err(|e| (StatusCode::BAD_GATEWAY, format!("claude: {e}")))?;
            (r, "claude".to_string())
        }
        other => {
            return Err((
                StatusCode::BAD_REQUEST,
                format!("unknown provider '{other}' — supported: ollama, ollama_cloud, openrouter, gemini, claude"),
            ));
        }
    };

    let end_time = chrono::Utc::now();
    let latency_ms = start_instant.elapsed().as_millis() as u64;

    // Phase 40 — emit Langfuse trace. Fire-and-forget: the clone is
    // cheap (Arc inside), the tokio::spawn never blocks us, a dead
    // Langfuse just logs a warn. Client-visible response latency is
    // untouched.
    if let Some(lf) = &state.langfuse {
        let output = resp
            .choices
            .first()
            .map(|c| c.message.content.clone())
            .unwrap_or_default();
        lf.emit_chat(langfuse_trace::ChatTrace {
            provider: used_provider.clone(),
            model: resp.model.clone(),
            input: req.messages.clone(),
            output,
            prompt_tokens: resp.usage.prompt_tokens,
            completion_tokens: resp.usage.completion_tokens,
            temperature: req.temperature,
            max_tokens: req.max_tokens,
            think: req.think,
            start_time: start_time.to_rfc3339(),
            end_time: end_time.to_rfc3339(),
            latency_ms,
        });
    }

    // Phase 40: per-provider usage tracking
    {
        let mut u = state.usage.write().await;
        u.requests += 1;
        u.prompt_tokens += resp.usage.prompt_tokens as u64;
        u.completion_tokens += resp.usage.completion_tokens as u64;
        u.total_tokens += resp.usage.total_tokens as u64;
        let provider_usage = u.by_provider.entry(used_provider).or_default();
        provider_usage.requests += 1;
        provider_usage.prompt_tokens += resp.usage.prompt_tokens as u64;
        provider_usage.completion_tokens += resp.usage.completion_tokens as u64;
        provider_usage.total_tokens += resp.usage.total_tokens as u64;
    }

    Ok(Json(resp))
}

async fn usage(State(state): State<V1State>) -> impl IntoResponse {
    let snapshot = state.usage.read().await.clone();
    Json(snapshot)
}

// Phase 38 is stateless — no session persistence yet. Return an empty
// list in OpenAI-ish shape so clients that probe this endpoint don't
// 404. Real session state lands in Phase 41 with the profile-system
// expansion.
async fn sessions() -> impl IntoResponse {
    Json(serde_json::json!({
        "data": [],
        "object": "list",
        "note": "Phase 38: stateless. Session state lands in Phase 41.",
    }))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn chat_request_parses_openai_shape() {
        let raw = r#"{
            "model": "qwen3.5:latest",
            "messages": [
                {"role": "system", "content": "You are helpful."},
                {"role": "user", "content": "Hi"}
            ],
            "temperature": 0.2,
            "max_tokens": 100
        }"#;
        let r: ChatRequest = serde_json::from_str(raw).unwrap();
        assert_eq!(r.model, "qwen3.5:latest");
        assert_eq!(r.messages.len(), 2);
        assert_eq!(r.messages[0].role, "system");
        assert_eq!(r.messages[1].content, "Hi");
        assert_eq!(r.temperature, Some(0.2));
        assert_eq!(r.max_tokens, Some(100));
    }

    #[test]
    fn chat_request_accepts_minimal() {
        let raw = r#"{
            "model": "any",
            "messages": [{"role": "user", "content": "hi"}]
        }"#;
        let r: ChatRequest = serde_json::from_str(raw).unwrap();
        assert_eq!(r.temperature, None);
        assert_eq!(r.max_tokens, None);
        assert_eq!(r.stream, None);
    }

    #[test]
    fn usage_counter_default_is_zero() {
        let u = Usage::default();
        assert_eq!(u.requests, 0);
        assert_eq!(u.total_tokens, 0);
    }
}
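
// A small extra check on the wire shape: the non-OpenAI extension
// fields (`stream`, `think`, `provider`) must all deserialize, since
// the handler and `resolve_provider` read them. Values below are
// illustrative.
#[cfg(test)]
mod extension_field_tests {
    use super::*;

    #[test]
    fn chat_request_parses_extension_fields() {
        let raw = r#"{
            "model": "kimi-k2:1t",
            "messages": [{"role": "user", "content": "hi"}],
            "stream": true,
            "think": false,
            "provider": "ollama_cloud"
        }"#;
        let r: ChatRequest = serde_json::from_str(raw).unwrap();
        assert_eq!(r.stream, Some(true));
        assert_eq!(r.think, Some(false));
        assert_eq!(r.provider.as_deref(), Some("ollama_cloud"));

        // Explicit provider wins in resolution; model kept verbatim.
        assert_eq!(resolve_provider(&r), ("ollama_cloud".into(), "kimi-k2:1t".into()));
    }
}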