Phase 40 PRD (docs/CONTROL_PLANE_PRD.md:82-83) listed:

- crates/aibridge/src/providers/gemini.rs
- crates/aibridge/src/providers/claude.rs

Neither existed. Landing both now, in gateway/src/v1/ (matches the
existing ollama.rs + openrouter.rs sibling pattern — aibridge's
providers/ is for the adapter *trait* abstractions, v1/ holds the
concrete /v1/chat dispatchers that know the wire format).

gemini.rs:
- POST https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key=<API_KEY>
- Auth: query-string key (not bearer)
- Maps messages → contents+parts (Gemini's wire shape), extracts from
  candidates[0].content.parts[0].text
- 3 tests: key resolution, body serialization (camelCase
  generationConfig + maxOutputTokens), prefix-strip

claude.rs:
- POST https://api.anthropic.com/v1/messages
- Auth: x-api-key header + anthropic-version: 2023-06-01
- Carries system prompt in top-level `system` field (not messages[]).
  Extracts from content[0].text where type=="text"
- 4 tests: key resolution, body serialization with/without system
  field, prefix-strip

v1/mod.rs:
+ V1State.gemini_key + claude_key (Option<String>)
+ resolve_provider() strips "gemini/" and "claude/" prefixes
+ /v1/chat dispatcher handles "gemini" + "claude"/"anthropic"
+ 2 new resolve_provider tests (prefix + strip per adapter)

main.rs:
+ Construct both keys at startup via resolve_*_key() helpers. Missing
  keys log at debug (not warn) since these are optional providers —
  unlike OpenRouter, which is the rescue rung.

Every /v1/chat error path mirrors the existing pattern:
- 503 SERVICE_UNAVAILABLE when the key isn't configured
- 502 BAD_GATEWAY with the provider's error text when the upstream call fails
- Response shape is always the OpenAI-compatible ChatResponse

Workspace warnings still at 0. 9 new tests pass. Pre-existing test
failure `executor_prompt_includes_surfaced_candidates` at
execution_loop/mod.rs:1550 is unrelated (fails on pristine HEAD too —
PR fixture divergence).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
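For orientation, a minimal sketch of the two body translations described above. This is illustrative serde_json code, not the shipped gemini.rs/claude.rs adapters; the helper names are invented, but the field names follow the wire shapes the commit notes document:

    use serde_json::{json, Value};

    // Illustrative only. OpenAI-style (role, content) pairs become Gemini
    // `contents` + `parts`; Gemini's assistant role is spelled "model".
    fn to_gemini_body(messages: &[(String, String)], max_tokens: u32) -> Value {
        let contents: Vec<Value> = messages
            .iter()
            .filter(|(role, _)| role != "system")
            .map(|(role, text)| {
                let role = if role == "assistant" { "model" } else { "user" };
                json!({ "role": role, "parts": [{ "text": text }] })
            })
            .collect();
        // camelCase per the notes above: generationConfig + maxOutputTokens.
        json!({ "contents": contents, "generationConfig": { "maxOutputTokens": max_tokens } })
    }

    // Illustrative only. Anthropic wants the system prompt in a top-level
    // `system` field, never inside messages[].
    fn to_claude_body(messages: &[(String, String)], model: &str, max_tokens: u32) -> Value {
        let system = messages.iter().find(|(r, _)| r == "system").map(|(_, c)| c.clone());
        let rest: Vec<Value> = messages
            .iter()
            .filter(|(r, _)| r != "system")
            .map(|(r, c)| json!({ "role": r, "content": c }))
            .collect();
        let mut body = json!({ "model": model, "max_tokens": max_tokens, "messages": rest });
        if let Some(s) = system {
            body["system"] = json!(s);
        }
        body
    }

The extraction direction (candidates[0].content.parts[0].text for Gemini, content[0].text where type=="text" for Claude) is the mirror image of these mappings.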
//! Phase 38 — Universal API skeleton (`/v1/*`).
//!
//! OpenAI-compatible shape on top of the existing aibridge → Ollama
//! path. This is the thin slice: single provider, stateless, no
//! streaming. Phase 39 replaces the direct Ollama call with a
//! `ProviderAdapter` trait dispatch; Phase 40 adds routing + fallback.
//!
//! The shape matches OpenAI's `/v1/chat/completions` closely enough
//! that clients using openai-compatible SDKs can point at us with the
//! URL swap alone. We keep the endpoint path `/v1/chat` (not
//! `/v1/chat/completions`) because our PRD declares the terser form;
//! adding the alias is one line in Phase 39 when it matters.
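//!
//! A minimal client sketch (illustrative: the host, port, and mounted
//! `/v1` prefix are deployment assumptions, not pinned by this module):
//!
//! ```text
//! curl -s http://localhost:8080/v1/chat \
//!   -H 'content-type: application/json' \
//!   -d '{"model": "gemini/gemini-2.0-flash",
//!        "messages": [{"role": "user", "content": "hi"}]}'
//! ```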

pub mod ollama;
pub mod ollama_cloud;
pub mod openrouter;
pub mod gemini;
pub mod claude;
pub mod langfuse_trace;
pub mod respond;
pub mod truth;

use axum::{
    Router,
    extract::State,
    http::StatusCode,
    response::IntoResponse,
    routing::{get, post},
    Json,
};
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use tokio::sync::RwLock;

#[derive(Clone)]
pub struct V1State {
    pub ai_client: aibridge::client::AiClient,
    pub usage: Arc<RwLock<Usage>>,
    /// Ollama Cloud bearer token. Loaded at startup via
    /// `ollama_cloud::resolve_cloud_key()`. None = cloud routes 503.
    pub ollama_cloud_key: Option<String>,
    /// OpenRouter bearer token — free-tier rescue rung. Loaded at
    /// startup via `openrouter::resolve_openrouter_key()`. None means
    /// provider="openrouter" calls 503 rather than attempt. Same key
    /// sourcing as LLM Team UI so the two share one API quota.
    pub openrouter_key: Option<String>,
    /// Gemini API key (Google Generative Language). Loaded at startup
    /// via `gemini::resolve_gemini_key()`. None = provider="gemini"
    /// calls 503. Phase 40 deliverable.
    pub gemini_key: Option<String>,
    /// Anthropic Claude API key. Loaded at startup via
    /// `claude::resolve_claude_key()`. None = provider="claude" calls
    /// 503. Phase 40 deliverable.
    pub claude_key: Option<String>,
    /// Phase 40 early deliverable — Langfuse client. None = tracing
    /// disabled (keys missing or container unreachable). Traces are
    /// fire-and-forget: never block the response path.
    pub langfuse: Option<langfuse_trace::LangfuseClient>,
}

#[derive(Default, Clone, Serialize)]
pub struct Usage {
    pub requests: u64,
    pub prompt_tokens: u64,
    pub completion_tokens: u64,
    pub total_tokens: u64,
    #[serde(default)]
    pub by_provider: std::collections::HashMap<String, ProviderUsage>,
}

#[derive(Default, Clone, Serialize)]
pub struct ProviderUsage {
    pub requests: u64,
    pub prompt_tokens: u64,
    pub completion_tokens: u64,
    pub total_tokens: u64,
}
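
// A minimal sketch (illustrative addition, not one of the shipped
// tests) pinning down the `/v1/usage` JSON shape these two structs
// serialize to: top-level totals plus a `by_provider` map keyed by
// provider name.
#[cfg(test)]
mod usage_shape_sketch {
    use super::*;

    #[test]
    fn by_provider_nests_per_provider_counters() {
        let mut u = Usage::default();
        u.requests = 1;
        u.by_provider.insert(
            "gemini".to_string(),
            ProviderUsage { requests: 1, ..Default::default() },
        );
        let v = serde_json::to_value(&u).unwrap();
        assert_eq!(v["requests"], 1);
        assert_eq!(v["by_provider"]["gemini"]["requests"], 1);
    }
}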

pub fn router(state: V1State) -> Router {
    Router::new()
        .route("/chat", post(chat))
        .route("/respond", post(respond::respond))
        .route("/usage", get(usage))
        .route("/sessions", get(sessions))
        .route("/context", get(truth::context))
        .with_state(state)
}

// -- Shared types (OpenAI-compatible) --

#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct Message {
    pub role: String,
    pub content: String,
}

#[derive(Deserialize, Debug, Clone)]
pub struct ChatRequest {
    pub model: String,
    pub messages: Vec<Message>,
    #[serde(default)]
    pub temperature: Option<f64>,
    #[serde(default)]
    pub max_tokens: Option<u32>,
    /// Accepted for shape-compat but ignored in the thin slice —
    /// Phase 38 returns non-streaming even when the client asked for it.
    /// Phase 39+ wires real streaming.
    #[serde(default)]
    pub stream: Option<bool>,
    /// Non-OpenAI extension. Passes through to the provider's thinking
    /// toggle. Default: **false** — hot-path discipline for thinking
    /// models (qwen3.5, qwen3, gpt-oss) that otherwise burn the token
    /// budget on hidden reasoning before visible output starts,
    /// producing empty responses. Set true explicitly when calling an
    /// overseer / reasoning-heavy path.
    #[serde(default)]
    pub think: Option<bool>,
    /// Non-OpenAI extension. Selects the provider adapter. Accepted:
    /// - None / "ollama" / "local" → local Ollama via sidecar (default)
    /// - "ollama_cloud" / "cloud" → Ollama Cloud direct HTTPS
    /// - "openrouter" / "openrouter_free" → OpenRouter rescue rung
    /// - "gemini" → Google Generative Language API
    /// - "claude" / "anthropic" → Anthropic Messages API
    /// Phase 40 adds a routing engine that picks this automatically
    /// from the model name; Phase 38/39 requires explicit selection.
    #[serde(default)]
    pub provider: Option<String>,
}

#[derive(Serialize)]
pub struct ChatResponse {
    pub id: String,
    pub object: &'static str,
    pub created: i64,
    pub model: String,
    pub choices: Vec<Choice>,
    pub usage: UsageBlock,
}

#[derive(Serialize)]
pub struct Choice {
    pub index: u32,
    pub message: Message,
    pub finish_reason: String,
}

#[derive(Serialize, Deserialize, Clone)]
pub struct UsageBlock {
    pub prompt_tokens: u32,
    pub completion_tokens: u32,
    pub total_tokens: u32,
}

// -- Handlers --

/// Phase 39: resolve (provider, effective_model) from a ChatRequest.
///
/// Explicit `req.provider` wins. If absent, infer from a model-name
/// prefix: "openrouter/..." → openrouter, "cloud/..." → ollama_cloud,
/// "gemini/..." → gemini, "claude/..." → claude (prefix stripped in
/// each case). Bare names default to "ollama".
///
/// The stripped model is what the upstream adapter expects:
/// OpenRouter's API wants "openai/gpt-4o-mini", not
/// "openrouter/openai/gpt-4o-mini".
fn resolve_provider(req: &ChatRequest) -> (String, String) {
    if let Some(p) = req.provider.as_deref() {
        return (p.to_ascii_lowercase(), req.model.clone());
    }
    if let Some(rest) = req.model.strip_prefix("openrouter/") {
        return ("openrouter".to_string(), rest.to_string());
    }
    if let Some(rest) = req.model.strip_prefix("cloud/") {
        return ("ollama_cloud".to_string(), rest.to_string());
    }
    if let Some(rest) = req.model.strip_prefix("gemini/") {
        return ("gemini".to_string(), rest.to_string());
    }
    if let Some(rest) = req.model.strip_prefix("claude/") {
        return ("claude".to_string(), rest.to_string());
    }
    ("ollama".to_string(), req.model.clone())
}

#[cfg(test)]
mod resolve_provider_tests {
    use super::*;

    fn mk_req(provider: Option<&str>, model: &str) -> ChatRequest {
        ChatRequest {
            model: model.to_string(),
            messages: vec![],
            temperature: None,
            max_tokens: None,
            stream: None,
            think: None,
            provider: provider.map(|s| s.to_string()),
        }
    }

    #[test]
    fn explicit_provider_wins() {
        let r = mk_req(Some("openrouter"), "qwen3.5:latest");
        assert_eq!(resolve_provider(&r), ("openrouter".into(), "qwen3.5:latest".into()));
    }

    #[test]
    fn bare_model_defaults_to_ollama() {
        let r = mk_req(None, "qwen3.5:latest");
        assert_eq!(resolve_provider(&r), ("ollama".into(), "qwen3.5:latest".into()));
    }

    #[test]
    fn openrouter_prefix_infers_and_strips() {
        let r = mk_req(None, "openrouter/openai/gpt-4o-mini");
        assert_eq!(resolve_provider(&r), ("openrouter".into(), "openai/gpt-4o-mini".into()));
    }

    #[test]
    fn cloud_prefix_infers_and_strips() {
        let r = mk_req(None, "cloud/kimi-k2:1t");
        assert_eq!(resolve_provider(&r), ("ollama_cloud".into(), "kimi-k2:1t".into()));
    }

    #[test]
    fn explicit_provider_preserves_full_model_even_with_prefix() {
        // If caller provides both provider and a model with a prefix,
        // trust them — don't strip. The adapter will get the full model
        // string as-is.
        let r = mk_req(Some("openrouter"), "openrouter/openai/gpt-4o-mini");
        assert_eq!(
            resolve_provider(&r),
            ("openrouter".into(), "openrouter/openai/gpt-4o-mini".into())
        );
    }

    #[test]
    fn gemini_prefix_infers_and_strips() {
        let r = mk_req(None, "gemini/gemini-2.0-flash");
        assert_eq!(resolve_provider(&r), ("gemini".into(), "gemini-2.0-flash".into()));
    }

    #[test]
    fn claude_prefix_infers_and_strips() {
        let r = mk_req(None, "claude/claude-3-5-sonnet-latest");
        assert_eq!(resolve_provider(&r), ("claude".into(), "claude-3-5-sonnet-latest".into()));
    }
}

async fn chat(
    State(state): State<V1State>,
    Json(req): Json<ChatRequest>,
) -> Result<Json<ChatResponse>, (StatusCode, String)> {
    if req.messages.is_empty() {
        return Err((StatusCode::BAD_REQUEST, "messages must be non-empty".into()));
    }
    if req.stream.unwrap_or(false) {
        tracing::warn!("/v1/chat: stream=true requested but Phase 38 returns non-streaming");
    }

    // Provider resolution: explicit `req.provider` wins; otherwise
    // infer from a model-name prefix. Phase 39 PRD gate example:
    // `model: "openrouter/openai/gpt-4o-mini"` → provider "openrouter",
    // adapter gets the stripped "openai/gpt-4o-mini".
    let (provider, effective_model) = resolve_provider(&req);
    let start_time = chrono::Utc::now();
    let start_instant = std::time::Instant::now();

    // If we stripped a prefix, clone req with the effective model so
    // the adapter sees what the upstream provider expects (OpenRouter
    // wants "openai/gpt-4o-mini", not "openrouter/openai/gpt-4o-mini").
    let req_for_adapter: std::borrow::Cow<'_, ChatRequest> = if effective_model == req.model {
        std::borrow::Cow::Borrowed(&req)
    } else {
        let mut cloned = req.clone();
        cloned.model = effective_model.clone();
        std::borrow::Cow::Owned(cloned)
    };

    let (resp, used_provider) = match provider.as_str() {
        "ollama" | "local" | "" => {
            let r = ollama::chat(&state.ai_client, &*req_for_adapter)
                .await
                .map_err(|e| (StatusCode::BAD_GATEWAY, format!("ollama local: {e}")))?;
            (r, "ollama".to_string())
        }
        "ollama_cloud" | "cloud" => {
            let key = state.ollama_cloud_key.as_deref().ok_or((
                StatusCode::SERVICE_UNAVAILABLE,
                "OLLAMA_CLOUD_KEY not configured".to_string(),
            ))?;
            let r = ollama_cloud::chat(key, &*req_for_adapter)
                .await
                .map_err(|e| (StatusCode::BAD_GATEWAY, format!("ollama cloud: {e}")))?;
            (r, "ollama_cloud".to_string())
        }
        "openrouter" | "openrouter_free" => {
            // Free-tier rescue rung. Added 2026-04-24 after iter 5
            // repeated Ollama Cloud 502s on kimi-k2:1t — OpenRouter
            // gives a different provider backbone as fallback.
            let key = state.openrouter_key.as_deref().ok_or((
                StatusCode::SERVICE_UNAVAILABLE,
                "OPENROUTER_API_KEY not configured".to_string(),
            ))?;
            let r = openrouter::chat(key, &*req_for_adapter)
                .await
                .map_err(|e| (StatusCode::BAD_GATEWAY, format!("openrouter: {e}")))?;
            (r, "openrouter".to_string())
        }
        "gemini" => {
            // Phase 40 provider adapter. Google Generative Language
            // API via query-string key auth (not bearer).
            let key = state.gemini_key.as_deref().ok_or((
                StatusCode::SERVICE_UNAVAILABLE,
                "GEMINI_API_KEY not configured".to_string(),
            ))?;
            let r = gemini::chat(key, &*req_for_adapter)
                .await
                .map_err(|e| (StatusCode::BAD_GATEWAY, format!("gemini: {e}")))?;
            (r, "gemini".to_string())
        }
        "claude" | "anthropic" => {
            // Phase 40 provider adapter. Anthropic Messages API via
            // x-api-key header + anthropic-version: 2023-06-01.
            let key = state.claude_key.as_deref().ok_or((
                StatusCode::SERVICE_UNAVAILABLE,
                "ANTHROPIC_API_KEY not configured".to_string(),
            ))?;
            let r = claude::chat(key, &*req_for_adapter)
                .await
                .map_err(|e| (StatusCode::BAD_GATEWAY, format!("claude: {e}")))?;
            (r, "claude".to_string())
        }
        other => {
            return Err((
                StatusCode::BAD_REQUEST,
                format!("unknown provider '{other}' — supported: ollama, ollama_cloud, openrouter, gemini, claude"),
            ));
        }
    };

    let end_time = chrono::Utc::now();
    let latency_ms = start_instant.elapsed().as_millis() as u64;

    // Phase 40 — emit Langfuse trace. Fire-and-forget: the clone is
    // cheap (Arc inside), the tokio::spawn never blocks us, a dead
    // Langfuse just logs a warn. Client-visible response latency is
    // untouched.
    if let Some(lf) = &state.langfuse {
        let output = resp
            .choices
            .first()
            .map(|c| c.message.content.clone())
            .unwrap_or_default();
        lf.emit_chat(langfuse_trace::ChatTrace {
            provider: used_provider.clone(),
            model: resp.model.clone(),
            input: req.messages.clone(),
            output,
            prompt_tokens: resp.usage.prompt_tokens,
            completion_tokens: resp.usage.completion_tokens,
            temperature: req.temperature,
            max_tokens: req.max_tokens,
            think: req.think,
            start_time: start_time.to_rfc3339(),
            end_time: end_time.to_rfc3339(),
            latency_ms,
        });
    }

    // Phase 40: per-provider usage tracking
    {
        let mut u = state.usage.write().await;
        u.requests += 1;
        u.prompt_tokens += resp.usage.prompt_tokens as u64;
        u.completion_tokens += resp.usage.completion_tokens as u64;
        u.total_tokens += resp.usage.total_tokens as u64;

        let provider_usage = u.by_provider.entry(used_provider).or_default();
        provider_usage.requests += 1;
        provider_usage.prompt_tokens += resp.usage.prompt_tokens as u64;
        provider_usage.completion_tokens += resp.usage.completion_tokens as u64;
        provider_usage.total_tokens += resp.usage.total_tokens as u64;
    }

    Ok(Json(resp))
}

async fn usage(State(state): State<V1State>) -> impl IntoResponse {
    let snapshot = state.usage.read().await.clone();
    Json(snapshot)
}

// Phase 38 is stateless — no session persistence yet. Return an empty
// list in OpenAI-ish shape so clients that probe this endpoint don't
// 404. Real session state lands in Phase 41 with the profile-system
// expansion.
async fn sessions() -> impl IntoResponse {
    Json(serde_json::json!({
        "data": [],
        "object": "list",
        "note": "Phase 38: stateless. Session state lands in Phase 41.",
    }))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn chat_request_parses_openai_shape() {
        let raw = r#"{
            "model": "qwen3.5:latest",
            "messages": [
                {"role": "system", "content": "You are helpful."},
                {"role": "user", "content": "Hi"}
            ],
            "temperature": 0.2,
            "max_tokens": 100
        }"#;
        let r: ChatRequest = serde_json::from_str(raw).unwrap();
        assert_eq!(r.model, "qwen3.5:latest");
        assert_eq!(r.messages.len(), 2);
        assert_eq!(r.messages[0].role, "system");
        assert_eq!(r.messages[1].content, "Hi");
        assert_eq!(r.temperature, Some(0.2));
        assert_eq!(r.max_tokens, Some(100));
    }

    #[test]
    fn chat_request_accepts_minimal() {
        let raw = r#"{
            "model": "any",
            "messages": [{"role": "user", "content": "hi"}]
        }"#;
        let r: ChatRequest = serde_json::from_str(raw).unwrap();
        assert_eq!(r.temperature, None);
        assert_eq!(r.max_tokens, None);
        assert_eq!(r.stream, None);
    }

    #[test]
    fn usage_counter_default_is_zero() {
        let u = Usage::default();
        assert_eq!(u.requests, 0);
        assert_eq!(u.total_tokens, 0);
    }
}