diff --git a/crates/aibridge/src/client.rs b/crates/aibridge/src/client.rs
index b8b45e9..83382fa 100644
--- a/crates/aibridge/src/client.rs
+++ b/crates/aibridge/src/client.rs
@@ -3,10 +3,26 @@
 use serde::{Deserialize, Serialize};
 use std::time::Duration;
 /// HTTP client for the Python AI sidecar.
+///
+/// `generate()` has two transport modes:
+/// - When `gateway_url` is `None` (default), it posts to
+///   `${base_url}/generate` (sidecar direct).
+/// - When `gateway_url` is `Some(url)`, it posts to
+///   `${url}/v1/chat` with `provider="ollama"` so the call appears
+///   in `/v1/usage` and Langfuse traces.
+///
+/// `embed()`, `rerank()`, and admin methods always go direct to the
+/// sidecar — no `/v1` equivalent yet, no point round-tripping.
+///
+/// Phase 44 part 2 (2026-04-27): the gateway URL is wired in by
+/// callers that want observability (vectord modules); it's left
+/// unset by callers that ARE the gateway internals (avoids self-loops
+/// + redundant hops).
 #[derive(Clone)]
 pub struct AiClient {
     client: Client,
     base_url: String,
+    gateway_url: Option<String>,
 }
 
 // -- Request/Response types --
@@ -86,9 +102,22 @@ impl AiClient {
         Self {
             client,
             base_url: base_url.trim_end_matches('/').to_string(),
+            gateway_url: None,
         }
     }
 
+    /// Same as `new`, but every `generate()` is routed through
+    /// `${gateway_url}/v1/chat` (provider=ollama) for observability.
+    /// Use this for callers OUTSIDE the gateway. Inside the gateway
+    /// itself, prefer `new()` — calling /v1/chat from /v1/chat works
+    /// (no infinite loop, ollama_arm doesn't use AiClient) but adds
+    /// a wasted localhost hop.
+    pub fn new_with_gateway(base_url: &str, gateway_url: &str) -> Self {
+        let mut c = Self::new(base_url);
+        c.gateway_url = Some(gateway_url.trim_end_matches('/').to_string());
+        c
+    }
+
     pub async fn health(&self) -> Result<HealthResponse, String> {
         let resp = self.client
             .get(format!("{}/health", self.base_url))
@@ -114,6 +143,13 @@
     }
 
     pub async fn generate(&self, req: GenerateRequest) -> Result<GenerateResponse, String> {
+        if let Some(gw) = self.gateway_url.as_deref() {
+            return self.generate_via_gateway(gw, req).await;
+        }
+        // Direct-sidecar legacy path. Used by gateway internals (so
+        // ollama_arm can call sidecar without a self-loop) and by
+        // any consumer that wants raw transport without /v1/usage
+        // accounting.
         let resp = self.client
             .post(format!("{}/generate", self.base_url))
             .json(&req)
@@ -128,6 +164,59 @@
         resp.json().await.map_err(|e| format!("generate parse error: {e}"))
     }
 
+    /// Phase 44 part 2: route generate() through the gateway's
+    /// /v1/chat with provider="ollama" so the call lands in
+    /// /v1/usage + Langfuse. Translates between the sidecar
+    /// GenerateRequest/Response shape and the OpenAI-compat
+    /// chat shape on the wire.
+    async fn generate_via_gateway(&self, gateway_url: &str, req: GenerateRequest) -> Result<GenerateResponse, String> {
+        let mut messages = Vec::with_capacity(2);
+        if let Some(sys) = &req.system {
+            messages.push(serde_json::json!({"role": "system", "content": sys}));
+        }
+        messages.push(serde_json::json!({"role": "user", "content": req.prompt}));
+        let mut body = serde_json::json!({
+            "messages": messages,
+            "provider": "ollama",
+        });
+        if let Some(m) = &req.model { body["model"] = serde_json::json!(m); }
+        if let Some(t) = req.temperature { body["temperature"] = serde_json::json!(t); }
+        if let Some(mt) = req.max_tokens { body["max_tokens"] = serde_json::json!(mt); }
+        if let Some(th) = req.think { body["think"] = serde_json::json!(th); }
+
+        let resp = self.client
+            .post(format!("{}/v1/chat", gateway_url))
+            .json(&body)
+            .send()
+            .await
+            .map_err(|e| format!("/v1/chat request failed: {e}"))?;
+        if !resp.status().is_success() {
+            let text = resp.text().await.unwrap_or_default();
+            return Err(format!("/v1/chat error: {text}"));
+        }
+        let parsed: serde_json::Value = resp.json().await
+            .map_err(|e| format!("/v1/chat parse error: {e}"))?;
+
+        let text = parsed
+            .pointer("/choices/0/message/content")
+            .and_then(|v| v.as_str())
+            .unwrap_or("")
+            .to_string();
+        let model = parsed.get("model")
+            .and_then(|v| v.as_str())
+            .unwrap_or_else(|| req.model.as_deref().unwrap_or(""))
+            .to_string();
+        let prompt_tokens = parsed.pointer("/usage/prompt_tokens").and_then(|v| v.as_u64());
+        let completion_tokens = parsed.pointer("/usage/completion_tokens").and_then(|v| v.as_u64());
+
+        Ok(GenerateResponse {
+            text,
+            model,
+            tokens_evaluated: prompt_tokens,
+            tokens_generated: completion_tokens,
+        })
+    }
+
     pub async fn rerank(&self, req: RerankRequest) -> Result<RerankResponse, String> {
         let resp = self.client
             .post(format!("{}/rerank", self.base_url))
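A minimal caller-side sketch of the two construction modes. The import path and ports are assumptions (the diff only shows `client.rs`, not how `AiClient` is re-exported); the split of responsibilities follows the doc comment above:

```rust
// Sketch only: module path and URLs are assumed, not taken from this diff.
use aibridge::client::AiClient;

fn main() {
    // Gateway internals: direct sidecar transport, no /v1/usage entry,
    // no extra localhost hop through /v1/chat.
    let direct = AiClient::new("http://127.0.0.1:8089");

    // vectord-style callers: generate() round-trips through the gateway
    // for /v1/usage + Langfuse; embed()/rerank() still hit the sidecar.
    let observed = AiClient::new_with_gateway(
        "http://127.0.0.1:8089", // sidecar base_url (assumed port)
        "http://127.0.0.1:8080", // gateway url (assumed port)
    );
    let _ = (direct, observed);
}
```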
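And a self-contained sketch of the request/response translation `generate_via_gateway` performs, runnable with only `serde_json`; all field values are invented for illustration:

```rust
use serde_json::json;

fn main() {
    // What goes on the wire for system="be terse", prompt="hi",
    // model=Some("llama3.2") (values invented for illustration):
    let body = json!({
        "messages": [
            {"role": "system", "content": "be terse"},
            {"role": "user", "content": "hi"},
        ],
        "provider": "ollama",
        "model": "llama3.2",
    });

    // And the fields read back out of a /v1/chat-style reply:
    let reply = json!({
        "model": "llama3.2",
        "choices": [{"message": {"content": "hello"}}],
        "usage": {"prompt_tokens": 12, "completion_tokens": 3},
    });
    let text = reply
        .pointer("/choices/0/message/content")
        .and_then(|v| v.as_str())
        .unwrap_or("");
    let prompt_tokens = reply.pointer("/usage/prompt_tokens").and_then(|v| v.as_u64());

    assert_eq!(text, "hello");
    assert_eq!(prompt_tokens, Some(12));
    println!("request body: {body}");
}
```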