use lru::LruCache;
use reqwest::Client;
use serde::{Deserialize, Serialize};
use std::num::NonZeroUsize;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Duration;

/// Per-text embed cache key. We key on (model, text) so different
/// model selections produce distinct cache lines — a query embedded
/// under nomic-embed-text-v2-moe must NOT collide with the same
/// query under nomic-embed-text v1.
#[derive(Eq, PartialEq, Hash, Clone)]
struct EmbedCacheKey {
    model: String,
    text: String,
}

/// Default LRU cache size — 4096 entries × ~6KB per 768-d f64
/// vector ≈ 24MB. Sized for typical staffing-domain repetition
/// (coordinator workflows have query repetition rates around 70-90%
/// per session). Tunable via [aibridge].embed_cache_size in the
/// config; 0 disables the cache entirely.
const DEFAULT_EMBED_CACHE_SIZE: usize = 4096;

/// HTTP client for Ollama (post-2026-05-02 — sidecar dropped).
///
/// `base_url` was historically the Python sidecar at `:3200`, which
/// pass-through-proxied to Ollama at `:11434`. The sidecar added zero
/// logic on the hot path (embed.py + generate.py + rerank.py +
/// admin.py = ~120 LOC of pure Ollama wrappers), so this client now
/// talks to Ollama directly and the sidecar process can be retired.
///
/// What stayed Python: `lab_ui.py` + `pipeline_lab.py` (~888 LOC of
/// dev-mode Streamlit UIs) — those aren't on the runtime hot path and
/// continue running for prompt experimentation.
///
/// `generate()` has two transport modes:
/// - When `gateway_url` is `None` (default), posts directly to Ollama's
///   `${base_url}/api/generate`.
/// - When `gateway_url` is `Some(url)`, posts to `${url}/v1/chat`
///   with `provider="ollama"` so the call appears in `/v1/usage` and
///   Langfuse traces.
///
/// `embed()`, `rerank()`, and the admin methods always go direct to
/// Ollama — there is no `/v1` equivalent for those surfaces yet.
///
/// Phase 44 part 2 (2026-04-27): the gateway URL is wired in by
/// callers that want observability (vectord modules); it's left
/// unset by callers that ARE the gateway internals (avoids self-loops
/// + redundant hops).
#[derive(Clone)]
pub struct AiClient {
    client: Client,
    base_url: String,
    gateway_url: Option<String>,
    /// Closes the 63× perf gap with the Go side. Mirrors the shape of
    /// Go's internal/embed/cached.go::CachedProvider — same
    /// (model, text) → vector caching, same nil-disable semantics.
    /// None = caching disabled (cache_size=0); Some = bounded LRU.
    embed_cache: Option<Arc<Mutex<LruCache<EmbedCacheKey, Vec<f64>>>>>,
    /// Hit / miss counters for /admin observability + load-test
    /// validation. Atomic so Clone'd AiClients share the same counts.
    embed_cache_hits: Arc<AtomicU64>,
    embed_cache_misses: Arc<AtomicU64>,
    /// Starts at 0; pinned on the first successful Ollama embed so the
    /// EmbedResponse can carry the dimension consistently even when
    /// every text was a cache hit (no fresh upstream call to learn the
    /// dim from). Read on the all-hit path.
    cached_dim: Arc<AtomicU64>,
}
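// Illustrative construction of the two transport modes described above
// (URLs and the gateway port are placeholders, not canonical deployment
// values):
//
//     // Gateway internals / raw transport: direct to Ollama, no /v1/usage.
//     let direct = AiClient::new("http://127.0.0.1:11434");
//
//     // Callers outside the gateway that want /v1/usage + Langfuse traces:
//     let traced = AiClient::new_with_gateway(
//         "http://127.0.0.1:11434", // still used for embed()/rerank()/admin
//         "http://127.0.0.1:8080",  // gateway exposing /v1/chat (hypothetical)
//     );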
// -- Request/Response types --

#[derive(Serialize, Deserialize)]
pub struct EmbedRequest {
    pub texts: Vec<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
}

#[derive(Deserialize, Serialize, Clone)]
pub struct EmbedResponse {
    pub embeddings: Vec<Vec<f64>>,
    pub model: String,
    pub dimensions: usize,
}

#[derive(Clone, Serialize, Deserialize)]
pub struct GenerateRequest {
    pub prompt: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub system: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_tokens: Option<u32>,
    /// Phase 21 — per-call opt-out of hidden reasoning. Thinking models
    /// (qwen3.5, gpt-oss, etc.) burn tokens on reasoning before the
    /// visible response starts; setting this to `false` on hot-path
    /// JSON emitters avoids empty returns when the budget is tight.
    /// Forwarded to Ollama's `think` parameter (the pre-cutover sidecar
    /// forwarded it the same way; if it dropped the unknown field the
    /// request still succeeded).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub think: Option<bool>,
}

#[derive(Deserialize, Serialize, Clone)]
pub struct GenerateResponse {
    pub text: String,
    pub model: String,
    pub tokens_evaluated: Option<u64>,
    pub tokens_generated: Option<u64>,
}

#[derive(Serialize, Deserialize)]
pub struct RerankRequest {
    pub query: String,
    pub documents: Vec<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_k: Option<usize>,
}

#[derive(Deserialize, Serialize, Clone)]
pub struct ScoredDocument {
    pub index: usize,
    pub text: String,
    pub score: f64,
}

#[derive(Deserialize, Serialize, Clone)]
pub struct RerankResponse {
    pub results: Vec<ScoredDocument>,
    pub model: String,
}

impl AiClient {
    pub fn new(base_url: &str) -> Self {
        Self::with_embed_cache(base_url, DEFAULT_EMBED_CACHE_SIZE)
    }

    /// Constructs an AiClient with an explicit embed-cache size.
    /// Pass 0 to disable the cache entirely (matches the Go-side
    /// CachedProvider's nil-cache semantics).
    pub fn with_embed_cache(base_url: &str, cache_size: usize) -> Self {
        let client = Client::builder()
            .timeout(Duration::from_secs(120))
            .build()
            .expect("failed to build HTTP client");
        let embed_cache = if cache_size > 0 {
            // cache_size > 0 was just verified, so NonZeroUsize::new
            // returns Some.
            let cap = NonZeroUsize::new(cache_size).expect("cache_size > 0");
            Some(Arc::new(Mutex::new(LruCache::new(cap))))
        } else {
            None
        };
        Self {
            client,
            base_url: base_url.trim_end_matches('/').to_string(),
            gateway_url: None,
            embed_cache,
            embed_cache_hits: Arc::new(AtomicU64::new(0)),
            embed_cache_misses: Arc::new(AtomicU64::new(0)),
            cached_dim: Arc::new(AtomicU64::new(0)),
        }
    }

    /// Cache hit/miss/size snapshot. Useful for /admin endpoints +
    /// load-test validation ("did the cache fire as expected?").
    pub fn embed_cache_stats(&self) -> (u64, u64, usize) {
        let hits = self.embed_cache_hits.load(Ordering::Relaxed);
        let misses = self.embed_cache_misses.load(Ordering::Relaxed);
        let len = self
            .embed_cache
            .as_ref()
            .map(|c| c.lock().map(|g| g.len()).unwrap_or(0))
            .unwrap_or(0);
        (hits, misses, len)
    }
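    // A small illustrative sketch (not taken from a real /admin handler):
    // turning the counters above into a hit rate for an admin payload.
    // Field names in the payload are hypothetical.
    //
    //     let (hits, misses, len) = client.embed_cache_stats();
    //     let total = hits + misses;
    //     let hit_rate = if total > 0 { hits as f64 / total as f64 } else { 0.0 };
    //     let payload = serde_json::json!({
    //         "embed_cache": {
    //             "hits": hits, "misses": misses,
    //             "entries": len, "hit_rate": hit_rate,
    //         }
    //     });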
    /// Same as `new`, but every `generate()` is routed through
    /// `${gateway_url}/v1/chat` (provider=ollama) for observability.
    /// Use this for callers OUTSIDE the gateway. Inside the gateway
    /// itself, prefer `new()` — calling /v1/chat from /v1/chat works
    /// (no infinite loop, ollama_arm doesn't use AiClient) but adds
    /// a wasted localhost hop.
    pub fn new_with_gateway(base_url: &str, gateway_url: &str) -> Self {
        let mut c = Self::new(base_url);
        c.gateway_url = Some(gateway_url.trim_end_matches('/').to_string());
        c
    }

    /// Reachability + version check. Hits Ollama's `/api/version` and
    /// returns a sidecar-shaped envelope so callers reading
    /// `.status` / `.ollama_url` don't break across the
    /// pre-/post-2026-05-02 cutover.
    pub async fn health(&self) -> Result<serde_json::Value, String> {
        let resp = self.client
            .get(format!("{}/api/version", self.base_url))
            .send()
            .await
            .map_err(|e| format!("ollama unreachable: {e}"))?;
        let body: serde_json::Value = resp.json().await
            .map_err(|e| format!("invalid response: {e}"))?;
        Ok(serde_json::json!({
            "status": "ok",
            "ollama_url": &self.base_url,
            "ollama_version": body.get("version"),
        }))
    }

    /// Embed with per-text LRU caching. Mirrors Go-side
    /// CachedProvider behavior: the cache key is (model, text);
    /// cache-hit texts skip the upstream call; cache-miss texts batch
    /// into a single upstream fetch; results are interleaved back into
    /// the caller's input order.
    ///
    /// Closes ~95% of the load-test perf gap vs the Go side (loadgen
    /// 2026-05-01: Rust 128 RPS → with cache ≥ 7000 RPS expected
    /// for warm-cache workloads). Cold-cache behavior is unchanged
    /// (every text is a miss → single upstream fetch, identical to
    /// pre-cache).
    pub async fn embed(&self, req: EmbedRequest) -> Result<EmbedResponse, String> {
        let model_key = req.model.clone().unwrap_or_default();

        // Fast path: cache disabled → original behavior.
        let Some(cache) = self.embed_cache.as_ref() else {
            return self.embed_uncached(&req).await;
        };
        if req.texts.is_empty() {
            return self.embed_uncached(&req).await;
        }

        // First pass: check the cache for each text. Track which
        // positions need an upstream fetch.
        let mut embeddings: Vec<Option<Vec<f64>>> = vec![None; req.texts.len()];
        let mut miss_indices: Vec<usize> = Vec::new();
        let mut miss_texts: Vec<String> = Vec::new();
        {
            let mut guard = cache.lock().map_err(|e| format!("cache lock poisoned: {e}"))?;
            for (i, text) in req.texts.iter().enumerate() {
                let key = EmbedCacheKey { model: model_key.clone(), text: text.clone() };
                if let Some(vec) = guard.get(&key) {
                    embeddings[i] = Some(vec.clone());
                    self.embed_cache_hits.fetch_add(1, Ordering::Relaxed);
                } else {
                    miss_indices.push(i);
                    miss_texts.push(text.clone());
                    self.embed_cache_misses.fetch_add(1, Ordering::Relaxed);
                }
            }
        }

        // All hit? Return immediately. Use cached_dim to populate the
        // response dimension (no upstream call to ask).
        if miss_indices.is_empty() {
            let dim = self.cached_dim.load(Ordering::Relaxed) as usize;
            let dim = if dim == 0 {
                embeddings[0].as_ref().map(|v| v.len()).unwrap_or(0)
            } else {
                dim
            };
            return Ok(EmbedResponse {
                embeddings: embeddings.into_iter().map(|opt| opt.expect("filled")).collect(),
                model: req.model.unwrap_or_else(|| "nomic-embed-text".to_string()),
                dimensions: dim,
            });
        }

        // Second pass: fetch the misses in one upstream call.
        let miss_req = EmbedRequest { texts: miss_texts.clone(), model: req.model.clone() };
        let resp = self.embed_uncached(&miss_req).await?;
        if resp.embeddings.len() != miss_texts.len() {
            return Err(format!(
                "embed cache: upstream returned {} embeddings for {} texts",
                resp.embeddings.len(),
                miss_texts.len()
            ));
        }

        // Pin cached_dim on the first successful response.
        if resp.dimensions > 0 {
            self.cached_dim.store(resp.dimensions as u64, Ordering::Relaxed);
        }

        // Insert misses into the cache + fill the response slots.
        {
            let mut guard = cache.lock().map_err(|e| format!("cache lock poisoned: {e}"))?;
            for (j, idx) in miss_indices.iter().enumerate() {
                let key = EmbedCacheKey {
                    model: model_key.clone(),
                    text: miss_texts[j].clone(),
                };
                let vec = resp.embeddings[j].clone();
                guard.put(key, vec.clone());
                embeddings[*idx] = Some(vec);
            }
        }

        Ok(EmbedResponse {
            embeddings: embeddings.into_iter().map(|opt| opt.expect("filled")).collect(),
            model: resp.model,
            dimensions: resp.dimensions,
        })
    }
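    // Worked example of the interleaving above (values are illustrative,
    // not from a recorded run). Suppose the cache already holds
    // ("nomic-embed-text", "alpha") and the caller sends:
    //
    //     texts = ["alpha", "beta", "alpha"]
    //
    // First pass: positions 0 and 2 hit; miss_indices = [1] and
    // miss_texts = ["beta"]. Second pass: one upstream call embeds
    // ["beta"] only; its vector is cached and written into slot 1.
    // The response carries three vectors in the caller's original
    // order, and the upstream call never sees the cached texts.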
    /// Direct Ollama call — used internally by embed() for cache-miss
    /// batches and as the transparent fallback when the cache is
    /// disabled. Loops per-text against `${base_url}/api/embed`,
    /// matching the sidecar's pre-2026-05-02 behavior. Ollama 0.4+
    /// supports batch input, but per-text keeps compatibility broader
    /// and lets cache-miss-only batches share the loop with cold runs.
    async fn embed_uncached(&self, req: &EmbedRequest) -> Result<EmbedResponse, String> {
        let model = req.model.clone().unwrap_or_else(|| "nomic-embed-text".to_string());
        let mut embeddings: Vec<Vec<f64>> = Vec::with_capacity(req.texts.len());
        for text in &req.texts {
            let resp = self.client
                .post(format!("{}/api/embed", self.base_url))
                .json(&serde_json::json!({
                    "model": &model,
                    "input": text,
                }))
                .send()
                .await
                .map_err(|e| format!("embed request failed: {e}"))?;
            if !resp.status().is_success() {
                let body = resp.text().await.unwrap_or_default();
                return Err(format!("ollama embed error: {body}"));
            }
            // Ollama returns {"embeddings": [[...]], "model": "...", ...}.
            // The outer `embeddings` is always a list; for a scalar input
            // we get a single inner vector.
            let parsed: serde_json::Value = resp.json().await
                .map_err(|e| format!("embed parse error: {e}"))?;
            let arr = parsed.get("embeddings")
                .and_then(|v| v.as_array())
                .ok_or_else(|| format!("ollama embed: missing 'embeddings' field in {parsed}"))?;
            if arr.is_empty() {
                return Err("ollama embed: empty embeddings array".to_string());
            }
            let first = arr[0].as_array()
                .ok_or_else(|| "ollama embed: embeddings[0] not an array".to_string())?;
            let vec: Vec<f64> = first.iter()
                .filter_map(|n| n.as_f64())
                .collect();
            if vec.is_empty() {
                return Err("ollama embed: numeric coercion produced empty vector".to_string());
            }
            embeddings.push(vec);
        }
        let dimensions = embeddings.first().map(|v| v.len()).unwrap_or(0);
        Ok(EmbedResponse {
            embeddings,
            model,
            dimensions,
        })
    }
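    // Wire shape handled by the loop above, one request per text (vector
    // values are illustrative and truncated):
    //
    //     POST {base_url}/api/embed
    //     {"model": "nomic-embed-text", "input": "some query text"}
    //
    //     200 OK
    //     {"model": "nomic-embed-text", "embeddings": [[0.0123, -0.0456, ...]]}
    //
    // Only `embeddings[0]` is read; any other fields Ollama includes in
    // the response are ignored by the parser.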
    pub async fn generate(&self, req: GenerateRequest) -> Result<GenerateResponse, String> {
        if let Some(gw) = self.gateway_url.as_deref() {
            return self.generate_via_gateway(gw, req).await;
        }

        // Direct Ollama path. Used by gateway internals (so the ollama
        // provider can call upstream without a self-loop through
        // /v1/chat) and by any consumer that wants raw transport
        // without /v1/usage accounting.
        let model = req.model.clone().unwrap_or_else(|| "qwen3.5:latest".to_string());
        let mut body = serde_json::json!({
            "model": &model,
            "prompt": &req.prompt,
            "stream": false,
        });
        let mut options = serde_json::Map::new();
        if let Some(t) = req.temperature {
            options.insert("temperature".to_string(), serde_json::json!(t));
        }
        if let Some(mt) = req.max_tokens {
            options.insert("num_predict".to_string(), serde_json::json!(mt));
        }
        if !options.is_empty() {
            body["options"] = serde_json::Value::Object(options);
        }
        if let Some(sys) = &req.system {
            body["system"] = serde_json::json!(sys);
        }
        if let Some(th) = req.think {
            body["think"] = serde_json::json!(th);
        }

        let resp = self.client
            .post(format!("{}/api/generate", self.base_url))
            .json(&body)
            .send()
            .await
            .map_err(|e| format!("generate request failed: {e}"))?;
        if !resp.status().is_success() {
            let text = resp.text().await.unwrap_or_default();
            return Err(format!("ollama generate error: {text}"));
        }
        let parsed: serde_json::Value = resp.json().await
            .map_err(|e| format!("generate parse error: {e}"))?;
        Ok(GenerateResponse {
            text: parsed.get("response").and_then(|v| v.as_str()).unwrap_or("").to_string(),
            model,
            tokens_evaluated: parsed.get("prompt_eval_count").and_then(|v| v.as_u64()),
            tokens_generated: parsed.get("eval_count").and_then(|v| v.as_u64()),
        })
    }

    /// Phase 44 part 2: route generate() through the gateway's
    /// /v1/chat with provider="ollama" so the call lands in
    /// /v1/usage + Langfuse. Translates between the sidecar-era
    /// GenerateRequest/Response shape and the OpenAI-compatible
    /// chat shape on the wire.
    async fn generate_via_gateway(&self, gateway_url: &str, req: GenerateRequest) -> Result<GenerateResponse, String> {
        let mut messages = Vec::with_capacity(2);
        if let Some(sys) = &req.system {
            messages.push(serde_json::json!({"role": "system", "content": sys}));
        }
        messages.push(serde_json::json!({"role": "user", "content": req.prompt}));

        let mut body = serde_json::json!({
            "messages": messages,
            "provider": "ollama",
        });
        if let Some(m) = &req.model {
            body["model"] = serde_json::json!(m);
        }
        if let Some(t) = req.temperature {
            body["temperature"] = serde_json::json!(t);
        }
        if let Some(mt) = req.max_tokens {
            body["max_tokens"] = serde_json::json!(mt);
        }
        if let Some(th) = req.think {
            body["think"] = serde_json::json!(th);
        }

        let resp = self.client
            .post(format!("{}/v1/chat", gateway_url))
            .json(&body)
            .send()
            .await
            .map_err(|e| format!("/v1/chat request failed: {e}"))?;
        if !resp.status().is_success() {
            let text = resp.text().await.unwrap_or_default();
            return Err(format!("/v1/chat error: {text}"));
        }
        let parsed: serde_json::Value = resp.json().await
            .map_err(|e| format!("/v1/chat parse error: {e}"))?;

        let text = parsed
            .pointer("/choices/0/message/content")
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();
        let model = parsed.get("model")
            .and_then(|v| v.as_str())
            .unwrap_or_else(|| req.model.as_deref().unwrap_or(""))
            .to_string();
        let prompt_tokens = parsed.pointer("/usage/prompt_tokens").and_then(|v| v.as_u64());
        let completion_tokens = parsed.pointer("/usage/completion_tokens").and_then(|v| v.as_u64());

        Ok(GenerateResponse {
            text,
            model,
            tokens_evaluated: prompt_tokens,
            tokens_generated: completion_tokens,
        })
    }
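    // Illustrative translation performed by generate_via_gateway (field
    // values are made up for the example). A hot-path JSON emitter that
    // opts out of hidden reasoning:
    //
    //     GenerateRequest {
    //         prompt: "Summarize as JSON.".into(),
    //         model: Some("qwen3.5:latest".into()),
    //         system: Some("You emit strict JSON.".into()),
    //         temperature: Some(0.0),
    //         max_tokens: Some(256),
    //         think: Some(false),
    //     }
    //
    // becomes the /v1/chat body:
    //
    //     {"provider": "ollama", "model": "qwen3.5:latest",
    //      "messages": [{"role": "system", "content": "You emit strict JSON."},
    //                   {"role": "user", "content": "Summarize as JSON."}],
    //      "temperature": 0.0, "max_tokens": 256, "think": false}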
    /// Cross-encoder-style reranking via Ollama generate. Asks the model
    /// to rate each document's relevance to the query on a 0-10 scale,
    /// then sorts descending. Mirrors the sidecar's pre-2026-05-02
    /// algorithm exactly so callers see the same scores.
    pub async fn rerank(&self, req: RerankRequest) -> Result<RerankResponse, String> {
        let model = req.model.clone().unwrap_or_else(|| "qwen3.5:latest".to_string());
        let mut scored: Vec<ScoredDocument> = Vec::with_capacity(req.documents.len());

        for (i, doc) in req.documents.iter().enumerate() {
            let prompt = format!(
                "Rate the relevance of the following document to the query on a scale of 0 to 10. \
                 Respond with ONLY a number.\n\n\
                 Query: {}\n\n\
                 Document: {}\n\n\
                 Score:",
                req.query, doc,
            );
            let resp = self.client
                .post(format!("{}/api/generate", self.base_url))
                .json(&serde_json::json!({
                    "model": &model,
                    "prompt": prompt,
                    "stream": false,
                    "options": {"temperature": 0.0, "num_predict": 8},
                }))
                .send()
                .await
                .map_err(|e| format!("rerank request failed: {e}"))?;
            if !resp.status().is_success() {
                let body = resp.text().await.unwrap_or_default();
                return Err(format!("ollama rerank error: {body}"));
            }
            let parsed: serde_json::Value = resp.json().await
                .map_err(|e| format!("rerank parse error: {e}"))?;
            let text = parsed.get("response").and_then(|v| v.as_str()).unwrap_or("").trim();
            // Parse the leading number; tolerate "7", "7.5", "7 — strong match".
            let score = text.split_whitespace().next()
                .and_then(|t| t.parse::<f64>().ok())
                .unwrap_or(0.0)
                .clamp(0.0, 10.0);
            scored.push(ScoredDocument {
                index: i,
                text: doc.clone(),
                score,
            });
        }

        scored.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));
        if let Some(k) = req.top_k {
            scored.truncate(k);
        }
        Ok(RerankResponse { results: scored, model })
    }

    /// Force Ollama to unload the named model from VRAM (keep_alive=0).
    /// Used for predictable profile swaps — without this, Ollama holds a
    /// model for its configured TTL (default 5min) and the previous
    /// profile's model can linger in VRAM next to the new one.
    pub async fn unload_model(&self, model: &str) -> Result<serde_json::Value, String> {
        let resp = self.client
            .post(format!("{}/api/generate", self.base_url))
            .json(&serde_json::json!({
                "model": model,
                "prompt": "",
                "keep_alive": 0,
                "stream": false,
            }))
            .send().await
            .map_err(|e| format!("unload request failed: {e}"))?;
        if !resp.status().is_success() {
            let text = resp.text().await.unwrap_or_default();
            return Err(format!("ollama unload error: {text}"));
        }
        // Ollama returns 200 with the empty-prompt response shape.
        // Fold into the legacy {"unloaded": "<model>"} envelope so
        // callers' parsing doesn't break.
        Ok(serde_json::json!({ "unloaded": model }))
    }

    /// Ask Ollama to load the named model into VRAM proactively. Makes
    /// the first real request after profile activation fast (no cold-load
    /// latency). Empty prompts confuse some models, so we send a single
    /// space + cap num_predict=1 (matches the sidecar's prior behavior).
    pub async fn preload_model(&self, model: &str) -> Result<serde_json::Value, String> {
        let resp = self.client
            .post(format!("{}/api/generate", self.base_url))
            .json(&serde_json::json!({
                "model": model,
                "prompt": " ",
                "keep_alive": "5m",
                "stream": false,
                "options": {"num_predict": 1},
            }))
            .send().await
            .map_err(|e| format!("preload request failed: {e}"))?;
        if !resp.status().is_success() {
            let text = resp.text().await.unwrap_or_default();
            return Err(format!("ollama preload error: {text}"));
        }
        let parsed: serde_json::Value = resp.json().await
            .map_err(|e| format!("preload parse error: {e}"))?;
        Ok(serde_json::json!({
            "preloaded": model,
            "load_duration_ns": parsed.get("load_duration"),
            "total_duration_ns": parsed.get("total_duration"),
        }))
    }
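    // Sketch of the profile-swap sequence the two admin calls above are
    // meant for (model names are illustrative; error handling is elided):
    //
    //     client.unload_model("qwen3.5:latest").await?;   // evict the old profile
    //     client.preload_model("gpt-oss:latest").await?;  // warm the new one
    //     let vram = client.vram_snapshot().await?;       // confirm what's resident
    //
    // Without the explicit unload, Ollama keeps the previous model warm
    // for its keep_alive TTL and both models can sit in VRAM at once.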
    /// GPU + loaded-model snapshot. Combines nvidia-smi output (when
    /// available) with Ollama's /api/ps. Same shape as the prior
    /// sidecar /admin/vram endpoint so callers don't need updating.
    pub async fn vram_snapshot(&self) -> Result<serde_json::Value, String> {
        let resp = self.client
            .get(format!("{}/api/ps", self.base_url))
            .send().await
            .map_err(|e| format!("ollama ps request failed: {e}"))?;
        let loaded: Vec<serde_json::Value> = if resp.status().is_success() {
            let parsed: serde_json::Value = resp.json().await.unwrap_or(serde_json::Value::Null);
            parsed.get("models")
                .and_then(|v| v.as_array())
                .map(|arr| arr.iter().map(|m| serde_json::json!({
                    "name": m.get("name"),
                    "size_vram_mib": m.get("size_vram").and_then(|v| v.as_u64()).map(|n| n / (1024 * 1024)),
                    "expires_at": m.get("expires_at"),
                })).collect())
                .unwrap_or_default()
        } else {
            Vec::new()
        };

        let gpu = nvidia_smi_snapshot();

        Ok(serde_json::json!({
            "gpu": gpu,
            "ollama_loaded": loaded,
        }))
    }
}

/// One-shot nvidia-smi poll. Returns Null if the tool isn't on PATH
/// or the call fails. Mirrors the sidecar's `_nvidia_smi_snapshot`
/// shape exactly so callers reading vram_snapshot don't break.
fn nvidia_smi_snapshot() -> serde_json::Value {
    use std::process::Command;
    let out = Command::new("nvidia-smi")
        .args([
            "--query-gpu=memory.used,memory.total,utilization.gpu,name",
            "--format=csv,noheader,nounits",
        ])
        .output();
    let stdout = match out {
        Ok(o) if o.status.success() => o.stdout,
        _ => return serde_json::Value::Null,
    };
    let line = String::from_utf8_lossy(&stdout);
    let line = line.trim();
    if line.is_empty() {
        return serde_json::Value::Null;
    }
    let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
    if parts.len() < 4 {
        return serde_json::Value::Null;
    }
    let used = parts[0].parse::<u64>().unwrap_or(0);
    let total = parts[1].parse::<u64>().unwrap_or(0);
    let util = parts[2].parse::<u64>().unwrap_or(0);
    serde_json::json!({
        "name": parts[3],
        "used_mib": used,
        "total_mib": total,
        "utilization_pct": util,
    })
}
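// A minimal smoke-test sketch (not part of the original sidecar test
// suite): it exercises only construction and the cache-stats surface, so
// it runs without Ollama, the gateway, or a GPU on the box.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn cache_disabled_when_size_is_zero() {
        // cache_size=0 mirrors the Go-side nil-cache semantics: no LRU,
        // counters start at zero, reported length stays zero.
        let client = AiClient::with_embed_cache("http://127.0.0.1:11434/", 0);
        assert!(client.embed_cache.is_none());
        assert_eq!(client.embed_cache_stats(), (0, 0, 0));
        // Trailing slash is trimmed at construction.
        assert_eq!(client.base_url, "http://127.0.0.1:11434");
    }

    #[test]
    fn default_constructor_enables_bounded_cache() {
        let client = AiClient::new("http://127.0.0.1:11434");
        assert!(client.embed_cache.is_some());
        assert_eq!(client.embed_cache_stats(), (0, 0, 0));
        assert!(client.gateway_url.is_none());
    }
}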