//! Phase 38 — Ollama shape adapter. //! //! Translates `/v1/chat` (OpenAI-compatible) requests into the //! existing aibridge `GenerateRequest` shape, and the `GenerateResponse` //! back into an OpenAI-compatible `ChatResponse`. This is a bridge, //! not a new client — aibridge + the Python sidecar stay as-is. //! //! Phase 39 replaces this direct call with a `ProviderAdapter` trait //! dispatch so the same `/v1/chat` handler routes to any provider. use aibridge::client::{AiClient, GenerateRequest}; use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock}; pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result { let (system, prompt) = flatten_messages(&req.messages); let gen_req = GenerateRequest { prompt, model: Some(req.model.clone()), system: if system.is_empty() { None } else { Some(system) }, temperature: req.temperature, max_tokens: req.max_tokens, // Phase 38 default: think=false. Hot-path discipline for // thinking models (qwen3.5, qwen3, gpt-oss) which otherwise // burn the max_tokens budget on hidden reasoning before any // visible output, producing empty responses. Callers that // actually want reasoning (overseers, T3+ tiers) opt in via // the `think: true` extension field. Phase 40 routing engine // flips this per task class. think: Some(req.think.unwrap_or(false)), }; let t0 = std::time::Instant::now(); let resp = client.generate(gen_req).await.map_err(|e| e.to_string())?; let latency_ms = t0.elapsed().as_millis(); // Prefer sidecar-reported token counts when present. Fall back to // chars/4 estimate (biased safe ~15%, matches Phase 21 convention // in crates/aibridge/src/context.rs::estimate_tokens). let prompt_tokens = resp.tokens_evaluated .map(|n| n as u32) .unwrap_or_else(|| estimate_prompt_tokens(&req.messages)); let completion_tokens = resp.tokens_generated .map(|n| n as u32) .unwrap_or_else(|| ((resp.text.chars().count() + 3) / 4) as u32); tracing::info!( target: "v1.chat", model = %req.model, prompt_tokens, completion_tokens, latency_ms = latency_ms as u64, "ollama chat completed", ); Ok(ChatResponse { id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)), object: "chat.completion", created: chrono::Utc::now().timestamp(), model: resp.model, choices: vec![Choice { index: 0, message: Message::new_text("assistant", resp.text), finish_reason: "stop".into(), }], usage: UsageBlock { prompt_tokens, completion_tokens, total_tokens: prompt_tokens + completion_tokens, }, }) } /// Public re-export of the flattener so sibling adapters (Ollama Cloud, /// future OpenRouter) can reuse the same shape collapse without /// duplicating the logic. Keeps `(system, prompt)` format consistent /// across providers. pub fn flatten_messages_public(messages: &[Message]) -> (String, String) { flatten_messages(messages) } /// Collapse a message array into (system, prompt). Multiple system /// messages concatenate with a newline — matches OpenAI's documented /// behavior. Non-system messages become role-labeled turns. fn flatten_messages(messages: &[Message]) -> (String, String) { let mut system = String::new(); let mut prompt = String::new(); for m in messages { let body = m.text(); if m.role == "system" { if !system.is_empty() { system.push('\n'); } system.push_str(&body); } else { prompt.push_str(&m.role); prompt.push_str(": "); prompt.push_str(&body); prompt.push_str("\n\n"); } } prompt.push_str("assistant:"); (system, prompt) } fn estimate_prompt_tokens(messages: &[Message]) -> u32 { let chars: usize = messages.iter().map(|m| m.text().chars().count()).sum(); ((chars + 3) / 4) as u32 } #[cfg(test)] mod tests { use super::*; #[test] fn flatten_separates_system_from_turns() { let msgs = vec![ Message { role: "system".into(), content: "Rules here.".into() }, Message { role: "user".into(), content: "Q1".into() }, Message { role: "assistant".into(), content: "A1".into() }, Message { role: "user".into(), content: "Q2".into() }, ]; let (system, prompt) = flatten_messages(&msgs); assert_eq!(system, "Rules here."); assert!(prompt.contains("user: Q1")); assert!(prompt.contains("assistant: A1")); assert!(prompt.contains("user: Q2")); assert!(prompt.trim_end().ends_with("assistant:")); } #[test] fn flatten_concatenates_multiple_system_messages() { let msgs = vec![ Message { role: "system".into(), content: "First.".into() }, Message { role: "system".into(), content: "Second.".into() }, Message { role: "user".into(), content: "Hi".into() }, ]; let (system, _) = flatten_messages(&msgs); assert_eq!(system, "First.\nSecond."); } #[test] fn flatten_with_no_system_returns_empty_system() { let msgs = vec![Message { role: "user".into(), content: "hi".into() }]; let (system, prompt) = flatten_messages(&msgs); assert!(system.is_empty()); assert!(prompt.contains("user: hi")); } #[test] fn estimate_tokens_chars_div_4_ceiling() { let msgs = vec![Message { role: "user".into(), content: "abcdefgh".into() }]; // 8 chars / 4 = 2, with ceiling → 2 assert_eq!(estimate_prompt_tokens(&msgs), 2); let msgs2 = vec![Message { role: "user".into(), content: "abcdefghi".into() }]; // 9 chars → (9+3)/4 = 3 (ceiling) assert_eq!(estimate_prompt_tokens(&msgs2), 3); } }