root 540a9a27ee
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
v1: accept OpenAI multimodal content shape (array-of-parts)
Modern OpenAI clients (pi-ai, openai SDK 6.x, langchain-js, the official
agents) send `messages[].content` as an array of content parts:
`[{type:"text", text:"..."}, {type:"image_url", ...}]`. Our gateway
typed `content` as plain `String` and 422'd those calls.

Fix: `Message.content` is now `serde_json::Value` so requests
deserialize regardless of shape. `Message::text()` flattens
content-parts arrays (concat'd `text` fields, non-text parts skipped)
for places that need a plain string — Ollama prompt assembly, char
counts, the assistant's own response synthesis. `Message::new_text()`
constructs string-content messages without writing the wrapper at
each call site. Forwarders (openrouter) clone content through
verbatim so providers see exactly what the client sent.

Verified end-to-end: Pi CLI (`pi --print --provider openrouter`)
landed a clean 1902-token request through `/v1/chat/completions`,
routed to OpenRouter as `openai/gpt-oss-120b:free`, response in
1.62s, Langfuse trace `v1.chat:openrouter` recorded with provider
tag. Same path that any tool using the official openai SDK takes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 17:56:46 -05:00

159 lines
5.9 KiB
Rust

//! Phase 38 — Ollama shape adapter.
//!
//! Translates `/v1/chat` (OpenAI-compatible) requests into the
//! existing aibridge `GenerateRequest` shape, and the `GenerateResponse`
//! back into an OpenAI-compatible `ChatResponse`. This is a bridge,
//! not a new client — aibridge + the Python sidecar stay as-is.
//!
//! Phase 39 replaces this direct call with a `ProviderAdapter` trait
//! dispatch so the same `/v1/chat` handler routes to any provider.
use aibridge::client::{AiClient, GenerateRequest};
use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
/// Bridge an OpenAI-compatible chat request onto the aibridge client.
///
/// The message array is collapsed into a `(system, prompt)` pair, sent
/// through `AiClient::generate`, and the result is re-wrapped as an
/// OpenAI-style `ChatResponse` with a usage block. Errors are
/// stringified for the HTTP layer.
pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result<ChatResponse, String> {
    let (system, prompt) = flatten_messages(&req.messages);
    let gen_req = GenerateRequest {
        prompt,
        model: Some(req.model.clone()),
        // Only forward a system string when one was actually present.
        system: (!system.is_empty()).then_some(system),
        temperature: req.temperature,
        max_tokens: req.max_tokens,
        // Phase 38 default: think=false. Hot-path discipline for
        // thinking models (qwen3.5, qwen3, gpt-oss) which otherwise
        // burn the max_tokens budget on hidden reasoning before any
        // visible output, producing empty responses. Callers that
        // actually want reasoning (overseers, T3+ tiers) opt in via
        // the `think: true` extension field. Phase 40 routing engine
        // flips this per task class.
        think: Some(req.think.unwrap_or(false)),
    };
    let started = std::time::Instant::now();
    let resp = client.generate(gen_req).await.map_err(|e| e.to_string())?;
    let elapsed_ms = started.elapsed().as_millis();
    // Prefer sidecar-reported token counts when present. Fall back to
    // chars/4 estimate (biased safe ~15%, matches Phase 21 convention
    // in crates/aibridge/src/context.rs::estimate_tokens).
    let prompt_tokens = match resp.tokens_evaluated {
        Some(n) => n as u32,
        None => estimate_prompt_tokens(&req.messages),
    };
    let completion_tokens = match resp.tokens_generated {
        Some(n) => n as u32,
        None => ((resp.text.chars().count() + 3) / 4) as u32,
    };
    tracing::info!(
        target: "v1.chat",
        model = %req.model,
        prompt_tokens,
        completion_tokens,
        latency_ms = elapsed_ms as u64,
        "ollama chat completed",
    );
    let usage = UsageBlock {
        prompt_tokens,
        completion_tokens,
        total_tokens: prompt_tokens + completion_tokens,
    };
    Ok(ChatResponse {
        // Nanosecond timestamp gives a cheap, effectively-unique id.
        id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
        object: "chat.completion",
        created: chrono::Utc::now().timestamp(),
        model: resp.model,
        choices: vec![Choice {
            index: 0,
            message: Message::new_text("assistant", resp.text),
            finish_reason: "stop".into(),
        }],
        usage,
    })
}
/// Public re-export of the flattener so sibling adapters (Ollama Cloud,
/// future OpenRouter) can reuse the same shape collapse without
/// duplicating the logic. Keeps `(system, prompt)` format consistent
/// across providers.
/// Public entry point over the private [`flatten_messages`] collapse,
/// so sibling adapters (Ollama Cloud, future OpenRouter) can reuse the
/// same `(system, prompt)` shaping without duplicating the logic.
pub fn flatten_messages_public(messages: &[Message]) -> (String, String) {
    flatten_messages(messages)
}
/// Collapse a message array into (system, prompt). Multiple system
/// messages concatenate with a newline — matches OpenAI's documented
/// behavior. Non-system messages become role-labeled turns.
/// Collapse a message list into a `(system, prompt)` pair.
///
/// All system messages are joined with single newlines — matching
/// OpenAI's documented behavior for repeated system messages. Every
/// other message becomes a `role: body` turn separated by blank lines,
/// and the prompt ends with a bare "assistant:" cue so the model
/// continues from there.
fn flatten_messages(messages: &[Message]) -> (String, String) {
    let mut system_parts: Vec<String> = Vec::new();
    let mut prompt = String::new();
    for msg in messages {
        let body = msg.text();
        if msg.role == "system" {
            system_parts.push(body);
        } else {
            // Role-labeled turn followed by a blank-line separator.
            prompt.push_str(&format!("{}: {}\n\n", msg.role, body));
        }
    }
    prompt.push_str("assistant:");
    (system_parts.join("\n"), prompt)
}
/// Estimate prompt tokens as ceil(total_chars / 4) over all message
/// bodies — the Phase 21 chars/4 convention, biased safe.
fn estimate_prompt_tokens(messages: &[Message]) -> u32 {
    let total_chars = messages
        .iter()
        .fold(0usize, |acc, m| acc + m.text().chars().count());
    ((total_chars + 3) / 4) as u32
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture helper: build a plain string-content message.
    fn msg(role: &str, body: &str) -> Message {
        Message { role: role.into(), content: body.into() }
    }

    #[test]
    fn flatten_separates_system_from_turns() {
        let messages = vec![
            msg("system", "Rules here."),
            msg("user", "Q1"),
            msg("assistant", "A1"),
            msg("user", "Q2"),
        ];
        let (system, prompt) = flatten_messages(&messages);
        assert_eq!(system, "Rules here.");
        // Each non-system message shows up as a role-labeled turn.
        for turn in ["user: Q1", "assistant: A1", "user: Q2"] {
            assert!(prompt.contains(turn));
        }
        // Prompt must end with the assistant completion cue.
        assert!(prompt.trim_end().ends_with("assistant:"));
    }

    #[test]
    fn flatten_concatenates_multiple_system_messages() {
        let messages = vec![
            msg("system", "First."),
            msg("system", "Second."),
            msg("user", "Hi"),
        ];
        let (system, _) = flatten_messages(&messages);
        // Multiple system messages join with a single newline.
        assert_eq!(system, "First.\nSecond.");
    }

    #[test]
    fn flatten_with_no_system_returns_empty_system() {
        let (system, prompt) = flatten_messages(&[msg("user", "hi")]);
        assert!(system.is_empty());
        assert!(prompt.contains("user: hi"));
    }

    #[test]
    fn estimate_tokens_chars_div_4_ceiling() {
        // 8 chars: (8+3)/4 = 2 — exact multiple, no rounding up.
        assert_eq!(estimate_prompt_tokens(&[msg("user", "abcdefgh")]), 2);
        // 9 chars: (9+3)/4 = 3 — ceiling kicks in.
        assert_eq!(estimate_prompt_tokens(&[msg("user", "abcdefghi")]), 3);
    }
}