Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
Modern OpenAI clients (pi-ai, openai SDK 6.x, langchain-js, the official
agents) send `messages[].content` as an array of content parts:
`[{type:"text", text:"..."}, {type:"image_url", ...}]`. Our gateway
typed `content` as plain `String` and 422'd those calls.
Fix: `Message.content` is now `serde_json::Value` so requests
deserialize regardless of shape. `Message::text()` flattens
content-parts arrays (concat'd `text` fields, non-text parts skipped)
for places that need a plain string — Ollama prompt assembly, char
counts, the assistant's own response synthesis. `Message::new_text()`
constructs string-content messages without writing the wrapper at
each call site. Forwarders (openrouter) clone content through
verbatim so providers see exactly what the client sent.
Verified end-to-end: Pi CLI (`pi --print --provider openrouter`)
landed a clean 1902-token request through `/v1/chat/completions`,
routed to OpenRouter as `openai/gpt-oss-120b:free`, response in
1.62s, Langfuse trace `v1.chat:openrouter` recorded with provider
tag. Same path that any tool using the official openai SDK takes.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
159 lines · 5.9 KiB · Rust
//! Phase 38 — Ollama shape adapter.
//!
//! Translates `/v1/chat` (OpenAI-compatible) requests into the
//! existing aibridge `GenerateRequest` shape, and the `GenerateResponse`
//! back into an OpenAI-compatible `ChatResponse`. This is a bridge,
//! not a new client — aibridge + the Python sidecar stay as-is.
//!
//! Phase 39 replaces this direct call with a `ProviderAdapter` trait
//! dispatch so the same `/v1/chat` handler routes to any provider.
use aibridge::client::{AiClient, GenerateRequest};
|
|
use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
|
|
|
|
pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result<ChatResponse, String> {
|
|
let (system, prompt) = flatten_messages(&req.messages);
|
|
|
|
let gen_req = GenerateRequest {
|
|
prompt,
|
|
model: Some(req.model.clone()),
|
|
system: if system.is_empty() { None } else { Some(system) },
|
|
temperature: req.temperature,
|
|
max_tokens: req.max_tokens,
|
|
// Phase 38 default: think=false. Hot-path discipline for
|
|
// thinking models (qwen3.5, qwen3, gpt-oss) which otherwise
|
|
// burn the max_tokens budget on hidden reasoning before any
|
|
// visible output, producing empty responses. Callers that
|
|
// actually want reasoning (overseers, T3+ tiers) opt in via
|
|
// the `think: true` extension field. Phase 40 routing engine
|
|
// flips this per task class.
|
|
think: Some(req.think.unwrap_or(false)),
|
|
};
|
|
|
|
let t0 = std::time::Instant::now();
|
|
let resp = client.generate(gen_req).await.map_err(|e| e.to_string())?;
|
|
let latency_ms = t0.elapsed().as_millis();
|
|
|
|
// Prefer sidecar-reported token counts when present. Fall back to
|
|
// chars/4 estimate (biased safe ~15%, matches Phase 21 convention
|
|
// in crates/aibridge/src/context.rs::estimate_tokens).
|
|
let prompt_tokens = resp.tokens_evaluated
|
|
.map(|n| n as u32)
|
|
.unwrap_or_else(|| estimate_prompt_tokens(&req.messages));
|
|
let completion_tokens = resp.tokens_generated
|
|
.map(|n| n as u32)
|
|
.unwrap_or_else(|| ((resp.text.chars().count() + 3) / 4) as u32);
|
|
|
|
tracing::info!(
|
|
target: "v1.chat",
|
|
model = %req.model,
|
|
prompt_tokens,
|
|
completion_tokens,
|
|
latency_ms = latency_ms as u64,
|
|
"ollama chat completed",
|
|
);
|
|
|
|
Ok(ChatResponse {
|
|
id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
|
|
object: "chat.completion",
|
|
created: chrono::Utc::now().timestamp(),
|
|
model: resp.model,
|
|
choices: vec![Choice {
|
|
index: 0,
|
|
message: Message::new_text("assistant", resp.text),
|
|
finish_reason: "stop".into(),
|
|
}],
|
|
usage: UsageBlock {
|
|
prompt_tokens,
|
|
completion_tokens,
|
|
total_tokens: prompt_tokens + completion_tokens,
|
|
},
|
|
})
|
|
}
|
|
|
|
/// Public re-export of the flattener so sibling adapters (Ollama Cloud,
|
|
/// future OpenRouter) can reuse the same shape collapse without
|
|
/// duplicating the logic. Keeps `(system, prompt)` format consistent
|
|
/// across providers.
|
|
pub fn flatten_messages_public(messages: &[Message]) -> (String, String) {
|
|
flatten_messages(messages)
|
|
}
|
|
|
|
/// Collapse a message array into (system, prompt). Multiple system
|
|
/// messages concatenate with a newline — matches OpenAI's documented
|
|
/// behavior. Non-system messages become role-labeled turns.
|
|
fn flatten_messages(messages: &[Message]) -> (String, String) {
|
|
let mut system = String::new();
|
|
let mut prompt = String::new();
|
|
for m in messages {
|
|
let body = m.text();
|
|
if m.role == "system" {
|
|
if !system.is_empty() { system.push('\n'); }
|
|
system.push_str(&body);
|
|
} else {
|
|
prompt.push_str(&m.role);
|
|
prompt.push_str(": ");
|
|
prompt.push_str(&body);
|
|
prompt.push_str("\n\n");
|
|
}
|
|
}
|
|
prompt.push_str("assistant:");
|
|
(system, prompt)
|
|
}
|
|
|
|
fn estimate_prompt_tokens(messages: &[Message]) -> u32 {
|
|
let chars: usize = messages.iter().map(|m| m.text().chars().count()).sum();
|
|
((chars + 3) / 4) as u32
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn flatten_separates_system_from_turns() {
|
|
let msgs = vec![
|
|
Message { role: "system".into(), content: "Rules here.".into() },
|
|
Message { role: "user".into(), content: "Q1".into() },
|
|
Message { role: "assistant".into(), content: "A1".into() },
|
|
Message { role: "user".into(), content: "Q2".into() },
|
|
];
|
|
let (system, prompt) = flatten_messages(&msgs);
|
|
assert_eq!(system, "Rules here.");
|
|
assert!(prompt.contains("user: Q1"));
|
|
assert!(prompt.contains("assistant: A1"));
|
|
assert!(prompt.contains("user: Q2"));
|
|
assert!(prompt.trim_end().ends_with("assistant:"));
|
|
}
|
|
|
|
#[test]
|
|
fn flatten_concatenates_multiple_system_messages() {
|
|
let msgs = vec![
|
|
Message { role: "system".into(), content: "First.".into() },
|
|
Message { role: "system".into(), content: "Second.".into() },
|
|
Message { role: "user".into(), content: "Hi".into() },
|
|
];
|
|
let (system, _) = flatten_messages(&msgs);
|
|
assert_eq!(system, "First.\nSecond.");
|
|
}
|
|
|
|
#[test]
|
|
fn flatten_with_no_system_returns_empty_system() {
|
|
let msgs = vec![Message { role: "user".into(), content: "hi".into() }];
|
|
let (system, prompt) = flatten_messages(&msgs);
|
|
assert!(system.is_empty());
|
|
assert!(prompt.contains("user: hi"));
|
|
}
|
|
|
|
#[test]
|
|
fn estimate_tokens_chars_div_4_ceiling() {
|
|
let msgs = vec![Message { role: "user".into(), content: "abcdefgh".into() }];
|
|
// 8 chars / 4 = 2, with ceiling → 2
|
|
assert_eq!(estimate_prompt_tokens(&msgs), 2);
|
|
|
|
let msgs2 = vec![Message { role: "user".into(), content: "abcdefghi".into() }];
|
|
// 9 chars → (9+3)/4 = 3 (ceiling)
|
|
assert_eq!(estimate_prompt_tokens(&msgs2), 3);
|
|
}
|
|
}
|