v1: accept OpenAI multimodal content shape (array-of-parts)

Modern OpenAI clients (pi-ai, openai SDK 6.x, langchain-js, the official
agents) send `messages[].content` as an array of content parts:
`[{type:"text", text:"..."}, {type:"image_url", ...}]`. Our gateway
typed `content` as plain `String` and 422'd those calls.
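
For reference, here are the two `content` shapes the gateway now accepts,
sketched with `serde_json` (the model name and image URL below are
placeholders, not values from this repo):

    // Minimal sketch of both request shapes; requires only serde_json.
    use serde_json::json;

    fn main() {
        // Legacy shape: content is a plain string.
        let legacy = json!({
            "model": "openai/gpt-oss-120b:free",
            "messages": [{ "role": "user", "content": "Hi" }]
        });

        // Modern shape: content is an array of typed parts. This is the
        // request that previously failed with a 422.
        let multimodal = json!({
            "model": "openai/gpt-oss-120b:free",
            "messages": [{
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image." },
                    { "type": "image_url", "image_url": { "url": "https://example.com/cat.png" } }
                ]
            }]
        });

        println!("{legacy}\n{multimodal}");
    }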

Fix: `Message.content` is now `serde_json::Value` so requests
deserialize regardless of shape. `Message::text()` flattens
content-parts arrays (concat'd `text` fields, non-text parts skipped)
for places that need a plain string — Ollama prompt assembly, char
counts, the assistant's own response synthesis. `Message::new_text()`
constructs string-content messages without writing the wrapper at
each call site. Forwarders (openrouter) clone content through
verbatim so providers see exactly what the client sent.
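
The flattening rule is small enough to sketch standalone. The snippet
below mirrors the new `Message::text()` behavior on a bare
`serde_json::Value` (an illustrative copy, not the crate's actual type):

    use serde_json::{json, Value};

    // Mirrors Message::text(): strings pass through, content-parts arrays
    // keep only the `text` fields, joined with newlines.
    fn flatten_content(content: &Value) -> String {
        match content {
            Value::String(s) => s.clone(),
            Value::Array(parts) => parts
                .iter()
                .filter_map(|p| p.get("text").and_then(|v| v.as_str()))
                .collect::<Vec<_>>()
                .join("\n"),
            other => other.to_string(),
        }
    }

    fn main() {
        // Plain string content passes through untouched.
        assert_eq!(flatten_content(&json!("Hi")), "Hi");

        // Content-parts array: text parts concatenate, the image part is skipped.
        let parts = json!([
            { "type": "text", "text": "line one" },
            { "type": "image_url", "image_url": { "url": "https://example.com/x.png" } },
            { "type": "text", "text": "line two" }
        ]);
        assert_eq!(flatten_content(&parts), "line one\nline two");
    }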

Verified end-to-end: Pi CLI (`pi --print --provider openrouter`)
landed a clean 1902-token request through `/v1/chat/completions`,
routed to OpenRouter as `openai/gpt-oss-120b:free`, response in
1.62s, Langfuse trace `v1.chat:openrouter` recorded with the provider
tag. This is the same path any tool using the official openai SDK takes.
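
A rough reproduction of that request without the Pi CLI, assuming the
gateway listens on localhost:8080 and OpenRouter routing is already
configured (the port and routing setup are assumptions); written with
`reqwest` + `tokio` for illustration:

    use serde_json::json;

    #[tokio::main]
    async fn main() -> Result<(), reqwest::Error> {
        let body = json!({
            "model": "openai/gpt-oss-120b:free",
            "messages": [{
                "role": "user",
                "content": [{ "type": "text", "text": "Say hello." }]
            }],
            "stream": false
        });

        let resp = reqwest::Client::new()
            .post("http://localhost:8080/v1/chat/completions") // assumed port
            .json(&body) // needs reqwest's "json" feature
            .send()
            .await?;

        // The gateway replies with the usual OpenAI-style choices/usage envelope.
        println!("{}", resp.text().await?);
        Ok(())
    }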

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
root committed 2026-04-26 17:56:46 -05:00
parent 3a0b37ed93
commit 540a9a27ee
7 changed files with 71 additions and 33 deletions

View File

@ -378,7 +378,7 @@ impl ExecutionLoop {
attempts = attempt + 1;
let req = ChatRequest {
model: model.to_string(),
-messages: vec![Message { role: "user".into(), content: prompt.to_string() }],
+messages: vec![Message::new_text("user", prompt.to_string())],
temperature: Some(temperature),
max_tokens: None,
stream: Some(false),
@ -389,8 +389,8 @@ impl ExecutionLoop {
.map_err(|e| format!("ollama_cloud: {e}"))?;
tokens_p = tokens_p.saturating_add(resp.usage.prompt_tokens);
tokens_c = tokens_c.saturating_add(resp.usage.completion_tokens);
-let t = resp.choices.into_iter().next()
-.map(|c| c.message.content).unwrap_or_default();
+let t: String = resp.choices.into_iter().next()
+.map(|c| c.message.text()).unwrap_or_default();
if !t.trim().is_empty() {
text = t;
break;
@ -428,7 +428,7 @@ impl ExecutionLoop {
lf.emit_chat(ChatTrace {
provider: provider.to_string(),
model: model.to_string(),
-input: vec![Message { role: "user".into(), content: prompt.to_string() }],
+input: vec![Message::new_text("user", prompt.to_string())],
output: text.clone(),
prompt_tokens,
completion_tokens,
@ -605,10 +605,7 @@ impl ExecutionLoop {
let start_time = chrono::Utc::now();
let chat_req = crate::v1::ChatRequest {
model: "gpt-oss:120b".to_string(),
-messages: vec![crate::v1::Message {
-role: "user".into(),
-content: prompt.clone(),
-}],
+messages: vec![crate::v1::Message::new_text("user", prompt.clone())],
temperature: Some(0.1),
max_tokens: None,
stream: Some(false),
@ -619,8 +616,8 @@ impl ExecutionLoop {
.map_err(|e| format!("ollama_cloud: {e}"))?;
let latency_ms = started.elapsed().as_millis() as u64;
let end_time = chrono::Utc::now();
-let correction_text = resp.choices.into_iter().next()
-.map(|c| c.message.content).unwrap_or_default();
+let correction_text: String = resp.choices.into_iter().next()
+.map(|c| c.message.text()).unwrap_or_default();
// Stamp per-task stats — cloud call counts against the same
// usage counter so `/v1/usage` shows cloud token spend too.
@ -638,7 +635,7 @@ impl ExecutionLoop {
lf.emit_chat(ChatTrace {
provider: "ollama_cloud".into(),
model: "gpt-oss:120b".into(),
-input: vec![crate::v1::Message { role: "user".into(), content: prompt.clone() }],
+input: vec![crate::v1::Message::new_text("user", prompt.clone())],
output: correction_text.clone(),
prompt_tokens: resp.usage.prompt_tokens,
completion_tokens: resp.usage.completion_tokens,

View File

@ -46,12 +46,12 @@ pub async fn chat(
let mut msgs: Vec<AnMessage> = Vec::new();
for m in &req.messages {
if m.role == "system" {
-system_parts.push(m.content.clone());
+system_parts.push(m.text());
} else {
// Anthropic expects strictly "user" or "assistant"; anything
// else we normalize to "user".
let role = if m.role == "assistant" { "assistant" } else { "user" };
-msgs.push(AnMessage { role: role.to_string(), content: m.content.clone() });
+msgs.push(AnMessage { role: role.to_string(), content: m.text() });
}
}
let system = if system_parts.is_empty() {
@ -99,7 +99,7 @@ pub async fn chat(
.unwrap_or_default();
let prompt_tokens = parsed.usage.as_ref().map(|u| u.input_tokens).unwrap_or_else(|| {
-let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
+let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
((chars + 3) / 4) as u32
});
let completion_tokens = parsed.usage.as_ref().map(|u| u.output_tokens).unwrap_or_else(|| {
@ -123,7 +123,7 @@ pub async fn chat(
model,
choices: vec![Choice {
index: 0,
-message: Message { role: "assistant".into(), content: text },
+message: Message::new_text("assistant", text),
finish_reason: parsed.stop_reason.unwrap_or_else(|| "stop".into()),
}],
usage: UsageBlock {

View File

@ -52,7 +52,7 @@ pub async fn chat(
};
contents.push(GmContent {
role: role.to_string(),
-parts: vec![GmPart { text: m.content.clone() }],
+parts: vec![GmPart { text: m.text() }],
});
}
@ -98,7 +98,7 @@ pub async fn chat(
let prompt_tokens = parsed.usage_metadata.as_ref()
.map(|u| u.prompt_token_count)
.unwrap_or_else(|| {
-let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
+let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
((chars + 3) / 4) as u32
});
let completion_tokens = parsed.usage_metadata.as_ref()
@ -122,7 +122,7 @@ pub async fn chat(
model,
choices: vec![Choice {
index: 0,
-message: Message { role: "assistant".into(), content: text },
+message: Message::new_text("assistant", text),
finish_reason: candidate.finish_reason.unwrap_or_else(|| "stop".into()),
}],
usage: UsageBlock {

View File

@ -97,10 +97,50 @@ pub fn router(state: V1State) -> Router {
// -- Shared types (OpenAI-compatible) --
+/// OpenAI-compatible message. `content` accepts either a plain string or
+/// an array of content parts (the modern multimodal shape:
+/// `[{type:"text", text:"..."}, {type:"image_url", ...}]`). We store as
+/// `serde_json::Value` to preserve client shape on forward; downstream
+/// providers can take it verbatim. `Message::text()` flattens for
+/// places that need a plain string (Ollama prompt assembly, char
+/// counts, the assistant's own response synthesis).
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct Message {
pub role: String,
-pub content: String,
+pub content: serde_json::Value,
}
+impl Message {
+/// Construct a plain text message — the common shape for callers
+/// that don't need multimodal content. Wraps the body in
+/// `serde_json::Value::String` so downstream serializers see the
+/// canonical OpenAI shape.
+pub fn new_text(role: impl Into<String>, body: impl Into<String>) -> Self {
+Self {
+role: role.into(),
+content: serde_json::Value::String(body.into()),
+}
+}
+/// Flatten content to a plain string. Strings pass through; content-
+/// part arrays concatenate the `text` fields with newlines and skip
+/// non-text parts (images etc.) — Phase 38/39 callers are text-only,
+/// real multimodal forwarding is queued.
+pub fn text(&self) -> String {
+match &self.content {
+serde_json::Value::String(s) => s.clone(),
+serde_json::Value::Array(parts) => {
+let mut out = String::new();
+for p in parts {
+if let Some(t) = p.get("text").and_then(|v| v.as_str()) {
+if !out.is_empty() { out.push('\n'); }
+out.push_str(t);
+}
+}
+out
+}
+other => other.to_string(),
+}
+}
+}
#[derive(Deserialize, Debug, Clone)]
@ -380,7 +420,7 @@ async fn chat(
// untouched.
if let Some(lf) = &state.langfuse {
let output = resp.choices.first()
-.map(|c| c.message.content.clone())
+.map(|c| c.message.text())
.unwrap_or_default();
lf.emit_chat(langfuse_trace::ChatTrace {
provider: used_provider.clone(),
@ -452,7 +492,7 @@ mod tests {
assert_eq!(r.model, "qwen3.5:latest");
assert_eq!(r.messages.len(), 2);
assert_eq!(r.messages[0].role, "system");
-assert_eq!(r.messages[1].content, "Hi");
+assert_eq!(r.messages[1].text(), "Hi");
assert_eq!(r.temperature, Some(0.2));
assert_eq!(r.max_tokens, Some(100));
}

View File

@ -60,10 +60,7 @@ pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result<ChatResponse,
model: resp.model,
choices: vec![Choice {
index: 0,
-message: Message {
-role: "assistant".into(),
-content: resp.text,
-},
+message: Message::new_text("assistant", resp.text),
finish_reason: "stop".into(),
}],
usage: UsageBlock {
@ -89,13 +86,14 @@ fn flatten_messages(messages: &[Message]) -> (String, String) {
let mut system = String::new();
let mut prompt = String::new();
for m in messages {
+let body = m.text();
if m.role == "system" {
if !system.is_empty() { system.push('\n'); }
-system.push_str(&m.content);
+system.push_str(&body);
} else {
prompt.push_str(&m.role);
prompt.push_str(": ");
-prompt.push_str(&m.content);
+prompt.push_str(&body);
prompt.push_str("\n\n");
}
}
@ -104,7 +102,7 @@ fn flatten_messages(messages: &[Message]) -> (String, String) {
}
fn estimate_prompt_tokens(messages: &[Message]) -> u32 {
-let chars: usize = messages.iter().map(|m| m.content.chars().count()).sum();
+let chars: usize = messages.iter().map(|m| m.text().chars().count()).sum();
((chars + 3) / 4) as u32
}

View File

@ -88,7 +88,7 @@ pub async fn chat(
let text = parsed.response.unwrap_or_default();
let prompt_tokens = parsed.prompt_eval_count.unwrap_or_else(|| {
-let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
+let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
((chars + 3) / 4) as u32
});
let completion_tokens = parsed.eval_count.unwrap_or_else(|| {
@ -112,7 +112,7 @@ pub async fn chat(
model: parsed.model.unwrap_or_else(|| req.model.clone()),
choices: vec![Choice {
index: 0,
-message: Message { role: "assistant".into(), content: text },
+message: Message::new_text("assistant", text),
finish_reason: "stop".into(),
}],
usage: UsageBlock {

View File

@ -59,6 +59,9 @@ pub async fn chat(
let body = ORChatBody {
model: model.clone(),
+// Pass content through verbatim — preserves OpenAI's multimodal
+// content-parts shape (`[{type:"text",text:"..."}, ...]`) so the
+// upstream provider sees exactly what the client sent.
messages: req.messages.iter().map(|m| ORMessage {
role: m.role.clone(),
content: m.content.clone(),
@ -102,7 +105,7 @@ pub async fn chat(
let text = choice.message.content;
let prompt_tokens = parsed.usage.as_ref().map(|u| u.prompt_tokens).unwrap_or_else(|| {
-let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
+let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
((chars + 3) / 4) as u32
});
let completion_tokens = parsed.usage.as_ref().map(|u| u.completion_tokens).unwrap_or_else(|| {
@ -126,7 +129,7 @@ pub async fn chat(
model,
choices: vec![Choice {
index: 0,
-message: Message { role: "assistant".into(), content: text },
+message: Message { role: "assistant".into(), content: serde_json::Value::String(text) },
finish_reason: choice.finish_reason.unwrap_or_else(|| "stop".into()),
}],
usage: UsageBlock {
@ -149,7 +152,7 @@ struct ORChatBody {
}
#[derive(Serialize)]
-struct ORMessage { role: String, content: String }
+struct ORMessage { role: String, content: serde_json::Value }
#[derive(Deserialize)]
struct ORChatResponse {