v1: accept OpenAI multimodal content shape (array-of-parts)
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
Modern OpenAI clients (pi-ai, openai SDK 6.x, langchain-js, the official
agents) send `messages[].content` as an array of content parts:
`[{type:"text", text:"..."}, {type:"image_url", ...}]`. Our gateway
typed `content` as plain `String` and 422'd those calls.
Fix: `Message.content` is now `serde_json::Value` so requests
deserialize regardless of shape. `Message::text()` flattens
content-parts arrays (concat'd `text` fields, non-text parts skipped)
for places that need a plain string — Ollama prompt assembly, char
counts, the assistant's own response synthesis. `Message::new_text()`
constructs string-content messages without writing the wrapper at
each call site. Forwarders (openrouter) clone content through
verbatim so providers see exactly what the client sent.
Verified end-to-end: Pi CLI (`pi --print --provider openrouter`)
landed a clean 1902-token request through `/v1/chat/completions`,
routed to OpenRouter as `openai/gpt-oss-120b:free`, response in
1.62s, Langfuse trace `v1.chat:openrouter` recorded with provider
tag. Same path that any tool using the official openai SDK takes.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
3a0b37ed93
commit
540a9a27ee
@ -378,7 +378,7 @@ impl ExecutionLoop {
|
||||
attempts = attempt + 1;
|
||||
let req = ChatRequest {
|
||||
model: model.to_string(),
|
||||
messages: vec![Message { role: "user".into(), content: prompt.to_string() }],
|
||||
messages: vec![Message::new_text("user", prompt.to_string())],
|
||||
temperature: Some(temperature),
|
||||
max_tokens: None,
|
||||
stream: Some(false),
|
||||
@ -389,8 +389,8 @@ impl ExecutionLoop {
|
||||
.map_err(|e| format!("ollama_cloud: {e}"))?;
|
||||
tokens_p = tokens_p.saturating_add(resp.usage.prompt_tokens);
|
||||
tokens_c = tokens_c.saturating_add(resp.usage.completion_tokens);
|
||||
let t = resp.choices.into_iter().next()
|
||||
.map(|c| c.message.content).unwrap_or_default();
|
||||
let t: String = resp.choices.into_iter().next()
|
||||
.map(|c| c.message.text()).unwrap_or_default();
|
||||
if !t.trim().is_empty() {
|
||||
text = t;
|
||||
break;
|
||||
@ -428,7 +428,7 @@ impl ExecutionLoop {
|
||||
lf.emit_chat(ChatTrace {
|
||||
provider: provider.to_string(),
|
||||
model: model.to_string(),
|
||||
input: vec![Message { role: "user".into(), content: prompt.to_string() }],
|
||||
input: vec![Message::new_text("user", prompt.to_string())],
|
||||
output: text.clone(),
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
@ -605,10 +605,7 @@ impl ExecutionLoop {
|
||||
let start_time = chrono::Utc::now();
|
||||
let chat_req = crate::v1::ChatRequest {
|
||||
model: "gpt-oss:120b".to_string(),
|
||||
messages: vec![crate::v1::Message {
|
||||
role: "user".into(),
|
||||
content: prompt.clone(),
|
||||
}],
|
||||
messages: vec![crate::v1::Message::new_text("user", prompt.clone())],
|
||||
temperature: Some(0.1),
|
||||
max_tokens: None,
|
||||
stream: Some(false),
|
||||
@ -619,8 +616,8 @@ impl ExecutionLoop {
|
||||
.map_err(|e| format!("ollama_cloud: {e}"))?;
|
||||
let latency_ms = started.elapsed().as_millis() as u64;
|
||||
let end_time = chrono::Utc::now();
|
||||
let correction_text = resp.choices.into_iter().next()
|
||||
.map(|c| c.message.content).unwrap_or_default();
|
||||
let correction_text: String = resp.choices.into_iter().next()
|
||||
.map(|c| c.message.text()).unwrap_or_default();
|
||||
|
||||
// Stamp per-task stats — cloud call counts against the same
|
||||
// usage counter so `/v1/usage` shows cloud token spend too.
|
||||
@ -638,7 +635,7 @@ impl ExecutionLoop {
|
||||
lf.emit_chat(ChatTrace {
|
||||
provider: "ollama_cloud".into(),
|
||||
model: "gpt-oss:120b".into(),
|
||||
input: vec![crate::v1::Message { role: "user".into(), content: prompt.clone() }],
|
||||
input: vec![crate::v1::Message::new_text("user", prompt.clone())],
|
||||
output: correction_text.clone(),
|
||||
prompt_tokens: resp.usage.prompt_tokens,
|
||||
completion_tokens: resp.usage.completion_tokens,
|
||||
|
||||
@ -46,12 +46,12 @@ pub async fn chat(
|
||||
let mut msgs: Vec<AnMessage> = Vec::new();
|
||||
for m in &req.messages {
|
||||
if m.role == "system" {
|
||||
system_parts.push(m.content.clone());
|
||||
system_parts.push(m.text());
|
||||
} else {
|
||||
// Anthropic expects strictly "user" or "assistant"; anything
|
||||
// else we normalize to "user".
|
||||
let role = if m.role == "assistant" { "assistant" } else { "user" };
|
||||
msgs.push(AnMessage { role: role.to_string(), content: m.content.clone() });
|
||||
msgs.push(AnMessage { role: role.to_string(), content: m.text() });
|
||||
}
|
||||
}
|
||||
let system = if system_parts.is_empty() {
|
||||
@ -99,7 +99,7 @@ pub async fn chat(
|
||||
.unwrap_or_default();
|
||||
|
||||
let prompt_tokens = parsed.usage.as_ref().map(|u| u.input_tokens).unwrap_or_else(|| {
|
||||
let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
|
||||
let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
|
||||
((chars + 3) / 4) as u32
|
||||
});
|
||||
let completion_tokens = parsed.usage.as_ref().map(|u| u.output_tokens).unwrap_or_else(|| {
|
||||
@ -123,7 +123,7 @@ pub async fn chat(
|
||||
model,
|
||||
choices: vec![Choice {
|
||||
index: 0,
|
||||
message: Message { role: "assistant".into(), content: text },
|
||||
message: Message::new_text("assistant", text),
|
||||
finish_reason: parsed.stop_reason.unwrap_or_else(|| "stop".into()),
|
||||
}],
|
||||
usage: UsageBlock {
|
||||
|
||||
@ -52,7 +52,7 @@ pub async fn chat(
|
||||
};
|
||||
contents.push(GmContent {
|
||||
role: role.to_string(),
|
||||
parts: vec![GmPart { text: m.content.clone() }],
|
||||
parts: vec![GmPart { text: m.text() }],
|
||||
});
|
||||
}
|
||||
|
||||
@ -98,7 +98,7 @@ pub async fn chat(
|
||||
let prompt_tokens = parsed.usage_metadata.as_ref()
|
||||
.map(|u| u.prompt_token_count)
|
||||
.unwrap_or_else(|| {
|
||||
let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
|
||||
let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
|
||||
((chars + 3) / 4) as u32
|
||||
});
|
||||
let completion_tokens = parsed.usage_metadata.as_ref()
|
||||
@ -122,7 +122,7 @@ pub async fn chat(
|
||||
model,
|
||||
choices: vec![Choice {
|
||||
index: 0,
|
||||
message: Message { role: "assistant".into(), content: text },
|
||||
message: Message::new_text("assistant", text),
|
||||
finish_reason: candidate.finish_reason.unwrap_or_else(|| "stop".into()),
|
||||
}],
|
||||
usage: UsageBlock {
|
||||
|
||||
@ -97,10 +97,50 @@ pub fn router(state: V1State) -> Router {
|
||||
|
||||
// -- Shared types (OpenAI-compatible) --
|
||||
|
||||
/// OpenAI-compatible message. `content` accepts either a plain string or
|
||||
/// an array of content parts (the modern multimodal shape:
|
||||
/// `[{type:"text", text:"..."}, {type:"image_url", ...}]`). We store as
|
||||
/// `serde_json::Value` to preserve client shape on forward; downstream
|
||||
/// providers can take it verbatim. `Message::text()` flattens for
|
||||
/// places that need a plain string (Ollama prompt assembly, char
|
||||
/// counts, the assistant's own response synthesis).
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct Message {
|
||||
pub role: String,
|
||||
pub content: String,
|
||||
pub content: serde_json::Value,
|
||||
}
|
||||
|
||||
impl Message {
|
||||
/// Construct a plain text message — the common shape for callers
|
||||
/// that don't need multimodal content. Wraps the body in
|
||||
/// `serde_json::Value::String` so downstream serializers see the
|
||||
/// canonical OpenAI shape.
|
||||
pub fn new_text(role: impl Into<String>, body: impl Into<String>) -> Self {
|
||||
Self {
|
||||
role: role.into(),
|
||||
content: serde_json::Value::String(body.into()),
|
||||
}
|
||||
}
|
||||
/// Flatten content to a plain string. Strings pass through; content-
|
||||
/// part arrays concatenate the `text` fields with newlines and skip
|
||||
/// non-text parts (images etc.) — Phase 38/39 callers are text-only,
|
||||
/// real multimodal forwarding is queued.
|
||||
pub fn text(&self) -> String {
|
||||
match &self.content {
|
||||
serde_json::Value::String(s) => s.clone(),
|
||||
serde_json::Value::Array(parts) => {
|
||||
let mut out = String::new();
|
||||
for p in parts {
|
||||
if let Some(t) = p.get("text").and_then(|v| v.as_str()) {
|
||||
if !out.is_empty() { out.push('\n'); }
|
||||
out.push_str(t);
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
other => other.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug, Clone)]
|
||||
@ -380,7 +420,7 @@ async fn chat(
|
||||
// untouched.
|
||||
if let Some(lf) = &state.langfuse {
|
||||
let output = resp.choices.first()
|
||||
.map(|c| c.message.content.clone())
|
||||
.map(|c| c.message.text())
|
||||
.unwrap_or_default();
|
||||
lf.emit_chat(langfuse_trace::ChatTrace {
|
||||
provider: used_provider.clone(),
|
||||
@ -452,7 +492,7 @@ mod tests {
|
||||
assert_eq!(r.model, "qwen3.5:latest");
|
||||
assert_eq!(r.messages.len(), 2);
|
||||
assert_eq!(r.messages[0].role, "system");
|
||||
assert_eq!(r.messages[1].content, "Hi");
|
||||
assert_eq!(r.messages[1].text(), "Hi");
|
||||
assert_eq!(r.temperature, Some(0.2));
|
||||
assert_eq!(r.max_tokens, Some(100));
|
||||
}
|
||||
|
||||
@ -60,10 +60,7 @@ pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result<ChatResponse,
|
||||
model: resp.model,
|
||||
choices: vec![Choice {
|
||||
index: 0,
|
||||
message: Message {
|
||||
role: "assistant".into(),
|
||||
content: resp.text,
|
||||
},
|
||||
message: Message::new_text("assistant", resp.text),
|
||||
finish_reason: "stop".into(),
|
||||
}],
|
||||
usage: UsageBlock {
|
||||
@ -89,13 +86,14 @@ fn flatten_messages(messages: &[Message]) -> (String, String) {
|
||||
let mut system = String::new();
|
||||
let mut prompt = String::new();
|
||||
for m in messages {
|
||||
let body = m.text();
|
||||
if m.role == "system" {
|
||||
if !system.is_empty() { system.push('\n'); }
|
||||
system.push_str(&m.content);
|
||||
system.push_str(&body);
|
||||
} else {
|
||||
prompt.push_str(&m.role);
|
||||
prompt.push_str(": ");
|
||||
prompt.push_str(&m.content);
|
||||
prompt.push_str(&body);
|
||||
prompt.push_str("\n\n");
|
||||
}
|
||||
}
|
||||
@ -104,7 +102,7 @@ fn flatten_messages(messages: &[Message]) -> (String, String) {
|
||||
}
|
||||
|
||||
fn estimate_prompt_tokens(messages: &[Message]) -> u32 {
|
||||
let chars: usize = messages.iter().map(|m| m.content.chars().count()).sum();
|
||||
let chars: usize = messages.iter().map(|m| m.text().chars().count()).sum();
|
||||
((chars + 3) / 4) as u32
|
||||
}
|
||||
|
||||
|
||||
@ -88,7 +88,7 @@ pub async fn chat(
|
||||
let text = parsed.response.unwrap_or_default();
|
||||
|
||||
let prompt_tokens = parsed.prompt_eval_count.unwrap_or_else(|| {
|
||||
let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
|
||||
let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
|
||||
((chars + 3) / 4) as u32
|
||||
});
|
||||
let completion_tokens = parsed.eval_count.unwrap_or_else(|| {
|
||||
@ -112,7 +112,7 @@ pub async fn chat(
|
||||
model: parsed.model.unwrap_or_else(|| req.model.clone()),
|
||||
choices: vec![Choice {
|
||||
index: 0,
|
||||
message: Message { role: "assistant".into(), content: text },
|
||||
message: Message::new_text("assistant", text),
|
||||
finish_reason: "stop".into(),
|
||||
}],
|
||||
usage: UsageBlock {
|
||||
|
||||
@ -59,6 +59,9 @@ pub async fn chat(
|
||||
|
||||
let body = ORChatBody {
|
||||
model: model.clone(),
|
||||
// Pass content through verbatim — preserves OpenAI's multimodal
|
||||
// content-parts shape (`[{type:"text",text:"..."}, ...]`) so the
|
||||
// upstream provider sees exactly what the client sent.
|
||||
messages: req.messages.iter().map(|m| ORMessage {
|
||||
role: m.role.clone(),
|
||||
content: m.content.clone(),
|
||||
@ -102,7 +105,7 @@ pub async fn chat(
|
||||
let text = choice.message.content;
|
||||
|
||||
let prompt_tokens = parsed.usage.as_ref().map(|u| u.prompt_tokens).unwrap_or_else(|| {
|
||||
let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
|
||||
let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
|
||||
((chars + 3) / 4) as u32
|
||||
});
|
||||
let completion_tokens = parsed.usage.as_ref().map(|u| u.completion_tokens).unwrap_or_else(|| {
|
||||
@ -126,7 +129,7 @@ pub async fn chat(
|
||||
model,
|
||||
choices: vec![Choice {
|
||||
index: 0,
|
||||
message: Message { role: "assistant".into(), content: text },
|
||||
message: Message { role: "assistant".into(), content: serde_json::Value::String(text) },
|
||||
finish_reason: choice.finish_reason.unwrap_or_else(|| "stop".into()),
|
||||
}],
|
||||
usage: UsageBlock {
|
||||
@ -149,7 +152,7 @@ struct ORChatBody {
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct ORMessage { role: String, content: String }
|
||||
struct ORMessage { role: String, content: serde_json::Value }
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ORChatResponse {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user