diff --git a/crates/gateway/src/execution_loop/mod.rs b/crates/gateway/src/execution_loop/mod.rs
index 0da0f66..aaab58d 100644
--- a/crates/gateway/src/execution_loop/mod.rs
+++ b/crates/gateway/src/execution_loop/mod.rs
@@ -378,7 +378,7 @@ impl ExecutionLoop {
             attempts = attempt + 1;
             let req = ChatRequest {
                 model: model.to_string(),
-                messages: vec![Message { role: "user".into(), content: prompt.to_string() }],
+                messages: vec![Message::new_text("user", prompt.to_string())],
                 temperature: Some(temperature),
                 max_tokens: None,
                 stream: Some(false),
@@ -389,8 +389,8 @@ impl ExecutionLoop {
                 .map_err(|e| format!("ollama_cloud: {e}"))?;
             tokens_p = tokens_p.saturating_add(resp.usage.prompt_tokens);
             tokens_c = tokens_c.saturating_add(resp.usage.completion_tokens);
-            let t = resp.choices.into_iter().next()
-                .map(|c| c.message.content).unwrap_or_default();
+            let t: String = resp.choices.into_iter().next()
+                .map(|c| c.message.text()).unwrap_or_default();
             if !t.trim().is_empty() {
                 text = t;
                 break;
@@ -428,7 +428,7 @@ impl ExecutionLoop {
             lf.emit_chat(ChatTrace {
                 provider: provider.to_string(),
                 model: model.to_string(),
-                input: vec![Message { role: "user".into(), content: prompt.to_string() }],
+                input: vec![Message::new_text("user", prompt.to_string())],
                 output: text.clone(),
                 prompt_tokens,
                 completion_tokens,
@@ -605,10 +605,7 @@ impl ExecutionLoop {
         let start_time = chrono::Utc::now();
         let chat_req = crate::v1::ChatRequest {
             model: "gpt-oss:120b".to_string(),
-            messages: vec![crate::v1::Message {
-                role: "user".into(),
-                content: prompt.clone(),
-            }],
+            messages: vec![crate::v1::Message::new_text("user", prompt.clone())],
             temperature: Some(0.1),
             max_tokens: None,
             stream: Some(false),
@@ -619,8 +616,8 @@ impl ExecutionLoop {
             .map_err(|e| format!("ollama_cloud: {e}"))?;
         let latency_ms = started.elapsed().as_millis() as u64;
         let end_time = chrono::Utc::now();
-        let correction_text = resp.choices.into_iter().next()
-            .map(|c| c.message.content).unwrap_or_default();
+        let correction_text: String = resp.choices.into_iter().next()
+            .map(|c| c.message.text()).unwrap_or_default();

         // Stamp per-task stats — cloud call counts against the same
         // usage counter so `/v1/usage` shows cloud token spend too.
@@ -638,7 +635,7 @@ impl ExecutionLoop {
             lf.emit_chat(ChatTrace {
                 provider: "ollama_cloud".into(),
                 model: "gpt-oss:120b".into(),
-                input: vec![crate::v1::Message { role: "user".into(), content: prompt.clone() }],
+                input: vec![crate::v1::Message::new_text("user", prompt.clone())],
                 output: correction_text.clone(),
                 prompt_tokens: resp.usage.prompt_tokens,
                 completion_tokens: resp.usage.completion_tokens,
diff --git a/crates/gateway/src/v1/claude.rs b/crates/gateway/src/v1/claude.rs
index ccbe8c3..a71a15f 100644
--- a/crates/gateway/src/v1/claude.rs
+++ b/crates/gateway/src/v1/claude.rs
@@ -46,12 +46,12 @@ pub async fn chat(
     let mut msgs: Vec<AnMessage> = Vec::new();
     for m in &req.messages {
         if m.role == "system" {
-            system_parts.push(m.content.clone());
+            system_parts.push(m.text());
         } else {
             // Anthropic expects strictly "user" or "assistant"; anything
             // else we normalize to "user".
let role = if m.role == "assistant" { "assistant" } else { "user" }; - msgs.push(AnMessage { role: role.to_string(), content: m.content.clone() }); + msgs.push(AnMessage { role: role.to_string(), content: m.text() }); } } let system = if system_parts.is_empty() { @@ -99,7 +99,7 @@ pub async fn chat( .unwrap_or_default(); let prompt_tokens = parsed.usage.as_ref().map(|u| u.input_tokens).unwrap_or_else(|| { - let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum(); + let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum(); ((chars + 3) / 4) as u32 }); let completion_tokens = parsed.usage.as_ref().map(|u| u.output_tokens).unwrap_or_else(|| { @@ -123,7 +123,7 @@ pub async fn chat( model, choices: vec![Choice { index: 0, - message: Message { role: "assistant".into(), content: text }, + message: Message::new_text("assistant", text), finish_reason: parsed.stop_reason.unwrap_or_else(|| "stop".into()), }], usage: UsageBlock { diff --git a/crates/gateway/src/v1/gemini.rs b/crates/gateway/src/v1/gemini.rs index 99d4c98..5ef0782 100644 --- a/crates/gateway/src/v1/gemini.rs +++ b/crates/gateway/src/v1/gemini.rs @@ -52,7 +52,7 @@ pub async fn chat( }; contents.push(GmContent { role: role.to_string(), - parts: vec![GmPart { text: m.content.clone() }], + parts: vec![GmPart { text: m.text() }], }); } @@ -98,7 +98,7 @@ pub async fn chat( let prompt_tokens = parsed.usage_metadata.as_ref() .map(|u| u.prompt_token_count) .unwrap_or_else(|| { - let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum(); + let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum(); ((chars + 3) / 4) as u32 }); let completion_tokens = parsed.usage_metadata.as_ref() @@ -122,7 +122,7 @@ pub async fn chat( model, choices: vec![Choice { index: 0, - message: Message { role: "assistant".into(), content: text }, + message: Message::new_text("assistant", text), finish_reason: candidate.finish_reason.unwrap_or_else(|| "stop".into()), }], usage: UsageBlock { diff --git a/crates/gateway/src/v1/mod.rs b/crates/gateway/src/v1/mod.rs index 875a077..c455418 100644 --- a/crates/gateway/src/v1/mod.rs +++ b/crates/gateway/src/v1/mod.rs @@ -97,10 +97,50 @@ pub fn router(state: V1State) -> Router { // -- Shared types (OpenAI-compatible) -- +/// OpenAI-compatible message. `content` accepts either a plain string or +/// an array of content parts (the modern multimodal shape: +/// `[{type:"text", text:"..."}, {type:"image_url", ...}]`). We store as +/// `serde_json::Value` to preserve client shape on forward; downstream +/// providers can take it verbatim. `Message::text()` flattens for +/// places that need a plain string (Ollama prompt assembly, char +/// counts, the assistant's own response synthesis). #[derive(Serialize, Deserialize, Clone, Debug)] pub struct Message { pub role: String, - pub content: String, + pub content: serde_json::Value, +} + +impl Message { + /// Construct a plain text message — the common shape for callers + /// that don't need multimodal content. Wraps the body in + /// `serde_json::Value::String` so downstream serializers see the + /// canonical OpenAI shape. + pub fn new_text(role: impl Into, body: impl Into) -> Self { + Self { + role: role.into(), + content: serde_json::Value::String(body.into()), + } + } + /// Flatten content to a plain string. Strings pass through; content- + /// part arrays concatenate the `text` fields with newlines and skip + /// non-text parts (images etc.) 
+    /// real multimodal forwarding is queued.
+    pub fn text(&self) -> String {
+        match &self.content {
+            serde_json::Value::String(s) => s.clone(),
+            serde_json::Value::Array(parts) => {
+                let mut out = String::new();
+                for p in parts {
+                    if let Some(t) = p.get("text").and_then(|v| v.as_str()) {
+                        if !out.is_empty() { out.push('\n'); }
+                        out.push_str(t);
+                    }
+                }
+                out
+            }
+            other => other.to_string(),
+        }
+    }
 }

 #[derive(Deserialize, Debug, Clone)]
@@ -380,7 +420,7 @@ async fn chat(
     // untouched.
     if let Some(lf) = &state.langfuse {
         let output = resp.choices.first()
-            .map(|c| c.message.content.clone())
+            .map(|c| c.message.text())
             .unwrap_or_default();
         lf.emit_chat(langfuse_trace::ChatTrace {
             provider: used_provider.clone(),
@@ -452,7 +492,7 @@ mod tests {
         assert_eq!(r.model, "qwen3.5:latest");
         assert_eq!(r.messages.len(), 2);
         assert_eq!(r.messages[0].role, "system");
-        assert_eq!(r.messages[1].content, "Hi");
+        assert_eq!(r.messages[1].text(), "Hi");
         assert_eq!(r.temperature, Some(0.2));
         assert_eq!(r.max_tokens, Some(100));
     }
diff --git a/crates/gateway/src/v1/ollama.rs b/crates/gateway/src/v1/ollama.rs
index 71ffec3..240d8da 100644
--- a/crates/gateway/src/v1/ollama.rs
+++ b/crates/gateway/src/v1/ollama.rs
@@ -60,10 +60,7 @@ pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result<ChatResponse, String> {
 fn flatten_messages(messages: &[Message]) -> (String, String) {
     let mut system = String::new();
     let mut prompt = String::new();
     for m in messages {
+        let body = m.text();
         if m.role == "system" {
             if !system.is_empty() { system.push('\n'); }
-            system.push_str(&m.content);
+            system.push_str(&body);
         } else {
             prompt.push_str(&m.role);
             prompt.push_str(": ");
-            prompt.push_str(&m.content);
+            prompt.push_str(&body);
             prompt.push_str("\n\n");
         }
     }
@@ -104,7 +102,7 @@ fn flatten_messages(messages: &[Message]) -> (String, String) {
 }

 fn estimate_prompt_tokens(messages: &[Message]) -> u32 {
-    let chars: usize = messages.iter().map(|m| m.content.chars().count()).sum();
+    let chars: usize = messages.iter().map(|m| m.text().chars().count()).sum();
     ((chars + 3) / 4) as u32
 }
diff --git a/crates/gateway/src/v1/ollama_cloud.rs b/crates/gateway/src/v1/ollama_cloud.rs
index b6d089c..8c6c05e 100644
--- a/crates/gateway/src/v1/ollama_cloud.rs
+++ b/crates/gateway/src/v1/ollama_cloud.rs
@@ -88,7 +88,7 @@ pub async fn chat(
     let text = parsed.response.unwrap_or_default();

     let prompt_tokens = parsed.prompt_eval_count.unwrap_or_else(|| {
-        let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
+        let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
         ((chars + 3) / 4) as u32
     });
     let completion_tokens = parsed.eval_count.unwrap_or_else(|| {
@@ -112,7 +112,7 @@ pub async fn chat(
         model: parsed.model.unwrap_or_else(|| req.model.clone()),
         choices: vec![Choice {
             index: 0,
-            message: Message { role: "assistant".into(), content: text },
+            message: Message::new_text("assistant", text),
             finish_reason: "stop".into(),
         }],
         usage: UsageBlock {
diff --git a/crates/gateway/src/v1/openrouter.rs b/crates/gateway/src/v1/openrouter.rs
index d6374db..610c5eb 100644
--- a/crates/gateway/src/v1/openrouter.rs
+++ b/crates/gateway/src/v1/openrouter.rs
@@ -59,6 +59,9 @@ pub async fn chat(
     let body = ORChatBody {
         model: model.clone(),
+        // Pass content through verbatim — preserves OpenAI's multimodal
+        // content-parts shape (`[{type:"text",text:"..."}, ...]`) so the
+        // upstream provider sees exactly what the client sent.
         messages: req.messages.iter().map(|m| ORMessage {
             role: m.role.clone(),
             content: m.content.clone(),
         }).collect(),
@@ -102,7 +105,7 @@ pub async fn chat(
     let text = choice.message.content;

     let prompt_tokens = parsed.usage.as_ref().map(|u| u.prompt_tokens).unwrap_or_else(|| {
-        let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
+        let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
         ((chars + 3) / 4) as u32
     });
     let completion_tokens = parsed.usage.as_ref().map(|u| u.completion_tokens).unwrap_or_else(|| {
@@ -126,7 +129,7 @@ pub async fn chat(
         model,
         choices: vec![Choice {
             index: 0,
-            message: Message { role: "assistant".into(), content: text },
+            message: Message { role: "assistant".into(), content: serde_json::Value::String(text) },
             finish_reason: choice.finish_reason.unwrap_or_else(|| "stop".into()),
         }],
         usage: UsageBlock {
@@ -149,7 +152,7 @@ struct ORChatBody {
 }

 #[derive(Serialize)]
-struct ORMessage { role: String, content: String }
+struct ORMessage { role: String, content: serde_json::Value }

 #[derive(Deserialize)]
 struct ORChatResponse {
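
For reviewers, a minimal usage sketch of the new content handling. This is not part of the patch; it assumes the `Message` type from `crates/gateway/src/v1/mod.rs` is in scope, and the `json!` inputs are made-up examples, not taken from the repo's tests.

// Hypothetical round-trip check (illustration only): exercises
// Message::new_text and Message::text() against both content shapes.
use serde_json::json;

fn main() {
    // Plain-string content: new_text() stores the canonical OpenAI string
    // shape, and text() passes the string through unchanged.
    let plain = Message::new_text("user", "hello");
    assert_eq!(plain.text(), "hello");

    // The classic OpenAI wire shape still deserializes, since `content`
    // is now a serde_json::Value rather than a String.
    let wire: Message = serde_json::from_str(r#"{"role":"user","content":"Hi"}"#).unwrap();
    assert_eq!(wire.text(), "Hi");

    // Content-parts array: text() keeps only the `text` parts, joined with
    // '\n', and skips the image part (Phase 38/39 callers are text-only).
    let multimodal = Message {
        role: "user".into(),
        content: json!([
            {"type": "text", "text": "describe this"},
            {"type": "image_url", "image_url": {"url": "https://example.com/x.png"}},
            {"type": "text", "text": "in one line"}
        ]),
    };
    assert_eq!(multimodal.text(), "describe this\nin one line");
}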