v1: accept OpenAI multimodal content shape (array-of-parts)

Modern OpenAI clients (pi-ai, openai SDK 6.x, langchain-js, the official
agents) send `messages[].content` as an array of content parts:
`[{type:"text", text:"..."}, {type:"image_url", ...}]`. Our gateway
typed `content` as plain `String` and 422'd those calls.
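
For reference, here are the two `content` shapes the gateway now accepts,
sketched with `serde_json` (the model name and image URL below are
placeholders, not values from this repo):

    // Minimal sketch of both request shapes; requires only serde_json.
    use serde_json::json;

    fn main() {
        // Legacy shape: content is a plain string.
        let legacy = json!({
            "model": "openai/gpt-oss-120b:free",
            "messages": [{ "role": "user", "content": "Hi" }]
        });

        // Modern shape: content is an array of typed parts. This is the
        // request that previously failed with a 422.
        let multimodal = json!({
            "model": "openai/gpt-oss-120b:free",
            "messages": [{
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image." },
                    { "type": "image_url", "image_url": { "url": "https://example.com/cat.png" } }
                ]
            }]
        });

        println!("{legacy}\n{multimodal}");
    }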

Fix: `Message.content` is now `serde_json::Value` so requests
deserialize regardless of shape. `Message::text()` flattens
content-parts arrays (concat'd `text` fields, non-text parts skipped)
for places that need a plain string — Ollama prompt assembly, char
counts, the assistant's own response synthesis. `Message::new_text()`
constructs string-content messages without writing the wrapper at
each call site. Forwarders (openrouter) clone content through
verbatim so providers see exactly what the client sent.
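
The flattening rule is small enough to sketch standalone. The snippet
below mirrors the new `Message::text()` behavior on a bare
`serde_json::Value` (an illustrative copy, not the crate's actual type):

    use serde_json::{json, Value};

    // Mirrors Message::text(): strings pass through, content-parts arrays
    // keep only the `text` fields, joined with newlines.
    fn flatten_content(content: &Value) -> String {
        match content {
            Value::String(s) => s.clone(),
            Value::Array(parts) => parts
                .iter()
                .filter_map(|p| p.get("text").and_then(|v| v.as_str()))
                .collect::<Vec<_>>()
                .join("\n"),
            other => other.to_string(),
        }
    }

    fn main() {
        // Plain string content passes through untouched.
        assert_eq!(flatten_content(&json!("Hi")), "Hi");

        // Content-parts array: text parts concatenate, the image part is skipped.
        let parts = json!([
            { "type": "text", "text": "line one" },
            { "type": "image_url", "image_url": { "url": "https://example.com/x.png" } },
            { "type": "text", "text": "line two" }
        ]);
        assert_eq!(flatten_content(&parts), "line one\nline two");
    }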

Verified end-to-end: Pi CLI (`pi --print --provider openrouter`)
landed a clean 1902-token request through `/v1/chat/completions`,
routed to OpenRouter as `openai/gpt-oss-120b:free`, response in
1.62s, Langfuse trace `v1.chat:openrouter` recorded with the provider
tag. This is the same path any tool using the official openai SDK takes.
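
A rough reproduction of that request without the Pi CLI, assuming the
gateway listens on localhost:8080 and OpenRouter routing is already
configured (the port and routing setup are assumptions); written with
`reqwest` + `tokio` for illustration:

    use serde_json::json;

    #[tokio::main]
    async fn main() -> Result<(), reqwest::Error> {
        let body = json!({
            "model": "openai/gpt-oss-120b:free",
            "messages": [{
                "role": "user",
                "content": [{ "type": "text", "text": "Say hello." }]
            }],
            "stream": false
        });

        let resp = reqwest::Client::new()
            .post("http://localhost:8080/v1/chat/completions") // assumed port
            .json(&body) // needs reqwest's "json" feature
            .send()
            .await?;

        // The gateway replies with the usual OpenAI-style choices/usage envelope.
        println!("{}", resp.text().await?);
        Ok(())
    }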

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
root committed 2026-04-26 17:56:46 -05:00
parent 3a0b37ed93
commit 540a9a27ee
7 changed files with 71 additions and 33 deletions

View File

@ -378,7 +378,7 @@ impl ExecutionLoop {
attempts = attempt + 1;
let req = ChatRequest {
model: model.to_string(),
-messages: vec![Message { role: "user".into(), content: prompt.to_string() }],
+messages: vec![Message::new_text("user", prompt.to_string())],
temperature: Some(temperature),
max_tokens: None,
stream: Some(false),
@ -389,8 +389,8 @@ impl ExecutionLoop {
.map_err(|e| format!("ollama_cloud: {e}"))?;
tokens_p = tokens_p.saturating_add(resp.usage.prompt_tokens);
tokens_c = tokens_c.saturating_add(resp.usage.completion_tokens);
-let t = resp.choices.into_iter().next()
-.map(|c| c.message.content).unwrap_or_default();
+let t: String = resp.choices.into_iter().next()
+.map(|c| c.message.text()).unwrap_or_default();
if !t.trim().is_empty() {
text = t;
break;
@ -428,7 +428,7 @@ impl ExecutionLoop {
lf.emit_chat(ChatTrace {
provider: provider.to_string(),
model: model.to_string(),
-input: vec![Message { role: "user".into(), content: prompt.to_string() }],
+input: vec![Message::new_text("user", prompt.to_string())],
output: text.clone(),
prompt_tokens,
completion_tokens,
@ -605,10 +605,7 @@ impl ExecutionLoop {
let start_time = chrono::Utc::now();
let chat_req = crate::v1::ChatRequest {
model: "gpt-oss:120b".to_string(),
-messages: vec![crate::v1::Message {
-role: "user".into(),
-content: prompt.clone(),
-}],
+messages: vec![crate::v1::Message::new_text("user", prompt.clone())],
temperature: Some(0.1),
max_tokens: None,
stream: Some(false),
@ -619,8 +616,8 @@ impl ExecutionLoop {
.map_err(|e| format!("ollama_cloud: {e}"))?;
let latency_ms = started.elapsed().as_millis() as u64;
let end_time = chrono::Utc::now();
-let correction_text = resp.choices.into_iter().next()
-.map(|c| c.message.content).unwrap_or_default();
+let correction_text: String = resp.choices.into_iter().next()
+.map(|c| c.message.text()).unwrap_or_default();
// Stamp per-task stats — cloud call counts against the same
// usage counter so `/v1/usage` shows cloud token spend too.
@ -638,7 +635,7 @@ impl ExecutionLoop {
lf.emit_chat(ChatTrace {
provider: "ollama_cloud".into(),
model: "gpt-oss:120b".into(),
-input: vec![crate::v1::Message { role: "user".into(), content: prompt.clone() }],
+input: vec![crate::v1::Message::new_text("user", prompt.clone())],
output: correction_text.clone(),
prompt_tokens: resp.usage.prompt_tokens,
completion_tokens: resp.usage.completion_tokens,

View File

@ -46,12 +46,12 @@ pub async fn chat(
let mut msgs: Vec<AnMessage> = Vec::new();
for m in &req.messages {
if m.role == "system" {
-system_parts.push(m.content.clone());
+system_parts.push(m.text());
} else {
// Anthropic expects strictly "user" or "assistant"; anything
// else we normalize to "user".
let role = if m.role == "assistant" { "assistant" } else { "user" };
-msgs.push(AnMessage { role: role.to_string(), content: m.content.clone() });
+msgs.push(AnMessage { role: role.to_string(), content: m.text() });
}
}
let system = if system_parts.is_empty() {
@ -99,7 +99,7 @@ pub async fn chat(
.unwrap_or_default();
let prompt_tokens = parsed.usage.as_ref().map(|u| u.input_tokens).unwrap_or_else(|| {
-let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
+let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
((chars + 3) / 4) as u32
});
let completion_tokens = parsed.usage.as_ref().map(|u| u.output_tokens).unwrap_or_else(|| {
@ -123,7 +123,7 @@ pub async fn chat(
model,
choices: vec![Choice {
index: 0,
-message: Message { role: "assistant".into(), content: text },
+message: Message::new_text("assistant", text),
finish_reason: parsed.stop_reason.unwrap_or_else(|| "stop".into()),
}],
usage: UsageBlock {

View File

@ -52,7 +52,7 @@ pub async fn chat(
};
contents.push(GmContent {
role: role.to_string(),
-parts: vec![GmPart { text: m.content.clone() }],
+parts: vec![GmPart { text: m.text() }],
});
}
@ -98,7 +98,7 @@ pub async fn chat(
let prompt_tokens = parsed.usage_metadata.as_ref()
.map(|u| u.prompt_token_count)
.unwrap_or_else(|| {
-let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
+let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
((chars + 3) / 4) as u32
});
let completion_tokens = parsed.usage_metadata.as_ref()
@ -122,7 +122,7 @@ pub async fn chat(
model,
choices: vec![Choice {
index: 0,
-message: Message { role: "assistant".into(), content: text },
+message: Message::new_text("assistant", text),
finish_reason: candidate.finish_reason.unwrap_or_else(|| "stop".into()),
}],
usage: UsageBlock {

View File

@ -97,10 +97,50 @@ pub fn router(state: V1State) -> Router {
// -- Shared types (OpenAI-compatible) --
+/// OpenAI-compatible message. `content` accepts either a plain string or
+/// an array of content parts (the modern multimodal shape:
+/// `[{type:"text", text:"..."}, {type:"image_url", ...}]`). We store as
+/// `serde_json::Value` to preserve client shape on forward; downstream
+/// providers can take it verbatim. `Message::text()` flattens for
+/// places that need a plain string (Ollama prompt assembly, char
+/// counts, the assistant's own response synthesis).
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct Message {
pub role: String,
-pub content: String,
+pub content: serde_json::Value,
}
+impl Message {
+/// Construct a plain text message — the common shape for callers
+/// that don't need multimodal content. Wraps the body in
+/// `serde_json::Value::String` so downstream serializers see the
+/// canonical OpenAI shape.
+pub fn new_text(role: impl Into<String>, body: impl Into<String>) -> Self {
+Self {
+role: role.into(),
+content: serde_json::Value::String(body.into()),
+}
+}
+/// Flatten content to a plain string. Strings pass through; content-
+/// part arrays concatenate the `text` fields with newlines and skip
+/// non-text parts (images etc.) — Phase 38/39 callers are text-only,
+/// real multimodal forwarding is queued.
+pub fn text(&self) -> String {
+match &self.content {
+serde_json::Value::String(s) => s.clone(),
+serde_json::Value::Array(parts) => {
+let mut out = String::new();
+for p in parts {
+if let Some(t) = p.get("text").and_then(|v| v.as_str()) {
+if !out.is_empty() { out.push('\n'); }
+out.push_str(t);
+}
+}
+out
+}
+other => other.to_string(),
+}
+}
+}
#[derive(Deserialize, Debug, Clone)]
@ -380,7 +420,7 @@ async fn chat(
// untouched.
if let Some(lf) = &state.langfuse {
let output = resp.choices.first()
-.map(|c| c.message.content.clone())
+.map(|c| c.message.text())
.unwrap_or_default();
lf.emit_chat(langfuse_trace::ChatTrace {
provider: used_provider.clone(),
@ -452,7 +492,7 @@ mod tests {
assert_eq!(r.model, "qwen3.5:latest");
assert_eq!(r.messages.len(), 2);
assert_eq!(r.messages[0].role, "system");
-assert_eq!(r.messages[1].content, "Hi");
+assert_eq!(r.messages[1].text(), "Hi");
assert_eq!(r.temperature, Some(0.2));
assert_eq!(r.max_tokens, Some(100));
}

View File

@ -60,10 +60,7 @@ pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result<ChatResponse,
model: resp.model,
choices: vec![Choice {
index: 0,
-message: Message {
-role: "assistant".into(),
-content: resp.text,
-},
+message: Message::new_text("assistant", resp.text),
finish_reason: "stop".into(),
}],
usage: UsageBlock {
@ -89,13 +86,14 @@ fn flatten_messages(messages: &[Message]) -> (String, String) {
let mut system = String::new();
let mut prompt = String::new();
for m in messages {
+let body = m.text();
if m.role == "system" {
if !system.is_empty() { system.push('\n'); }
-system.push_str(&m.content);
+system.push_str(&body);
} else {
prompt.push_str(&m.role);
prompt.push_str(": ");
-prompt.push_str(&m.content);
+prompt.push_str(&body);
prompt.push_str("\n\n");
}
}
@ -104,7 +102,7 @@ fn flatten_messages(messages: &[Message]) -> (String, String) {
}
fn estimate_prompt_tokens(messages: &[Message]) -> u32 {
-let chars: usize = messages.iter().map(|m| m.content.chars().count()).sum();
+let chars: usize = messages.iter().map(|m| m.text().chars().count()).sum();
((chars + 3) / 4) as u32
}

View File

@ -88,7 +88,7 @@ pub async fn chat(
let text = parsed.response.unwrap_or_default();
let prompt_tokens = parsed.prompt_eval_count.unwrap_or_else(|| {
-let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
+let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
((chars + 3) / 4) as u32
});
let completion_tokens = parsed.eval_count.unwrap_or_else(|| {
@ -112,7 +112,7 @@ pub async fn chat(
model: parsed.model.unwrap_or_else(|| req.model.clone()),
choices: vec![Choice {
index: 0,
-message: Message { role: "assistant".into(), content: text },
+message: Message::new_text("assistant", text),
finish_reason: "stop".into(),
}],
usage: UsageBlock {

View File

@ -59,6 +59,9 @@ pub async fn chat(
let body = ORChatBody {
model: model.clone(),
+// Pass content through verbatim — preserves OpenAI's multimodal
+// content-parts shape (`[{type:"text",text:"..."}, ...]`) so the
+// upstream provider sees exactly what the client sent.
messages: req.messages.iter().map(|m| ORMessage {
role: m.role.clone(),
content: m.content.clone(),
@ -102,7 +105,7 @@ pub async fn chat(
let text = choice.message.content;
let prompt_tokens = parsed.usage.as_ref().map(|u| u.prompt_tokens).unwrap_or_else(|| {
-let chars: usize = req.messages.iter().map(|m| m.content.chars().count()).sum();
+let chars: usize = req.messages.iter().map(|m| m.text().chars().count()).sum();
((chars + 3) / 4) as u32
});
let completion_tokens = parsed.usage.as_ref().map(|u| u.completion_tokens).unwrap_or_else(|| {
@ -126,7 +129,7 @@ pub async fn chat(
model,
choices: vec![Choice {
index: 0,
-message: Message { role: "assistant".into(), content: text },
+message: Message { role: "assistant".into(), content: serde_json::Value::String(text) },
finish_reason: choice.finish_reason.unwrap_or_else(|| "stop".into()),
}],
usage: UsageBlock {
@ -149,7 +152,7 @@ struct ORChatBody {
}
#[derive(Serialize)]
-struct ORMessage { role: String, content: String }
+struct ORMessage { role: String, content: serde_json::Value }
#[derive(Deserialize)]
struct ORChatResponse {