Phase 38: Universal API skeleton — /v1/chat, /v1/usage, /v1/sessions

First slice of the control-plane pivot. OpenAI-compatible surface
over the existing aibridge → Ollama path. Additive — no existing
routes touched. All 7 unit tests green, release build clean.

What ships:
- crates/gateway/src/v1/mod.rs — router, V1State (ai_client + Usage
  counter), ChatRequest/ChatResponse/Message/UsageBlock types, handlers
  for /chat, /usage, /sessions. OpenAI-compatible request shape (response
  example after this list):
  {model, messages[{role,content}], temperature?, max_tokens?, stream?}
- crates/gateway/src/v1/ollama.rs — shape adapter. Flattens messages
  into (system, prompt), calls aibridge.generate, unwraps response
  back into OpenAI /v1/chat shape. Prefers sidecar-reported tokens;
  falls back to chars/4 ceiling estimate matching Phase 21 convention.
- crates/gateway/src/main.rs — one new mod, one .nest("/v1", ...)
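
For reference, a successful /v1/chat response in this slice carries the
standard completion envelope. Illustrative values only; the id follows
the handler's chatcmpl-<nanos> format:

  {
    "id": "chatcmpl-<nanos>",
    "object": "chat.completion",
    "created": <unix-seconds>,
    "model": "qwen3.5:latest",
    "choices": [{
      "index": 0,
      "message": {"role": "assistant", "content": "..."},
      "finish_reason": "stop"
    }],
    "usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46}
  }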

Tests (7/7):
- chat_request_parses_openai_shape
- chat_request_accepts_minimal
- usage_counter_default_is_zero
- flatten_separates_system_from_turns
- flatten_concatenates_multiple_system_messages
- flatten_with_no_system_returns_empty_system
- estimate_tokens_chars_div_4_ceiling
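
Reproduce locally with something like (package name assumed to match
its crates/gateway path):

  cargo test -p gateway v1::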

Not in this phase (per CONTROL_PLANE_PRD.md): streaming, tool calls,
session state, multi-provider, fallback chain, cost gating. All
land in Phases 39-44.

Next: live-test POST /v1/chat after gateway restart, then migrate
bot/propose.ts off direct sidecar calls to prove the loop end-to-end.
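
A plausible smoke test once the gateway is back up (host and port are
deployment-specific, shown as placeholders here):

  curl -s http://<gateway-host>/v1/chat \
    -H 'Content-Type: application/json' \
    -d '{"model": "qwen3.5:latest", "messages": [{"role": "user", "content": "ping"}]}'

A chat.completion object should come back, and a follow-up GET to
/v1/usage should show the counters incremented.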

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
profit 2026-04-22 02:47:15 -05:00
parent f44b6b3e6b
commit 4cb405bb42
3 changed files with 342 additions and 0 deletions

crates/gateway/src/main.rs

@@ -3,6 +3,7 @@ mod access_service;
mod auth;
mod observability;
mod tools;
mod v1;
use axum::{Router, extract::DefaultBodyLimit, routing::get};
use proto::lakehouse::catalog_service_server::CatalogServiceServer;
@@ -183,6 +184,14 @@ async fn main() {
                registry: tool_reg,
                query_fn: tools::QueryExecutor::new(engine.clone()),
            }
        }))
        // Phase 38 — Universal API skeleton. Thin OpenAI-compatible
        // surface over the existing aibridge → Ollama path. Future
        // phases add provider adapters (39), routing engine (40),
        // session state (41), etc. All without changing this mount.
        .nest("/v1", v1::router(v1::V1State {
            ai_client: ai_client.clone(),
            usage: std::sync::Arc::new(tokio::sync::RwLock::new(v1::Usage::default())),
        }));
    // Auth middleware (if enabled)

crates/gateway/src/v1/mod.rs

@@ -0,0 +1,184 @@
//! Phase 38 — Universal API skeleton (`/v1/*`).
//!
//! OpenAI-compatible shape on top of the existing aibridge → Ollama
//! path. This is the thin slice: single provider, stateless, no
//! streaming. Phase 39 replaces the direct Ollama call with a
//! `ProviderAdapter` trait dispatch; Phase 40 adds routing + fallback.
//!
//! The shape matches OpenAI's `/v1/chat/completions` closely enough
//! that clients using openai-compatible SDKs can point at us with the
//! URL swap alone. We keep the endpoint path `/v1/chat` (not
//! `/v1/chat/completions`) because our PRD declares the terser form;
//! adding the alias is one line in Phase 39 when it matters.

pub mod ollama;

use axum::{
    Router,
    extract::State,
    http::StatusCode,
    response::IntoResponse,
    routing::{get, post},
    Json,
};
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use tokio::sync::RwLock;

#[derive(Clone)]
pub struct V1State {
    pub ai_client: aibridge::client::AiClient,
    pub usage: Arc<RwLock<Usage>>,
}

#[derive(Default, Clone, Serialize)]
pub struct Usage {
    pub requests: u64,
    pub prompt_tokens: u64,
    pub completion_tokens: u64,
    pub total_tokens: u64,
}

pub fn router(state: V1State) -> Router {
    Router::new()
        .route("/chat", post(chat))
        .route("/usage", get(usage))
        .route("/sessions", get(sessions))
        .with_state(state)
}

// -- Shared types (OpenAI-compatible) --

#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct Message {
    pub role: String,
    pub content: String,
}

#[derive(Deserialize, Debug)]
pub struct ChatRequest {
    pub model: String,
    pub messages: Vec<Message>,
    #[serde(default)]
    pub temperature: Option<f64>,
    #[serde(default)]
    pub max_tokens: Option<u32>,
    /// Accepted for shape-compat but ignored in the thin slice —
    /// Phase 38 returns non-streaming even when the client asked for it.
    /// Phase 39+ wires real streaming.
    #[serde(default)]
    pub stream: Option<bool>,
}

#[derive(Serialize)]
pub struct ChatResponse {
    pub id: String,
    pub object: &'static str,
    pub created: i64,
    pub model: String,
    pub choices: Vec<Choice>,
    pub usage: UsageBlock,
}

#[derive(Serialize)]
pub struct Choice {
    pub index: u32,
    pub message: Message,
    pub finish_reason: String,
}

#[derive(Serialize, Deserialize, Clone)]
pub struct UsageBlock {
    pub prompt_tokens: u32,
    pub completion_tokens: u32,
    pub total_tokens: u32,
}

// -- Handlers --

async fn chat(
    State(state): State<V1State>,
    Json(req): Json<ChatRequest>,
) -> Result<Json<ChatResponse>, (StatusCode, String)> {
    if req.messages.is_empty() {
        return Err((StatusCode::BAD_REQUEST, "messages must be non-empty".into()));
    }
    if req.stream.unwrap_or(false) {
        tracing::warn!("/v1/chat: stream=true requested but Phase 38 returns non-streaming");
    }

    let resp = ollama::chat(&state.ai_client, &req)
        .await
        .map_err(|e| (StatusCode::BAD_GATEWAY, format!("provider: {e}")))?;

    {
        let mut u = state.usage.write().await;
        u.requests += 1;
        u.prompt_tokens += resp.usage.prompt_tokens as u64;
        u.completion_tokens += resp.usage.completion_tokens as u64;
        u.total_tokens += resp.usage.total_tokens as u64;
    }

    Ok(Json(resp))
}

async fn usage(State(state): State<V1State>) -> impl IntoResponse {
    let snapshot = state.usage.read().await.clone();
    Json(snapshot)
}

// Phase 38 is stateless — no session persistence yet. Return an empty
// list in OpenAI-ish shape so clients that probe this endpoint don't
// 404. Real session state lands in Phase 41 with the profile-system
// expansion.
async fn sessions() -> impl IntoResponse {
    Json(serde_json::json!({
        "data": [],
        "object": "list",
        "note": "Phase 38: stateless. Session state lands in Phase 41.",
    }))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn chat_request_parses_openai_shape() {
        let raw = r#"{
            "model": "qwen3.5:latest",
            "messages": [
                {"role": "system", "content": "You are helpful."},
                {"role": "user", "content": "Hi"}
            ],
            "temperature": 0.2,
            "max_tokens": 100
        }"#;
        let r: ChatRequest = serde_json::from_str(raw).unwrap();
        assert_eq!(r.model, "qwen3.5:latest");
        assert_eq!(r.messages.len(), 2);
        assert_eq!(r.messages[0].role, "system");
        assert_eq!(r.messages[1].content, "Hi");
        assert_eq!(r.temperature, Some(0.2));
        assert_eq!(r.max_tokens, Some(100));
    }

    #[test]
    fn chat_request_accepts_minimal() {
        let raw = r#"{
            "model": "any",
            "messages": [{"role": "user", "content": "hi"}]
        }"#;
        let r: ChatRequest = serde_json::from_str(raw).unwrap();
        assert_eq!(r.temperature, None);
        assert_eq!(r.max_tokens, None);
        assert_eq!(r.stream, None);
    }

    #[test]
    fn usage_counter_default_is_zero() {
        let u = Usage::default();
        assert_eq!(u.requests, 0);
        assert_eq!(u.total_tokens, 0);
    }
}

crates/gateway/src/v1/ollama.rs

@@ -0,0 +1,149 @@
//! Phase 38 — Ollama shape adapter.
//!
//! Translates `/v1/chat` (OpenAI-compatible) requests into the
//! existing aibridge `GenerateRequest` shape, and the `GenerateResponse`
//! back into an OpenAI-compatible `ChatResponse`. This is a bridge,
//! not a new client — aibridge + the Python sidecar stay as-is.
//!
//! Phase 39 replaces this direct call with a `ProviderAdapter` trait
//! dispatch so the same `/v1/chat` handler routes to any provider.

use aibridge::client::{AiClient, GenerateRequest};

use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};

pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result<ChatResponse, String> {
    let (system, prompt) = flatten_messages(&req.messages);

    let gen_req = GenerateRequest {
        prompt,
        model: Some(req.model.clone()),
        system: if system.is_empty() { None } else { Some(system) },
        temperature: req.temperature,
        max_tokens: req.max_tokens,
        // Phase 38 default: leave thinking behavior to the model's
        // default (None). Phase 21's `think:false` discipline is a
        // call-site concern for hot-path JSON emitters — Phase 40's
        // routing engine can set it per task class.
        think: None,
    };

    let t0 = std::time::Instant::now();
    let resp = client.generate(gen_req).await.map_err(|e| e.to_string())?;
    let latency_ms = t0.elapsed().as_millis();

    // Prefer sidecar-reported token counts when present. Fall back to
    // chars/4 estimate (biased safe ~15%, matches Phase 21 convention
    // in crates/aibridge/src/context.rs::estimate_tokens).
    let prompt_tokens = resp.tokens_evaluated
        .map(|n| n as u32)
        .unwrap_or_else(|| estimate_prompt_tokens(&req.messages));
    let completion_tokens = resp.tokens_generated
        .map(|n| n as u32)
        .unwrap_or_else(|| ((resp.text.chars().count() + 3) / 4) as u32);

    tracing::info!(
        target: "v1.chat",
        model = %req.model,
        prompt_tokens,
        completion_tokens,
        latency_ms = latency_ms as u64,
        "ollama chat completed",
    );

    Ok(ChatResponse {
        id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
        object: "chat.completion",
        created: chrono::Utc::now().timestamp(),
        model: resp.model,
        choices: vec![Choice {
            index: 0,
            message: Message {
                role: "assistant".into(),
                content: resp.text,
            },
            finish_reason: "stop".into(),
        }],
        usage: UsageBlock {
            prompt_tokens,
            completion_tokens,
            total_tokens: prompt_tokens + completion_tokens,
        },
    })
}

/// Collapse a message array into (system, prompt). Multiple system
/// messages concatenate with a newline — matches OpenAI's documented
/// behavior. Non-system messages become role-labeled turns.
fn flatten_messages(messages: &[Message]) -> (String, String) {
    let mut system = String::new();
    let mut prompt = String::new();

    for m in messages {
        if m.role == "system" {
            if !system.is_empty() { system.push('\n'); }
            system.push_str(&m.content);
        } else {
            prompt.push_str(&m.role);
            prompt.push_str(": ");
            prompt.push_str(&m.content);
            prompt.push_str("\n\n");
        }
    }

    prompt.push_str("assistant:");
    (system, prompt)
}

fn estimate_prompt_tokens(messages: &[Message]) -> u32 {
    let chars: usize = messages.iter().map(|m| m.content.chars().count()).sum();
    ((chars + 3) / 4) as u32
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn flatten_separates_system_from_turns() {
        let msgs = vec![
            Message { role: "system".into(), content: "Rules here.".into() },
            Message { role: "user".into(), content: "Q1".into() },
            Message { role: "assistant".into(), content: "A1".into() },
            Message { role: "user".into(), content: "Q2".into() },
        ];
        let (system, prompt) = flatten_messages(&msgs);
        assert_eq!(system, "Rules here.");
        assert!(prompt.contains("user: Q1"));
        assert!(prompt.contains("assistant: A1"));
        assert!(prompt.contains("user: Q2"));
        assert!(prompt.trim_end().ends_with("assistant:"));
    }

    #[test]
    fn flatten_concatenates_multiple_system_messages() {
        let msgs = vec![
            Message { role: "system".into(), content: "First.".into() },
            Message { role: "system".into(), content: "Second.".into() },
            Message { role: "user".into(), content: "Hi".into() },
        ];
        let (system, _) = flatten_messages(&msgs);
        assert_eq!(system, "First.\nSecond.");
    }

    #[test]
    fn flatten_with_no_system_returns_empty_system() {
        let msgs = vec![Message { role: "user".into(), content: "hi".into() }];
        let (system, prompt) = flatten_messages(&msgs);
        assert!(system.is_empty());
        assert!(prompt.contains("user: hi"));
    }

    #[test]
    fn estimate_tokens_chars_div_4_ceiling() {
        let msgs = vec![Message { role: "user".into(), content: "abcdefgh".into() }];
        // 8 chars / 4 = 2, with ceiling → 2
        assert_eq!(estimate_prompt_tokens(&msgs), 2);

        let msgs2 = vec![Message { role: "user".into(), content: "abcdefghi".into() }];
        // 9 chars → (9+3)/4 = 3 (ceiling)
        assert_eq!(estimate_prompt_tokens(&msgs2), 3);
    }
}