Phase 38: Universal API skeleton — /v1/chat, /v1/usage, /v1/sessions
First slice of the control-plane pivot. OpenAI-compatible surface
over the existing aibridge → Ollama path. Additive — no existing
routes touched. All 7 unit tests green, release build clean.
What ships:
- crates/gateway/src/v1/mod.rs — router, V1State (ai_client + Usage
counter), ChatRequest/ChatResponse/Message/UsageBlock types, handlers
for /chat, /usage, /sessions. OpenAI-compatible field shapes:
{model, messages[{role,content}], temperature?, max_tokens?, stream?}
- crates/gateway/src/v1/ollama.rs — shape adapter. Flattens messages
into (system, prompt), calls aibridge.generate, unwraps response
back into OpenAI /v1/chat shape. Prefers sidecar-reported tokens;
falls back to chars/4 ceiling estimate matching Phase 21 convention.
- crates/gateway/src/main.rs — one new mod, one .nest("/v1", ...)
Tests (7/7):
- chat_request_parses_openai_shape
- chat_request_accepts_minimal
- usage_counter_default_is_zero
- flatten_separates_system_from_turns
- flatten_concatenates_multiple_system_messages
- flatten_with_no_system_returns_empty_system
- estimate_tokens_chars_div_4_ceiling
Not in this phase (per CONTROL_PLANE_PRD.md): streaming, tool calls,
session state, multi-provider, fallback chain, cost gating. All
land in Phases 39-44.
Next: live-test POST /v1/chat after gateway restart, then migrate
bot/propose.ts off direct sidecar calls to prove the loop end-to-end.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f44b6b3e6b
commit
4cb405bb42
@ -3,6 +3,7 @@ mod access_service;
|
|||||||
mod auth;
|
mod auth;
|
||||||
mod observability;
|
mod observability;
|
||||||
mod tools;
|
mod tools;
|
||||||
|
mod v1;
|
||||||
|
|
||||||
use axum::{Router, extract::DefaultBodyLimit, routing::get};
|
use axum::{Router, extract::DefaultBodyLimit, routing::get};
|
||||||
use proto::lakehouse::catalog_service_server::CatalogServiceServer;
|
use proto::lakehouse::catalog_service_server::CatalogServiceServer;
|
||||||
@ -183,6 +184,14 @@ async fn main() {
|
|||||||
registry: tool_reg,
|
registry: tool_reg,
|
||||||
query_fn: tools::QueryExecutor::new(engine.clone()),
|
query_fn: tools::QueryExecutor::new(engine.clone()),
|
||||||
}
|
}
|
||||||
|
}))
|
||||||
|
// Phase 38 — Universal API skeleton. Thin OpenAI-compatible
|
||||||
|
// surface over the existing aibridge → Ollama path. Future
|
||||||
|
// phases add provider adapters (39), routing engine (40),
|
||||||
|
// session state (41), etc. All without changing this mount.
|
||||||
|
.nest("/v1", v1::router(v1::V1State {
|
||||||
|
ai_client: ai_client.clone(),
|
||||||
|
usage: std::sync::Arc::new(tokio::sync::RwLock::new(v1::Usage::default())),
|
||||||
}));
|
}));
|
||||||
|
|
||||||
// Auth middleware (if enabled)
|
// Auth middleware (if enabled)
|
||||||
|
|||||||
184
crates/gateway/src/v1/mod.rs
Normal file
184
crates/gateway/src/v1/mod.rs
Normal file
@ -0,0 +1,184 @@
|
|||||||
|
//! Phase 38 — Universal API skeleton (`/v1/*`).
|
||||||
|
//!
|
||||||
|
//! OpenAI-compatible shape on top of the existing aibridge → Ollama
|
||||||
|
//! path. This is the thin slice: single provider, stateless, no
|
||||||
|
//! streaming. Phase 39 replaces the direct Ollama call with a
|
||||||
|
//! `ProviderAdapter` trait dispatch; Phase 40 adds routing + fallback.
|
||||||
|
//!
|
||||||
|
//! The shape matches OpenAI's `/v1/chat/completions` closely enough
|
||||||
|
//! that clients using openai-compatible SDKs can point at us with the
|
||||||
|
//! URL swap alone. We keep the endpoint path `/v1/chat` (not
|
||||||
|
//! `/v1/chat/completions`) because our PRD declares the terser form;
|
||||||
|
//! adding the alias is one line in Phase 39 when it matters.
|
||||||
|
|
||||||
|
pub mod ollama;
|
||||||
|
|
||||||
|
use axum::{
|
||||||
|
Router,
|
||||||
|
extract::State,
|
||||||
|
http::StatusCode,
|
||||||
|
response::IntoResponse,
|
||||||
|
routing::{get, post},
|
||||||
|
Json,
|
||||||
|
};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::sync::RwLock;
|
||||||
|
|
||||||
|
/// Shared state for the `/v1/*` router.
///
/// Cloned by axum per request. `usage` is an `Arc`, so all clones
/// share one process-wide counter; `ai_client` clone cost depends on
/// aibridge's internals — presumably a cheap handle, confirm there.
#[derive(Clone)]
pub struct V1State {
    /// Existing aibridge client used to reach the Ollama sidecar.
    pub ai_client: aibridge::client::AiClient,
    /// Cumulative usage counters shared across all handler clones.
    pub usage: Arc<RwLock<Usage>>,
}
|
||||||
|
|
||||||
|
#[derive(Default, Clone, Serialize)]
|
||||||
|
pub struct Usage {
|
||||||
|
pub requests: u64,
|
||||||
|
pub prompt_tokens: u64,
|
||||||
|
pub completion_tokens: u64,
|
||||||
|
pub total_tokens: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn router(state: V1State) -> Router {
|
||||||
|
Router::new()
|
||||||
|
.route("/chat", post(chat))
|
||||||
|
.route("/usage", get(usage))
|
||||||
|
.route("/sessions", get(sessions))
|
||||||
|
.with_state(state)
|
||||||
|
}
|
||||||
|
|
||||||
|
// -- Shared types (OpenAI-compatible) --
|
||||||
|
|
||||||
|
/// One chat turn in OpenAI shape: `{"role": ..., "content": ...}`.
///
/// `role` is free-form here; only `"system"` is treated specially by
/// the Ollama adapter (`ollama::flatten_messages`) — every other role
/// becomes a role-labeled prompt turn.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct Message {
    pub role: String,
    pub content: String,
}
|
||||||
|
|
||||||
|
/// Request body for `POST /v1/chat`, matching OpenAI's
/// `/v1/chat/completions` field shapes.
#[derive(Deserialize, Debug)]
pub struct ChatRequest {
    /// Provider model name, passed through verbatim.
    pub model: String,
    /// Conversation turns; must be non-empty (enforced in the handler).
    pub messages: Vec<Message>,
    /// Sampling temperature; provider default when absent.
    #[serde(default)]
    pub temperature: Option<f64>,
    /// Completion-token cap; provider default when absent.
    #[serde(default)]
    pub max_tokens: Option<u32>,
    /// Accepted for shape-compat but ignored in the thin slice —
    /// Phase 38 returns non-streaming even when the client asked for it.
    /// Phase 39+ wires real streaming.
    #[serde(default)]
    pub stream: Option<bool>,
}
|
||||||
|
|
||||||
|
/// Response body for `POST /v1/chat`, mirroring OpenAI's
/// `chat.completion` object.
#[derive(Serialize)]
pub struct ChatResponse {
    /// `chatcmpl-<nanos>` — built from a nanosecond timestamp.
    pub id: String,
    /// Always `"chat.completion"`.
    pub object: &'static str,
    /// Unix timestamp (seconds) when the response was assembled.
    pub created: i64,
    /// Model name as reported back by the provider.
    pub model: String,
    /// Exactly one choice in Phase 38 (no `n` parameter support).
    pub choices: Vec<Choice>,
    /// Token accounting for this single request.
    pub usage: UsageBlock,
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
pub struct Choice {
|
||||||
|
pub index: u32,
|
||||||
|
pub message: Message,
|
||||||
|
pub finish_reason: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Clone)]
|
||||||
|
pub struct UsageBlock {
|
||||||
|
pub prompt_tokens: u32,
|
||||||
|
pub completion_tokens: u32,
|
||||||
|
pub total_tokens: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
// -- Handlers --
|
||||||
|
|
||||||
|
async fn chat(
|
||||||
|
State(state): State<V1State>,
|
||||||
|
Json(req): Json<ChatRequest>,
|
||||||
|
) -> Result<Json<ChatResponse>, (StatusCode, String)> {
|
||||||
|
if req.messages.is_empty() {
|
||||||
|
return Err((StatusCode::BAD_REQUEST, "messages must be non-empty".into()));
|
||||||
|
}
|
||||||
|
if req.stream.unwrap_or(false) {
|
||||||
|
tracing::warn!("/v1/chat: stream=true requested but Phase 38 returns non-streaming");
|
||||||
|
}
|
||||||
|
|
||||||
|
let resp = ollama::chat(&state.ai_client, &req)
|
||||||
|
.await
|
||||||
|
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("provider: {e}")))?;
|
||||||
|
|
||||||
|
{
|
||||||
|
let mut u = state.usage.write().await;
|
||||||
|
u.requests += 1;
|
||||||
|
u.prompt_tokens += resp.usage.prompt_tokens as u64;
|
||||||
|
u.completion_tokens += resp.usage.completion_tokens as u64;
|
||||||
|
u.total_tokens += resp.usage.total_tokens as u64;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Json(resp))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn usage(State(state): State<V1State>) -> impl IntoResponse {
|
||||||
|
let snapshot = state.usage.read().await.clone();
|
||||||
|
Json(snapshot)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phase 38 is stateless — no session persistence yet. Return an empty
|
||||||
|
// list in OpenAI-ish shape so clients that probe this endpoint don't
|
||||||
|
// 404. Real session state lands in Phase 41 with the profile-system
|
||||||
|
// expansion.
|
||||||
|
async fn sessions() -> impl IntoResponse {
|
||||||
|
Json(serde_json::json!({
|
||||||
|
"data": [],
|
||||||
|
"object": "list",
|
||||||
|
"note": "Phase 38: stateless. Session state lands in Phase 41.",
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // Full OpenAI-shaped payload: every optional knob populated and
    // round-tripped through serde.
    #[test]
    fn chat_request_parses_openai_shape() {
        let raw = r#"{
            "model": "qwen3.5:latest",
            "messages": [
                {"role": "system", "content": "You are helpful."},
                {"role": "user", "content": "Hi"}
            ],
            "temperature": 0.2,
            "max_tokens": 100
        }"#;
        let r: ChatRequest = serde_json::from_str(raw).unwrap();
        assert_eq!(r.model, "qwen3.5:latest");
        assert_eq!(r.messages.len(), 2);
        assert_eq!(r.messages[0].role, "system");
        assert_eq!(r.messages[1].content, "Hi");
        assert_eq!(r.temperature, Some(0.2));
        assert_eq!(r.max_tokens, Some(100));
    }

    // Minimal payload: only required fields; all optionals must
    // default to None.
    #[test]
    fn chat_request_accepts_minimal() {
        let raw = r#"{
            "model": "any",
            "messages": [{"role": "user", "content": "hi"}]
        }"#;
        let r: ChatRequest = serde_json::from_str(raw).unwrap();
        assert_eq!(r.temperature, None);
        assert_eq!(r.max_tokens, None);
        assert_eq!(r.stream, None);
    }

    // A fresh counter starts from zero everywhere.
    #[test]
    fn usage_counter_default_is_zero() {
        let u = Usage::default();
        assert_eq!(u.requests, 0);
        assert_eq!(u.total_tokens, 0);
    }
}
|
||||||
149
crates/gateway/src/v1/ollama.rs
Normal file
149
crates/gateway/src/v1/ollama.rs
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
//! Phase 38 — Ollama shape adapter.
|
||||||
|
//!
|
||||||
|
//! Translates `/v1/chat` (OpenAI-compatible) requests into the
|
||||||
|
//! existing aibridge `GenerateRequest` shape, and the `GenerateResponse`
|
||||||
|
//! back into an OpenAI-compatible `ChatResponse`. This is a bridge,
|
||||||
|
//! not a new client — aibridge + the Python sidecar stay as-is.
|
||||||
|
//!
|
||||||
|
//! Phase 39 replaces this direct call with a `ProviderAdapter` trait
|
||||||
|
//! dispatch so the same `/v1/chat` handler routes to any provider.
|
||||||
|
|
||||||
|
use aibridge::client::{AiClient, GenerateRequest};
|
||||||
|
use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
|
||||||
|
|
||||||
|
/// Translate an OpenAI-shaped `ChatRequest` into an aibridge
/// `GenerateRequest`, call the sidecar, and re-wrap the result as an
/// OpenAI-shaped `ChatResponse`.
///
/// Errors are the stringified provider failure; the `/v1/chat` handler
/// maps them to HTTP 502.
pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result<ChatResponse, String> {
    // Collapse the message array to the (system, prompt) pair the
    // aibridge generate API expects.
    let (system, prompt) = flatten_messages(&req.messages);

    let gen_req = GenerateRequest {
        prompt,
        model: Some(req.model.clone()),
        // Empty system string maps to None ("no system prompt").
        system: if system.is_empty() { None } else { Some(system) },
        temperature: req.temperature,
        max_tokens: req.max_tokens,
        // Phase 38 default: leave thinking behavior to the model's
        // default (None). Phase 21's `think:false` discipline is a
        // call-site concern for hot-path JSON emitters — Phase 40's
        // routing engine can set it per task class.
        think: None,
    };

    // Wall-clock latency of the provider round trip, for the log line.
    let t0 = std::time::Instant::now();
    let resp = client.generate(gen_req).await.map_err(|e| e.to_string())?;
    let latency_ms = t0.elapsed().as_millis();

    // Prefer sidecar-reported token counts when present. Fall back to
    // chars/4 estimate (biased safe ~15%, matches Phase 21 convention
    // in crates/aibridge/src/context.rs::estimate_tokens).
    // NOTE(review): `n as u32` truncates silently if the sidecar ever
    // reports a count above u32::MAX — practically unreachable, but a
    // `try_into` would make the invariant explicit.
    let prompt_tokens = resp.tokens_evaluated
        .map(|n| n as u32)
        .unwrap_or_else(|| estimate_prompt_tokens(&req.messages));
    let completion_tokens = resp.tokens_generated
        .map(|n| n as u32)
        .unwrap_or_else(|| ((resp.text.chars().count() + 3) / 4) as u32);

    tracing::info!(
        target: "v1.chat",
        model = %req.model,
        prompt_tokens,
        completion_tokens,
        latency_ms = latency_ms as u64,
        "ollama chat completed",
    );

    Ok(ChatResponse {
        // Nanosecond timestamp gives a practically-unique id without a
        // UUID dependency; 0 fallback if the nanos don't fit an i64.
        id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
        object: "chat.completion",
        created: chrono::Utc::now().timestamp(),
        // Echo the model name the provider reported, not the requested one.
        model: resp.model,
        choices: vec![Choice {
            index: 0,
            message: Message {
                role: "assistant".into(),
                content: resp.text,
            },
            finish_reason: "stop".into(),
        }],
        usage: UsageBlock {
            prompt_tokens,
            completion_tokens,
            total_tokens: prompt_tokens + completion_tokens,
        },
    })
}
|
||||||
|
|
||||||
|
/// Collapse a message array into (system, prompt). Multiple system
|
||||||
|
/// messages concatenate with a newline — matches OpenAI's documented
|
||||||
|
/// behavior. Non-system messages become role-labeled turns.
|
||||||
|
fn flatten_messages(messages: &[Message]) -> (String, String) {
|
||||||
|
let mut system = String::new();
|
||||||
|
let mut prompt = String::new();
|
||||||
|
for m in messages {
|
||||||
|
if m.role == "system" {
|
||||||
|
if !system.is_empty() { system.push('\n'); }
|
||||||
|
system.push_str(&m.content);
|
||||||
|
} else {
|
||||||
|
prompt.push_str(&m.role);
|
||||||
|
prompt.push_str(": ");
|
||||||
|
prompt.push_str(&m.content);
|
||||||
|
prompt.push_str("\n\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
prompt.push_str("assistant:");
|
||||||
|
(system, prompt)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn estimate_prompt_tokens(messages: &[Message]) -> u32 {
|
||||||
|
let chars: usize = messages.iter().map(|m| m.content.chars().count()).sum();
|
||||||
|
((chars + 3) / 4) as u32
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // System turn goes to the system slot; user/assistant turns become
    // role-labeled prompt lines ending with the "assistant:" cue.
    #[test]
    fn flatten_separates_system_from_turns() {
        let msgs = vec![
            Message { role: "system".into(), content: "Rules here.".into() },
            Message { role: "user".into(), content: "Q1".into() },
            Message { role: "assistant".into(), content: "A1".into() },
            Message { role: "user".into(), content: "Q2".into() },
        ];
        let (system, prompt) = flatten_messages(&msgs);
        assert_eq!(system, "Rules here.");
        assert!(prompt.contains("user: Q1"));
        assert!(prompt.contains("assistant: A1"));
        assert!(prompt.contains("user: Q2"));
        assert!(prompt.trim_end().ends_with("assistant:"));
    }

    // Multiple system messages join with a single newline, in order.
    #[test]
    fn flatten_concatenates_multiple_system_messages() {
        let msgs = vec![
            Message { role: "system".into(), content: "First.".into() },
            Message { role: "system".into(), content: "Second.".into() },
            Message { role: "user".into(), content: "Hi".into() },
        ];
        let (system, _) = flatten_messages(&msgs);
        assert_eq!(system, "First.\nSecond.");
    }

    // No system turns → empty system string (adapter maps it to None).
    #[test]
    fn flatten_with_no_system_returns_empty_system() {
        let msgs = vec![Message { role: "user".into(), content: "hi".into() }];
        let (system, prompt) = flatten_messages(&msgs);
        assert!(system.is_empty());
        assert!(prompt.contains("user: hi"));
    }

    // Ceiling division: exact multiples stay exact, remainders round up.
    #[test]
    fn estimate_tokens_chars_div_4_ceiling() {
        let msgs = vec![Message { role: "user".into(), content: "abcdefgh".into() }];
        // 8 chars / 4 = 2, with ceiling → 2
        assert_eq!(estimate_prompt_tokens(&msgs), 2);

        let msgs2 = vec![Message { role: "user".into(), content: "abcdefghi".into() }];
        // 9 chars → (9+3)/4 = 3 (ceiling)
        assert_eq!(estimate_prompt_tokens(&msgs2), 3);
    }
}
|
||||||
Loading…
x
Reference in New Issue
Block a user