From 4cb405bb4236542677cb2ed5b78b57d42fcb1570 Mon Sep 17 00:00:00 2001
From: profit
Date: Wed, 22 Apr 2026 02:47:15 -0500
Subject: [PATCH] =?UTF-8?q?Phase=2038:=20Universal=20API=20skeleton=20?=
 =?UTF-8?q?=E2=80=94=20/v1/chat,=20/v1/usage,=20/v1/sessions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

First slice of the control-plane pivot. OpenAI-compatible surface over
the existing aibridge → Ollama path. Additive — no existing routes
touched. All 7 unit tests green, release build clean.

What ships:

- crates/gateway/src/v1/mod.rs — router, V1State (ai_client + Usage
  counter), ChatRequest/ChatResponse/Message/UsageBlock types, handlers
  for /chat, /usage, /sessions. OpenAI-compatible field shapes:
  {model, messages[{role,content}], temperature?, max_tokens?, stream?}
- crates/gateway/src/v1/ollama.rs — shape adapter. Flattens messages
  into (system, prompt), calls aibridge.generate, unwraps response back
  into OpenAI /v1/chat shape. Prefers sidecar-reported tokens; falls
  back to chars/4 ceiling estimate matching Phase 21 convention.
- crates/gateway/src/main.rs — one new mod, one .nest("/v1", ...)

Tests (7/7):

- chat_request_parses_openai_shape
- chat_request_accepts_minimal
- usage_counter_default_is_zero
- flatten_separates_system_from_turns
- flatten_concatenates_multiple_system_messages
- flatten_with_no_system_returns_empty_system
- estimate_tokens_chars_div_4_ceiling

Not in this phase (per CONTROL_PLANE_PRD.md): streaming, tool calls,
session state, multi-provider, fallback chain, cost gating. All land in
Phases 39-44.

Next: live-test POST /v1/chat after gateway restart, then migrate
bot/propose.ts off direct sidecar calls to prove the loop end-to-end.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 crates/gateway/src/main.rs      |   9 ++
 crates/gateway/src/v1/mod.rs    | 184 ++++++++++++++++++++++++++++++++
 crates/gateway/src/v1/ollama.rs | 149 ++++++++++++++++++++++++++
 3 files changed, 342 insertions(+)
 create mode 100644 crates/gateway/src/v1/mod.rs
 create mode 100644 crates/gateway/src/v1/ollama.rs

diff --git a/crates/gateway/src/main.rs b/crates/gateway/src/main.rs
index 3a55ba0..7c2b20d 100644
--- a/crates/gateway/src/main.rs
+++ b/crates/gateway/src/main.rs
@@ -3,6 +3,7 @@ mod access_service;
 mod auth;
 mod observability;
 mod tools;
+mod v1;
 
 use axum::{Router, extract::DefaultBodyLimit, routing::get};
 use proto::lakehouse::catalog_service_server::CatalogServiceServer;
@@ -183,6 +184,14 @@ async fn main() {
             registry: tool_reg,
             query_fn: tools::QueryExecutor::new(engine.clone()),
         }
+        }))
+        // Phase 38 — Universal API skeleton. Thin OpenAI-compatible
+        // surface over the existing aibridge → Ollama path. Future
+        // phases add provider adapters (39), routing engine (40),
+        // session state (41), etc. All without changing this mount.
+        .nest("/v1", v1::router(v1::V1State {
+            ai_client: ai_client.clone(),
+            usage: std::sync::Arc::new(tokio::sync::RwLock::new(v1::Usage::default())),
         }));
 
     // Auth middleware (if enabled)
diff --git a/crates/gateway/src/v1/mod.rs b/crates/gateway/src/v1/mod.rs
new file mode 100644
index 0000000..ca18487
--- /dev/null
+++ b/crates/gateway/src/v1/mod.rs
@@ -0,0 +1,184 @@
+//! Phase 38 — Universal API skeleton (`/v1/*`).
+//!
+//! OpenAI-compatible shape on top of the existing aibridge → Ollama
+//! path. This is the thin slice: single provider, stateless, no
+//! streaming. Phase 39 replaces the direct Ollama call with a
+//! `ProviderAdapter` trait dispatch; Phase 40 adds routing + fallback.
+//!
+//! The shape matches OpenAI's `/v1/chat/completions` closely enough
+//! that clients using OpenAI-compatible SDKs can point at us with the
+//! URL swap alone. We keep the endpoint path `/v1/chat` (not
+//! `/v1/chat/completions`) because our PRD declares the terser form;
+//! adding the alias is one line in Phase 39 when it matters.
+
+pub mod ollama;
+
+use axum::{
+    Router,
+    extract::State,
+    http::StatusCode,
+    response::IntoResponse,
+    routing::{get, post},
+    Json,
+};
+use serde::{Deserialize, Serialize};
+use std::sync::Arc;
+use tokio::sync::RwLock;
+
+#[derive(Clone)]
+pub struct V1State {
+    pub ai_client: aibridge::client::AiClient,
+    pub usage: Arc<RwLock<Usage>>,
+}
+
+#[derive(Default, Clone, Serialize)]
+pub struct Usage {
+    pub requests: u64,
+    pub prompt_tokens: u64,
+    pub completion_tokens: u64,
+    pub total_tokens: u64,
+}
+
+pub fn router(state: V1State) -> Router {
+    Router::new()
+        .route("/chat", post(chat))
+        .route("/usage", get(usage))
+        .route("/sessions", get(sessions))
+        .with_state(state)
+}
+
+// -- Shared types (OpenAI-compatible) --
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct Message {
+    pub role: String,
+    pub content: String,
+}
+
+#[derive(Deserialize, Debug)]
+pub struct ChatRequest {
+    pub model: String,
+    pub messages: Vec<Message>,
+    #[serde(default)]
+    pub temperature: Option<f32>,
+    #[serde(default)]
+    pub max_tokens: Option<u32>,
+    /// Accepted for shape-compat but ignored in the thin slice —
+    /// Phase 38 returns non-streaming even when the client asked for it.
+    /// Phase 39+ wires real streaming.
+    #[serde(default)]
+    pub stream: Option<bool>,
+}
+
+#[derive(Serialize)]
+pub struct ChatResponse {
+    pub id: String,
+    pub object: &'static str,
+    pub created: i64,
+    pub model: String,
+    pub choices: Vec<Choice>,
+    pub usage: UsageBlock,
+}
+
+#[derive(Serialize)]
+pub struct Choice {
+    pub index: u32,
+    pub message: Message,
+    pub finish_reason: String,
+}
+
+#[derive(Serialize, Deserialize, Clone)]
+pub struct UsageBlock {
+    pub prompt_tokens: u32,
+    pub completion_tokens: u32,
+    pub total_tokens: u32,
+}
+
+// -- Handlers --
+
+async fn chat(
+    State(state): State<V1State>,
+    Json(req): Json<ChatRequest>,
+) -> Result<Json<ChatResponse>, (StatusCode, String)> {
+    if req.messages.is_empty() {
+        return Err((StatusCode::BAD_REQUEST, "messages must be non-empty".into()));
+    }
+    if req.stream.unwrap_or(false) {
+        tracing::warn!("/v1/chat: stream=true requested but Phase 38 returns non-streaming");
+    }
+
+    let resp = ollama::chat(&state.ai_client, &req)
+        .await
+        .map_err(|e| (StatusCode::BAD_GATEWAY, format!("provider: {e}")))?;
+
+    {
+        let mut u = state.usage.write().await;
+        u.requests += 1;
+        u.prompt_tokens += resp.usage.prompt_tokens as u64;
+        u.completion_tokens += resp.usage.completion_tokens as u64;
+        u.total_tokens += resp.usage.total_tokens as u64;
+    }
+
+    Ok(Json(resp))
+}
+
+async fn usage(State(state): State<V1State>) -> impl IntoResponse {
+    let snapshot = state.usage.read().await.clone();
+    Json(snapshot)
+}
+
+// Phase 38 is stateless — no session persistence yet. Return an empty
+// list in OpenAI-ish shape so clients that probe this endpoint don't
+// 404. Real session state lands in Phase 41 with the profile-system
+// expansion.
+async fn sessions() -> impl IntoResponse {
+    Json(serde_json::json!({
+        "data": [],
+        "object": "list",
+        "note": "Phase 38: stateless. Session state lands in Phase 41.",
+    }))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn chat_request_parses_openai_shape() {
+        let raw = r#"{
+            "model": "qwen3.5:latest",
+            "messages": [
+                {"role": "system", "content": "You are helpful."},
+                {"role": "user", "content": "Hi"}
+            ],
+            "temperature": 0.2,
+            "max_tokens": 100
+        }"#;
+        let r: ChatRequest = serde_json::from_str(raw).unwrap();
+        assert_eq!(r.model, "qwen3.5:latest");
+        assert_eq!(r.messages.len(), 2);
+        assert_eq!(r.messages[0].role, "system");
+        assert_eq!(r.messages[1].content, "Hi");
+        assert_eq!(r.temperature, Some(0.2));
+        assert_eq!(r.max_tokens, Some(100));
+    }
+
+    #[test]
+    fn chat_request_accepts_minimal() {
+        let raw = r#"{
+            "model": "any",
+            "messages": [{"role": "user", "content": "hi"}]
+        }"#;
+        let r: ChatRequest = serde_json::from_str(raw).unwrap();
+        assert_eq!(r.temperature, None);
+        assert_eq!(r.max_tokens, None);
+        assert_eq!(r.stream, None);
+    }
+
+    #[test]
+    fn usage_counter_default_is_zero() {
+        let u = Usage::default();
+        assert_eq!(u.requests, 0);
+        assert_eq!(u.total_tokens, 0);
+    }
+}
diff --git a/crates/gateway/src/v1/ollama.rs b/crates/gateway/src/v1/ollama.rs
new file mode 100644
index 0000000..c768960
--- /dev/null
+++ b/crates/gateway/src/v1/ollama.rs
@@ -0,0 +1,149 @@
+//! Phase 38 — Ollama shape adapter.
+//!
+//! Translates `/v1/chat` (OpenAI-compatible) requests into the
+//! existing aibridge `GenerateRequest` shape, and the `GenerateResponse`
+//! back into an OpenAI-compatible `ChatResponse`. This is a bridge,
+//! not a new client — aibridge + the Python sidecar stay as-is.
+//!
+//! Phase 39 replaces this direct call with a `ProviderAdapter` trait
+//! dispatch so the same `/v1/chat` handler routes to any provider.
+
+use aibridge::client::{AiClient, GenerateRequest};
+use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
+
+pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result<ChatResponse, String> {
+    let (system, prompt) = flatten_messages(&req.messages);
+
+    let gen_req = GenerateRequest {
+        prompt,
+        model: Some(req.model.clone()),
+        system: if system.is_empty() { None } else { Some(system) },
+        temperature: req.temperature,
+        max_tokens: req.max_tokens,
+        // Phase 38 default: leave thinking behavior to the model's
+        // default (None). Phase 21's `think:false` discipline is a
+        // call-site concern for hot-path JSON emitters — Phase 40's
+        // routing engine can set it per task class.
+        think: None,
+    };
+
+    let t0 = std::time::Instant::now();
+    let resp = client.generate(gen_req).await.map_err(|e| e.to_string())?;
+    let latency_ms = t0.elapsed().as_millis();
+
+    // Prefer sidecar-reported token counts when present. Fall back to
+    // chars/4 estimate (biased safe ~15%, matches Phase 21 convention
+    // in crates/aibridge/src/context.rs::estimate_tokens).
+    let prompt_tokens = resp.tokens_evaluated
+        .map(|n| n as u32)
+        .unwrap_or_else(|| estimate_prompt_tokens(&req.messages));
+    let completion_tokens = resp.tokens_generated
+        .map(|n| n as u32)
+        .unwrap_or_else(|| ((resp.text.chars().count() + 3) / 4) as u32);
+
+    tracing::info!(
+        target: "v1.chat",
+        model = %req.model,
+        prompt_tokens,
+        completion_tokens,
+        latency_ms = latency_ms as u64,
+        "ollama chat completed",
+    );
+
+    Ok(ChatResponse {
+        id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
+        object: "chat.completion",
+        created: chrono::Utc::now().timestamp(),
+        model: resp.model,
+        choices: vec![Choice {
+            index: 0,
+            message: Message {
+                role: "assistant".into(),
+                content: resp.text,
+            },
+            finish_reason: "stop".into(),
+        }],
+        usage: UsageBlock {
+            prompt_tokens,
+            completion_tokens,
+            total_tokens: prompt_tokens + completion_tokens,
+        },
+    })
+}
+
+/// Collapse a message array into (system, prompt). Multiple system
+/// messages concatenate with a newline — matches OpenAI's documented
+/// behavior. Non-system messages become role-labeled turns.
+fn flatten_messages(messages: &[Message]) -> (String, String) {
+    let mut system = String::new();
+    let mut prompt = String::new();
+    for m in messages {
+        if m.role == "system" {
+            if !system.is_empty() { system.push('\n'); }
+            system.push_str(&m.content);
+        } else {
+            prompt.push_str(&m.role);
+            prompt.push_str(": ");
+            prompt.push_str(&m.content);
+            prompt.push_str("\n\n");
+        }
+    }
+    prompt.push_str("assistant:");
+    (system, prompt)
+}
+
+fn estimate_prompt_tokens(messages: &[Message]) -> u32 {
+    let chars: usize = messages.iter().map(|m| m.content.chars().count()).sum();
+    ((chars + 3) / 4) as u32
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn flatten_separates_system_from_turns() {
+        let msgs = vec![
+            Message { role: "system".into(), content: "Rules here.".into() },
+            Message { role: "user".into(), content: "Q1".into() },
+            Message { role: "assistant".into(), content: "A1".into() },
+            Message { role: "user".into(), content: "Q2".into() },
+        ];
+        let (system, prompt) = flatten_messages(&msgs);
+        assert_eq!(system, "Rules here.");
+        assert!(prompt.contains("user: Q1"));
+        assert!(prompt.contains("assistant: A1"));
+        assert!(prompt.contains("user: Q2"));
+        assert!(prompt.trim_end().ends_with("assistant:"));
+    }
+
+    #[test]
+    fn flatten_concatenates_multiple_system_messages() {
+        let msgs = vec![
+            Message { role: "system".into(), content: "First.".into() },
+            Message { role: "system".into(), content: "Second.".into() },
+            Message { role: "user".into(), content: "Hi".into() },
+        ];
+        let (system, _) = flatten_messages(&msgs);
+        assert_eq!(system, "First.\nSecond.");
+    }
+
+    #[test]
+    fn flatten_with_no_system_returns_empty_system() {
+        let msgs = vec![Message { role: "user".into(), content: "hi".into() }];
+        let (system, prompt) = flatten_messages(&msgs);
+        assert!(system.is_empty());
+        assert!(prompt.contains("user: hi"));
+    }
+
+    #[test]
+    fn estimate_tokens_chars_div_4_ceiling() {
+        let msgs = vec![Message { role: "user".into(), content: "abcdefgh".into() }];
+        // 8 chars / 4 = 2, with ceiling → 2
+        assert_eq!(estimate_prompt_tokens(&msgs), 2);
+
+        let msgs2 = vec![Message { role: "user".into(), content: "abcdefghi".into() }];
+        // 9 chars → (9+3)/4 = 3 (ceiling)
+        assert_eq!(estimate_prompt_tokens(&msgs2), 3);
+    }
+}
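
Smoke test for the "Next: live-test POST /v1/chat" step, as a minimal
sketch rather than part of the patch. The request body mirrors the
chat_request_parses_openai_shape fixture; the localhost:8080 bind
address and an already-pulled qwen3.5:latest model are assumptions, so
substitute whatever the gateway and Ollama actually expose:

  curl -s -X POST http://localhost:8080/v1/chat \
    -H 'Content-Type: application/json' \
    -d '{
          "model": "qwen3.5:latest",
          "messages": [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "Hi"}
          ]
        }'

A 200 response with choices[0].message.content populated and non-zero
usage totals exercises the aibridge → Ollama path; a follow-up
GET /v1/usage should then show the requests counter incremented by one.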