Phase 38: Universal API skeleton — /v1/chat, /v1/usage, /v1/sessions
First slice of the control-plane pivot. OpenAI-compatible surface
over the existing aibridge → Ollama path. Additive — no existing
routes touched. All 7 unit tests green, release build clean.
What ships:
- crates/gateway/src/v1/mod.rs — router, V1State (ai_client + Usage
counter), ChatRequest/ChatResponse/Message/UsageBlock types, handlers
for /chat, /usage, /sessions. OpenAI-compatible field shapes:
{model, messages[{role,content}], temperature?, max_tokens?, stream?}
- crates/gateway/src/v1/ollama.rs — shape adapter. Flattens messages
into (system, prompt), calls aibridge.generate, unwraps response
back into OpenAI /v1/chat shape. Prefers sidecar-reported tokens;
falls back to chars/4 ceiling estimate matching Phase 21 convention.
- crates/gateway/src/main.rs — one new mod, one .nest("/v1", ...)
Tests (7/7):
- chat_request_parses_openai_shape
- chat_request_accepts_minimal
- usage_counter_default_is_zero
- flatten_separates_system_from_turns
- flatten_concatenates_multiple_system_messages
- flatten_with_no_system_returns_empty_system
- estimate_tokens_chars_div_4_ceiling
Not in this phase (per CONTROL_PLANE_PRD.md): streaming, tool calls,
session state, multi-provider, fallback chain, cost gating. All
land in Phases 39-44.
Next: live-test POST /v1/chat after gateway restart, then migrate
bot/propose.ts off direct sidecar calls to prove the loop end-to-end.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f44b6b3e6b
commit
4cb405bb42
@ -3,6 +3,7 @@ mod access_service;
|
|||||||
mod auth;
|
mod auth;
|
||||||
mod observability;
|
mod observability;
|
||||||
mod tools;
|
mod tools;
|
||||||
|
mod v1;
|
||||||
|
|
||||||
use axum::{Router, extract::DefaultBodyLimit, routing::get};
|
use axum::{Router, extract::DefaultBodyLimit, routing::get};
|
||||||
use proto::lakehouse::catalog_service_server::CatalogServiceServer;
|
use proto::lakehouse::catalog_service_server::CatalogServiceServer;
|
||||||
@ -183,6 +184,14 @@ async fn main() {
|
|||||||
registry: tool_reg,
|
registry: tool_reg,
|
||||||
query_fn: tools::QueryExecutor::new(engine.clone()),
|
query_fn: tools::QueryExecutor::new(engine.clone()),
|
||||||
}
|
}
|
||||||
|
}))
|
||||||
|
// Phase 38 — Universal API skeleton. Thin OpenAI-compatible
|
||||||
|
// surface over the existing aibridge → Ollama path. Future
|
||||||
|
// phases add provider adapters (39), routing engine (40),
|
||||||
|
// session state (41), etc. All without changing this mount.
|
||||||
|
.nest("/v1", v1::router(v1::V1State {
|
||||||
|
ai_client: ai_client.clone(),
|
||||||
|
usage: std::sync::Arc::new(tokio::sync::RwLock::new(v1::Usage::default())),
|
||||||
}));
|
}));
|
||||||
|
|
||||||
// Auth middleware (if enabled)
|
// Auth middleware (if enabled)
|
||||||
|
|||||||
184
crates/gateway/src/v1/mod.rs
Normal file
184
crates/gateway/src/v1/mod.rs
Normal file
@ -0,0 +1,184 @@
|
|||||||
|
//! Phase 38 — Universal API skeleton (`/v1/*`).
|
||||||
|
//!
|
||||||
|
//! OpenAI-compatible shape on top of the existing aibridge → Ollama
|
||||||
|
//! path. This is the thin slice: single provider, stateless, no
|
||||||
|
//! streaming. Phase 39 replaces the direct Ollama call with a
|
||||||
|
//! `ProviderAdapter` trait dispatch; Phase 40 adds routing + fallback.
|
||||||
|
//!
|
||||||
|
//! The shape matches OpenAI's `/v1/chat/completions` closely enough
|
||||||
|
//! that clients using openai-compatible SDKs can point at us with the
|
||||||
|
//! URL swap alone. We keep the endpoint path `/v1/chat` (not
|
||||||
|
//! `/v1/chat/completions`) because our PRD declares the terser form;
|
||||||
|
//! adding the alias is one line in Phase 39 when it matters.
|
||||||
|
|
||||||
|
pub mod ollama;
|
||||||
|
|
||||||
|
use axum::{
|
||||||
|
Router,
|
||||||
|
extract::State,
|
||||||
|
http::StatusCode,
|
||||||
|
response::IntoResponse,
|
||||||
|
routing::{get, post},
|
||||||
|
Json,
|
||||||
|
};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::sync::RwLock;
|
||||||
|
|
||||||
|
/// Shared state for the `/v1/*` router.
///
/// Cloned by axum per request. `usage` is an `Arc`, so all clones
/// share one process-wide counter; `ai_client` clone cost depends on
/// aibridge's internals — presumably a cheap handle, confirm there.
#[derive(Clone)]
pub struct V1State {
    /// Existing aibridge client used to reach the Ollama sidecar.
    pub ai_client: aibridge::client::AiClient,
    /// Cumulative usage counters shared across all handler clones.
    pub usage: Arc<RwLock<Usage>>,
}
|
||||||
|
|
||||||
|
#[derive(Default, Clone, Serialize)]
|
||||||
|
pub struct Usage {
|
||||||
|
pub requests: u64,
|
||||||
|
pub prompt_tokens: u64,
|
||||||
|
pub completion_tokens: u64,
|
||||||
|
pub total_tokens: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn router(state: V1State) -> Router {
|
||||||
|
Router::new()
|
||||||
|
.route("/chat", post(chat))
|
||||||
|
.route("/usage", get(usage))
|
||||||
|
.route("/sessions", get(sessions))
|
||||||
|
.with_state(state)
|
||||||
|
}
|
||||||
|
|
||||||
|
// -- Shared types (OpenAI-compatible) --
|
||||||
|
|
||||||
|
/// One chat turn in OpenAI shape: `{"role": ..., "content": ...}`.
///
/// `role` is free-form here; only `"system"` is treated specially by
/// the Ollama adapter (`ollama::flatten_messages`) — every other role
/// becomes a role-labeled prompt turn.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct Message {
    pub role: String,
    pub content: String,
}
|
||||||
|
|
||||||
|
/// Request body for `POST /v1/chat`, matching OpenAI's
/// `/v1/chat/completions` field shapes.
#[derive(Deserialize, Debug)]
pub struct ChatRequest {
    /// Provider model name, passed through verbatim.
    pub model: String,
    /// Conversation turns; must be non-empty (enforced in the handler).
    pub messages: Vec<Message>,
    /// Sampling temperature; provider default when absent.
    #[serde(default)]
    pub temperature: Option<f64>,
    /// Completion-token cap; provider default when absent.
    #[serde(default)]
    pub max_tokens: Option<u32>,
    /// Accepted for shape-compat but ignored in the thin slice —
    /// Phase 38 returns non-streaming even when the client asked for it.
    /// Phase 39+ wires real streaming.
    #[serde(default)]
    pub stream: Option<bool>,
}
|
||||||
|
|
||||||
|
/// Response body for `POST /v1/chat`, mirroring OpenAI's
/// `chat.completion` object.
#[derive(Serialize)]
pub struct ChatResponse {
    /// `chatcmpl-<nanos>` — built from a nanosecond timestamp.
    pub id: String,
    /// Always `"chat.completion"`.
    pub object: &'static str,
    /// Unix timestamp (seconds) when the response was assembled.
    pub created: i64,
    /// Model name as reported back by the provider.
    pub model: String,
    /// Exactly one choice in Phase 38 (no `n` parameter support).
    pub choices: Vec<Choice>,
    /// Token accounting for this single request.
    pub usage: UsageBlock,
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
pub struct Choice {
|
||||||
|
pub index: u32,
|
||||||
|
pub message: Message,
|
||||||
|
pub finish_reason: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Clone)]
|
||||||
|
pub struct UsageBlock {
|
||||||
|
pub prompt_tokens: u32,
|
||||||
|
pub completion_tokens: u32,
|
||||||
|
pub total_tokens: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
// -- Handlers --
|
||||||
|
|
||||||
|
async fn chat(
|
||||||
|
State(state): State<V1State>,
|
||||||
|
Json(req): Json<ChatRequest>,
|
||||||
|
) -> Result<Json<ChatResponse>, (StatusCode, String)> {
|
||||||
|
if req.messages.is_empty() {
|
||||||
|
return Err((StatusCode::BAD_REQUEST, "messages must be non-empty".into()));
|
||||||
|
}
|
||||||
|
if req.stream.unwrap_or(false) {
|
||||||
|
tracing::warn!("/v1/chat: stream=true requested but Phase 38 returns non-streaming");
|
||||||
|
}
|
||||||
|
|
||||||
|
let resp = ollama::chat(&state.ai_client, &req)
|
||||||
|
.await
|
||||||
|
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("provider: {e}")))?;
|
||||||
|
|
||||||
|
{
|
||||||
|
let mut u = state.usage.write().await;
|
||||||
|
u.requests += 1;
|
||||||
|
u.prompt_tokens += resp.usage.prompt_tokens as u64;
|
||||||
|
u.completion_tokens += resp.usage.completion_tokens as u64;
|
||||||
|
u.total_tokens += resp.usage.total_tokens as u64;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Json(resp))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn usage(State(state): State<V1State>) -> impl IntoResponse {
|
||||||
|
let snapshot = state.usage.read().await.clone();
|
||||||
|
Json(snapshot)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phase 38 is stateless — no session persistence yet. Return an empty
|
||||||
|
// list in OpenAI-ish shape so clients that probe this endpoint don't
|
||||||
|
// 404. Real session state lands in Phase 41 with the profile-system
|
||||||
|
// expansion.
|
||||||
|
async fn sessions() -> impl IntoResponse {
|
||||||
|
Json(serde_json::json!({
|
||||||
|
"data": [],
|
||||||
|
"object": "list",
|
||||||
|
"note": "Phase 38: stateless. Session state lands in Phase 41.",
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // Full OpenAI-shaped payload: every optional knob populated and
    // round-tripped through serde.
    #[test]
    fn chat_request_parses_openai_shape() {
        let raw = r#"{
            "model": "qwen3.5:latest",
            "messages": [
                {"role": "system", "content": "You are helpful."},
                {"role": "user", "content": "Hi"}
            ],
            "temperature": 0.2,
            "max_tokens": 100
        }"#;
        let r: ChatRequest = serde_json::from_str(raw).unwrap();
        assert_eq!(r.model, "qwen3.5:latest");
        assert_eq!(r.messages.len(), 2);
        assert_eq!(r.messages[0].role, "system");
        assert_eq!(r.messages[1].content, "Hi");
        assert_eq!(r.temperature, Some(0.2));
        assert_eq!(r.max_tokens, Some(100));
    }

    // Minimal payload: only required fields; all optionals must
    // default to None.
    #[test]
    fn chat_request_accepts_minimal() {
        let raw = r#"{
            "model": "any",
            "messages": [{"role": "user", "content": "hi"}]
        }"#;
        let r: ChatRequest = serde_json::from_str(raw).unwrap();
        assert_eq!(r.temperature, None);
        assert_eq!(r.max_tokens, None);
        assert_eq!(r.stream, None);
    }

    // A fresh counter starts from zero everywhere.
    #[test]
    fn usage_counter_default_is_zero() {
        let u = Usage::default();
        assert_eq!(u.requests, 0);
        assert_eq!(u.total_tokens, 0);
    }
}
|
||||||
149
crates/gateway/src/v1/ollama.rs
Normal file
149
crates/gateway/src/v1/ollama.rs
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
//! Phase 38 — Ollama shape adapter.
|
||||||
|
//!
|
||||||
|
//! Translates `/v1/chat` (OpenAI-compatible) requests into the
|
||||||
|
//! existing aibridge `GenerateRequest` shape, and the `GenerateResponse`
|
||||||
|
//! back into an OpenAI-compatible `ChatResponse`. This is a bridge,
|
||||||
|
//! not a new client — aibridge + the Python sidecar stay as-is.
|
||||||
|
//!
|
||||||
|
//! Phase 39 replaces this direct call with a `ProviderAdapter` trait
|
||||||
|
//! dispatch so the same `/v1/chat` handler routes to any provider.
|
||||||
|
|
||||||
|
use aibridge::client::{AiClient, GenerateRequest};
|
||||||
|
use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
|
||||||
|
|
||||||
|
/// Translate an OpenAI-shaped `ChatRequest` into an aibridge
/// `GenerateRequest`, call the sidecar, and re-wrap the result as an
/// OpenAI-shaped `ChatResponse`.
///
/// Errors are the stringified provider failure; the `/v1/chat` handler
/// maps them to HTTP 502.
pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result<ChatResponse, String> {
    // Collapse the message array to the (system, prompt) pair the
    // aibridge generate API expects.
    let (system, prompt) = flatten_messages(&req.messages);

    let gen_req = GenerateRequest {
        prompt,
        model: Some(req.model.clone()),
        // Empty system string maps to None ("no system prompt").
        system: if system.is_empty() { None } else { Some(system) },
        temperature: req.temperature,
        max_tokens: req.max_tokens,
        // Phase 38 default: leave thinking behavior to the model's
        // default (None). Phase 21's `think:false` discipline is a
        // call-site concern for hot-path JSON emitters — Phase 40's
        // routing engine can set it per task class.
        think: None,
    };

    // Wall-clock latency of the provider round trip, for the log line.
    let t0 = std::time::Instant::now();
    let resp = client.generate(gen_req).await.map_err(|e| e.to_string())?;
    let latency_ms = t0.elapsed().as_millis();

    // Prefer sidecar-reported token counts when present. Fall back to
    // chars/4 estimate (biased safe ~15%, matches Phase 21 convention
    // in crates/aibridge/src/context.rs::estimate_tokens).
    // NOTE(review): `n as u32` truncates silently if the sidecar ever
    // reports a count above u32::MAX — practically unreachable, but a
    // `try_into` would make the invariant explicit.
    let prompt_tokens = resp.tokens_evaluated
        .map(|n| n as u32)
        .unwrap_or_else(|| estimate_prompt_tokens(&req.messages));
    let completion_tokens = resp.tokens_generated
        .map(|n| n as u32)
        .unwrap_or_else(|| ((resp.text.chars().count() + 3) / 4) as u32);

    tracing::info!(
        target: "v1.chat",
        model = %req.model,
        prompt_tokens,
        completion_tokens,
        latency_ms = latency_ms as u64,
        "ollama chat completed",
    );

    Ok(ChatResponse {
        // Nanosecond timestamp gives a practically-unique id without a
        // UUID dependency; 0 fallback if the nanos don't fit an i64.
        id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
        object: "chat.completion",
        created: chrono::Utc::now().timestamp(),
        // Echo the model name the provider reported, not the requested one.
        model: resp.model,
        choices: vec![Choice {
            index: 0,
            message: Message {
                role: "assistant".into(),
                content: resp.text,
            },
            finish_reason: "stop".into(),
        }],
        usage: UsageBlock {
            prompt_tokens,
            completion_tokens,
            total_tokens: prompt_tokens + completion_tokens,
        },
    })
}
|
||||||
|
|
||||||
|
/// Collapse a message array into (system, prompt). Multiple system
|
||||||
|
/// messages concatenate with a newline — matches OpenAI's documented
|
||||||
|
/// behavior. Non-system messages become role-labeled turns.
|
||||||
|
fn flatten_messages(messages: &[Message]) -> (String, String) {
|
||||||
|
let mut system = String::new();
|
||||||
|
let mut prompt = String::new();
|
||||||
|
for m in messages {
|
||||||
|
if m.role == "system" {
|
||||||
|
if !system.is_empty() { system.push('\n'); }
|
||||||
|
system.push_str(&m.content);
|
||||||
|
} else {
|
||||||
|
prompt.push_str(&m.role);
|
||||||
|
prompt.push_str(": ");
|
||||||
|
prompt.push_str(&m.content);
|
||||||
|
prompt.push_str("\n\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
prompt.push_str("assistant:");
|
||||||
|
(system, prompt)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn estimate_prompt_tokens(messages: &[Message]) -> u32 {
|
||||||
|
let chars: usize = messages.iter().map(|m| m.content.chars().count()).sum();
|
||||||
|
((chars + 3) / 4) as u32
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // System turn goes to the system slot; user/assistant turns become
    // role-labeled prompt lines ending with the "assistant:" cue.
    #[test]
    fn flatten_separates_system_from_turns() {
        let msgs = vec![
            Message { role: "system".into(), content: "Rules here.".into() },
            Message { role: "user".into(), content: "Q1".into() },
            Message { role: "assistant".into(), content: "A1".into() },
            Message { role: "user".into(), content: "Q2".into() },
        ];
        let (system, prompt) = flatten_messages(&msgs);
        assert_eq!(system, "Rules here.");
        assert!(prompt.contains("user: Q1"));
        assert!(prompt.contains("assistant: A1"));
        assert!(prompt.contains("user: Q2"));
        assert!(prompt.trim_end().ends_with("assistant:"));
    }

    // Multiple system messages join with a single newline, in order.
    #[test]
    fn flatten_concatenates_multiple_system_messages() {
        let msgs = vec![
            Message { role: "system".into(), content: "First.".into() },
            Message { role: "system".into(), content: "Second.".into() },
            Message { role: "user".into(), content: "Hi".into() },
        ];
        let (system, _) = flatten_messages(&msgs);
        assert_eq!(system, "First.\nSecond.");
    }

    // No system turns → empty system string (adapter maps it to None).
    #[test]
    fn flatten_with_no_system_returns_empty_system() {
        let msgs = vec![Message { role: "user".into(), content: "hi".into() }];
        let (system, prompt) = flatten_messages(&msgs);
        assert!(system.is_empty());
        assert!(prompt.contains("user: hi"));
    }

    // Ceiling division: exact multiples stay exact, remainders round up.
    #[test]
    fn estimate_tokens_chars_div_4_ceiling() {
        let msgs = vec![Message { role: "user".into(), content: "abcdefgh".into() }];
        // 8 chars / 4 = 2, with ceiling → 2
        assert_eq!(estimate_prompt_tokens(&msgs), 2);

        let msgs2 = vec![Message { role: "user".into(), content: "abcdefghi".into() }];
        // 9 chars → (9+3)/4 = 3 (ceiling)
        assert_eq!(estimate_prompt_tokens(&msgs2), 3);
    }
}
|
||||||
Loading…
x
Reference in New Issue
Block a user