Phase 38: Universal API skeleton — /v1/chat, /v1/usage, /v1/sessions
First slice of the control-plane pivot. OpenAI-compatible surface
over the existing aibridge → Ollama path. Additive — no existing
routes touched. All 7 unit tests green, release build clean.
What ships:
- crates/gateway/src/v1/mod.rs — router, V1State (ai_client + Usage
counter), ChatRequest/ChatResponse/Message/UsageBlock types, handlers
for /chat, /usage, /sessions. OpenAI-compatible field shapes:
{model, messages[{role,content}], temperature?, max_tokens?, stream?}
- crates/gateway/src/v1/ollama.rs — shape adapter. Flattens messages
into (system, prompt), calls aibridge.generate, unwraps response
back into OpenAI /v1/chat shape. Prefers sidecar-reported tokens;
falls back to chars/4 ceiling estimate matching Phase 21 convention.
- crates/gateway/src/main.rs — one new mod, one .nest("/v1", ...)
Tests (7/7):
- chat_request_parses_openai_shape
- chat_request_accepts_minimal
- usage_counter_default_is_zero
- flatten_separates_system_from_turns
- flatten_concatenates_multiple_system_messages
- flatten_with_no_system_returns_empty_system
- estimate_tokens_chars_div_4_ceiling
Not in this phase (per CONTROL_PLANE_PRD.md): streaming, tool calls,
session state, multi-provider, fallback chain, cost gating. All
land in Phases 39-44.
Next: live-test POST /v1/chat after gateway restart, then migrate
bot/propose.ts off direct sidecar calls to prove the loop end-to-end.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f44b6b3e6b
commit
4cb405bb42
@ -3,6 +3,7 @@ mod access_service;
|
||||
mod auth;
|
||||
mod observability;
|
||||
mod tools;
|
||||
mod v1;
|
||||
|
||||
use axum::{Router, extract::DefaultBodyLimit, routing::get};
|
||||
use proto::lakehouse::catalog_service_server::CatalogServiceServer;
|
||||
@ -183,6 +184,14 @@ async fn main() {
|
||||
registry: tool_reg,
|
||||
query_fn: tools::QueryExecutor::new(engine.clone()),
|
||||
}
|
||||
}))
|
||||
// Phase 38 — Universal API skeleton. Thin OpenAI-compatible
|
||||
// surface over the existing aibridge → Ollama path. Future
|
||||
// phases add provider adapters (39), routing engine (40),
|
||||
// session state (41), etc. All without changing this mount.
|
||||
.nest("/v1", v1::router(v1::V1State {
|
||||
ai_client: ai_client.clone(),
|
||||
usage: std::sync::Arc::new(tokio::sync::RwLock::new(v1::Usage::default())),
|
||||
}));
|
||||
|
||||
// Auth middleware (if enabled)
|
||||
|
||||
184
crates/gateway/src/v1/mod.rs
Normal file
184
crates/gateway/src/v1/mod.rs
Normal file
@ -0,0 +1,184 @@
|
||||
//! Phase 38 — Universal API skeleton (`/v1/*`).
|
||||
//!
|
||||
//! OpenAI-compatible shape on top of the existing aibridge → Ollama
|
||||
//! path. This is the thin slice: single provider, stateless, no
|
||||
//! streaming. Phase 39 replaces the direct Ollama call with a
|
||||
//! `ProviderAdapter` trait dispatch; Phase 40 adds routing + fallback.
|
||||
//!
|
||||
//! The shape matches OpenAI's `/v1/chat/completions` closely enough
|
||||
//! that clients using openai-compatible SDKs can point at us with the
|
||||
//! URL swap alone. We keep the endpoint path `/v1/chat` (not
|
||||
//! `/v1/chat/completions`) because our PRD declares the terser form;
|
||||
//! adding the alias is one line in Phase 39 when it matters.
|
||||
|
||||
pub mod ollama;
|
||||
|
||||
use axum::{
|
||||
Router,
|
||||
extract::State,
|
||||
http::StatusCode,
|
||||
response::IntoResponse,
|
||||
routing::{get, post},
|
||||
Json,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
/// Shared state for the `/v1` router.
///
/// Cloned per-request by axum; `ai_client` is the existing aibridge
/// client (Ollama path) and `usage` is the process-wide counter that
/// `/chat` writes and `/usage` reads.
#[derive(Clone)]
pub struct V1State {
    /// Client for the existing aibridge → Ollama path.
    pub ai_client: aibridge::client::AiClient,
    /// Counters updated by the `/chat` handler, snapshotted by `/usage`.
    pub usage: Arc<RwLock<Usage>>,
}
|
||||
|
||||
/// Process-wide usage counters, aggregated across all successful
/// `/v1/chat` calls (the handler only bumps these after the provider
/// call succeeds). Serialized as-is by the `/v1/usage` endpoint.
#[derive(Default, Clone, Serialize)]
pub struct Usage {
    /// Number of chat requests that completed successfully.
    pub requests: u64,
    /// Cumulative prompt-side tokens (sidecar-reported or estimated).
    pub prompt_tokens: u64,
    /// Cumulative completion-side tokens.
    pub completion_tokens: u64,
    /// Cumulative prompt + completion tokens.
    pub total_tokens: u64,
}
|
||||
|
||||
pub fn router(state: V1State) -> Router {
|
||||
Router::new()
|
||||
.route("/chat", post(chat))
|
||||
.route("/usage", get(usage))
|
||||
.route("/sessions", get(sessions))
|
||||
.with_state(state)
|
||||
}
|
||||
|
||||
// -- Shared types (OpenAI-compatible) --
|
||||
|
||||
/// A single chat turn in OpenAI shape.
///
/// `role` is conventionally "system" | "user" | "assistant"; any other
/// string is accepted and passed through as a role-labeled turn by the
/// flattener in `ollama::flatten_messages`.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct Message {
    pub role: String,
    pub content: String,
}
|
||||
|
||||
/// Incoming body for `POST /v1/chat`, matching OpenAI's
/// `/v1/chat/completions` request shape closely enough for
/// OpenAI-compatible SDKs.
#[derive(Deserialize, Debug)]
pub struct ChatRequest {
    /// Target model name; forwarded to the provider verbatim.
    pub model: String,
    /// Ordered conversation turns; must be non-empty (the handler
    /// rejects an empty array with 400).
    pub messages: Vec<Message>,
    /// Sampling temperature; `None` leaves the provider default.
    #[serde(default)]
    pub temperature: Option<f64>,
    /// Completion-length cap; `None` leaves the provider default.
    #[serde(default)]
    pub max_tokens: Option<u32>,
    /// Accepted for shape-compat but ignored in the thin slice —
    /// Phase 38 returns non-streaming even when the client asked for it.
    /// Phase 39+ wires real streaming.
    #[serde(default)]
    pub stream: Option<bool>,
}
|
||||
|
||||
/// Outgoing body for `POST /v1/chat` — OpenAI `chat.completion`
/// object shape.
#[derive(Serialize)]
pub struct ChatResponse {
    /// Synthetic id (`chatcmpl-<nanos>`, built in `ollama::chat`).
    pub id: String,
    /// Always `"chat.completion"`.
    pub object: &'static str,
    /// Unix seconds at response-build time.
    pub created: i64,
    /// Model the provider reports it actually used.
    pub model: String,
    /// Phase 38 always returns exactly one choice.
    pub choices: Vec<Choice>,
    /// Per-response token accounting.
    pub usage: UsageBlock,
}
|
||||
|
||||
/// One completion choice (OpenAI shape). This slice emits a single
/// choice with index 0.
#[derive(Serialize)]
pub struct Choice {
    /// Position within `choices`; always 0 here.
    pub index: u32,
    /// The assistant's reply turn.
    pub message: Message,
    /// Always `"stop"` in Phase 38 — length/tool cut-offs are not
    /// distinguished yet.
    pub finish_reason: String,
}
|
||||
|
||||
/// Per-response token accounting, OpenAI `usage` block shape.
/// Counts are sidecar-reported when available, otherwise chars/4
/// ceiling estimates (see `ollama::chat`).
#[derive(Serialize, Deserialize, Clone)]
pub struct UsageBlock {
    pub prompt_tokens: u32,
    pub completion_tokens: u32,
    /// prompt + completion.
    pub total_tokens: u32,
}
|
||||
|
||||
// -- Handlers --
|
||||
|
||||
async fn chat(
|
||||
State(state): State<V1State>,
|
||||
Json(req): Json<ChatRequest>,
|
||||
) -> Result<Json<ChatResponse>, (StatusCode, String)> {
|
||||
if req.messages.is_empty() {
|
||||
return Err((StatusCode::BAD_REQUEST, "messages must be non-empty".into()));
|
||||
}
|
||||
if req.stream.unwrap_or(false) {
|
||||
tracing::warn!("/v1/chat: stream=true requested but Phase 38 returns non-streaming");
|
||||
}
|
||||
|
||||
let resp = ollama::chat(&state.ai_client, &req)
|
||||
.await
|
||||
.map_err(|e| (StatusCode::BAD_GATEWAY, format!("provider: {e}")))?;
|
||||
|
||||
{
|
||||
let mut u = state.usage.write().await;
|
||||
u.requests += 1;
|
||||
u.prompt_tokens += resp.usage.prompt_tokens as u64;
|
||||
u.completion_tokens += resp.usage.completion_tokens as u64;
|
||||
u.total_tokens += resp.usage.total_tokens as u64;
|
||||
}
|
||||
|
||||
Ok(Json(resp))
|
||||
}
|
||||
|
||||
async fn usage(State(state): State<V1State>) -> impl IntoResponse {
|
||||
let snapshot = state.usage.read().await.clone();
|
||||
Json(snapshot)
|
||||
}
|
||||
|
||||
// Phase 38 is stateless — no session persistence yet. Return an empty
|
||||
// list in OpenAI-ish shape so clients that probe this endpoint don't
|
||||
// 404. Real session state lands in Phase 41 with the profile-system
|
||||
// expansion.
|
||||
async fn sessions() -> impl IntoResponse {
|
||||
Json(serde_json::json!({
|
||||
"data": [],
|
||||
"object": "list",
|
||||
"note": "Phase 38: stateless. Session state lands in Phase 41.",
|
||||
}))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // A fully-populated OpenAI-shaped payload deserializes with every
    // field landing where expected.
    #[test]
    fn chat_request_parses_openai_shape() {
        let raw = r#"{
            "model": "qwen3.5:latest",
            "messages": [
                {"role": "system", "content": "You are helpful."},
                {"role": "user", "content": "Hi"}
            ],
            "temperature": 0.2,
            "max_tokens": 100
        }"#;
        let r: ChatRequest = serde_json::from_str(raw).unwrap();
        assert_eq!(r.model, "qwen3.5:latest");
        assert_eq!(r.messages.len(), 2);
        assert_eq!(r.messages[0].role, "system");
        assert_eq!(r.messages[1].content, "Hi");
        assert_eq!(r.temperature, Some(0.2));
        assert_eq!(r.max_tokens, Some(100));
    }

    // Optional fields (`temperature`, `max_tokens`, `stream`) all
    // default to None when absent, via #[serde(default)].
    #[test]
    fn chat_request_accepts_minimal() {
        let raw = r#"{
            "model": "any",
            "messages": [{"role": "user", "content": "hi"}]
        }"#;
        let r: ChatRequest = serde_json::from_str(raw).unwrap();
        assert_eq!(r.temperature, None);
        assert_eq!(r.max_tokens, None);
        assert_eq!(r.stream, None);
    }

    // `Usage::default()` starts every counter at zero, so the first
    // `/v1/usage` read before any chat reports all-zeros.
    #[test]
    fn usage_counter_default_is_zero() {
        let u = Usage::default();
        assert_eq!(u.requests, 0);
        assert_eq!(u.total_tokens, 0);
    }
}
|
||||
149
crates/gateway/src/v1/ollama.rs
Normal file
149
crates/gateway/src/v1/ollama.rs
Normal file
@ -0,0 +1,149 @@
|
||||
//! Phase 38 — Ollama shape adapter.
|
||||
//!
|
||||
//! Translates `/v1/chat` (OpenAI-compatible) requests into the
|
||||
//! existing aibridge `GenerateRequest` shape, and the `GenerateResponse`
|
||||
//! back into an OpenAI-compatible `ChatResponse`. This is a bridge,
|
||||
//! not a new client — aibridge + the Python sidecar stay as-is.
|
||||
//!
|
||||
//! Phase 39 replaces this direct call with a `ProviderAdapter` trait
|
||||
//! dispatch so the same `/v1/chat` handler routes to any provider.
|
||||
|
||||
use aibridge::client::{AiClient, GenerateRequest};
|
||||
use super::{ChatRequest, ChatResponse, Choice, Message, UsageBlock};
|
||||
|
||||
pub async fn chat(client: &AiClient, req: &ChatRequest) -> Result<ChatResponse, String> {
|
||||
let (system, prompt) = flatten_messages(&req.messages);
|
||||
|
||||
let gen_req = GenerateRequest {
|
||||
prompt,
|
||||
model: Some(req.model.clone()),
|
||||
system: if system.is_empty() { None } else { Some(system) },
|
||||
temperature: req.temperature,
|
||||
max_tokens: req.max_tokens,
|
||||
// Phase 38 default: leave thinking behavior to the model's
|
||||
// default (None). Phase 21's `think:false` discipline is a
|
||||
// call-site concern for hot-path JSON emitters — Phase 40's
|
||||
// routing engine can set it per task class.
|
||||
think: None,
|
||||
};
|
||||
|
||||
let t0 = std::time::Instant::now();
|
||||
let resp = client.generate(gen_req).await.map_err(|e| e.to_string())?;
|
||||
let latency_ms = t0.elapsed().as_millis();
|
||||
|
||||
// Prefer sidecar-reported token counts when present. Fall back to
|
||||
// chars/4 estimate (biased safe ~15%, matches Phase 21 convention
|
||||
// in crates/aibridge/src/context.rs::estimate_tokens).
|
||||
let prompt_tokens = resp.tokens_evaluated
|
||||
.map(|n| n as u32)
|
||||
.unwrap_or_else(|| estimate_prompt_tokens(&req.messages));
|
||||
let completion_tokens = resp.tokens_generated
|
||||
.map(|n| n as u32)
|
||||
.unwrap_or_else(|| ((resp.text.chars().count() + 3) / 4) as u32);
|
||||
|
||||
tracing::info!(
|
||||
target: "v1.chat",
|
||||
model = %req.model,
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
latency_ms = latency_ms as u64,
|
||||
"ollama chat completed",
|
||||
);
|
||||
|
||||
Ok(ChatResponse {
|
||||
id: format!("chatcmpl-{}", chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)),
|
||||
object: "chat.completion",
|
||||
created: chrono::Utc::now().timestamp(),
|
||||
model: resp.model,
|
||||
choices: vec![Choice {
|
||||
index: 0,
|
||||
message: Message {
|
||||
role: "assistant".into(),
|
||||
content: resp.text,
|
||||
},
|
||||
finish_reason: "stop".into(),
|
||||
}],
|
||||
usage: UsageBlock {
|
||||
prompt_tokens,
|
||||
completion_tokens,
|
||||
total_tokens: prompt_tokens + completion_tokens,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
/// Collapse a message array into (system, prompt). Multiple system
|
||||
/// messages concatenate with a newline — matches OpenAI's documented
|
||||
/// behavior. Non-system messages become role-labeled turns.
|
||||
fn flatten_messages(messages: &[Message]) -> (String, String) {
|
||||
let mut system = String::new();
|
||||
let mut prompt = String::new();
|
||||
for m in messages {
|
||||
if m.role == "system" {
|
||||
if !system.is_empty() { system.push('\n'); }
|
||||
system.push_str(&m.content);
|
||||
} else {
|
||||
prompt.push_str(&m.role);
|
||||
prompt.push_str(": ");
|
||||
prompt.push_str(&m.content);
|
||||
prompt.push_str("\n\n");
|
||||
}
|
||||
}
|
||||
prompt.push_str("assistant:");
|
||||
(system, prompt)
|
||||
}
|
||||
|
||||
fn estimate_prompt_tokens(messages: &[Message]) -> u32 {
|
||||
let chars: usize = messages.iter().map(|m| m.content.chars().count()).sum();
|
||||
((chars + 3) / 4) as u32
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // System content is lifted out; the remaining turns appear in the
    // prompt as "role: content" lines ending with the assistant cue.
    #[test]
    fn flatten_separates_system_from_turns() {
        let msgs = vec![
            Message { role: "system".into(), content: "Rules here.".into() },
            Message { role: "user".into(), content: "Q1".into() },
            Message { role: "assistant".into(), content: "A1".into() },
            Message { role: "user".into(), content: "Q2".into() },
        ];
        let (system, prompt) = flatten_messages(&msgs);
        assert_eq!(system, "Rules here.");
        assert!(prompt.contains("user: Q1"));
        assert!(prompt.contains("assistant: A1"));
        assert!(prompt.contains("user: Q2"));
        assert!(prompt.trim_end().ends_with("assistant:"));
    }

    // Multiple system messages join with a single newline.
    #[test]
    fn flatten_concatenates_multiple_system_messages() {
        let msgs = vec![
            Message { role: "system".into(), content: "First.".into() },
            Message { role: "system".into(), content: "Second.".into() },
            Message { role: "user".into(), content: "Hi".into() },
        ];
        let (system, _) = flatten_messages(&msgs);
        assert_eq!(system, "First.\nSecond.");
    }

    // No system messages → empty system string, turns still flattened.
    #[test]
    fn flatten_with_no_system_returns_empty_system() {
        let msgs = vec![Message { role: "user".into(), content: "hi".into() }];
        let (system, prompt) = flatten_messages(&msgs);
        assert!(system.is_empty());
        assert!(prompt.contains("user: hi"));
    }

    // chars/4 with ceiling: exact multiples stay put, remainders round up.
    #[test]
    fn estimate_tokens_chars_div_4_ceiling() {
        let msgs = vec![Message { role: "user".into(), content: "abcdefgh".into() }];
        // 8 chars / 4 = 2, with ceiling → 2
        assert_eq!(estimate_prompt_tokens(&msgs), 2);

        let msgs2 = vec![Message { role: "user".into(), content: "abcdefghi".into() }];
        // 9 chars → (9+3)/4 = 3 (ceiling)
        assert_eq!(estimate_prompt_tokens(&msgs2), 3);
    }
}
|
||||
Loading…
x
Reference in New Issue
Block a user