Session infrastructure: OpenRouter + tree-split reducer + observer→LLM Team + scrum_applier #11

Merged
profit merged 118 commits from scrum/auto-apply-19814 into main 2026-04-27 15:55:24 +00:00
Showing only changes of commit 7b88fb9269 - Show all commits

View File

@ -3,10 +3,26 @@ use serde::{Deserialize, Serialize};
use std::time::Duration;
/// HTTP client for the Python AI sidecar.
///
/// `generate()` has two transport modes:
/// - When `gateway_url` is None (default), it posts to
/// `${base_url}/generate` (sidecar direct).
/// - When `gateway_url` is `Some(url)`, it posts to
/// `${url}/v1/chat` with `provider="ollama"` so the call appears
/// in `/v1/usage` and Langfuse traces.
///
/// `embed()`, `rerank()`, and admin methods always go direct to the
/// sidecar — no `/v1` equivalent yet, no point round-tripping.
///
/// Phase 44 part 2 (2026-04-27): the gateway URL is wired in by
/// callers that want observability (vectord modules); it's left
/// unset by callers that ARE the gateway internals (avoids self-loops
/// + redundant hops).
#[derive(Clone)]
pub struct AiClient {
client: Client,
base_url: String,
gateway_url: Option<String>,
}
// -- Request/Response types --
@ -86,9 +102,22 @@ impl AiClient {
Self {
client,
base_url: base_url.trim_end_matches('/').to_string(),
gateway_url: None,
}
}
/// Same as `new`, but every `generate()` is routed through
/// `${gateway_url}/v1/chat` (provider=ollama) for observability.
/// Use this for callers OUTSIDE the gateway. Inside the gateway
/// itself, prefer `new()` — calling /v1/chat from /v1/chat works
/// (no infinite loop, ollama_arm doesn't use AiClient) but adds
/// a wasted localhost hop.
pub fn new_with_gateway(base_url: &str, gateway_url: &str) -> Self {
let mut c = Self::new(base_url);
c.gateway_url = Some(gateway_url.trim_end_matches('/').to_string());
c
}
pub async fn health(&self) -> Result<serde_json::Value, String> {
let resp = self.client
.get(format!("{}/health", self.base_url))
@ -114,6 +143,13 @@ impl AiClient {
}
pub async fn generate(&self, req: GenerateRequest) -> Result<GenerateResponse, String> {
if let Some(gw) = self.gateway_url.as_deref() {
return self.generate_via_gateway(gw, req).await;
}
// Direct-sidecar legacy path. Used by gateway internals (so
// ollama_arm can call sidecar without a self-loop) and by
// any consumer that wants raw transport without /v1/usage
// accounting.
let resp = self.client
.post(format!("{}/generate", self.base_url))
.json(&req)
@ -128,6 +164,59 @@ impl AiClient {
resp.json().await.map_err(|e| format!("generate parse error: {e}"))
}
/// Phase 44 part 2: route generate() through the gateway's
/// /v1/chat with provider="ollama" so the call lands in
/// /v1/usage + Langfuse. Translates between the sidecar
/// GenerateRequest/Response shape and the OpenAI-compat
/// chat shape on the wire.
async fn generate_via_gateway(&self, gateway_url: &str, req: GenerateRequest) -> Result<GenerateResponse, String> {
let mut messages = Vec::with_capacity(2);
if let Some(sys) = &req.system {
messages.push(serde_json::json!({"role": "system", "content": sys}));
}
messages.push(serde_json::json!({"role": "user", "content": req.prompt}));
let mut body = serde_json::json!({
"messages": messages,
"provider": "ollama",
});
if let Some(m) = &req.model { body["model"] = serde_json::json!(m); }
if let Some(t) = req.temperature { body["temperature"] = serde_json::json!(t); }
if let Some(mt) = req.max_tokens { body["max_tokens"] = serde_json::json!(mt); }
if let Some(th) = req.think { body["think"] = serde_json::json!(th); }
let resp = self.client
.post(format!("{}/v1/chat", gateway_url))
.json(&body)
.send()
.await
.map_err(|e| format!("/v1/chat request failed: {e}"))?;
if !resp.status().is_success() {
let text = resp.text().await.unwrap_or_default();
return Err(format!("/v1/chat error: {text}"));
}
let parsed: serde_json::Value = resp.json().await
.map_err(|e| format!("/v1/chat parse error: {e}"))?;
let text = parsed
.pointer("/choices/0/message/content")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let model = parsed.get("model")
.and_then(|v| v.as_str())
.unwrap_or_else(|| req.model.as_deref().unwrap_or(""))
.to_string();
let prompt_tokens = parsed.pointer("/usage/prompt_tokens").and_then(|v| v.as_u64());
let completion_tokens = parsed.pointer("/usage/completion_tokens").and_then(|v| v.as_u64());
Ok(GenerateResponse {
text,
model,
tokens_evaluated: prompt_tokens,
tokens_generated: completion_tokens,
})
}
pub async fn rerank(&self, req: RerankRequest) -> Result<RerankResponse, String> {
let resp = self.client
.post(format!("{}/rerank", self.base_url))