From 5d93a715c3146a34f7966601b1898dfa860054b1 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Apr 2026 07:53:18 -0500 Subject: [PATCH] =?UTF-8?q?gateway:=20Phase=2044=20part=203=20=E2=80=94=20?= =?UTF-8?q?split=20AiClient=20so=20vectord=20routes=20through=20/v1/chat?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Builds two AiClient instances at boot: - `ai_client_direct = AiClient::new(sidecar_url)` — direct sidecar transport. Used by V1State (gateway's own /v1/chat ollama_arm needs this — calling /v1/chat from itself would self-loop) and by the legacy /ai proxy. - `ai_client_observable = AiClient::new_with_gateway(sidecar_url, ${gateway_host}:${gateway_port})` — routes generate() through /v1/chat with provider="ollama". Used by: vectord::agent (autotune background loop) vectord::service (the /vectors HTTP surface — RAG, summary, playbook synthesis, etc.) Net result: every LLM call from a vectord module now lands in /v1/usage and Langfuse traces. The autotune agent's hourly cycle becomes observable; /vectors RAG calls show provider+model+latency in the usage report. Phase 44 PRD's gate ("/v1/usage accounts for every LLM call in the system within a 1-minute window") is now satisfied for the gateway-hosted services. Cost: one localhost HTTP hop per vectord-originated LLM call. At ~1-3ms RTT for in-process loopback, negligible against the LLM call's own 30-90s wall-clock. Phase 44 part 4 (deferred): - Standalone consumers that build their own AiClient (test harnesses, bot/propose, etc.) — the TS side already migrated in part 1, and the regression guard at scripts/check_phase44_callers.sh catches new direct callers. Rust standalone harnesses (if any surface) follow the same pattern: construct via new_with_gateway to opt into observability. - Direct sidecar callers in standalone tools (scripts/serve_lab.py is one) — Python-side; out of Rust scope. 
Verified: cargo build --release -p gateway compiles systemctl restart lakehouse active /v1/chat sanity PONG, finish=stop When the autotune agent next cycles or any /vectors RAG endpoint fires, /v1/usage will show the provider=ollama tick — first real-world data should land within the next agent cycle. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/gateway/src/main.rs | 39 ++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/crates/gateway/src/main.rs b/crates/gateway/src/main.rs index acf0e54..eae6adc 100644 --- a/crates/gateway/src/main.rs +++ b/crates/gateway/src/main.rs @@ -95,8 +95,35 @@ async fn main() { tracing::warn!("workspace rebuild: {e}"); } - // AI sidecar client - let ai_client = aibridge::client::AiClient::new(&config.sidecar.url); + // AI sidecar clients — Phase 44 part 3 (2026-04-27). + // + // Two flavors of the same client: + // - `ai_client_direct` posts directly to ${sidecar}/generate. Used + // inside the gateway by V1State + the legacy /ai proxy. These + // call sites are themselves the implementation of /v1/chat + // (or its sidecar shim), so routing them through /v1/chat + // would self-loop. + // - `ai_client_observable` posts via ${gateway}/v1/chat with + // provider="ollama". Used by vectord modules (autotune agent, + // /vectors service) so their LLM calls land in /v1/usage and + // Langfuse traces. Adds one localhost HTTP hop per call (~ms); + // accepted for the observability gain. + // + // The gateway can call its own /v1/chat over localhost during + // boot's transient period because we don't fire any LLM calls + // until the listener is up — the observable client is just + // configured here, not exercised. 
+ let ai_client_direct = aibridge::client::AiClient::new(&config.sidecar.url); + let gateway_self_url = format!("http://{}:{}", config.gateway.host, config.gateway.port); + let ai_client_observable = aibridge::client::AiClient::new_with_gateway( + &config.sidecar.url, + &gateway_self_url, + ); + // Backwards-compat alias for the (many) existing references in this file. + // Defaults to direct so the existing wiring (V1State, /ai proxy) + // keeps its non-self-loop transport. New vectord wiring below + // explicitly uses ai_client_observable. + let ai_client = ai_client_direct.clone(); // Vector service components — built before the router because both the // /vectors service AND ingestd need the agent handle to enqueue triggers. @@ -134,7 +161,9 @@ async fn main() { agent_cfg, vectord::agent::AgentDeps { store: store.clone(), - ai_client: ai_client.clone(), + // Observable: autotune agent's LLM calls go through + // /v1/chat for /v1/usage + Langfuse visibility. + ai_client: ai_client_observable.clone(), catalog: registry.clone(), index_registry: index_reg.clone(), hnsw_store: hnsw.clone(), @@ -189,7 +218,9 @@ async fn main() { })) .nest("/vectors", vectord::service::router(vectord::service::VectorState { store: store.clone(), - ai_client: ai_client.clone(), + // Observable: /vectors service's LLM calls (RAG, summary, + // playbook synthesis, etc.) flow through /v1/chat. + ai_client: ai_client_observable.clone(), job_tracker: vectord::jobs::JobTracker::new(), index_registry: index_reg.clone(), hnsw_store: hnsw,