From 6ed48c1a69e849dcfd73720c0874700397d81838 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Apr 2026 08:07:18 -0500 Subject: [PATCH] gateway+validator: /v1/health reports honest worker count for production MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `fn len(&self) -> usize` (default 0) to the WorkerLookup trait. The InMemoryWorkerLookup overrides with HashMap size; ParquetWorkerLookup constructs an InMemoryWorkerLookup so it inherits the count. /v1/health now reports `workers_count` (exact integer) alongside `workers_loaded` (derived bool: count > 0). The previous placeholder `true` was a known caveat in the prior commit's body — this closes it. Production switchover use case: J swaps workers_500k.parquet → real Chicago contractor data, restarts the gateway, and verifies the swap with one curl: curl http://localhost:3100/v1/health | jq .workers_count Expected: matches the row count of the new file. A count of 0 means the file is missing / unreadable / had a schema mismatch and the gateway fell back to the empty InMemoryWorkerLookup; any other mismatch means the loaded snapshot differs from the new file. Either way the operator catches the drift before traffic reaches the validators. Verified live (current synthetic data): workers_count: 500000 (matches workers_500k.parquet row count) workers_loaded: true When the Chicago data lands, the same curl is the single source of truth that the new dataset is hot. Removes the restart-and-pray failure mode. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/gateway/src/v1/mod.rs | 25 ++++++++----------------- crates/validator/src/lib.rs | 13 +++++++++++++ 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/crates/gateway/src/v1/mod.rs b/crates/gateway/src/v1/mod.rs index b118792..052e5cc 100644 --- a/crates/gateway/src/v1/mod.rs +++ b/crates/gateway/src/v1/mod.rs @@ -579,22 +579,12 @@ async fn usage(State(state): State) -> impl IntoResponse { /// gates. A monitoring tool should evaluate the booleans + counts /// against its own thresholds. 
async fn health(State(state): State) -> impl IntoResponse { - let workers_loaded = { - // Use a lookup with an obviously-fake id to probe — None - // could mean empty roster OR healthy roster without that id. - // We don't have a count() method on the trait; use a sample - // probe + treat presence of workers as a yes/no signal. - let probe = state.validate_workers.find("__healthcheck_probe__"); - // probe is always None for the synthetic id, so this isn't - // useful. Better: rely on the fact that an empty-fallback - // InMemoryWorkerLookup ALSO returns None — there's no way - // to distinguish "loaded, just doesn't have this id" from - // "empty fallback". We'd need a count() method on WorkerLookup - // to report honestly. For now report the load attempt was - // performed (boot logs are the source of truth on rows count). - let _ = probe; - true - }; + // Honest worker count via WorkerLookup::len. Production switchover + // verification: after swapping workers_500k.parquet → real Chicago + // data and restarting, this number should match the row count of + // the new file. 0 means the file was missing / unreadable / had a + // schema mismatch and the gateway booted with the empty fallback. 
+ let workers_count = state.validate_workers.len(); let providers_configured = serde_json::json!({ "ollama_cloud": state.ollama_cloud_key.is_some(), "openrouter": state.openrouter_key.is_some(), @@ -607,7 +597,8 @@ async fn health(State(state): State) -> impl IntoResponse { let usage_snapshot = state.usage.read().await.clone(); Json(serde_json::json!({ "status": "ok", - "workers_loaded": workers_loaded, + "workers_count": workers_count, + "workers_loaded": workers_count > 0, "providers_configured": providers_configured, "langfuse_configured": langfuse_configured, "usage_total_requests": usage_snapshot.requests, diff --git a/crates/validator/src/lib.rs b/crates/validator/src/lib.rs index 253fd74..e646dd2 100644 --- a/crates/validator/src/lib.rs +++ b/crates/validator/src/lib.rs @@ -130,6 +130,16 @@ pub struct WorkerRecord { /// hold an in-memory snapshot, not perform per-call I/O. pub trait WorkerLookup: Send + Sync { fn find(&self, candidate_id: &str) -> Option; + /// Number of workers in the snapshot. Default 0 for impls that + /// genuinely don't know (e.g. a future SQL-backed lookup that + /// counts on demand). InMemoryWorkerLookup overrides with the + /// HashMap size; ParquetWorkerLookup constructs an + /// InMemoryWorkerLookup so it inherits the override. Used by + /// /v1/health to report data-load status during production + /// switchover (the Chicago dataset replaces synthetic test data; + /// the health endpoint is how operators verify the new file + /// loaded correctly without restart-and-pray). + fn len(&self) -> usize { 0 } } /// HashMap-backed lookup. Used by validator unit tests + as a @@ -165,4 +175,7 @@ impl WorkerLookup for InMemoryWorkerLookup { fn find(&self, candidate_id: &str) -> Option { self.rows.get(candidate_id).cloned() } + fn len(&self) -> usize { + self.rows.len() + } }