/// Eval harness for HNSW tuning. /// /// An EvalSet is a named collection of queries against a specific vector index. /// Each query has a query_text and (optionally pre-computed) ground_truth /// doc_ids — the "correct" top-k results that brute-force cosine returns. /// HNSW trials are scored by how well they recreate that top-k (recall@k). /// /// Storage: `_hnsw_evals/{name}.json` as a single JSON document. Small. use chrono::{DateTime, Utc}; use object_store::ObjectStore; use serde::{Deserialize, Serialize}; use std::collections::HashSet; use std::sync::Arc; use aibridge::client::{AiClient, EmbedRequest}; use storaged::ops; use storaged::registry::BucketRegistry; use crate::index_registry::IndexRegistry; use crate::store::StoredEmbedding; /// A single eval query with optional pre-computed ground truth. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct EvalQuery { pub id: String, pub query_text: String, /// Ordered list of doc_ids that brute-force returns as top-k. /// `None` means the ground truth hasn't been computed yet; `compute_ground_truth` /// fills it in against the current embedding set. #[serde(default)] pub ground_truth: Option>, /// Optional pre-computed query embedding. Filling this avoids re-embedding /// the same query on every trial. #[serde(default)] pub query_embedding: Option>, } /// A named eval set tied to a specific vector index. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct EvalSet { pub name: String, pub index_name: String, pub k: usize, // top-k used for recall calculation pub queries: Vec, pub created_at: DateTime, pub ground_truth_built: bool, } impl EvalSet { pub fn new(name: &str, index_name: &str, k: usize) -> Self { Self { name: name.to_string(), index_name: index_name.to_string(), k, queries: Vec::new(), created_at: Utc::now(), ground_truth_built: false, } } pub fn storage_key(&self) -> String { format!("_hnsw_evals/{}.json", self.name) } pub async fn save(&self, store: &Arc) -> Result<(), String> { let json = serde_json::to_vec_pretty(self).map_err(|e| e.to_string())?; ops::put(store, &self.storage_key(), json.into()).await } pub async fn load(store: &Arc, name: &str) -> Result { let key = format!("_hnsw_evals/{}.json", name); let data = ops::get(store, &key).await?; serde_json::from_slice(&data).map_err(|e| format!("parse eval set: {e}")) } pub async fn list(store: &Arc) -> Result, String> { let keys = ops::list(store, Some("_hnsw_evals/")).await?; Ok(keys .into_iter() .filter(|k| k.ends_with(".json")) .map(|k| { k.trim_start_matches("_hnsw_evals/") .trim_end_matches(".json") .to_string() }) .collect()) } } /// Federation-aware wrapper around EvalSet persistence. Mirrors the /// `TrialJournal` / `PromotionRegistry` pattern: harness files colocate /// with their index's bucket (looked up via `IndexMeta.bucket`), falling /// back to `primary` for indexes the registry has never seen. Legacy /// harnesses predating federation remain discoverable — lookups transparently /// try the resolved bucket first, then `primary` as a fallback. Cross-bucket /// listing dedupes across every registered bucket so `GET /hnsw/evals` /// returns a complete picture. 
#[derive(Clone)]
pub struct HarnessStore {
    buckets: Arc<BucketRegistry>,
    index_registry: IndexRegistry,
}

impl HarnessStore {
    pub fn new(buckets: Arc<BucketRegistry>, index_registry: IndexRegistry) -> Self {
        Self { buckets, index_registry }
    }

    /// Resolve which bucket holds this index's eval artifacts. Indexes the
    /// registry has never heard of fall through to `primary`.
    async fn bucket_for_index(&self, index_name: &str) -> String {
        self.index_registry
            .get(index_name)
            .await
            .map(|m| m.bucket)
            .unwrap_or_else(|| "primary".to_string())
    }

    /// Save to the bucket that owns `eval.index_name`. Writes under the
    /// standard `_hnsw_evals/{name}.json` prefix of the resolved bucket.
    pub async fn save(&self, eval: &EvalSet) -> Result<(), String> {
        let bucket = self.bucket_for_index(&eval.index_name).await;
        let store = self.buckets.get(&bucket)?;
        eval.save(&store).await
    }

    /// Load a harness by name, given the index it belongs to. Tries the
    /// index's bucket first; if the file is absent AND the resolved bucket
    /// isn't `primary`, falls through to `primary` so pre-federation evals
    /// remain reachable without migration.
    pub async fn load_for_index(
        &self,
        index_name: &str,
        harness_name: &str,
    ) -> Result<EvalSet, String> {
        let bucket = self.bucket_for_index(index_name).await;
        let primary_store = self.buckets.get("primary")?;
        let store = self.buckets.get(&bucket)?;
        match EvalSet::load(&store, harness_name).await {
            Ok(e) => Ok(e),
            // Fallback only when the resolved bucket isn't already primary;
            // surface both errors so the operator sees the full search path.
            Err(e) if bucket != "primary" => EvalSet::load(&primary_store, harness_name)
                .await
                .map_err(|primary_err| format!("{bucket}: {e}; primary fallback: {primary_err}")),
            Err(e) => Err(e),
        }
    }

    /// Find a harness by name without knowing which index it belongs to —
    /// used by `GET /hnsw/evals/{name}`. Scans every registered bucket;
    /// first hit wins. Primary is searched first so pre-federation evals
    /// with the same name as a federated one resolve deterministically.
pub async fn get_any(&self, harness_name: &str) -> Result { let bucket_infos = self.buckets.list().await; let mut ordered: Vec = bucket_infos.iter().map(|b| b.name.clone()).collect(); ordered.sort_by_key(|n| if n == "primary" { 0 } else { 1 }); let mut last_err = None; for b in ordered { let store = match self.buckets.get(&b) { Ok(s) => s, Err(e) => { last_err = Some(e); continue; } }; match EvalSet::load(&store, harness_name).await { Ok(e) => return Ok(e), Err(e) => { last_err = Some(e); } } } Err(last_err.unwrap_or_else(|| format!("no buckets registered for eval '{harness_name}'"))) } /// Union of every harness name across every registered bucket. /// Duplicates (same name in multiple buckets — pathological but /// possible after a manual migration) are collapsed. pub async fn list_all(&self) -> Vec { let mut all: HashSet = HashSet::new(); for b in self.buckets.list().await { let store = match self.buckets.get(&b.name) { Ok(s) => s, Err(_) => continue, }; if let Ok(names) = EvalSet::list(&store).await { all.extend(names); } } let mut out: Vec = all.into_iter().collect(); out.sort(); out } } /// Cosine similarity for two same-length f32 slices. fn cosine(a: &[f32], b: &[f32]) -> f32 { let mut dot = 0.0f32; let mut na = 0.0f32; let mut nb = 0.0f32; for i in 0..a.len() { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; } if na == 0.0 || nb == 0.0 { return 0.0; } dot / (na.sqrt() * nb.sqrt()) } /// Brute-force top-k search for a single query against all embeddings. /// This is the ground-truth oracle that HNSW trials must approximate. pub fn brute_force_top_k( query: &[f32], embeddings: &[StoredEmbedding], k: usize, ) -> Vec { let mut scored: Vec<(f32, usize)> = embeddings .iter() .enumerate() .map(|(i, e)| (cosine(query, &e.vector), i)) .collect(); // Partial sort — we only need top-k. 
scored.sort_unstable_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal)); scored .into_iter() .take(k) .map(|(_, i)| embeddings[i].doc_id.clone()) .collect() } /// Recall@k — fraction of ground-truth doc_ids that appear in predicted. pub fn recall_at_k(predicted: &[String], ground_truth: &[String], k: usize) -> f32 { if ground_truth.is_empty() || k == 0 { return 0.0; } // Set-intersection recall@k. Previous implementation counted duplicates // in `predicted` (a corpus with repeated chunks — e.g. cached LLM // responses — returns the same doc_id multiple times via HNSW), which // inflated recall above 1.0 and poisoned promotion decisions. let gt_set: std::collections::HashSet<&String> = ground_truth.iter().take(k).collect(); let pred_set: std::collections::HashSet<&String> = predicted.iter().take(k).collect(); let hits = pred_set.intersection(>_set).count(); hits as f32 / gt_set.len() as f32 } /// Populate query_embedding and ground_truth for every query that lacks them. pub async fn compute_ground_truth( eval: &mut EvalSet, embeddings: &[StoredEmbedding], ai_client: &AiClient, ) -> Result<(), String> { let need_embed: Vec<(usize, String)> = eval .queries .iter() .enumerate() .filter(|(_, q)| q.query_embedding.is_none()) .map(|(i, q)| (i, q.query_text.clone())) .collect(); if !need_embed.is_empty() { // Embed in one batch to keep things simple; for very large eval sets // we'd batch this in chunks of 32. 
let texts: Vec = need_embed.iter().map(|(_, t)| t.clone()).collect(); let resp = ai_client .embed(EmbedRequest { texts, model: None }) .await .map_err(|e| format!("embed queries: {e}"))?; for ((idx, _), vec) in need_embed.iter().zip(resp.embeddings.iter()) { let v: Vec = vec.iter().map(|&x| x as f32).collect(); eval.queries[*idx].query_embedding = Some(v); } } for q in eval.queries.iter_mut() { if q.ground_truth.is_some() { continue; } let emb = q.query_embedding.as_ref().ok_or("missing embedding")?; q.ground_truth = Some(brute_force_top_k(emb, embeddings, eval.k)); } eval.ground_truth_built = true; Ok(()) } /// Auto-generate a synthetic eval set by sampling every Nth chunk's text as /// its own query. Useful for a quick-start eval when the user doesn't have /// real natural-language queries yet. pub fn synthetic_from_chunks( eval_name: &str, index_name: &str, embeddings: &[StoredEmbedding], sample_count: usize, k: usize, ) -> EvalSet { let n = embeddings.len(); let sample_count = sample_count.min(n); let stride = (n / sample_count.max(1)).max(1); let mut queries = Vec::with_capacity(sample_count); for i in 0..sample_count { let idx = (i * stride).min(n - 1); let chunk = &embeddings[idx]; // Use the first ~200 chars of the chunk as the "query" — it should find // itself and nearby chunks as top results. 
let query_text: String = chunk.chunk_text.chars().take(200).collect(); queries.push(EvalQuery { id: format!("syn-{}", i), query_text, ground_truth: None, query_embedding: None, }); } EvalSet { name: eval_name.to_string(), index_name: index_name.to_string(), k, queries, created_at: Utc::now(), ground_truth_built: false, } } #[cfg(test)] mod tests { use super::*; fn s(v: &[&str]) -> Vec { v.iter().map(|x| x.to_string()).collect() } #[test] fn recall_empty_ground_truth_is_zero() { assert_eq!(recall_at_k(&s(&["a", "b"]), &[], 10), 0.0); } #[test] fn recall_k_zero_is_zero() { assert_eq!(recall_at_k(&s(&["a"]), &s(&["a"]), 0), 0.0); } #[test] fn recall_perfect_match_equals_one() { let pred = s(&["a", "b", "c"]); let gt = s(&["a", "b", "c"]); assert!((recall_at_k(&pred, >, 3) - 1.0).abs() < 1e-6); } #[test] fn recall_half_match() { let pred = s(&["a", "b", "x", "y"]); let gt = s(&["a", "b", "c", "d"]); assert!((recall_at_k(&pred, >, 4) - 0.5).abs() < 1e-6); } #[test] fn recall_duplicates_in_predicted_do_not_inflate() { // Regression guard: the previous implementation counted each // duplicate in `predicted` separately, which could push recall // above 1.0 on corpora with repeated chunks (cached responses etc). // Set-intersection semantics keep it bounded in [0, 1]. let pred = s(&["a", "a", "a", "a"]); let gt = s(&["a", "b", "c", "d"]); let r = recall_at_k(&pred, >, 4); assert!(r <= 1.0, "recall {r} must not exceed 1.0"); // One unique match out of four in gt = 0.25. assert!((r - 0.25).abs() < 1e-6); } #[test] fn recall_duplicates_in_ground_truth_handled() { // gt with dupes reduces effective |gt|; matching one hits all. let pred = s(&["x"]); let gt = s(&["x", "x", "x"]); let r = recall_at_k(&pred, >, 3); assert!(r <= 1.0); assert!((r - 1.0).abs() < 1e-6); // |pred ∩ gt| / |gt_set| = 1/1 } #[test] fn recall_respects_k_bound() { // k=2 means only the first 2 of pred and gt count toward the set. 
let pred = s(&["a", "b", "c", "d"]); let gt = s(&["a", "b", "c", "d"]); let r = recall_at_k(&pred, >, 2); assert!((r - 1.0).abs() < 1e-6); } }