//! Eval harness for HNSW tuning.
//!
//! An EvalSet is a named collection of queries against a specific vector index.
//! Each query has a query_text and (optionally pre-computed) ground_truth
//! doc_ids — the "correct" top-k results that brute-force cosine returns.
//! HNSW trials are scored by how well they reproduce that top-k (recall@k).
//!
//! Storage: `_hnsw_evals/{name}.json` as a single JSON document. Small.

use chrono::{DateTime, Utc};
use object_store::ObjectStore;
use serde::{Deserialize, Serialize};
use std::sync::Arc;

use aibridge::client::{AiClient, EmbedRequest};
use storaged::ops;

use crate::store::StoredEmbedding;

/// A single eval query with optional pre-computed ground truth.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalQuery {
    pub id: String,
    pub query_text: String,
    /// Ordered list of doc_ids that brute-force returns as top-k.
    /// `None` means the ground truth hasn't been computed yet; `compute_ground_truth`
    /// fills it in against the current embedding set.
    #[serde(default)]
    pub ground_truth: Option<Vec<String>>,
    /// Optional pre-computed query embedding. Filling this avoids re-embedding
    /// the same query on every trial.
    #[serde(default)]
    pub query_embedding: Option<Vec<f32>>,
}

/// A named eval set tied to a specific vector index.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSet {
    pub name: String,
    pub index_name: String,
    pub k: usize, // top-k used for recall calculation
    pub queries: Vec<EvalQuery>,
    pub created_at: DateTime<Utc>,
    pub ground_truth_built: bool,
}

impl EvalSet {
    pub fn new(name: &str, index_name: &str, k: usize) -> Self {
        Self {
            name: name.to_string(),
            index_name: index_name.to_string(),
            k,
            queries: Vec::new(),
            created_at: Utc::now(),
            ground_truth_built: false,
        }
    }

    pub fn storage_key(&self) -> String {
        format!("_hnsw_evals/{}.json", self.name)
    }

    pub async fn save(&self, store: &Arc<dyn ObjectStore>) -> Result<(), String> {
        let json = serde_json::to_vec_pretty(self).map_err(|e| e.to_string())?;
        ops::put(store, &self.storage_key(), json.into()).await
    }

    pub async fn load(store: &Arc<dyn ObjectStore>, name: &str) -> Result<Self, String> {
        let key = format!("_hnsw_evals/{}.json", name);
        let data = ops::get(store, &key).await?;
        serde_json::from_slice(&data).map_err(|e| format!("parse eval set: {e}"))
    }

    pub async fn list(store: &Arc<dyn ObjectStore>) -> Result<Vec<String>, String> {
        let keys = ops::list(store, Some("_hnsw_evals/")).await?;
        Ok(keys
            .into_iter()
            .filter(|k| k.ends_with(".json"))
            .map(|k| {
                k.trim_start_matches("_hnsw_evals/")
                    .trim_end_matches(".json")
                    .to_string()
            })
            .collect())
    }
}

/// Cosine similarity for two same-length f32 slices.
fn cosine(a: &[f32], b: &[f32]) -> f32 {
    let mut dot = 0.0f32;
    let mut na = 0.0f32;
    let mut nb = 0.0f32;
    for i in 0..a.len() {
        dot += a[i] * b[i];
        na += a[i] * a[i];
        nb += b[i] * b[i];
    }
    if na == 0.0 || nb == 0.0 {
        return 0.0;
    }
    dot / (na.sqrt() * nb.sqrt())
}

/// Brute-force top-k search for a single query against all embeddings.
/// This is the ground-truth oracle that HNSW trials must approximate.
pub fn brute_force_top_k(
    query: &[f32],
    embeddings: &[StoredEmbedding],
    k: usize,
) -> Vec<String> {
    let mut scored: Vec<(f32, usize)> = embeddings
        .iter()
        .enumerate()
        .map(|(i, e)| (cosine(query, &e.vector), i))
        .collect();
    // Full sort by descending score. Cheap at eval scale; a true partial sort
    // (`select_nth_unstable_by`) would only pay off on much larger corpora.
    scored.sort_unstable_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
    scored
        .into_iter()
        .take(k)
        .map(|(_, i)| embeddings[i].doc_id.clone())
        .collect()
}

/// Recall@k — fraction of ground-truth doc_ids that appear in predicted.
pub fn recall_at_k(predicted: &[String], ground_truth: &[String], k: usize) -> f32 {
    if ground_truth.is_empty() || k == 0 {
        return 0.0;
    }
    let gt_set: std::collections::HashSet<&String> = ground_truth.iter().take(k).collect();
    let hits = predicted
        .iter()
        .take(k)
        .filter(|d| gt_set.contains(d))
        .count();
    hits as f32 / gt_set.len() as f32
}
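/// Score one HNSW trial's results for a single query.
///
/// A minimal convenience sketch, assuming `predicted` holds the doc_ids one
/// trial returned for this query; they are scored against the ground truth
/// that `compute_ground_truth` stored. Returns `None` if the ground truth
/// hasn't been built yet.
pub fn score_query(predicted: &[String], query: &EvalQuery, k: usize) -> Option<f32> {
    let truth = query.ground_truth.as_deref()?;
    Some(recall_at_k(predicted, truth, k))
}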
/// Populate query_embedding and ground_truth for every query that lacks them.
pub async fn compute_ground_truth(
    eval: &mut EvalSet,
    embeddings: &[StoredEmbedding],
    ai_client: &AiClient,
) -> Result<(), String> {
    let need_embed: Vec<(usize, String)> = eval
        .queries
        .iter()
        .enumerate()
        .filter(|(_, q)| q.query_embedding.is_none())
        .map(|(i, q)| (i, q.query_text.clone()))
        .collect();

    if !need_embed.is_empty() {
        // Embed in one batch to keep things simple; for very large eval sets
        // we'd batch this in chunks of 32.
        let texts: Vec<String> = need_embed.iter().map(|(_, t)| t.clone()).collect();
        let resp = ai_client
            .embed(EmbedRequest { texts, model: None })
            .await
            .map_err(|e| format!("embed queries: {e}"))?;
        for ((idx, _), vec) in need_embed.iter().zip(resp.embeddings.iter()) {
            let v: Vec<f32> = vec.iter().map(|&x| x as f32).collect();
            eval.queries[*idx].query_embedding = Some(v);
        }
    }

    for q in eval.queries.iter_mut() {
        if q.ground_truth.is_some() {
            continue;
        }
        let emb = q.query_embedding.as_ref().ok_or("missing embedding")?;
        q.ground_truth = Some(brute_force_top_k(emb, embeddings, eval.k));
    }

    eval.ground_truth_built = true;
    Ok(())
}

/// Auto-generate a synthetic eval set by sampling every Nth chunk's text as
/// its own query. Useful for a quick-start eval when the user doesn't have
/// real natural-language queries yet.
pub fn synthetic_from_chunks(
    eval_name: &str,
    index_name: &str,
    embeddings: &[StoredEmbedding],
    sample_count: usize,
    k: usize,
) -> EvalSet {
    let n = embeddings.len();
    let sample_count = sample_count.min(n);
    let stride = (n / sample_count.max(1)).max(1);
    let mut queries = Vec::with_capacity(sample_count);
    for i in 0..sample_count {
        let idx = (i * stride).min(n - 1);
        let chunk = &embeddings[idx];
        // Use the first ~200 chars of the chunk as the "query" — it should find
        // itself and nearby chunks as top results.
        let query_text: String = chunk.chunk_text.chars().take(200).collect();
        queries.push(EvalQuery {
            id: format!("syn-{}", i),
            query_text,
            ground_truth: None,
            query_embedding: None,
        });
    }
    EvalSet {
        name: eval_name.to_string(),
        index_name: index_name.to_string(),
        k,
        queries,
        created_at: Utc::now(),
        ground_truth_built: false,
    }
}
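// Sanity tests for the pure helpers, sketched here as a starting point. They
// only exercise functions that need no store, client, or StoredEmbedding
// fixtures, so a plain `cargo test` runs them as-is.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn cosine_basics() {
        let a = [1.0f32, 0.0, 0.0];
        let b = [0.0f32, 1.0, 0.0];
        // Identical vectors score 1.0; orthogonal vectors score 0.0.
        assert!((cosine(&a, &a) - 1.0).abs() < 1e-6);
        assert!(cosine(&a, &b).abs() < 1e-6);
        // A zero vector is defined to score 0.0 rather than NaN.
        assert_eq!(cosine(&[0.0, 0.0], &[1.0, 1.0]), 0.0);
    }

    #[test]
    fn recall_counts_overlap_within_k() {
        let gt: Vec<String> = ["a", "b", "c", "d"].iter().map(|s| s.to_string()).collect();
        let pred: Vec<String> = ["a", "x", "c", "y"].iter().map(|s| s.to_string()).collect();
        // Two of the four ground-truth ids appear in the predicted top-4.
        assert!((recall_at_k(&pred, &gt, 4) - 0.5).abs() < 1e-6);
        // Degenerate inputs: k = 0 or empty ground truth yield 0.0.
        assert_eq!(recall_at_k(&pred, &gt, 0), 0.0);
        assert_eq!(recall_at_k(&pred, &[], 4), 0.0);
    }
}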