Accumulated since a6f12e2 (Phase 21 Rust port + Phase 27 versioning): - Phase 36: embed_semaphore on VectorState (permits=1) serializes seed embed calls — prevents sidecar socket collisions under concurrent /seed stress load - Phase 31+: run_stress.ts 6-task diverse stress scaffolding; run_e2e_rated.ts + orchestrator.ts tightening - Catalog dedupe cleanup: 16 duplicate manifests removed; canonical candidates.parquet (10.5MB -> 76KB) + placements.parquet (1.2MB -> 11KB) regenerated post-dedupe; fresh manifests for active datasets - vectord: harness EvalSet refinements (+181), agent portfolio rotation + ingest triggers (+158), autotune + rag adjustments - catalogd/storaged/ingestd/mcp-server: misc tightening - docs: Phase 28-36 PRD entries + DECISIONS ADR additions; control-plane pivot banner added to top of docs/PRD.md (pointing at docs/CONTROL_PLANE_PRD.md which lands in next commit) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
388 lines
14 KiB
Rust
388 lines
14 KiB
Rust
/// Eval harness for HNSW tuning.
|
|
///
|
|
/// An EvalSet is a named collection of queries against a specific vector index.
|
|
/// Each query has a query_text and (optionally pre-computed) ground_truth
|
|
/// doc_ids — the "correct" top-k results that brute-force cosine returns.
|
|
/// HNSW trials are scored by how well they recreate that top-k (recall@k).
|
|
///
|
|
/// Storage: `_hnsw_evals/{name}.json` as a single JSON document. Small.
|
|
|
|
use chrono::{DateTime, Utc};
|
|
use object_store::ObjectStore;
|
|
use serde::{Deserialize, Serialize};
|
|
use std::collections::HashSet;
|
|
use std::sync::Arc;
|
|
|
|
use aibridge::client::{AiClient, EmbedRequest};
|
|
use storaged::ops;
|
|
use storaged::registry::BucketRegistry;
|
|
|
|
use crate::index_registry::IndexRegistry;
|
|
use crate::store::StoredEmbedding;
|
|
|
|
/// A single eval query with optional pre-computed ground truth.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalQuery {
    /// Stable identifier for this query within its eval set (e.g. `syn-3`
    /// for synthetic queries).
    pub id: String,
    /// Natural-language (or chunk-derived) query text; embedded on demand
    /// when `query_embedding` is absent.
    pub query_text: String,
    /// Ordered list of doc_ids that brute-force returns as top-k.
    /// `None` means the ground truth hasn't been computed yet; `compute_ground_truth`
    /// fills it in against the current embedding set.
    /// (`serde(default)` keeps older persisted JSON without this field loadable.)
    #[serde(default)]
    pub ground_truth: Option<Vec<String>>,
    /// Optional pre-computed query embedding. Filling this avoids re-embedding
    /// the same query on every trial.
    /// (`serde(default)` keeps older persisted JSON without this field loadable.)
    #[serde(default)]
    pub query_embedding: Option<Vec<f32>>,
}
|
|
|
|
/// A named eval set tied to a specific vector index.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSet {
    /// Unique harness name; doubles as the storage file stem
    /// (`_hnsw_evals/{name}.json`).
    pub name: String,
    /// The vector index this eval set scores trials against.
    pub index_name: String,
    pub k: usize, // top-k used for recall calculation
    /// The queries (with optional cached embeddings / ground truth).
    pub queries: Vec<EvalQuery>,
    pub created_at: DateTime<Utc>,
    /// True once `compute_ground_truth` has filled in every query's
    /// ground truth against the current embedding set.
    pub ground_truth_built: bool,
}
|
|
|
|
impl EvalSet {
|
|
pub fn new(name: &str, index_name: &str, k: usize) -> Self {
|
|
Self {
|
|
name: name.to_string(),
|
|
index_name: index_name.to_string(),
|
|
k,
|
|
queries: Vec::new(),
|
|
created_at: Utc::now(),
|
|
ground_truth_built: false,
|
|
}
|
|
}
|
|
|
|
pub fn storage_key(&self) -> String {
|
|
format!("_hnsw_evals/{}.json", self.name)
|
|
}
|
|
|
|
pub async fn save(&self, store: &Arc<dyn ObjectStore>) -> Result<(), String> {
|
|
let json = serde_json::to_vec_pretty(self).map_err(|e| e.to_string())?;
|
|
ops::put(store, &self.storage_key(), json.into()).await
|
|
}
|
|
|
|
pub async fn load(store: &Arc<dyn ObjectStore>, name: &str) -> Result<Self, String> {
|
|
let key = format!("_hnsw_evals/{}.json", name);
|
|
let data = ops::get(store, &key).await?;
|
|
serde_json::from_slice(&data).map_err(|e| format!("parse eval set: {e}"))
|
|
}
|
|
|
|
pub async fn list(store: &Arc<dyn ObjectStore>) -> Result<Vec<String>, String> {
|
|
let keys = ops::list(store, Some("_hnsw_evals/")).await?;
|
|
Ok(keys
|
|
.into_iter()
|
|
.filter(|k| k.ends_with(".json"))
|
|
.map(|k| {
|
|
k.trim_start_matches("_hnsw_evals/")
|
|
.trim_end_matches(".json")
|
|
.to_string()
|
|
})
|
|
.collect())
|
|
}
|
|
}
|
|
|
|
/// Federation-aware wrapper around EvalSet persistence. Mirrors the
/// `TrialJournal` / `PromotionRegistry` pattern: harness files colocate
/// with their index's bucket (looked up via `IndexMeta.bucket`), falling
/// back to `primary` for indexes the registry has never seen. Legacy
/// harnesses predating federation remain discoverable — lookups transparently
/// try the resolved bucket first, then `primary` as a fallback. Cross-bucket
/// listing dedupes across every registered bucket so `GET /hnsw/evals`
/// returns a complete picture.
#[derive(Clone)]
pub struct HarnessStore {
    // All registered object-store buckets, addressable by name.
    buckets: Arc<BucketRegistry>,
    // Maps index name -> metadata (including owning bucket); consulted to
    // decide where an index's eval artifacts live.
    index_registry: IndexRegistry,
}
|
|
|
|
impl HarnessStore {
|
|
pub fn new(buckets: Arc<BucketRegistry>, index_registry: IndexRegistry) -> Self {
|
|
Self { buckets, index_registry }
|
|
}
|
|
|
|
/// Resolve which bucket holds this index's eval artifacts. Indexes the
|
|
/// registry has never heard of fall through to `primary`.
|
|
async fn bucket_for_index(&self, index_name: &str) -> String {
|
|
self.index_registry
|
|
.get(index_name)
|
|
.await
|
|
.map(|m| m.bucket)
|
|
.unwrap_or_else(|| "primary".to_string())
|
|
}
|
|
|
|
/// Save to the bucket that owns `eval.index_name`. Writes under the
|
|
/// standard `_hnsw_evals/{name}.json` prefix of the resolved bucket.
|
|
pub async fn save(&self, eval: &EvalSet) -> Result<(), String> {
|
|
let bucket = self.bucket_for_index(&eval.index_name).await;
|
|
let store = self.buckets.get(&bucket)?;
|
|
eval.save(&store).await
|
|
}
|
|
|
|
/// Load a harness by name, given the index it belongs to. Tries the
|
|
/// index's bucket first; if the file is absent AND the resolved bucket
|
|
/// isn't `primary`, falls through to `primary` so pre-federation evals
|
|
/// remain reachable without migration.
|
|
pub async fn load_for_index(
|
|
&self,
|
|
index_name: &str,
|
|
harness_name: &str,
|
|
) -> Result<EvalSet, String> {
|
|
let bucket = self.bucket_for_index(index_name).await;
|
|
let primary_store = self.buckets.get("primary")?;
|
|
let store = self.buckets.get(&bucket)?;
|
|
|
|
match EvalSet::load(&store, harness_name).await {
|
|
Ok(e) => Ok(e),
|
|
Err(e) if bucket != "primary" => EvalSet::load(&primary_store, harness_name)
|
|
.await
|
|
.map_err(|primary_err| format!("{bucket}: {e}; primary fallback: {primary_err}")),
|
|
Err(e) => Err(e),
|
|
}
|
|
}
|
|
|
|
/// Find a harness by name without knowing which index it belongs to —
|
|
/// used by `GET /hnsw/evals/{name}`. Scans every registered bucket;
|
|
/// first hit wins. Primary is searched first so pre-federation evals
|
|
/// with the same name as a federated one resolve deterministically.
|
|
pub async fn get_any(&self, harness_name: &str) -> Result<EvalSet, String> {
|
|
let bucket_infos = self.buckets.list().await;
|
|
let mut ordered: Vec<String> = bucket_infos.iter().map(|b| b.name.clone()).collect();
|
|
ordered.sort_by_key(|n| if n == "primary" { 0 } else { 1 });
|
|
|
|
let mut last_err = None;
|
|
for b in ordered {
|
|
let store = match self.buckets.get(&b) {
|
|
Ok(s) => s,
|
|
Err(e) => { last_err = Some(e); continue; }
|
|
};
|
|
match EvalSet::load(&store, harness_name).await {
|
|
Ok(e) => return Ok(e),
|
|
Err(e) => { last_err = Some(e); }
|
|
}
|
|
}
|
|
Err(last_err.unwrap_or_else(|| format!("no buckets registered for eval '{harness_name}'")))
|
|
}
|
|
|
|
/// Union of every harness name across every registered bucket.
|
|
/// Duplicates (same name in multiple buckets — pathological but
|
|
/// possible after a manual migration) are collapsed.
|
|
pub async fn list_all(&self) -> Vec<String> {
|
|
let mut all: HashSet<String> = HashSet::new();
|
|
for b in self.buckets.list().await {
|
|
let store = match self.buckets.get(&b.name) {
|
|
Ok(s) => s,
|
|
Err(_) => continue,
|
|
};
|
|
if let Ok(names) = EvalSet::list(&store).await {
|
|
all.extend(names);
|
|
}
|
|
}
|
|
let mut out: Vec<String> = all.into_iter().collect();
|
|
out.sort();
|
|
out
|
|
}
|
|
}
|
|
|
|
/// Cosine similarity for two same-length f32 slices.
///
/// Returns 0.0 when either vector has zero magnitude, avoiding a
/// divide-by-zero. Panics if `b` is shorter than `a` (indexes `b` by
/// `a`'s positions), matching the rest of the harness's assumption that
/// all vectors share one dimensionality.
fn cosine(a: &[f32], b: &[f32]) -> f32 {
    let (mut dot, mut norm_a, mut norm_b) = (0.0f32, 0.0f32, 0.0f32);
    for (i, &x) in a.iter().enumerate() {
        let y = b[i];
        dot += x * y;
        norm_a += x * x;
        norm_b += y * y;
    }
    if norm_a == 0.0 || norm_b == 0.0 {
        0.0
    } else {
        dot / (norm_a.sqrt() * norm_b.sqrt())
    }
}
|
|
|
|
/// Brute-force top-k search for a single query against all embeddings.
|
|
/// This is the ground-truth oracle that HNSW trials must approximate.
|
|
pub fn brute_force_top_k(
|
|
query: &[f32],
|
|
embeddings: &[StoredEmbedding],
|
|
k: usize,
|
|
) -> Vec<String> {
|
|
let mut scored: Vec<(f32, usize)> = embeddings
|
|
.iter()
|
|
.enumerate()
|
|
.map(|(i, e)| (cosine(query, &e.vector), i))
|
|
.collect();
|
|
// Partial sort — we only need top-k.
|
|
scored.sort_unstable_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
|
|
scored
|
|
.into_iter()
|
|
.take(k)
|
|
.map(|(_, i)| embeddings[i].doc_id.clone())
|
|
.collect()
|
|
}
|
|
|
|
/// Recall@k — fraction of ground-truth doc_ids that appear in predicted.
///
/// Both sides are truncated to their first `k` entries and deduplicated
/// before comparison. Set semantics matter: an HNSW result over a corpus
/// with repeated chunks (e.g. cached LLM responses) can return the same
/// doc_id several times, and counting each occurrence used to inflate
/// recall past 1.0 and poison promotion decisions. The denominator is the
/// number of *unique* ground-truth ids inside the k-window.
pub fn recall_at_k(predicted: &[String], ground_truth: &[String], k: usize) -> f32 {
    if k == 0 || ground_truth.is_empty() {
        return 0.0;
    }
    let truth: HashSet<&String> = ground_truth.iter().take(k).collect();
    let found = predicted
        .iter()
        .take(k)
        .collect::<HashSet<&String>>()
        .into_iter()
        .filter(|id| truth.contains(*id))
        .count();
    found as f32 / truth.len() as f32
}
|
|
|
|
/// Populate query_embedding and ground_truth for every query that lacks them.
|
|
pub async fn compute_ground_truth(
|
|
eval: &mut EvalSet,
|
|
embeddings: &[StoredEmbedding],
|
|
ai_client: &AiClient,
|
|
) -> Result<(), String> {
|
|
let need_embed: Vec<(usize, String)> = eval
|
|
.queries
|
|
.iter()
|
|
.enumerate()
|
|
.filter(|(_, q)| q.query_embedding.is_none())
|
|
.map(|(i, q)| (i, q.query_text.clone()))
|
|
.collect();
|
|
|
|
if !need_embed.is_empty() {
|
|
// Embed in one batch to keep things simple; for very large eval sets
|
|
// we'd batch this in chunks of 32.
|
|
let texts: Vec<String> = need_embed.iter().map(|(_, t)| t.clone()).collect();
|
|
let resp = ai_client
|
|
.embed(EmbedRequest { texts, model: None })
|
|
.await
|
|
.map_err(|e| format!("embed queries: {e}"))?;
|
|
for ((idx, _), vec) in need_embed.iter().zip(resp.embeddings.iter()) {
|
|
let v: Vec<f32> = vec.iter().map(|&x| x as f32).collect();
|
|
eval.queries[*idx].query_embedding = Some(v);
|
|
}
|
|
}
|
|
|
|
for q in eval.queries.iter_mut() {
|
|
if q.ground_truth.is_some() {
|
|
continue;
|
|
}
|
|
let emb = q.query_embedding.as_ref().ok_or("missing embedding")?;
|
|
q.ground_truth = Some(brute_force_top_k(emb, embeddings, eval.k));
|
|
}
|
|
eval.ground_truth_built = true;
|
|
Ok(())
|
|
}
|
|
|
|
/// Auto-generate a synthetic eval set by sampling every Nth chunk's text as
|
|
/// its own query. Useful for a quick-start eval when the user doesn't have
|
|
/// real natural-language queries yet.
|
|
pub fn synthetic_from_chunks(
|
|
eval_name: &str,
|
|
index_name: &str,
|
|
embeddings: &[StoredEmbedding],
|
|
sample_count: usize,
|
|
k: usize,
|
|
) -> EvalSet {
|
|
let n = embeddings.len();
|
|
let sample_count = sample_count.min(n);
|
|
let stride = (n / sample_count.max(1)).max(1);
|
|
|
|
let mut queries = Vec::with_capacity(sample_count);
|
|
for i in 0..sample_count {
|
|
let idx = (i * stride).min(n - 1);
|
|
let chunk = &embeddings[idx];
|
|
// Use the first ~200 chars of the chunk as the "query" — it should find
|
|
// itself and nearby chunks as top results.
|
|
let query_text: String = chunk.chunk_text.chars().take(200).collect();
|
|
queries.push(EvalQuery {
|
|
id: format!("syn-{}", i),
|
|
query_text,
|
|
ground_truth: None,
|
|
query_embedding: None,
|
|
});
|
|
}
|
|
|
|
EvalSet {
|
|
name: eval_name.to_string(),
|
|
index_name: index_name.to_string(),
|
|
k,
|
|
queries,
|
|
created_at: Utc::now(),
|
|
ground_truth_built: false,
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an owned `Vec<String>` from string literals.
    fn s(v: &[&str]) -> Vec<String> {
        v.iter().copied().map(String::from).collect()
    }

    #[test]
    fn recall_empty_ground_truth_is_zero() {
        let pred = s(&["a", "b"]);
        assert_eq!(recall_at_k(&pred, &[], 10), 0.0);
    }

    #[test]
    fn recall_k_zero_is_zero() {
        let one = s(&["a"]);
        assert_eq!(recall_at_k(&one, &one, 0), 0.0);
    }

    #[test]
    fn recall_perfect_match_equals_one() {
        let ids = s(&["a", "b", "c"]);
        assert!((recall_at_k(&ids, &ids, 3) - 1.0).abs() < 1e-6);
    }

    #[test]
    fn recall_half_match() {
        // Two of the four predicted ids appear in ground truth.
        let r = recall_at_k(&s(&["a", "b", "x", "y"]), &s(&["a", "b", "c", "d"]), 4);
        assert!((r - 0.5).abs() < 1e-6);
    }

    #[test]
    fn recall_duplicates_in_predicted_do_not_inflate() {
        // Regression guard for the pre-set-intersection implementation,
        // which counted each duplicate hit separately and could report
        // recall > 1.0 on corpora with repeated chunks (cached responses
        // etc). Set semantics keep the score bounded in [0, 1].
        let r = recall_at_k(&s(&["a", "a", "a", "a"]), &s(&["a", "b", "c", "d"]), 4);
        assert!(r <= 1.0, "recall {r} must not exceed 1.0");
        // One unique hit out of four unique truths = 0.25.
        assert!((r - 0.25).abs() < 1e-6);
    }

    #[test]
    fn recall_duplicates_in_ground_truth_handled() {
        // Duplicated ground-truth ids collapse to one entry; matching it
        // scores a full 1.0 (|pred ∩ gt| / |unique gt| = 1/1).
        let r = recall_at_k(&s(&["x"]), &s(&["x", "x", "x"]), 3);
        assert!(r <= 1.0);
        assert!((r - 1.0).abs() < 1e-6);
    }

    #[test]
    fn recall_respects_k_bound() {
        // Only the first k entries on either side participate in the score.
        let ids = s(&["a", "b", "c", "d"]);
        assert!((recall_at_k(&ids, &ids, 2) - 1.0).abs() < 1e-6);
    }
}
|