root dbe00d018f Federation foundation + HNSW trial system + Postgres streaming + PRD reframe
Four shipped features, each measured end-to-end, plus a PRD realignment:

HNSW trial system (Phase 15 horizon item → complete)
- vectord: EmbeddingCache, harness (eval sets + brute-force ground truth),
  TrialJournal, parameterized HnswConfig on build_index_with_config
- /vectors/hnsw/trial, /hnsw/trials/{idx}, /hnsw/trials/{idx}/best,
  /hnsw/evals/{name}/autogen, /hnsw/cache/stats
- Measured on resumes_100k_v2 (100K × 768d): brute-force 44ms → HNSW 873µs
  at 100% recall@10. ec=80 es=30 locked as HnswConfig::default()
- Lower ec trades recall for faster builds: ec=20/es=30 reaches 0.96 recall
  with an 8s build; ec=80/es=30 reaches 1.00 recall in 230s
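
The ec/es tradeoff above implies a selection rule over trial results. A minimal sketch of that rule, assuming illustrative field names (`ef_construction`, `ef_search`) rather than the crate's actual `HnswConfig` API:

```rust
// Hypothetical mirror of the trial journal's decision rule: among trials
// that clear a recall floor, prefer the cheapest build. Field names and
// numbers are illustrative, not the real vectord API.
#[derive(Debug, Clone, Copy, PartialEq)]
struct TrialResult {
    ef_construction: usize,
    ef_search: usize,
    recall_at_10: f32,
    build_secs: f32,
}

/// Pick the fastest-building trial whose recall meets `floor`.
fn pick_best(trials: &[TrialResult], floor: f32) -> Option<TrialResult> {
    trials
        .iter()
        .filter(|t| t.recall_at_10 >= floor)
        .min_by(|a, b| a.build_secs.partial_cmp(&b.build_secs).unwrap())
        .copied()
}

fn main() {
    // Numbers from the measurements above: ec=20 → 0.96 in 8s, ec=80 → 1.00 in 230s.
    let trials = [
        TrialResult { ef_construction: 20, ef_search: 30, recall_at_10: 0.96, build_secs: 8.0 },
        TrialResult { ef_construction: 80, ef_search: 30, recall_at_10: 1.00, build_secs: 230.0 },
    ];
    // A 1.0 recall floor forces the slower ec=80 build...
    assert_eq!(pick_best(&trials, 1.0).unwrap().ef_construction, 80);
    // ...while a 0.95 floor lets the 8s build win.
    assert_eq!(pick_best(&trials, 0.95).unwrap().ef_construction, 20);
    println!("ok");
}
```

A recall floor rather than a weighted score keeps the rule explainable: you never silently ship an index below the floor to save build time.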

Catalog manifest repair
- catalogd: resync_from_parquet reads parquet footers to restore row_count
  and columns on drifted manifests
- POST /catalog/datasets/{name}/resync + POST /catalog/resync-missing
- All 7 staffing tables recovered to PRD-matching 2,469,278 rows
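
Resync works because parquet files carry their own metadata in the footer: the file ends with a 4-byte little-endian metadata length followed by the magic `PAR1`. A stdlib-only sketch of locating that metadata blob (the real `resync_from_parquet` would decode it with the `parquet` crate to recover row_count and columns):

```rust
// Locate the footer metadata of a parquet byte buffer. Only the framing
// check is shown; decoding the Thrift metadata itself needs the `parquet`
// crate and is out of scope for this sketch.
fn footer_metadata_len(file: &[u8]) -> Result<usize, String> {
    if file.len() < 8 || &file[file.len() - 4..] != b"PAR1" {
        return Err("not a parquet file: missing PAR1 magic".into());
    }
    let len_bytes: [u8; 4] = file[file.len() - 8..file.len() - 4]
        .try_into()
        .map_err(|_| "short footer".to_string())?;
    let meta_len = u32::from_le_bytes(len_bytes) as usize;
    if meta_len + 8 > file.len() {
        return Err("metadata length exceeds file size".into());
    }
    Ok(meta_len)
}

fn main() {
    // Fabricated tail: 16 bytes standing in for metadata, then length, then magic.
    let mut buf = vec![0u8; 16];
    buf.extend_from_slice(&16u32.to_le_bytes());
    buf.extend_from_slice(b"PAR1");
    assert_eq!(footer_metadata_len(&buf), Ok(16));
    assert!(footer_metadata_len(b"nope").is_err());
    println!("ok");
}
```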

Federation foundation (ADR-017)
- shared::secrets: SecretsProvider trait + FileSecretsProvider (reads
  /etc/lakehouse/secrets.toml, enforces 0600 perms)
- storaged::registry::BucketRegistry — multi-bucket resolution with
  rescue_bucket read fallback and reachability probing
- storaged::error_journal — bucket op failures visible in one HTTP call
- storaged::append_log — write-once batched append pattern (fixes the RMW
  anti-pattern llms3.com calls out; errors and trial journals both use it)
- /storage/buckets, /storage/errors, /storage/bucket-health,
  /storage/errors/{flush,compact}
- Bucket-aware I/O at /storage/buckets/{bucket}/objects/{*key} with
  X-Lakehouse-Rescue-Used observability headers on fallback
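
A minimal sketch of the rescue-bucket read resolution, assuming the shape described in ADR-017; reachability here is a plain bool map standing in for the live probe, and the names are from this commit, not a published API:

```rust
use std::collections::HashMap;

// Toy BucketRegistry: resolve reads against the requested bucket, falling
// back to the rescue bucket when the primary is unreachable.
struct BucketRegistry {
    reachable: HashMap<String, bool>,
    rescue_bucket: String,
}

impl BucketRegistry {
    /// Returns (bucket, rescue_used) so the HTTP layer can emit the
    /// X-Lakehouse-Rescue-Used header on fallback.
    fn resolve_read(&self, bucket: &str) -> Option<(String, bool)> {
        if self.reachable.get(bucket).copied().unwrap_or(false) {
            return Some((bucket.to_string(), false));
        }
        if self.reachable.get(&self.rescue_bucket).copied().unwrap_or(false) {
            return Some((self.rescue_bucket.clone(), true));
        }
        None // both unreachable: a case for the error journal
    }
}

fn main() {
    let reg = BucketRegistry {
        reachable: HashMap::from([("hot".to_string(), false), ("rescue".to_string(), true)]),
        rescue_bucket: "rescue".to_string(),
    };
    assert_eq!(reg.resolve_read("hot"), Some(("rescue".to_string(), true)));
    assert_eq!(reg.resolve_read("rescue"), Some(("rescue".to_string(), false)));
    println!("ok");
}
```

Returning the rescue flag from resolution (rather than logging it deep in the I/O path) is what makes the fallback observable in a response header.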

Postgres streaming ingest
- ingestd::pg_stream: DSN parser, batched ORDER BY + LIMIT/OFFSET pagination
  into ArrowWriter, lineage redacts password
- POST /ingest/db — verified against live knowledge_base.team_runs
  (586 rows × 13 cols, 6 batches, 196ms end-to-end)
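
A sketch of the lineage password redaction, assuming the common `postgres://user:pass@host/db` URI shape; the real pg_stream parser presumably also handles key=value DSNs and percent-encoding, which this ignores:

```rust
// Redact the password portion of a postgres-style DSN for lineage records.
// Only URI-form DSNs are handled in this sketch.
fn redact_dsn(dsn: &str) -> String {
    let Some(scheme_end) = dsn.find("://") else {
        return dsn.to_string();
    };
    let rest = &dsn[scheme_end + 3..];
    let Some(at) = rest.find('@') else {
        return dsn.to_string(); // no credentials present
    };
    // Only search for ':' before the '@' so host:port is untouched.
    match rest[..at].find(':') {
        Some(colon) => format!(
            "{}{}:***{}",
            &dsn[..scheme_end + 3],
            &rest[..colon],
            &rest[at..]
        ),
        None => dsn.to_string(), // user only, nothing to redact
    }
}

fn main() {
    assert_eq!(
        redact_dsn("postgres://app:s3cret@db.local:5432/knowledge_base"),
        "postgres://app:***@db.local:5432/knowledge_base"
    );
    assert_eq!(redact_dsn("postgres://db.local/kb"), "postgres://db.local/kb");
    println!("ok");
}
```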

PRD realignment (2026-04-16)
- Dual use case: staffing analytics + local LLM knowledge substrate
- Removed "multi-tenancy (single-owner system)" from non-goals
- Added invariants 8-11: indexes hot-swappable, per-reader profiles,
  trials-as-data, operational failures findable in one HTTP call
- New phases 16 (hot-swap generations), 17 (model profiles + dataset
  bindings), 18 (Lance vs Parquet+sidecar evaluation)
- Known ceilings table documents the 5M vector wall and escape hatches
- ADR-017 (federation), ADR-018 (append-log pattern) added
- EXECUTION_PLAN.md sequences phases B-E with success gates and
  decision rules

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 01:50:05 -05:00


/// Eval harness for HNSW tuning.
///
/// An EvalSet is a named collection of queries against a specific vector index.
/// Each query has a query_text and (optionally pre-computed) ground_truth
/// doc_ids — the "correct" top-k results that brute-force cosine returns.
/// HNSW trials are scored by how well they recreate that top-k (recall@k).
///
/// Storage: `_hnsw_evals/{name}.json` as a single JSON document. Small.
use chrono::{DateTime, Utc};
use object_store::ObjectStore;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use aibridge::client::{AiClient, EmbedRequest};
use storaged::ops;
use crate::store::StoredEmbedding;
/// A single eval query with optional pre-computed ground truth.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalQuery {
pub id: String,
pub query_text: String,
/// Ordered list of doc_ids that brute-force returns as top-k.
/// `None` means the ground truth hasn't been computed yet; `compute_ground_truth`
/// fills it in against the current embedding set.
#[serde(default)]
pub ground_truth: Option<Vec<String>>,
/// Optional pre-computed query embedding. Filling this avoids re-embedding
/// the same query on every trial.
#[serde(default)]
pub query_embedding: Option<Vec<f32>>,
}
/// A named eval set tied to a specific vector index.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSet {
pub name: String,
pub index_name: String,
pub k: usize, // top-k used for recall calculation
pub queries: Vec<EvalQuery>,
pub created_at: DateTime<Utc>,
pub ground_truth_built: bool,
}
impl EvalSet {
pub fn new(name: &str, index_name: &str, k: usize) -> Self {
Self {
name: name.to_string(),
index_name: index_name.to_string(),
k,
queries: Vec::new(),
created_at: Utc::now(),
ground_truth_built: false,
}
}
pub fn storage_key(&self) -> String {
format!("_hnsw_evals/{}.json", self.name)
}
pub async fn save(&self, store: &Arc<dyn ObjectStore>) -> Result<(), String> {
let json = serde_json::to_vec_pretty(self).map_err(|e| e.to_string())?;
ops::put(store, &self.storage_key(), json.into()).await
}
pub async fn load(store: &Arc<dyn ObjectStore>, name: &str) -> Result<Self, String> {
let key = format!("_hnsw_evals/{}.json", name);
let data = ops::get(store, &key).await?;
serde_json::from_slice(&data).map_err(|e| format!("parse eval set: {e}"))
}
pub async fn list(store: &Arc<dyn ObjectStore>) -> Result<Vec<String>, String> {
let keys = ops::list(store, Some("_hnsw_evals/")).await?;
Ok(keys
.into_iter()
.filter(|k| k.ends_with(".json"))
.map(|k| {
k.trim_start_matches("_hnsw_evals/")
.trim_end_matches(".json")
.to_string()
})
.collect())
}
}
/// Cosine similarity for two same-length f32 slices.
fn cosine(a: &[f32], b: &[f32]) -> f32 {
let mut dot = 0.0f32;
let mut na = 0.0f32;
let mut nb = 0.0f32;
for i in 0..a.len() {
dot += a[i] * b[i];
na += a[i] * a[i];
nb += b[i] * b[i];
}
if na == 0.0 || nb == 0.0 {
return 0.0;
}
dot / (na.sqrt() * nb.sqrt())
}
/// Brute-force top-k search for a single query against all embeddings.
/// This is the ground-truth oracle that HNSW trials must approximate.
pub fn brute_force_top_k(
query: &[f32],
embeddings: &[StoredEmbedding],
k: usize,
) -> Vec<String> {
let mut scored: Vec<(f32, usize)> = embeddings
.iter()
.enumerate()
.map(|(i, e)| (cosine(query, &e.vector), i))
.collect();
// Full sort for simplicity; only the top-k is consumed, so a partial
// select (e.g. select_nth_unstable_by) would do for very large sets.
scored.sort_unstable_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
scored
.into_iter()
.take(k)
.map(|(_, i)| embeddings[i].doc_id.clone())
.collect()
}
/// Recall@k — fraction of ground-truth doc_ids that appear in predicted.
pub fn recall_at_k(predicted: &[String], ground_truth: &[String], k: usize) -> f32 {
if ground_truth.is_empty() || k == 0 {
return 0.0;
}
let gt_set: std::collections::HashSet<&String> =
ground_truth.iter().take(k).collect();
let hits = predicted
.iter()
.take(k)
.filter(|d| gt_set.contains(d))
.count();
hits as f32 / gt_set.len() as f32
}
/// Populate query_embedding and ground_truth for every query that lacks them.
pub async fn compute_ground_truth(
eval: &mut EvalSet,
embeddings: &[StoredEmbedding],
ai_client: &AiClient,
) -> Result<(), String> {
let need_embed: Vec<(usize, String)> = eval
.queries
.iter()
.enumerate()
.filter(|(_, q)| q.query_embedding.is_none())
.map(|(i, q)| (i, q.query_text.clone()))
.collect();
if !need_embed.is_empty() {
// Embed in one batch to keep things simple; for very large eval sets
// we'd batch this in chunks of 32.
let texts: Vec<String> = need_embed.iter().map(|(_, t)| t.clone()).collect();
let resp = ai_client
.embed(EmbedRequest { texts, model: None })
.await
.map_err(|e| format!("embed queries: {e}"))?;
if resp.embeddings.len() != need_embed.len() {
return Err(format!(
"embed queries: expected {} embeddings, got {}",
need_embed.len(),
resp.embeddings.len()
));
}
for ((idx, _), vec) in need_embed.iter().zip(resp.embeddings.iter()) {
let v: Vec<f32> = vec.iter().map(|&x| x as f32).collect();
eval.queries[*idx].query_embedding = Some(v);
}
}
for q in eval.queries.iter_mut() {
if q.ground_truth.is_some() {
continue;
}
let emb = q.query_embedding.as_ref().ok_or("missing embedding")?;
q.ground_truth = Some(brute_force_top_k(emb, embeddings, eval.k));
}
eval.ground_truth_built = true;
Ok(())
}
/// Auto-generate a synthetic eval set by sampling every Nth chunk's text as
/// its own query. Useful for a quick-start eval when the user doesn't have
/// real natural-language queries yet.
pub fn synthetic_from_chunks(
eval_name: &str,
index_name: &str,
embeddings: &[StoredEmbedding],
sample_count: usize,
k: usize,
) -> EvalSet {
let n = embeddings.len();
let sample_count = sample_count.min(n);
let stride = (n / sample_count.max(1)).max(1);
let mut queries = Vec::with_capacity(sample_count);
for i in 0..sample_count {
let idx = (i * stride).min(n - 1);
let chunk = &embeddings[idx];
// Use the first ~200 chars of the chunk as the "query" — it should find
// itself and nearby chunks as top results.
let query_text: String = chunk.chunk_text.chars().take(200).collect();
queries.push(EvalQuery {
id: format!("syn-{}", i),
query_text,
ground_truth: None,
query_embedding: None,
});
}
EvalSet {
name: eval_name.to_string(),
index_name: index_name.to_string(),
k,
queries,
created_at: Utc::now(),
ground_truth_built: false,
}
}
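// Sanity checks added alongside this commit's code (not part of the original
// change): exercise the pure helpers above on tiny hand-built inputs. They
// deliberately avoid StoredEmbedding so no store fixtures are needed.
#[cfg(test)]
mod tests {
use super::*;

#[test]
fn cosine_basics() {
// Identical vectors → 1, orthogonal → 0, zero vector → defined as 0.
assert!((cosine(&[1.0, 0.0], &[1.0, 0.0]) - 1.0).abs() < 1e-6);
assert!(cosine(&[1.0, 0.0], &[0.0, 1.0]).abs() < 1e-6);
assert_eq!(cosine(&[0.0, 0.0], &[1.0, 0.0]), 0.0);
}

#[test]
fn recall_counts_overlap() {
let gt: Vec<String> = ["a", "b", "c"].iter().map(|s| s.to_string()).collect();
let pred: Vec<String> = ["a", "c", "x"].iter().map(|s| s.to_string()).collect();
// Two of the three ground-truth ids appear in the predictions.
assert!((recall_at_k(&pred, &gt, 3) - 2.0 / 3.0).abs() < 1e-6);
assert_eq!(recall_at_k(&pred, &gt, 0), 0.0);
}
}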