lakehouse/crates/vectord/src/embedding_cache.rs

/// In-memory cache for StoredEmbedding vectors.
///
/// Rationale: loading 100K embeddings from Parquet takes ~2-5s. When an AI agent
/// iterates on HNSW parameters, each trial would repeat that cost. The cache
/// pins embeddings in memory so trials reuse them.
///
/// This is a pure performance layer — the Parquet file is still the source of
/// truth (ADR-008). Eviction is safe; worst case is one slow reload.

use object_store::ObjectStore;
use serde::Serialize;
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;

use crate::store::{self, StoredEmbedding};

#[derive(Clone)]
pub struct EmbeddingCache {
    store: Arc<dyn ObjectStore>,
    cache: Arc<RwLock<HashMap<String, Arc<Vec<StoredEmbedding>>>>>,
}

#[derive(Debug, Clone, Serialize)]
pub struct CacheEntry {
    pub index_name: String,
    pub vectors: usize,
    pub dimensions: usize,
    pub memory_bytes: u64,
}

#[derive(Debug, Clone, Serialize)]
pub struct CacheStats {
    pub entries: Vec<CacheEntry>,
    pub total_memory_bytes: u64,
}

impl EmbeddingCache {
    pub fn new(store: Arc<dyn ObjectStore>) -> Self {
        Self {
            store,
            cache: Arc::new(RwLock::new(HashMap::new())),
        }
    }

    /// Return cached embeddings, loading from object storage on first request.
    pub async fn get_or_load(
        &self,
        index_name: &str,
    ) -> Result<Arc<Vec<StoredEmbedding>>, String> {
        if let Some(cached) = self.cache.read().await.get(index_name) {
            return Ok(cached.clone());
        }

        // Load under a write lock so concurrent callers only hit disk once.
        let mut guard = self.cache.write().await;
        if let Some(cached) = guard.get(index_name) {
            return Ok(cached.clone());
        }
        tracing::info!("embedding_cache: loading '{index_name}' from object storage");
        let t0 = std::time::Instant::now();
        let loaded = store::load_embeddings(&self.store, index_name).await?;
        let n = loaded.len();
        let arc = Arc::new(loaded);
        guard.insert(index_name.to_string(), arc.clone());
        tracing::info!(
            "embedding_cache: loaded '{index_name}' — {n} vectors in {:.2}s",
            t0.elapsed().as_secs_f32()
        );
        Ok(arc)
    }

    pub async fn evict(&self, index_name: &str) -> bool {
        self.cache.write().await.remove(index_name).is_some()
    }

    pub async fn stats(&self) -> CacheStats {
        let cache = self.cache.read().await;
        let mut entries = Vec::with_capacity(cache.len());
        let mut total: u64 = 0;
        for (name, embs) in cache.iter() {
            let dims = embs.first().map(|e| e.vector.len()).unwrap_or(0);
            // Rough estimate: vector data + chunk_text + metadata overhead.
            let vec_bytes = (embs.len() * dims * std::mem::size_of::<f32>()) as u64;
            let text_bytes: u64 = embs.iter().map(|e| e.chunk_text.len() as u64).sum();
            let overhead = (embs.len() * 128) as u64; // strings + struct overhead
            let mem = vec_bytes + text_bytes + overhead;
            total += mem;
            entries.push(CacheEntry {
                index_name: name.clone(),
                vectors: embs.len(),
                dimensions: dims,
                memory_bytes: mem,
            });
        }
        CacheStats { entries, total_memory_bytes: total }
    }
}