lakehouse/crates/vectord/src/index_registry.rs

/// Vector index registry — tracks all indexes with model versioning.
/// Each index knows which model created it, enabling:
/// - Multi-version indexes (same data, different models, coexist)
/// - Incremental re-embed (only new/changed docs on model upgrade)
/// - A/B search comparison between model versions

use chrono::{DateTime, Utc};
use object_store::ObjectStore;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;

use storaged::ops;

/// Metadata for a vector index.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexMeta {
    pub index_name: String,
    pub source: String,           // dataset this was built from
    pub model_name: String,       // "nomic-embed-text"
    pub model_version: String,    // "latest" or specific version
    pub dimensions: u32,          // 768
    pub chunk_count: usize,
    pub doc_count: usize,
    pub chunk_size: usize,
    pub overlap: usize,
    pub storage_key: String,      // "vectors/resumes_v1_nomic.parquet"
    pub created_at: DateTime<Utc>,
    pub build_time_secs: f32,
    pub chunks_per_sec: f32,
    /// Federation layer 2: which bucket holds this index's artifacts
    /// (trial journal + promotion file). Defaults to "primary" for
    /// pre-federation indexes — the serde default keeps old metadata
    /// files readable without migration.
    #[serde(default = "default_bucket")]
    pub bucket: String,
    /// ADR-019: which physical backend stores this index. `Parquet`
    /// means storage_key points at our binary-blob Parquet file;
    /// `Lance` means it points at a Lance dataset directory.
    #[serde(default)]
    pub vector_backend: shared::types::VectorBackend,
    /// ADR-020: prefix prepended to doc_ids during embedding. If set,
    /// hybrid search strips this prefix to match against SQL primary keys.
    /// None = doc_ids ARE the raw primary keys (no stripping needed).
    /// Existing indexes: "W-", "CAND-", "W500K-", etc.
    #[serde(default)]
    pub id_prefix: Option<String>,
    /// PRD 11.3 — when this index was last searched against. `None` =
    /// never used since registration (or pre-field-existed metadata).
    /// Incremental re-embed walks this to skip cold indexes.
    /// Scrum iter 11 flagged the missing field as a UnitMismatch
    /// because callers were reading `created_at` as a proxy for
    /// liveness, which conflated "built" with "used."
    #[serde(default)]
    pub last_used: Option<DateTime<Utc>>,
    /// PRD 11.3 — SHA-256 of (sorted source file list + chunk_size +
    /// overlap + model_version). Lets incremental re-embed detect
    /// "no change since last build" without scanning the source
    /// Parquet. None = signature not computed yet (pre-existing
    /// indexes before this field landed).
    #[serde(default)]
    pub build_signature: Option<String>,
}

fn default_bucket() -> String { "primary".to_string() }

/// Registry of all vector indexes.
#[derive(Clone)]
pub struct IndexRegistry {
    indexes: Arc<RwLock<HashMap<String, IndexMeta>>>,
    store: Arc<dyn ObjectStore>,
}

impl IndexRegistry {
    pub fn new(store: Arc<dyn ObjectStore>) -> Self {
        Self {
            indexes: Arc::new(RwLock::new(HashMap::new())),
            store,
        }
    }

    /// Rebuild from persisted index metadata on startup.
    pub async fn rebuild(&self) -> Result<usize, String> {
        let keys = ops::list(&self.store, Some("vectors/meta/")).await?;
        let mut reg = self.indexes.write().await;
        reg.clear();

        for key in &keys {
            if !key.ends_with(".json") { continue; }
            let data = ops::get(&self.store, key).await?;
            match serde_json::from_slice::<IndexMeta>(&data) {
                Ok(meta) => { reg.insert(meta.index_name.clone(), meta); }
                Err(e) => tracing::warn!("failed to load index meta {key}: {e}"),
            }
        }

        let count = reg.len();
        if count > 0 {
            tracing::info!("loaded {count} vector index metadata entries");
        }
        Ok(count)
    }

    /// Register a new index.
    pub async fn register(&self, meta: IndexMeta) -> Result<(), String> {
        let key = format!("vectors/meta/{}.json", meta.index_name);
        let json = serde_json::to_vec_pretty(&meta).map_err(|e| e.to_string())?;
        ops::put(&self.store, &key, json.into()).await?;
        self.indexes.write().await.insert(meta.index_name.clone(), meta);
        Ok(())
    }

    /// Get metadata for an index.
    pub async fn get(&self, index_name: &str) -> Option<IndexMeta> {
        self.indexes.read().await.get(index_name).cloned()
    }

    /// List all indexes, optionally filtered by source or model.
    pub async fn list(&self, source: Option<&str>, model: Option<&str>) -> Vec<IndexMeta> {
        self.indexes.read().await.values()
            .filter(|m| source.map_or(true, |s| m.source == s))
            .filter(|m| model.map_or(true, |mo| m.model_name == mo))
            .cloned()
            .collect()
    }

    /// Find all versions of an index for a given source dataset.
    /// Returns indexes sorted by creation time (newest first).
    pub async fn versions_for_source(&self, source: &str) -> Vec<IndexMeta> {
        let mut versions: Vec<IndexMeta> = self.indexes.read().await.values()
            .filter(|m| m.source == source)
            .cloned()
            .collect();
        versions.sort_by(|a, b| b.created_at.cmp(&a.created_at));
        versions
    }

    /// Delete an index (metadata only — vector Parquet stays for safety).
    pub async fn delete(&self, index_name: &str) -> Result<(), String> {
        let key = format!("vectors/meta/{index_name}.json");
        ops::delete(&self.store, &key).await?;
        self.indexes.write().await.remove(index_name);
        Ok(())
    }

    /// Stamp `last_used = now()` on an index. Search handlers call this
    /// on every hit so incremental re-embed (PRD 11.3) can tell live
    /// indexes from cold ones. Silently no-ops if the index is unknown
    /// — callers get best-effort behavior, not a 500 on a missing row.
    pub async fn touch_used(&self, index_name: &str) {
        if let Some(m) = self.indexes.write().await.get_mut(index_name) {
            m.last_used = Some(Utc::now());
        }
    }
}

/// Compute a stable build_signature for PRD 11.3 incremental re-embed.
/// Hashes (sorted source file list, chunk_size, overlap, model_version)
/// so a caller can ask "has anything we built from changed?" without
/// re-scanning the source parquet. Same inputs always produce the
/// same hash.
pub fn compute_build_signature(
    source_files: &[impl AsRef<str>],
    chunk_size: usize,
    overlap: usize,
    model_version: &str,
) -> String {
    use sha2::{Digest, Sha256};
    let mut sorted: Vec<&str> = source_files.iter().map(|s| s.as_ref()).collect();
    sorted.sort();
    let mut hasher = Sha256::new();
    for f in &sorted {
        hasher.update(f.as_bytes());
        hasher.update(b"\n");
    }
    hasher.update(chunk_size.to_le_bytes());
    hasher.update(overlap.to_le_bytes());
    hasher.update(model_version.as_bytes());
    format!("{:x}", hasher.finalize())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn build_signature_is_deterministic() {
        let sig1 = compute_build_signature(&["a.parquet", "b.parquet"], 800, 80, "v1");
        let sig2 = compute_build_signature(&["a.parquet", "b.parquet"], 800, 80, "v1");
        assert_eq!(sig1, sig2, "same inputs → same hash");
    }

    #[test]
    fn build_signature_order_invariant() {
        // Files get sorted internally so caller's order doesn't matter.
        let sig_a = compute_build_signature(&["a.parquet", "b.parquet"], 800, 80, "v1");
        let sig_b = compute_build_signature(&["b.parquet", "a.parquet"], 800, 80, "v1");
        assert_eq!(sig_a, sig_b, "file list order must not affect hash");
    }

    #[test]
    fn build_signature_changes_on_chunk_param() {
        let sig_a = compute_build_signature(&["a.parquet"], 800, 80, "v1");
        let sig_b = compute_build_signature(&["a.parquet"], 900, 80, "v1");
        assert_ne!(sig_a, sig_b, "chunk_size change → different hash");
    }

    #[test]
    fn build_signature_changes_on_model_version() {
        let sig_a = compute_build_signature(&["a.parquet"], 800, 80, "v1");
        let sig_b = compute_build_signature(&["a.parquet"], 800, 80, "v2");
        assert_ne!(sig_a, sig_b, "model version change → different hash");
    }

    #[tokio::test]
    async fn touch_used_updates_last_used() {
        use object_store::memory::InMemory;
        let store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
        let reg = IndexRegistry::new(store);
        let meta = IndexMeta {
            index_name: "test".into(),
            source: "s".into(),
            model_name: "m".into(),
            model_version: "v1".into(),
            dimensions: 768,
            chunk_count: 0,
            doc_count: 0,
            chunk_size: 800,
            overlap: 80,
            storage_key: "k".into(),
            created_at: Utc::now(),
            build_time_secs: 0.0,
            chunks_per_sec: 0.0,
            bucket: "primary".into(),
            vector_backend: Default::default(),
            id_prefix: None,
            last_used: None,
            build_signature: None,
        };
        reg.register(meta).await.unwrap();
        assert!(reg.get("test").await.unwrap().last_used.is_none());
        reg.touch_used("test").await;
        assert!(reg.get("test").await.unwrap().last_used.is_some());
    }

    #[tokio::test]
    async fn touch_used_is_noop_on_missing_index() {
        use object_store::memory::InMemory;
        let store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
        let reg = IndexRegistry::new(store);
        // No panic — unknown index just doesn't get touched.
        reg.touch_used("nonexistent").await;
    }

    #[test]
    fn index_meta_deserializes_without_new_fields_backcompat() {
        // Pre-field-existence metadata files on disk must still load.
        // Critical — we have ~40 .json meta files under vectors/meta/
        // that predate these fields.
        let json = r#"{
            "index_name": "resumes_v1",
            "source": "resumes",
            "model_name": "nomic-embed-text",
            "model_version": "latest",
            "dimensions": 768,
            "chunk_count": 100,
            "doc_count": 10,
            "chunk_size": 800,
            "overlap": 80,
            "storage_key": "vectors/resumes_v1.parquet",
            "created_at": "2026-04-20T00:00:00Z",
            "build_time_secs": 1.0,
            "chunks_per_sec": 100.0
        }"#;
        let meta: IndexMeta = serde_json::from_str(json).expect("must deserialize pre-field meta");
        assert!(meta.last_used.is_none());
        assert!(meta.build_signature.is_none());
        assert_eq!(meta.bucket, "primary");
    }
}