/// Vector index registry — tracks all indexes with model versioning. /// Each index knows which model created it, enabling: /// - Multi-version indexes (same data, different models, coexist) /// - Incremental re-embed (only new/changed docs on model upgrade) /// - A/B search comparison between model versions use chrono::{DateTime, Utc}; use object_store::ObjectStore; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::sync::Arc; use tokio::sync::RwLock; use storaged::ops; /// Metadata for a vector index. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct IndexMeta { pub index_name: String, pub source: String, // dataset this was built from pub model_name: String, // "nomic-embed-text" pub model_version: String, // "latest" or specific version pub dimensions: u32, // 768 pub chunk_count: usize, pub doc_count: usize, pub chunk_size: usize, pub overlap: usize, pub storage_key: String, // "vectors/resumes_v1_nomic.parquet" pub created_at: DateTime, pub build_time_secs: f32, pub chunks_per_sec: f32, /// Federation layer 2: which bucket holds this index's artifacts /// (trial journal + promotion file). Defaults to "primary" for /// pre-federation indexes — the serde default keeps old metadata /// files readable without migration. #[serde(default = "default_bucket")] pub bucket: String, /// ADR-019: which physical backend stores this index. `Parquet` /// means storage_key points at our binary-blob Parquet file; /// `Lance` means it points at a Lance dataset directory. #[serde(default)] pub vector_backend: shared::types::VectorBackend, /// ADR-020: prefix prepended to doc_ids during embedding. If set, /// hybrid search strips this prefix to match against SQL primary keys. /// None = doc_ids ARE the raw primary keys (no stripping needed). /// Existing indexes: "W-", "CAND-", "W500K-", etc. #[serde(default)] pub id_prefix: Option, /// PRD 11.3 — when this index was last searched against. `None` = /// never used since registration (or pre-field-existed metadata). /// Incremental re-embed walks this to skip cold indexes. /// Scrum iter 11 flagged the missing field as a UnitMismatch /// because callers were reading `created_at` as a proxy for /// liveness, which conflated "built" with "used." #[serde(default)] pub last_used: Option>, /// PRD 11.3 — SHA-256 of (sorted source file list + chunk_size + /// overlap + model_version). Lets incremental re-embed detect /// "no change since last build" without scanning the source /// Parquet. None = signature not computed yet (pre-existing /// indexes before this field landed). #[serde(default)] pub build_signature: Option, } fn default_bucket() -> String { "primary".to_string() } /// Registry of all vector indexes. #[derive(Clone)] pub struct IndexRegistry { indexes: Arc>>, store: Arc, } impl IndexRegistry { pub fn new(store: Arc) -> Self { Self { indexes: Arc::new(RwLock::new(HashMap::new())), store, } } /// Rebuild from persisted index metadata on startup. pub async fn rebuild(&self) -> Result { let keys = ops::list(&self.store, Some("vectors/meta/")).await?; let mut reg = self.indexes.write().await; reg.clear(); for key in &keys { if !key.ends_with(".json") { continue; } let data = ops::get(&self.store, key).await?; match serde_json::from_slice::(&data) { Ok(meta) => { reg.insert(meta.index_name.clone(), meta); } Err(e) => tracing::warn!("failed to load index meta {key}: {e}"), } } let count = reg.len(); if count > 0 { tracing::info!("loaded {count} vector index metadata entries"); } Ok(count) } /// Register a new index. pub async fn register(&self, meta: IndexMeta) -> Result<(), String> { let key = format!("vectors/meta/{}.json", meta.index_name); let json = serde_json::to_vec_pretty(&meta).map_err(|e| e.to_string())?; ops::put(&self.store, &key, json.into()).await?; self.indexes.write().await.insert(meta.index_name.clone(), meta); Ok(()) } /// Get metadata for an index. pub async fn get(&self, index_name: &str) -> Option { self.indexes.read().await.get(index_name).cloned() } /// List all indexes, optionally filtered by source or model. pub async fn list(&self, source: Option<&str>, model: Option<&str>) -> Vec { self.indexes.read().await.values() .filter(|m| source.map_or(true, |s| m.source == s)) .filter(|m| model.map_or(true, |mo| m.model_name == mo)) .cloned() .collect() } /// Find all versions of an index for a given source dataset. /// Returns indexes sorted by creation time (newest first). pub async fn versions_for_source(&self, source: &str) -> Vec { let mut versions: Vec = self.indexes.read().await.values() .filter(|m| m.source == source) .cloned() .collect(); versions.sort_by(|a, b| b.created_at.cmp(&a.created_at)); versions } /// Delete an index (metadata only — vector Parquet stays for safety). pub async fn delete(&self, index_name: &str) -> Result<(), String> { let key = format!("vectors/meta/{index_name}.json"); ops::delete(&self.store, &key).await?; self.indexes.write().await.remove(index_name); Ok(()) } /// Stamp `last_used = now()` on an index. Search handlers call this /// on every hit so incremental re-embed (PRD 11.3) can tell live /// indexes from cold ones. Silently no-ops if the index is unknown /// — callers get best-effort behavior, not a 500 on a missing row. pub async fn touch_used(&self, index_name: &str) { if let Some(m) = self.indexes.write().await.get_mut(index_name) { m.last_used = Some(Utc::now()); } } } /// Compute a stable build_signature for PRD 11.3 incremental re-embed. /// Hashes (sorted source file list, chunk_size, overlap, model_version) /// so a caller can ask "has anything we built from changed?" without /// re-scanning the source parquet. Same inputs always produce the /// same hash. pub fn compute_build_signature( source_files: &[impl AsRef], chunk_size: usize, overlap: usize, model_version: &str, ) -> String { use sha2::{Digest, Sha256}; let mut sorted: Vec<&str> = source_files.iter().map(|s| s.as_ref()).collect(); sorted.sort(); let mut hasher = Sha256::new(); for f in &sorted { hasher.update(f.as_bytes()); hasher.update(b"\n"); } hasher.update(chunk_size.to_le_bytes()); hasher.update(overlap.to_le_bytes()); hasher.update(model_version.as_bytes()); format!("{:x}", hasher.finalize()) } #[cfg(test)] mod tests { use super::*; #[test] fn build_signature_is_deterministic() { let sig1 = compute_build_signature(&["a.parquet", "b.parquet"], 800, 80, "v1"); let sig2 = compute_build_signature(&["a.parquet", "b.parquet"], 800, 80, "v1"); assert_eq!(sig1, sig2, "same inputs → same hash"); } #[test] fn build_signature_order_invariant() { // Files get sorted internally so caller's order doesn't matter. let sig_a = compute_build_signature(&["a.parquet", "b.parquet"], 800, 80, "v1"); let sig_b = compute_build_signature(&["b.parquet", "a.parquet"], 800, 80, "v1"); assert_eq!(sig_a, sig_b, "file list order must not affect hash"); } #[test] fn build_signature_changes_on_chunk_param() { let sig_a = compute_build_signature(&["a.parquet"], 800, 80, "v1"); let sig_b = compute_build_signature(&["a.parquet"], 900, 80, "v1"); assert_ne!(sig_a, sig_b, "chunk_size change → different hash"); } #[test] fn build_signature_changes_on_model_version() { let sig_a = compute_build_signature(&["a.parquet"], 800, 80, "v1"); let sig_b = compute_build_signature(&["a.parquet"], 800, 80, "v2"); assert_ne!(sig_a, sig_b, "model version change → different hash"); } #[tokio::test] async fn touch_used_updates_last_used() { use object_store::memory::InMemory; let store: Arc = Arc::new(InMemory::new()); let reg = IndexRegistry::new(store); let meta = IndexMeta { index_name: "test".into(), source: "s".into(), model_name: "m".into(), model_version: "v1".into(), dimensions: 768, chunk_count: 0, doc_count: 0, chunk_size: 800, overlap: 80, storage_key: "k".into(), created_at: Utc::now(), build_time_secs: 0.0, chunks_per_sec: 0.0, bucket: "primary".into(), vector_backend: Default::default(), id_prefix: None, last_used: None, build_signature: None, }; reg.register(meta).await.unwrap(); assert!(reg.get("test").await.unwrap().last_used.is_none()); reg.touch_used("test").await; assert!(reg.get("test").await.unwrap().last_used.is_some()); } #[tokio::test] async fn touch_used_is_noop_on_missing_index() { use object_store::memory::InMemory; let store: Arc = Arc::new(InMemory::new()); let reg = IndexRegistry::new(store); // No panic — unknown index just doesn't get touched. reg.touch_used("nonexistent").await; } #[test] fn index_meta_deserializes_without_new_fields_backcompat() { // Pre-field-existence metadata files on disk must still load. // Critical — we have ~40 .json meta files under vectors/meta/ // that predate these fields. let json = r#"{ "index_name": "resumes_v1", "source": "resumes", "model_name": "nomic-embed-text", "model_version": "latest", "dimensions": 768, "chunk_count": 100, "doc_count": 10, "chunk_size": 800, "overlap": 80, "storage_key": "vectors/resumes_v1.parquet", "created_at": "2026-04-20T00:00:00Z", "build_time_secs": 1.0, "chunks_per_sec": 100.0 }"#; let meta: IndexMeta = serde_json::from_str(json).expect("must deserialize pre-field meta"); assert!(meta.last_used.is_none()); assert!(meta.build_signature.is_none()); assert_eq!(meta.bucket, "primary"); } }