Scrum iter 11 on crates/vectord/src/index_registry.rs flagged two
concrete field gaps (90% confidence). Both were tagged UnitMismatch
/ missing-invariant.
IndexMeta gains two Optional fields:
last_used: Option<DateTime<Utc>>
PRD 11.3 — when this index was last searched against. Callers
were reading created_at as a liveness proxy, which conflated
"built" with "used." IndexRegistry::touch_used(name) stamps the
field (in the in-memory registry) each time it is called — intended
to be once per search hit, once the handlers are wired up; see the
note at the end. Incremental re-embed can then skip cold
indexes without misattributing "fresh build" to "recent use."
build_signature: Option<String>
PRD 11.3 — stable SHA-256 of (sorted source files + chunk_size
+ overlap + model_version). compute_build_signature() in the
same module is deterministic: file-order-invariant, changes on
chunk param, changes on model version. Lets incremental re-embed
answer "has anything changed since last build?" without scanning
the source Parquet.
Both fields are #[serde(default)] — the ~40 existing .json meta
files under vectors/meta/ load unchanged. Backward-compat verified
by the explicit `index_meta_deserializes_without_new_fields_backcompat`
test.
7 new tests:
- build_signature_is_deterministic
- build_signature_order_invariant (sorted internally)
- build_signature_changes_on_chunk_param
- build_signature_changes_on_model_version
- touch_used_updates_last_used
- touch_used_is_noop_on_missing_index
- index_meta_deserializes_without_new_fields_backcompat
Call-site fixes: crates/vectord/src/refresh.rs:294 and
crates/vectord/src/service.rs:244 both construct IndexMeta with
fully-literal init, default the new fields to None. One
indentation cleanup on service.rs (a pre-existing visual issue on
id_prefix: None).
Workspace warnings still at 0. touch_used() isn't wired into search
hot-path yet — follow-up commit when the search handlers can
adopt it without a broader refactor.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
282 lines
11 KiB
Rust
282 lines
11 KiB
Rust
//! Vector index registry — tracks all indexes with model versioning.
//!
//! Each index knows which model created it, enabling:
//! - Multi-version indexes (same data, different models, coexist)
//! - Incremental re-embed (only new/changed docs on model upgrade)
//! - A/B search comparison between model versions
|
|
|
|
use chrono::{DateTime, Utc};
|
|
use object_store::ObjectStore;
|
|
use serde::{Deserialize, Serialize};
|
|
use std::collections::HashMap;
|
|
use std::sync::Arc;
|
|
use tokio::sync::RwLock;
|
|
|
|
use storaged::ops;
|
|
|
|
/// Metadata for a vector index.
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct IndexMeta {
|
|
pub index_name: String,
|
|
pub source: String, // dataset this was built from
|
|
pub model_name: String, // "nomic-embed-text"
|
|
pub model_version: String, // "latest" or specific version
|
|
pub dimensions: u32, // 768
|
|
pub chunk_count: usize,
|
|
pub doc_count: usize,
|
|
pub chunk_size: usize,
|
|
pub overlap: usize,
|
|
pub storage_key: String, // "vectors/resumes_v1_nomic.parquet"
|
|
pub created_at: DateTime<Utc>,
|
|
pub build_time_secs: f32,
|
|
pub chunks_per_sec: f32,
|
|
/// Federation layer 2: which bucket holds this index's artifacts
|
|
/// (trial journal + promotion file). Defaults to "primary" for
|
|
/// pre-federation indexes — the serde default keeps old metadata
|
|
/// files readable without migration.
|
|
#[serde(default = "default_bucket")]
|
|
pub bucket: String,
|
|
/// ADR-019: which physical backend stores this index. `Parquet`
|
|
/// means storage_key points at our binary-blob Parquet file;
|
|
/// `Lance` means it points at a Lance dataset directory.
|
|
#[serde(default)]
|
|
pub vector_backend: shared::types::VectorBackend,
|
|
/// ADR-020: prefix prepended to doc_ids during embedding. If set,
|
|
/// hybrid search strips this prefix to match against SQL primary keys.
|
|
/// None = doc_ids ARE the raw primary keys (no stripping needed).
|
|
/// Existing indexes: "W-", "CAND-", "W500K-", etc.
|
|
#[serde(default)]
|
|
pub id_prefix: Option<String>,
|
|
/// PRD 11.3 — when this index was last searched against. `None` =
|
|
/// never used since registration (or pre-field-existed metadata).
|
|
/// Incremental re-embed walks this to skip cold indexes.
|
|
/// Scrum iter 11 flagged the missing field as a UnitMismatch
|
|
/// because callers were reading `created_at` as a proxy for
|
|
/// liveness, which conflated "built" with "used."
|
|
#[serde(default)]
|
|
pub last_used: Option<DateTime<Utc>>,
|
|
/// PRD 11.3 — SHA-256 of (sorted source file list + chunk_size +
|
|
/// overlap + model_version). Lets incremental re-embed detect
|
|
/// "no change since last build" without scanning the source
|
|
/// Parquet. None = signature not computed yet (pre-existing
|
|
/// indexes before this field landed).
|
|
#[serde(default)]
|
|
pub build_signature: Option<String>,
|
|
}
|
|
|
|
/// Serde fallback for the `bucket` field of `IndexMeta`: metadata files
/// that do not carry a bucket are treated as living in "primary".
fn default_bucket() -> String {
    String::from("primary")
}
|
|
|
|
/// Registry of all vector indexes.
|
|
#[derive(Clone)]
|
|
pub struct IndexRegistry {
|
|
indexes: Arc<RwLock<HashMap<String, IndexMeta>>>,
|
|
store: Arc<dyn ObjectStore>,
|
|
}
|
|
|
|
impl IndexRegistry {
|
|
pub fn new(store: Arc<dyn ObjectStore>) -> Self {
|
|
Self {
|
|
indexes: Arc::new(RwLock::new(HashMap::new())),
|
|
store,
|
|
}
|
|
}
|
|
|
|
/// Rebuild from persisted index metadata on startup.
|
|
pub async fn rebuild(&self) -> Result<usize, String> {
|
|
let keys = ops::list(&self.store, Some("vectors/meta/")).await?;
|
|
let mut reg = self.indexes.write().await;
|
|
reg.clear();
|
|
|
|
for key in &keys {
|
|
if !key.ends_with(".json") { continue; }
|
|
let data = ops::get(&self.store, key).await?;
|
|
match serde_json::from_slice::<IndexMeta>(&data) {
|
|
Ok(meta) => { reg.insert(meta.index_name.clone(), meta); }
|
|
Err(e) => tracing::warn!("failed to load index meta {key}: {e}"),
|
|
}
|
|
}
|
|
|
|
let count = reg.len();
|
|
if count > 0 {
|
|
tracing::info!("loaded {count} vector index metadata entries");
|
|
}
|
|
Ok(count)
|
|
}
|
|
|
|
/// Register a new index.
|
|
pub async fn register(&self, meta: IndexMeta) -> Result<(), String> {
|
|
let key = format!("vectors/meta/{}.json", meta.index_name);
|
|
let json = serde_json::to_vec_pretty(&meta).map_err(|e| e.to_string())?;
|
|
ops::put(&self.store, &key, json.into()).await?;
|
|
self.indexes.write().await.insert(meta.index_name.clone(), meta);
|
|
Ok(())
|
|
}
|
|
|
|
/// Get metadata for an index.
|
|
pub async fn get(&self, index_name: &str) -> Option<IndexMeta> {
|
|
self.indexes.read().await.get(index_name).cloned()
|
|
}
|
|
|
|
/// List all indexes, optionally filtered by source or model.
|
|
pub async fn list(&self, source: Option<&str>, model: Option<&str>) -> Vec<IndexMeta> {
|
|
self.indexes.read().await.values()
|
|
.filter(|m| source.map_or(true, |s| m.source == s))
|
|
.filter(|m| model.map_or(true, |mo| m.model_name == mo))
|
|
.cloned()
|
|
.collect()
|
|
}
|
|
|
|
/// Find all versions of an index for a given source dataset.
|
|
/// Returns indexes sorted by creation time (newest first).
|
|
pub async fn versions_for_source(&self, source: &str) -> Vec<IndexMeta> {
|
|
let mut versions: Vec<IndexMeta> = self.indexes.read().await.values()
|
|
.filter(|m| m.source == source)
|
|
.cloned()
|
|
.collect();
|
|
versions.sort_by(|a, b| b.created_at.cmp(&a.created_at));
|
|
versions
|
|
}
|
|
|
|
/// Delete an index (metadata only — vector Parquet stays for safety).
|
|
pub async fn delete(&self, index_name: &str) -> Result<(), String> {
|
|
let key = format!("vectors/meta/{index_name}.json");
|
|
ops::delete(&self.store, &key).await?;
|
|
self.indexes.write().await.remove(index_name);
|
|
Ok(())
|
|
}
|
|
|
|
/// Stamp `last_used = now()` on an index. Search handlers call this
|
|
/// on every hit so incremental re-embed (PRD 11.3) can tell live
|
|
/// indexes from cold ones. Silently no-ops if the index is unknown
|
|
/// — callers get best-effort behavior, not a 500 on a missing row.
|
|
pub async fn touch_used(&self, index_name: &str) {
|
|
if let Some(m) = self.indexes.write().await.get_mut(index_name) {
|
|
m.last_used = Some(Utc::now());
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Compute a stable build_signature for PRD 11.3 incremental re-embed.
|
|
/// Hashes (sorted source file list, chunk_size, overlap, model_version)
|
|
/// so a caller can ask "has anything we built from changed?" without
|
|
/// re-scanning the source parquet. Same inputs always produce the
|
|
/// same hash.
|
|
pub fn compute_build_signature(
|
|
source_files: &[impl AsRef<str>],
|
|
chunk_size: usize,
|
|
overlap: usize,
|
|
model_version: &str,
|
|
) -> String {
|
|
use sha2::{Digest, Sha256};
|
|
let mut sorted: Vec<&str> = source_files.iter().map(|s| s.as_ref()).collect();
|
|
sorted.sort();
|
|
let mut hasher = Sha256::new();
|
|
for f in &sorted {
|
|
hasher.update(f.as_bytes());
|
|
hasher.update(b"\n");
|
|
}
|
|
hasher.update(chunk_size.to_le_bytes());
|
|
hasher.update(overlap.to_le_bytes());
|
|
hasher.update(model_version.as_bytes());
|
|
format!("{:x}", hasher.finalize())
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use object_store::memory::InMemory;

    /// Fresh registry backed by a throwaway in-memory object store.
    fn empty_registry() -> IndexRegistry {
        let store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
        IndexRegistry::new(store)
    }

    /// Minimal metadata fixture named `name`, with both PRD 11.3 fields unset.
    fn sample_meta(name: &str) -> IndexMeta {
        IndexMeta {
            index_name: name.into(),
            source: "s".into(),
            model_name: "m".into(),
            model_version: "v1".into(),
            dimensions: 768,
            chunk_count: 0,
            doc_count: 0,
            chunk_size: 800,
            overlap: 80,
            storage_key: "k".into(),
            created_at: Utc::now(),
            build_time_secs: 0.0,
            chunks_per_sec: 0.0,
            bucket: "primary".into(),
            vector_backend: Default::default(),
            id_prefix: None,
            last_used: None,
            build_signature: None,
        }
    }

    #[test]
    fn build_signature_is_deterministic() {
        let sig1 = compute_build_signature(&["a.parquet", "b.parquet"], 800, 80, "v1");
        let sig2 = compute_build_signature(&["a.parquet", "b.parquet"], 800, 80, "v1");
        assert_eq!(sig1, sig2, "same inputs → same hash");
    }

    #[test]
    fn build_signature_order_invariant() {
        // Files get sorted internally so caller's order doesn't matter.
        let sig_a = compute_build_signature(&["a.parquet", "b.parquet"], 800, 80, "v1");
        let sig_b = compute_build_signature(&["b.parquet", "a.parquet"], 800, 80, "v1");
        assert_eq!(sig_a, sig_b, "file list order must not affect hash");
    }

    #[test]
    fn build_signature_changes_on_chunk_param() {
        let sig_a = compute_build_signature(&["a.parquet"], 800, 80, "v1");
        let sig_b = compute_build_signature(&["a.parquet"], 900, 80, "v1");
        assert_ne!(sig_a, sig_b, "chunk_size change → different hash");
    }

    #[test]
    fn build_signature_changes_on_model_version() {
        let sig_a = compute_build_signature(&["a.parquet"], 800, 80, "v1");
        let sig_b = compute_build_signature(&["a.parquet"], 800, 80, "v2");
        assert_ne!(sig_a, sig_b, "model version change → different hash");
    }

    #[tokio::test]
    async fn touch_used_updates_last_used() {
        let reg = empty_registry();
        reg.register(sample_meta("test")).await.unwrap();
        assert!(reg.get("test").await.unwrap().last_used.is_none());
        reg.touch_used("test").await;
        assert!(reg.get("test").await.unwrap().last_used.is_some());
    }

    #[tokio::test]
    async fn touch_used_is_noop_on_missing_index() {
        let reg = empty_registry();
        // No panic — unknown index just doesn't get touched.
        reg.touch_used("nonexistent").await;
        // A no-op must also not conjure an entry for the unknown name.
        assert!(reg.get("nonexistent").await.is_none());
        assert!(reg.list(None, None).await.is_empty());
    }

    #[test]
    fn index_meta_deserializes_without_new_fields_backcompat() {
        // Pre-field-existence metadata files on disk must still load.
        // Critical — we have ~40 .json meta files under vectors/meta/
        // that predate these fields.
        let json = r#"{
            "index_name": "resumes_v1",
            "source": "resumes",
            "model_name": "nomic-embed-text",
            "model_version": "latest",
            "dimensions": 768,
            "chunk_count": 100,
            "doc_count": 10,
            "chunk_size": 800,
            "overlap": 80,
            "storage_key": "vectors/resumes_v1.parquet",
            "created_at": "2026-04-20T00:00:00Z",
            "build_time_secs": 1.0,
            "chunks_per_sec": 100.0
        }"#;
        let meta: IndexMeta = serde_json::from_str(json).expect("must deserialize pre-field meta");
        assert!(meta.last_used.is_none());
        assert!(meta.build_signature.is_none());
        assert!(meta.id_prefix.is_none());
        assert_eq!(meta.bucket, "primary");
    }
}
|