Scrum iter 11 on crates/vectord/src/index_registry.rs flagged two
concrete field gaps (90% confidence). Both were tagged UnitMismatch
/ missing-invariant.
IndexMeta gains two optional fields:
last_used: Option<DateTime<Utc>>
PRD 11.3 — when this index was last searched against. Callers
were reading created_at as a liveness proxy, which conflated
"built" with "used." IndexRegistry::touch_used(name) stamps the
field on every hit; incremental re-embed can now skip cold
indexes without misattributing "fresh build" to "recent use."
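The touch path is tiny; a sketch of roughly what it looks like (the get/register round-trip is inferred from how refresh.rs uses IndexRegistry below, and the exact signature of touch_used is an assumption, not the actual diff):

    impl IndexRegistry {
        /// Stamp `last_used` with the current time on a search hit.
        /// No-op when the index isn't registered
        /// (touch_used_is_noop_on_missing_index covers that case).
        pub async fn touch_used(&self, name: &str) {
            if let Some(mut meta) = self.get(name).await {
                meta.last_used = Some(chrono::Utc::now());
                // Best-effort persist; a failed write shouldn't fail the search.
                let _ = self.register(meta).await;
            }
        }
    }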
build_signature: Option<String>
PRD 11.3 — stable SHA-256 of (sorted source files + chunk_size
+ overlap + model_version). compute_build_signature() in the
same module is deterministic: file-order-invariant, changes on
chunk param, changes on model version. Lets incremental re-embed
answer "has anything changed since last build?" without scanning
the source Parquet.
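A sketch of what that signature computation amounts to; the real compute_build_signature() lives in index_registry.rs, and the parameter list and sha2 usage here are assumptions rather than the actual code:

    use sha2::{Digest, Sha256};

    /// Deterministic over the same inputs, invariant to source-file order,
    /// and sensitive to chunk params and model version.
    pub fn compute_build_signature(
        source_files: &[String],
        chunk_size: usize,
        overlap: usize,
        model_version: &str,
    ) -> String {
        let mut files = source_files.to_vec();
        files.sort(); // file-order invariance
        let mut hasher = Sha256::new();
        for f in &files {
            hasher.update(f.as_bytes());
            hasher.update(b"\n");
        }
        hasher.update(format!("{chunk_size}|{overlap}|{model_version}").as_bytes());
        format!("{:x}", hasher.finalize())
    }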
Both fields are #[serde(default)] — the ~40 existing .json meta
files under vectors/meta/ load unchanged. Backward-compat verified
by the explicit `index_meta_deserializes_without_new_fields_backcompat`
test.
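The mechanism is plain serde behavior: with #[serde(default)], a key absent from old JSON deserializes to None. A toy illustration of the same shape (not the real IndexMeta, fixture, or test):

    #[derive(serde::Deserialize)]
    struct MetaSketch {
        index_name: String,
        #[serde(default)]
        last_used: Option<chrono::DateTime<chrono::Utc>>,
        #[serde(default)]
        build_signature: Option<String>,
    }

    #[test]
    fn old_meta_json_still_loads() {
        let old = r#"{"index_name":"reviews_v1"}"#;
        let m: MetaSketch = serde_json::from_str(old).unwrap();
        assert!(m.last_used.is_none() && m.build_signature.is_none());
    }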
7 new tests:
- build_signature_is_deterministic
- build_signature_order_invariant (sorted internally)
- build_signature_changes_on_chunk_param
- build_signature_changes_on_model_version
- touch_used_updates_last_used
- touch_used_is_noop_on_missing_index
- index_meta_deserializes_without_new_fields_backcompat
Call-site fixes: crates/vectord/src/refresh.rs:294 and
crates/vectord/src/service.rs:244 both construct IndexMeta with a
fully-literal initializer; the new fields default to None at both
sites. One indentation cleanup in service.rs (a pre-existing visual
issue on id_prefix: None).
Workspace warnings remain at 0. touch_used() isn't wired into the
search hot path yet; that lands in a follow-up commit once the search
handlers can adopt it without a broader refactor.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
crates/vectord/src/refresh.rs (316 lines, 12 KiB, Rust)
//! Phase C: Decoupled embedding refresh.
//!
//! When a dataset's row-level data changes, the vector index it feeds is
//! stale. Historically we coupled ingest and embedding — writing new rows
//! also re-embedded every row, which is fine at 500 rows but blows up at
//! 100K+. The llms3.com architecture calls out "asynchronous vector
//! refresh cycles independent from transactional mutations" as the right
//! pattern.
//!
//! This module implements the refresh side. The ingest path marks
//! embeddings stale (see catalogd::registry::mark_embeddings_stale); this
//! code clears that staleness by embedding only rows whose `doc_id` isn't
//! already in the existing vector index.
//!
//! Scope — MVP:
//! - Reads the dataset's Parquet, extracts (doc_id, text) pairs from named
//!   columns
//! - Loads existing embeddings via EmbeddingCache
//! - Filters to rows whose doc_id is NOT in the existing set
//! - Chunks, embeds via Ollama, appends to the index parquet
//! - Clears stale flag on success
//!
//! Not in MVP:
//! - UPDATE semantics (same doc_id, new content) — would need content-hash
//!   comparison per row
//! - Large-scale resilience (batching with checkpoints like the
//!   supervisor) — MVP does it inline
//! - Lance backend — ADR-019 makes this straightforward later; MVP stays
//!   on Parquet sidecar indexes

use std::collections::HashSet;
use std::sync::Arc;

use aibridge::client::{AiClient, EmbedRequest};
use arrow::array::{Array, Int32Array, Int64Array, StringArray};
use catalogd::registry::Registry;
use object_store::ObjectStore;

use crate::chunker::{self, TextChunk};
use crate::embedding_cache::EmbeddingCache;
use crate::index_registry::IndexRegistry;
use crate::store::{self, StoredEmbedding};

#[derive(Debug, Clone, serde::Deserialize)]
pub struct RefreshRequest {
    pub index_name: String,
    /// Column with the document id (row identity).
    pub id_column: String,
    /// Column with the text to embed.
    pub text_column: String,
    /// Chunk size (chars). Defaults to 500.
    #[serde(default)]
    pub chunk_size: Option<usize>,
    /// Overlap (chars). Defaults to 50.
    #[serde(default)]
    pub overlap: Option<usize>,
}

#[derive(Debug, Clone, serde::Serialize)]
pub struct RefreshResult {
    pub index_name: String,
    pub dataset_name: String,
    pub pre_existing_docs: usize,
    pub dataset_docs: usize,
    pub new_docs_embedded: usize,
    pub new_chunks_embedded: usize,
    pub total_embeddings_after: usize,
    pub duration_secs: f32,
    pub stale_cleared: bool,
}

/// Full refresh pipeline. Takes dataset_name as URL param, body has the
/// column selectors.
pub async fn refresh_index(
    dataset_name: &str,
    req: &RefreshRequest,
    store_: &Arc<dyn ObjectStore>,
    registry: &Registry,
    ai_client: &AiClient,
    embedding_cache: &EmbeddingCache,
    index_registry: &IndexRegistry,
) -> Result<RefreshResult, String> {
    let t0 = std::time::Instant::now();

    // 1. Find the dataset manifest, pull its object storage key
    let manifest = registry
        .get_by_name(dataset_name)
        .await
        .ok_or_else(|| format!("dataset not found: {dataset_name}"))?;

    if manifest.objects.is_empty() {
        return Err(format!("dataset '{dataset_name}' has no object references"));
    }

    // 2. Read dataset rows — extract (doc_id, text) pairs from the
    //    specified columns. Multi-object datasets: concat across all.
    let mut doc_id_to_text: Vec<(String, String)> = Vec::new();
    for obj in &manifest.objects {
        let data = storaged::ops::get(store_, &obj.key).await
            .map_err(|e| format!("read {}: {e}", obj.key))?;
        let (schema, batches) = shared::arrow_helpers::parquet_to_record_batches(&data)
            .map_err(|e| format!("parse {}: {e}", obj.key))?;

        let id_idx = schema.index_of(&req.id_column)
            .map_err(|_| format!("id column '{}' not in dataset schema", req.id_column))?;
        let text_idx = schema.index_of(&req.text_column)
            .map_err(|_| format!("text column '{}' not in dataset schema", req.text_column))?;

        for batch in &batches {
            let text_col = batch
                .column(text_idx)
                .as_any()
                .downcast_ref::<StringArray>()
                .ok_or_else(|| format!("text column '{}' is not Utf8", req.text_column))?;

            // Accept Utf8, Int32, or Int64 as id — that covers CSV-derived
            // and Postgres-imported schemas without forcing upstream casts.
            let id_reader: Box<dyn Fn(usize) -> Option<String>> = {
                let col = batch.column(id_idx);
                if let Some(s) = col.as_any().downcast_ref::<StringArray>() {
                    let s = s.clone();
                    Box::new(move |row| if s.is_null(row) { None } else { Some(s.value(row).to_string()) })
                } else if let Some(a) = col.as_any().downcast_ref::<Int32Array>() {
                    let a = a.clone();
                    Box::new(move |row| if a.is_null(row) { None } else { Some(a.value(row).to_string()) })
                } else if let Some(a) = col.as_any().downcast_ref::<Int64Array>() {
                    let a = a.clone();
                    Box::new(move |row| if a.is_null(row) { None } else { Some(a.value(row).to_string()) })
                } else {
                    return Err(format!(
                        "id column '{}' must be Utf8, Int32, or Int64 — got {}",
                        req.id_column,
                        col.data_type(),
                    ));
                }
            };

            for row in 0..batch.num_rows() {
                if text_col.is_null(row) { continue; }
                let Some(id) = id_reader(row) else { continue; };
                let text = text_col.value(row).to_string();
                if text.trim().is_empty() { continue; }
                doc_id_to_text.push((id, text));
            }
        }
    }
    let dataset_docs = doc_id_to_text.len();
    tracing::info!("refresh '{}': dataset has {dataset_docs} rows", dataset_name);

    // 3. Load existing embeddings (empty if no index yet)
    let existing: Vec<StoredEmbedding> = match embedding_cache.get_or_load(&req.index_name).await {
        Ok(arc) => arc.as_ref().clone(),
        Err(_) => Vec::new(), // first-time index build
    };
    let pre_existing_docs: HashSet<String> = existing
        .iter()
        .map(|e| e.doc_id.clone())
        .collect();
    let pre_existing_count = pre_existing_docs.len();

    // 4. Delta — rows whose doc_id isn't already indexed
    let new_rows: Vec<(String, String)> = doc_id_to_text
        .into_iter()
        .filter(|(id, _)| !pre_existing_docs.contains(id))
        .collect();
    let new_docs = new_rows.len();

    if new_docs == 0 {
        tracing::info!("refresh '{}': no new docs to embed", dataset_name);
        // Even on no-op, make sure index metadata is registered so
        // downstream discovery (profile activation, A/B search) sees
        // indexes built by earlier refresh calls that predated the
        // auto-register behavior.
        let _ = try_update_index_meta(index_registry, &req.index_name, existing.len()).await;
        registry.clear_embeddings_stale(dataset_name).await?;
        return Ok(RefreshResult {
            index_name: req.index_name.clone(),
            dataset_name: dataset_name.to_string(),
            pre_existing_docs: pre_existing_count,
            dataset_docs,
            new_docs_embedded: 0,
            new_chunks_embedded: 0,
            total_embeddings_after: existing.len(),
            duration_secs: t0.elapsed().as_secs_f32(),
            stale_cleared: true,
        });
    }

    // 5. Chunk the new rows
    let chunk_size = req.chunk_size.unwrap_or(500);
    let overlap = req.overlap.unwrap_or(50);
    let doc_ids: Vec<String> = new_rows.iter().map(|(id, _)| id.clone()).collect();
    let texts: Vec<String> = new_rows.iter().map(|(_, t)| t.clone()).collect();
    let chunks: Vec<TextChunk> = chunker::chunk_column(
        dataset_name, &doc_ids, &texts, chunk_size, overlap,
    );
    let new_chunks = chunks.len();
    tracing::info!("refresh '{}': {} new docs -> {} chunks", dataset_name, new_docs, new_chunks);

    // 6. Embed via Ollama (batched)
    let batch_size = 32;
    let mut all_vectors: Vec<Vec<f64>> = Vec::with_capacity(new_chunks);
    for batch in chunks.chunks(batch_size) {
        let batch_texts: Vec<String> = batch.iter().map(|c| c.text.clone()).collect();
        let resp = ai_client
            .embed(EmbedRequest { texts: batch_texts, model: None })
            .await
            .map_err(|e| format!("embed: {e}"))?;
        all_vectors.extend(resp.embeddings);
    }

    // 7. Combine existing + new and write back as a single parquet
    //    (MVP — append-as-rewrite. ADR-019 points to Lance for true native
    //    append; for the Parquet sidecar we rewrite at refresh time.)
    let mut new_stored: Vec<StoredEmbedding> = existing.clone();
    for (chunk, vector) in chunks.into_iter().zip(all_vectors.iter()) {
        new_stored.push(StoredEmbedding {
            source: chunk.source,
            doc_id: chunk.doc_id,
            chunk_idx: chunk.chunk_idx,
            chunk_text: chunk.text,
            vector: vector.iter().map(|&x| x as f32).collect(),
        });
    }

    // We have to reconstruct chunks + vectors for store_embeddings — but
    // store_embeddings takes &[TextChunk] and vectors separately. Convert
    // the combined StoredEmbedding back to that shape.
    let combined_chunks: Vec<TextChunk> = new_stored
        .iter()
        .map(|e| TextChunk {
            source: e.source.clone(),
            doc_id: e.doc_id.clone(),
            chunk_idx: e.chunk_idx,
            text: e.chunk_text.clone(),
        })
        .collect();
    let combined_vectors: Vec<Vec<f64>> = new_stored
        .iter()
        .map(|e| e.vector.iter().map(|&f| f as f64).collect())
        .collect();

    let _key = store::store_embeddings(
        store_, &req.index_name, &combined_chunks, &combined_vectors,
    ).await?;

    // 8. Evict embedding cache — next read will pick up the new file
    let _ = embedding_cache.evict(&req.index_name).await;

    // 9. Update index registry metadata (row/chunk counts; others unchanged
    //    so we don't disturb the existing entry if present)
    let _ = try_update_index_meta(index_registry, &req.index_name, new_stored.len()).await;

    // 10. Clear stale flag
    registry.clear_embeddings_stale(dataset_name).await?;

    let total = new_stored.len();
    Ok(RefreshResult {
        index_name: req.index_name.clone(),
        dataset_name: dataset_name.to_string(),
        pre_existing_docs: pre_existing_count,
        dataset_docs,
        new_docs_embedded: new_docs,
        new_chunks_embedded: new_chunks,
        total_embeddings_after: total,
        duration_secs: t0.elapsed().as_secs_f32(),
        stale_cleared: true,
    })
}

/// Best-effort refresh of index registry metadata.
/// - If the index already exists, bump the chunk_count.
/// - If it's brand new (first-time build via refresh), REGISTER it so
///   downstream discovery (profile activation, A/B search) can find it.
async fn try_update_index_meta(
    index_registry: &IndexRegistry,
    index_name: &str,
    chunk_count: usize,
) -> Result<(), String> {
    if let Some(mut meta) = index_registry.get(index_name).await {
        meta.chunk_count = chunk_count;
        return index_registry.register(meta).await;
    }
    // First-time registration — infer reasonable defaults. Convention:
    // index name `{source}_v1` or `{source}_vN` implies a source dataset
    // named by stripping the `_vN` suffix. Otherwise use the index name
    // as the source and let the caller patch later.
    let source = match index_name.rsplit_once('_') {
        Some((base, suffix)) if suffix.starts_with('v') && suffix[1..].chars().all(|c| c.is_ascii_digit()) => {
            base.to_string()
        }
        _ => index_name.to_string(),
    };
    let meta = crate::index_registry::IndexMeta {
        index_name: index_name.to_string(),
        source,
        model_name: "nomic-embed-text".to_string(),
        model_version: "latest".to_string(),
        dimensions: 768,
        chunk_count,
        doc_count: chunk_count,
        chunk_size: 500,
        overlap: 50,
        storage_key: format!("vectors/{index_name}.parquet"),
        created_at: chrono::Utc::now(),
        build_time_secs: 0.0,
        chunks_per_sec: 0.0,
        bucket: "primary".to_string(),
        vector_backend: shared::types::VectorBackend::Parquet,
        id_prefix: None,
        last_used: None,
        build_signature: None,
    };
    index_registry.register(meta).await
}