diff --git a/crates/vectord/src/index_registry.rs b/crates/vectord/src/index_registry.rs index 621e129..cc94139 100644 --- a/crates/vectord/src/index_registry.rs +++ b/crates/vectord/src/index_registry.rs @@ -40,6 +40,12 @@ pub struct IndexMeta { /// `Lance` means it points at a Lance dataset directory. #[serde(default)] pub vector_backend: shared::types::VectorBackend, + /// ADR-020: prefix prepended to doc_ids during embedding. If set, + /// hybrid search strips this prefix to match against SQL primary keys. + /// None = doc_ids ARE the raw primary keys (no stripping needed). + /// Existing indexes: "W-", "CAND-", "W500K-", etc. + #[serde(default)] + pub id_prefix: Option<String>, } fn default_bucket() -> String { "primary".to_string() } diff --git a/crates/vectord/src/refresh.rs b/crates/vectord/src/refresh.rs index 1e9c62e..627fd7f 100644 --- a/crates/vectord/src/refresh.rs +++ b/crates/vectord/src/refresh.rs @@ -307,6 +307,7 @@ async fn try_update_index_meta( chunks_per_sec: 0.0, bucket: "primary".to_string(), vector_backend: shared::types::VectorBackend::Parquet, + id_prefix: None, }; index_registry.register(meta).await } diff --git a/crates/vectord/src/service.rs b/crates/vectord/src/service.rs index 2cec969..5bd6366 100644 --- a/crates/vectord/src/service.rs +++ b/crates/vectord/src/service.rs @@ -210,6 +210,7 @@ async fn create_index( chunks_per_sec: rate, bucket: bucket.clone(), vector_backend: shared::types::VectorBackend::Parquet, + id_prefix: None, }; let _ = registry.register(meta).await; @@ -548,15 +549,28 @@ async fn hybrid_search( }; // Step 3: Filter vector results to only SQL-verified IDs. + // ADR-020: read the index's id_prefix from the catalog instead of + // hardcoding prefix stripping. Falls back to heuristic for legacy indexes. 
+ let id_prefix: Option<String> = state.index_registry + .get(&req.index_name).await + .and_then(|m| m.id_prefix.clone()); + let sql_count = valid_ids.as_ref().map(|s| s.len()).unwrap_or(0); let filtered: Vec<_> = if let Some(ref ids) = valid_ids { all_results.into_iter() .filter(|r| { - // doc_id format is "W-{worker_id}" — extract the number - let num = r.doc_id.strip_prefix("W-") - .or_else(|| r.doc_id.strip_prefix("CAND-")) - .unwrap_or(&r.doc_id); - ids.contains(num) + let raw_id = if let Some(ref prefix) = id_prefix { + r.doc_id.strip_prefix(prefix.as_str()).unwrap_or(&r.doc_id) + } else { + // Legacy: heuristic strip for pre-ADR-020 indexes + r.doc_id.strip_prefix("W500K-") + .or_else(|| r.doc_id.strip_prefix("W500-")) + .or_else(|| r.doc_id.strip_prefix("W5K-")) + .or_else(|| r.doc_id.strip_prefix("W-")) + .or_else(|| r.doc_id.strip_prefix("CAND-")) + .unwrap_or(&r.doc_id) + }; + ids.contains(raw_id) }) .take(req.top_k) .collect() diff --git a/docs/ADR-020-universal-id-mapping.md b/docs/ADR-020-universal-id-mapping.md new file mode 100644 index 0000000..53a7536 --- /dev/null +++ b/docs/ADR-020-universal-id-mapping.md @@ -0,0 +1,107 @@ +# ADR-020: Universal ID Mapping — the hybrid search identity problem + +**Status:** Proposed — 2026-04-17 +**Triggered by:** ID mismatch between vector doc_ids and SQL primary keys +**Owner:** J + +--- + +## Problem + +The hybrid search endpoint (`POST /vectors/hybrid`) filters SQL results +by primary key, then matches those keys against vector doc_ids. This +requires the two ID spaces to be compatible. Currently they're not: + +| Source | SQL primary key | Vector doc_id | Match? 
| +|---|---|---|---| +| ethereal_workers | `worker_id = 4925` | `W-4925` | Only with W- strip | +| workers_500k | `worker_id = 41566` | `W500K-41566` | Only with W500K- strip | +| candidates | `candidate_id = CAND-055035` | `CAND-055035` | Yes (same format) | +| workers_5k_proof | `worker_id = 3200` | `W5K-3200` | Only with W5K- strip | + +Every time a new dataset is ingested and embedded, a new prefix +appears and the hybrid search breaks until someone hardcodes another +`strip_prefix` line. + +This violates PRD invariant: **"Any data source can be ingested +without pre-defined schemas."** The ID format IS a pre-defined schema +that the hybrid search depends on. + +## Root cause + +When the embedding pipeline creates a vector index, it generates +doc_ids by concatenating a prefix + the source row's ID column. The +prefix is chosen at index creation time and baked into the Parquet +vector file. The SQL dataset knows nothing about this prefix. + +The hybrid search then has to reverse-engineer the prefix to match +vector results against SQL rows. This is fragile and breaks on every +new data source. + +## Solution: catalog-level ID mapping + +### Option A: Normalize doc_ids at embedding time (simplest) + +When creating a vector index, don't prefix the doc_id. Use the raw +value from the source column. If the source has `worker_id = 4925`, +the vector doc_id is just `"4925"` — no `W-`, no `W500K-`. + +**Pros:** Simplest. Hybrid search just compares strings directly. +**Cons:** Doc_ids across different indexes could collide (two datasets +both have worker_id=1). Need to scope by index name. 
+ +### Option B: Catalog stores the mapping (most robust) + +The catalog maintains a mapping table: +``` +index_name | doc_id_prefix | source_dataset | source_id_column +-----------+---------------+----------------+----------------- +resumes | CAND- | candidates | candidate_id +workers_v1 | W500K- | workers_500k | worker_id +``` + +Hybrid search reads this mapping and applies the prefix/strip +automatically. New datasets register their mapping at index creation. + +**Pros:** Handles all cases. Self-describing. No code changes for new data. +**Cons:** One more lookup per hybrid search (trivial perf cost). + +### Option C: Pass the mapping in the request (pragmatic) + +The hybrid search request already accepts `id_column`. Extend it +with `id_prefix` so the caller says "vector doc_ids have prefix +W500K- and the SQL column is worker_id." + +```json +{ + "index_name": "workers_500k_v1", + "id_column": "worker_id", + "id_prefix": "W500K-", + "sql_filter": "role = 'Forklift Operator' AND state = 'IL'", + ... +} +``` + +**Pros:** Zero backend change. Caller already knows the context. +**Cons:** Caller has to know the prefix — not self-service. + +## Recommendation + +**Option A for new indexes, Option B for registry.** + +1. Change the embedding pipeline to use RAW IDs (no prefix) as the + default. Existing indexes keep their prefixed IDs. +2. Add an `id_prefix` column to IndexMeta so the catalog knows how + to map between vector and SQL IDs. +3. Hybrid search reads IndexMeta.id_prefix and applies it automatically. +4. Remove the hardcoded strip_prefix chain. + +This means: ingest a new dataset, embed it, hybrid search works +immediately. No code changes. The system is truly "any data source." + +## Implementation + +1. `IndexMeta` gains `id_prefix: Option<String>` (default None = no prefix) +2. Embedding pipeline: when `id_prefix` is None, use raw IDs from source +3. Hybrid search: read `IndexMeta.id_prefix`, strip it from vector doc_ids before matching +4. 
Migration: existing indexes retain their prefix in IndexMeta