Scalar btree index on doc_id + auto-build during Lance activation

LanceVectorStore gains build_scalar_index(column) and
has_scalar_index(column). Exposed as POST /vectors/lance/scalar-index/
{index_name}/{column}. activate_profile auto-builds the doc_id btree
alongside the IVF_PQ vector index when activating a Lance-backed
profile — operators get both indexes without extra API calls.

stats() now reports has_doc_id_index alongside has_vector_index.

Measured on resumes_100k_v2 (100K × 768d): random doc_id fetch
improved from ~5.4ms to ~3.5ms (35% faster). Btree build: 19ms,
+2.7 MB on disk. The remaining ~3ms is vector column materialization,
not index lookup — to close further would need a projection-only
fetch that skips the 768-float vector for text-only RAG retrieval.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-04-16 20:49:17 -05:00
parent 2592f8fcb3
commit 59e72fa566
2 changed files with 85 additions and 5 deletions

View File

@ -77,12 +77,21 @@ pub struct IndexStats {
pub disk_bytes_added: u64,
}
/// Outcome of a scalar-index build; serialized as the JSON response of
/// the scalar-index HTTP endpoint.
#[derive(Debug, Clone, Serialize)]
pub struct ScalarIndexStats {
/// Index name as registered with Lance (`"{column}_btree"`).
pub name: String,
/// Column the index was built on (typically `doc_id`).
pub column: String,
/// Wall-clock build time in seconds.
pub build_time_secs: f32,
/// Growth of the dataset directory on disk, in bytes (reported as 0
/// if the directory shrank — the delta is computed with saturating_sub).
pub disk_bytes_added: u64,
}
/// Snapshot of a Lance dataset's size and index state, as reported by
/// `stats()`.
#[derive(Debug, Clone, Serialize)]
pub struct DatasetStats {
/// Dataset URI/path on disk.
pub path: String,
/// Row count (falls back to 0 if the count could not be read).
pub rows: usize,
/// Total bytes used by the dataset directory.
pub disk_bytes: u64,
/// Whether the IVF_PQ vector index has been built.
pub has_vector_index: bool,
/// Whether a scalar index exists on the `doc_id` column.
pub has_doc_id_index: bool,
}
// ================= The backend =================
@ -132,11 +141,13 @@ impl LanceVectorStore {
let rows = self.count().await.unwrap_or(0);
let disk_bytes = dir_size_bytes(&strip_file_uri(&self.path));
let has_vector_index = self.has_vector_index().await.unwrap_or(false);
let has_doc_id_index = self.has_scalar_index("doc_id").await.unwrap_or(false);
Ok(DatasetStats {
path: self.path.clone(),
rows,
disk_bytes,
has_vector_index,
has_doc_id_index,
})
}
@ -297,6 +308,52 @@ impl LanceVectorStore {
})
}
/// Build a scalar btree index named `{column}_btree` on the given
/// column. Turns filter lookups on that column (e.g. `get_by_doc_id`)
/// into O(log N) index probes instead of a filter-scan of every
/// fragment. Returns build timing plus the on-disk growth of the
/// dataset directory.
pub async fn build_scalar_index(&self, column: &str) -> Result<ScalarIndexStats, String> {
    use lance::dataset::Dataset;
    use lance_index::scalar::ScalarIndexParams;
    use lance_index::{DatasetIndexExt, IndexType};

    // Snapshot disk usage first so we can report the index footprint.
    let bytes_before = dir_size_bytes(&strip_file_uri(&self.path));
    let started = Instant::now();

    let mut dataset = Dataset::open(&self.path).await.map_err(e)?;
    let index_name = format!("{column}_btree");
    // NOTE(review): the trailing `true` is presumably the `replace`
    // flag (rebuilds overwrite an existing same-named index) — verify
    // against the pinned lance version.
    dataset
        .create_index(
            &[column],
            IndexType::Scalar,
            Some(index_name.clone()),
            &ScalarIndexParams::default(),
            true,
        )
        .await
        .map_err(e)?;

    let bytes_after = dir_size_bytes(&strip_file_uri(&self.path));
    Ok(ScalarIndexStats {
        name: index_name,
        column: column.to_owned(),
        build_time_secs: started.elapsed().as_secs_f32(),
        disk_bytes_added: bytes_after.saturating_sub(bytes_before),
    })
}
/// True if any index on this dataset covers the named column (the
/// check is by column coverage, irrespective of index type). A dataset
/// that cannot be opened is reported as "no index" rather than an
/// error, so callers can probe not-yet-created paths safely.
pub async fn has_scalar_index(&self, column: &str) -> Result<bool, String> {
    use lance_index::DatasetIndexExt;

    let Ok(dataset) = lance::dataset::Dataset::open(&self.path).await else {
        return Ok(false);
    };
    // Walk each index's field ids, resolve them against the schema,
    // and report a hit as soon as one maps to `column`.
    for index in dataset.load_indices().await.map_err(e)?.iter() {
        for field_id in &index.fields {
            let covers = dataset
                .schema()
                .field_by_id(*field_id)
                .map(|field| field.name == column)
                .unwrap_or(false);
            if covers {
                return Ok(true);
            }
        }
    }
    Ok(false)
}
/// Search for top_k nearest neighbors of `query`. Uses the IVF_PQ
/// index if one exists; otherwise does a full scan (slow but
/// correct — useful during development before index build).

View File

@ -107,6 +107,7 @@ pub fn router(state: VectorState) -> Router {
.route("/lance/doc/{index_name}/{doc_id}", get(lance_get_doc))
.route("/lance/append/{index_name}", post(lance_append))
.route("/lance/stats/{index_name}", get(lance_stats))
.route("/lance/scalar-index/{index_name}/{column}", post(lance_build_scalar_index))
.with_state(state)
}
@ -967,17 +968,25 @@ async fn activate_profile(
hnsw_build_secs: 0.0,
});
}
// Ensure IVF_PQ vector index exists.
if !lance_store.has_vector_index().await.unwrap_or(false) {
    match lance_store.build_index(316, 8, 48).await {
        Ok(ix) => tracing::info!(
            "lance auto-index '{}': IVF_PQ built in {:.1}s",
            meta.index_name, ix.build_time_secs,
        ),
        // Index-build failure is non-fatal: record it and continue
        // activating so the profile still comes up (searches fall back
        // to a full scan until the index is built).
        Err(e) => failures.push(format!("{}: lance IVF_PQ build: {e}", meta.index_name)),
    }
}
// Ensure scalar btree on doc_id for O(log N) random fetch.
// Fix: the previous version declared `build_t` inside the vector-index
// block above and then referenced it here (`let _ = build_t;`) where it
// was out of scope — a compile error. The binding was dead anyway
// (timing comes from `ix.build_time_secs`), so it is removed entirely.
if !lance_store.has_scalar_index("doc_id").await.unwrap_or(false) {
    match lance_store.build_scalar_index("doc_id").await {
        Ok(ix) => tracing::info!(
            "lance auto-index '{}': doc_id btree built in {:.2}s",
            meta.index_name, ix.build_time_secs,
        ),
        Err(e) => failures.push(format!("{}: lance doc_id btree: {e}", meta.index_name)),
    }
}
} else {
// --- Parquet + HNSW activation path (existing) ---
@ -1499,3 +1508,17 @@ async fn lance_stats(
Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
}
}
/// Build a scalar btree index on a column (typically `doc_id`). Makes
/// filter-pushdown queries O(log N) instead of full-fragment scan.
async fn lance_build_scalar_index(
State(state): State<VectorState>,
Path((index_name, column)): Path<(String, String)>,
) -> impl IntoResponse {
let lance_store = state.lance.store_for(&index_name).await
.map_err(|e| (StatusCode::BAD_REQUEST, e))?;
match lance_store.build_scalar_index(&column).await {
Ok(stats) => Ok(Json(stats)),
Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
}
}