From 044650a1da534184220621d8bcc8f4c608c95a33 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 2 May 2026 22:19:16 -0500 Subject: [PATCH] =?UTF-8?q?lance-bench:=20also=20build=20doc=5Fid=20btree?= =?UTF-8?q?=20post-IVF=20=E2=80=94=20match=20gateway's=20migrate=20behavio?= =?UTF-8?q?r?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bench's own measure_random_access_lance uses take(row_position) — doesn't need the btree. But datasets written by this bench are commonly queried via /vectors/lance/doc// downstream, and without the btree that path falls back to a full table scan. Building inline keeps bench-produced datasets immediately production-shape and removes a footgun (the same one that made scale_test_10m's doc-fetch ~100ms until commit 5d30b3d fixed it via the migrate handler path). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/lance-bench/src/main.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/crates/lance-bench/src/main.rs b/crates/lance-bench/src/main.rs index 7216d37..91af48d 100644 --- a/crates/lance-bench/src/main.rs +++ b/crates/lance-bench/src/main.rs @@ -456,6 +456,26 @@ async fn build_lance_vector_index(path: &str, _dims: usize) -> Result<()> { .await .context("create_index")?; + // Also build the scalar btree on doc_id. This bench's + // measure_random_access_lance uses take(row_position) which doesn't + // need the btree, but the dataset this bench writes is also queried + // downstream by /vectors/lance/doc// (the production + // lookup path) — without this index that path falls back to a full + // table scan. Cheap to build (~1.2s on 10M rows) and matches the + // gateway's lance_migrate handler behavior so bench-produced datasets + // are immediately production-shape. + use lance_index::scalar::ScalarIndexParams; + dataset + .create_index( + &["doc_id"], + IndexType::Scalar, + Some("doc_id_btree".into()), + &ScalarIndexParams::default(), + true, + ) + .await + .context("create_index doc_id btree")?; + Ok(()) }