From fd4b6836ae5156a76603c817d3c008e78458843a Mon Sep 17 00:00:00 2001 From: root Date: Thu, 16 Apr 2026 20:52:34 -0500 Subject: [PATCH] =?UTF-8?q?IVF=5FPQ=20recall=20harness=20=E2=80=94=20close?= =?UTF-8?q?s=20ADR-019's=20explicit=20measurement=20gap?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit POST /vectors/lance/recall/{index} runs an existing harness through Lance IVF_PQ search and measures recall@k against brute-force ground truth. Uses the same EvalSet + ground_truth infrastructure as the HNSW trial system — no new harness format needed. First real measurement on resumes_100k_v2 (100K × 768d, 20 queries): IVF_PQ (316 partitions, 8 bits, 48 subvectors): recall@10 = 0.805 For comparison — HNSW ec=80 es=30: recall@10 = 1.000 ADR-019 predicted "likely 0.85-0.95" — actual is 0.805. Slightly below, but now the harness exists to iterate: increase partitions, try ivf_hnsw_pq, tune subvectors. The measurement infrastructure is the deliverable, not any specific recall target. Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/vectord/src/service.rs | 106 ++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/crates/vectord/src/service.rs b/crates/vectord/src/service.rs index 5e40297..171f92b 100644 --- a/crates/vectord/src/service.rs +++ b/crates/vectord/src/service.rs @@ -108,6 +108,7 @@ pub fn router(state: VectorState) -> Router { .route("/lance/append/{index_name}", post(lance_append)) .route("/lance/stats/{index_name}", get(lance_stats)) .route("/lance/scalar-index/{index_name}/{column}", post(lance_build_scalar_index)) + .route("/lance/recall/{index_name}", post(lance_recall_harness)) .with_state(state) } @@ -1509,6 +1510,111 @@ async fn lance_stats( } } +/// Run an existing harness against Lance IVF_PQ and measure recall@k. +/// Uses the same ground truth computed by brute-force cosine (the HNSW +/// eval path). This closes ADR-019's explicit gap: "IVF_PQ recall not +/// measured." +#[derive(Deserialize)] +struct LanceRecallRequest { + harness: String, + #[serde(default = "default_top_k")] + top_k: usize, +} + +#[derive(serde::Serialize)] +struct LanceRecallResult { + index_name: String, + harness: String, + queries: usize, + top_k: usize, + mean_recall: f32, + per_query: Vec, + latency_p50_us: f32, + latency_p95_us: f32, + total_duration_secs: f32, +} + +#[derive(serde::Serialize)] +struct LanceRecallQuery { + query_id: String, + recall: f32, + latency_us: f32, + hits_returned: usize, +} + +async fn lance_recall_harness( + State(state): State, + Path(index_name): Path, + Json(req): Json, +) -> impl IntoResponse { + let t0 = std::time::Instant::now(); + + let harness_set = harness::EvalSet::load(&state.store, &req.harness).await + .map_err(|e| (StatusCode::NOT_FOUND, format!("harness: {e}")))?; + if !harness_set.ground_truth_built { + return Err((StatusCode::BAD_REQUEST, + "harness has no ground truth — run a regular /hnsw/trial first to compute it".into())); + } + + let lance_store = state.lance.store_for(&index_name).await + .map_err(|e| (StatusCode::BAD_REQUEST, e))?; + + let k = req.top_k; + let mut per_query = Vec::with_capacity(harness_set.queries.len()); + let mut latencies: Vec = Vec::with_capacity(harness_set.queries.len()); + let mut recalls: Vec = Vec::with_capacity(harness_set.queries.len()); + + for q in &harness_set.queries { + let qv = match &q.query_embedding { + Some(v) => v, + None => continue, + }; + let gt = match &q.ground_truth { + Some(gt) => gt, + None => continue, + }; + + let qt0 = std::time::Instant::now(); + let hits = lance_store.search(qv, k).await + .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("search: {e}")))?; + let lat_us = qt0.elapsed().as_micros() as f32; + + let predicted: Vec = hits.iter().map(|h| h.doc_id.clone()).collect(); + let recall = harness::recall_at_k(&predicted, gt, k); + + per_query.push(LanceRecallQuery { + query_id: q.id.clone(), + recall, + latency_us: lat_us, + hits_returned: hits.len(), + }); + latencies.push(lat_us); + recalls.push(recall); + } + + let mean_recall = if recalls.is_empty() { 0.0 } else { + recalls.iter().sum::() / recalls.len() as f32 + }; + latencies.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + let p = |pct: f32| -> f32 { + if latencies.is_empty() { return 0.0; } + let idx = ((latencies.len() as f32 - 1.0) * pct).round() as usize; + latencies[idx.min(latencies.len() - 1)] + }; + + Ok(Json(LanceRecallResult { + index_name, + harness: req.harness, + queries: per_query.len(), + top_k: k, + mean_recall, + per_query, + latency_p50_us: p(0.50), + latency_p95_us: p(0.95), + total_duration_secs: t0.elapsed().as_secs_f32(), + })) +} + /// Build a scalar btree index on a column (typically `doc_id`). Makes /// filter-pushdown queries O(log N) instead of full-fragment scan. async fn lance_build_scalar_index(