diff --git a/crates/vectord/src/service.rs b/crates/vectord/src/service.rs
index 5e40297..171f92b 100644
--- a/crates/vectord/src/service.rs
+++ b/crates/vectord/src/service.rs
@@ -108,6 +108,7 @@ pub fn router(state: VectorState) -> Router {
         .route("/lance/append/{index_name}", post(lance_append))
         .route("/lance/stats/{index_name}", get(lance_stats))
         .route("/lance/scalar-index/{index_name}/{column}", post(lance_build_scalar_index))
+        .route("/lance/recall/{index_name}", post(lance_recall_harness))
         .with_state(state)
 }
 
@@ -1509,6 +1510,111 @@ async fn lance_stats(
     }
 }
 
+/// Run an existing harness against Lance IVF_PQ and measure recall@k.
+/// Uses the same ground truth computed by brute-force cosine (the HNSW
+/// eval path). This closes ADR-019's explicit gap: "IVF_PQ recall not
+/// measured."
+#[derive(Deserialize)]
+struct LanceRecallRequest {
+    harness: String,
+    #[serde(default = "default_top_k")]
+    top_k: usize,
+}
+
+#[derive(serde::Serialize)]
+struct LanceRecallResult {
+    index_name: String,
+    harness: String,
+    queries: usize,
+    top_k: usize,
+    mean_recall: f32,
+    per_query: Vec<LanceRecallQuery>,
+    latency_p50_us: f32,
+    latency_p95_us: f32,
+    total_duration_secs: f32,
+}
+
+#[derive(serde::Serialize)]
+struct LanceRecallQuery {
+    query_id: String,
+    recall: f32,
+    latency_us: f32,
+    hits_returned: usize,
+}
+
+async fn lance_recall_harness(
+    State(state): State<VectorState>,
+    Path(index_name): Path<String>,
+    Json(req): Json<LanceRecallRequest>,
+) -> impl IntoResponse {
+    let t0 = std::time::Instant::now();
+
+    let harness_set = harness::EvalSet::load(&state.store, &req.harness).await
+        .map_err(|e| (StatusCode::NOT_FOUND, format!("harness: {e}")))?;
+    if !harness_set.ground_truth_built {
+        return Err((StatusCode::BAD_REQUEST,
+            "harness has no ground truth — run a regular /hnsw/trial first to compute it".into()));
+    }
+
+    let lance_store = state.lance.store_for(&index_name).await
+        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
+
+    let k = req.top_k;
+    let mut per_query = Vec::with_capacity(harness_set.queries.len());
+    let mut latencies: Vec<f32> = Vec::with_capacity(harness_set.queries.len());
+    let mut recalls: Vec<f32> = Vec::with_capacity(harness_set.queries.len());
+
+    for q in &harness_set.queries {
+        let qv = match &q.query_embedding {
+            Some(v) => v,
+            None => continue,
+        };
+        let gt = match &q.ground_truth {
+            Some(gt) => gt,
+            None => continue,
+        };
+
+        let qt0 = std::time::Instant::now();
+        let hits = lance_store.search(qv, k).await
+            .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("search: {e}")))?;
+        let lat_us = qt0.elapsed().as_micros() as f32;
+
+        let predicted: Vec<String> = hits.iter().map(|h| h.doc_id.clone()).collect();
+        let recall = harness::recall_at_k(&predicted, gt, k);
+
+        per_query.push(LanceRecallQuery {
+            query_id: q.id.clone(),
+            recall,
+            latency_us: lat_us,
+            hits_returned: hits.len(),
+        });
+        latencies.push(lat_us);
+        recalls.push(recall);
+    }
+
+    let mean_recall = if recalls.is_empty() { 0.0 } else {
+        recalls.iter().sum::<f32>() / recalls.len() as f32
+    };
+    latencies.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
+    let p = |pct: f32| -> f32 {
+        if latencies.is_empty() { return 0.0; }
+        let idx = ((latencies.len() as f32 - 1.0) * pct).round() as usize;
+        latencies[idx.min(latencies.len() - 1)]
+    };
+
+    Ok(Json(LanceRecallResult {
+        index_name,
+        harness: req.harness,
+        queries: per_query.len(),
+        top_k: k,
+        mean_recall,
+        per_query,
+        latency_p50_us: p(0.50),
+        latency_p95_us: p(0.95),
+        total_duration_secs: t0.elapsed().as_secs_f32(),
+    }))
+}
+
 /// Build a scalar btree index on a column (typically `doc_id`). Makes
 /// filter-pushdown queries O(log N) instead of full-fragment scan.
 async fn lance_build_scalar_index(