From fd4b6836ae5156a76603c817d3c008e78458843a Mon Sep 17 00:00:00 2001
From: root <root@island37.com>
Date: Thu, 16 Apr 2026 20:52:34 -0500
Subject: [PATCH] =?UTF-8?q?IVF=5FPQ=20recall=20harness=20=E2=80=94=20close?=
 =?UTF-8?q?s=20ADR-019's=20explicit=20measurement=20gap?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

POST /vectors/lance/recall/{index_name} runs an existing harness through
Lance IVF_PQ search and measures recall@k against brute-force ground
truth. Uses the same EvalSet + ground_truth infrastructure as the
HNSW trial system — no new harness format needed. The harness must
already have ground truth built (run a regular /hnsw/trial first);
otherwise the endpoint returns 400.

First real measurement on resumes_100k_v2 (100K × 768d, 20 queries):
  IVF_PQ (316 partitions, 8 bits, 48 subvectors): recall@10 = 0.805
  For comparison — HNSW ec=80 es=30: recall@10 = 1.000

ADR-019 predicted "likely 0.85-0.95" — actual is 0.805. Slightly
below, but now the harness exists to iterate: increase partitions,
try ivf_hnsw_pq, tune subvectors. The measurement infrastructure
is the deliverable, not any specific recall target.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 crates/vectord/src/service.rs | 106 ++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
diff --git a/crates/vectord/src/service.rs b/crates/vectord/src/service.rs
index 5e40297..171f92b 100644
--- a/crates/vectord/src/service.rs
+++ b/crates/vectord/src/service.rs
@@ -108,6 +108,7 @@ pub fn router(state: VectorState) -> Router {
         .route("/lance/append/{index_name}", post(lance_append))
         .route("/lance/stats/{index_name}", get(lance_stats))
         .route("/lance/scalar-index/{index_name}/{column}", post(lance_build_scalar_index))
+        .route("/lance/recall/{index_name}", post(lance_recall_harness))
         .with_state(state)
 }
 
@@ -1509,6 +1510,111 @@ async fn lance_stats(
     }
 }
 
+/// Request body for the `/lance/recall/{index_name}` route: names the stored
+/// harness whose queries are replayed through Lance search so recall@k can
+/// be scored against that harness's ground truth. This closes ADR-019's
+/// explicit gap: "IVF_PQ recall not measured."
+#[derive(Deserialize)]
+struct LanceRecallRequest {
+    harness: String, // harness name handed to `harness::EvalSet::load`
+    #[serde(default = "default_top_k")]
+    top_k: usize, // k for search and recall@k; `default_top_k` fills it when omitted
+}
+
+#[derive(serde::Serialize)] // JSON response body built by `lance_recall_harness`
+struct LanceRecallResult {
+    index_name: String, // index name taken from the request path
+    harness: String, // harness name echoed back from the request
+    queries: usize, // number of queries actually scored (skipped ones excluded)
+    top_k: usize, // k used for both the search and recall@k
+    mean_recall: f32, // arithmetic mean of per-query recall; 0.0 if none scored
+    per_query: Vec<LanceRecallQuery>, // one entry per scored query, in harness order
+    latency_p50_us: f32, // 50th-percentile per-search latency, microseconds
+    latency_p95_us: f32, // 95th-percentile per-search latency, microseconds
+    total_duration_secs: f32, // elapsed time of the whole handler, seconds
+}
+
+#[derive(serde::Serialize)] // one row of `LanceRecallResult::per_query`
+struct LanceRecallQuery {
+    query_id: String, // id of the harness query (`q.id`)
+    recall: f32, // value returned by `harness::recall_at_k` for this query
+    latency_us: f32, // elapsed time of the single `search` call, microseconds
+    hits_returned: usize, // number of hits the search returned
+}
+
+/// Replay a stored harness through Lance search and report recall@k.
+///
+/// Every query that has both an embedding and ground truth is searched and
+/// timed; the rest are skipped and not counted in `queries`. Fails with 400
+/// for `top_k == 0`, unbuilt ground truth, or a `store_for` error; 404 when
+/// the harness cannot be loaded; 500 when a search fails.
+async fn lance_recall_harness(
+    State(state): State<VectorState>,
+    Path(index_name): Path<String>,
+    Json(req): Json<LanceRecallRequest>,
+) -> impl IntoResponse {
+    let t0 = std::time::Instant::now();
+
+    // recall@0 is undefined, so reject it before touching storage.
+    let k = req.top_k;
+    if k == 0 {
+        return Err((StatusCode::BAD_REQUEST, "top_k must be at least 1".into()));
+    }
+
+    let harness_set = harness::EvalSet::load(&state.store, &req.harness).await
+        .map_err(|e| (StatusCode::NOT_FOUND, format!("harness: {e}")))?;
+    if !harness_set.ground_truth_built {
+        return Err((StatusCode::BAD_REQUEST,
+            "harness has no ground truth — run a regular /hnsw/trial first to compute it".into()));
+    }
+
+    let lance_store = state.lance.store_for(&index_name).await
+        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
+
+    let mut per_query: Vec<LanceRecallQuery> = Vec::with_capacity(harness_set.queries.len());
+    for q in &harness_set.queries {
+        let (Some(qv), Some(gt)) = (&q.query_embedding, &q.ground_truth) else { continue };
+        let qt0 = std::time::Instant::now();
+        let hits = lance_store.search(qv, k).await
+            .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("search: {e}")))?;
+        let lat_us = qt0.elapsed().as_micros() as f32;
+
+        let predicted: Vec<String> = hits.iter().map(|h| h.doc_id.clone()).collect();
+        let recall = harness::recall_at_k(&predicted, gt, k);
+
+        per_query.push(LanceRecallQuery {
+            query_id: q.id.clone(),
+            recall,
+            latency_us: lat_us,
+            hits_returned: hits.len(),
+        });
+    }
+
+    let mean_recall = if per_query.is_empty() { 0.0 } else {
+        per_query.iter().map(|r| r.recall).sum::<f32>() / per_query.len() as f32
+    };
+    let mut latencies: Vec<f32> = per_query.iter().map(|r| r.latency_us).collect();
+    latencies.sort_unstable_by(f32::total_cmp);
+    // Picks the sorted latency at the rounded index (len - 1) * pct; 0.0 when nothing ran.
+    let p = |pct: f32| -> f32 {
+        if latencies.is_empty() { return 0.0; }
+        let idx = ((latencies.len() as f32 - 1.0) * pct).round() as usize;
+        latencies[idx.min(latencies.len() - 1)]
+    };
+
+    Ok(Json(LanceRecallResult {
+        index_name,
+        harness: req.harness,
+        queries: per_query.len(),
+        top_k: k,
+        mean_recall,
+        per_query,
+        latency_p50_us: p(0.50),
+        latency_p95_us: p(0.95),
+        total_duration_secs: t0.elapsed().as_secs_f32(),
+    }))
+}
+
 /// Build a scalar btree index on a column (typically `doc_id`). Makes
 /// filter-pushdown queries O(log N) instead of full-fragment scan.
 async fn lance_build_scalar_index(