IVF_PQ recall harness — closes ADR-019's explicit measurement gap

POST /vectors/lance/recall/{index} runs an existing harness through
Lance IVF_PQ search and measures recall@k against brute-force ground
truth. Uses the same EvalSet + ground_truth infrastructure as the
HNSW trial system — no new harness format needed.

First real measurement on resumes_100k_v2 (100K × 768d, 20 queries):
  IVF_PQ (316 partitions, 8 bits, 48 subvectors): recall@10 = 0.805
  For comparison — HNSW ec=80 es=30: recall@10 = 1.000

ADR-019 predicted "likely 0.85-0.95" — actual is 0.805. Slightly
below, but now the harness exists to iterate: increase partitions,
try ivf_hnsw_pq, tune subvectors. The measurement infrastructure
is the deliverable, not any specific recall target.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-04-16 20:52:34 -05:00
parent 59e72fa566
commit fd4b6836ae

View File

@ -108,6 +108,7 @@ pub fn router(state: VectorState) -> Router {
.route("/lance/append/{index_name}", post(lance_append))
.route("/lance/stats/{index_name}", get(lance_stats))
.route("/lance/scalar-index/{index_name}/{column}", post(lance_build_scalar_index))
.route("/lance/recall/{index_name}", post(lance_recall_harness))
.with_state(state)
}
@ -1509,6 +1510,111 @@ async fn lance_stats(
}
}
/// Run an existing harness against Lance IVF_PQ and measure recall@k.
/// Uses the same ground truth computed by brute-force cosine (the HNSW
/// eval path). This closes ADR-019's explicit gap: "IVF_PQ recall not
/// measured."
#[derive(Deserialize)]
struct LanceRecallRequest {
harness: String,
#[serde(default = "default_top_k")]
top_k: usize,
}
#[derive(serde::Serialize)]
struct LanceRecallResult {
index_name: String,
harness: String,
queries: usize,
top_k: usize,
mean_recall: f32,
per_query: Vec<LanceRecallQuery>,
latency_p50_us: f32,
latency_p95_us: f32,
total_duration_secs: f32,
}
#[derive(serde::Serialize)]
struct LanceRecallQuery {
query_id: String,
recall: f32,
latency_us: f32,
hits_returned: usize,
}
async fn lance_recall_harness(
State(state): State<VectorState>,
Path(index_name): Path<String>,
Json(req): Json<LanceRecallRequest>,
) -> impl IntoResponse {
let t0 = std::time::Instant::now();
let harness_set = harness::EvalSet::load(&state.store, &req.harness).await
.map_err(|e| (StatusCode::NOT_FOUND, format!("harness: {e}")))?;
if !harness_set.ground_truth_built {
return Err((StatusCode::BAD_REQUEST,
"harness has no ground truth — run a regular /hnsw/trial first to compute it".into()));
}
let lance_store = state.lance.store_for(&index_name).await
.map_err(|e| (StatusCode::BAD_REQUEST, e))?;
let k = req.top_k;
let mut per_query = Vec::with_capacity(harness_set.queries.len());
let mut latencies: Vec<f32> = Vec::with_capacity(harness_set.queries.len());
let mut recalls: Vec<f32> = Vec::with_capacity(harness_set.queries.len());
for q in &harness_set.queries {
let qv = match &q.query_embedding {
Some(v) => v,
None => continue,
};
let gt = match &q.ground_truth {
Some(gt) => gt,
None => continue,
};
let qt0 = std::time::Instant::now();
let hits = lance_store.search(qv, k).await
.map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("search: {e}")))?;
let lat_us = qt0.elapsed().as_micros() as f32;
let predicted: Vec<String> = hits.iter().map(|h| h.doc_id.clone()).collect();
let recall = harness::recall_at_k(&predicted, gt, k);
per_query.push(LanceRecallQuery {
query_id: q.id.clone(),
recall,
latency_us: lat_us,
hits_returned: hits.len(),
});
latencies.push(lat_us);
recalls.push(recall);
}
let mean_recall = if recalls.is_empty() { 0.0 } else {
recalls.iter().sum::<f32>() / recalls.len() as f32
};
latencies.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let p = |pct: f32| -> f32 {
if latencies.is_empty() { return 0.0; }
let idx = ((latencies.len() as f32 - 1.0) * pct).round() as usize;
latencies[idx.min(latencies.len() - 1)]
};
Ok(Json(LanceRecallResult {
index_name,
harness: req.harness,
queries: per_query.len(),
top_k: k,
mean_recall,
per_query,
latency_p50_us: p(0.50),
latency_p95_us: p(0.95),
total_duration_secs: t0.elapsed().as_secs_f32(),
}))
}
/// Build a scalar btree index on a column (typically `doc_id`). Makes
/// filter-pushdown queries O(log N) instead of full-fragment scan.
async fn lance_build_scalar_index(