IVF_PQ recall harness — closes ADR-019's explicit measurement gap
POST /vectors/lance/recall/{index} runs an existing harness through
Lance IVF_PQ search and measures recall@k against brute-force ground
truth. Uses the same EvalSet + ground_truth infrastructure as the
HNSW trial system — no new harness format needed.
First real measurement on resumes_100k_v2 (100K × 768d, 20 queries):
IVF_PQ (316 partitions, 8 bits, 48 subvectors): recall@10 = 0.805
For comparison — HNSW ec=80 es=30: recall@10 = 1.000
ADR-019 predicted "likely 0.85-0.95" — actual is 0.805. Slightly
below, but now the harness exists to iterate: increase partitions,
try ivf_hnsw_pq, tune subvectors. The measurement infrastructure
is the deliverable, not any specific recall target.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
59e72fa566
commit
fd4b6836ae
@ -108,6 +108,7 @@ pub fn router(state: VectorState) -> Router {
|
||||
.route("/lance/append/{index_name}", post(lance_append))
|
||||
.route("/lance/stats/{index_name}", get(lance_stats))
|
||||
.route("/lance/scalar-index/{index_name}/{column}", post(lance_build_scalar_index))
|
||||
.route("/lance/recall/{index_name}", post(lance_recall_harness))
|
||||
.with_state(state)
|
||||
}
|
||||
|
||||
@ -1509,6 +1510,111 @@ async fn lance_stats(
|
||||
}
|
||||
}
|
||||
|
||||
/// Run an existing harness against Lance IVF_PQ and measure recall@k.
/// Uses the same ground truth computed by brute-force cosine (the HNSW
/// eval path). This closes ADR-019's explicit gap: "IVF_PQ recall not
/// measured."
#[derive(Deserialize)]
struct LanceRecallRequest {
    /// Name of the stored EvalSet harness to replay. The harness must
    /// already have ground truth built (i.e. a regular /hnsw/trial has
    /// been run against it), otherwise the handler rejects the request.
    harness: String,
    /// Number of nearest neighbours to request per query; when omitted
    /// from the request body, serde fills it via `default_top_k`.
    #[serde(default = "default_top_k")]
    top_k: usize,
}
|
||||
|
||||
/// Aggregate response for a Lance recall run: per-query details plus
/// mean recall and latency percentiles across the whole harness.
#[derive(serde::Serialize)]
struct LanceRecallResult {
    /// Lance index the queries were run against (from the URL path).
    index_name: String,
    /// Name of the harness that supplied queries and ground truth.
    harness: String,
    /// Number of queries actually evaluated — those with both a stored
    /// embedding and ground truth; others are silently skipped.
    queries: usize,
    /// k used for both the ANN search and the recall@k computation.
    top_k: usize,
    /// Arithmetic mean of per-query recall@k (0.0 when no queries ran).
    mean_recall: f32,
    /// One entry per evaluated query, in harness order.
    per_query: Vec<LanceRecallQuery>,
    /// Median single-query search latency, microseconds.
    latency_p50_us: f32,
    /// 95th-percentile single-query search latency, microseconds.
    latency_p95_us: f32,
    /// Wall-clock duration of the whole run (load + all searches), seconds.
    total_duration_secs: f32,
}
|
||||
|
||||
/// Recall and latency measured for a single harness query.
#[derive(serde::Serialize)]
struct LanceRecallQuery {
    /// Query id as stored in the harness.
    query_id: String,
    /// recall@k of the Lance results against brute-force ground truth.
    recall: f32,
    /// Lance search latency for this query, microseconds.
    latency_us: f32,
    /// Number of hits Lance actually returned (may be fewer than k).
    hits_returned: usize,
}
|
||||
|
||||
async fn lance_recall_harness(
|
||||
State(state): State<VectorState>,
|
||||
Path(index_name): Path<String>,
|
||||
Json(req): Json<LanceRecallRequest>,
|
||||
) -> impl IntoResponse {
|
||||
let t0 = std::time::Instant::now();
|
||||
|
||||
let harness_set = harness::EvalSet::load(&state.store, &req.harness).await
|
||||
.map_err(|e| (StatusCode::NOT_FOUND, format!("harness: {e}")))?;
|
||||
if !harness_set.ground_truth_built {
|
||||
return Err((StatusCode::BAD_REQUEST,
|
||||
"harness has no ground truth — run a regular /hnsw/trial first to compute it".into()));
|
||||
}
|
||||
|
||||
let lance_store = state.lance.store_for(&index_name).await
|
||||
.map_err(|e| (StatusCode::BAD_REQUEST, e))?;
|
||||
|
||||
let k = req.top_k;
|
||||
let mut per_query = Vec::with_capacity(harness_set.queries.len());
|
||||
let mut latencies: Vec<f32> = Vec::with_capacity(harness_set.queries.len());
|
||||
let mut recalls: Vec<f32> = Vec::with_capacity(harness_set.queries.len());
|
||||
|
||||
for q in &harness_set.queries {
|
||||
let qv = match &q.query_embedding {
|
||||
Some(v) => v,
|
||||
None => continue,
|
||||
};
|
||||
let gt = match &q.ground_truth {
|
||||
Some(gt) => gt,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let qt0 = std::time::Instant::now();
|
||||
let hits = lance_store.search(qv, k).await
|
||||
.map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("search: {e}")))?;
|
||||
let lat_us = qt0.elapsed().as_micros() as f32;
|
||||
|
||||
let predicted: Vec<String> = hits.iter().map(|h| h.doc_id.clone()).collect();
|
||||
let recall = harness::recall_at_k(&predicted, gt, k);
|
||||
|
||||
per_query.push(LanceRecallQuery {
|
||||
query_id: q.id.clone(),
|
||||
recall,
|
||||
latency_us: lat_us,
|
||||
hits_returned: hits.len(),
|
||||
});
|
||||
latencies.push(lat_us);
|
||||
recalls.push(recall);
|
||||
}
|
||||
|
||||
let mean_recall = if recalls.is_empty() { 0.0 } else {
|
||||
recalls.iter().sum::<f32>() / recalls.len() as f32
|
||||
};
|
||||
latencies.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||||
let p = |pct: f32| -> f32 {
|
||||
if latencies.is_empty() { return 0.0; }
|
||||
let idx = ((latencies.len() as f32 - 1.0) * pct).round() as usize;
|
||||
latencies[idx.min(latencies.len() - 1)]
|
||||
};
|
||||
|
||||
Ok(Json(LanceRecallResult {
|
||||
index_name,
|
||||
harness: req.harness,
|
||||
queries: per_query.len(),
|
||||
top_k: k,
|
||||
mean_recall,
|
||||
per_query,
|
||||
latency_p50_us: p(0.50),
|
||||
latency_p95_us: p(0.95),
|
||||
total_duration_secs: t0.elapsed().as_secs_f32(),
|
||||
}))
|
||||
}
|
||||
|
||||
/// Build a scalar btree index on a column (typically `doc_id`). Makes
|
||||
/// filter-pushdown queries O(log N) instead of full-fragment scan.
|
||||
async fn lance_build_scalar_index(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user