lakehouse/crates/vectord/src/autotune.rs
commit 5b1fcf6d27 Phase 28-36 body of work
Accumulated since a6f12e2 (Phase 21 Rust port + Phase 27 versioning):

- Phase 36: embed_semaphore on VectorState (permits=1) serializes
  seed embed calls — prevents sidecar socket collisions under
  concurrent /seed stress load
- Phase 31+: run_stress.ts 6-task diverse stress scaffolding;
  run_e2e_rated.ts + orchestrator.ts tightening
- Catalog dedupe cleanup: 16 duplicate manifests removed; canonical
  candidates.parquet (10.5MB -> 76KB) + placements.parquet (1.2MB ->
  11KB) regenerated post-dedupe; fresh manifests for active datasets
- vectord: harness EvalSet refinements (+181), agent portfolio
  rotation + ingest triggers (+158), autotune + rag adjustments
- catalogd/storaged/ingestd/mcp-server: misc tightening
- docs: Phase 28-36 PRD entries + DECISIONS ADR additions;
  control-plane pivot banner added to top of docs/PRD.md (pointing
  at docs/CONTROL_PLANE_PRD.md which lands in next commit)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 02:41:15 -05:00

333 lines
12 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Phase 16: Autotune — the agent loop for HNSW tuning.
//!
//! Given an index and a harness, run a grid of (ef_construction,
//! ef_search) trials, pick the Pareto winner, and promote it.
//!
//! This is deliberately NOT a sophisticated optimizer — random-grid
//! search is good enough when the config space is small (a dozen
//! combinations) and the work per trial is seconds, not hours. Think
//! of it as a "smart operator" rather than Bayesian optimization.
//!
//! Score: we want high recall and low p50 latency. The function below
//! picks the Pareto-dominating trial, breaking ties by prefer-lower-
//! build-time (cheaper to rebuild on data changes).
//!
//! Safeguards (PRD risk mitigation):
//! - Never promote a trial with recall < `min_recall` (default 0.9)
//! - Hard config bounds: ef_construction ∈ [10, 400], ef_search ∈ [10, 200]
//! - Concurrency cap: one autotune per index at a time (tracked by
//! JobTracker — caller is responsible for not firing in parallel
//! for the same index)
use chrono::Utc;
use object_store::ObjectStore;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use aibridge::client::AiClient;
use catalogd::registry::Registry as CatalogRegistry;
use crate::embedding_cache::EmbeddingCache;
use crate::harness::{self, HarnessStore};
use crate::hnsw::HnswStore;
use crate::index_registry::IndexRegistry;
use crate::jobs::JobTracker;
use crate::promotion::{PromotionEntry, PromotionRegistry};
use crate::trial::{HnswConfig, Trial, TrialJournal, TrialMetrics};
/// Parameters for one autotune run, deserialized from the request body.
#[derive(Debug, Clone, Deserialize)]
pub struct AutotuneRequest {
    /// Name of the index to tune; must already exist in the IndexRegistry.
    pub index_name: String,
    /// Name of the eval harness used to score each trial.
    pub harness: String,
    /// Minimum recall a trial must achieve to be eligible for promotion.
    /// Default 0.9 — below that, the tuning is not producing useful
    /// answers. Never promote a bad index.
    #[serde(default = "default_min_recall")]
    pub min_recall: f32,
    /// Override the default config grid (rarely needed). When omitted,
    /// a small curated grid runs.
    #[serde(default)]
    pub grid: Option<Vec<HnswConfig>>,
}
/// Serde default for [`AutotuneRequest::min_recall`].
fn default_min_recall() -> f32 {
    0.9
}
/// Summary returned to the caller after an autotune run completes.
#[derive(Debug, Clone, Serialize)]
pub struct AutotuneResult {
    /// Index that was tuned.
    pub index_name: String,
    /// Trials that completed (failed trials are logged and skipped).
    pub trials_run: usize,
    /// Completed trials whose recall met the `min_recall` threshold.
    pub trials_eligible: usize,
    /// Pareto winner, if any trial was eligible.
    pub winner: Option<WinnerReport>,
    /// Whether the winner's config was recorded in the PromotionRegistry.
    pub promoted: bool,
    /// Wall-clock duration of the whole autotune run.
    pub duration_secs: f32,
    /// Human-readable reasons for configs rejected by the bounds check.
    pub skipped: Vec<String>,
}
/// The winning trial's config and headline metrics.
#[derive(Debug, Clone, Serialize)]
pub struct WinnerReport {
    /// Journal id of the winning trial.
    pub trial_id: String,
    /// The HNSW config that won.
    pub config: HnswConfig,
    /// Mean recall@k across harness queries.
    pub recall_at_k: f32,
    /// Median search latency in microseconds.
    pub search_latency_p50_us: f32,
    /// Time taken to build the index with this config.
    pub build_time_secs: f32,
}
/// Default exploration grid. Covers the practical range of settings
/// from "fast build, OK recall" to "thorough build, great recall".
fn default_grid() -> Vec<HnswConfig> {
    // (ef_construction, ef_search) pairs; the fixed seed keeps trial
    // builds reproducible across runs.
    [(20, 30), (40, 30), (80, 30), (80, 60), (160, 30)]
        .into_iter()
        .map(|(ef_construction, ef_search)| HnswConfig {
            ef_construction,
            ef_search,
            seed: Some(42),
        })
        .collect()
}
/// Config-space bounds. Rejects anything outside these before running a
/// trial — defends against an agent getting creative with absurd values.
///
/// Returns (accepted configs, rejection reasons); one reason per
/// rejected config, first violation wins.
fn sanitize_grid(grid: Vec<HnswConfig>) -> (Vec<HnswConfig>, Vec<String>) {
    let mut accepted = Vec::new();
    let mut rejected = Vec::new();
    for cfg in grid {
        // Check ef_construction first, then ef_search — a config is
        // reported with a single reason even if both are out of range.
        let violation = if !(10..=400).contains(&cfg.ef_construction) {
            Some(format!(
                "ef_construction={} out of bounds [10,400]", cfg.ef_construction,
            ))
        } else if !(10..=200).contains(&cfg.ef_search) {
            Some(format!(
                "ef_search={} out of bounds [10,200]", cfg.ef_search,
            ))
        } else {
            None
        };
        match violation {
            Some(reason) => rejected.push(reason),
            None => accepted.push(cfg),
        }
    }
    (accepted, rejected)
}
/// Pareto winner among eligible trials (recall >= `min_recall`):
/// highest recall, then lowest p50 latency, then lowest build time.
///
/// The build-time tie-break implements the module-level contract
/// ("breaking ties by prefer-lower-build-time — cheaper to rebuild on
/// data changes"); previously ties on (recall, p50) were resolved
/// arbitrarily by iteration order.
fn pick_winner(trials: &[Trial], min_recall: f32) -> Option<&Trial> {
    trials
        .iter()
        .filter(|t| t.metrics.recall_at_k >= min_recall)
        .min_by(|a, b| {
            // Negate recall so one ascending comparison yields: higher
            // recall first, then lower latency, then lower build time.
            let a_key = (
                -a.metrics.recall_at_k,
                a.metrics.search_latency_p50_us,
                a.metrics.build_time_secs,
            );
            let b_key = (
                -b.metrics.recall_at_k,
                b.metrics.search_latency_p50_us,
                b.metrics.build_time_secs,
            );
            // NaN metrics compare as Equal — treated as indistinguishable
            // rather than panicking mid-autotune.
            a_key.partial_cmp(&b_key).unwrap_or(std::cmp::Ordering::Equal)
        })
}
/// Run the autotune loop. Synchronous — caller can wrap in tokio::spawn
/// to make it fire-and-forget. Uses the existing trial flow so every
/// autotune step lands in the trial journal.
///
/// Flow: check the index exists → load harness + embeddings (building
/// ground truth once if missing) → bounds-check the config grid → run
/// one trial per surviving config → pick the Pareto winner among trials
/// meeting `min_recall` → promote it.
///
/// Returns `Err` only for setup failures (missing index, harness or
/// embedding load errors, empty grid); individual trial failures are
/// logged and skipped so one bad config doesn't abort the run.
#[allow(clippy::too_many_arguments)]
pub async fn run_autotune(
    req: AutotuneRequest,
    _store: &Arc<dyn ObjectStore>,
    catalog: &CatalogRegistry,
    ai_client: &AiClient,
    embedding_cache: &EmbeddingCache,
    hnsw_store: &HnswStore,
    index_registry: &IndexRegistry,
    trial_journal: &TrialJournal,
    promotion_registry: &PromotionRegistry,
    harness_store: &HarnessStore,
    _job_tracker: &JobTracker,
) -> Result<AutotuneResult, String> {
    let t0 = std::time::Instant::now();
    // Sanity: the index has to exist.
    if index_registry.get(&req.index_name).await.is_none() {
        return Err(format!("index not found: {}", req.index_name));
    }
    let _ = catalog; // reserved for future audit emission
    // Load the harness once, compute ground truth once. Harness resolves
    // to the index's bucket via HarnessStore, with primary as fallback for
    // pre-federation evals.
    let mut harness_set = harness_store
        .load_for_index(&req.index_name, &req.harness)
        .await
        .map_err(|e| format!("load harness: {e}"))?;
    let embeddings = embedding_cache
        .get_or_load(&req.index_name)
        .await
        .map_err(|e| format!("embeddings: {e}"))?;
    if !harness_set.ground_truth_built {
        harness::compute_ground_truth(&mut harness_set, &embeddings, ai_client)
            .await
            .map_err(|e| format!("ground truth: {e}"))?;
        // Best-effort persist so the next run can skip this step.
        harness_store.save(&harness_set).await.ok();
    }
    // Bounds-check the grid; rejected configs are reported in `skipped`.
    let (grid, rejected) = sanitize_grid(req.grid.clone().unwrap_or_else(default_grid));
    if grid.is_empty() {
        return Err("all configs rejected by bounds check".into());
    }
    tracing::info!(
        "autotune '{}': {} trial configs, min_recall={}",
        req.index_name, grid.len(), req.min_recall,
    );
    // Run each trial through the existing pipeline so every run lands
    // in the trial journal and contributes to the long-term tuning
    // memory. This is the llms3.com-style "trials are data" pattern.
    let mut run_trials: Vec<Trial> = Vec::with_capacity(grid.len());
    for config in &grid {
        let trial = run_single_trial(
            &req.index_name,
            &harness_set,
            &embeddings,
            config,
            hnsw_store,
            trial_journal,
            ai_client,
        ).await;
        // A failed trial is logged and skipped — it never aborts the run.
        match trial {
            Ok(t) => run_trials.push(t),
            Err(e) => tracing::warn!("autotune: trial ec={} es={} failed: {e}",
                config.ef_construction, config.ef_search),
        }
    }
    // Count of trials meeting the promotion threshold, for the report.
    let trials_eligible = run_trials
        .iter()
        .filter(|t| t.metrics.recall_at_k >= req.min_recall)
        .count();
    let winner = pick_winner(&run_trials, req.min_recall);
    let mut promoted = false;
    let winner_report = if let Some(w) = winner {
        let entry = PromotionEntry {
            config: w.config.clone(),
            trial_id: w.id.clone(),
            promoted_at: Utc::now(),
            promoted_by: "autotune".to_string(),
            note: Some(format!(
                "autotune winner: recall={:.3} p50={:.0}us",
                w.metrics.recall_at_k, w.metrics.search_latency_p50_us,
            )),
        };
        // A failed promotion surfaces as `promoted: false`, not as Err —
        // the trial data is still valuable.
        if let Err(e) = promotion_registry.promote(&req.index_name, entry).await {
            tracing::warn!("autotune: promote failed: {e}");
        } else {
            promoted = true;
        }
        Some(WinnerReport {
            trial_id: w.id.clone(),
            config: w.config.clone(),
            recall_at_k: w.metrics.recall_at_k,
            search_latency_p50_us: w.metrics.search_latency_p50_us,
            build_time_secs: w.metrics.build_time_secs,
        })
    } else {
        tracing::warn!(
            "autotune '{}': no trial met min_recall={}; not promoting",
            req.index_name, req.min_recall,
        );
        None
    };
    Ok(AutotuneResult {
        index_name: req.index_name,
        trials_run: run_trials.len(),
        trials_eligible,
        winner: winner_report,
        promoted,
        duration_secs: t0.elapsed().as_secs_f32(),
        skipped: rejected,
    })
}
/// Run one trial and journal it. Returns the recorded Trial.
///
/// Builds the index into a temporary slot (`{index}__{trial_id}`),
/// benchmarks search over the harness queries, computes mean recall@k
/// and latency percentiles, journals the result, and drops the slot.
///
/// Fixes vs. previous revision:
/// - the temporary slot is no longer leaked when `bench_search` fails
///   (the early `?` used to return without dropping the built index);
/// - recall attribution now zips `bench.retrieved` against only the
///   queries that actually carry an embedding — the unfiltered zip
///   misaligned results whenever a query lacked `query_embedding`.
#[allow(clippy::too_many_arguments)]
async fn run_single_trial(
    index_name: &str,
    harness_set: &harness::EvalSet,
    embeddings: &Arc<Vec<crate::store::StoredEmbedding>>,
    config: &HnswConfig,
    hnsw_store: &HnswStore,
    trial_journal: &TrialJournal,
    _ai_client: &AiClient,
) -> Result<Trial, String> {
    let trial_id = Trial::new_id();
    // Temporary slot name keeps trial builds separate from the live index.
    let slot = format!("{}__{}", index_name, trial_id);
    let build = hnsw_store
        .build_index_with_config(&slot, (**embeddings).clone(), config)
        .await?;
    // Only queries that actually carry an embedding get benchmarked.
    let query_vectors: Vec<Vec<f32>> = harness_set
        .queries
        .iter()
        .filter_map(|q| q.query_embedding.clone())
        .collect();
    // Don't leak the temporary slot if the benchmark fails.
    let bench = match hnsw_store
        .bench_search(&slot, &query_vectors, harness_set.k)
        .await
    {
        Ok(b) => b,
        Err(e) => {
            let _ = hnsw_store.drop(&slot).await;
            return Err(e);
        }
    };
    // `bench.retrieved` has one entry per *benchmarked* query, so align
    // it with the same embedding-bearing subset of the harness queries.
    let mut recalls = Vec::with_capacity(harness_set.queries.len());
    let benched = harness_set
        .queries
        .iter()
        .filter(|q| q.query_embedding.is_some());
    for (q, hits) in benched.zip(bench.retrieved.iter()) {
        if let Some(gt) = &q.ground_truth {
            recalls.push(harness::recall_at_k(hits, gt, harness_set.k));
        }
    }
    // Mean recall over queries with both an embedding and ground truth.
    let mean_recall = if recalls.is_empty() {
        0.0
    } else {
        recalls.iter().sum::<f32>() / recalls.len() as f32
    };
    // Latency percentiles via nearest-rank on the sorted sample; NaNs
    // compare Equal rather than panicking.
    let mut lats = bench.latencies_us.clone();
    lats.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let p = |pct: f32| -> f32 {
        if lats.is_empty() { return 0.0; }
        let idx = ((lats.len() as f32 - 1.0) * pct).round() as usize;
        lats[idx.min(lats.len() - 1)]
    };
    // One brute-force query, timed as a speedup baseline for the journal.
    let brute_us = if let Some(qv) = query_vectors.first() {
        let t = std::time::Instant::now();
        let _ = harness::brute_force_top_k(qv, embeddings, harness_set.k);
        t.elapsed().as_micros() as f32
    } else {
        0.0
    };
    // Rough memory estimate: raw vectors + ~128 bytes/node graph overhead.
    let dims = embeddings.first().map(|e| e.vector.len()).unwrap_or(0);
    let memory_bytes =
        (embeddings.len() * dims * std::mem::size_of::<f32>() + embeddings.len() * 128) as u64;
    let trial = Trial {
        id: trial_id.clone(),
        index_name: index_name.to_string(),
        eval_set: harness_set.name.clone(),
        config: config.clone(),
        metrics: TrialMetrics {
            build_time_secs: build.build_time_secs,
            search_latency_p50_us: p(0.50),
            search_latency_p95_us: p(0.95),
            search_latency_p99_us: p(0.99),
            recall_at_k: mean_recall,
            memory_bytes,
            vectors: build.vectors,
            eval_queries: harness_set.queries.len(),
            brute_force_latency_us: brute_us,
        },
        created_at: Utc::now(),
        note: Some("autotune".to_string()),
    };
    // Journal append and slot cleanup are best-effort; the trial result
    // is still returned even if either fails.
    let _ = trial_journal.append(&trial).await;
    let _ = hnsw_store.drop(&slot).await;
    Ok(trial)
}