Accumulated since a6f12e2 (Phase 21 Rust port + Phase 27 versioning): - Phase 36: embed_semaphore on VectorState (permits=1) serializes seed embed calls — prevents sidecar socket collisions under concurrent /seed stress load - Phase 31+: run_stress.ts 6-task diverse stress scaffolding; run_e2e_rated.ts + orchestrator.ts tightening - Catalog dedupe cleanup: 16 duplicate manifests removed; canonical candidates.parquet (10.5MB -> 76KB) + placements.parquet (1.2MB -> 11KB) regenerated post-dedupe; fresh manifests for active datasets - vectord: harness EvalSet refinements (+181), agent portfolio rotation + ingest triggers (+158), autotune + rag adjustments - catalogd/storaged/ingestd/mcp-server: misc tightening - docs: Phase 28-36 PRD entries + DECISIONS ADR additions; control-plane pivot banner added to top of docs/PRD.md (pointing at docs/CONTROL_PLANE_PRD.md which lands in next commit) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
333 lines
12 KiB
Rust
//! Phase 16: Autotune — the agent loop for HNSW tuning.
//!
//! Given an index and a harness, run a grid of (ef_construction,
//! ef_search) trials, pick the Pareto winner, and promote it.
//!
//! This is deliberately NOT a sophisticated optimizer — random-grid
//! search is good enough when the config space is small (a dozen
//! combinations) and the work per trial is seconds, not hours. Think
//! of it as a "smart operator" rather than Bayesian optimization.
//!
//! Score: we want high recall and low p50 latency. The function below
//! picks the Pareto-dominating trial, breaking ties by prefer-lower-
//! build-time (cheaper to rebuild on data changes).
//!
//! Safeguards (PRD risk mitigation):
//! - Never promote a trial with recall < `min_recall` (default 0.9)
//! - Hard config bounds: ef_construction ∈ [10, 400], ef_search ∈ [10, 200]
//! - Concurrency cap: one autotune per index at a time (tracked by
//!   JobTracker — caller is responsible for not firing in parallel
//!   for the same index)

use chrono::Utc;
|
||
use object_store::ObjectStore;
|
||
use serde::{Deserialize, Serialize};
|
||
use std::sync::Arc;
|
||
|
||
use aibridge::client::AiClient;
|
||
use catalogd::registry::Registry as CatalogRegistry;
|
||
|
||
use crate::embedding_cache::EmbeddingCache;
|
||
use crate::harness::{self, HarnessStore};
|
||
use crate::hnsw::HnswStore;
|
||
use crate::index_registry::IndexRegistry;
|
||
use crate::jobs::JobTracker;
|
||
use crate::promotion::{PromotionEntry, PromotionRegistry};
|
||
use crate::trial::{HnswConfig, Trial, TrialJournal, TrialMetrics};
|
||
|
||
/// Request payload for an autotune run: which index to tune, which
/// eval harness to score trials against, and optional safety knobs.
#[derive(Debug, Clone, Deserialize)]
pub struct AutotuneRequest {
    /// Name of the index to tune; must already exist in the IndexRegistry.
    pub index_name: String,
    /// Name of the eval harness (resolved via HarnessStore) used to
    /// score each trial.
    pub harness: String,
    /// Minimum recall a trial must achieve to be eligible for promotion.
    /// Default 0.9 — below that, the tuning is not producing useful
    /// answers. Never promote a bad index.
    #[serde(default = "default_min_recall")]
    pub min_recall: f32,
    /// Override the default config grid (rarely needed). When omitted,
    /// a small curated grid runs.
    #[serde(default)]
    pub grid: Option<Vec<HnswConfig>>,
}

/// Serde default for [`AutotuneRequest::min_recall`].
fn default_min_recall() -> f32 { 0.9 }
|
||
|
||
/// Summary returned to the caller once the autotune loop finishes.
#[derive(Debug, Clone, Serialize)]
pub struct AutotuneResult {
    /// Index this run tuned.
    pub index_name: String,
    /// Number of trials that completed (configs that passed the bounds
    /// check and did not error mid-trial).
    pub trials_run: usize,
    /// Of those, how many met the `min_recall` promotion bar.
    pub trials_eligible: usize,
    /// The Pareto-winning trial's key metrics, if any trial was eligible.
    pub winner: Option<WinnerReport>,
    /// True only if the winner was successfully written to the
    /// promotion registry.
    pub promoted: bool,
    /// Wall-clock duration of the whole autotune run.
    pub duration_secs: f32,
    /// Human-readable reasons for configs rejected by the bounds check.
    pub skipped: Vec<String>,
}
|
||
|
||
/// Key metrics of the winning trial, echoed back to the caller.
#[derive(Debug, Clone, Serialize)]
pub struct WinnerReport {
    /// Journal id of the winning trial.
    pub trial_id: String,
    /// The HNSW config that won.
    pub config: HnswConfig,
    /// Mean recall@k over the harness queries.
    pub recall_at_k: f32,
    /// Median search latency, in microseconds.
    pub search_latency_p50_us: f32,
    /// Time taken to build the index with this config.
    pub build_time_secs: f32,
}
|
||
|
||
/// Default exploration grid. Covers the practical range of settings
|
||
/// from "fast build, OK recall" to "thorough build, great recall".
|
||
fn default_grid() -> Vec<HnswConfig> {
|
||
vec![
|
||
HnswConfig { ef_construction: 20, ef_search: 30, seed: Some(42) },
|
||
HnswConfig { ef_construction: 40, ef_search: 30, seed: Some(42) },
|
||
HnswConfig { ef_construction: 80, ef_search: 30, seed: Some(42) },
|
||
HnswConfig { ef_construction: 80, ef_search: 60, seed: Some(42) },
|
||
HnswConfig { ef_construction: 160, ef_search: 30, seed: Some(42) },
|
||
]
|
||
}
|
||
|
||
/// Config-space bounds. Rejects anything outside these before running a
|
||
/// trial — defends against an agent getting creative with absurd values.
|
||
fn sanitize_grid(grid: Vec<HnswConfig>) -> (Vec<HnswConfig>, Vec<String>) {
|
||
let mut ok = Vec::new();
|
||
let mut rejected = Vec::new();
|
||
for cfg in grid {
|
||
if !(10..=400).contains(&cfg.ef_construction) {
|
||
rejected.push(format!(
|
||
"ef_construction={} out of bounds [10,400]", cfg.ef_construction,
|
||
));
|
||
continue;
|
||
}
|
||
if !(10..=200).contains(&cfg.ef_search) {
|
||
rejected.push(format!(
|
||
"ef_search={} out of bounds [10,200]", cfg.ef_search,
|
||
));
|
||
continue;
|
||
}
|
||
ok.push(cfg);
|
||
}
|
||
(ok, rejected)
|
||
}
|
||
|
||
/// Pareto winner: highest recall, then lowest p50 latency among
|
||
/// eligible trials (recall >= min_recall).
|
||
fn pick_winner(trials: &[Trial], min_recall: f32) -> Option<&Trial> {
|
||
trials
|
||
.iter()
|
||
.filter(|t| t.metrics.recall_at_k >= min_recall)
|
||
.min_by(|a, b| {
|
||
// Compare by (−recall, p50_latency) so minimum is the
|
||
// best trial: higher recall wins, then lower latency.
|
||
let a_key = (-a.metrics.recall_at_k, a.metrics.search_latency_p50_us);
|
||
let b_key = (-b.metrics.recall_at_k, b.metrics.search_latency_p50_us);
|
||
a_key.partial_cmp(&b_key).unwrap_or(std::cmp::Ordering::Equal)
|
||
})
|
||
}
|
||
|
||
/// Run the autotune loop. Synchronous — caller can wrap in tokio::spawn
|
||
/// to make it fire-and-forget. Uses the existing trial flow so every
|
||
/// autotune step lands in the trial journal.
|
||
#[allow(clippy::too_many_arguments)]
|
||
pub async fn run_autotune(
|
||
req: AutotuneRequest,
|
||
_store: &Arc<dyn ObjectStore>,
|
||
catalog: &CatalogRegistry,
|
||
ai_client: &AiClient,
|
||
embedding_cache: &EmbeddingCache,
|
||
hnsw_store: &HnswStore,
|
||
index_registry: &IndexRegistry,
|
||
trial_journal: &TrialJournal,
|
||
promotion_registry: &PromotionRegistry,
|
||
harness_store: &HarnessStore,
|
||
_job_tracker: &JobTracker,
|
||
) -> Result<AutotuneResult, String> {
|
||
let t0 = std::time::Instant::now();
|
||
|
||
// Sanity: the index has to exist.
|
||
if index_registry.get(&req.index_name).await.is_none() {
|
||
return Err(format!("index not found: {}", req.index_name));
|
||
}
|
||
let _ = catalog; // reserved for future audit emission
|
||
|
||
// Load the harness once, compute ground truth once. Harness resolves
|
||
// to the index's bucket via HarnessStore, with primary as fallback for
|
||
// pre-federation evals.
|
||
let mut harness_set = harness_store
|
||
.load_for_index(&req.index_name, &req.harness)
|
||
.await
|
||
.map_err(|e| format!("load harness: {e}"))?;
|
||
|
||
let embeddings = embedding_cache
|
||
.get_or_load(&req.index_name)
|
||
.await
|
||
.map_err(|e| format!("embeddings: {e}"))?;
|
||
|
||
if !harness_set.ground_truth_built {
|
||
harness::compute_ground_truth(&mut harness_set, &embeddings, ai_client)
|
||
.await
|
||
.map_err(|e| format!("ground truth: {e}"))?;
|
||
harness_store.save(&harness_set).await.ok();
|
||
}
|
||
|
||
let (grid, rejected) = sanitize_grid(req.grid.clone().unwrap_or_else(default_grid));
|
||
if grid.is_empty() {
|
||
return Err("all configs rejected by bounds check".into());
|
||
}
|
||
|
||
tracing::info!(
|
||
"autotune '{}': {} trial configs, min_recall={}",
|
||
req.index_name, grid.len(), req.min_recall,
|
||
);
|
||
|
||
// Run each trial through the existing pipeline so every run lands
|
||
// in the trial journal and contributes to the long-term tuning
|
||
// memory. This is the llms3.com-style "trials are data" pattern.
|
||
let mut run_trials: Vec<Trial> = Vec::with_capacity(grid.len());
|
||
for config in &grid {
|
||
let trial = run_single_trial(
|
||
&req.index_name,
|
||
&harness_set,
|
||
&embeddings,
|
||
config,
|
||
hnsw_store,
|
||
trial_journal,
|
||
ai_client,
|
||
).await;
|
||
match trial {
|
||
Ok(t) => run_trials.push(t),
|
||
Err(e) => tracing::warn!("autotune: trial ec={} es={} failed: {e}",
|
||
config.ef_construction, config.ef_search),
|
||
}
|
||
}
|
||
|
||
let trials_eligible = run_trials
|
||
.iter()
|
||
.filter(|t| t.metrics.recall_at_k >= req.min_recall)
|
||
.count();
|
||
|
||
let winner = pick_winner(&run_trials, req.min_recall);
|
||
let mut promoted = false;
|
||
|
||
let winner_report = if let Some(w) = winner {
|
||
let entry = PromotionEntry {
|
||
config: w.config.clone(),
|
||
trial_id: w.id.clone(),
|
||
promoted_at: Utc::now(),
|
||
promoted_by: "autotune".to_string(),
|
||
note: Some(format!(
|
||
"autotune winner: recall={:.3} p50={:.0}us",
|
||
w.metrics.recall_at_k, w.metrics.search_latency_p50_us,
|
||
)),
|
||
};
|
||
if let Err(e) = promotion_registry.promote(&req.index_name, entry).await {
|
||
tracing::warn!("autotune: promote failed: {e}");
|
||
} else {
|
||
promoted = true;
|
||
}
|
||
Some(WinnerReport {
|
||
trial_id: w.id.clone(),
|
||
config: w.config.clone(),
|
||
recall_at_k: w.metrics.recall_at_k,
|
||
search_latency_p50_us: w.metrics.search_latency_p50_us,
|
||
build_time_secs: w.metrics.build_time_secs,
|
||
})
|
||
} else {
|
||
tracing::warn!(
|
||
"autotune '{}': no trial met min_recall={}; not promoting",
|
||
req.index_name, req.min_recall,
|
||
);
|
||
None
|
||
};
|
||
|
||
Ok(AutotuneResult {
|
||
index_name: req.index_name,
|
||
trials_run: run_trials.len(),
|
||
trials_eligible,
|
||
winner: winner_report,
|
||
promoted,
|
||
duration_secs: t0.elapsed().as_secs_f32(),
|
||
skipped: rejected,
|
||
})
|
||
}
|
||
|
||
/// Run one trial and journal it. Returns the recorded Trial.
|
||
#[allow(clippy::too_many_arguments)]
|
||
async fn run_single_trial(
|
||
index_name: &str,
|
||
harness_set: &harness::EvalSet,
|
||
embeddings: &Arc<Vec<crate::store::StoredEmbedding>>,
|
||
config: &HnswConfig,
|
||
hnsw_store: &HnswStore,
|
||
trial_journal: &TrialJournal,
|
||
_ai_client: &AiClient,
|
||
) -> Result<Trial, String> {
|
||
let trial_id = Trial::new_id();
|
||
let slot = format!("{}__{}", index_name, trial_id);
|
||
|
||
let build = hnsw_store
|
||
.build_index_with_config(&slot, (**embeddings).clone(), config)
|
||
.await?;
|
||
|
||
let query_vectors: Vec<Vec<f32>> = harness_set
|
||
.queries
|
||
.iter()
|
||
.filter_map(|q| q.query_embedding.clone())
|
||
.collect();
|
||
let bench = hnsw_store
|
||
.bench_search(&slot, &query_vectors, harness_set.k)
|
||
.await?;
|
||
|
||
let mut recalls = Vec::with_capacity(harness_set.queries.len());
|
||
for (q, hits) in harness_set.queries.iter().zip(bench.retrieved.iter()) {
|
||
if let Some(gt) = &q.ground_truth {
|
||
recalls.push(harness::recall_at_k(hits, gt, harness_set.k));
|
||
}
|
||
}
|
||
let mean_recall = if recalls.is_empty() {
|
||
0.0
|
||
} else {
|
||
recalls.iter().sum::<f32>() / recalls.len() as f32
|
||
};
|
||
|
||
let mut lats = bench.latencies_us.clone();
|
||
lats.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||
let p = |pct: f32| -> f32 {
|
||
if lats.is_empty() { return 0.0; }
|
||
let idx = ((lats.len() as f32 - 1.0) * pct).round() as usize;
|
||
lats[idx.min(lats.len() - 1)]
|
||
};
|
||
|
||
let brute_us = if let Some(qv) = query_vectors.first() {
|
||
let t = std::time::Instant::now();
|
||
let _ = harness::brute_force_top_k(qv, embeddings, harness_set.k);
|
||
t.elapsed().as_micros() as f32
|
||
} else {
|
||
0.0
|
||
};
|
||
|
||
let dims = embeddings.first().map(|e| e.vector.len()).unwrap_or(0);
|
||
let memory_bytes =
|
||
(embeddings.len() * dims * std::mem::size_of::<f32>() + embeddings.len() * 128) as u64;
|
||
|
||
let trial = Trial {
|
||
id: trial_id.clone(),
|
||
index_name: index_name.to_string(),
|
||
eval_set: harness_set.name.clone(),
|
||
config: config.clone(),
|
||
metrics: TrialMetrics {
|
||
build_time_secs: build.build_time_secs,
|
||
search_latency_p50_us: p(0.50),
|
||
search_latency_p95_us: p(0.95),
|
||
search_latency_p99_us: p(0.99),
|
||
recall_at_k: mean_recall,
|
||
memory_bytes,
|
||
vectors: build.vectors,
|
||
eval_queries: harness_set.queries.len(),
|
||
brute_force_latency_us: brute_us,
|
||
},
|
||
created_at: Utc::now(),
|
||
note: Some("autotune".to_string()),
|
||
};
|
||
let _ = trial_journal.append(&trial).await;
|
||
let _ = hnsw_store.drop(&slot).await;
|
||
Ok(trial)
|
||
}
|