lakehouse/crates/vectord/src/autotune.rs
commit 5b1fcf6d27 Phase 28-36 body of work
Accumulated since a6f12e2 (Phase 21 Rust port + Phase 27 versioning):

- Phase 36: embed_semaphore on VectorState (permits=1) serializes
  seed embed calls — prevents sidecar socket collisions under
  concurrent /seed stress load
- Phase 31+: run_stress.ts 6-task diverse stress scaffolding;
  run_e2e_rated.ts + orchestrator.ts tightening
- Catalog dedupe cleanup: 16 duplicate manifests removed; canonical
  candidates.parquet (10.5MB -> 76KB) + placements.parquet (1.2MB ->
  11KB) regenerated post-dedupe; fresh manifests for active datasets
- vectord: harness EvalSet refinements (+181), agent portfolio
  rotation + ingest triggers (+158), autotune + rag adjustments
- catalogd/storaged/ingestd/mcp-server: misc tightening
- docs: Phase 28-36 PRD entries + DECISIONS ADR additions;
  control-plane pivot banner added to top of docs/PRD.md (pointing
  at docs/CONTROL_PLANE_PRD.md which lands in next commit)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 02:41:15 -05:00

333 lines
12 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Phase 16: Autotune — the agent loop for HNSW tuning.
//!
//! Given an index and a harness, run a grid of (ef_construction,
//! ef_search) trials, pick the Pareto winner, and promote it.
//!
//! This is deliberately NOT a sophisticated optimizer — random-grid
//! search is good enough when the config space is small (a dozen
//! combinations) and the work per trial is seconds, not hours. Think
//! of it as a "smart operator" rather than Bayesian optimization.
//!
//! Score: we want high recall and low p50 latency. The function below
//! picks the Pareto-dominating trial, breaking ties by prefer-lower-
//! build-time (cheaper to rebuild on data changes).
//!
//! Safeguards (PRD risk mitigation):
//! - Never promote a trial with recall < `min_recall` (default 0.9)
//! - Hard config bounds: ef_construction ∈ [10, 400], ef_search ∈ [10, 200]
//! - Concurrency cap: one autotune per index at a time (tracked by
//! JobTracker — caller is responsible for not firing in parallel
//! for the same index)
use chrono::Utc;
use object_store::ObjectStore;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use aibridge::client::AiClient;
use catalogd::registry::Registry as CatalogRegistry;
use crate::embedding_cache::EmbeddingCache;
use crate::harness::{self, HarnessStore};
use crate::hnsw::HnswStore;
use crate::index_registry::IndexRegistry;
use crate::jobs::JobTracker;
use crate::promotion::{PromotionEntry, PromotionRegistry};
use crate::trial::{HnswConfig, Trial, TrialJournal, TrialMetrics};
/// Parameters for one autotune run, deserialized from the request body.
#[derive(Debug, Clone, Deserialize)]
pub struct AutotuneRequest {
    /// Name of the index to tune; must already exist in the IndexRegistry.
    pub index_name: String,
    /// Name of the eval harness used to score each trial.
    pub harness: String,
    /// Minimum recall a trial must achieve to be eligible for promotion.
    /// Default 0.9 — below that, the tuning is not producing useful
    /// answers. Never promote a bad index.
    #[serde(default = "default_min_recall")]
    pub min_recall: f32,
    /// Override the default config grid (rarely needed). When omitted,
    /// a small curated grid runs.
    #[serde(default)]
    pub grid: Option<Vec<HnswConfig>>,
}
/// Serde default for [`AutotuneRequest::min_recall`].
fn default_min_recall() -> f32 {
    0.9
}
/// Summary returned to the caller after an autotune run completes.
#[derive(Debug, Clone, Serialize)]
pub struct AutotuneResult {
    /// Index that was tuned.
    pub index_name: String,
    /// Trials that completed (failed trials are logged and skipped).
    pub trials_run: usize,
    /// Completed trials whose recall met the `min_recall` threshold.
    pub trials_eligible: usize,
    /// Pareto winner, if any trial was eligible.
    pub winner: Option<WinnerReport>,
    /// Whether the winner's config was recorded in the PromotionRegistry.
    pub promoted: bool,
    /// Wall-clock duration of the whole autotune run.
    pub duration_secs: f32,
    /// Human-readable reasons for configs rejected by the bounds check.
    pub skipped: Vec<String>,
}
/// The winning trial's config and headline metrics.
#[derive(Debug, Clone, Serialize)]
pub struct WinnerReport {
    /// Journal id of the winning trial.
    pub trial_id: String,
    /// The HNSW config that won.
    pub config: HnswConfig,
    /// Mean recall@k across harness queries.
    pub recall_at_k: f32,
    /// Median search latency in microseconds.
    pub search_latency_p50_us: f32,
    /// Time taken to build the index with this config.
    pub build_time_secs: f32,
}
/// Default exploration grid. Covers the practical range of settings
/// from "fast build, OK recall" to "thorough build, great recall".
fn default_grid() -> Vec<HnswConfig> {
    // (ef_construction, ef_search) pairs; the fixed seed keeps trial
    // builds reproducible across runs.
    [(20, 30), (40, 30), (80, 30), (80, 60), (160, 30)]
        .into_iter()
        .map(|(ef_construction, ef_search)| HnswConfig {
            ef_construction,
            ef_search,
            seed: Some(42),
        })
        .collect()
}
/// Config-space bounds. Rejects anything outside these before running a
/// trial — defends against an agent getting creative with absurd values.
///
/// Returns (accepted configs, rejection reasons); one reason per
/// rejected config, first violation wins.
fn sanitize_grid(grid: Vec<HnswConfig>) -> (Vec<HnswConfig>, Vec<String>) {
    let mut accepted = Vec::new();
    let mut rejected = Vec::new();
    for cfg in grid {
        // Check ef_construction first, then ef_search — a config is
        // reported with a single reason even if both are out of range.
        let violation = if !(10..=400).contains(&cfg.ef_construction) {
            Some(format!(
                "ef_construction={} out of bounds [10,400]", cfg.ef_construction,
            ))
        } else if !(10..=200).contains(&cfg.ef_search) {
            Some(format!(
                "ef_search={} out of bounds [10,200]", cfg.ef_search,
            ))
        } else {
            None
        };
        match violation {
            Some(reason) => rejected.push(reason),
            None => accepted.push(cfg),
        }
    }
    (accepted, rejected)
}
/// Pareto winner among eligible trials (recall >= `min_recall`):
/// highest recall, then lowest p50 latency, then lowest build time.
///
/// The build-time tie-break implements the module-level contract
/// ("breaking ties by prefer-lower-build-time — cheaper to rebuild on
/// data changes"); previously ties on (recall, p50) were resolved
/// arbitrarily by iteration order.
fn pick_winner(trials: &[Trial], min_recall: f32) -> Option<&Trial> {
    trials
        .iter()
        .filter(|t| t.metrics.recall_at_k >= min_recall)
        .min_by(|a, b| {
            // Negate recall so one ascending comparison yields: higher
            // recall first, then lower latency, then lower build time.
            let a_key = (
                -a.metrics.recall_at_k,
                a.metrics.search_latency_p50_us,
                a.metrics.build_time_secs,
            );
            let b_key = (
                -b.metrics.recall_at_k,
                b.metrics.search_latency_p50_us,
                b.metrics.build_time_secs,
            );
            // NaN metrics compare as Equal — treated as indistinguishable
            // rather than panicking mid-autotune.
            a_key.partial_cmp(&b_key).unwrap_or(std::cmp::Ordering::Equal)
        })
}
/// Run the autotune loop. Synchronous — caller can wrap in tokio::spawn
/// to make it fire-and-forget. Uses the existing trial flow so every
/// autotune step lands in the trial journal.
///
/// Flow: check the index exists → load harness + embeddings (building
/// ground truth once if missing) → bounds-check the config grid → run
/// one trial per surviving config → pick the Pareto winner among trials
/// meeting `min_recall` → promote it.
///
/// Returns `Err` only for setup failures (missing index, harness or
/// embedding load errors, empty grid); individual trial failures are
/// logged and skipped so one bad config doesn't abort the run.
#[allow(clippy::too_many_arguments)]
pub async fn run_autotune(
    req: AutotuneRequest,
    _store: &Arc<dyn ObjectStore>,
    catalog: &CatalogRegistry,
    ai_client: &AiClient,
    embedding_cache: &EmbeddingCache,
    hnsw_store: &HnswStore,
    index_registry: &IndexRegistry,
    trial_journal: &TrialJournal,
    promotion_registry: &PromotionRegistry,
    harness_store: &HarnessStore,
    _job_tracker: &JobTracker,
) -> Result<AutotuneResult, String> {
    let t0 = std::time::Instant::now();
    // Sanity: the index has to exist.
    if index_registry.get(&req.index_name).await.is_none() {
        return Err(format!("index not found: {}", req.index_name));
    }
    let _ = catalog; // reserved for future audit emission
    // Load the harness once, compute ground truth once. Harness resolves
    // to the index's bucket via HarnessStore, with primary as fallback for
    // pre-federation evals.
    let mut harness_set = harness_store
        .load_for_index(&req.index_name, &req.harness)
        .await
        .map_err(|e| format!("load harness: {e}"))?;
    let embeddings = embedding_cache
        .get_or_load(&req.index_name)
        .await
        .map_err(|e| format!("embeddings: {e}"))?;
    if !harness_set.ground_truth_built {
        harness::compute_ground_truth(&mut harness_set, &embeddings, ai_client)
            .await
            .map_err(|e| format!("ground truth: {e}"))?;
        // Best-effort persist so the next run can skip this step.
        harness_store.save(&harness_set).await.ok();
    }
    // Bounds-check the grid; rejected configs are reported in `skipped`.
    let (grid, rejected) = sanitize_grid(req.grid.clone().unwrap_or_else(default_grid));
    if grid.is_empty() {
        return Err("all configs rejected by bounds check".into());
    }
    tracing::info!(
        "autotune '{}': {} trial configs, min_recall={}",
        req.index_name, grid.len(), req.min_recall,
    );
    // Run each trial through the existing pipeline so every run lands
    // in the trial journal and contributes to the long-term tuning
    // memory. This is the llms3.com-style "trials are data" pattern.
    let mut run_trials: Vec<Trial> = Vec::with_capacity(grid.len());
    for config in &grid {
        let trial = run_single_trial(
            &req.index_name,
            &harness_set,
            &embeddings,
            config,
            hnsw_store,
            trial_journal,
            ai_client,
        ).await;
        // A failed trial is logged and skipped — it never aborts the run.
        match trial {
            Ok(t) => run_trials.push(t),
            Err(e) => tracing::warn!("autotune: trial ec={} es={} failed: {e}",
                config.ef_construction, config.ef_search),
        }
    }
    // Count of trials meeting the promotion threshold, for the report.
    let trials_eligible = run_trials
        .iter()
        .filter(|t| t.metrics.recall_at_k >= req.min_recall)
        .count();
    let winner = pick_winner(&run_trials, req.min_recall);
    let mut promoted = false;
    let winner_report = if let Some(w) = winner {
        let entry = PromotionEntry {
            config: w.config.clone(),
            trial_id: w.id.clone(),
            promoted_at: Utc::now(),
            promoted_by: "autotune".to_string(),
            note: Some(format!(
                "autotune winner: recall={:.3} p50={:.0}us",
                w.metrics.recall_at_k, w.metrics.search_latency_p50_us,
            )),
        };
        // A failed promotion surfaces as `promoted: false`, not as Err —
        // the trial data is still valuable.
        if let Err(e) = promotion_registry.promote(&req.index_name, entry).await {
            tracing::warn!("autotune: promote failed: {e}");
        } else {
            promoted = true;
        }
        Some(WinnerReport {
            trial_id: w.id.clone(),
            config: w.config.clone(),
            recall_at_k: w.metrics.recall_at_k,
            search_latency_p50_us: w.metrics.search_latency_p50_us,
            build_time_secs: w.metrics.build_time_secs,
        })
    } else {
        tracing::warn!(
            "autotune '{}': no trial met min_recall={}; not promoting",
            req.index_name, req.min_recall,
        );
        None
    };
    Ok(AutotuneResult {
        index_name: req.index_name,
        trials_run: run_trials.len(),
        trials_eligible,
        winner: winner_report,
        promoted,
        duration_secs: t0.elapsed().as_secs_f32(),
        skipped: rejected,
    })
}
/// Run one trial and journal it. Returns the recorded Trial.
///
/// Builds the index into a temporary slot (`{index}__{trial_id}`),
/// benchmarks search over the harness queries, computes mean recall@k
/// and latency percentiles, journals the result, and drops the slot.
///
/// Fixes vs. previous revision:
/// - the temporary slot is no longer leaked when `bench_search` fails
///   (the early `?` used to return without dropping the built index);
/// - recall attribution now zips `bench.retrieved` against only the
///   queries that actually carry an embedding — the unfiltered zip
///   misaligned results whenever a query lacked `query_embedding`.
#[allow(clippy::too_many_arguments)]
async fn run_single_trial(
    index_name: &str,
    harness_set: &harness::EvalSet,
    embeddings: &Arc<Vec<crate::store::StoredEmbedding>>,
    config: &HnswConfig,
    hnsw_store: &HnswStore,
    trial_journal: &TrialJournal,
    _ai_client: &AiClient,
) -> Result<Trial, String> {
    let trial_id = Trial::new_id();
    // Temporary slot name keeps trial builds separate from the live index.
    let slot = format!("{}__{}", index_name, trial_id);
    let build = hnsw_store
        .build_index_with_config(&slot, (**embeddings).clone(), config)
        .await?;
    // Only queries that actually carry an embedding get benchmarked.
    let query_vectors: Vec<Vec<f32>> = harness_set
        .queries
        .iter()
        .filter_map(|q| q.query_embedding.clone())
        .collect();
    // Don't leak the temporary slot if the benchmark fails.
    let bench = match hnsw_store
        .bench_search(&slot, &query_vectors, harness_set.k)
        .await
    {
        Ok(b) => b,
        Err(e) => {
            let _ = hnsw_store.drop(&slot).await;
            return Err(e);
        }
    };
    // `bench.retrieved` has one entry per *benchmarked* query, so align
    // it with the same embedding-bearing subset of the harness queries.
    let mut recalls = Vec::with_capacity(harness_set.queries.len());
    let benched = harness_set
        .queries
        .iter()
        .filter(|q| q.query_embedding.is_some());
    for (q, hits) in benched.zip(bench.retrieved.iter()) {
        if let Some(gt) = &q.ground_truth {
            recalls.push(harness::recall_at_k(hits, gt, harness_set.k));
        }
    }
    // Mean recall over queries with both an embedding and ground truth.
    let mean_recall = if recalls.is_empty() {
        0.0
    } else {
        recalls.iter().sum::<f32>() / recalls.len() as f32
    };
    // Latency percentiles via nearest-rank on the sorted sample; NaNs
    // compare Equal rather than panicking.
    let mut lats = bench.latencies_us.clone();
    lats.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let p = |pct: f32| -> f32 {
        if lats.is_empty() { return 0.0; }
        let idx = ((lats.len() as f32 - 1.0) * pct).round() as usize;
        lats[idx.min(lats.len() - 1)]
    };
    // One brute-force query, timed as a speedup baseline for the journal.
    let brute_us = if let Some(qv) = query_vectors.first() {
        let t = std::time::Instant::now();
        let _ = harness::brute_force_top_k(qv, embeddings, harness_set.k);
        t.elapsed().as_micros() as f32
    } else {
        0.0
    };
    // Rough memory estimate: raw vectors + ~128 bytes/node graph overhead.
    let dims = embeddings.first().map(|e| e.vector.len()).unwrap_or(0);
    let memory_bytes =
        (embeddings.len() * dims * std::mem::size_of::<f32>() + embeddings.len() * 128) as u64;
    let trial = Trial {
        id: trial_id.clone(),
        index_name: index_name.to_string(),
        eval_set: harness_set.name.clone(),
        config: config.clone(),
        metrics: TrialMetrics {
            build_time_secs: build.build_time_secs,
            search_latency_p50_us: p(0.50),
            search_latency_p95_us: p(0.95),
            search_latency_p99_us: p(0.99),
            recall_at_k: mean_recall,
            memory_bytes,
            vectors: build.vectors,
            eval_queries: harness_set.queries.len(),
            brute_force_latency_us: brute_us,
        },
        created_at: Utc::now(),
        note: Some("autotune".to_string()),
    };
    // Journal append and slot cleanup are best-effort; the trial result
    // is still returned even if either fails.
    let _ = trial_journal.append(&trial).await;
    let _ = hnsw_store.drop(&slot).await;
    Ok(trial)
}