//! Phase 16.2 + 16.5 — The autotune agent.
//!
//! A long-running tokio task that watches the trial journal and
//! autonomously proposes + runs new HNSW configs. Distinct from
//! `autotune::run_autotune`, which is synchronous (one HTTP call, grid
//! of trials, done). The agent is the continuous version: it sleeps,
//! wakes on triggers, proposes configs based on prior trial history,
//! runs them one at a time, and auto-promotes when it finds an
//! improvement.
//!
//! Design invariants:
//! - Trials are data (ADR-018). Every proposal reads the journal; every
//!   attempt appends to it. The journal is the agent's memory.
//! - One trial at a time. Bounded Ollama load — the agent never fires
//!   multiple parallel embeddings and respects `cooldown_between_trials_secs`.
//! - Rate-limited. `max_trials_per_hour` is a hard ceiling so a
//!   misbehaving proposal function can't saturate the system.
//! - Never promotes below `min_recall`. Same safety gate as
//!   `run_autotune` — we will not make the index worse.
//! - Triggered OR periodic. Ingest enqueues a `DatasetAppended` event
//!   when a new batch lands; the agent also wakes periodically to keep
//!   exploring even when nothing changed externally.
//! - Graceful shutdown via the `stop_tx` signal — the handle's Drop
//!   doesn't force-kill, but `stop()` requests a clean exit after the
//!   current trial.

use chrono::{DateTime, Utc};
use object_store::ObjectStore;
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::sync::Arc;
use tokio::sync::{Mutex, RwLock, mpsc, oneshot};

use aibridge::client::AiClient;
use catalogd::registry::Registry as CatalogRegistry;

use crate::embedding_cache::EmbeddingCache;
use crate::harness;
use crate::hnsw::HnswStore;
use crate::index_registry::IndexRegistry;
use crate::promotion::{PromotionEntry, PromotionRegistry};
use crate::trial::{HnswConfig, Trial, TrialJournal, TrialMetrics};

// -------- Public-facing types --------

/// Runtime configuration for the agent. Mirrored in shared::config under
/// `[agent]`. Defaults are conservative — designed to tune slowly in the
/// background without fighting real workloads for GPU time.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AgentConfig {
    /// Master switch. When false, `spawn` returns a handle but the loop
    /// doesn't run.
    pub enabled: bool,
    /// Periodic wake-up — even if the trigger queue is empty, every N
    /// seconds the agent picks an index with trials and proposes one
    /// more config. Keeps exploration alive on idle indexes.
    pub cycle_interval_secs: u64,
    /// Minimum gap between two trials on the SAME index. Prevents the
    /// agent from hammering Ollama when a hot index has many pending
    /// triggers in a row. (The Phase 16.2 MVP enforces this globally;
    /// see `cooling_down`.)
    pub cooldown_between_trials_secs: u64,
    /// Below this recall, a proposal is never promoted — even if it
    /// beats the champion on latency.
    pub min_recall: f32,
    /// Budget cap: hard ceiling on trials per hour across all indexes.
    /// When hit, the agent idles until the hour window rolls.
    pub max_trials_per_hour: u32,
}

impl Default for AgentConfig {
    fn default() -> Self {
        Self {
            enabled: false, // opt-in — don't auto-start until J turns it on
            cycle_interval_secs: 60,
            cooldown_between_trials_secs: 30,
            min_recall: 0.9,
            max_trials_per_hour: 30,
        }
    }
}
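// Illustrative only: enabling the agent programmatically with a tighter
// hourly budget. Field names are the ones above; `..Default::default()`
// keeps the conservative defaults for the remaining knobs:
//
//     let config = AgentConfig {
//         enabled: true,
//         max_trials_per_hour: 10,
//         ..Default::default()
//     };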
/// What caused the agent to look at a particular index. Recorded in the
/// trial's note field so we can tell "new data arrived" trials from
/// "periodic exploration" trials in the journal.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TriggerReason {
    /// Ingest just appended to a dataset that has attached HNSW indexes.
    DatasetAppended { dataset: String },
    /// A human or another agent hit `/agent/enqueue/{index}`.
    Manual,
    /// Periodic wake — no external event, just keep exploring.
    Periodic,
}

/// One unit of work for the agent.
#[derive(Debug, Clone)]
pub struct TriggerEvent {
    pub index_name: String,
    pub reason: TriggerReason,
    pub enqueued_at: DateTime<Utc>,
}

impl TriggerEvent {
    pub fn manual(index_name: impl Into<String>) -> Self {
        Self { index_name: index_name.into(), reason: TriggerReason::Manual, enqueued_at: Utc::now() }
    }

    pub fn dataset_appended(index_name: impl Into<String>, dataset: impl Into<String>) -> Self {
        Self {
            index_name: index_name.into(),
            reason: TriggerReason::DatasetAppended { dataset: dataset.into() },
            enqueued_at: Utc::now(),
        }
    }

    pub fn periodic(index_name: impl Into<String>) -> Self {
        Self { index_name: index_name.into(), reason: TriggerReason::Periodic, enqueued_at: Utc::now() }
    }
}

/// Observable snapshot of the agent's state — what `/agent/status` returns.
#[derive(Debug, Clone, Serialize)]
pub struct AgentStatus {
    pub running: bool,
    pub config: AgentConfig,
    pub queue_depth: usize,
    pub trials_run: u64,
    pub promotions: u64,
    pub trials_in_last_hour: u32,
    pub last_event: Option<AgentEvent>,
    pub started_at: Option<DateTime<Utc>>,
}

/// Last thing that happened — useful for "why didn't it do anything?" debugging.
#[derive(Debug, Clone, Serialize)]
pub struct AgentEvent {
    pub at: DateTime<Utc>,
    pub kind: String, // "trial_completed" | "promoted" | "skipped_rate_limit" | etc
    pub index_name: Option<String>,
    pub detail: String,
}

/// Handle returned by `spawn`. Holds the trigger sender + shared status +
/// stop signal.
#[derive(Clone)]
pub struct AgentHandle {
    trigger_tx: mpsc::Sender<TriggerEvent>,
    inner: Arc<AgentInner>,
}

struct AgentInner {
    status: RwLock<AgentStatus>,
    stop_tx: Mutex<Option<oneshot::Sender<()>>>,
    queue_len: Mutex<usize>, // mirror of the current channel depth — for status reporting
    recent_trials: Mutex<VecDeque<DateTime<Utc>>>, // ring of recent trial timestamps for rate limit
}

impl AgentHandle {
    /// Enqueue a trigger. Returns Err if the agent isn't running or the
    /// queue is full (backpressure — dropping events is correct here
    /// since periodic exploration will pick up the slack).
    pub async fn enqueue(&self, event: TriggerEvent) -> Result<(), String> {
        self.trigger_tx.try_send(event).map_err(|e| format!("enqueue: {e}"))?;
        let mut guard = self.inner.queue_len.lock().await;
        *guard = guard.saturating_add(1);
        // Update queue_depth in status for observability.
        let mut s = self.inner.status.write().await;
        s.queue_depth = *guard;
        Ok(())
    }

    pub async fn status(&self) -> AgentStatus {
        let mut s = self.inner.status.read().await.clone();
        // Refresh rate-limit window from ring buffer.
        let cutoff = Utc::now() - chrono::Duration::hours(1);
        let ring = self.inner.recent_trials.lock().await;
        s.trials_in_last_hour = ring.iter().filter(|t| **t >= cutoff).count() as u32;
        s
    }

    /// Request a graceful stop. Returns immediately — the loop exits
    /// after its current trial completes.
    pub async fn stop(&self) -> bool {
        let mut guard = self.inner.stop_tx.lock().await;
        if let Some(tx) = guard.take() {
            let _ = tx.send(());
            true
        } else {
            false
        }
    }
}
// -------- Agent state holder --------

/// Everything the agent needs to run a trial. Mirrors the fields of
/// `VectorState` the agent actually uses. Kept separate so the service
/// layer builds it explicitly — no clone of unneeded state.
#[derive(Clone)]
pub struct AgentDeps {
    pub store: Arc<dyn ObjectStore>,
    pub ai_client: AiClient,
    pub catalog: CatalogRegistry,
    pub index_registry: IndexRegistry,
    pub hnsw_store: HnswStore,
    pub embedding_cache: EmbeddingCache,
    pub trial_journal: TrialJournal,
    pub promotion_registry: PromotionRegistry,
}

// -------- Spawn --------

/// Start the agent loop in a background tokio task. Returns a handle
/// the caller uses to enqueue events and read status.
pub fn spawn(config: AgentConfig, deps: AgentDeps) -> AgentHandle {
    let (trigger_tx, trigger_rx) = mpsc::channel::<TriggerEvent>(256);
    let (stop_tx, stop_rx) = oneshot::channel::<()>();

    let status = AgentStatus {
        running: config.enabled,
        config: config.clone(),
        queue_depth: 0,
        trials_run: 0,
        promotions: 0,
        trials_in_last_hour: 0,
        last_event: None,
        started_at: if config.enabled { Some(Utc::now()) } else { None },
    };

    let inner = Arc::new(AgentInner {
        status: RwLock::new(status),
        stop_tx: Mutex::new(Some(stop_tx)),
        queue_len: Mutex::new(0),
        recent_trials: Mutex::new(VecDeque::with_capacity(64)),
    });

    if config.enabled {
        tracing::info!(
            "autotune agent started (cycle={}s, cooldown={}s, cap={}/hr, min_recall={})",
            config.cycle_interval_secs,
            config.cooldown_between_trials_secs,
            config.max_trials_per_hour,
            config.min_recall,
        );
        let loop_inner = inner.clone();
        let loop_deps = deps.clone();
        let loop_config = config.clone();
        tokio::spawn(async move {
            run_loop(loop_config, loop_deps, trigger_rx, stop_rx, loop_inner).await;
        });
    } else {
        // Agent disabled — still drain the channel so sends don't back up.
        tokio::spawn(async move {
            let mut rx = trigger_rx;
            while rx.recv().await.is_some() {}
        });
        tracing::info!("autotune agent configured but disabled (set [agent].enabled=true)");
    }

    AgentHandle { trigger_tx, inner }
}
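// A hedged lifecycle sketch, not compiled here, since building AgentDeps
// needs the full service stack. Assumes an async context; the index name
// "docs_index" is illustrative:
//
//     let handle = spawn(config, deps);
//     if let Err(e) = handle.enqueue(TriggerEvent::manual("docs_index")).await {
//         tracing::debug!("enqueue dropped: {e}"); // backpressure: periodic wake catches up
//     }
//     let status = handle.status().await; // snapshot for /agent/status
//     handle.stop().await;                // graceful: exits after the current trial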
// -------- Main loop --------

async fn run_loop(
    config: AgentConfig,
    deps: AgentDeps,
    mut trigger_rx: mpsc::Receiver<TriggerEvent>,
    mut stop_rx: oneshot::Receiver<()>,
    inner: Arc<AgentInner>,
) {
    let mut periodic = tokio::time::interval(std::time::Duration::from_secs(config.cycle_interval_secs));
    // First tick fires immediately — skip it so we don't double-fire on startup.
    periodic.tick().await;

    loop {
        let event = tokio::select! {
            _ = &mut stop_rx => {
                tracing::info!("autotune agent: stop signal received");
                let mut s = inner.status.write().await;
                s.running = false;
                return;
            }
            maybe = trigger_rx.recv() => match maybe {
                Some(ev) => {
                    let mut guard = inner.queue_len.lock().await;
                    *guard = guard.saturating_sub(1);
                    let mut s = inner.status.write().await;
                    s.queue_depth = *guard;
                    ev
                }
                None => {
                    tracing::info!("autotune agent: trigger channel closed");
                    return;
                }
            },
            _ = periodic.tick() => {
                // Periodic wake — pick an index with existing trials.
                // If nothing's been tuned yet there's nothing to propose.
                match pick_periodic_target(&deps).await {
                    Some(idx) => TriggerEvent::periodic(idx),
                    None => continue,
                }
            }
        };

        // Rate limit: count recent trials, skip if over budget.
        if over_rate_limit(&inner, config.max_trials_per_hour).await {
            record_event(&inner, "skipped_rate_limit", Some(&event.index_name),
                format!("hit cap of {}/hour", config.max_trials_per_hour)).await;
            continue;
        }

        // Per-index cooldown.
        if cooling_down(&inner, &event.index_name, config.cooldown_between_trials_secs).await {
            record_event(&inner, "skipped_cooldown", Some(&event.index_name),
                format!("last trial too recent (<{}s)", config.cooldown_between_trials_secs)).await;
            continue;
        }

        // Run one trial.
        match run_one_cycle(&event, &deps, config.min_recall).await {
            Ok(outcome) => {
                mark_recent_trial(&inner).await;
                {
                    let mut s = inner.status.write().await;
                    s.trials_run += 1;
                    if outcome.promoted {
                        s.promotions += 1;
                    }
                }
                record_event(&inner,
                    if outcome.promoted { "promoted" } else { "trial_completed" },
                    Some(&event.index_name),
                    format!("config=ec{}/es{} recall={:.3} p50={:.0}us {}",
                        outcome.trial.config.ef_construction,
                        outcome.trial.config.ef_search,
                        outcome.trial.metrics.recall_at_k,
                        outcome.trial.metrics.search_latency_p50_us,
                        if outcome.promoted { "★ PROMOTED" } else { "" })).await;
            }
            Err(e) => {
                record_event(&inner, "trial_error", Some(&event.index_name), e).await;
            }
        }
    }
}
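// How ingest might feed this loop — a hedged sketch, since the real hook
// lives in the ingest path, not here. `handle` is the AgentHandle from
// `spawn`; `attached_indexes` is a hypothetical helper for "HNSW indexes
// attached to this dataset":
//
//     for index in attached_indexes(&dataset) { // hypothetical helper
//         let _ = handle
//             .enqueue(TriggerEvent::dataset_appended(index, dataset.clone()))
//             .await; // Err on a full queue is fine: drop and move on
//     }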
/// Result of one cycle — ran a trial, maybe promoted it.
struct CycleOutcome {
    trial: Trial,
    promoted: bool,
}

/// Core cycle: propose → build → bench → record → maybe promote.
async fn run_one_cycle(
    event: &TriggerEvent,
    deps: &AgentDeps,
    min_recall: f32,
) -> Result<CycleOutcome, String> {
    // Read history.
    let history = deps.trial_journal.list(&event.index_name).await
        .map_err(|e| format!("read journal: {e}"))?;
    if history.is_empty() {
        return Err(format!(
            "no trials yet for '{}' — seed with at least one POST /hnsw/trial first",
            event.index_name,
        ));
    }

    // Current champion (if any) is the promoted config.
    let champion = deps.promotion_registry.get_current(&event.index_name).await;
    let champion_trial = champion.as_ref().and_then(|p| {
        history.iter().find(|t| t.id == p.trial_id).cloned()
    });

    // Propose the next config.
    let Some(next_config) = propose_next_config(&history, champion_trial.as_ref()) else {
        return Err("proposer returned None — search space exhausted".into());
    };

    // Validate bounds defensively.
    if !(10..=400).contains(&next_config.ef_construction) {
        return Err(format!("proposed ef_construction={} out of bounds", next_config.ef_construction));
    }
    if !(10..=200).contains(&next_config.ef_search) {
        return Err(format!("proposed ef_search={} out of bounds", next_config.ef_search));
    }

    // Need a harness to measure. Use the most recent one in history.
    // (A future refinement: remember a per-index "canonical harness" on
    // the index metadata. For now: latest wins.)
    let harness_name = history.last().unwrap().eval_set.clone();
    let mut harness_set = harness::EvalSet::load(&deps.store, &harness_name).await
        .map_err(|e| format!("load harness '{harness_name}': {e}"))?;

    let embeddings = deps.embedding_cache.get_or_load(&event.index_name).await
        .map_err(|e| format!("embeddings: {e}"))?;

    if !harness_set.ground_truth_built {
        harness::compute_ground_truth(&mut harness_set, &embeddings, &deps.ai_client).await
            .map_err(|e| format!("ground truth: {e}"))?;
        harness_set.save(&deps.store).await.ok();
    }

    // Build + bench.
    let trial_id = Trial::new_id();
    let slot = format!("{}__{}", event.index_name, trial_id);
    let build = deps.hnsw_store
        .build_index_with_config(&slot, (*embeddings).clone(), &next_config)
        .await?;

    let query_vectors: Vec<Vec<f32>> = harness_set.queries
        .iter().filter_map(|q| q.query_embedding.clone()).collect();
    let bench = deps.hnsw_store.bench_search(&slot, &query_vectors, harness_set.k).await?;

    let mut recalls = Vec::with_capacity(harness_set.queries.len());
    for (q, hits) in harness_set.queries.iter().zip(bench.retrieved.iter()) {
        if let Some(gt) = &q.ground_truth {
            recalls.push(harness::recall_at_k(hits, gt, harness_set.k));
        }
    }
    let mean_recall = if recalls.is_empty() {
        0.0
    } else {
        recalls.iter().sum::<f32>() / recalls.len() as f32
    };

    // Nearest-rank percentiles over the sorted latencies. Example: with
    // 10 samples, p95 reads index round((10 - 1) * 0.95) = 9 (the max);
    // p50 reads round(4.5) = 5 (f32::round goes half away from zero).
    let mut lats = bench.latencies_us.clone();
    lats.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let p = |pct: f32| -> f32 {
        if lats.is_empty() { return 0.0; }
        let idx = ((lats.len() as f32 - 1.0) * pct).round() as usize;
        lats[idx.min(lats.len() - 1)]
    };

    let dims = embeddings.first().map(|e| e.vector.len()).unwrap_or(0);
    // Rough estimate: raw f32 vectors plus ~128 bytes/vector of graph overhead.
    let memory_bytes = (embeddings.len() * dims * std::mem::size_of::<f32>()
        + embeddings.len() * 128) as u64;

    let note = match &event.reason {
        TriggerReason::DatasetAppended { dataset } => format!("agent: dataset_appended({dataset})"),
        TriggerReason::Manual => "agent: manual".to_string(),
        TriggerReason::Periodic => "agent: periodic".to_string(),
    };

    let trial = Trial {
        id: trial_id,
        index_name: event.index_name.clone(),
        eval_set: harness_set.name.clone(),
        config: next_config.clone(),
        metrics: TrialMetrics {
            build_time_secs: build.build_time_secs,
            search_latency_p50_us: p(0.50),
            search_latency_p95_us: p(0.95),
            search_latency_p99_us: p(0.99),
            recall_at_k: mean_recall,
            memory_bytes,
            vectors: build.vectors,
            eval_queries: harness_set.queries.len(),
            brute_force_latency_us: 0.0,
        },
        created_at: Utc::now(),
        note: Some(note),
    };
    deps.trial_journal.append(&trial).await.ok();
    deps.hnsw_store.drop(&slot).await;

    // Promotion decision: the new trial must meet the recall gate AND beat
    // the current champion (higher recall OR same recall + lower p50).
    let promoted = if trial.metrics.recall_at_k < min_recall {
        false
    } else {
        let beats = match &champion_trial {
            None => true, // no champion yet — anything passing the gate wins
            Some(c) => beats_champion(&trial, c),
        };
        if beats {
            let entry = PromotionEntry {
                config: trial.config.clone(),
                trial_id: trial.id.clone(),
                promoted_at: Utc::now(),
                promoted_by: "agent".to_string(),
                note: Some(format!(
                    "auto-promote: recall={:.3} p50={:.0}us (was {:.3}/{:.0}us)",
                    trial.metrics.recall_at_k,
                    trial.metrics.search_latency_p50_us,
                    champion_trial.as_ref().map(|t| t.metrics.recall_at_k).unwrap_or(0.0),
                    champion_trial.as_ref().map(|t| t.metrics.search_latency_p50_us).unwrap_or(0.0),
                )),
            };
            deps.promotion_registry.promote(&event.index_name, entry).await.is_ok()
        } else {
            false
        }
    };

    Ok(CycleOutcome { trial, promoted })
}
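// Worked example of the promotion decision above, assuming min_recall = 0.9
// and a champion at recall 0.950 / p50 500us:
//   candidate 0.990 / 800us → promoted (strictly higher recall wins; latency ignored)
//   candidate 0.950 / 400us → promoted (recall tie within 1e-4, lower p50 breaks it)
//   candidate 0.890 / 100us → rejected (fails the min_recall gate outright)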
/// Champion-beat test: strictly higher recall, OR equal recall with
/// lower p50. Same rule as autotune::pick_winner — kept consistent so the
/// agent and the synchronous autotune agree on what "better" means.
fn beats_champion(candidate: &Trial, champion: &Trial) -> bool {
    if candidate.metrics.recall_at_k > champion.metrics.recall_at_k {
        return true;
    }
    if (candidate.metrics.recall_at_k - champion.metrics.recall_at_k).abs() < 1e-4
        && candidate.metrics.search_latency_p50_us < champion.metrics.search_latency_p50_us
    {
        return true;
    }
    false
}
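// Rough arithmetic for the ε-greedy proposer below: with ε = 0.25, about
// one cycle in four samples the full (ef_construction, ef_search) box and
// the other three perturb the champion by small signed steps. Under the
// default cap of 30 trials/hour that is roughly 7 global probes and 23
// local refinements per hour, enough exploration to escape a local
// optimum without burning most of the budget on random configs.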
/// Propose the next HnswConfig given trial history and the current
/// champion.
///
/// ============================================================
/// J: THIS IS YOURS TO IMPLEMENT
/// ============================================================
///
/// Inputs:
/// - `history`: every trial ever run on this index, oldest first
/// - `champion`: the currently-promoted trial, if any
///
/// Output:
/// - `Some(HnswConfig)` with the config to try next
/// - `None` if you decide there's nothing worth trying (agent logs
///   "search space exhausted" and moves on)
///
/// Hard bounds the caller enforces: ef_construction ∈ [10, 400],
/// ef_search ∈ [10, 200]. Stay inside those — configs outside get
/// rejected and count as a wasted cycle.
///
/// Design options (pick one, or mix):
///
/// 1. LOCAL REFINEMENT (exploit-heavy):
///    Sample near champion ± small delta. Converges fast, risks local
///    minima. Good for "we know roughly where the sweet spot is."
///
/// 2. ε-GREEDY (mixed):
///    With prob ε, random sample from full bounds (explore). Otherwise
///    refinement around champion (exploit). ε=0.2 is a reasonable start.
///    Good for a long-running tune with no prior knowledge.
///
/// 3. COARSE→FINE (annealed):
///    First N trials: wide random. Then shrink the neighborhood around
///    champion as more trials accumulate. Mimics simulated annealing.
///
/// 4. DEDUP-AWARE:
///    Whatever strategy, skip configs already in history. Prevents the
///    agent from re-running the same (ec, es) pair twice.
///
/// A starter implementation is provided below (ε-greedy refinement +
/// dedup — options 2 and 4). Replace with your preferred strategy.
pub fn propose_next_config(history: &[Trial], champion: Option<&Trial>) -> Option<HnswConfig> {
    // ε-greedy around the champion, dedup-aware.
    //
    // - With probability ε (≈0.25), sample a random config from the full
    //   bounds. Keeps exploration alive so we don't get stuck hill-climbing
    //   one axis.
    // - Otherwise: perturb the champion symmetrically on BOTH axes (not
    //   just +20 / +40 like the starter did), using small signed steps so
    //   recall stays near the current level.
    // - Always skip configs already in history — no point re-running.
    // - Deterministic per-history: the RNG is seeded from history length so
    //   the same journal state always proposes the same next config.
    //   Makes tests + offline replay reproducible.

    let base = champion
        .map(|t| t.config.clone())
        .or_else(|| history.last().map(|t| t.config.clone()))
        .unwrap_or_default();

    let tried = |ec: usize, es: usize| -> bool {
        history.iter().any(|t| t.config.ef_construction == ec && t.config.ef_search == es)
    };

    let clamp = |ec: i32, es: i32| -> (usize, usize) {
        (ec.clamp(10, 400) as usize, es.clamp(10, 200) as usize)
    };

    // Tiny xorshift — no rand crate dep. Seeded from history length so the
    // proposer is deterministic for a given journal state.
    let mut rng = (history.len() as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15) | 1;
    let mut next_u = || {
        rng ^= rng << 13;
        rng ^= rng >> 7;
        rng ^= rng << 17;
        rng
    };

    for _attempt in 0..32 {
        let explore = (next_u() % 100) < 25; // ε = 0.25
        let (ec, es) = if explore {
            clamp(10 + (next_u() % 391) as i32, 10 + (next_u() % 191) as i32)
        } else {
            // Symmetric perturbation — signed steps on both axes.
            let dec = [-40, -20, -10, 10, 20, 40][(next_u() % 6) as usize];
            let des = [-20, -10, -5, 5, 10, 20][(next_u() % 6) as usize];
            clamp(base.ef_construction as i32 + dec, base.ef_search as i32 + des)
        };
        if !tried(ec, es) {
            return Some(HnswConfig { ef_construction: ec, ef_search: es, seed: Some(42) });
        }
    }
    None // 32 attempts all landed on duplicates — likely saturated
}

// -------- Helpers --------

/// Find an index to poke on a periodic wake. Strategy: the most recently
/// promoted index. If nothing has a promotion yet, return None.
async fn pick_periodic_target(deps: &AgentDeps) -> Option<String> {
    // The agent runs against any index that has a trial journal. We don't
    // have a "list all journals" helper, so we derive candidates from the
    // promotion registry (an index a human has ever promoted is live).
    let promos = deps.promotion_registry.list_all().await.ok()?;
    // Prefer the one most recently promoted — it's the one a human cares
    // about right now.
    promos.into_iter()
        .filter_map(|f| f.current.map(|c| (f.index_name, c.promoted_at)))
        .max_by_key(|(_, at)| *at)
        .map(|(name, _)| name)
}

async fn over_rate_limit(inner: &Arc<AgentInner>, cap: u32) -> bool {
    let cutoff = Utc::now() - chrono::Duration::hours(1);
    let ring = inner.recent_trials.lock().await;
    ring.iter().filter(|t| **t >= cutoff).count() as u32 >= cap
}

async fn cooling_down(inner: &Arc<AgentInner>, _index: &str, cooldown_secs: u64) -> bool {
    // Minimal impl: gate on the global most-recent trial rather than per-index.
    // Per-index cooldown would be easy to add — keep a HashMap<String, DateTime<Utc>>
    // — but for the Phase 16.2 MVP, global is fine. Ollama is the shared resource.
    let ring = inner.recent_trials.lock().await;
    if let Some(last) = ring.back() {
        let since = Utc::now().signed_duration_since(*last);
        return since < chrono::Duration::seconds(cooldown_secs as i64);
    }
    false
}

async fn mark_recent_trial(inner: &Arc<AgentInner>) {
    let mut ring = inner.recent_trials.lock().await;
    ring.push_back(Utc::now());
    // Keep bounded.
    while ring.len() > 256 {
        ring.pop_front();
    }
}

async fn record_event(
    inner: &Arc<AgentInner>,
    kind: &str,
    index: Option<&str>,
    detail: String,
) {
    tracing::info!("agent: {} {}{}", kind,
        index.map(|i| format!("[{i}] ")).unwrap_or_default(), detail);
    let mut s = inner.status.write().await;
    s.last_event = Some(AgentEvent {
        at: Utc::now(),
        kind: kind.to_string(),
        index_name: index.map(String::from),
        detail,
    });
}

#[cfg(test)]
mod tests {
    use super::*;

    fn mk_trial(ec: usize, es: usize, recall: f32, p50: f32) -> Trial {
        Trial {
            id: format!("t-{ec}-{es}"),
            index_name: "test".into(),
            eval_set: "eval".into(),
            config: HnswConfig { ef_construction: ec, ef_search: es, seed: Some(42) },
            metrics: TrialMetrics {
                build_time_secs: 1.0,
                search_latency_p50_us: p50,
                search_latency_p95_us: p50 * 1.5,
                search_latency_p99_us: p50 * 2.0,
                recall_at_k: recall,
                memory_bytes: 0,
                vectors: 1000,
                eval_queries: 10,
                brute_force_latency_us: 100.0,
            },
            created_at: Utc::now(),
            note: None,
        }
    }
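    // Added checks, grounded in the documented behavior above: the proposer
    // is seeded from history length (deterministic per journal state) and
    // must stay inside the caller-enforced bounds; beats_champion treats
    // recall differences under 1e-4 as a tie.

    #[test]
    fn propose_is_deterministic_and_in_bounds() {
        let hist = vec![mk_trial(80, 30, 1.0, 500.0)];
        let a = propose_next_config(&hist, Some(&hist[0])).unwrap();
        let b = propose_next_config(&hist, Some(&hist[0])).unwrap();
        assert_eq!((a.ef_construction, a.ef_search), (b.ef_construction, b.ef_search));
        assert!((10..=400).contains(&a.ef_construction));
        assert!((10..=200).contains(&a.ef_search));
    }

    #[test]
    fn beats_champion_recall_tie_band() {
        let champ = mk_trial(80, 30, 0.95, 500.0);
        // 0.94995 is within the 1e-4 tie band, so p50 decides.
        let tie_faster = mk_trial(60, 30, 0.94995, 400.0);
        let tie_slower = mk_trial(60, 30, 0.94995, 600.0);
        assert!(beats_champion(&tie_faster, &champ));
        assert!(!beats_champion(&tie_slower, &champ));
    }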
    #[test]
    fn propose_skips_duplicates() {
        let hist = vec![
            mk_trial(80, 30, 1.0, 500.0),
            mk_trial(100, 30, 1.0, 520.0), // ec+20
        ];
        let next = propose_next_config(&hist, Some(&hist[0])).unwrap();
        // ec+20 is taken, so the proposer should skip it.
        assert!(next.ef_construction != 100 || next.ef_search != 30);
    }

    #[test]
    fn beats_champion_strict_recall() {
        let champ = mk_trial(80, 30, 0.95, 500.0);
        let better_recall = mk_trial(80, 30, 0.99, 600.0);
        let worse_recall = mk_trial(80, 30, 0.90, 100.0);
        assert!(beats_champion(&better_recall, &champ));
        assert!(!beats_champion(&worse_recall, &champ));
    }

    #[test]
    fn beats_champion_same_recall_lower_latency() {
        let champ = mk_trial(80, 30, 1.0, 500.0);
        let faster = mk_trial(60, 30, 1.0, 400.0);
        let slower = mk_trial(60, 30, 1.0, 600.0);
        assert!(beats_champion(&faster, &champ));
        assert!(!beats_champion(&slower, &champ));
    }
}