//! Trial journal for HNSW parameter tuning.
//!
//! Every HNSW build+eval is recorded as a Trial. The journal is append-only
//! and stored under `_hnsw_trials/{index_name}/` as batched JSONL files —
//! an AI agent iterating on configs reads prior trials to decide what to
//! try next, and writes a new trial on each attempt.
//!
//! Storage uses the shared `storaged::append_log::AppendLog` so appends are
//! write-once (new file per batch) rather than rewriting a single growing
//! JSONL on every event. See `append_log.rs` for the full rationale.

use chrono::{DateTime, Utc};
use object_store::ObjectStore;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;

use storaged::append_log::AppendLog;

/// HNSW build/search parameters the agent can tune.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HnswConfig {
    pub ef_construction: usize,
    pub ef_search: usize,
    /// Optional RNG seed for reproducible builds.
    // NOTE(review): the generic argument was missing in the original
    // (`pub seed: Option,` does not compile). Reconstructed as `Option<u64>`
    // — confirm against the HNSW builder's seed parameter type.
    #[serde(default)]
    pub seed: Option<u64>,
}

impl Default for HnswConfig {
    /// Production default, locked in 2026-04-16 based on trial grid against
    /// resumes_100k_v2 (100K vectors, 20 queries, recall@10):
    ///   ec=20  es=30 → recall 0.960, p50 509us, build 8s
    ///   ec=80  es=30 → recall 1.000, p50 873us, build 230s ← sweet spot
    ///   ec=200 es=30 → recall 1.000, p50 874us, build 106s (no recall gain)
    ///
    /// `ec=80` is the smallest value that reaches 100% recall. Higher values
    /// waste build time. `es=30` gives faster search than `es=100` with no
    /// recall penalty at this scale.
    ///
    /// NOTE(review): the ec=200 build time (106s) being *lower* than ec=80
    /// (230s) looks inconsistent with "higher values waste build time" —
    /// possibly a transcription error in the trial table; verify against the
    /// original journal.
    fn default() -> Self {
        Self {
            ef_construction: 80,
            ef_search: 30,
            seed: None,
        }
    }
}

/// Metrics collected on every trial. All latencies in microseconds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TrialMetrics {
    pub build_time_secs: f32,
    pub search_latency_p50_us: f32,
    pub search_latency_p95_us: f32,
    pub search_latency_p99_us: f32,
    /// Fraction of ground-truth neighbors recovered, in [0, 1].
    pub recall_at_k: f32,
    pub memory_bytes: u64,
    /// Number of vectors in the index when this trial ran.
    pub vectors: usize,
    /// Number of queries used for the recall/latency evaluation.
    pub eval_queries: usize,
    /// Brute-force latency for comparison — how much speedup did HNSW buy us?
    pub brute_force_latency_us: f32,
}

/// A single tuning attempt.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Trial {
    pub id: String,
    pub index_name: String,
    pub eval_set: String,
    pub config: HnswConfig,
    pub metrics: TrialMetrics,
    // Reconstructed generic: `DateTime` alone does not compile; `Utc` is the
    // only chrono timezone imported and `new_id()` uses `Utc::now()`.
    pub created_at: DateTime<Utc>,
    /// Free-form note — the agent can record why it tried this config.
    #[serde(default)]
    pub note: Option<String>,
}

impl Trial {
    /// Generate a unique trial id: millisecond timestamp (roughly sortable
    /// by creation time) plus the first 8 hex chars of a random UUID to
    /// disambiguate same-millisecond collisions.
    pub fn new_id() -> String {
        format!(
            "trial-{}-{}",
            Utc::now().timestamp_millis(),
            &uuid::Uuid::new_v4().to_string()[..8]
        )
    }
}

/// Per-index append log, lazy-created on first write.
#[derive(Clone)]
pub struct TrialJournal {
    // Reconstructed generic: `Arc,` in the original; `ObjectStore` is the
    // trait imported above, so trait-object storage is the natural reading.
    store: Arc<dyn ObjectStore>,
    /// Cache per-index AppendLog instances so the in-memory buffer persists
    /// across calls.
    logs: Arc<RwLock<HashMap<String, Arc<AppendLog>>>>,
}

impl TrialJournal {
    pub fn new(store: Arc<dyn ObjectStore>) -> Self {
        Self {
            store,
            logs: Arc::new(RwLock::new(HashMap::new())),
        }
    }

    /// Object-store key prefix under which this index's batches live.
    fn prefix(index_name: &str) -> String {
        format!("_hnsw_trials/{}", index_name)
    }

    /// Get (or lazily create) the cached AppendLog for an index.
    ///
    /// Double-checked locking: fast path under the read lock; on miss, take
    /// the write lock and re-check, since another task may have created the
    /// log between our two lock acquisitions.
    async fn log_for(&self, index_name: &str) -> Arc<AppendLog> {
        if let Some(log) = self.logs.read().await.get(index_name) {
            return log.clone();
        }
        let mut guard = self.logs.write().await;
        if let Some(log) = guard.get(index_name) {
            return log.clone();
        }
        // Trials arrive one at a time during human/agent iteration — a low
        // threshold gives "hit /trials and see my latest attempt" immediacy
        // without creating one file per event.
        let log = Arc::new(
            AppendLog::new(self.store.clone(), Self::prefix(index_name))
                .with_flush_threshold(4),
        );
        guard.insert(index_name.to_string(), log.clone());
        log
    }

    /// Append a trial record. In-memory buffered; persisted in batches.
pub async fn append(&self, trial: &Trial) -> Result<(), String> {
    let line = serde_json::to_vec(trial).map_err(|e| e.to_string())?;
    let log = self.log_for(&trial.index_name).await;
    log.append(line).await
}

/// Read all trials for an index (flushed batches + unflushed buffer).
///
/// Malformed lines are skipped with a warning rather than failing the whole
/// read — one bad record must not make the journal unreadable.
pub async fn list(&self, index_name: &str) -> Result<Vec<Trial>, String> {
    let log = self.log_for(index_name).await;
    let lines = log.read_all().await?;
    let mut trials = Vec::with_capacity(lines.len());
    for line in lines {
        match serde_json::from_slice::<Trial>(&line) {
            Ok(t) => trials.push(t),
            Err(e) => tracing::warn!("trial journal: skip malformed line: {e}"),
        }
    }
    Ok(trials)
}

/// Explicit flush for callers that want write-through semantics
/// (e.g. an agent that wants to commit a trial before querying stats).
pub async fn flush(&self, index_name: &str) -> Result<(), String> {
    let log = self.log_for(index_name).await;
    log.flush().await
}

/// Compact all batch files for an index into one.
// NOTE(review): the return type was garbled in the original (`-> Result {`).
// Reconstructed as `Result<usize, String>` (presumably a count of compacted
// batches/records) — confirm against `AppendLog::compact`'s signature.
pub async fn compact(&self, index_name: &str) -> Result<usize, String> {
    let log = self.log_for(index_name).await;
    log.compact().await
}

/// Current champion for an index by the named metric.
/// Valid metrics: `recall`, `latency`, `pareto`.
///
/// The `pareto` strategy is a placeholder — J should tune the scoring
/// function to match what matters in production. Right now it's a simple
/// weighted sum.
pub async fn best(
    &self,
    index_name: &str,
    metric: &str,
) -> Result<Option<Trial>, String> {
    let trials = self.list(index_name).await?;
    if trials.is_empty() {
        return Ok(None);
    }
    // NaN-safe float ordering, shared by all three strategies. The original
    // pareto arm used a bare `.unwrap()` on `partial_cmp`, which would panic
    // on a NaN score while the other arms fell back to `Equal` — made
    // consistent here.
    let cmp = |a: f32, b: f32| a.partial_cmp(&b).unwrap_or(std::cmp::Ordering::Equal);
    let best = match metric {
        "recall" => trials
            .into_iter()
            .max_by(|a, b| cmp(a.metrics.recall_at_k, b.metrics.recall_at_k))
            .expect("trials is non-empty"),
        "latency" => trials
            .into_iter()
            .min_by(|a, b| {
                cmp(
                    a.metrics.search_latency_p95_us,
                    b.metrics.search_latency_p95_us,
                )
            })
            .expect("trials is non-empty"),
        // `"pareto"` and any unrecognized metric name use the weighted score
        // (same behavior as the original `"pareto" | _` arm, which clippy
        // flags as wildcard_in_or_patterns).
        _ => trials
            .into_iter()
            .max_by(|a, b| cmp(pareto_score(a), pareto_score(b)))
            .expect("trials is non-empty"),
    };
    Ok(Some(best))
}
}

/// Simple Pareto-style score: reward recall, penalize p95 latency.
/// Tunable — J should swap this in production to match what matters.
fn pareto_score(t: &Trial) -> f32 {
    // Recall is [0, 1]. Latency is us — assume 100us baseline.
    let recall = t.metrics.recall_at_k;
    let latency_penalty = (t.metrics.search_latency_p95_us / 1000.0).min(1.0); // cap at 1ms
    recall - 0.2 * latency_penalty
}