/// Trial journal for HNSW parameter tuning.
///
/// Every HNSW build+eval is recorded as a Trial. The journal is append-only
/// and stored under `_hnsw_trials/{index_name}/` as batched JSONL files —
/// an AI agent iterating on configs reads prior trials to decide what to
/// try next, and writes a new trial on each attempt.
///
/// Storage uses the shared `storaged::append_log::AppendLog` so appends are
/// write-once (new file per batch) rather than rewriting a single growing
/// JSONL on every event. See `append_log.rs` for the full rationale.
use chrono::{DateTime, Utc};
use object_store::ObjectStore;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;

use storaged::append_log::AppendLog;
use storaged::registry::BucketRegistry;

use crate::index_registry::IndexRegistry;

/// HNSW build/search parameters the agent can tune.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HnswConfig {
    /// Candidate-list size at build time; dominates build cost and recall.
    pub ef_construction: usize,
    /// Candidate-list size at query time; trades latency for recall.
    pub ef_search: usize,
    /// Optional RNG seed for reproducible graph builds; `None` leaves the
    /// builder nondeterministic.
    /// NOTE(review): the generic argument was lost in transit; `u64` is the
    /// conventional seed width — confirm against the HNSW builder's API.
    #[serde(default)]
    pub seed: Option<u64>,
}

impl Default for HnswConfig {
    /// Production default, locked in 2026-04-16 based on trial grid against
    /// resumes_100k_v2 (100K vectors, 20 queries, recall@10):
    ///   ec=20  es=30 → recall 0.960, p50 509us, build 8s
    ///   ec=80  es=30 → recall 1.000, p50 873us, build 230s ← sweet spot
    ///   ec=200 es=30 → recall 1.000, p50 874us, build 106s (no recall gain)
    ///
    /// `ec=80` is the smallest value that reaches 100% recall. Higher values
    /// waste build time. `es=30` gives faster search than `es=100` with no
    /// recall penalty at this scale.
    fn default() -> Self {
        Self {
            ef_construction: 80,
            ef_search: 30,
            seed: None,
        }
    }
}

/// Metrics collected on every trial. All latencies in microseconds.
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct TrialMetrics { pub build_time_secs: f32, pub search_latency_p50_us: f32, pub search_latency_p95_us: f32, pub search_latency_p99_us: f32, pub recall_at_k: f32, pub memory_bytes: u64, pub vectors: usize, pub eval_queries: usize, /// Brute-force latency for comparison — how much speedup did HNSW buy us? pub brute_force_latency_us: f32, } /// A single tuning attempt. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Trial { pub id: String, pub index_name: String, pub eval_set: String, pub config: HnswConfig, pub metrics: TrialMetrics, pub created_at: DateTime, /// Free-form note — the agent can record why it tried this config. #[serde(default)] pub note: Option, } impl Trial { pub fn new_id() -> String { format!( "trial-{}-{}", Utc::now().timestamp_millis(), &uuid::Uuid::new_v4().to_string()[..8] ) } } /// Per-index append log, lazy-created on first write. /// /// Federation layer 2: the journal resolves each index's bucket from the /// index registry and writes its JSONL batches to THAT bucket, not /// primary. Back-compat is preserved by `IndexMeta::bucket` defaulting /// to "primary" for pre-federation indexes. Indexes the registry has /// never heard of (edge case — trials run before first register) fall /// through to primary as well. #[derive(Clone)] pub struct TrialJournal { buckets: Arc, index_registry: IndexRegistry, /// Cache per (bucket, index) AppendLog so the in-memory buffer persists /// across calls. Keyed by `(bucket, index_name)` so moving an index /// between buckets is clean — the old journal stays intact. logs: Arc>>>, } impl TrialJournal { pub fn new(buckets: Arc, index_registry: IndexRegistry) -> Self { Self { buckets, index_registry, logs: Arc::new(RwLock::new(HashMap::new())), } } fn prefix(index_name: &str) -> String { format!("_hnsw_trials/{}", index_name) } /// Resolve which bucket holds this index's trial artifacts. 
/// Falls back to primary for indexes without recorded metadata. async fn bucket_for(&self, index_name: &str) -> String { self.index_registry .get(index_name) .await .map(|m| m.bucket) .unwrap_or_else(|| "primary".to_string()) } async fn log_for(&self, index_name: &str) -> Result, String> { let bucket = self.bucket_for(index_name).await; let key = (bucket.clone(), index_name.to_string()); if let Some(log) = self.logs.read().await.get(&key) { return Ok(log.clone()); } let mut guard = self.logs.write().await; if let Some(log) = guard.get(&key) { return Ok(log.clone()); } let store = self.buckets.get(&bucket)?; // Trials arrive one at a time during human/agent iteration — a low // threshold gives "hit /trials and see my latest attempt" immediacy // without creating one file per event. let log = Arc::new( AppendLog::new(store, Self::prefix(index_name)) .with_flush_threshold(4), ); guard.insert(key, log.clone()); Ok(log) } /// Append a trial record. In-memory buffered; persisted in batches. pub async fn append(&self, trial: &Trial) -> Result<(), String> { let line = serde_json::to_vec(trial).map_err(|e| e.to_string())?; let log = self.log_for(&trial.index_name).await?; log.append(line).await } /// Read all trials for an index (flushed batches + unflushed buffer). pub async fn list(&self, index_name: &str) -> Result, String> { let log = self.log_for(index_name).await?; let lines = log.read_all().await?; let mut trials = Vec::with_capacity(lines.len()); for line in lines { match serde_json::from_slice::(&line) { Ok(t) => trials.push(t), Err(e) => tracing::warn!("trial journal: skip malformed line: {e}"), } } Ok(trials) } /// Explicit flush for callers that want write-through semantics /// (e.g. an agent that wants to commit a trial before querying stats). pub async fn flush(&self, index_name: &str) -> Result<(), String> { let log = self.log_for(index_name).await?; log.flush().await } /// Compact all batch files for an index into one. 
pub async fn compact(&self, index_name: &str) -> Result { let log = self.log_for(index_name).await?; log.compact().await } /// Current champion for an index by the named metric. /// Valid metrics: `recall`, `latency`, `pareto`. /// /// The `pareto` strategy is a placeholder — J should tune the scoring /// function to match what matters in production. Right now it's a simple /// weighted sum. pub async fn best( &self, index_name: &str, metric: &str, ) -> Result, String> { let trials = self.list(index_name).await?; if trials.is_empty() { return Ok(None); } let best = match metric { "recall" => trials .into_iter() .max_by(|a, b| { a.metrics .recall_at_k .partial_cmp(&b.metrics.recall_at_k) .unwrap_or(std::cmp::Ordering::Equal) }) .unwrap(), "latency" => trials .into_iter() .min_by(|a, b| { a.metrics .search_latency_p95_us .partial_cmp(&b.metrics.search_latency_p95_us) .unwrap_or(std::cmp::Ordering::Equal) }) .unwrap(), "pareto" | _ => trials .into_iter() .max_by(|a, b| pareto_score(a).partial_cmp(&pareto_score(b)).unwrap()) .unwrap(), }; Ok(Some(best)) } } /// Simple Pareto-style score: reward recall, penalize p95 latency. /// Tunable — J should swap this in production to match what matters. fn pareto_score(t: &Trial) -> f32 { // Recall is [0, 1]. Latency is us — assume 100us baseline. let recall = t.metrics.recall_at_k; let latency_penalty = (t.metrics.search_latency_p95_us / 1000.0).min(1.0); // cap at 1ms recall - 0.2 * latency_penalty }