lakehouse/crates/vectord/src/promotion.rs
root 0d037cfac1 Phases 16.2 + L2 + 17 VRAM gate + MySQL + 18 Lance hybrid milestone
Five threads of work landing as one milestone — all individually
verified end-to-end against real data, full release build clean,
46 unit tests pass.

## Phase 16.2 / 16.5 — autotune agent + ingest triggers

`vectord::agent` is a long-running tokio task that watches the trial
journal and autonomously proposes + runs new HNSW configs. Distinct
from `autotune::run_autotune` (the synchronous one-shot grid). Triggered
by POST /vectors/agent/enqueue/{idx} or by the periodic wake; ingest
paths now push DatasetAppended events when an index's source dataset
gets re-ingested. Rate-limited (max_trials_per_hour) and cooldown-
gated so it can't saturate Ollama under live load.
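The gate logic is simple enough to sketch. This is an illustrative stand-in — only the `max_trials_per_hour` name comes from the config above; the struct shape and cooldown handling are assumptions, not the shipped code:

```rust
use std::time::{Duration, Instant};

// Illustrative sketch of the rate-limit + cooldown gate; only the
// max_trials_per_hour name comes from the config, the rest is
// assumed structure.
struct AgentGate {
    max_trials_per_hour: usize,
    cooldown: Duration,
    recent: Vec<Instant>, // start times of trials in the last hour
}

impl AgentGate {
    /// Returns true (and records the trial) only if both the hourly
    /// budget and the cooldown since the last trial allow it.
    fn may_run(&mut self, now: Instant) -> bool {
        self.recent
            .retain(|t| now.duration_since(*t) < Duration::from_secs(3600));
        if self.recent.len() >= self.max_trials_per_hour {
            return false; // hourly budget spent
        }
        if let Some(last) = self.recent.last() {
            if now.duration_since(*last) < self.cooldown {
                return false; // still cooling down
            }
        }
        self.recent.push(now);
        true
    }
}
```

Denied attempts are deliberately not recorded, so a rejected wake doesn't push the cooldown window forward.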

The proposer is ε-greedy around the current champion: with probability
0.25 it samples uniformly from the full bounds; otherwise it perturbs
the champion by a small delta on both axes, then dedups against
history. It's deterministic — the RNG is seeded from history.len(), so
the same journal state proposes the same next config (helps offline
replay debugging).
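A minimal sketch of that loop, assuming a two-axis HnswConfig and illustrative bounds — the real agent's fields, deltas, and RNG differ (splitmix64 stands in here so the example is dependency-free):

```rust
// Illustrative stand-in for the proposer; the HnswConfig axes and
// the bounds below are assumptions. What matters is the shape:
// seed from history.len(), 25% explore / 75% perturb, dedup.

#[derive(Clone, Copy, Debug, PartialEq)]
struct HnswConfig {
    m: usize,
    ef_construction: usize,
}

const M_BOUNDS: (usize, usize) = (4, 64);
const EF_BOUNDS: (usize, usize) = (32, 512);

/// Tiny deterministic PRNG (splitmix64) so the same journal state
/// always yields the same proposal.
struct Rng(u64);

impl Rng {
    fn next(&mut self) -> u64 {
        self.0 = self.0.wrapping_add(0x9E37_79B9_7F4A_7C15);
        let mut z = self.0;
        z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
        z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
        z ^ (z >> 31)
    }

    fn in_range(&mut self, lo: usize, hi: usize) -> usize {
        lo + (self.next() as usize) % (hi - lo + 1)
    }
}

fn propose(champion: HnswConfig, history: &[HnswConfig]) -> Option<HnswConfig> {
    let mut rng = Rng(history.len() as u64); // deterministic seed
    for _ in 0..32 {
        let cand = if rng.next() % 100 < 25 {
            // Explore: uniform sample from the full bounds.
            HnswConfig {
                m: rng.in_range(M_BOUNDS.0, M_BOUNDS.1),
                ef_construction: rng.in_range(EF_BOUNDS.0, EF_BOUNDS.1),
            }
        } else {
            // Exploit: perturb the champion by a small delta on both axes.
            HnswConfig {
                m: ((champion.m as i64) + (rng.in_range(0, 8) as i64) - 4)
                    .clamp(M_BOUNDS.0 as i64, M_BOUNDS.1 as i64) as usize,
                ef_construction: ((champion.ef_construction as i64)
                    + (rng.in_range(0, 64) as i64) - 32)
                    .clamp(EF_BOUNDS.0 as i64, EF_BOUNDS.1 as i64) as usize,
            }
        };
        if !history.contains(&cand) {
            return Some(cand); // first novel candidate wins
        }
    }
    None // neighborhood exhausted for this seed
}
```

Because the seed is history.len(), replaying a journal offline reproduces the exact proposal sequence.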

`[agent]` config section in lakehouse.toml; opt-in via enabled=true.

## Federation Layer 2 — runtime bucket lifecycle + per-index scoping

`BucketRegistry.buckets` moved to `std::sync::RwLock<HashMap>` so
buckets can be added/removed after startup. POST /storage/buckets
provisions at runtime; DELETE /storage/buckets/{name} unregisters
(refuses primary/rescue with 403). Local-backend buckets get their
root directory auto-created.
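The runtime semantics reduce to a plain RwLock'd map with a protected-name check. In this sketch the value type and error strings are stand-ins (the real registry holds object-store handles and the refusal surfaces as HTTP 403):

```rust
use std::collections::HashMap;
use std::sync::RwLock;

// Stand-in for the runtime bucket lifecycle; the String value is a
// placeholder for the real object-store handle.
struct BucketRegistry {
    buckets: RwLock<HashMap<String, String>>, // name -> root path
}

impl BucketRegistry {
    fn add(&self, name: &str, root: &str) -> Result<(), String> {
        let mut b = self.buckets.write().unwrap();
        if b.contains_key(name) {
            return Err(format!("bucket '{name}' already registered"));
        }
        b.insert(name.to_string(), root.to_string());
        Ok(())
    }

    fn remove(&self, name: &str) -> Result<(), String> {
        // Mirrors the 403 refusal for the protected buckets.
        if name == "primary" || name == "rescue" {
            return Err(format!("refusing to remove protected bucket '{name}'"));
        }
        self.buckets
            .write()
            .unwrap()
            .remove(name)
            .map(|_| ())
            .ok_or_else(|| format!("bucket '{name}' not registered"))
    }
}
```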

`IndexMeta.bucket` (default "primary" via serde) records each index's
home bucket. `TrialJournal` and `PromotionRegistry` now hold
Arc<BucketRegistry> + IndexRegistry; they resolve target store per-
index via IndexMeta.bucket. PromotionRegistry::list_all scans every
bucket and dedups by index_name. Pre-federation indexes keep working
unchanged — they just default to primary.

`ModelProfile.bucket: Option<String>` declares per-profile artifact
home. POST /vectors/profile/{id}/activate auto-provisions the
profile's bucket under storage.profile_root if not yet registered.

EvalSets stay primary-only for now — noted gap, low-risk to extend
later with the same resolver pattern.

## Phase 17 — VRAM-aware two-profile gate

Sidecar gains POST /admin/unload (the Ollama keep_alive=0 trick —
forces immediate VRAM release), POST /admin/preload (keep_alive=5m
with an empty prompt, warming the slot), and GET /admin/vram (combines
nvidia-smi snapshot with Ollama /api/ps). Exposed via aibridge as
unload_model / preload_model / vram_snapshot.

`VectorState.active_profile` is the GPU-slot singleton —
Arc<RwLock<Option<ActiveProfileSlot>>>. activate_profile checks for
a previous profile with a different ollama_name and unloads it
before preloading the new one; same-model reactivations skip the
unload (Ollama no-ops). New routes: POST /vectors/profile/{id}/
deactivate (unload + clear slot), GET /vectors/profile/active.
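The swap decision reduces to a small pure function. This sketch assumes the slot carries just profile_id + ollama_name (the real ActiveProfileSlot likely holds more, and the actual unload/preload calls go through aibridge):

```rust
// Minimal model of the single-GPU-slot swap decision in
// activate_profile; field set is an assumption.
#[derive(Clone, Debug, PartialEq)]
struct ActiveProfileSlot {
    profile_id: String,
    ollama_name: String,
}

/// Decide what to unload and what to preload for a requested swap.
/// Same-model reactivations skip the unload (Ollama no-ops anyway).
fn plan_swap(
    current: Option<&ActiveProfileSlot>,
    next: &ActiveProfileSlot,
) -> (Option<String>, String) {
    let unload = match current {
        Some(cur) if cur.ollama_name != next.ollama_name => {
            Some(cur.ollama_name.clone())
        }
        _ => None,
    };
    (unload, next.ollama_name.clone())
}
```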

Verified live: staffing-recruiter (qwen2.5) → docs-assistant
(mistral) swap freed qwen2.5 from VRAM and loaded mistral. nomic-
embed-text persists across swaps because both profiles use it —
free optimization that fell out of the design. Scoped search
correctly 403s cross-profile in both directions.

## MySQL streaming connector

`crates/ingestd/src/my_stream.rs` mirrors pg_stream.rs for MySQL.
Pure-Rust `mysql_async` driver (default-features=false to avoid C
deps). Same OFFSET pagination, same Parquet-streaming write shape.
Type mapping per ADR-010: int/bigint → Int32/Int64, decimal/float
→ Float64, tinyint(1)/bool → Boolean, everything else → Utf8 with
fallback parsers for date/time/json/uuid via Display.
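That mapping, sketched as a dependency-free match — `&'static str` names stand in for arrow DataTypes, and the smallint/mediumint arms are assumptions beyond what the summary above spells out:

```rust
// Sketch of the ADR-010 type mapping; string names stand in for
// arrow DataTypes so the example needs no deps. smallint/mediumint
// arms are assumed, not stated in the summary.
fn arrow_type_for(mysql_type: &str) -> &'static str {
    match mysql_type {
        "int" | "smallint" | "mediumint" => "Int32",
        "bigint" => "Int64",
        "decimal" | "float" | "double" => "Float64",
        "tinyint(1)" | "bool" | "boolean" => "Boolean",
        // date/time/json/uuid and everything else fall back to Utf8,
        // rendered via Display.
        _ => "Utf8",
    }
}
```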

POST /ingest/mysql parallel to /ingest/db. Same PII auto-detection,
same lineage capture (source_system="mysql"), same agent-trigger
hook. `redact_dsn` generalized — was hardcoded to "postgresql://"
length, now works for any scheme://user:pass@host/path URL (latent
PII leak fix for MySQL DSNs).
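A scheme-agnostic redactor along the lines described might look like this — the shipped redact_dsn's exact output format is an assumption, and this version deliberately punts on passwords containing '@':

```rust
// Hedged sketch of a scheme-agnostic DSN redactor: blank the
// password between ':' and '@' in scheme://user:pass@host/path,
// whatever the scheme. Not the shipped implementation.
fn redact_dsn(dsn: &str) -> String {
    let Some(scheme_end) = dsn.find("://") else {
        return dsn.to_string(); // not a URL-shaped DSN
    };
    let rest = &dsn[scheme_end + 3..];
    // Only touch the userinfo part, i.e. before the first '@'.
    let Some(at) = rest.find('@') else {
        return dsn.to_string(); // no credentials present
    };
    let userinfo = &rest[..at];
    match userinfo.find(':') {
        Some(colon) => format!(
            "{}{}:***{}",
            &dsn[..scheme_end + 3],
            &userinfo[..colon],
            &rest[at..],
        ),
        None => dsn.to_string(), // user but no password
    }
}
```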

Verified live against MariaDB on localhost: 10 rows × 9 columns of
test data round-tripped through datatypes int/varchar/decimal/
tinyint/datetime/text. PII detection auto-flagged name + email.
Aggregation queries through DataFusion match the source values
exactly.

## Phase 18 — Hybrid Parquet+HNSW ⊕ Lance backend (ADR-019)

`vectord-lance` is a new firewall crate. Lance pulls Arrow 57 and
DataFusion 52 — incompatible with the rest of the workspace's
Arrow 55 / DataFusion 47. The firewall isolates that dep tree:
public API uses only std types (Vec<f32>, Vec<String>, Hit, Row,
*Stats), so no Arrow types cross the crate boundary and nothing
propagates to vectord. This is the ADR-019 path that hadn't shipped until now.
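The firewall idea in miniature — a toy in-memory store whose public search takes and returns only std types, so any Arrow-typed internals could stay crate-private. Names and the brute-force L2 metric are illustrative, not vectord-lance's actual API:

```rust
// Toy stand-in for the firewall pattern: the public API uses only
// std types (slices, Vec, a plain Hit struct); nothing Arrow-typed
// crosses this boundary.
pub struct Hit {
    pub doc_id: String,
    pub distance: f32,
}

pub struct ToyStore {
    rows: Vec<(String, Vec<f32>)>, // (doc_id, embedding)
}

impl ToyStore {
    /// Brute-force squared-L2 search over the in-memory rows.
    pub fn search(&self, query: &[f32], k: usize) -> Vec<Hit> {
        let mut hits: Vec<Hit> = self
            .rows
            .iter()
            .map(|(id, v)| Hit {
                doc_id: id.clone(),
                distance: v
                    .iter()
                    .zip(query)
                    .map(|(a, b)| (a - b).powi(2))
                    .sum::<f32>(),
            })
            .collect();
        hits.sort_by(|a, b| a.distance.total_cmp(&b.distance));
        hits.truncate(k);
        hits
    }
}
```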

`vectord::lance_backend::LanceRegistry` lazy-creates a
LanceVectorStore per index, resolving bucket → URI via the
conventional local-bucket layout. `IndexMeta.vector_backend` and
`ModelProfile.vector_backend` carry the choice (default Parquet so
existing indexes unchanged).

Six routes under /vectors/lance/*:
- migrate/{idx}: convert binary-blob Parquet → Lance FixedSizeList
- index/{idx}: build IVF_PQ
- search/{idx}: vector search (embed via sidecar)
- doc/{idx}/{doc_id}: random row fetch
- append/{idx}: native fragment append
- stats/{idx}: row count + index presence

Verified live on the real resumes_100k_v2 corpus (100K × 768d):
- Migrate: 0.57s
- Build IVF_PQ index: 16.2s (matches ADR-019 bench; 14× faster than
  HNSW's 230s for the same data)
- Search end-to-end (Ollama embed + Lance scan): 23-53ms
- Random doc_id fetch: 5-7ms (filter scan; faster than Parquet's
  ~35ms full-file scan, slower than the bench's 311us positional
  take — would close that gap with a scalar btree on doc_id)
- Append 100 rows: 3.3ms / +320KB on disk vs Parquet's required
  full ~330MB rewrite — the structural win
- Index survives append; both backends coexist cleanly

## Known follow-ups not in this milestone

- ModelProfile.vector_backend doesn't yet auto-route /vectors/profile/
  {id}/search to Lance; callers go through /vectors/lance/* directly
- Scalar btree on doc_id (closes the 5-7ms → ~300us gap)
- vectord-lance built default-features=false → no S3 yet
- IVF_PQ recall not measured (ADR-019 caveat) — needs a Lance-aware
  variant of the eval harness
- Watcher-path ingest doesn't push agent triggers (HTTP paths do)
- EvalSets still primary-only (federation gap)
- No PATCH endpoint to move an existing index between buckets
- The pre-existing storaged::append_log doctest fails to compile
  (malformed `{prefix}/` parses as code fence) — pre-existing bug,
  left for a focused fix

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 20:24:46 -05:00


//! Phase 16: Promoted HNSW configs — the "active generation" pointer.
//!
//! An index's HNSW config used at build time normally defaults to the
//! system-wide default (`HnswConfig::default()`). An operator or the
//! autotune agent can *promote* a specific trial's config — subsequent
//! HNSW builds against that index use the promoted config instead.
//!
//! Every promotion is history-tracked so `rollback` can revert. The
//! history file lives at `primary://_hnsw_promotions/{index_name}.json`
//! and is small (< few KB) so we rewrite it on every promotion rather
//! than append-log.
//!
//! Not included here:
//! - Atomic graph rebuild on promote — promotion only updates the sticky
//!   default. Next activation (or search that triggers lazy build) picks
//!   up the new config. That's "zero-downtime swap after build" which is
//!   what ADR-019 actually claimed; an instant-swap requires a
//!   pre-built graph pool which we don't have yet.
//! - Agent loop — lives in `vectord::autotune`.

use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;

use storaged::ops;
use storaged::registry::BucketRegistry;
use tokio::sync::RwLock;

use crate::index_registry::IndexRegistry;
use crate::trial::HnswConfig;

const PROMOTION_PREFIX: &str = "_hnsw_promotions";
/// One promotion record. The `trial_id` is the origin of the config —
/// lets operators trace back "why was this config picked?" to the exact
/// trial in the trial journal.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PromotionEntry {
    pub config: HnswConfig,
    pub trial_id: String,
    pub promoted_at: DateTime<Utc>,
    #[serde(default)]
    pub promoted_by: String,
    #[serde(default)]
    pub note: Option<String>,
}

/// Serialized form of an index's promotion history.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PromotionFile {
    pub index_name: String,
    pub current: Option<PromotionEntry>,
    #[serde(default)]
    pub history: Vec<PromotionEntry>,
}

#[derive(Clone)]
pub struct PromotionRegistry {
    buckets: Arc<BucketRegistry>,
    index_registry: IndexRegistry,
    cache: Arc<RwLock<HashMap<String, PromotionFile>>>,
}

impl PromotionRegistry {
    pub fn new(buckets: Arc<BucketRegistry>, index_registry: IndexRegistry) -> Self {
        Self {
            buckets,
            index_registry,
            cache: Arc::new(RwLock::new(HashMap::new())),
        }
    }

    fn key(index_name: &str) -> String {
        // Sanitize for object-store safety.
        let safe: String = index_name
            .chars()
            .map(|c| {
                if c.is_ascii_alphanumeric() || c == '_' || c == '-' {
                    c
                } else {
                    '_'
                }
            })
            .collect();
        format!("{PROMOTION_PREFIX}/{safe}.json")
    }

    /// Resolve which bucket's store holds this index's promotion file.
    /// Same rules as TrialJournal::bucket_for — follows IndexMeta.bucket,
    /// defaults to primary when metadata is missing.
    async fn store_for(&self, index_name: &str) -> Result<Arc<dyn object_store::ObjectStore>, String> {
        let bucket = self
            .index_registry
            .get(index_name)
            .await
            .map(|m| m.bucket)
            .unwrap_or_else(|| "primary".to_string());
        self.buckets.get(&bucket)
    }

    /// Load (and cache) the promotion file for an index.
    pub async fn load(&self, index_name: &str) -> Result<PromotionFile, String> {
        if let Some(cached) = self.cache.read().await.get(index_name) {
            return Ok(cached.clone());
        }
        let store = self.store_for(index_name).await?;
        let key = Self::key(index_name);
        let file = match ops::get(&store, &key).await {
            Ok(bytes) => serde_json::from_slice::<PromotionFile>(&bytes)
                .map_err(|e| format!("parse promotion file: {e}"))?,
            Err(_) => PromotionFile {
                index_name: index_name.to_string(),
                current: None,
                history: Vec::new(),
            },
        };
        self.cache.write().await.insert(index_name.to_string(), file.clone());
        Ok(file)
    }

    /// Promote a config to the active slot. Pushes the current promotion
    /// (if any) onto the history stack. Persists before returning — the
    /// config is durable by the time this call completes.
    pub async fn promote(
        &self,
        index_name: &str,
        entry: PromotionEntry,
    ) -> Result<PromotionFile, String> {
        let mut file = self.load(index_name).await?;
        if let Some(prior) = file.current.take() {
            file.history.push(prior);
            // Cap history to something sensible so this file doesn't grow
            // unbounded. 50 entries = 50 promotions — way more than any
            // sane workflow needs.
            const HISTORY_CAP: usize = 50;
            if file.history.len() > HISTORY_CAP {
                let drop = file.history.len() - HISTORY_CAP;
                file.history.drain(0..drop);
            }
        }
        file.current = Some(entry);
        file.index_name = index_name.to_string();
        let store = self.store_for(index_name).await?;
        let key = Self::key(index_name);
        let json = serde_json::to_vec_pretty(&file).map_err(|e| e.to_string())?;
        ops::put(&store, &key, json.into()).await?;
        self.cache.write().await.insert(index_name.to_string(), file.clone());
        tracing::info!(
            "promoted '{}' to config {:?} (trial={})",
            index_name,
            file.current.as_ref().unwrap().config,
            file.current.as_ref().unwrap().trial_id,
        );
        Ok(file)
    }

    /// Pop the latest promotion back onto the current slot (if any
    /// history exists). If current is set but history is empty, the
    /// current promotion is cleared — the index falls back to defaults.
    pub async fn rollback(&self, index_name: &str) -> Result<PromotionFile, String> {
        let mut file = self.load(index_name).await?;
        match file.history.pop() {
            Some(prev) => {
                file.current = Some(prev);
            }
            None => {
                if file.current.is_none() {
                    return Err(format!("no promotion to rollback for '{index_name}'"));
                }
                file.current = None;
            }
        }
        let store = self.store_for(index_name).await?;
        let key = Self::key(index_name);
        let json = serde_json::to_vec_pretty(&file).map_err(|e| e.to_string())?;
        ops::put(&store, &key, json.into()).await?;
        self.cache.write().await.insert(index_name.to_string(), file.clone());
        tracing::info!("rolled back promotion for '{}'", index_name);
        Ok(file)
    }

    /// Get the currently-promoted config (if any). Callers use this to
    /// pick the right HnswConfig at build time.
    pub async fn get_current(&self, index_name: &str) -> Option<PromotionEntry> {
        self.load(index_name).await.ok().and_then(|f| f.current)
    }

    /// Convenience: return the promoted config or the provided default.
    pub async fn config_or(&self, index_name: &str, default: HnswConfig) -> HnswConfig {
        match self.get_current(index_name).await {
            Some(entry) => entry.config,
            None => default,
        }
    }

    /// List every index that has a promotion recorded (for operator UI).
    ///
    /// Federation: scans EVERY registered bucket for promotion files.
    /// Per-profile buckets each have their own `_hnsw_promotions/` so we
    /// aggregate across them. Dedups by index_name — if the same index
    /// somehow has promotion files in multiple buckets, the one from the
    /// bucket recorded in IndexMeta wins.
    pub async fn list_all(&self) -> Result<Vec<PromotionFile>, String> {
        let bucket_infos = self.buckets.list().await;
        let mut by_name: HashMap<String, PromotionFile> = HashMap::new();
        for b in &bucket_infos {
            let store = match self.buckets.get(&b.name) {
                Ok(s) => s,
                Err(_) => continue,
            };
            let keys = ops::list(&store, Some(&format!("{PROMOTION_PREFIX}/")))
                .await
                .unwrap_or_default();
            for key in keys {
                if !key.ends_with(".json") {
                    continue;
                }
                let bytes = match ops::get(&store, &key).await {
                    Ok(b) => b,
                    Err(_) => continue,
                };
                if let Ok(f) = serde_json::from_slice::<PromotionFile>(&bytes) {
                    by_name.insert(f.index_name.clone(), f);
                }
            }
        }
        Ok(by_name.into_values().collect())
    }
}