lakehouse/crates/vectord/src/service.rs

use axum::{
    Json, Router,
    extract::{Path, Query, State},
    http::StatusCode,
    response::IntoResponse,
    routing::{get, post},
};
use object_store::ObjectStore;
use serde::{Deserialize, Serialize};
use std::sync::Arc;

use aibridge::client::{AiClient, EmbedRequest, GenerateRequest};
use catalogd::registry::Registry as CatalogRegistry;
use storaged::registry::BucketRegistry;
use crate::{agent, autotune, chunker, embedding_cache, harness, hnsw, index_registry, jobs, lance_backend, playbook_memory, promotion, rag, refresh, search, store, supervisor, trial};

#[derive(Clone)]
pub struct VectorState {
    pub store: Arc<dyn ObjectStore>,
    pub ai_client: AiClient,
    pub job_tracker: jobs::JobTracker,
    pub index_registry: index_registry::IndexRegistry,
    pub hnsw_store: hnsw::HnswStore,
    pub embedding_cache: embedding_cache::EmbeddingCache,
    pub trial_journal: trial::TrialJournal,
    /// Federation-aware harness store — resolves eval artifacts to each
    /// index's recorded bucket, falling back to primary for legacy evals.
    pub harness_store: harness::HarnessStore,
    /// Catalog registry — needed by the Phase C refresh path to mark/clear
    /// staleness and look up dataset manifests.
    pub catalog: CatalogRegistry,
    /// Phase 16: promoted HNSW configs. Activation + autotune read/write here.
    pub promotion_registry: promotion::PromotionRegistry,
    /// Phase 16.2: handle to the background autotune agent. Always
    /// present — if the agent is disabled in config, the handle drops
    /// incoming triggers silently.
    pub agent_handle: agent::AgentHandle,
    /// Phase B (federation layer 2): bucket registry for per-profile
    /// bucket auto-provisioning on activation.
    pub bucket_registry: Arc<BucketRegistry>,
    /// Phase C (two-profile VRAM gate): tracks which profile is currently
    /// "active" on the GPU. Singleton — one profile at a time holds its
    /// model in VRAM. Swapping profiles with different ollama_name unloads
    /// the previous one (keep_alive=0) before preloading the new one.
    ///
    /// `None` = no profile has been activated this session; any first
    /// activation just preloads and takes the slot.
    pub active_profile: Arc<tokio::sync::RwLock<Option<ActiveProfileSlot>>>,
    /// ADR-019 hybrid: handles to Lance datasets keyed by index name.
    /// Lazy-created on first /vectors/lance/* call.
    pub lance: lance_backend::LanceRegistry,
    /// Phase 19 — meta-index feedback. Embeds past successful_playbooks
    /// and, when `use_playbook_memory` is set on /vectors/hybrid, boosts
    /// workers that were actually filled in semantically-similar past ops.
    pub playbook_memory: playbook_memory::PlaybookMemory,
}

/// What the active-profile singleton records. Narrow — we don't need the
/// full ModelProfile here, just enough to know what to unload on swap.
#[derive(Debug, Clone, Serialize)]
pub struct ActiveProfileSlot {
    pub profile_id: String,
    pub ollama_name: String,
    pub activated_at: chrono::DateTime<chrono::Utc>,
}

pub fn router(state: VectorState) -> Router {
    Router::new()
        .route("/health", get(health))
        .route("/index", post(create_index))
        .route("/indexes", get(list_indexes))
        .route("/indexes/{name}", get(get_index_meta))
        .route("/indexes/{name}/bucket", axum::routing::patch(migrate_index_bucket))
        .route("/jobs", get(list_jobs))
        .route("/jobs/{id}", get(get_job))
        .route("/search", post(search_index))
        .route("/rag", post(rag_query))
        .route("/hybrid", post(hybrid_search))
        .route("/hnsw/build", post(build_hnsw))
        .route("/hnsw/search", post(search_hnsw))
        .route("/hnsw/list", get(list_hnsw))
        // Trial system — parameterized tuning loop
        .route("/hnsw/trial", post(run_trial))
        .route("/hnsw/trials/{index_name}", get(list_trials))
        .route("/hnsw/trials/{index_name}/best", get(best_trial))
        // Eval sets
        .route("/hnsw/evals", get(list_evals))
        .route("/hnsw/evals/{name}", get(get_eval).put(put_eval))
        .route("/hnsw/evals/{name}/autogen", post(autogen_eval))
        // Cache management
        .route("/hnsw/cache/stats", get(cache_stats))
        .route("/hnsw/cache/{index_name}", axum::routing::delete(cache_evict))
        // Phase C: embedding refresh
        .route("/refresh/{dataset_name}", post(refresh_dataset))
        .route("/stale", get(list_stale))
        // Phase 17: profile activation — pre-load caches + HNSW for this
        // model's bound data. First search after activate is warm.
        .route("/profile/{id}/activate", post(activate_profile))
        .route("/profile/{id}/deactivate", post(deactivate_profile))
        .route("/profile/{id}/search", post(profile_scoped_search))
        // Phase 17 VRAM gate: which profile currently owns the GPU?
        .route("/profile/active", get(get_active_profile))
        // Phase 16: promotion + autotune
        .route("/hnsw/promote/{index}/{trial_id}", post(promote_trial))
        .route("/hnsw/rollback/{index}", post(rollback_promotion))
        .route("/hnsw/promoted/{index}", get(get_promoted))
        .route("/hnsw/autotune", post(run_autotune_endpoint))
        // Phase 16.2: background autotune agent
        .route("/agent/status", get(agent_status))
        .route("/agent/stop", post(agent_stop))
        .route("/agent/enqueue/{index_name}", post(agent_enqueue))
        // ADR-019: Lance hybrid backend
        .route("/lance/migrate/{index_name}", post(lance_migrate))
        .route("/lance/index/{index_name}", post(lance_build_index))
        .route("/lance/search/{index_name}", post(lance_search))
        .route("/lance/doc/{index_name}/{doc_id}", get(lance_get_doc))
        .route("/lance/append/{index_name}", post(lance_append))
        .route("/lance/stats/{index_name}", get(lance_stats))
        .route("/lance/scalar-index/{index_name}/{column}", post(lance_build_scalar_index))
        .route("/lance/recall/{index_name}", post(lance_recall_harness))
        // Phase 19: playbook memory — the meta-index feedback loop
        .route("/playbook_memory/rebuild", post(rebuild_playbook_memory))
        .route("/playbook_memory/stats", get(playbook_memory_stats))
        .route("/playbook_memory/seed", post(seed_playbook_memory))
        .route("/playbook_memory/persist_sql", post(persist_playbook_memory_sql))
        .route("/playbook_memory/patterns", post(discover_playbook_patterns))
        .route("/playbook_memory/mark_failed", post(mark_playbook_failed))
        .route("/playbook_memory/retire", post(retire_playbook_memory))
        .route("/playbook_memory/revise", post(revise_playbook_memory))
        .route("/playbook_memory/history/{id}", get(playbook_memory_history))
        .route("/playbook_memory/status", get(playbook_memory_status))
        .with_state(state)
}

async fn health() -> &'static str {
    "vectord ok"
}

// --- Background Index Creation ---

#[derive(Deserialize)]
struct CreateIndexRequest {
    index_name: String,
    source: String,
    documents: Vec<DocInput>,
    chunk_size: Option<usize>,
    overlap: Option<usize>,
    /// Federation layer 2: optional bucket to hold this index's trial
    /// journal + promotion file. Defaults to "primary" — pre-existing
    /// clients that don't know about federation keep working unchanged.
    #[serde(default)]
    bucket: Option<String>,
}

#[derive(Deserialize)]
struct DocInput {
    id: String,
    text: String,
}

#[derive(Serialize)]
struct CreateIndexResponse {
    job_id: String,
    index_name: String,
    documents: usize,
    chunks: usize,
    message: String,
}

async fn create_index(
    State(state): State<VectorState>,
    Json(req): Json<CreateIndexRequest>,
) -> impl IntoResponse {
    let chunk_size = req.chunk_size.unwrap_or(500);
    let overlap = req.overlap.unwrap_or(50);

    // Chunk synchronously (fast)
    let doc_ids: Vec<String> = req.documents.iter().map(|d| d.id.clone()).collect();
    let texts: Vec<String> = req.documents.iter().map(|d| d.text.clone()).collect();
    let chunks = chunker::chunk_column(&req.source, &doc_ids, &texts, chunk_size, overlap);

    if chunks.is_empty() {
        return Err((StatusCode::BAD_REQUEST, "no text to index".to_string()));
    }

    let n_docs = req.documents.len();
    let n_chunks = chunks.len();
    let index_name = req.index_name.clone();
    let bucket = req.bucket.clone().unwrap_or_else(|| "primary".to_string());

    // Create job and return immediately
    let job_id = state.job_tracker.create(&index_name, n_chunks).await;
    tracing::info!("job {job_id}: indexing '{}' — {} docs → {} chunks (background)", index_name, n_docs, n_chunks);

    // Spawn supervised dual-pipeline embedding
    let tracker = state.job_tracker.clone();
    let ai_client = state.ai_client.clone();
    let obj_store = state.store.clone();
    let registry = state.index_registry.clone();
    let jid = job_id.clone();
    let source_name = req.source.clone();
    let idx_name = req.index_name.clone();

    tokio::spawn(async move {
        let start_time = std::time::Instant::now();
        let config = supervisor::SupervisorConfig::default();
        let result = supervisor::run_supervised(
            &jid, &idx_name, chunks, &ai_client, &obj_store, &tracker, config,
        ).await;
        match result {
            Ok(key) => {
                let elapsed = start_time.elapsed().as_secs_f32();
                let rate = if elapsed > 0.0 { n_chunks as f32 / elapsed } else { 0.0 };

                // Register index metadata with model version info
                let meta = index_registry::IndexMeta {
                    index_name: idx_name.clone(),
                    source: source_name,
                    model_name: "nomic-embed-text".to_string(), // from sidecar config
                    model_version: "latest".to_string(),
                    dimensions: 768,
                    chunk_count: n_chunks,
                    doc_count: n_docs,
                    chunk_size: chunk_size,
                    overlap: overlap,
                    storage_key: key.clone(),
                    created_at: chrono::Utc::now(),
                    build_time_secs: elapsed,
                    chunks_per_sec: rate,
                    bucket: bucket.clone(),
                    vector_backend: shared::types::VectorBackend::Parquet,
        id_prefix: None,
                };
                let _ = registry.register(meta).await;

                tracker.complete(&jid, key).await;
                tracing::info!("job {jid}: completed — {n_chunks} chunks in {elapsed:.0}s ({rate:.0}/sec)");
            }
            Err(e) => {
                tracker.fail(&jid, e.clone()).await;
                tracing::error!("job {jid}: failed — {e}");
            }
        }
    });

    Ok((StatusCode::ACCEPTED, Json(CreateIndexResponse {
        job_id,
        index_name: req.index_name,
        documents: n_docs,
        chunks: n_chunks,
        message: format!("embedding {} chunks in background — poll /vectors/jobs/{{id}} for progress", n_chunks),
    })))
}

// --- Index Registry ---

#[derive(Deserialize)]
struct IndexListQuery {
    source: Option<String>,
    model: Option<String>,
}

async fn list_indexes(
    State(state): State<VectorState>,
    Query(q): Query<IndexListQuery>,
) -> impl IntoResponse {
    let indexes = state.index_registry.list(q.source.as_deref(), q.model.as_deref()).await;
    Json(indexes)
}

async fn get_index_meta(
    State(state): State<VectorState>,
    Path(name): Path<String>,
) -> impl IntoResponse {
    match state.index_registry.get(&name).await {
        Some(meta) => Ok(Json(meta)),
        None => Err((StatusCode::NOT_FOUND, format!("index not found: {name}"))),
    }
}

#[derive(Deserialize)]
struct MigrateBucketRequest {
    dest_bucket: String,
    /// If true, delete artifacts from the source bucket after the pointer
    /// flip. Default false — keeping source copies means a failed migration
    /// is recoverable by editing IndexMeta.bucket back, and a successful
    /// migration leaves inspectable forensics until an operator sweeps.
    #[serde(default)]
    delete_source: bool,
}

#[derive(Serialize)]
struct MigrateBucketReport {
    index_name: String,
    source_bucket: String,
    dest_bucket: String,
    /// Artifact keys that were copied (or attempted). Order follows copy order.
    copied: Vec<String>,
    /// Artifact prefixes that had nothing to copy (optional files missing,
    /// trial journal empty, etc).
    skipped: Vec<String>,
    /// Subset of `copied` that was subsequently deleted from the source.
    deleted_source: Vec<String>,
    duration_secs: f32,
}

/// Move an index's artifacts from its current bucket to `dest_bucket`.
/// Parquet-backed indexes only — Lance migration needs URI rewriting that
/// isn't in scope for this endpoint. Copies the vector data, trial journal,
/// promotion file, and auto-generated harness; updates `IndexMeta.bucket`
/// last so a mid-flight failure leaves the index still usable at its
/// original location. Evicts the `EmbeddingCache` entry so the next load
/// re-reads from the new bucket.
async fn migrate_index_bucket(
    State(state): State<VectorState>,
    Path(name): Path<String>,
    Json(req): Json<MigrateBucketRequest>,
) -> Result<Json<MigrateBucketReport>, (StatusCode, String)> {
    let t0 = std::time::Instant::now();

    let mut meta = state
        .index_registry
        .get(&name)
        .await
        .ok_or_else(|| (StatusCode::NOT_FOUND, format!("index '{name}' not found")))?;

    if meta.vector_backend == shared::types::VectorBackend::Lance {
        return Err((
            StatusCode::BAD_REQUEST,
            "Lance-backed indexes cannot be migrated via this endpoint — \
             Lance URIs are bucket-specific; a separate migrate_lance tool \
             is needed".into(),
        ));
    }

    if !state.bucket_registry.contains(&req.dest_bucket) {
        return Err((
            StatusCode::BAD_REQUEST,
            format!("dest bucket '{}' not registered", req.dest_bucket),
        ));
    }

    let source_bucket = meta.bucket.clone();
    if source_bucket == req.dest_bucket {
        return Err((
            StatusCode::BAD_REQUEST,
            format!("source and dest are both '{source_bucket}' — nothing to migrate"),
        ));
    }

    let src = state
        .bucket_registry
        .get(&source_bucket)
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?;
    let dst = state
        .bucket_registry
        .get(&req.dest_bucket)
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?;

    let mut copied: Vec<String> = Vec::new();
    let mut skipped: Vec<String> = Vec::new();

    // 1. Vector data (single parquet file for this backend).
    copy_key(&src, &dst, &meta.storage_key)
        .await
        .map_err(|e| {
            (StatusCode::INTERNAL_SERVER_ERROR,
             format!("copy {}: {e}", meta.storage_key))
        })?;
    copied.push(meta.storage_key.clone());

    // 2. Trial journal batches — per-index directory of JSONL files.
    let trial_prefix = format!("_hnsw_trials/{name}/");
    let trial_keys = storaged::ops::list(&src, Some(&trial_prefix))
        .await
        .unwrap_or_default();
    if trial_keys.is_empty() {
        skipped.push(trial_prefix);
    }
    for k in &trial_keys {
        copy_key(&src, &dst, k)
            .await
            .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("copy {k}: {e}")))?;
        copied.push(k.clone());
    }

    // 3. Promotion file (optional — absent for never-promoted indexes).
    let promo_key = format!("_hnsw_promotions/{name}.json");
    match copy_key(&src, &dst, &promo_key).await {
        Ok(()) => copied.push(promo_key),
        Err(_) => skipped.push(promo_key),
    }

    // 4. Auto-generated harness (optional — absent if agent never ran).
    let harness_key = format!("_hnsw_evals/{name}_auto.json");
    match copy_key(&src, &dst, &harness_key).await {
        Ok(()) => copied.push(harness_key),
        Err(_) => skipped.push(harness_key),
    }

    // 5. Pointer flip — IndexMeta.bucket now points at destination. This
    // is the commit point; earlier failures leave copies in dest but the
    // index still usable at source.
    meta.bucket = req.dest_bucket.clone();
    state
        .index_registry
        .register(meta)
        .await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("update meta: {e}")))?;

    // 6. Cache eviction — next load reads the new bucket's parquet.
    state.embedding_cache.evict(&name).await;

    // 7. Optional source cleanup.
    let mut deleted_source: Vec<String> = Vec::new();
    if req.delete_source {
        for k in &copied {
            if storaged::ops::delete(&src, k).await.is_ok() {
                deleted_source.push(k.clone());
            }
        }
    }

    Ok(Json(MigrateBucketReport {
        index_name: name,
        source_bucket,
        dest_bucket: req.dest_bucket,
        copied,
        skipped,
        deleted_source,
        duration_secs: t0.elapsed().as_secs_f32(),
    }))
}

/// Stream a single object from one bucket to another. Uses the existing
/// `storaged::ops` get + put primitives — no native copy in object_store
/// across heterogeneous backends (local ↔ S3), so an in-memory hop is
/// unavoidable. Bounded by individual object size, which for our parquet
/// + jsonl artifacts tops out around a few hundred MB.
async fn copy_key(
    src: &Arc<dyn ObjectStore>,
    dst: &Arc<dyn ObjectStore>,
    key: &str,
) -> Result<(), String> {
    let data = storaged::ops::get(src, key).await?;
    storaged::ops::put(dst, key, data).await
}

// --- unused legacy function below, kept for reference ---

#[allow(dead_code)]
/// Legacy single-pipeline embedding (replaced by supervisor).
async fn _run_embedding_job_legacy(
    job_id: &str,
    index_name: &str,
    chunks: &[chunker::TextChunk],
    ai_client: &AiClient,
    store: &Arc<dyn ObjectStore>,
    tracker: &jobs::JobTracker,
) -> Result<String, String> {
    let batch_size = 32;
    let mut all_vectors: Vec<Vec<f64>> = Vec::new();
    let start = std::time::Instant::now();

    for (i, batch) in chunks.chunks(batch_size).enumerate() {
        let texts: Vec<String> = batch.iter().map(|c| c.text.clone()).collect();

        let embed_resp = ai_client.embed(EmbedRequest {
            texts,
            model: None,
        }).await.map_err(|e| format!("embed batch {} error: {e}", i))?;

        all_vectors.extend(embed_resp.embeddings);

        // Update progress
        let elapsed = start.elapsed().as_secs_f32();
        let rate = if elapsed > 0.0 { all_vectors.len() as f32 / elapsed } else { 0.0 };
        tracker.update_progress(job_id, all_vectors.len(), rate).await;

        // Log every 100 batches
        if (i + 1) % 100 == 0 {
            let pct = (all_vectors.len() as f32 / chunks.len() as f32) * 100.0;
            let eta = if rate > 0.0 { (chunks.len() - all_vectors.len()) as f32 / rate } else { 0.0 };
            tracing::info!("job {job_id}: {}/{} chunks ({pct:.0}%), {rate:.0}/sec, ETA {eta:.0}s",
                all_vectors.len(), chunks.len());
        }
    }

    // Store
    let key = store::store_embeddings(store, index_name, chunks, &all_vectors).await?;
    Ok(key)
}

// --- Job Status ---

async fn list_jobs(State(state): State<VectorState>) -> impl IntoResponse {
    let jobs = state.job_tracker.list().await;
    Json(jobs)
}

async fn get_job(
    State(state): State<VectorState>,
    Path(id): Path<String>,
) -> impl IntoResponse {
    match state.job_tracker.get(&id).await {
        Some(job) => Ok(Json(job)),
        None => Err((StatusCode::NOT_FOUND, format!("job not found: {id}"))),
    }
}

// --- Search ---

#[derive(Deserialize)]
struct SearchRequest {
    index_name: String,
    query: String,
    top_k: Option<usize>,
}

#[derive(Serialize)]
struct SearchResponse {
    results: Vec<search::SearchResult>,
    query: String,
}

async fn search_index(
    State(state): State<VectorState>,
    Json(req): Json<SearchRequest>,
) -> impl IntoResponse {
    let top_k = req.top_k.unwrap_or(5);

    let embed_resp = state.ai_client.embed(EmbedRequest {
        texts: vec![req.query.clone()],
        model: None,
    }).await.map_err(|e| (StatusCode::BAD_GATEWAY, format!("embed error: {e}")))?;

    if embed_resp.embeddings.is_empty() {
        return Err((StatusCode::BAD_GATEWAY, "no embedding returned".to_string()));
    }

    let query_vec: Vec<f32> = embed_resp.embeddings[0].iter().map(|&x| x as f32).collect();

    let embeddings = store::load_embeddings(&state.store, &req.index_name)
        .await
        .map_err(|e| (StatusCode::NOT_FOUND, format!("index not found: {e}")))?;

    let results = search::search(&query_vec, &embeddings, top_k);

    Ok(Json(SearchResponse {
        results,
        query: req.query,
    }))
}

// --- RAG ---

#[derive(Deserialize)]
struct RagRequest {
    index_name: String,
    question: String,
    top_k: Option<usize>,
}

async fn rag_query(
    State(state): State<VectorState>,
    Json(req): Json<RagRequest>,
) -> impl IntoResponse {
    let top_k = req.top_k.unwrap_or(5);

    match rag::query(&req.question, &req.index_name, top_k, &state.store, &state.ai_client).await {
        Ok(resp) => Ok(Json(resp)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

// --- Hybrid SQL+Vector Search ---
//
// The fix for the core RAG gap: vector search alone can't do structured
// filtering (state, role, reliability threshold). SQL alone can't do
// semantic similarity ("who could handle this kind of work"). Hybrid
// does both: SQL narrows to structurally-valid candidates, vector
// ranks them by semantic relevance, LLM generates from verified context.

#[derive(Deserialize)]
struct HybridRequest {
    /// Natural language question — used for embedding + LLM generation.
    question: String,
    /// Vector index to search against.
    index_name: String,
    /// SQL WHERE clause to pre-filter. Applied against the index's source
    /// dataset. Example: "state = 'IL' AND reliability > 0.8"
    /// Safety: runs through DataFusion's parser so injection is bounded
    /// by what DataFusion accepts (no DDL, no writes).
    #[serde(default)]
    sql_filter: Option<String>,
    /// Dataset to run the SQL filter against. Defaults to the index's
    /// source if omitted.
    #[serde(default)]
    filter_dataset: Option<String>,
    /// Column in the SQL result that maps to the vector index's doc_id.
    /// Default: "worker_id" (for the Ethereal dataset) or "candidate_id".
    #[serde(default)]
    id_column: Option<String>,
    #[serde(default = "default_top_k")]
    top_k: usize,
    /// If true, generate an LLM answer from the matched context.
    /// If false, just return the ranked matches (faster, no Ollama gen).
    #[serde(default = "default_true")]
    generate: bool,
    /// Phase 19: consult `playbook_memory` and boost workers that past
    /// similar playbooks successfully filled. Off by default so current
    /// callers keep deterministic ranking; opt-in unlocks the feedback.
    #[serde(default)]
    use_playbook_memory: bool,
    /// Number of past playbooks to consider when `use_playbook_memory`
    /// is on. Ignored otherwise. Defaults to 5.
    #[serde(default)]
    playbook_memory_k: Option<usize>,
}

fn default_true() -> bool { true }

#[derive(serde::Serialize)]
struct HybridResponse {
    question: String,
    sql_filter: Option<String>,
    sql_matches: usize,
    vector_reranked: usize,
    method: String,
    answer: Option<String>,
    sources: Vec<HybridSource>,
    duration_ms: u64,
}

#[derive(serde::Serialize)]
struct HybridSource {
    doc_id: String,
    chunk_text: String,
    score: f32,
    sql_verified: bool,
    /// Phase 19: how much the playbook_memory boost lifted this hit's
    /// score. 0.0 when `use_playbook_memory=false` or no past playbook
    /// endorsed this worker.
    #[serde(default, skip_serializing_if = "is_zero")]
    playbook_boost: f32,
    /// playbook_ids whose endorsement contributed to `playbook_boost`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    playbook_citations: Vec<String>,
}

fn is_zero(x: &f32) -> bool { x.abs() < 1e-6 }

async fn hybrid_search(
    State(state): State<VectorState>,
    Json(req): Json<HybridRequest>,
) -> impl IntoResponse {
    let t0 = std::time::Instant::now();

    // Step 1: If SQL filter provided, run it to get the set of valid IDs.
    let valid_ids: Option<std::collections::HashSet<String>> = if let Some(ref filter) = req.sql_filter {
        let index_meta = state.index_registry.get(&req.index_name).await;
        let dataset = req.filter_dataset.clone()
            .or_else(|| index_meta.map(|m| m.source.clone()))
            .unwrap_or_else(|| req.index_name.clone());
        let id_col = req.id_column.clone().unwrap_or_else(|| "worker_id".into());

        let sql = format!("SELECT CAST({id_col} AS VARCHAR) AS id FROM {dataset} WHERE {filter}");
        tracing::info!("hybrid: SQL filter → {sql}");

        // Use queryd through the catalog — same engine as /query/sql
        // Use the query engine to get JSON rows — avoids Arrow type
        // wrangling across DataFusion's Utf8View/StringViewArray variants.
        let engine = queryd::context::QueryEngine::new(
            state.catalog.clone(),
            state.bucket_registry.clone(),
            queryd::cache::MemCache::new(0),
        );
        match engine.query(&sql).await {
            Ok(batches) => {
                use arrow::array::{Array, AsArray};
                let mut ids = std::collections::HashSet::new();
                for batch in &batches {
                    if let Some(col) = batch.column_by_name("id") {
                        // DataFusion CAST(x AS VARCHAR) → StringViewArray.
                        // Try StringView first, then String, then Int.
                        if let Some(arr) = col.as_string_view_opt() {
                            for i in 0..arr.len() {
                                if !arr.is_null(i) { ids.insert(arr.value(i).to_string()); }
                            }
                        } else if let Some(arr) = col.as_string_opt::<i32>() {
                            for i in 0..arr.len() {
                                if !arr.is_null(i) { ids.insert(arr.value(i).to_string()); }
                            }
                        } else {
                            // Fallback: try as Int32/Int64 (if CAST didn't happen)
                            if let Some(arr) = col.as_any().downcast_ref::<arrow::array::Int32Array>() {
                                for i in 0..arr.len() {
                                    if !arr.is_null(i) { ids.insert(arr.value(i).to_string()); }
                                }
                            } else if let Some(arr) = col.as_any().downcast_ref::<arrow::array::Int64Array>() {
                                for i in 0..arr.len() {
                                    if !arr.is_null(i) { ids.insert(arr.value(i).to_string()); }
                                }
                            }
                        }
                    }
                }
                tracing::info!("hybrid: SQL filter returned {} IDs", ids.len());
                if ids.is_empty() { None } else { Some(ids) }
            }
            Err(e) => {
                return Err((StatusCode::BAD_REQUEST, format!("SQL filter error: {e}")));
            }
        }
    } else {
        None
    };

    // Step 2: Vector search — embed question, search index.
    let embed_resp = state.ai_client
        .embed(EmbedRequest { texts: vec![req.question.clone()], model: None })
        .await
        .map_err(|e| (StatusCode::BAD_GATEWAY, format!("embed: {e}")))?;
    if embed_resp.embeddings.is_empty() {
        return Err((StatusCode::BAD_GATEWAY, "no embedding".into()));
    }
    let qv: Vec<f32> = embed_resp.embeddings[0].iter().map(|&x| x as f32).collect();

    // When SQL-filtered: use brute-force cosine over all embeddings,
    // then filter by SQL IDs, then take top_k. HNSW's ef_search caps
    // results at ~30, which is too few to reliably intersect with
    // narrow SQL filters. Brute-force on 10K vectors is ~50ms — fast
    // enough for the hybrid path. Without SQL filter, use HNSW normally.
    let all_results = if valid_ids.is_some() {
        // Brute-force path: score ALL vectors, filter by SQL IDs later.
        let embeddings = store::load_embeddings(&state.store, &req.index_name).await
            .map_err(|e| (StatusCode::NOT_FOUND, format!("load embeddings: {e}")))?;
        search::search(&qv, &embeddings, embeddings.len()) // score everything
    } else if state.hnsw_store.has_index(&req.index_name).await {
        state.hnsw_store.search(&req.index_name, &qv, req.top_k).await
            .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?
            .into_iter()
            .map(|h| search::SearchResult {
                doc_id: h.doc_id,
                chunk_text: h.chunk_text,
                score: h.score,
                source: h.source,
                chunk_idx: h.chunk_idx as u32,
            })
            .collect::<Vec<_>>()
    } else {
        let embeddings = store::load_embeddings(&state.store, &req.index_name).await
            .map_err(|e| (StatusCode::NOT_FOUND, format!("load embeddings: {e}")))?;
        search::search(&qv, &embeddings, req.top_k)
    };

    // Step 3: Filter vector results to only SQL-verified IDs.
    // ADR-020: read the index's id_prefix from the catalog instead of
    // hardcoding prefix stripping. Falls back to heuristic for legacy indexes.
    let id_prefix: Option<String> = state.index_registry
        .get(&req.index_name).await
        .and_then(|m| m.id_prefix.clone());

    let sql_count = valid_ids.as_ref().map(|s| s.len()).unwrap_or(0);
    // Phase 19: when playbook_memory is consulted, pull a wider candidate
    // pool so endorsed workers outside the vanilla top-K can still be
    // boosted into visibility. 5× is a conservative multiplier — plenty
    // for a +0.25 boost to flip rankings without dragging the cost up.
    let fetch_k = if req.use_playbook_memory { req.top_k * 5 } else { req.top_k };
    let filtered: Vec<search::SearchResult> = if let Some(ref ids) = valid_ids {
        all_results.into_iter()
            .filter(|r| {
                let raw_id = if let Some(ref prefix) = id_prefix {
                    r.doc_id.strip_prefix(prefix.as_str()).unwrap_or(&r.doc_id)
                } else {
                    // Legacy: heuristic strip for pre-ADR-020 indexes
                    r.doc_id.strip_prefix("W500K-")
                        .or_else(|| r.doc_id.strip_prefix("W500-"))
                        .or_else(|| r.doc_id.strip_prefix("W5K-"))
                        .or_else(|| r.doc_id.strip_prefix("W-"))
                        .or_else(|| r.doc_id.strip_prefix("CAND-"))
                        .unwrap_or(&r.doc_id)
                };
                ids.contains(raw_id)
            })
            .take(fetch_k)
            .collect()
    } else {
        all_results.into_iter().take(fetch_k).collect()
    };

    // Step 4: Build sources with SQL-verified flag.
    let mut sources: Vec<HybridSource> = filtered.iter().map(|r| HybridSource {
        doc_id: r.doc_id.clone(),
        chunk_text: r.chunk_text.clone(),
        score: r.score,
        sql_verified: valid_ids.is_some(),
        playbook_boost: 0.0,
        playbook_citations: Vec::new(),
    }).collect();

    // Step 4b (Phase 19): if use_playbook_memory, look up semantically
    // similar past playbooks and boost workers they endorsed. Name-match
    // is on the tuple (city, state, name) extracted from chunk_text —
    // hybrid_search's SQL filter already narrowed to one city+state, so
    // this just needs to check the name against each playbook's endorsed
    // set. Additive boost on the existing vector score, then re-sort.
    if req.use_playbook_memory {
        let boost_k = req.playbook_memory_k.unwrap_or(playbook_memory::DEFAULT_TOP_K_PLAYBOOKS);
        // Extract target (city, state, role) from the SQL filter so
        // compute_boost_for can skip playbooks from other cities AND
        // prioritize exact role matches via the multi-strategy path.
        // The executor's filter shape is stable:
        //   `... role = 'Welder' AND city = 'Toledo' AND state = 'OH' ...`.
        // Case-insensitive match, tolerant of single quotes.
        let target_geo = req.sql_filter.as_deref().and_then(extract_target_geo);
        let target_role = req.sql_filter.as_deref().and_then(extract_target_role);
        // We embedded the question as `qv` above — reuse it for the
        // playbook similarity lookup so we don't double-pay Ollama.
        let boosts = state.playbook_memory
            .compute_boost_for_filtered_with_role(
                &qv,
                boost_k,
                0.5,
                target_geo.as_ref().map(|(c, s)| (c.as_str(), s.as_str())),
                target_role.as_deref(),
            )
            .await;

        // Diagnostics for Phase 19 boost pipeline. Logged so item 3
        // investigation has ground truth:
        //   - boosts.len(): how many (city,state,name) keys surfaced for
        //     this query (0 = playbook_memory found nothing semantically
        //     similar to the question).
        //   - parsed: how many candidate chunks parsed cleanly into
        //     (name,city,state) via parse_worker_chunk.
        //   - matched: how many parsed keys matched an entry in boosts.
        // 2026-04-21 — 20-scenario batch showed 34/40 ok combos never
        // got a citation. These counters pin whether the gap is on the
        // SIMILARITY side (boosts empty) or the MATCH side (parsed vs
        // boosted keys mismatch — e.g. name format drift).
        let mut parsed_count = 0usize;
        let mut matched_count = 0usize;
        for src in sources.iter_mut() {
            // Parse "{Name} — {Role} in {City}, {State}. …" chunk. Being
            // defensive: chunks from other datasets may not follow this
            // exact shape, so absent fields just skip the boost.
            if let Some((name, city, state)) = parse_worker_chunk(&src.chunk_text) {
                parsed_count += 1;
                let key = (city, state, name);
                if let Some(entry) = boosts.get(&key) {
                    src.score += entry.boost;
                    src.playbook_boost = entry.boost;
                    src.playbook_citations = entry.citations.clone();
                    matched_count += 1;
                }
            }
        }
        tracing::info!(
            "playbook_boost: boosts={} sources={} parsed={} matched={} target_geo={:?} target_role={:?} (query='{}')",
            boosts.len(),
            sources.len(),
            parsed_count,
            matched_count,
            target_geo,
            target_role,
            req.question.chars().take(60).collect::<String>(),
        );
        // Re-rank: boosted scores can flip ordering.
        sources.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));
        // Finally trim to the caller's requested top_k — we pulled fetch_k
        // (5× wider) above specifically so the boost could reach workers
        // that would otherwise have been trimmed pre-boost.
        sources.truncate(req.top_k);
    }

    // Step 5: Generate answer if requested.
    let answer = if req.generate && !sources.is_empty() {
        let context: String = sources.iter().enumerate().map(|(i, s)| {
            format!("[{}] (id: {}, verified: {}) {}", i + 1, s.doc_id, s.sql_verified, s.chunk_text)
        }).collect::<Vec<_>>().join("\n\n");

        let gen_resp = state.ai_client.generate(GenerateRequest {
            prompt: format!(
                "You are a staffing intelligence assistant. Answer based ONLY on these \
                 verified worker records. Every record has been SQL-verified against the \
                 database — you can trust the facts in them. Be specific: cite names, \
                 skills, certifications, scores, and locations.\n\n\
                 Records:\n{context}\n\n\
                 Question: {}\n\nAnswer:", req.question,
            ),
            model: None,
            system: None,
            temperature: Some(0.2),
            max_tokens: Some(512),
            think: None,
        }).await;

        gen_resp.ok().map(|r| r.text.trim().to_string())
    } else {
        None
    };

    let method = if valid_ids.is_some() { "hybrid_sql_vector" } else { "vector_only" };

    Ok(Json(HybridResponse {
        question: req.question,
        sql_filter: req.sql_filter,
        sql_matches: sql_count,
        vector_reranked: sources.len(),
        method: method.into(),
        answer,
        sources,
        duration_ms: t0.elapsed().as_millis() as u64,
    }))
}

// --- HNSW Fast Search ---

#[derive(Deserialize)]
struct BuildHnswRequest {
    /// Name of the stored vector index to build HNSW from
    index_name: String,
    /// Optional config override. Omit to use the production default
    /// (ec=80 es=30 — see HnswConfig::default docs for rationale).
    #[serde(default)]
    config: Option<trial::HnswConfig>,
}

/// Build an HNSW index from an existing stored vector index.
/// Uses the embedding cache so repeated builds don't reload from Parquet.
async fn build_hnsw(
    State(state): State<VectorState>,
    Json(req): Json<BuildHnswRequest>,
) -> impl IntoResponse {
    let config = req.config.unwrap_or_default();
    tracing::info!(
        "building HNSW for '{}' ef_construction={} ef_search={}",
        req.index_name, config.ef_construction, config.ef_search,
    );

    let embeddings = state
        .embedding_cache
        .get_or_load(&req.index_name)
        .await
        .map_err(|e| (StatusCode::NOT_FOUND, format!("index not found: {e}")))?;

    match state
        .hnsw_store
        .build_index_with_config(&req.index_name, (*embeddings).clone(), &config)
        .await
    {
        Ok(stats) => Ok(Json(stats)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

#[derive(Deserialize)]
struct HnswSearchRequest {
    index_name: String,
    query: String,
    top_k: Option<usize>,
}

/// Search using HNSW — approximate nearest neighbors, much faster than brute-force.
async fn search_hnsw(
    State(state): State<VectorState>,
    Json(req): Json<HnswSearchRequest>,
) -> impl IntoResponse {
    let top_k = req.top_k.unwrap_or(5);

    // Embed query
    let embed_resp = state.ai_client.embed(EmbedRequest {
        texts: vec![req.query.clone()],
        model: None,
    }).await.map_err(|e| (StatusCode::BAD_GATEWAY, format!("embed error: {e}")))?;

    if embed_resp.embeddings.is_empty() {
        return Err((StatusCode::BAD_GATEWAY, "no embedding returned".to_string()));
    }

    let query_vec: Vec<f32> = embed_resp.embeddings[0].iter().map(|&x| x as f32).collect();

    // Search HNSW
    match state.hnsw_store.search(&req.index_name, &query_vec, top_k).await {
        Ok(results) => Ok(Json(serde_json::json!({
            "results": results,
            "query": req.query,
            "method": "hnsw",
        }))),
        Err(e) => Err((StatusCode::NOT_FOUND, e)),
    }
}

async fn list_hnsw(State(state): State<VectorState>) -> impl IntoResponse {
    Json(state.hnsw_store.list().await)
}

// --- Trial System: parameterized HNSW tuning loop ---
//
// Flow:
//   1. Agent picks an HnswConfig
//   2. POST /hnsw/trial builds HNSW with that config against cached embeddings,
//      runs every query in the harness, measures latency + recall vs the
//      harness's ground truth, appends a Trial record to _hnsw_trials/{idx}.jsonl
//   3. Agent reads GET /hnsw/trials/{index}, sees history, decides next config
//   4. Repeat until converged.
//
// The first trial triggers embedding load (slow). Every subsequent trial reuses
// the cache — so the agent iterates in seconds, not minutes.

#[derive(Deserialize)]
struct TrialRequest {
    index_name: String,
    harness: String,
    #[serde(default)]
    config: trial::HnswConfig,
    #[serde(default)]
    note: Option<String>,
}

async fn run_trial(
    State(state): State<VectorState>,
    Json(req): Json<TrialRequest>,
) -> Result<Json<trial::Trial>, (StatusCode, String)> {
    let mut harness_set = state.harness_store.load_for_index(&req.index_name, &req.harness)
        .await
        .map_err(|e| (StatusCode::NOT_FOUND, format!("harness not found: {e}")))?;

    if harness_set.index_name != req.index_name {
        return Err((
            StatusCode::BAD_REQUEST,
            format!(
                "harness '{}' is for index '{}', not '{}'",
                req.harness, harness_set.index_name, req.index_name
            ),
        ));
    }
    if harness_set.queries.is_empty() {
        return Err((StatusCode::BAD_REQUEST, "harness has no queries".into()));
    }

    let embeddings = state
        .embedding_cache
        .get_or_load(&req.index_name)
        .await
        .map_err(|e| (StatusCode::NOT_FOUND, format!("load embeddings: {e}")))?;

    if !harness_set.ground_truth_built {
        tracing::info!("trial: computing ground truth for harness '{}'", harness_set.name);
        let t0 = std::time::Instant::now();
        harness::compute_ground_truth(&mut harness_set, &embeddings, &state.ai_client)
            .await
            .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("ground truth: {e}")))?;
        tracing::info!("trial: ground truth built in {:.1}s", t0.elapsed().as_secs_f32());
        state.harness_store
            .save(&harness_set)
            .await
            .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("save harness: {e}")))?;
    }

    let trial_id = trial::Trial::new_id();
    let hnsw_slot = format!("{}__{}", req.index_name, trial_id);

    let build_stats = state
        .hnsw_store
        .build_index_with_config(&hnsw_slot, (*embeddings).clone(), &req.config)
        .await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("build: {e}")))?;

    let query_vectors: Vec<Vec<f32>> = harness_set
        .queries
        .iter()
        .filter_map(|q| q.query_embedding.clone())
        .collect();
    let bench = state
        .hnsw_store
        .bench_search(&hnsw_slot, &query_vectors, harness_set.k)
        .await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("search: {e}")))?;

    let mut recalls = Vec::with_capacity(harness_set.queries.len());
    for (q, hits) in harness_set.queries.iter().zip(bench.retrieved.iter()) {
        if let Some(gt) = &q.ground_truth {
            recalls.push(harness::recall_at_k(hits, gt, harness_set.k));
        }
    }
    let mean_recall = if recalls.is_empty() {
        0.0
    } else {
        recalls.iter().sum::<f32>() / recalls.len() as f32
    };

    let mut lats = bench.latencies_us.clone();
    lats.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let p = |pct: f32| -> f32 {
        if lats.is_empty() { return 0.0; }
        let idx = ((lats.len() as f32 - 1.0) * pct).round() as usize;
        lats[idx.min(lats.len() - 1)]
    };

    // One brute-force reference latency — keeps the cost proportional to
    // whatever the agent is willing to pay per trial.
    let brute_latency_us = if let Some(qv) = query_vectors.first() {
        let t0 = std::time::Instant::now();
        let _ = harness::brute_force_top_k(qv, &embeddings, harness_set.k);
        t0.elapsed().as_micros() as f32
    } else {
        0.0
    };

    let dims = embeddings.first().map(|e| e.vector.len()).unwrap_or(0);
    let memory_bytes =
        (embeddings.len() * dims * std::mem::size_of::<f32>() + embeddings.len() * 128) as u64;

    let trial_record = trial::Trial {
        id: trial_id.clone(),
        index_name: req.index_name.clone(),
        eval_set: req.harness.clone(),
        config: req.config.clone(),
        metrics: trial::TrialMetrics {
            build_time_secs: build_stats.build_time_secs,
            search_latency_p50_us: p(0.50),
            search_latency_p95_us: p(0.95),
            search_latency_p99_us: p(0.99),
            recall_at_k: mean_recall,
            memory_bytes,
            vectors: build_stats.vectors,
            eval_queries: harness_set.queries.len(),
            brute_force_latency_us: brute_latency_us,
        },
        created_at: chrono::Utc::now(),
        note: req.note,
    };

    state
        .trial_journal
        .append(&trial_record)
        .await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("journal: {e}")))?;

    state.hnsw_store.drop(&hnsw_slot).await;

    Ok(Json(trial_record))
}

async fn list_trials(
    State(state): State<VectorState>,
    Path(index_name): Path<String>,
) -> impl IntoResponse {
    match state.trial_journal.list(&index_name).await {
        Ok(trials) => Ok(Json(trials)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

#[derive(Deserialize)]
struct BestTrialQuery {
    #[serde(default = "default_metric")]
    metric: String,
}

fn default_metric() -> String {
    "pareto".to_string()
}

async fn best_trial(
    State(state): State<VectorState>,
    Path(index_name): Path<String>,
    Query(q): Query<BestTrialQuery>,
) -> impl IntoResponse {
    match state.trial_journal.best(&index_name, &q.metric).await {
        Ok(Some(t)) => Ok(Json(t)),
        Ok(None) => Err((StatusCode::NOT_FOUND, "no trials yet".to_string())),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

// --- Harness management ---

async fn list_evals(State(state): State<VectorState>) -> impl IntoResponse {
    Json(state.harness_store.list_all().await)
}

async fn get_eval(
    State(state): State<VectorState>,
    Path(name): Path<String>,
) -> impl IntoResponse {
    match state.harness_store.get_any(&name).await {
        Ok(e) => Ok(Json(e)),
        Err(err) => Err((StatusCode::NOT_FOUND, err)),
    }
}

async fn put_eval(
    State(state): State<VectorState>,
    Path(name): Path<String>,
    Json(mut harness_set): Json<harness::EvalSet>,
) -> impl IntoResponse {
    harness_set.name = name;
    harness_set.ground_truth_built = harness_set
        .queries
        .iter()
        .all(|q| q.ground_truth.is_some());
    match state.harness_store.save(&harness_set).await {
        Ok(()) => Ok(Json(harness_set)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

#[derive(Deserialize)]
struct AutogenRequest {
    index_name: String,
    #[serde(default = "default_sample_count")]
    sample_count: usize,
    #[serde(default = "default_k")]
    k: usize,
}

fn default_sample_count() -> usize { 100 }
fn default_k() -> usize { 10 }

async fn autogen_eval(
    State(state): State<VectorState>,
    Path(name): Path<String>,
    Json(req): Json<AutogenRequest>,
) -> Result<Json<harness::EvalSet>, (StatusCode, String)> {
    let embeddings = state
        .embedding_cache
        .get_or_load(&req.index_name)
        .await
        .map_err(|e| (StatusCode::NOT_FOUND, format!("load embeddings: {e}")))?;

    let mut harness_set = harness::synthetic_from_chunks(
        &name,
        &req.index_name,
        &embeddings,
        req.sample_count,
        req.k,
    );

    harness::compute_ground_truth(&mut harness_set, &embeddings, &state.ai_client)
        .await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("ground truth: {e}")))?;

    state.harness_store
        .save(&harness_set)
        .await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("save: {e}")))?;

    Ok(Json(harness_set))
}

// --- Embedding cache management ---

async fn cache_stats(State(state): State<VectorState>) -> impl IntoResponse {
    Json(state.embedding_cache.stats().await)
}

async fn cache_evict(
    State(state): State<VectorState>,
    Path(index_name): Path<String>,
) -> impl IntoResponse {
    let ok = state.embedding_cache.evict(&index_name).await;
    Json(serde_json::json!({ "evicted": ok, "index_name": index_name }))
}

// --- Phase C: embedding refresh ---
//
// Decouples "new row data arrived" from "re-embed everything." Ingest marks
// a dataset's embeddings stale (see catalogd::registry::mark_embeddings_stale);
// `/vectors/refresh/{dataset}` diffs existing embeddings against current
// rows, embeds only the new ones, appends to the index, and clears the
// stale flag.

async fn refresh_dataset(
    State(state): State<VectorState>,
    Path(dataset_name): Path<String>,
    Json(req): Json<refresh::RefreshRequest>,
) -> Result<Json<refresh::RefreshResult>, (StatusCode, String)> {
    tracing::info!(
        "refresh requested for dataset '{}' -> index '{}'",
        dataset_name, req.index_name,
    );
    match refresh::refresh_index(
        &dataset_name,
        &req,
        &state.store,
        &state.catalog,
        &state.ai_client,
        &state.embedding_cache,
        &state.index_registry,
    )
    .await
    {
        Ok(result) => Ok(Json(result)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

#[derive(Serialize)]
struct StaleEntry {
    dataset_name: String,
    last_embedded_at: Option<String>,
    stale_since: String,
    refresh_policy: Option<shared::types::RefreshPolicy>,
}

async fn list_stale(State(state): State<VectorState>) -> impl IntoResponse {
    let datasets = state.catalog.stale_datasets().await;
    let entries: Vec<StaleEntry> = datasets
        .into_iter()
        .map(|d| StaleEntry {
            dataset_name: d.name,
            last_embedded_at: d.last_embedded_at.map(|t| t.to_rfc3339()),
            stale_since: d
                .embedding_stale_since
                .map(|t| t.to_rfc3339())
                .unwrap_or_default(),
            refresh_policy: d.embedding_refresh_policy,
        })
        .collect();
    Json(entries)
}

// --- Phase 17: Model profile activation + scoped search ---

#[derive(Serialize)]
struct ActivateReport {
    profile_id: String,
    ollama_name: String,
    indexes_warmed: Vec<WarmedIndex>,
    failures: Vec<String>,
    total_vectors: usize,
    duration_secs: f32,
    /// Phase C: did we successfully preload the Ollama model?
    model_preloaded: bool,
    /// Phase C: which profile previously held the GPU slot, if any.
    /// Useful for observability of the swap.
    previous_profile: Option<String>,
}

#[derive(Serialize)]
struct WarmedIndex {
    index_name: String,
    source: String,
    vectors: usize,
    hnsw_build_secs: f32,
}

/// Warm this profile's indexes. For every bound dataset, find the
/// matching vector index (any index whose `source` equals the dataset
/// or view name), load its embeddings into EmbeddingCache, build HNSW
/// with the profile's config. Next `/profile/{id}/search` call is then
/// <1ms cold.
///
/// Failures on individual indexes don't stop the activation — they get
/// reported in the response. This matches the "substrate keeps working"
/// philosophy from ADR-017: one bad binding shouldn't take down the
/// whole profile.
async fn activate_profile(
    State(state): State<VectorState>,
    Path(profile_id): Path<String>,
) -> impl IntoResponse {
    let t0 = std::time::Instant::now();

    let profile = match state.catalog.get_profile(&profile_id).await {
        Some(p) => p,
        None => return Err((StatusCode::NOT_FOUND, format!("profile not found: {profile_id}"))),
    };

    let mut warmed = Vec::new();
    let mut failures = Vec::new();
    let mut total_vectors = 0usize;

    // Phase 17 / C: VRAM-aware swap. If another profile currently holds
    // the GPU and uses a DIFFERENT Ollama model than the one being
    // activated, unload it first (keep_alive=0). Same-model activations
    // skip the unload — no point churning a model that's already loaded.
    let previous_slot = {
        let guard = state.active_profile.read().await;
        guard.clone()
    };
    if let Some(prev) = &previous_slot {
        if prev.ollama_name != profile.ollama_name {
            match state.ai_client.unload_model(&prev.ollama_name).await {
                Ok(_) => tracing::info!(
                    "profile swap: unloaded '{}' ({} -> {})",
                    prev.ollama_name, prev.profile_id, profile.id,
                ),
                Err(e) => failures.push(format!(
                    "unload previous model '{}': {e}", prev.ollama_name,
                )),
            }
        }
    }

    // Federation layer 2: if this profile declares its own bucket and
    // that bucket isn't registered yet, auto-provision it under the
    // configured profile_root. This is the moment a "dormant" profile
    // becomes live — its bucket exists and is readable/writable.
    if let Some(bucket_name) = profile.bucket.clone() {
        if !state.bucket_registry.contains(&bucket_name) {
            let root = format!(
                "{}/{}",
                state.bucket_registry.profile_root().trim_end_matches('/'),
                bucket_name.replace(':', "_"),
            );
            let bc = shared::config::BucketConfig {
                name: bucket_name.clone(),
                backend: "local".to_string(),
                root: Some(root.clone()),
                bucket: None,
                region: None,
                endpoint: None,
                secret_ref: None,
            };
            match state.bucket_registry.add_bucket(bc).await {
                Ok(info) => {
                    tracing::info!(
                        "profile '{}' activated bucket '{}' (root={}, reachable={})",
                        profile.id, bucket_name, root, info.reachable,
                    );
                }
                Err(e) => {
                    failures.push(format!(
                        "auto-provision bucket '{}': {}", bucket_name, e,
                    ));
                }
            }
        }
    }

    let all_indexes = state.index_registry.list(None, None).await;
    let use_lance = profile.vector_backend == shared::types::VectorBackend::Lance;

    for binding in &profile.bound_datasets {
        let matched: Vec<_> = all_indexes
            .iter()
            .filter(|m| &m.source == binding)
            .collect();
        if matched.is_empty() {
            failures.push(format!(
                "no vector index found for binding '{}'", binding,
            ));
            continue;
        }
        for meta in matched {
            if use_lance {
                // --- Lance activation path ---
                // Ensure a Lance dataset exists for this index. If it
                // doesn't, auto-migrate from the Parquet blob. Then
                // ensure an IVF_PQ index is built.
                let bucket = meta.bucket.clone();
                let lance_store = match state.lance.store_for_new(&meta.index_name, &bucket).await {
                    Ok(s) => s,
                    Err(e) => {
                        failures.push(format!("{}: lance store init: {e}", meta.index_name));
                        continue;
                    }
                };
                let count = lance_store.count().await.unwrap_or(0);
                if count == 0 {
                    // Auto-migrate from existing Parquet.
                    let pq_store = match state.bucket_registry.get(&bucket) {
                        Ok(s) => s,
                        Err(e) => { failures.push(format!("{}: bucket: {e}", meta.index_name)); continue; }
                    };
                    match storaged::ops::get(&pq_store, &meta.storage_key).await {
                        Ok(bytes) => {
                            let build_t = std::time::Instant::now();
                            match lance_store.migrate_from_parquet_bytes(&bytes).await {
                                Ok(ms) => {
                                    total_vectors += ms.rows_written;
                                    tracing::info!(
                                        "lance auto-migrate '{}': {} rows in {:.2}s",
                                        meta.index_name, ms.rows_written, ms.duration_secs,
                                    );
                                    warmed.push(WarmedIndex {
                                        index_name: meta.index_name.clone(),
                                        source: meta.source.clone(),
                                        vectors: ms.rows_written,
                                        hnsw_build_secs: build_t.elapsed().as_secs_f32(),
                                    });
                                }
                                Err(e) => failures.push(format!("{}: lance migrate: {e}", meta.index_name)),
                            }
                        }
                        Err(e) => failures.push(format!("{}: read parquet: {e}", meta.index_name)),
                    }
                } else {
                    total_vectors += count;
                    warmed.push(WarmedIndex {
                        index_name: meta.index_name.clone(),
                        source: meta.source.clone(),
                        vectors: count,
                        hnsw_build_secs: 0.0,
                    });
                }
                // Ensure IVF_PQ vector index exists.
                if !lance_store.has_vector_index().await.unwrap_or(false) {
                    match lance_store.build_index(316, 8, 48).await {
                        Ok(ix) => tracing::info!(
                            "lance auto-index '{}': IVF_PQ built in {:.1}s",
                            meta.index_name, ix.build_time_secs,
                        ),
                        Err(e) => failures.push(format!("{}: lance IVF_PQ build: {e}", meta.index_name)),
                    }
                }
                // Ensure scalar btree on doc_id for O(log N) random fetch.
                if !lance_store.has_scalar_index("doc_id").await.unwrap_or(false) {
                    match lance_store.build_scalar_index("doc_id").await {
                        Ok(ix) => tracing::info!(
                            "lance auto-index '{}': doc_id btree built in {:.2}s",
                            meta.index_name, ix.build_time_secs,
                        ),
                        Err(e) => failures.push(format!("{}: lance doc_id btree: {e}", meta.index_name)),
                    }
                }
            } else {
                // --- Parquet + HNSW activation path (existing) ---
                let embeddings = match state.embedding_cache.get_or_load(&meta.index_name).await {
                    Ok(arc) => arc,
                    Err(e) => {
                        failures.push(format!("{}: load failed: {}", meta.index_name, e));
                        continue;
                    }
                };
                total_vectors += embeddings.len();

                let profile_default = trial::HnswConfig {
                    ef_construction: profile.hnsw_config.ef_construction,
                    ef_search: profile.hnsw_config.ef_search,
                    seed: profile.hnsw_config.seed,
                };
                let cfg = state
                    .promotion_registry
                    .config_or(&meta.index_name, profile_default)
                    .await;
                let build_t = std::time::Instant::now();
                match state
                    .hnsw_store
                    .build_index_with_config(&meta.index_name, (*embeddings).clone(), &cfg)
                    .await
                {
                    Ok(_) => {
                        warmed.push(WarmedIndex {
                            index_name: meta.index_name.clone(),
                            source: meta.source.clone(),
                            vectors: embeddings.len(),
                            hnsw_build_secs: build_t.elapsed().as_secs_f32(),
                        });
                    }
                    Err(e) => {
                        failures.push(format!("{}: HNSW build failed: {}", meta.index_name, e));
                    }
                }
            }
        }
    }

    // Preload the new profile's Ollama model proactively. Same-model
    // re-activations are cheap (Ollama no-ops if already loaded).
    let mut model_preloaded = false;
    match state.ai_client.preload_model(&profile.ollama_name).await {
        Ok(_) => {
            model_preloaded = true;
            tracing::info!("profile '{}' preloaded ollama model '{}'",
                profile.id, profile.ollama_name);
        }
        Err(e) => failures.push(format!(
            "preload ollama model '{}': {e}", profile.ollama_name,
        )),
    }

    // Take the GPU slot.
    {
        let mut guard = state.active_profile.write().await;
        *guard = Some(ActiveProfileSlot {
            profile_id: profile.id.clone(),
            ollama_name: profile.ollama_name.clone(),
            activated_at: chrono::Utc::now(),
        });
    }

    Ok(Json(ActivateReport {
        profile_id: profile.id,
        ollama_name: profile.ollama_name,
        indexes_warmed: warmed,
        failures,
        total_vectors,
        duration_secs: t0.elapsed().as_secs_f32(),
        model_preloaded,
        previous_profile: previous_slot.map(|s| s.profile_id),
    }))
}

/// Unload this profile's model and clear the active slot. No-op if the
/// caller isn't the currently-active profile.
async fn deactivate_profile(
    State(state): State<VectorState>,
    Path(profile_id): Path<String>,
) -> impl IntoResponse {
    let profile = match state.catalog.get_profile(&profile_id).await {
        Some(p) => p,
        None => return Err((StatusCode::NOT_FOUND, format!("profile not found: {profile_id}"))),
    };

    let was_active = {
        let mut guard = state.active_profile.write().await;
        match guard.as_ref() {
            Some(s) if s.profile_id == profile_id => {
                let prev = s.clone();
                *guard = None;
                Some(prev)
            }
            _ => None,
        }
    };

    // Regardless of whether it held the slot, we can still try to unload —
    // the operator's intent is "get this model out of VRAM."
    let unload_result = state.ai_client.unload_model(&profile.ollama_name).await;

    Ok(Json(serde_json::json!({
        "profile_id": profile.id,
        "ollama_name": profile.ollama_name,
        "was_active": was_active.is_some(),
        "unloaded": unload_result.is_ok(),
        "unload_error": unload_result.err(),
    })))
}

async fn get_active_profile(State(state): State<VectorState>) -> impl IntoResponse {
    let slot = state.active_profile.read().await.clone();
    Json(slot)
}

#[derive(Deserialize)]
struct ProfileSearchRequest {
    index_name: String,
    query: String,
    top_k: Option<usize>,
}

/// Search scoped to a profile — refuses if the requested index's source
/// isn't in the profile's bound_datasets. Reuses the existing HNSW
/// search path when the index is warm; falls back to brute-force cosine
/// if it's not (handled by the existing search code path).
async fn profile_scoped_search(
    State(state): State<VectorState>,
    Path(profile_id): Path<String>,
    Json(req): Json<ProfileSearchRequest>,
) -> impl IntoResponse {
    let profile = match state.catalog.get_profile(&profile_id).await {
        Some(p) => p,
        None => return Err((StatusCode::NOT_FOUND, format!("profile not found: {profile_id}"))),
    };

    // Verify the index is in scope for this profile.
    let index_meta = match state.index_registry.get(&req.index_name).await {
        Some(m) => m,
        None => return Err((StatusCode::NOT_FOUND, format!("index not found: {}", req.index_name))),
    };
    if !profile.bound_datasets.contains(&index_meta.source) {
        return Err((
            StatusCode::FORBIDDEN,
            format!(
                "profile '{}' is not bound to '{}' — allowed bindings: {:?}",
                profile.id, index_meta.source, profile.bound_datasets,
            ),
        ));
    }

    let top_k = req.top_k.unwrap_or(5);
    let use_lance = profile.vector_backend == shared::types::VectorBackend::Lance;

    // Embed the query.
    let embed_resp = state
        .ai_client
        .embed(EmbedRequest { texts: vec![req.query.clone()], model: None })
        .await
        .map_err(|e| (StatusCode::BAD_GATEWAY, format!("embed: {e}")))?;
    if embed_resp.embeddings.is_empty() {
        return Err((StatusCode::BAD_GATEWAY, "no embedding returned".into()));
    }
    let query_vec: Vec<f32> = embed_resp.embeddings[0].iter().map(|&x| x as f32).collect();

    // ADR-019 hybrid: route to Lance or Parquet+HNSW based on the
    // profile's declared backend. Callers don't need to know which
    // storage tier they're hitting — the profile abstracts it.
    if use_lance {
        let lance_store = state.lance.store_for(&req.index_name).await
            .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
        let t0 = std::time::Instant::now();
        match lance_store.search(
            &query_vec,
            top_k,
            Some(LANCE_DEFAULT_NPROBES),
            Some(LANCE_DEFAULT_REFINE_FACTOR),
        ).await {
            Ok(hits) => Ok(Json(serde_json::json!({
                "profile": profile.id,
                "source": index_meta.source,
                "method": "lance_ivf_pq",
                "latency_us": t0.elapsed().as_micros() as u64,
                "results": hits,
            }))),
            Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
        }
    } else if state.hnsw_store.has_index(&req.index_name).await {
        match state.hnsw_store.search(&req.index_name, &query_vec, top_k).await {
            Ok(hits) => Ok(Json(serde_json::json!({
                "profile": profile.id,
                "source": index_meta.source,
                "method": "hnsw",
                "results": hits,
            }))),
            Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
        }
    } else {
        let embeddings = state
            .embedding_cache
            .get_or_load(&req.index_name)
            .await
            .map_err(|e| (StatusCode::NOT_FOUND, format!("embeddings: {e}")))?;
        let results = search::search(&query_vec, &embeddings, top_k);
        Ok(Json(serde_json::json!({
            "profile": profile.id,
            "source": index_meta.source,
            "method": "brute_force",
            "results": results,
        })))
    }
}

// --- Phase 16: Promotion + autotune ---

#[derive(Deserialize)]
struct PromoteQuery {
    #[serde(default)]
    promoted_by: String,
    #[serde(default)]
    note: Option<String>,
}

async fn promote_trial(
    State(state): State<VectorState>,
    Path((index_name, trial_id)): Path<(String, String)>,
    Query(q): Query<PromoteQuery>,
) -> impl IntoResponse {
    // Pull the trial from the journal to get its config.
    let trials = state
        .trial_journal
        .list(&index_name)
        .await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?;
    let trial = trials
        .iter()
        .find(|t| t.id == trial_id)
        .ok_or_else(|| (StatusCode::NOT_FOUND, format!("trial not found: {trial_id}")))?;

    let entry = promotion::PromotionEntry {
        config: trial.config.clone(),
        trial_id: trial.id.clone(),
        promoted_at: chrono::Utc::now(),
        promoted_by: q.promoted_by,
        note: q.note,
    };
    match state.promotion_registry.promote(&index_name, entry).await {
        Ok(file) => Ok(Json(file)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

async fn rollback_promotion(
    State(state): State<VectorState>,
    Path(index_name): Path<String>,
) -> impl IntoResponse {
    match state.promotion_registry.rollback(&index_name).await {
        Ok(file) => Ok(Json(file)),
        Err(e) => Err((StatusCode::NOT_FOUND, e)),
    }
}

async fn get_promoted(
    State(state): State<VectorState>,
    Path(index_name): Path<String>,
) -> impl IntoResponse {
    match state.promotion_registry.load(&index_name).await {
        Ok(file) => Ok(Json(file)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

async fn run_autotune_endpoint(
    State(state): State<VectorState>,
    Json(req): Json<autotune::AutotuneRequest>,
) -> impl IntoResponse {
    match autotune::run_autotune(
        req,
        &state.store,
        &state.catalog,
        &state.ai_client,
        &state.embedding_cache,
        &state.hnsw_store,
        &state.index_registry,
        &state.trial_journal,
        &state.promotion_registry,
        &state.harness_store,
        &state.job_tracker,
    ).await {
        Ok(result) => Ok(Json(result)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

// --- Phase 16.2: autotune agent endpoints ---

async fn agent_status(State(state): State<VectorState>) -> impl IntoResponse {
    Json(state.agent_handle.status().await)
}

async fn agent_stop(State(state): State<VectorState>) -> impl IntoResponse {
    let stopped = state.agent_handle.stop().await;
    Json(serde_json::json!({ "stopped": stopped }))
}

async fn agent_enqueue(
    State(state): State<VectorState>,
    Path(index_name): Path<String>,
) -> impl IntoResponse {
    let event = agent::TriggerEvent::manual(index_name);
    match state.agent_handle.enqueue(event).await {
        Ok(()) => Ok(Json(serde_json::json!({ "enqueued": true }))),
        Err(e) => Err((StatusCode::SERVICE_UNAVAILABLE, e)),
    }
}

// --- ADR-019: Lance hybrid backend HTTP surface ---
//
// Lance routes operate on the same `index_name` as the Parquet/HNSW path,
// but materialize the data as a Lance dataset on disk under
// `{bucket_root}/lance/{index_name}/`. The two backends are independent:
// you can have an index in both formats simultaneously. `IndexMeta.vector_backend`
// records which one is canonical for that index.

#[derive(Deserialize)]
struct LanceMigrateRequest {
    /// Optional bucket override. Defaults to whatever the existing
    /// IndexMeta says, or "primary" for indexes that don't exist yet.
    #[serde(default)]
    bucket: Option<String>,
}

/// Read the existing Parquet vector file for `index_name` from object
/// storage, hand the bytes to vectord-lance, return migration stats.
/// The original Parquet file is left intact — both backends coexist
/// after migration.
async fn lance_migrate(
    State(state): State<VectorState>,
    Path(index_name): Path<String>,
    Json(req): Json<LanceMigrateRequest>,
) -> impl IntoResponse {
    let meta = state.index_registry.get(&index_name).await
        .ok_or((StatusCode::NOT_FOUND, format!("index not found: {index_name}")))?;
    let bucket = req.bucket.unwrap_or(meta.bucket.clone());

    // Pull the Parquet bytes via storaged::ops — same path as the
    // existing embedding loader uses.
    let store = state.bucket_registry.get(&bucket)
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    let bytes = storaged::ops::get(&store, &meta.storage_key).await
        .map_err(|e| (StatusCode::NOT_FOUND, format!("read parquet: {e}")))?;

    let lance_store = state.lance.store_for_new(&index_name, &bucket).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;

    let stats = lance_store.migrate_from_parquet_bytes(&bytes).await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?;

    tracing::info!(
        "lance migrate '{}': {} rows, {}d, {} bytes on disk, {:.2}s",
        index_name, stats.rows_written, stats.dimensions,
        stats.disk_bytes, stats.duration_secs,
    );

    Ok::<_, (StatusCode, String)>(Json(serde_json::json!({
        "index_name": index_name,
        "bucket": bucket,
        "lance_path": lance_store.path(),
        "stats": stats,
    })))
}

#[derive(Deserialize)]
struct LanceIndexRequest {
    #[serde(default = "default_partitions")]
    num_partitions: u32,
    #[serde(default = "default_bits")]
    num_bits: u32,
    #[serde(default = "default_subvectors")]
    num_sub_vectors: u32,
}

fn default_partitions() -> u32 { 316 }   // ≈√100K — sane for the reference dataset
fn default_bits() -> u32 { 8 }
fn default_subvectors() -> u32 { 48 }    // 768/48 = 16 dims per subvector

/// Build the IVF_PQ index on the Lance dataset.
async fn lance_build_index(
    State(state): State<VectorState>,
    Path(index_name): Path<String>,
    Json(req): Json<LanceIndexRequest>,
) -> impl IntoResponse {
    let lance_store = state.lance.store_for(&index_name).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    match lance_store.build_index(req.num_partitions, req.num_bits, req.num_sub_vectors).await {
        Ok(stats) => Ok(Json(stats)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

#[derive(Deserialize)]
struct LanceSearchRequest {
    /// Plain text query — embedded server-side for symmetry with the
    /// existing /vectors/search path.
    query: String,
    #[serde(default = "default_top_k")]
    top_k: usize,
    /// IVF partitions to probe. `None` uses Lance's built-in default of
    /// 1, which caps recall well below the index's real capability.
    /// Recommended: 5–10% of num_partitions (≈20 for a 316-partition
    /// index). Omitting it here picks the server-side default.
    #[serde(default)]
    nprobes: Option<usize>,
    /// Refine factor — re-rank `top_k * factor` PQ-approximate candidates
    /// with exact distances before returning `top_k`. Recovers recall
    /// lost to product quantization.
    #[serde(default)]
    refine_factor: Option<u32>,
}

/// Server-side defaults when the caller doesn't pin nprobes / refine
/// themselves. Tuned for the ~100K × 768d reference workload; see
/// docs/ADR-019-vector-storage.md for the recall / latency trade-off.
const LANCE_DEFAULT_NPROBES: usize = 20;
const LANCE_DEFAULT_REFINE_FACTOR: u32 = 5;

fn default_top_k() -> usize { 5 }

/// Vector search against a Lance dataset. Embeds the query text via the
/// sidecar then calls Lance's nearest-neighbor scanner.
async fn lance_search(
    State(state): State<VectorState>,
    Path(index_name): Path<String>,
    Json(req): Json<LanceSearchRequest>,
) -> impl IntoResponse {
    let embed_resp = state.ai_client
        .embed(EmbedRequest { texts: vec![req.query.clone()], model: None })
        .await
        .map_err(|e| (StatusCode::BAD_GATEWAY, format!("embed: {e}")))?;
    if embed_resp.embeddings.is_empty() {
        return Err((StatusCode::BAD_GATEWAY, "no embedding returned".into()));
    }
    let qv: Vec<f32> = embed_resp.embeddings[0].iter().map(|&x| x as f32).collect();

    let lance_store = state.lance.store_for(&index_name).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;

    let t0 = std::time::Instant::now();
    let nprobes = req.nprobes.or(Some(LANCE_DEFAULT_NPROBES));
    let refine = req.refine_factor.or(Some(LANCE_DEFAULT_REFINE_FACTOR));
    let hits = lance_store.search(&qv, req.top_k, nprobes, refine).await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?;

    Ok(Json(serde_json::json!({
        "index_name": index_name,
        "query": req.query,
        "method": "lance_ivf_pq",
        "latency_us": t0.elapsed().as_micros() as u64,
        "results": hits,
    })))
}

/// Random-access fetch by doc_id — the O(1) lookup that's basically
/// impossible in our Parquet path without scanning the whole file.
async fn lance_get_doc(
    State(state): State<VectorState>,
    Path((index_name, doc_id)): Path<(String, String)>,
) -> impl IntoResponse {
    let lance_store = state.lance.store_for(&index_name).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    let t0 = std::time::Instant::now();
    match lance_store.get_by_doc_id(&doc_id).await {
        Ok(Some(row)) => Ok(Json(serde_json::json!({
            "index_name": index_name,
            "doc_id": doc_id,
            "latency_us": t0.elapsed().as_micros() as u64,
            "row": row,
        }))),
        Ok(None) => Err((StatusCode::NOT_FOUND, format!("doc_id not found: {doc_id}"))),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

#[derive(Deserialize)]
struct LanceAppendRequest {
    /// Optional source tag — set on every appended row.
    #[serde(default)]
    source: Option<String>,
    rows: Vec<LanceAppendRow>,
}

#[derive(Deserialize)]
struct LanceAppendRow {
    doc_id: String,
    #[serde(default)]
    chunk_idx: Option<i32>,
    chunk_text: String,
    /// Pre-computed embedding. Caller is responsible for ensuring it
    /// matches the dataset's dimensions and embedding model.
    vector: Vec<f32>,
}

async fn lance_append(
    State(state): State<VectorState>,
    Path(index_name): Path<String>,
    Json(req): Json<LanceAppendRequest>,
) -> impl IntoResponse {
    if req.rows.is_empty() {
        return Err((StatusCode::BAD_REQUEST, "rows array is empty".into()));
    }
    let lance_store = state.lance.store_for(&index_name).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;

    let mut doc_ids = Vec::with_capacity(req.rows.len());
    let mut chunk_idxs = Vec::with_capacity(req.rows.len());
    let mut chunk_texts = Vec::with_capacity(req.rows.len());
    let mut vectors = Vec::with_capacity(req.rows.len());
    for r in req.rows {
        doc_ids.push(r.doc_id);
        chunk_idxs.push(r.chunk_idx.unwrap_or(0));
        chunk_texts.push(r.chunk_text);
        vectors.push(r.vector);
    }

    match lance_store.append(req.source, doc_ids, chunk_idxs, chunk_texts, vectors).await {
        Ok(stats) => Ok(Json(stats)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

async fn lance_stats(
    State(state): State<VectorState>,
    Path(index_name): Path<String>,
) -> impl IntoResponse {
    let lance_store = state.lance.store_for(&index_name).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    match lance_store.stats().await {
        Ok(s) => Ok(Json(s)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

/// Run an existing harness against Lance IVF_PQ and measure recall@k.
/// Uses the same ground truth computed by brute-force cosine (the HNSW
/// eval path). This closes ADR-019's explicit gap: "IVF_PQ recall not
/// measured."
#[derive(Deserialize)]
struct LanceRecallRequest {
    harness: String,
    #[serde(default = "default_top_k")]
    top_k: usize,
    /// Override server defaults so operators can sweep nprobes /
    /// refine_factor to chart the recall-vs-latency curve.
    #[serde(default)]
    nprobes: Option<usize>,
    #[serde(default)]
    refine_factor: Option<u32>,
}

#[derive(serde::Serialize)]
struct LanceRecallResult {
    index_name: String,
    harness: String,
    queries: usize,
    top_k: usize,
    mean_recall: f32,
    per_query: Vec<LanceRecallQuery>,
    latency_p50_us: f32,
    latency_p95_us: f32,
    total_duration_secs: f32,
}

#[derive(serde::Serialize)]
struct LanceRecallQuery {
    query_id: String,
    recall: f32,
    latency_us: f32,
    hits_returned: usize,
}

// --- Phase 19: playbook memory endpoints ---

/// Extract (name, city, state) from a chunk formatted like
/// "{Name} — {Role} in {City}, {State}. Skills: …".
/// Returns None if the chunk doesn't match the shape; callers simply
/// skip the boost for that hit.
/// Extract role from an SQL filter matching `role = 'Welder'` style.
/// Case-insensitive on the column name. Quoted value; quotes not
/// included in returned string.
fn extract_target_role(sql_filter: &str) -> Option<String> {
    grab_eq_value(sql_filter, "role")
}

/// Shared equality-value extractor for (city, state, role) lookups.
fn grab_eq_value(src: &str, col: &str) -> Option<String> {
    let lower = src.to_ascii_lowercase();
    let col_lower = col.to_ascii_lowercase();
    let mut search_from = 0usize;
    while let Some(off) = lower[search_from..].find(&col_lower) {
        let pos = search_from + off;
        let prior_ok = pos == 0
            || !lower.as_bytes()[pos - 1].is_ascii_alphanumeric()
               && lower.as_bytes()[pos - 1] != b'_';
        let after = pos + col_lower.len();
        if !prior_ok || after >= src.len() {
            search_from = pos + col_lower.len();
            continue;
        }
        let mut i = after;
        while i < src.len() && src.as_bytes()[i] == b' ' { i += 1; }
        if i >= src.len() || src.as_bytes()[i] != b'=' { search_from = pos + col_lower.len(); continue; }
        i += 1;
        while i < src.len() && src.as_bytes()[i] == b' ' { i += 1; }
        if i >= src.len() || src.as_bytes()[i] != b'\'' { search_from = pos + col_lower.len(); continue; }
        i += 1;
        let start = i;
        while i < src.len() && src.as_bytes()[i] != b'\'' { i += 1; }
        if i > start {
            return Some(src[start..i].to_string());
        }
        search_from = pos + col_lower.len();
    }
    None
}

/// Pull (city, state) out of a SQL filter that uses
/// `city = 'Toledo' AND state = 'OH'` style equality. Returns None if
/// either is missing — the caller keeps the original global boost map
/// behavior (no geo narrowing). Case-insensitive on the column name
/// so `CITY=` or `City =` also work.
fn extract_target_geo(sql_filter: &str) -> Option<(String, String)> {
    let city = grab_eq_value(sql_filter, "city")?;
    let state = grab_eq_value(sql_filter, "state")?;
    Some((city, state))
}

fn parse_worker_chunk(chunk: &str) -> Option<(String, String, String)> {
    // "Name — Role in City, ST. …" → split on "—" then " in " then ","
    let (name_part, rest) = chunk.split_once('—')?;
    let rest = rest.trim();
    let (_role, loc_part) = rest.split_once(" in ")?;
    let loc_part = loc_part.trim();
    let (city, state_plus) = loc_part.split_once(',')?;
    let state: String = state_plus.trim()
        .chars()
        .take_while(|c| c.is_ascii_alphabetic())
        .collect();
    let name = name_part.trim().to_string();
    let city = city.trim().to_string();
    if name.is_empty() || city.is_empty() || state.is_empty() {
        return None;
    }
    Some((name, city, state))
}

#[derive(Deserialize)]
struct SeedPlaybookRequest {
    /// One playbook with {operation, approach, context, endorsed_names}.
    /// City + state are parsed from the operation text.
    operation: String,
    #[serde(default)]
    approach: String,
    #[serde(default)]
    context: String,
    endorsed_names: Vec<String>,
    /// Append to the existing memory rather than replacing. Default true —
    /// seeding is a bootstrap/demo tool, not a rebuild substitute.
    #[serde(default = "default_true")]
    append: bool,
    /// Phase 25 — optional schema_fingerprint captured at seed time.
    /// When the underlying dataset's schema changes, any entry whose
    /// fingerprint doesn't match the new one is auto-retired via
    /// retire_on_schema_drift. Caller-provided so the producer (the
    /// scenario driver, the orchestrator) can pass the live fingerprint
    /// without the gateway needing a second catalogd round trip.
    #[serde(default)]
    schema_fingerprint: Option<String>,
    /// Phase 25 — optional hard expiry. RFC3339 timestamp. After this
    /// moment the entry is skipped during boost computation (not
    /// retired, just inactive). Useful for seasonal/temp contracts.
    #[serde(default)]
    valid_until: Option<String>,
}

/// Bootstrap / test-only: inject a playbook entry directly into
/// `playbook_memory` without going through `successful_playbooks`. Useful
/// when the source dataset has stale or phantom entries (as the initial
/// staffing seed did — names that don't correspond to real workers), and
/// you want to demonstrate the feedback loop with a known-good fixture.
///
/// Production path is always `/rebuild` — this endpoint is for operators
/// who need to prime the memory before real playbooks accumulate.
async fn seed_playbook_memory(
    State(state): State<VectorState>,
    Json(req): Json<SeedPlaybookRequest>,
) -> impl IntoResponse {
    // Embed the entry through the same text shape `rebuild` uses so
    // similarity math is comparable across seed + real entries.
    let tmp_entry = playbook_memory::PlaybookEntry {
        playbook_id: String::new(),
        operation: req.operation.clone(),
        approach: req.approach.clone(),
        context: req.context.clone(),
        timestamp: chrono::Utc::now().to_rfc3339(),
        endorsed_names: req.endorsed_names.clone(),
        city: None, state: None, embedding: None,
        schema_fingerprint: None,
        valid_until: None,
        retired_at: None,
        retirement_reason: None,
        version: 1,
        parent_id: None,
        superseded_at: None,
        superseded_by: None,
    };
    let text = format!(
        "{} | {} | {} | fills: {}",
        tmp_entry.operation, tmp_entry.approach, tmp_entry.context,
        tmp_entry.endorsed_names.join(", "),
    );
    let resp = match state.ai_client.embed(EmbedRequest { texts: vec![text], model: None }).await {
        Ok(r) => r,
        Err(e) => return Err((StatusCode::BAD_GATEWAY, format!("embed seed: {e}"))),
    };
    if resp.embeddings.is_empty() {
        return Err((StatusCode::BAD_GATEWAY, "embed returned nothing".into()));
    }
    let emb: Vec<f32> = resp.embeddings[0].iter().map(|&x| x as f32).collect();

    // Parse city/state from the operation ("fill: Role xN in City, ST").
    // Parser lives in playbook_memory::rebuild — expose via a tiny helper
    // or inline the same logic here; duplicated briefly since this seed
    // path is stable but infrequently called.
    let (city, state_) = {
        let after_in = req.operation.split(" in ").nth(1).unwrap_or("");
        let mut parts = after_in.splitn(2, ',');
        let city = parts.next().map(|s| s.trim().to_string()).filter(|s| !s.is_empty());
        let state = parts.next().map(|s| s.trim().chars().take_while(|c| c.is_ascii_alphabetic()).collect::<String>()).filter(|s| !s.is_empty());
        (city, state)
    };
    if city.is_none() || state_.is_none() {
        return Err((StatusCode::BAD_REQUEST,
            "operation must match 'fill: Role xN in City, ST' shape".into()));
    }

    // Stable id: hash of timestamp + operation. Callers get the id back
    // so they can reference it in citations.
    let ts = chrono::Utc::now().to_rfc3339();
    use sha2::{Digest, Sha256};
    let mut h = Sha256::new();
    h.update(ts.as_bytes());
    h.update(b"|");
    h.update(req.operation.as_bytes());
    let bytes = h.finalize();
    let pid = format!("pb-seed-{}", bytes.iter().take(8).map(|b| format!("{b:02x}")).collect::<String>());

    let new_entry = playbook_memory::PlaybookEntry {
        playbook_id: pid.clone(),
        operation: req.operation,
        approach: req.approach,
        context: req.context,
        timestamp: ts,
        endorsed_names: req.endorsed_names,
        city, state: state_,
        embedding: Some(emb),
        // Phase 25 — seed request may carry a fingerprint; if not, we
        // default to None and the entry degrades to "no expiry signal"
        // (never auto-retired on drift, but manual retirement still
        // works). valid_until + retired_at start None.
        schema_fingerprint: req.schema_fingerprint.clone(),
        valid_until: req.valid_until.clone(),
        retired_at: None,
        retirement_reason: None,
        version: 1,
        parent_id: None,
        superseded_at: None,
        superseded_by: None,
    };

    // Phase 26 — when append=true (default), route through upsert so
    // same-day re-seeds of the same operation merge instead of
    // appending duplicates. When append=false, retain the old
    // replace-all semantics for callers that want a hard reset.
    if req.append {
        match state.playbook_memory.upsert_entry(new_entry).await {
            Ok(outcome) => {
                let entries_after = state.playbook_memory.entry_count().await;
                Ok(Json(serde_json::json!({
                    "outcome": outcome,
                    "entries_after": entries_after,
                })))
            }
            Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, format!("upsert: {e}"))),
        }
    } else {
        if let Err(e) = state.playbook_memory.set_entries(vec![new_entry]).await {
            return Err((StatusCode::INTERNAL_SERVER_ERROR, format!("persist: {e}")));
        }
        Ok(Json(serde_json::json!({
            "outcome": { "mode": "replaced", "playbook_id": pid },
            "entries_after": state.playbook_memory.entry_count().await,
        })))
    }
}

async fn rebuild_playbook_memory(
    State(state): State<VectorState>,
) -> impl IntoResponse {
    match playbook_memory::rebuild(
        &state.playbook_memory,
        &state.ai_client,
        &state.catalog,
        &state.bucket_registry,
    ).await {
        Ok(report) => Ok(Json(report)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

// Path 2 foundation — dump in-memory playbook_memory state to a fresh
// `successful_playbooks_live` dataset. Cheap to call (writes one parquet,
// updates one manifest), so /log can call it after every seed to keep the
// SQL-queryable surface honest without the destructive REPLACE bug that
// /ingest/file has.
async fn persist_playbook_memory_sql(
    State(state): State<VectorState>,
) -> impl IntoResponse {
    match playbook_memory::persist_to_sql(&state.playbook_memory, &state.catalog).await {
        Ok(report) => Ok(Json(report)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

#[derive(Deserialize)]
struct PatternsRequest {
    query: String,
    #[serde(default = "default_pattern_k")]
    top_k_playbooks: usize,
    /// Minimum frequency (0.0-1.0) for a trait to make the report.
    /// Default 0.4 — at least 40% of examined workers must share it.
    #[serde(default = "default_pattern_min_freq")]
    min_trait_frequency: f32,
}
fn default_pattern_k() -> usize { 10 }
fn default_pattern_min_freq() -> f32 { 0.4 }

// Path 2 — meta-index discovery surface. "What did past similar fills
// have in common that I didn't ask about?" — surfaces signals like
// recurring certifications, skill clusters, archetype tendencies.
async fn discover_playbook_patterns(
    State(state): State<VectorState>,
    Json(req): Json<PatternsRequest>,
) -> impl IntoResponse {
    match playbook_memory::discover_patterns(
        &state.playbook_memory,
        &state.ai_client,
        &state.catalog,
        &state.bucket_registry,
        &req.query,
        req.top_k_playbooks,
        req.min_trait_frequency,
    ).await {
        Ok(report) => Ok(Json(report)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

#[derive(Deserialize)]
struct MarkFailedRequest {
    /// Operation text, same shape as seed: "fill: Role xN in City, ST"
    operation: String,
    /// Names of workers who didn't deliver on the fill.
    failed_names: Vec<String>,
    /// Short reason (no-show, fired, unreliable). Stored verbatim.
    #[serde(default)]
    reason: String,
}

async fn mark_playbook_failed(
    State(state): State<VectorState>,
    Json(req): Json<MarkFailedRequest>,
) -> impl IntoResponse {
    // Parse city + state from the operation — mirrors seed's parser.
    let after_in = req.operation.split(" in ").nth(1).unwrap_or("");
    let mut parts = after_in.splitn(2, ',');
    let city = parts.next().map(|s| s.trim().to_string()).filter(|s| !s.is_empty());
    let state_ = parts.next().map(|s|
        s.trim().chars().take_while(|c| c.is_ascii_alphabetic()).collect::<String>()
    ).filter(|s| !s.is_empty());
    let (Some(city), Some(state_code)) = (city, state_) else {
        return Err((StatusCode::BAD_REQUEST,
            "operation must match 'fill: Role xN in City, ST' shape".into()));
    };

    let ts = chrono::Utc::now().to_rfc3339();
    let records: Vec<playbook_memory::FailureRecord> = req.failed_names.iter()
        .map(|n| playbook_memory::FailureRecord {
            city: city.clone(), state: state_code.clone(), name: n.clone(),
            reason: req.reason.clone(), timestamp: ts.clone(),
        })
        .collect();

    match state.playbook_memory.mark_failures(records).await {
        Ok(added) => Ok(Json(serde_json::json!({ "added": added, "city": city, "state": state_code }))),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

async fn playbook_memory_stats(
    State(state): State<VectorState>,
) -> impl IntoResponse {
    let entries = state.playbook_memory.snapshot().await;
    Json(serde_json::json!({
        "entries": entries.len(),
        "total_names_endorsed": entries.iter().map(|e| e.endorsed_names.len()).sum::<usize>(),
        "entries_with_embeddings": entries.iter().filter(|e| e.embedding.is_some()).count(),
        "sample": entries.iter().take(3).map(|e| serde_json::json!({
            "id": e.playbook_id,
            "operation": e.operation,
            "city": e.city,
            "state": e.state,
            "endorsed": e.endorsed_names,
        })).collect::<Vec<_>>(),
    }))
}

#[derive(Deserialize)]
struct RetirePlaybookRequest {
    /// Retire by playbook_id — exact match, single entry. Used for
    /// manual operator retirement via the UI.
    #[serde(default)]
    playbook_id: Option<String>,
    /// Retire by scope — city + state required, with a fingerprint
    /// that entries must match to survive. Fingerprint mismatch → retire.
    /// Use when a schema migration produces a new fingerprint and
    /// historical playbooks need to be auto-retired.
    #[serde(default)]
    city: Option<String>,
    #[serde(default)]
    state: Option<String>,
    #[serde(default)]
    current_schema_fingerprint: Option<String>,
    /// Human-readable reason stored on the retired entry.
    reason: String,
}

/// Phase 25 retirement endpoint. Two modes:
///   {playbook_id, reason}                    → retire one
///   {city, state, current_schema_fingerprint, reason} → retire all
///                                               entries in scope whose
///                                               fingerprint differs
async fn retire_playbook_memory(
    State(state): State<VectorState>,
    Json(req): Json<RetirePlaybookRequest>,
) -> impl IntoResponse {
    if let Some(id) = &req.playbook_id {
        return match state.playbook_memory.retire_one(id, &req.reason).await {
            Ok(found) => Ok(Json(serde_json::json!({ "mode": "by_id", "retired": if found { 1 } else { 0 } }))),
            Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
        };
    }
    if let (Some(city), Some(state_code), Some(fp)) = (&req.city, &req.state, &req.current_schema_fingerprint) {
        return match state.playbook_memory.retire_on_schema_drift(city, state_code, fp, &req.reason).await {
            Ok(n) => Ok(Json(serde_json::json!({ "mode": "schema_drift", "retired": n, "city": city, "state": state_code }))),
            Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
        };
    }
    Err((StatusCode::BAD_REQUEST,
        "supply either {playbook_id, reason} or {city, state, current_schema_fingerprint, reason}".into()))
}

/// Phase 27 — request body for `POST /playbook_memory/revise`. Same
/// shape as a seed request minus `append` (revise is always
/// append-semantics for a specific parent) plus `parent_id`. The new
/// version's `playbook_id` is derived deterministically so callers get
/// the same id back from repeated revises with identical content —
/// useful for idempotent retry paths.
#[derive(Deserialize)]
struct RevisePlaybookRequest {
    parent_id: String,
    operation: String,
    approach: String,
    context: String,
    endorsed_names: Vec<String>,
    #[serde(default)]
    schema_fingerprint: Option<String>,
    #[serde(default)]
    valid_until: Option<String>,
}

/// Phase 27 — create a new version of an existing playbook. The parent
/// is marked superseded; the new entry inherits the chain via
/// `parent_id` and carries `version = parent.version + 1`. Errors with
/// 400 on a retired or already-superseded parent (must revise the tip
/// of the chain). Embeds the new text through the same shape as
/// `/seed` so cosine similarity stays comparable across rebuild + seed
/// + revise entries.
async fn revise_playbook_memory(
    State(state): State<VectorState>,
    Json(req): Json<RevisePlaybookRequest>,
) -> Result<Json<serde_json::Value>, (StatusCode, String)> {
    let text = format!(
        "{} | {} | {} | fills: {}",
        req.operation, req.approach, req.context,
        req.endorsed_names.join(", "),
    );
    let resp = state.ai_client.embed(EmbedRequest { texts: vec![text], model: None })
        .await
        .map_err(|e| (StatusCode::BAD_GATEWAY, format!("embed revise: {e}")))?;
    if resp.embeddings.is_empty() {
        return Err((StatusCode::BAD_GATEWAY, "embed returned nothing".into()));
    }
    let emb: Vec<f32> = resp.embeddings[0].iter().map(|&x| x as f32).collect();

    let (city, state_) = {
        let after_in = req.operation.split(" in ").nth(1).unwrap_or("");
        let mut parts = after_in.splitn(2, ',');
        let city = parts.next().map(|s| s.trim().to_string()).filter(|s| !s.is_empty());
        let state = parts.next()
            .map(|s| s.trim().chars().take_while(|c| c.is_ascii_alphabetic()).collect::<String>())
            .filter(|s| !s.is_empty());
        (city, state)
    };
    if city.is_none() || state_.is_none() {
        return Err((StatusCode::BAD_REQUEST,
            "operation must match 'fill: Role xN in City, ST' shape".into()));
    }

    let ts = chrono::Utc::now().to_rfc3339();
    use sha2::{Digest, Sha256};
    let mut h = Sha256::new();
    h.update(ts.as_bytes());
    h.update(b"|");
    h.update(req.parent_id.as_bytes());
    h.update(b"|");
    h.update(req.operation.as_bytes());
    let bytes = h.finalize();
    let pid = format!("pb-rev-{}", bytes.iter().take(8).map(|b| format!("{b:02x}")).collect::<String>());

    let new_entry = playbook_memory::PlaybookEntry {
        playbook_id: pid.clone(),
        operation: req.operation,
        approach: req.approach,
        context: req.context,
        timestamp: ts,
        endorsed_names: req.endorsed_names,
        city, state: state_,
        embedding: Some(emb),
        schema_fingerprint: req.schema_fingerprint,
        valid_until: req.valid_until,
        retired_at: None,
        retirement_reason: None,
        // revise_entry overwrites these from the parent — values here
        // are just placeholders so the struct is well-formed.
        version: 1,
        parent_id: None,
        superseded_at: None,
        superseded_by: None,
    };

    let outcome = state.playbook_memory.revise_entry(&req.parent_id, new_entry)
        .await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    Ok(Json(serde_json::json!({
        "outcome": outcome,
        "entries_after": state.playbook_memory.entry_count().await,
    })))
}

/// Phase 27 — return the full version chain containing `playbook_id`,
/// ordered root → tip. 404 if the id isn't present. The walker is
/// cycle-safe by construction (visited set per direction).
async fn playbook_memory_history(
    State(state): State<VectorState>,
    Path(playbook_id): Path<String>,
) -> Result<Json<serde_json::Value>, (StatusCode, String)> {
    let chain = state.playbook_memory.history(&playbook_id).await;
    if chain.is_empty() {
        return Err((StatusCode::NOT_FOUND, format!("no playbook with id '{playbook_id}'")));
    }
    Ok(Json(serde_json::json!({
        "playbook_id": playbook_id,
        "versions": chain.len(),
        "chain": chain,
    })))
}

/// Phase 25 status endpoint — reports retirement counts so dashboards
/// can show "N playbooks retired (12 from 2026-05 schema migration)".
/// Phase 27 added `superseded` as a distinct counter.
async fn playbook_memory_status(
    State(state): State<VectorState>,
) -> impl IntoResponse {
    let (total, retired, superseded, failures) = state.playbook_memory.status_counts().await;
    // `active` = entries eligible for boost. Retired and superseded are
    // distinct exclusion reasons; subtract both. An entry can in principle
    // be both retired AND superseded (e.g. revised then retired) so
    // saturating_sub guards against underflow if that pathological case
    // ever lands.
    let inactive = retired + superseded;
    Json(serde_json::json!({
        "total": total,
        "retired": retired,
        "superseded": superseded,
        "active": total.saturating_sub(inactive),
        "failures": failures,
    }))
}

async fn lance_recall_harness(
    State(state): State<VectorState>,
    Path(index_name): Path<String>,
    Json(req): Json<LanceRecallRequest>,
) -> impl IntoResponse {
    let t0 = std::time::Instant::now();

    let harness_set = state.harness_store.load_for_index(&index_name, &req.harness).await
        .map_err(|e| (StatusCode::NOT_FOUND, format!("harness: {e}")))?;
    if !harness_set.ground_truth_built {
        return Err((StatusCode::BAD_REQUEST,
            "harness has no ground truth — run a regular /hnsw/trial first to compute it".into()));
    }

    let lance_store = state.lance.store_for(&index_name).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;

    let k = req.top_k;
    let mut per_query = Vec::with_capacity(harness_set.queries.len());
    let mut latencies: Vec<f32> = Vec::with_capacity(harness_set.queries.len());
    let mut recalls: Vec<f32> = Vec::with_capacity(harness_set.queries.len());

    for q in &harness_set.queries {
        let qv = match &q.query_embedding {
            Some(v) => v,
            None => continue,
        };
        let gt = match &q.ground_truth {
            Some(gt) => gt,
            None => continue,
        };

        let qt0 = std::time::Instant::now();
        let hits = lance_store.search(
            qv,
            k,
            Some(req.nprobes.unwrap_or(LANCE_DEFAULT_NPROBES)),
            Some(req.refine_factor.unwrap_or(LANCE_DEFAULT_REFINE_FACTOR)),
        ).await
            .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("search: {e}")))?;
        let lat_us = qt0.elapsed().as_micros() as f32;

        let predicted: Vec<String> = hits.iter().map(|h| h.doc_id.clone()).collect();
        let recall = harness::recall_at_k(&predicted, gt, k);

        per_query.push(LanceRecallQuery {
            query_id: q.id.clone(),
            recall,
            latency_us: lat_us,
            hits_returned: hits.len(),
        });
        latencies.push(lat_us);
        recalls.push(recall);
    }

    let mean_recall = if recalls.is_empty() { 0.0 } else {
        recalls.iter().sum::<f32>() / recalls.len() as f32
    };
    latencies.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let p = |pct: f32| -> f32 {
        if latencies.is_empty() { return 0.0; }
        let idx = ((latencies.len() as f32 - 1.0) * pct).round() as usize;
        latencies[idx.min(latencies.len() - 1)]
    };

    Ok(Json(LanceRecallResult {
        index_name,
        harness: req.harness,
        queries: per_query.len(),
        top_k: k,
        mean_recall,
        per_query,
        latency_p50_us: p(0.50),
        latency_p95_us: p(0.95),
        total_duration_secs: t0.elapsed().as_secs_f32(),
    }))
}

/// Build a scalar btree index on a column (typically `doc_id`). Makes
/// filter-pushdown queries O(log N) instead of full-fragment scan.
async fn lance_build_scalar_index(
    State(state): State<VectorState>,
    Path((index_name, column)): Path<(String, String)>,
) -> impl IntoResponse {
    let lance_store = state.lance.store_for(&index_name).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    match lance_store.build_scalar_index(&column).await {
        Ok(stats) => Ok(Json(stats)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

#[cfg(test)]
mod extractor_tests {
    use super::*;

    #[test]
    fn extract_target_geo_basic() {
        let f = "role = 'Welder' AND city = 'Toledo' AND state = 'OH' AND CAST(availability AS DOUBLE) > 0.5";
        assert_eq!(extract_target_geo(f), Some(("Toledo".into(), "OH".into())));
    }

    #[test]
    fn extract_target_geo_missing_state_returns_none() {
        let f = "role = 'Welder' AND city = 'Toledo'";
        assert_eq!(extract_target_geo(f), None);
    }

    #[test]
    fn extract_target_geo_word_boundary() {
        // "civilian" contains "city" as a substring — must not match.
        let f = "civilian_rank = 1 AND city = 'Toledo' AND state = 'OH'";
        assert_eq!(extract_target_geo(f), Some(("Toledo".into(), "OH".into())));
    }

    #[test]
    fn extract_target_role_basic() {
        let f = "role = 'Welder' AND city = 'Toledo'";
        assert_eq!(extract_target_role(f), Some("Welder".into()));
    }

    #[test]
    fn extract_target_role_none_when_absent() {
        let f = "city = 'Toledo' AND state = 'OH'";
        assert_eq!(extract_target_role(f), None);
    }

    #[test]
    fn extract_target_role_multi_word() {
        let f = "role = 'Warehouse Associate' AND city = 'Chicago'";
        assert_eq!(extract_target_role(f), Some("Warehouse Associate".into()));
    }
}