// NOTE(review): generic type parameters appear to have been stripped from this
// file by an extraction step (e.g. `Arc,`, `Arc>>`, `Vec,`, `Option,`,
// `chrono::DateTime,` all lack their `<...>` arguments). Tokens are preserved
// below exactly as found — restore the parameters from the repository before
// compiling.

use axum::{ Json, Router, extract::{Path, Query, State}, http::StatusCode, response::IntoResponse, routing::{get, post}, };
use object_store::ObjectStore;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use aibridge::client::{AiClient, EmbedRequest, GenerateRequest};
use catalogd::registry::Registry as CatalogRegistry;
use storaged::registry::BucketRegistry;
use crate::{agent, autotune, chunker, embedding_cache, harness, hnsw, index_registry, jobs, lance_backend, playbook_memory, promotion, rag, refresh, search, store, supervisor, trial};

/// Shared state cloned into every vectord route handler.
#[derive(Clone)]
pub struct VectorState {
    // Object store holding index artifacts (parquet, trial journals, promotion
    // files). NOTE(review): type parameter stripped — presumably
    // Arc<dyn ObjectStore>; confirm against the repo.
    pub store: Arc,
    // Client used for embedding + LLM generation calls.
    pub ai_client: AiClient,
    // Tracks background indexing jobs (create/progress/complete/fail).
    pub job_tracker: jobs::JobTracker,
    // Registry of IndexMeta records (register/get/list).
    pub index_registry: index_registry::IndexRegistry,
    // In-memory HNSW indexes (build/search/list/has_index).
    pub hnsw_store: hnsw::HnswStore,
    // Caches loaded embeddings; evicted on bucket migration.
    pub embedding_cache: embedding_cache::EmbeddingCache,
    pub trial_journal: trial::TrialJournal,
    /// Federation-aware harness store — resolves eval artifacts to each
    /// index's recorded bucket, falling back to primary for legacy evals.
    pub harness_store: harness::HarnessStore,
    /// Catalog registry — needed by the Phase C refresh path to mark/clear
    /// staleness and look up dataset manifests.
    pub catalog: CatalogRegistry,
    /// Phase 16: promoted HNSW configs. Activation + autotune read/write here.
    pub promotion_registry: promotion::PromotionRegistry,
    /// Phase 16.2: handle to the background autotune agent. Always
    /// present — if the agent is disabled in config, the handle drops
    /// incoming triggers silently.
    pub agent_handle: agent::AgentHandle,
    /// Phase B (federation layer 2): bucket registry for per-profile
    /// bucket auto-provisioning on activation.
    pub bucket_registry: Arc,
    /// Phase C (two-profile VRAM gate): tracks which profile is currently
    /// "active" on the GPU. Singleton — one profile at a time holds its
    /// model in VRAM. Swapping profiles with different ollama_name unloads
    /// the previous one (keep_alive=0) before preloading the new one.
    ///
    /// `None` = no profile has been activated this session; any first
    /// activation just preloads and takes the slot.
    // NOTE(review): parameters stripped — likely Arc<RwLock<Option<ActiveProfileSlot>>>.
    pub active_profile: Arc>>,
    /// ADR-019 hybrid: handles to Lance datasets keyed by index name.
    /// Lazy-created on first /vectors/lance/* call.
    pub lance: lance_backend::LanceRegistry,
    /// Phase 19 — meta-index feedback. Embeds past successful_playbooks
    /// and, when `use_playbook_memory` is set on /vectors/hybrid, boosts
    /// workers that were actually filled in semantically-similar past ops.
    pub playbook_memory: playbook_memory::PlaybookMemory,
}

/// What the active-profile singleton records. Narrow — we don't need the
/// full ModelProfile here, just enough to know what to unload on swap.
#[derive(Debug, Clone, Serialize)]
pub struct ActiveProfileSlot {
    pub profile_id: String,
    pub ollama_name: String,
    // NOTE(review): parameter stripped — likely chrono::DateTime<chrono::Utc>.
    pub activated_at: chrono::DateTime,
}

/// Build the vectord router. All routes share one cloned `VectorState`.
pub fn router(state: VectorState) -> Router {
    Router::new()
        .route("/health", get(health))
        .route("/index", post(create_index))
        .route("/indexes", get(list_indexes))
        .route("/indexes/{name}", get(get_index_meta))
        .route("/indexes/{name}/bucket", axum::routing::patch(migrate_index_bucket))
        .route("/jobs", get(list_jobs))
        .route("/jobs/{id}", get(get_job))
        .route("/search", post(search_index))
        .route("/rag", post(rag_query))
        .route("/hybrid", post(hybrid_search))
        .route("/hnsw/build", post(build_hnsw))
        .route("/hnsw/search", post(search_hnsw))
        .route("/hnsw/list", get(list_hnsw))
        // Trial system — parameterized tuning loop
        .route("/hnsw/trial", post(run_trial))
        .route("/hnsw/trials/{index_name}", get(list_trials))
        .route("/hnsw/trials/{index_name}/best", get(best_trial))
        // Eval sets
        .route("/hnsw/evals", get(list_evals))
        .route("/hnsw/evals/{name}", get(get_eval).put(put_eval))
        .route("/hnsw/evals/{name}/autogen", post(autogen_eval))
        // Cache management
        .route("/hnsw/cache/stats", get(cache_stats))
        .route("/hnsw/cache/{index_name}", axum::routing::delete(cache_evict))
        // Phase C: embedding refresh
        .route("/refresh/{dataset_name}", post(refresh_dataset))
        .route("/stale", get(list_stale))
        // Phase 17: profile activation — pre-load caches + HNSW for this
        // model's bound data. First search after activate is warm.
        .route("/profile/{id}/activate", post(activate_profile))
        .route("/profile/{id}/deactivate", post(deactivate_profile))
        .route("/profile/{id}/search", post(profile_scoped_search))
        // Phase 17 VRAM gate: which profile currently owns the GPU?
        .route("/profile/active", get(get_active_profile))
        // Phase 16: promotion + autotune
        .route("/hnsw/promote/{index}/{trial_id}", post(promote_trial))
        .route("/hnsw/rollback/{index}", post(rollback_promotion))
        .route("/hnsw/promoted/{index}", get(get_promoted))
        .route("/hnsw/autotune", post(run_autotune_endpoint))
        // Phase 16.2: background autotune agent
        .route("/agent/status", get(agent_status))
        .route("/agent/stop", post(agent_stop))
        .route("/agent/enqueue/{index_name}", post(agent_enqueue))
        // ADR-019: Lance hybrid backend
        .route("/lance/migrate/{index_name}", post(lance_migrate))
        .route("/lance/index/{index_name}", post(lance_build_index))
        .route("/lance/search/{index_name}", post(lance_search))
        .route("/lance/doc/{index_name}/{doc_id}", get(lance_get_doc))
        .route("/lance/append/{index_name}", post(lance_append))
        .route("/lance/stats/{index_name}", get(lance_stats))
        .route("/lance/scalar-index/{index_name}/{column}", post(lance_build_scalar_index))
        .route("/lance/recall/{index_name}", post(lance_recall_harness))
        // Phase 19: playbook memory — the meta-index feedback loop
        .route("/playbook_memory/rebuild", post(rebuild_playbook_memory))
        .route("/playbook_memory/stats", get(playbook_memory_stats))
        .route("/playbook_memory/seed", post(seed_playbook_memory))
        .route("/playbook_memory/persist_sql", post(persist_playbook_memory_sql))
        .route("/playbook_memory/patterns", post(discover_playbook_patterns))
        .route("/playbook_memory/mark_failed", post(mark_playbook_failed))
        .route("/playbook_memory/retire", post(retire_playbook_memory))
        .route("/playbook_memory/revise", post(revise_playbook_memory))
        .route("/playbook_memory/history/{id}", get(playbook_memory_history))
        .route("/playbook_memory/status", get(playbook_memory_status))
        .with_state(state)
}

/// Liveness probe — constant body, no state touched.
async fn health() -> &'static str {
    "vectord ok"
}

// --- Background Index Creation ---

#[derive(Deserialize)]
struct CreateIndexRequest {
    index_name: String,
    source: String,
    // NOTE(review): element type stripped — presumably Vec<DocInput>.
    documents: Vec,
    // Chunking knobs; defaults applied in create_index (500 / 50).
    chunk_size: Option,
    overlap: Option,
    /// Federation layer 2: optional bucket to hold this index's trial
    /// journal + promotion file. Defaults to "primary" — pre-existing
    /// clients that don't know about federation keep working unchanged.
    #[serde(default)]
    bucket: Option,
}

#[derive(Deserialize)]
struct DocInput {
    id: String,
    text: String,
}

#[derive(Serialize)]
struct CreateIndexResponse {
    job_id: String,
    index_name: String,
    documents: usize,
    chunks: usize,
    message: String,
}

/// POST /index — chunk the documents synchronously, then spawn a
/// supervised background embedding job and return 202 with a job id
/// the caller polls via /jobs/{id}.
async fn create_index(
    State(state): State,
    Json(req): Json,
) -> impl IntoResponse {
    let chunk_size = req.chunk_size.unwrap_or(500);
    let overlap = req.overlap.unwrap_or(50);
    // Chunk synchronously (fast)
    let doc_ids: Vec = req.documents.iter().map(|d| d.id.clone()).collect();
    let texts: Vec = req.documents.iter().map(|d| d.text.clone()).collect();
    let chunks = chunker::chunk_column(&req.source, &doc_ids, &texts, chunk_size, overlap);
    if chunks.is_empty() {
        return Err((StatusCode::BAD_REQUEST, "no text to index".to_string()));
    }
    let n_docs = req.documents.len();
    let n_chunks = chunks.len();
    let index_name = req.index_name.clone();
    let bucket = req.bucket.clone().unwrap_or_else(|| "primary".to_string());
    // Create job and return immediately
    let job_id = state.job_tracker.create(&index_name, n_chunks).await;
    tracing::info!("job {job_id}: indexing '{}' — {} docs → {} chunks (background)", index_name, n_docs, n_chunks);
    // Spawn supervised dual-pipeline embedding
    let tracker = state.job_tracker.clone();
    let ai_client = state.ai_client.clone();
    let obj_store = state.store.clone();
    let registry = state.index_registry.clone();
    let jid = job_id.clone();
    let source_name = req.source.clone();
    let idx_name = req.index_name.clone();
    tokio::spawn(async move {
        let start_time = std::time::Instant::now();
        let config = supervisor::SupervisorConfig::default();
        let result = supervisor::run_supervised(
            &jid, &idx_name, chunks, &ai_client, &obj_store, &tracker, config,
        ).await;
        match result {
            Ok(key) => {
                let elapsed = start_time.elapsed().as_secs_f32();
                let rate = if elapsed > 0.0 { n_chunks as f32 / elapsed } else { 0.0 };
                // Register index metadata with model version info
                let meta = index_registry::IndexMeta {
                    index_name: idx_name.clone(),
                    source: source_name,
                    model_name: "nomic-embed-text".to_string(), // from sidecar config
                    model_version: "latest".to_string(),
                    dimensions: 768,
                    chunk_count: n_chunks,
                    doc_count: n_docs,
                    chunk_size: chunk_size,
                    overlap: overlap,
                    storage_key: key.clone(),
                    created_at: chrono::Utc::now(),
                    build_time_secs: elapsed,
                    chunks_per_sec: rate,
                    bucket: bucket.clone(),
                    vector_backend: shared::types::VectorBackend::Parquet,
                    id_prefix: None,
                };
                // Registration failure is deliberately ignored here — the
                // job is still marked complete with its storage key.
                let _ = registry.register(meta).await;
                tracker.complete(&jid, key).await;
                tracing::info!("job {jid}: completed — {n_chunks} chunks in {elapsed:.0}s ({rate:.0}/sec)");
            }
            Err(e) => {
                tracker.fail(&jid, e.clone()).await;
                tracing::error!("job {jid}: failed — {e}");
            }
        }
    });
    Ok((StatusCode::ACCEPTED, Json(CreateIndexResponse {
        job_id,
        index_name: req.index_name,
        documents: n_docs,
        chunks: n_chunks,
        message: format!("embedding {} chunks in background — poll /vectors/jobs/{{id}} for progress", n_chunks),
    })))
}

// --- Index Registry ---

#[derive(Deserialize)]
struct IndexListQuery {
    source: Option,
    model: Option,
}

/// GET /indexes — list registered indexes, optionally filtered by
/// source and/or model query params.
async fn list_indexes(
    State(state): State,
    Query(q): Query,
) -> impl IntoResponse {
    let indexes = state.index_registry.list(q.source.as_deref(), q.model.as_deref()).await;
    Json(indexes)
}

/// GET /indexes/{name} — fetch one index's metadata, 404 if unknown.
async fn get_index_meta(
    State(state): State,
    Path(name): Path,
) -> impl IntoResponse {
    match state.index_registry.get(&name).await {
        Some(meta) => Ok(Json(meta)),
        None => Err((StatusCode::NOT_FOUND, format!("index not found: {name}"))),
    }
}

#[derive(Deserialize)]
struct MigrateBucketRequest {
    dest_bucket: String,
    /// If true, delete artifacts from the source bucket after the pointer
    /// flip. Default false — keeping source copies means a failed migration
    /// is recoverable by editing IndexMeta.bucket back, and a successful
    /// migration leaves inspectable forensics until an operator sweeps.
    #[serde(default)]
    delete_source: bool,
}

#[derive(Serialize)]
struct MigrateBucketReport {
    index_name: String,
    source_bucket: String,
    dest_bucket: String,
    /// Artifact keys that were copied (or attempted). Order follows copy order.
    copied: Vec,
    /// Artifact prefixes that had nothing to copy (optional files missing,
    /// trial journal empty, etc).
    skipped: Vec,
    /// Subset of `copied` that was subsequently deleted from the source.
    deleted_source: Vec,
    duration_secs: f32,
}

/// Move an index's artifacts from its current bucket to `dest_bucket`.
/// Parquet-backed indexes only — Lance migration needs URI rewriting that
/// isn't in scope for this endpoint. Copies the vector data, trial journal,
/// promotion file, and auto-generated harness; updates `IndexMeta.bucket`
/// last so a mid-flight failure leaves the index still usable at its
/// original location. Evicts the `EmbeddingCache` entry so the next load
/// re-reads from the new bucket.
// NOTE(review): several generic parameters in this block were stripped by
// extraction (`Result, (StatusCode, String)>`, `Vec = Vec::new()`, `&Arc,`).
// Tokens preserved as found; restore from the repo before compiling.
async fn migrate_index_bucket(
    State(state): State,
    Path(name): Path,
    Json(req): Json,
) -> Result, (StatusCode, String)> {
    let t0 = std::time::Instant::now();
    // 404 early if the index isn't registered at all.
    let mut meta = state
        .index_registry
        .get(&name)
        .await
        .ok_or_else(|| (StatusCode::NOT_FOUND, format!("index '{name}' not found")))?;
    if meta.vector_backend == shared::types::VectorBackend::Lance {
        return Err((
            StatusCode::BAD_REQUEST,
            "Lance-backed indexes cannot be migrated via this endpoint — \
             Lance URIs are bucket-specific; a separate migrate_lance tool \
             is needed".into(),
        ));
    }
    // Destination must already be a registered bucket.
    if !state.bucket_registry.contains(&req.dest_bucket) {
        return Err((
            StatusCode::BAD_REQUEST,
            format!("dest bucket '{}' not registered", req.dest_bucket),
        ));
    }
    let source_bucket = meta.bucket.clone();
    if source_bucket == req.dest_bucket {
        return Err((
            StatusCode::BAD_REQUEST,
            format!("source and dest are both '{source_bucket}' — nothing to migrate"),
        ));
    }
    let src = state
        .bucket_registry
        .get(&source_bucket)
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?;
    let dst = state
        .bucket_registry
        .get(&req.dest_bucket)
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?;
    let mut copied: Vec = Vec::new();
    let mut skipped: Vec = Vec::new();
    // 1. Vector data (single parquet file for this backend).
    copy_key(&src, &dst, &meta.storage_key)
        .await
        .map_err(|e| {
            (StatusCode::INTERNAL_SERVER_ERROR, format!("copy {}: {e}", meta.storage_key))
        })?;
    copied.push(meta.storage_key.clone());
    // 2. Trial journal batches — per-index directory of JSONL files.
    let trial_prefix = format!("_hnsw_trials/{name}/");
    let trial_keys = storaged::ops::list(&src, Some(&trial_prefix))
        .await
        .unwrap_or_default();
    if trial_keys.is_empty() {
        skipped.push(trial_prefix);
    }
    for k in &trial_keys {
        copy_key(&src, &dst, k)
            .await
            .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("copy {k}: {e}")))?;
        copied.push(k.clone());
    }
    // 3. Promotion file (optional — absent for never-promoted indexes).
    let promo_key = format!("_hnsw_promotions/{name}.json");
    match copy_key(&src, &dst, &promo_key).await {
        Ok(()) => copied.push(promo_key),
        Err(_) => skipped.push(promo_key),
    }
    // 4. Auto-generated harness (optional — absent if agent never ran).
    let harness_key = format!("_hnsw_evals/{name}_auto.json");
    match copy_key(&src, &dst, &harness_key).await {
        Ok(()) => copied.push(harness_key),
        Err(_) => skipped.push(harness_key),
    }
    // 5. Pointer flip — IndexMeta.bucket now points at destination. This
    // is the commit point; earlier failures leave copies in dest but the
    // index still usable at source.
    meta.bucket = req.dest_bucket.clone();
    state
        .index_registry
        .register(meta)
        .await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("update meta: {e}")))?;
    // 6. Cache eviction — next load reads the new bucket's parquet.
    state.embedding_cache.evict(&name).await;
    // 7. Optional source cleanup.
    // NOTE(review): delete errors are swallowed by design (best-effort
    // sweep) — a failed delete simply stays out of `deleted_source`.
    let mut deleted_source: Vec = Vec::new();
    if req.delete_source {
        for k in &copied {
            if storaged::ops::delete(&src, k).await.is_ok() {
                deleted_source.push(k.clone());
            }
        }
    }
    Ok(Json(MigrateBucketReport {
        index_name: name,
        source_bucket,
        dest_bucket: req.dest_bucket,
        copied,
        skipped,
        deleted_source,
        duration_secs: t0.elapsed().as_secs_f32(),
    }))
}

/// Stream a single object from one bucket to another. Uses the existing
/// `storaged::ops` get + put primitives — no native copy in object_store
/// across heterogeneous backends (local ↔ S3), so an in-memory hop is
/// unavoidable. Bounded by individual object size, which for our parquet
/// + jsonl artifacts tops out around a few hundred MB.
async fn copy_key(
    src: &Arc,
    dst: &Arc,
    key: &str,
) -> Result<(), String> {
    let data = storaged::ops::get(src, key).await?;
    storaged::ops::put(dst, key, data).await
}

// --- unused legacy function below, kept for reference ---

#[allow(dead_code)]
/// Legacy single-pipeline embedding (replaced by supervisor).
// NOTE(review): generic parameters stripped throughout this section by
// extraction (`Vec> = Vec::new()`, `-> Result {`, `Option>`,
// `as_string_opt::()`, `collect::>()`, etc.). Tokens preserved exactly as
// found; restore the parameters from the repository before compiling.
async fn _run_embedding_job_legacy(
    job_id: &str,
    index_name: &str,
    chunks: &[chunker::TextChunk],
    ai_client: &AiClient,
    store: &Arc,
    tracker: &jobs::JobTracker,
) -> Result {
    let batch_size = 32;
    let mut all_vectors: Vec> = Vec::new();
    let start = std::time::Instant::now();
    for (i, batch) in chunks.chunks(batch_size).enumerate() {
        let texts: Vec = batch.iter().map(|c| c.text.clone()).collect();
        let embed_resp = ai_client.embed(EmbedRequest {
            texts,
            model: None,
        }).await.map_err(|e| format!("embed batch {} error: {e}", i))?;
        all_vectors.extend(embed_resp.embeddings);
        // Update progress
        let elapsed = start.elapsed().as_secs_f32();
        let rate = if elapsed > 0.0 { all_vectors.len() as f32 / elapsed } else { 0.0 };
        tracker.update_progress(job_id, all_vectors.len(), rate).await;
        // Log every 100 batches
        if (i + 1) % 100 == 0 {
            let pct = (all_vectors.len() as f32 / chunks.len() as f32) * 100.0;
            let eta = if rate > 0.0 { (chunks.len() - all_vectors.len()) as f32 / rate } else { 0.0 };
            tracing::info!("job {job_id}: {}/{} chunks ({pct:.0}%), {rate:.0}/sec, ETA {eta:.0}s", all_vectors.len(), chunks.len());
        }
    }
    // Store
    let key = store::store_embeddings(store, index_name, chunks, &all_vectors).await?;
    Ok(key)
}

// --- Job Status ---

/// GET /jobs — all tracked jobs.
async fn list_jobs(State(state): State) -> impl IntoResponse {
    let jobs = state.job_tracker.list().await;
    Json(jobs)
}

/// GET /jobs/{id} — one job's status, 404 if unknown.
async fn get_job(
    State(state): State,
    Path(id): Path,
) -> impl IntoResponse {
    match state.job_tracker.get(&id).await {
        Some(job) => Ok(Json(job)),
        None => Err((StatusCode::NOT_FOUND, format!("job not found: {id}"))),
    }
}

// --- Search ---

#[derive(Deserialize)]
struct SearchRequest {
    index_name: String,
    query: String,
    top_k: Option,
}

#[derive(Serialize)]
struct SearchResponse {
    results: Vec,
    query: String,
}

/// POST /search — embed the query, load the index's embeddings, and run
/// brute-force similarity for the top_k (default 5) results.
async fn search_index(
    State(state): State,
    Json(req): Json,
) -> impl IntoResponse {
    let top_k = req.top_k.unwrap_or(5);
    let embed_resp = state.ai_client.embed(EmbedRequest {
        texts: vec![req.query.clone()],
        model: None,
    }).await.map_err(|e| (StatusCode::BAD_GATEWAY, format!("embed error: {e}")))?;
    if embed_resp.embeddings.is_empty() {
        return Err((StatusCode::BAD_GATEWAY, "no embedding returned".to_string()));
    }
    let query_vec: Vec = embed_resp.embeddings[0].iter().map(|&x| x as f32).collect();
    let embeddings = store::load_embeddings(&state.store, &req.index_name)
        .await
        .map_err(|e| (StatusCode::NOT_FOUND, format!("index not found: {e}")))?;
    let results = search::search(&query_vec, &embeddings, top_k);
    Ok(Json(SearchResponse {
        results,
        query: req.query,
    }))
}

// --- RAG ---

#[derive(Deserialize)]
struct RagRequest {
    index_name: String,
    question: String,
    top_k: Option,
}

/// POST /rag — delegate retrieve-then-generate to the rag module.
async fn rag_query(
    State(state): State,
    Json(req): Json,
) -> impl IntoResponse {
    let top_k = req.top_k.unwrap_or(5);
    match rag::query(&req.question, &req.index_name, top_k, &state.store, &state.ai_client).await {
        Ok(resp) => Ok(Json(resp)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

// --- Hybrid SQL+Vector Search ---
//
// The fix for the core RAG gap: vector search alone can't do structured
// filtering (state, role, reliability threshold). SQL alone can't do
// semantic similarity ("who could handle this kind of work"). Hybrid
// does both: SQL narrows to structurally-valid candidates, vector
// ranks them by semantic relevance, LLM generates from verified context.

#[derive(Deserialize)]
struct HybridRequest {
    /// Natural language question — used for embedding + LLM generation.
    question: String,
    /// Vector index to search against.
    index_name: String,
    /// SQL WHERE clause to pre-filter. Applied against the index's source
    /// dataset. Example: "state = 'IL' AND reliability > 0.8"
    /// Safety: runs through DataFusion's parser so injection is bounded
    /// by what DataFusion accepts (no DDL, no writes).
    #[serde(default)]
    sql_filter: Option,
    /// Dataset to run the SQL filter against. Defaults to the index's
    /// source if omitted.
    #[serde(default)]
    filter_dataset: Option,
    /// Column in the SQL result that maps to the vector index's doc_id.
    /// Default: "worker_id" (for the Ethereal dataset) or "candidate_id".
    #[serde(default)]
    id_column: Option,
    #[serde(default = "default_top_k")]
    top_k: usize,
    /// If true, generate an LLM answer from the matched context.
    /// If false, just return the ranked matches (faster, no Ollama gen).
    #[serde(default = "default_true")]
    generate: bool,
    /// Phase 19: consult `playbook_memory` and boost workers that past
    /// similar playbooks successfully filled. Off by default so current
    /// callers keep deterministic ranking; opt-in unlocks the feedback.
    #[serde(default)]
    use_playbook_memory: bool,
    /// Number of past playbooks to consider when `use_playbook_memory`
    /// is on. Ignored otherwise. Defaults to 5.
    #[serde(default)]
    playbook_memory_k: Option,
}

/// serde default helper for `HybridRequest.generate`.
fn default_true() -> bool { true }

#[derive(serde::Serialize)]
struct HybridResponse {
    question: String,
    sql_filter: Option,
    sql_matches: usize,
    vector_reranked: usize,
    method: String,
    answer: Option,
    sources: Vec,
    duration_ms: u64,
}

#[derive(serde::Serialize)]
struct HybridSource {
    doc_id: String,
    chunk_text: String,
    score: f32,
    sql_verified: bool,
    /// Phase 19: how much the playbook_memory boost lifted this hit's
    /// score. 0.0 when `use_playbook_memory=false` or no past playbook
    /// endorsed this worker.
    #[serde(default, skip_serializing_if = "is_zero")]
    playbook_boost: f32,
    /// playbook_ids whose endorsement contributed to `playbook_boost`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    playbook_citations: Vec,
}

/// Serialization skip predicate for `playbook_boost` (float ≈ 0 check).
fn is_zero(x: &f32) -> bool { x.abs() < 1e-6 }

/// POST /hybrid — SQL pre-filter → vector rank → optional playbook-memory
/// boost → optional LLM answer. See the module comment above for rationale.
async fn hybrid_search(
    State(state): State,
    Json(req): Json,
) -> impl IntoResponse {
    let t0 = std::time::Instant::now();
    // Step 1: If SQL filter provided, run it to get the set of valid IDs.
    let valid_ids: Option> = if let Some(ref filter) = req.sql_filter {
        let index_meta = state.index_registry.get(&req.index_name).await;
        let dataset = req.filter_dataset.clone()
            .or_else(|| index_meta.map(|m| m.source.clone()))
            .unwrap_or_else(|| req.index_name.clone());
        let id_col = req.id_column.clone().unwrap_or_else(|| "worker_id".into());
        let sql = format!("SELECT CAST({id_col} AS VARCHAR) AS id FROM {dataset} WHERE {filter}");
        tracing::info!("hybrid: SQL filter → {sql}");
        // Use queryd through the catalog — same engine as /query/sql
        // Use the query engine to get JSON rows — avoids Arrow type
        // wrangling across DataFusion's Utf8View/StringViewArray variants.
        let engine = queryd::context::QueryEngine::new(
            state.catalog.clone(),
            state.bucket_registry.clone(),
            queryd::cache::MemCache::new(0),
        );
        match engine.query(&sql).await {
            Ok(batches) => {
                use arrow::array::{Array, AsArray};
                let mut ids = std::collections::HashSet::new();
                for batch in &batches {
                    if let Some(col) = batch.column_by_name("id") {
                        // DataFusion CAST(x AS VARCHAR) → StringViewArray.
                        // Try StringView first, then String, then Int.
                        if let Some(arr) = col.as_string_view_opt() {
                            for i in 0..arr.len() {
                                if !arr.is_null(i) { ids.insert(arr.value(i).to_string()); }
                            }
                        } else if let Some(arr) = col.as_string_opt::() {
                            for i in 0..arr.len() {
                                if !arr.is_null(i) { ids.insert(arr.value(i).to_string()); }
                            }
                        } else {
                            // Fallback: try as Int32/Int64 (if CAST didn't happen)
                            if let Some(arr) = col.as_any().downcast_ref::() {
                                for i in 0..arr.len() {
                                    if !arr.is_null(i) { ids.insert(arr.value(i).to_string()); }
                                }
                            } else if let Some(arr) = col.as_any().downcast_ref::() {
                                for i in 0..arr.len() {
                                    if !arr.is_null(i) { ids.insert(arr.value(i).to_string()); }
                                }
                            }
                        }
                    }
                }
                tracing::info!("hybrid: SQL filter returned {} IDs", ids.len());
                // NOTE(review): an empty SQL result collapses to None, which
                // downstream means "no filter" — verify that falling back to
                // unfiltered vector search (rather than zero results) is the
                // intended semantics.
                if ids.is_empty() { None } else { Some(ids) }
            }
            Err(e) => {
                return Err((StatusCode::BAD_REQUEST, format!("SQL filter error: {e}")));
            }
        }
    } else {
        None
    };
    // Step 2: Vector search — embed question, search index.
    let embed_resp = state.ai_client
        .embed(EmbedRequest { texts: vec![req.question.clone()], model: None })
        .await
        .map_err(|e| (StatusCode::BAD_GATEWAY, format!("embed: {e}")))?;
    if embed_resp.embeddings.is_empty() {
        return Err((StatusCode::BAD_GATEWAY, "no embedding".into()));
    }
    let qv: Vec = embed_resp.embeddings[0].iter().map(|&x| x as f32).collect();
    // When SQL-filtered: use brute-force cosine over all embeddings,
    // then filter by SQL IDs, then take top_k. HNSW's ef_search caps
    // results at ~30, which is too few to reliably intersect with
    // narrow SQL filters. Brute-force on 10K vectors is ~50ms — fast
    // enough for the hybrid path. Without SQL filter, use HNSW normally.
    let all_results = if valid_ids.is_some() {
        // Brute-force path: score ALL vectors, filter by SQL IDs later.
        let embeddings = store::load_embeddings(&state.store, &req.index_name).await
            .map_err(|e| (StatusCode::NOT_FOUND, format!("load embeddings: {e}")))?;
        search::search(&qv, &embeddings, embeddings.len()) // score everything
    } else if state.hnsw_store.has_index(&req.index_name).await {
        state.hnsw_store.search(&req.index_name, &qv, req.top_k).await
            .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?
            .into_iter()
            .map(|h| search::SearchResult {
                doc_id: h.doc_id,
                chunk_text: h.chunk_text,
                score: h.score,
                source: h.source,
                chunk_idx: h.chunk_idx as u32,
            })
            .collect::>()
    } else {
        let embeddings = store::load_embeddings(&state.store, &req.index_name).await
            .map_err(|e| (StatusCode::NOT_FOUND, format!("load embeddings: {e}")))?;
        search::search(&qv, &embeddings, req.top_k)
    };
    // Step 3: Filter vector results to only SQL-verified IDs.
    // ADR-020: read the index's id_prefix from the catalog instead of
    // hardcoding prefix stripping. Falls back to heuristic for legacy indexes.
    let id_prefix: Option = state.index_registry
        .get(&req.index_name).await
        .and_then(|m| m.id_prefix.clone());
    let sql_count = valid_ids.as_ref().map(|s| s.len()).unwrap_or(0);
    // Phase 19: when playbook_memory is consulted, pull a wider candidate
    // pool so endorsed workers outside the vanilla top-K can still be
    // boosted into visibility. 5× is a conservative multiplier — plenty
    // for a +0.25 boost to flip rankings without dragging the cost up.
    let fetch_k = if req.use_playbook_memory { req.top_k * 5 } else { req.top_k };
    let filtered: Vec = if let Some(ref ids) = valid_ids {
        all_results.into_iter()
            .filter(|r| {
                let raw_id = if let Some(ref prefix) = id_prefix {
                    r.doc_id.strip_prefix(prefix.as_str()).unwrap_or(&r.doc_id)
                } else {
                    // Legacy: heuristic strip for pre-ADR-020 indexes
                    r.doc_id.strip_prefix("W500K-")
                        .or_else(|| r.doc_id.strip_prefix("W500-"))
                        .or_else(|| r.doc_id.strip_prefix("W5K-"))
                        .or_else(|| r.doc_id.strip_prefix("W-"))
                        .or_else(|| r.doc_id.strip_prefix("CAND-"))
                        .unwrap_or(&r.doc_id)
                };
                ids.contains(raw_id)
            })
            .take(fetch_k)
            .collect()
    } else {
        all_results.into_iter().take(fetch_k).collect()
    };
    // Step 4: Build sources with SQL-verified flag.
    let mut sources: Vec = filtered.iter().map(|r| HybridSource {
        doc_id: r.doc_id.clone(),
        chunk_text: r.chunk_text.clone(),
        score: r.score,
        sql_verified: valid_ids.is_some(),
        playbook_boost: 0.0,
        playbook_citations: Vec::new(),
    }).collect();
    // Step 4b (Phase 19): if use_playbook_memory, look up semantically
    // similar past playbooks and boost workers they endorsed. Name-match
    // is on the tuple (city, state, name) extracted from chunk_text —
    // hybrid_search's SQL filter already narrowed to one city+state, so
    // this just needs to check the name against each playbook's endorsed
    // set. Additive boost on the existing vector score, then re-sort.
    if req.use_playbook_memory {
        let boost_k = req.playbook_memory_k.unwrap_or(playbook_memory::DEFAULT_TOP_K_PLAYBOOKS);
        // Extract target (city, state, role) from the SQL filter so
        // compute_boost_for can skip playbooks from other cities AND
        // prioritize exact role matches via the multi-strategy path.
        // The executor's filter shape is stable:
        // `... role = 'Welder' AND city = 'Toledo' AND state = 'OH' ...`.
        // Case-insensitive match, tolerant of single quotes.
        let target_geo = req.sql_filter.as_deref().and_then(extract_target_geo);
        let target_role = req.sql_filter.as_deref().and_then(extract_target_role);
        // We embedded the question as `qv` above — reuse it for the
        // playbook similarity lookup so we don't double-pay Ollama.
        let boosts = state.playbook_memory
            .compute_boost_for_filtered_with_role(
                &qv,
                boost_k,
                0.5,
                target_geo.as_ref().map(|(c, s)| (c.as_str(), s.as_str())),
                target_role.as_deref(),
            )
            .await;
        // Diagnostics for Phase 19 boost pipeline. Logged so item 3
        // investigation has ground truth:
        // - boosts.len(): how many (city,state,name) keys surfaced for
        //   this query (0 = playbook_memory found nothing semantically
        //   similar to the question).
        // - parsed: how many candidate chunks parsed cleanly into
        //   (name,city,state) via parse_worker_chunk.
        // - matched: how many parsed keys matched an entry in boosts.
        // 2026-04-21 — 20-scenario batch showed 34/40 ok combos never
        // got a citation. These counters pin whether the gap is on the
        // SIMILARITY side (boosts empty) or the MATCH side (parsed vs
        // boosted keys mismatch — e.g. name format drift).
        let mut parsed_count = 0usize;
        let mut matched_count = 0usize;
        for src in sources.iter_mut() {
            // Parse "{Name} — {Role} in {City}, {State}. …" chunk. Being
            // defensive: chunks from other datasets may not follow this
            // exact shape, so absent fields just skip the boost.
            if let Some((name, city, state)) = parse_worker_chunk(&src.chunk_text) {
                parsed_count += 1;
                let key = (city, state, name);
                if let Some(entry) = boosts.get(&key) {
                    src.score += entry.boost;
                    src.playbook_boost = entry.boost;
                    src.playbook_citations = entry.citations.clone();
                    matched_count += 1;
                }
            }
        }
        tracing::info!(
            "playbook_boost: boosts={} sources={} parsed={} matched={} target_geo={:?} target_role={:?} (query='{}')",
            boosts.len(),
            sources.len(),
            parsed_count,
            matched_count,
            target_geo,
            target_role,
            req.question.chars().take(60).collect::(),
        );
        // Re-rank: boosted scores can flip ordering.
        sources.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));
        // Finally trim to the caller's requested top_k — we pulled fetch_k
        // (5× wider) above specifically so the boost could reach workers
        // that would otherwise have been trimmed pre-boost.
        sources.truncate(req.top_k);
    }
    // Step 5: Generate answer if requested.
    let answer = if req.generate && !sources.is_empty() {
        let context: String = sources.iter().enumerate().map(|(i, s)| {
            format!("[{}] (id: {}, verified: {}) {}", i + 1, s.doc_id, s.sql_verified, s.chunk_text)
        }).collect::>().join("\n\n");
        let gen_resp = state.ai_client.generate(GenerateRequest {
            prompt: format!(
                "You are a staffing intelligence assistant. Answer based ONLY on these \
                 verified worker records. Every record has been SQL-verified against the \
                 database — you can trust the facts in them. Be specific: cite names, \
                 skills, certifications, scores, and locations.\n\n\
                 Records:\n{context}\n\n\
                 Question: {}\n\nAnswer:",
                req.question,
            ),
            model: None,
            system: None,
            temperature: Some(0.2),
            max_tokens: Some(512),
            think: None,
        }).await;
        // Generation failure degrades to answer=None rather than erroring.
        gen_resp.ok().map(|r| r.text.trim().to_string())
    } else {
        None
    };
    let method = if valid_ids.is_some() { "hybrid_sql_vector" } else { "vector_only" };
    Ok(Json(HybridResponse {
        question: req.question,
        sql_filter: req.sql_filter,
        sql_matches: sql_count,
        vector_reranked: sources.len(),
        method: method.into(),
        answer,
        sources,
        duration_ms: t0.elapsed().as_millis() as u64,
    }))
}

// --- HNSW Fast Search ---

#[derive(Deserialize)]
struct BuildHnswRequest {
    /// Name of the stored vector index to build HNSW from
    index_name: String,
    /// Optional config override. Omit to use the production default
    /// (ec=80 es=30 — see HnswConfig::default docs for rationale).
    #[serde(default)]
    config: Option,
}

/// Build an HNSW index from an existing stored vector index.
/// Uses the embedding cache so repeated builds don't reload from Parquet.
// NOTE(review): throughout this extract the generic parameter lists appear
// stripped (e.g. `State<VectorState>` shows as `State`, `Option<usize>` as
// `Option`, `Vec<Vec<f32>>` as `Vec>`). The code below preserves the extract
// byte-for-byte; restore the generics against the original file.
async fn build_hnsw(
    State(state): State,
    Json(req): Json,
) -> impl IntoResponse {
    // Missing config falls back to the production default (see BuildHnswRequest docs).
    let config = req.config.unwrap_or_default();
    tracing::info!(
        "building HNSW for '{}' ef_construction={} ef_search={}",
        req.index_name, config.ef_construction, config.ef_search,
    );
    // Cache-aware load: repeat builds skip the object-store round trip.
    let embeddings = state
        .embedding_cache
        .get_or_load(&req.index_name)
        .await
        .map_err(|e| (StatusCode::NOT_FOUND, format!("index not found: {e}")))?;
    match state
        .hnsw_store
        .build_index_with_config(&req.index_name, (*embeddings).clone(), &config)
        .await
    {
        Ok(stats) => Ok(Json(stats)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

#[derive(Deserialize)]
struct HnswSearchRequest {
    index_name: String,
    query: String,
    // Defaults to 5 when omitted (applied in search_hnsw).
    top_k: Option,
}

/// Search using HNSW — approximate nearest neighbors, much faster than brute-force.
async fn search_hnsw(
    State(state): State,
    Json(req): Json,
) -> impl IntoResponse {
    let top_k = req.top_k.unwrap_or(5);
    // Embed query
    let embed_resp = state.ai_client.embed(EmbedRequest {
        texts: vec![req.query.clone()],
        model: None,
    }).await.map_err(|e| (StatusCode::BAD_GATEWAY, format!("embed error: {e}")))?;
    if embed_resp.embeddings.is_empty() {
        return Err((StatusCode::BAD_GATEWAY, "no embedding returned".to_string()));
    }
    // Cast the sidecar's embedding elements down to f32 for the HNSW store
    // (source element type presumably f64 — confirm against the client API).
    let query_vec: Vec = embed_resp.embeddings[0].iter().map(|&x| x as f32).collect();
    // Search HNSW
    match state.hnsw_store.search(&req.index_name, &query_vec, top_k).await {
        Ok(results) => Ok(Json(serde_json::json!({
            "results": results,
            "query": req.query,
            "method": "hnsw",
        }))),
        Err(e) => Err((StatusCode::NOT_FOUND, e)),
    }
}

async fn list_hnsw(State(state): State) -> impl IntoResponse {
    Json(state.hnsw_store.list().await)
}

// --- Trial System: parameterized HNSW tuning loop ---
//
// Flow:
// 1. Agent picks an HnswConfig
// 2. POST /hnsw/trial builds HNSW with that config against cached embeddings,
//    runs every query in the harness, measures latency + recall vs the
//    harness's ground truth, appends a Trial record to _hnsw_trials/{idx}.jsonl
// 3. Agent reads GET /hnsw/trials/{index}, sees history, decides next config
// 4. Repeat until converged.
//
// The first trial triggers embedding load (slow). Every subsequent trial reuses
// the cache — so the agent iterates in seconds, not minutes.

#[derive(Deserialize)]
struct TrialRequest {
    index_name: String,
    harness: String,
    #[serde(default)]
    config: trial::HnswConfig,
    #[serde(default)]
    note: Option,
}

/// Run one tuning trial: build a throwaway HNSW with the requested config,
/// bench every harness query against it, score recall vs ground truth, and
/// append the resulting Trial record to the journal.
async fn run_trial(
    State(state): State,
    Json(req): Json,
) -> Result, (StatusCode, String)> {
    let mut harness_set = state.harness_store.load_for_index(&req.index_name, &req.harness)
        .await
        .map_err(|e| (StatusCode::NOT_FOUND, format!("harness not found: {e}")))?;
    // Defensive cross-check: a harness file records which index it was built for.
    if harness_set.index_name != req.index_name {
        return Err((
            StatusCode::BAD_REQUEST,
            format!(
                "harness '{}' is for index '{}', not '{}'",
                req.harness, harness_set.index_name, req.index_name
            ),
        ));
    }
    if harness_set.queries.is_empty() {
        return Err((StatusCode::BAD_REQUEST, "harness has no queries".into()));
    }
    let embeddings = state
        .embedding_cache
        .get_or_load(&req.index_name)
        .await
        .map_err(|e| (StatusCode::NOT_FOUND, format!("load embeddings: {e}")))?;
    // Lazily compute + persist ground truth the first time a harness is run.
    if !harness_set.ground_truth_built {
        tracing::info!("trial: computing ground truth for harness '{}'", harness_set.name);
        let t0 = std::time::Instant::now();
        harness::compute_ground_truth(&mut harness_set, &embeddings, &state.ai_client)
            .await
            .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("ground truth: {e}")))?;
        tracing::info!("trial: ground truth built in {:.1}s", t0.elapsed().as_secs_f32());
        state.harness_store
            .save(&harness_set)
            .await
            .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("save harness: {e}")))?;
    }
    // Build into a uniquely-named throwaway slot so trials never clobber the
    // live index for this name.
    let trial_id = trial::Trial::new_id();
    let hnsw_slot = format!("{}__{}", req.index_name, trial_id);
    let build_stats = state
        .hnsw_store
        .build_index_with_config(&hnsw_slot, (*embeddings).clone(), &req.config)
        .await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("build: {e}")))?;
    // NOTE(review): queries lacking a query_embedding are dropped here, but the
    // recall loop below zips ALL of harness_set.queries against bench.retrieved
    // — if any embedding is None the pairing shifts by one. Safe only if
    // compute_ground_truth guarantees every query gets an embedding; confirm.
    let query_vectors: Vec> = harness_set
        .queries
        .iter()
        .filter_map(|q| q.query_embedding.clone())
        .collect();
    let bench = state
        .hnsw_store
        .bench_search(&hnsw_slot, &query_vectors, harness_set.k)
        .await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("search: {e}")))?;
    // NOTE(review): the `?` exits above and below skip the hnsw_store.drop at
    // the end of this function, leaking the temporary trial slot on error.
    let mut recalls = Vec::with_capacity(harness_set.queries.len());
    for (q, hits) in harness_set.queries.iter().zip(bench.retrieved.iter()) {
        if let Some(gt) = &q.ground_truth {
            recalls.push(harness::recall_at_k(hits, gt, harness_set.k));
        }
    }
    let mean_recall = if recalls.is_empty() {
        0.0
    } else {
        recalls.iter().sum::() / recalls.len() as f32
    };
    // Nearest-rank percentile over the sorted per-query latencies.
    let mut lats = bench.latencies_us.clone();
    lats.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let p = |pct: f32| -> f32 {
        if lats.is_empty() { return 0.0; }
        let idx = ((lats.len() as f32 - 1.0) * pct).round() as usize;
        lats[idx.min(lats.len() - 1)]
    };
    // One brute-force reference latency — keeps the cost proportional to
    // whatever the agent is willing to pay per trial.
    let brute_latency_us = if let Some(qv) = query_vectors.first() {
        let t0 = std::time::Instant::now();
        let _ = harness::brute_force_top_k(qv, &embeddings, harness_set.k);
        t0.elapsed().as_micros() as f32
    } else {
        0.0
    };
    // Footprint estimate: raw vector payload plus ~128 bytes/node of graph
    // overhead (a heuristic constant, not a measurement).
    let dims = embeddings.first().map(|e| e.vector.len()).unwrap_or(0);
    let memory_bytes = (embeddings.len() * dims * std::mem::size_of::() + embeddings.len() * 128) as u64;
    let trial_record = trial::Trial {
        id: trial_id.clone(),
        index_name: req.index_name.clone(),
        eval_set: req.harness.clone(),
        config: req.config.clone(),
        metrics: trial::TrialMetrics {
            build_time_secs: build_stats.build_time_secs,
            search_latency_p50_us: p(0.50),
            search_latency_p95_us: p(0.95),
            search_latency_p99_us: p(0.99),
            recall_at_k: mean_recall,
            memory_bytes,
            vectors: build_stats.vectors,
            eval_queries: harness_set.queries.len(),
            brute_force_latency_us: brute_latency_us,
        },
        created_at: chrono::Utc::now(),
        note: req.note,
    };
    state
        .trial_journal
        .append(&trial_record)
        .await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("journal: {e}")))?;
    // Trials are ephemeral: free the throwaway HNSW slot immediately.
    state.hnsw_store.drop(&hnsw_slot).await;
    Ok(Json(trial_record))
}

async fn list_trials(
    State(state): State,
    Path(index_name): Path,
) -> impl IntoResponse {
    match state.trial_journal.list(&index_name).await {
        Ok(trials) => Ok(Json(trials)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

#[derive(Deserialize)]
struct BestTrialQuery {
    // Selection strategy; "pareto" unless the caller overrides.
    #[serde(default = "default_metric")]
    metric: String,
}

fn default_metric() -> String { "pareto".to_string() }

async fn best_trial(
    State(state): State,
    Path(index_name): Path,
    Query(q): Query,
) -> impl IntoResponse {
    match state.trial_journal.best(&index_name, &q.metric).await {
        Ok(Some(t)) => Ok(Json(t)),
        Ok(None) => Err((StatusCode::NOT_FOUND, "no trials yet".to_string())),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

// --- Harness management ---

async fn list_evals(State(state): State) -> impl IntoResponse {
    Json(state.harness_store.list_all().await)
}

async fn get_eval(
    State(state): State,
    Path(name): Path,
) -> impl IntoResponse {
    match state.harness_store.get_any(&name).await {
        Ok(e) => Ok(Json(e)),
        Err(err) => Err((StatusCode::NOT_FOUND, err)),
    }
}

/// Upsert a harness under the path name. The path segment wins over any
/// name embedded in the payload.
async fn put_eval(
    State(state): State,
    Path(name): Path,
    Json(mut harness_set): Json,
) -> impl IntoResponse {
    harness_set.name = name;
    // Recompute the flag from the payload rather than trusting the caller.
    harness_set.ground_truth_built = harness_set
        .queries
        .iter()
        .all(|q| q.ground_truth.is_some());
    match state.harness_store.save(&harness_set).await {
        Ok(()) => Ok(Json(harness_set)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

#[derive(Deserialize)]
struct AutogenRequest {
    index_name: String,
    #[serde(default = "default_sample_count")]
    sample_count: usize,
    #[serde(default = "default_k")]
    k: usize,
}

fn default_sample_count() -> usize { 100 }
fn default_k() -> usize { 10 }

/// Synthesize a harness from the index's own chunks, compute its ground
/// truth, and persist it.
async fn autogen_eval(
    State(state): State,
    Path(name): Path,
    Json(req): Json,
) -> Result, (StatusCode, String)> {
    let embeddings = state
        .embedding_cache
        .get_or_load(&req.index_name)
        .await
        .map_err(|e| (StatusCode::NOT_FOUND, format!("load embeddings: {e}")))?;
    let mut harness_set = harness::synthetic_from_chunks(
        &name,
        &req.index_name,
        &embeddings,
        req.sample_count,
        req.k,
    );
    harness::compute_ground_truth(&mut harness_set, &embeddings, &state.ai_client)
        .await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("ground truth: {e}")))?;
    state.harness_store
        .save(&harness_set)
        .await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("save: {e}")))?;
    Ok(Json(harness_set))
}

// --- Embedding cache management ---

async fn cache_stats(State(state): State) -> impl IntoResponse {
    Json(state.embedding_cache.stats().await)
}

async fn cache_evict(
    State(state): State,
    Path(index_name): Path,
) -> impl IntoResponse {
    let ok = state.embedding_cache.evict(&index_name).await;
    Json(serde_json::json!({ "evicted": ok, "index_name": index_name }))
}

// --- Phase C: embedding refresh ---
//
// Decouples "new row data arrived" from "re-embed everything." Ingest marks
// a dataset's embeddings stale (see catalogd::registry::mark_embeddings_stale);
// `/vectors/refresh/{dataset}` diffs existing embeddings against current
// rows, embeds only the new ones, appends to the index, and clears the
// stale flag.
// NOTE(review): generic parameter lists appear stripped in this extract
// (e.g. `State<VectorState>` shows as `State`); restore against the
// original file before compiling.
/// HTTP shim for the Phase C refresh path — all real work lives in
/// refresh::refresh_index; this handler only translates its Result.
async fn refresh_dataset(
    State(state): State,
    Path(dataset_name): Path,
    Json(req): Json,
) -> Result, (StatusCode, String)> {
    tracing::info!(
        "refresh requested for dataset '{}' -> index '{}'",
        dataset_name, req.index_name,
    );
    match refresh::refresh_index(
        &dataset_name,
        &req,
        &state.store,
        &state.catalog,
        &state.ai_client,
        &state.embedding_cache,
        &state.index_registry,
    )
    .await
    {
        Ok(result) => Ok(Json(result)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

/// One row of the stale-datasets listing: a dataset whose embeddings lag
/// its ingested rows.
#[derive(Serialize)]
struct StaleEntry {
    dataset_name: String,
    // RFC3339; None when the dataset has never been embedded.
    last_embedded_at: Option,
    // RFC3339; empty string when the catalog has no stale timestamp.
    stale_since: String,
    refresh_policy: Option,
}

async fn list_stale(State(state): State) -> impl IntoResponse {
    let datasets = state.catalog.stale_datasets().await;
    let entries: Vec = datasets
        .into_iter()
        .map(|d| StaleEntry {
            dataset_name: d.name,
            last_embedded_at: d.last_embedded_at.map(|t| t.to_rfc3339()),
            stale_since: d
                .embedding_stale_since
                .map(|t| t.to_rfc3339())
                .unwrap_or_default(),
            refresh_policy: d.embedding_refresh_policy,
        })
        .collect();
    Json(entries)
}

// --- Phase 17: Model profile activation + scoped search ---

#[derive(Serialize)]
struct ActivateReport {
    profile_id: String,
    ollama_name: String,
    indexes_warmed: Vec,
    failures: Vec,
    total_vectors: usize,
    duration_secs: f32,
    /// Phase C: did we successfully preload the Ollama model?
    model_preloaded: bool,
    /// Phase C: which profile previously held the GPU slot, if any.
    /// Useful for observability of the swap.
    previous_profile: Option,
}

#[derive(Serialize)]
struct WarmedIndex {
    index_name: String,
    source: String,
    vectors: usize,
    // 0.0 on the Lance warm path (no HNSW build there).
    hnsw_build_secs: f32,
}

/// Warm this profile's indexes. For every bound dataset, find the
/// matching vector index (any index whose `source` equals the dataset
/// or view name), load its embeddings into EmbeddingCache, build HNSW
/// with the profile's config. Next `/profile/{id}/search` call is then
/// <1ms cold.
///
/// Failures on individual indexes don't stop the activation — they get
/// reported in the response. This matches the "substrate keeps working"
/// philosophy from ADR-017: one bad binding shouldn't take down the
/// whole profile.
async fn activate_profile(
    State(state): State,
    Path(profile_id): Path,
) -> impl IntoResponse {
    let t0 = std::time::Instant::now();
    let profile = match state.catalog.get_profile(&profile_id).await {
        Some(p) => p,
        None => return Err((StatusCode::NOT_FOUND, format!("profile not found: {profile_id}"))),
    };
    let mut warmed = Vec::new();
    let mut failures = Vec::new();
    let mut total_vectors = 0usize;
    // Phase 17 / C: VRAM-aware swap. If another profile currently holds
    // the GPU and uses a DIFFERENT Ollama model than the one being
    // activated, unload it first (keep_alive=0). Same-model activations
    // skip the unload — no point churning a model that's already loaded.
    let previous_slot = {
        let guard = state.active_profile.read().await;
        guard.clone()
    };
    if let Some(prev) = &previous_slot {
        if prev.ollama_name != profile.ollama_name {
            match state.ai_client.unload_model(&prev.ollama_name).await {
                Ok(_) => tracing::info!(
                    "profile swap: unloaded '{}' ({} -> {})",
                    prev.ollama_name, prev.profile_id, profile.id,
                ),
                // Unload failure is reported but does not abort activation.
                Err(e) => failures.push(format!(
                    "unload previous model '{}': {e}",
                    prev.ollama_name,
                )),
            }
        }
    }
    // Federation layer 2: if this profile declares its own bucket and
    // that bucket isn't registered yet, auto-provision it under the
    // configured profile_root. This is the moment a "dormant" profile
    // becomes live — its bucket exists and is readable/writable.
    if let Some(bucket_name) = profile.bucket.clone() {
        if !state.bucket_registry.contains(&bucket_name) {
            // Colons are legal in bucket names but not in directory names.
            let root = format!(
                "{}/{}",
                state.bucket_registry.profile_root().trim_end_matches('/'),
                bucket_name.replace(':', "_"),
            );
            let bc = shared::config::BucketConfig {
                name: bucket_name.clone(),
                backend: "local".to_string(),
                root: Some(root.clone()),
                bucket: None,
                region: None,
                endpoint: None,
                secret_ref: None,
            };
            match state.bucket_registry.add_bucket(bc).await {
                Ok(info) => {
                    tracing::info!(
                        "profile '{}' activated bucket '{}' (root={}, reachable={})",
                        profile.id, bucket_name, root, info.reachable,
                    );
                }
                Err(e) => {
                    failures.push(format!(
                        "auto-provision bucket '{}': {}",
                        bucket_name, e,
                    ));
                }
            }
        }
    }
    let all_indexes = state.index_registry.list(None, None).await;
    let use_lance = profile.vector_backend == shared::types::VectorBackend::Lance;
    for binding in &profile.bound_datasets {
        // A binding may match multiple indexes (several indexes can share a source).
        let matched: Vec<_> = all_indexes
            .iter()
            .filter(|m| &m.source == binding)
            .collect();
        if matched.is_empty() {
            failures.push(format!(
                "no vector index found for binding '{}'",
                binding,
            ));
            continue;
        }
        for meta in matched {
            if use_lance {
                // --- Lance activation path ---
                // Ensure a Lance dataset exists for this index. If it
                // doesn't, auto-migrate from the Parquet blob. Then
                // ensure an IVF_PQ index is built.
                let bucket = meta.bucket.clone();
                let lance_store = match state.lance.store_for_new(&meta.index_name, &bucket).await {
                    Ok(s) => s,
                    Err(e) => {
                        failures.push(format!("{}: lance store init: {e}", meta.index_name));
                        continue;
                    }
                };
                // count() failure is treated as empty (0) → triggers migration.
                let count = lance_store.count().await.unwrap_or(0);
                if count == 0 {
                    // Auto-migrate from existing Parquet.
                    let pq_store = match state.bucket_registry.get(&bucket) {
                        Ok(s) => s,
                        Err(e) => {
                            failures.push(format!("{}: bucket: {e}", meta.index_name));
                            continue;
                        }
                    };
                    match storaged::ops::get(&pq_store, &meta.storage_key).await {
                        Ok(bytes) => {
                            let build_t = std::time::Instant::now();
                            match lance_store.migrate_from_parquet_bytes(&bytes).await {
                                Ok(ms) => {
                                    total_vectors += ms.rows_written;
                                    tracing::info!(
                                        "lance auto-migrate '{}': {} rows in {:.2}s",
                                        meta.index_name, ms.rows_written, ms.duration_secs,
                                    );
                                    warmed.push(WarmedIndex {
                                        index_name: meta.index_name.clone(),
                                        source: meta.source.clone(),
                                        vectors: ms.rows_written,
                                        hnsw_build_secs: build_t.elapsed().as_secs_f32(),
                                    });
                                }
                                Err(e) => failures.push(format!("{}: lance migrate: {e}", meta.index_name)),
                            }
                        }
                        Err(e) => failures.push(format!("{}: read parquet: {e}", meta.index_name)),
                    }
                } else {
                    // Dataset already populated — report it warm as-is.
                    total_vectors += count;
                    warmed.push(WarmedIndex {
                        index_name: meta.index_name.clone(),
                        source: meta.source.clone(),
                        vectors: count,
                        hnsw_build_secs: 0.0,
                    });
                }
                // Ensure IVF_PQ vector index exists.
                // NOTE(review): hard-coded (316, 8, 48) duplicates the
                // default_partitions/default_bits/default_subvectors values
                // defined for the Lance HTTP surface — keep in sync.
                if !lance_store.has_vector_index().await.unwrap_or(false) {
                    match lance_store.build_index(316, 8, 48).await {
                        Ok(ix) => tracing::info!(
                            "lance auto-index '{}': IVF_PQ built in {:.1}s",
                            meta.index_name, ix.build_time_secs,
                        ),
                        Err(e) => failures.push(format!("{}: lance IVF_PQ build: {e}", meta.index_name)),
                    }
                }
                // Ensure scalar btree on doc_id for O(log N) random fetch.
                if !lance_store.has_scalar_index("doc_id").await.unwrap_or(false) {
                    match lance_store.build_scalar_index("doc_id").await {
                        Ok(ix) => tracing::info!(
                            "lance auto-index '{}': doc_id btree built in {:.2}s",
                            meta.index_name, ix.build_time_secs,
                        ),
                        Err(e) => failures.push(format!("{}: lance doc_id btree: {e}", meta.index_name)),
                    }
                }
            } else {
                // --- Parquet + HNSW activation path (existing) ---
                let embeddings = match state.embedding_cache.get_or_load(&meta.index_name).await {
                    Ok(arc) => arc,
                    Err(e) => {
                        failures.push(format!("{}: load failed: {}", meta.index_name, e));
                        continue;
                    }
                };
                total_vectors += embeddings.len();
                let profile_default = trial::HnswConfig {
                    ef_construction: profile.hnsw_config.ef_construction,
                    ef_search: profile.hnsw_config.ef_search,
                    seed: profile.hnsw_config.seed,
                };
                // Phase 16: a promoted config for this index wins over the
                // profile's own default.
                let cfg = state
                    .promotion_registry
                    .config_or(&meta.index_name, profile_default)
                    .await;
                let build_t = std::time::Instant::now();
                match state
                    .hnsw_store
                    .build_index_with_config(&meta.index_name, (*embeddings).clone(), &cfg)
                    .await
                {
                    Ok(_) => {
                        warmed.push(WarmedIndex {
                            index_name: meta.index_name.clone(),
                            source: meta.source.clone(),
                            vectors: embeddings.len(),
                            hnsw_build_secs: build_t.elapsed().as_secs_f32(),
                        });
                    }
                    Err(e) => {
                        failures.push(format!("{}: HNSW build failed: {}", meta.index_name, e));
                    }
                }
            }
        }
    }
    // Preload the new profile's Ollama model proactively. Same-model
    // re-activations are cheap (Ollama no-ops if already loaded).
    let mut model_preloaded = false;
    match state.ai_client.preload_model(&profile.ollama_name).await {
        Ok(_) => {
            model_preloaded = true;
            tracing::info!("profile '{}' preloaded ollama model '{}'", profile.id, profile.ollama_name);
        }
        Err(e) => failures.push(format!(
            "preload ollama model '{}': {e}",
            profile.ollama_name,
        )),
    }
    // Take the GPU slot.
    {
        let mut guard = state.active_profile.write().await;
        *guard = Some(ActiveProfileSlot {
            profile_id: profile.id.clone(),
            ollama_name: profile.ollama_name.clone(),
            activated_at: chrono::Utc::now(),
        });
    }
    Ok(Json(ActivateReport {
        profile_id: profile.id,
        ollama_name: profile.ollama_name,
        indexes_warmed: warmed,
        failures,
        total_vectors,
        duration_secs: t0.elapsed().as_secs_f32(),
        model_preloaded,
        previous_profile: previous_slot.map(|s| s.profile_id),
    }))
}

/// Unload this profile's model and clear the active slot. No-op if the
/// caller isn't the currently-active profile.
async fn deactivate_profile(
    State(state): State,
    Path(profile_id): Path,
) -> impl IntoResponse {
    let profile = match state.catalog.get_profile(&profile_id).await {
        Some(p) => p,
        None => return Err((StatusCode::NOT_FOUND, format!("profile not found: {profile_id}"))),
    };
    // Atomically clear the slot only if THIS profile holds it.
    let was_active = {
        let mut guard = state.active_profile.write().await;
        match guard.as_ref() {
            Some(s) if s.profile_id == profile_id => {
                let prev = s.clone();
                *guard = None;
                Some(prev)
            }
            _ => None,
        }
    };
    // Regardless of whether it held the slot, we can still try to unload —
    // the operator's intent is "get this model out of VRAM."
    let unload_result = state.ai_client.unload_model(&profile.ollama_name).await;
    Ok(Json(serde_json::json!({
        "profile_id": profile.id,
        "ollama_name": profile.ollama_name,
        "was_active": was_active.is_some(),
        "unloaded": unload_result.is_ok(),
        "unload_error": unload_result.err(),
    })))
}

async fn get_active_profile(State(state): State) -> impl IntoResponse {
    let slot = state.active_profile.read().await.clone();
    Json(slot)
}

#[derive(Deserialize)]
struct ProfileSearchRequest {
    index_name: String,
    query: String,
    // Defaults to 5 when omitted (applied in profile_scoped_search).
    top_k: Option,
}

/// Search scoped to a profile — refuses if the requested index's source
/// isn't in the profile's bound_datasets. Reuses the existing HNSW
/// search path when the index is warm; falls back to brute-force cosine
/// if it's not (handled by the existing search code path).
// NOTE(review): generic parameter lists appear stripped in this extract
// (e.g. `State<VectorState>` shows as `State`, `Option<String>` as
// `Option`); restore against the original file before compiling.
async fn profile_scoped_search(
    State(state): State,
    Path(profile_id): Path,
    Json(req): Json,
) -> impl IntoResponse {
    let profile = match state.catalog.get_profile(&profile_id).await {
        Some(p) => p,
        None => return Err((StatusCode::NOT_FOUND, format!("profile not found: {profile_id}"))),
    };
    // Verify the index is in scope for this profile.
    let index_meta = match state.index_registry.get(&req.index_name).await {
        Some(m) => m,
        None => return Err((StatusCode::NOT_FOUND, format!("index not found: {}", req.index_name))),
    };
    if !profile.bound_datasets.contains(&index_meta.source) {
        return Err((
            StatusCode::FORBIDDEN,
            format!(
                "profile '{}' is not bound to '{}' — allowed bindings: {:?}",
                profile.id, index_meta.source, profile.bound_datasets,
            ),
        ));
    }
    let top_k = req.top_k.unwrap_or(5);
    let use_lance = profile.vector_backend == shared::types::VectorBackend::Lance;
    // Embed the query.
    let embed_resp = state
        .ai_client
        .embed(EmbedRequest { texts: vec![req.query.clone()], model: None })
        .await
        .map_err(|e| (StatusCode::BAD_GATEWAY, format!("embed: {e}")))?;
    if embed_resp.embeddings.is_empty() {
        return Err((StatusCode::BAD_GATEWAY, "no embedding returned".into()));
    }
    let query_vec: Vec = embed_resp.embeddings[0].iter().map(|&x| x as f32).collect();
    // ADR-019 hybrid: route to Lance or Parquet+HNSW based on the
    // profile's declared backend. Callers don't need to know which
    // storage tier they're hitting — the profile abstracts it.
    if use_lance {
        let lance_store = state.lance.store_for(&req.index_name).await
            .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
        let t0 = std::time::Instant::now();
        match lance_store.search(
            &query_vec,
            top_k,
            Some(LANCE_DEFAULT_NPROBES),
            Some(LANCE_DEFAULT_REFINE_FACTOR),
        ).await {
            Ok(hits) => Ok(Json(serde_json::json!({
                "profile": profile.id,
                "source": index_meta.source,
                "method": "lance_ivf_pq",
                "latency_us": t0.elapsed().as_micros() as u64,
                "results": hits,
            }))),
            Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
        }
    } else if state.hnsw_store.has_index(&req.index_name).await {
        // Warm path: index was built during activation.
        match state.hnsw_store.search(&req.index_name, &query_vec, top_k).await {
            Ok(hits) => Ok(Json(serde_json::json!({
                "profile": profile.id,
                "source": index_meta.source,
                "method": "hnsw",
                "results": hits,
            }))),
            Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
        }
    } else {
        // Cold path: no HNSW yet — brute-force over cached embeddings.
        let embeddings = state
            .embedding_cache
            .get_or_load(&req.index_name)
            .await
            .map_err(|e| (StatusCode::NOT_FOUND, format!("embeddings: {e}")))?;
        let results = search::search(&query_vec, &embeddings, top_k);
        Ok(Json(serde_json::json!({
            "profile": profile.id,
            "source": index_meta.source,
            "method": "brute_force",
            "results": results,
        })))
    }
}

// --- Phase 16: Promotion + autotune ---

#[derive(Deserialize)]
struct PromoteQuery {
    #[serde(default)]
    promoted_by: String,
    #[serde(default)]
    note: Option,
}

async fn promote_trial(
    State(state): State,
    Path((index_name, trial_id)): Path<(String, String)>,
    Query(q): Query,
) -> impl IntoResponse {
    // Pull the trial from the journal to get its config.
    let trials = state
        .trial_journal
        .list(&index_name)
        .await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?;
    let trial = trials
        .iter()
        .find(|t| t.id == trial_id)
        .ok_or_else(|| (StatusCode::NOT_FOUND, format!("trial not found: {trial_id}")))?;
    let entry = promotion::PromotionEntry {
        config: trial.config.clone(),
        trial_id: trial.id.clone(),
        promoted_at: chrono::Utc::now(),
        promoted_by: q.promoted_by,
        note: q.note,
    };
    match state.promotion_registry.promote(&index_name, entry).await {
        Ok(file) => Ok(Json(file)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

async fn rollback_promotion(
    State(state): State,
    Path(index_name): Path,
) -> impl IntoResponse {
    match state.promotion_registry.rollback(&index_name).await {
        Ok(file) => Ok(Json(file)),
        Err(e) => Err((StatusCode::NOT_FOUND, e)),
    }
}

async fn get_promoted(
    State(state): State,
    Path(index_name): Path,
) -> impl IntoResponse {
    match state.promotion_registry.load(&index_name).await {
        Ok(file) => Ok(Json(file)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

/// Synchronous (in-request) autotune run; the background agent drives the
/// same autotune::run_autotune with its own triggers.
async fn run_autotune_endpoint(
    State(state): State,
    Json(req): Json,
) -> impl IntoResponse {
    match autotune::run_autotune(
        req,
        &state.store,
        &state.catalog,
        &state.ai_client,
        &state.embedding_cache,
        &state.hnsw_store,
        &state.index_registry,
        &state.trial_journal,
        &state.promotion_registry,
        &state.harness_store,
        &state.job_tracker,
    ).await {
        Ok(result) => Ok(Json(result)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

// --- Phase 16.2: autotune agent endpoints ---

async fn agent_status(State(state): State) -> impl IntoResponse {
    Json(state.agent_handle.status().await)
}

async fn agent_stop(State(state): State) -> impl IntoResponse {
    let stopped = state.agent_handle.stop().await;
    Json(serde_json::json!({ "stopped": stopped }))
}

async fn agent_enqueue(
    State(state): State,
    Path(index_name): Path,
) -> impl IntoResponse {
    let event = agent::TriggerEvent::manual(index_name);
    match state.agent_handle.enqueue(event).await {
        Ok(()) => Ok(Json(serde_json::json!({ "enqueued": true }))),
        Err(e) => Err((StatusCode::SERVICE_UNAVAILABLE, e)),
    }
}

// --- ADR-019: Lance hybrid backend HTTP surface ---
//
// Lance routes operate on the same `index_name` as the Parquet/HNSW path,
// but materialize the data as a Lance dataset on disk under
// `{bucket_root}/lance/{index_name}/`. The two backends are independent:
// you can have an index in both formats simultaneously. `IndexMeta.vector_backend`
// records which one is canonical for that index.

#[derive(Deserialize)]
struct LanceMigrateRequest {
    /// Optional bucket override. Defaults to whatever the existing
    /// IndexMeta says, or "primary" for indexes that don't exist yet.
    #[serde(default)]
    bucket: Option,
}

/// Read the existing Parquet vector file for `index_name` from object
/// storage, hand the bytes to vectord-lance, return migration stats.
/// The original Parquet file is left intact — both backends coexist
/// after migration.
async fn lance_migrate(
    State(state): State,
    Path(index_name): Path,
    Json(req): Json,
) -> impl IntoResponse {
    let meta = state.index_registry.get(&index_name).await
        .ok_or((StatusCode::NOT_FOUND, format!("index not found: {index_name}")))?;
    let bucket = req.bucket.unwrap_or(meta.bucket.clone());
    // Pull the Parquet bytes via storaged::ops — same path as the
    // existing embedding loader uses.
    let store = state.bucket_registry.get(&bucket)
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    let bytes = storaged::ops::get(&store, &meta.storage_key).await
        .map_err(|e| (StatusCode::NOT_FOUND, format!("read parquet: {e}")))?;
    let lance_store = state.lance.store_for_new(&index_name, &bucket).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    let stats = lance_store.migrate_from_parquet_bytes(&bytes).await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?;
    tracing::info!(
        "lance migrate '{}': {} rows, {}d, {} bytes on disk, {:.2}s",
        index_name, stats.rows_written, stats.dimensions, stats.disk_bytes, stats.duration_secs,
    );
    Ok::<_, (StatusCode, String)>(Json(serde_json::json!({
        "index_name": index_name,
        "bucket": bucket,
        "lance_path": lance_store.path(),
        "stats": stats,
    })))
}

#[derive(Deserialize)]
struct LanceIndexRequest {
    #[serde(default = "default_partitions")]
    num_partitions: u32,
    #[serde(default = "default_bits")]
    num_bits: u32,
    #[serde(default = "default_subvectors")]
    num_sub_vectors: u32,
}

fn default_partitions() -> u32 { 316 } // ≈√100K — sane for the reference dataset
fn default_bits() -> u32 { 8 }
fn default_subvectors() -> u32 { 48 } // 768/48 = 16 dims per subvector

/// Build the IVF_PQ index on the Lance dataset.
async fn lance_build_index(
    State(state): State,
    Path(index_name): Path,
    Json(req): Json,
) -> impl IntoResponse {
    let lance_store = state.lance.store_for(&index_name).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    match lance_store.build_index(req.num_partitions, req.num_bits, req.num_sub_vectors).await {
        Ok(stats) => Ok(Json(stats)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

#[derive(Deserialize)]
struct LanceSearchRequest {
    /// Plain text query — embedded server-side for symmetry with the
    /// existing /vectors/search path.
    query: String,
    #[serde(default = "default_top_k")]
    top_k: usize,
    /// IVF partitions to probe. `None` uses Lance's built-in default of
    /// 1, which caps recall well below the index's real capability.
    /// Recommended: 5–10% of num_partitions (≈20 for a 316-partition
    /// index). Omitting it here picks the server-side default.
    #[serde(default)]
    nprobes: Option,
    /// Refine factor — re-rank `top_k * factor` PQ-approximate candidates
    /// with exact distances before returning `top_k`. Recovers recall
    /// lost to product quantization.
    #[serde(default)]
    refine_factor: Option,
}

/// Server-side defaults when the caller doesn't pin nprobes / refine
/// themselves. Tuned for the ~100K × 768d reference workload; see
/// docs/ADR-019-vector-storage.md for the recall / latency trade-off.
const LANCE_DEFAULT_NPROBES: usize = 20;
const LANCE_DEFAULT_REFINE_FACTOR: u32 = 5;

fn default_top_k() -> usize { 5 }

/// Vector search against a Lance dataset. Embeds the query text via the
/// sidecar then calls Lance's nearest-neighbor scanner.
async fn lance_search(
    State(state): State,
    Path(index_name): Path,
    Json(req): Json,
) -> impl IntoResponse {
    let embed_resp = state.ai_client
        .embed(EmbedRequest { texts: vec![req.query.clone()], model: None })
        .await
        .map_err(|e| (StatusCode::BAD_GATEWAY, format!("embed: {e}")))?;
    if embed_resp.embeddings.is_empty() {
        return Err((StatusCode::BAD_GATEWAY, "no embedding returned".into()));
    }
    let qv: Vec = embed_resp.embeddings[0].iter().map(|&x| x as f32).collect();
    let lance_store = state.lance.store_for(&index_name).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    let t0 = std::time::Instant::now();
    // Caller-pinned values win; otherwise fall back to server defaults.
    let nprobes = req.nprobes.or(Some(LANCE_DEFAULT_NPROBES));
    let refine = req.refine_factor.or(Some(LANCE_DEFAULT_REFINE_FACTOR));
    let hits = lance_store.search(&qv, req.top_k, nprobes, refine).await
        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?;
    Ok(Json(serde_json::json!({
        "index_name": index_name,
        "query": req.query,
        "method": "lance_ivf_pq",
        "latency_us": t0.elapsed().as_micros() as u64,
        "results": hits,
    })))
}

/// Random-access fetch by doc_id — the O(1) lookup that's basically
/// impossible in our Parquet path without scanning the whole file.
async fn lance_get_doc(
    State(state): State,
    Path((index_name, doc_id)): Path<(String, String)>,
) -> impl IntoResponse {
    let lance_store = state.lance.store_for(&index_name).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    let t0 = std::time::Instant::now();
    match lance_store.get_by_doc_id(&doc_id).await {
        Ok(Some(row)) => Ok(Json(serde_json::json!({
            "index_name": index_name,
            "doc_id": doc_id,
            "latency_us": t0.elapsed().as_micros() as u64,
            "row": row,
        }))),
        Ok(None) => Err((StatusCode::NOT_FOUND, format!("doc_id not found: {doc_id}"))),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

#[derive(Deserialize)]
struct LanceAppendRequest {
    /// Optional source tag — set on every appended row.
    #[serde(default)]
    source: Option,
    rows: Vec,
}

#[derive(Deserialize)]
struct LanceAppendRow {
    doc_id: String,
    #[serde(default)]
    chunk_idx: Option,
    chunk_text: String,
    /// Pre-computed embedding. Caller is responsible for ensuring it
    /// matches the dataset's dimensions and embedding model.
    vector: Vec,
}

async fn lance_append(
    State(state): State,
    Path(index_name): Path,
    Json(req): Json,
) -> impl IntoResponse {
    if req.rows.is_empty() {
        return Err((StatusCode::BAD_REQUEST, "rows array is empty".into()));
    }
    let lance_store = state.lance.store_for(&index_name).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    // Columnar transpose: the Lance append API takes parallel column vectors.
    let mut doc_ids = Vec::with_capacity(req.rows.len());
    let mut chunk_idxs = Vec::with_capacity(req.rows.len());
    let mut chunk_texts = Vec::with_capacity(req.rows.len());
    let mut vectors = Vec::with_capacity(req.rows.len());
    for r in req.rows {
        doc_ids.push(r.doc_id);
        // Missing chunk_idx defaults to 0 (single-chunk documents).
        chunk_idxs.push(r.chunk_idx.unwrap_or(0));
        chunk_texts.push(r.chunk_text);
        vectors.push(r.vector);
    }
    match lance_store.append(req.source, doc_ids, chunk_idxs, chunk_texts, vectors).await {
        Ok(stats) => Ok(Json(stats)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

async fn lance_stats(
    State(state): State,
    Path(index_name): Path,
) -> impl IntoResponse {
    let lance_store = state.lance.store_for(&index_name).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    match lance_store.stats().await {
        Ok(s) => Ok(Json(s)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

/// Run an existing harness against Lance IVF_PQ and measure recall@k.
/// Uses the same ground truth computed by brute-force cosine (the HNSW
/// eval path). This closes ADR-019's explicit gap: "IVF_PQ recall not
/// measured."
#[derive(Deserialize)]
struct LanceRecallRequest {
    harness: String,
    #[serde(default = "default_top_k")]
    top_k: usize,
    /// Override server defaults so operators can sweep nprobes /
    /// refine_factor to chart the recall-vs-latency curve.
    #[serde(default)]
    nprobes: Option,
    #[serde(default)]
    refine_factor: Option,
}

#[derive(serde::Serialize)]
struct LanceRecallResult {
    index_name: String,
    harness: String,
    queries: usize,
    top_k: usize,
    mean_recall: f32,
    per_query: Vec,
    latency_p50_us: f32,
    latency_p95_us: f32,
    total_duration_secs: f32,
}

#[derive(serde::Serialize)]
struct LanceRecallQuery {
    query_id: String,
    recall: f32,
    latency_us: f32,
    hits_returned: usize,
}

// --- Phase 19: playbook memory endpoints ---

// NOTE(review): the first doc block below describes parse_worker_chunk
// (defined further down), not extract_target_role — it looks displaced;
// confirm against the original file before moving it.
/// Extract (name, city, state) from a chunk formatted like
/// "{Name} — {Role} in {City}, {State}. Skills: …".
/// Returns None if the chunk doesn't match the shape; callers simply
/// skip the boost for that hit.
/// Extract role from an SQL filter matching `role = 'Welder'` style.
/// Case-insensitive on the column name. Quoted value; quotes not
/// included in returned string.
fn extract_target_role(sql_filter: &str) -> Option {
    grab_eq_value(sql_filter, "role")
}

/// Shared equality-value extractor for (city, state, role) lookups.
fn grab_eq_value(src: &str, col: &str) -> Option {
    // Returns Some(value-with-original-casing) for the first
    // `col = 'value'` occurrence; None otherwise. The column match is
    // case-insensitive: scanning happens on a lowercased copy, and the
    // byte offsets remain valid in `src` because to_ascii_lowercase
    // preserves string length.
    let lower = src.to_ascii_lowercase();
    let col_lower = col.to_ascii_lowercase();
    let mut search_from = 0usize;
    while let Some(off) = lower[search_from..].find(&col_lower) {
        let pos = search_from + off;
        // Left word boundary: start-of-string, or the preceding byte is
        // neither alphanumeric nor '_' (so "velocity" can't match "city").
        let prior_ok = pos == 0 || !lower.as_bytes()[pos - 1].is_ascii_alphanumeric() && lower.as_bytes()[pos - 1] != b'_';
        let after = pos + col_lower.len();
        if !prior_ok || after >= src.len() {
            search_from = pos + col_lower.len();
            continue;
        }
        // Expect: optional spaces, '=', optional spaces, then a
        // single-quoted value. Any mismatch resumes the scan past this
        // occurrence. The '=' requirement doubles as the right word
        // boundary ("roles = …" fails here).
        let mut i = after;
        while i < src.len() && src.as_bytes()[i] == b' ' { i += 1; }
        if i >= src.len() || src.as_bytes()[i] != b'=' {
            search_from = pos + col_lower.len();
            continue;
        }
        i += 1;
        while i < src.len() && src.as_bytes()[i] == b' ' { i += 1; }
        if i >= src.len() || src.as_bytes()[i] != b'\'' {
            search_from = pos + col_lower.len();
            continue;
        }
        i += 1;
        let start = i;
        while i < src.len() && src.as_bytes()[i] != b'\'' { i += 1; }
        // Non-empty value → return it, sliced from `src` (original
        // casing). Empty ('') or unterminated values are skipped and the
        // scan continues.
        if i > start {
            return Some(src[start..i].to_string());
        }
        search_from = pos + col_lower.len();
    }
    None
}

/// Pull (city, state) out of a SQL filter that uses
/// `city = 'Toledo' AND state = 'OH'` style equality. Returns None if
/// either is missing — the caller keeps the original global boost map
/// behavior (no geo narrowing). Case-insensitive on the column name
/// so `CITY=` or `City =` also work.
fn extract_target_geo(sql_filter: &str) -> Option<(String, String)> {
    let city = grab_eq_value(sql_filter, "city")?;
    let state = grab_eq_value(sql_filter, "state")?;
    Some((city, state))
}

/// Parse one worker chunk back into (name, city, state); None when the
/// chunk doesn't match the expected shape.
fn parse_worker_chunk(chunk: &str) -> Option<(String, String, String)> {
    // "Name — Role in City, ST. …" → split on "—" then " in " then ","
    let (name_part, rest) = chunk.split_once('—')?;
    let rest = rest.trim();
    let (_role, loc_part) = rest.split_once(" in ")?;
    let loc_part = loc_part.trim();
    let (city, state_plus) = loc_part.split_once(',')?;
    // State code = the leading ASCII letters after the comma
    // ("OH. Skills: …" → "OH").
    let state: String = state_plus.trim()
        .chars()
        .take_while(|c| c.is_ascii_alphabetic())
        .collect();
    let name = name_part.trim().to_string();
    let city = city.trim().to_string();
    // All three parts are required.
    if name.is_empty() || city.is_empty() || state.is_empty() {
        return None;
    }
    Some((name, city, state))
}

#[derive(Deserialize)]
struct SeedPlaybookRequest {
    /// One playbook with {operation, approach, context, endorsed_names}.
    /// City + state are parsed from the operation text.
    operation: String,
    #[serde(default)]
    approach: String,
    #[serde(default)]
    context: String,
    endorsed_names: Vec,
    /// Append to the existing memory rather than replacing. Default true —
    /// seeding is a bootstrap/demo tool, not a rebuild substitute.
    #[serde(default = "default_true")]
    append: bool,
    /// Phase 25 — optional schema_fingerprint captured at seed time.
    /// When the underlying dataset's schema changes, any entry whose
    /// fingerprint doesn't match the new one is auto-retired via
    /// retire_on_schema_drift. Caller-provided so the producer (the
    /// scenario driver, the orchestrator) can pass the live fingerprint
    /// without the gateway needing a second catalogd round trip.
    #[serde(default)]
    schema_fingerprint: Option,
    /// Phase 25 — optional hard expiry. RFC3339 timestamp. After this
    /// moment the entry is skipped during boost computation (not
    /// retired, just inactive). Useful for seasonal/temp contracts.
    #[serde(default)]
    valid_until: Option,
}

/// Bootstrap / test-only: inject a playbook entry directly into
/// `playbook_memory` without going through `successful_playbooks`. Useful
/// when the source dataset has stale or phantom entries (as the initial
/// staffing seed did — names that don't correspond to real workers), and
/// you want to demonstrate the feedback loop with a known-good fixture.
///
/// Production path is always `/rebuild` — this endpoint is for operators
/// who need to prime the memory before real playbooks accumulate.
async fn seed_playbook_memory(
    State(state): State,
    Json(req): Json,
) -> impl IntoResponse {
    // Embed the entry through the same text shape `rebuild` uses so
    // similarity math is comparable across seed + real entries.
    // This throwaway entry only feeds the format! below; it is never
    // persisted.
    let tmp_entry = playbook_memory::PlaybookEntry {
        playbook_id: String::new(),
        operation: req.operation.clone(),
        approach: req.approach.clone(),
        context: req.context.clone(),
        timestamp: chrono::Utc::now().to_rfc3339(),
        endorsed_names: req.endorsed_names.clone(),
        city: None,
        state: None,
        embedding: None,
        schema_fingerprint: None,
        valid_until: None,
        retired_at: None,
        retirement_reason: None,
        version: 1,
        parent_id: None,
        superseded_at: None,
        superseded_by: None,
    };
    let text = format!(
        "{} | {} | {} | fills: {}",
        tmp_entry.operation, tmp_entry.approach, tmp_entry.context, tmp_entry.endorsed_names.join(", "),
    );
    let resp = match state.ai_client.embed(EmbedRequest { texts: vec![text], model: None }).await {
        Ok(r) => r,
        Err(e) => return Err((StatusCode::BAD_GATEWAY, format!("embed seed: {e}"))),
    };
    if resp.embeddings.is_empty() {
        return Err((StatusCode::BAD_GATEWAY, "embed returned nothing".into()));
    }
    // Narrow the embed output to f32 for storage (`as f32` cast).
    let emb: Vec = resp.embeddings[0].iter().map(|&x| x as f32).collect();
    // Parse city/state from the operation ("fill: Role xN in City, ST").
    // Parser lives in playbook_memory::rebuild — expose via a tiny helper
    // or inline the same logic here; duplicated briefly since this seed
    // path is stable but infrequently called.
let (city, state_) = {
    // Everything after the first " in " is "City, ST…": split once on the
    // comma, trim both halves, keep only the leading ASCII letters of the
    // state code.
    let after_in = req.operation.split(" in ").nth(1).unwrap_or("");
    let mut parts = after_in.splitn(2, ',');
    let city = parts.next().map(|s| s.trim().to_string()).filter(|s| !s.is_empty());
    let state = parts.next().map(|s| s.trim().chars().take_while(|c| c.is_ascii_alphabetic()).collect::()).filter(|s| !s.is_empty());
    (city, state)
};
if city.is_none() || state_.is_none() {
    return Err((StatusCode::BAD_REQUEST, "operation must match 'fill: Role xN in City, ST' shape".into()));
}
// Stable id: hash of timestamp + operation. Callers get the id back
// so they can reference it in citations.
// NOTE(review): the current timestamp is part of the hash, so re-seeding
// the same operation yields a *different* id each call — "stable" holds
// only within this one request.
let ts = chrono::Utc::now().to_rfc3339();
use sha2::{Digest, Sha256};
let mut h = Sha256::new();
h.update(ts.as_bytes());
h.update(b"|");
h.update(req.operation.as_bytes());
let bytes = h.finalize();
// First 8 digest bytes, hex-encoded → "pb-seed-<16 hex chars>".
let pid = format!("pb-seed-{}", bytes.iter().take(8).map(|b| format!("{b:02x}")).collect::());
let new_entry = playbook_memory::PlaybookEntry {
    playbook_id: pid.clone(),
    operation: req.operation,
    approach: req.approach,
    context: req.context,
    timestamp: ts,
    endorsed_names: req.endorsed_names,
    city,
    state: state_,
    embedding: Some(emb),
    // Phase 25 — seed request may carry a fingerprint; if not, we
    // default to None and the entry degrades to "no expiry signal"
    // (never auto-retired on drift, but manual retirement still
    // works). valid_until + retired_at start None.
    schema_fingerprint: req.schema_fingerprint.clone(),
    valid_until: req.valid_until.clone(),
    retired_at: None,
    retirement_reason: None,
    version: 1,
    parent_id: None,
    superseded_at: None,
    superseded_by: None,
};
// Phase 26 — when append=true (default), route through upsert so
// same-day re-seeds of the same operation merge instead of
// appending duplicates. When append=false, retain the old
// replace-all semantics for callers that want a hard reset.
if req.append {
    // Default path: merge-or-insert via the memory's own upsert logic.
    match state.playbook_memory.upsert_entry(new_entry).await {
        Ok(outcome) => {
            let entries_after = state.playbook_memory.entry_count().await;
            Ok(Json(serde_json::json!({ "outcome": outcome, "entries_after": entries_after, })))
        }
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, format!("upsert: {e}"))),
    }
} else {
    // Hard reset: the seeded entry becomes the entire memory.
    if let Err(e) = state.playbook_memory.set_entries(vec![new_entry]).await {
        return Err((StatusCode::INTERNAL_SERVER_ERROR, format!("persist: {e}")));
    }
    Ok(Json(serde_json::json!({ "outcome": { "mode": "replaced", "playbook_id": pid }, "entries_after": state.playbook_memory.entry_count().await, })))
}
}

/// Re-derive the whole playbook memory from source data; thin wrapper
/// over `playbook_memory::rebuild` (catalog + bucket registry supplied
/// from shared state).
async fn rebuild_playbook_memory(
    State(state): State,
) -> impl IntoResponse {
    match playbook_memory::rebuild(
        &state.playbook_memory,
        &state.ai_client,
        &state.catalog,
        &state.bucket_registry,
    ).await {
        Ok(report) => Ok(Json(report)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

// Path 2 foundation — dump in-memory playbook_memory state to a fresh
// `successful_playbooks_live` dataset. Cheap to call (writes one parquet,
// updates one manifest), so /log can call it after every seed to keep the
// SQL-queryable surface honest without the destructive REPLACE bug that
// /ingest/file has.
async fn persist_playbook_memory_sql(
    State(state): State,
) -> impl IntoResponse {
    match playbook_memory::persist_to_sql(&state.playbook_memory, &state.catalog).await {
        Ok(report) => Ok(Json(report)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

/// Request body for pattern discovery over playbook memory.
#[derive(Deserialize)]
struct PatternsRequest {
    query: String,
    #[serde(default = "default_pattern_k")]
    top_k_playbooks: usize,
    /// Minimum frequency (0.0-1.0) for a trait to make the report.
    /// Default 0.4 — at least 40% of examined workers must share it.
    #[serde(default = "default_pattern_min_freq")]
    min_trait_frequency: f32,
}

// serde default helpers for PatternsRequest.
fn default_pattern_k() -> usize { 10 }
fn default_pattern_min_freq() -> f32 { 0.4 }

// Path 2 — meta-index discovery surface. "What did past similar fills
// have in common that I didn't ask about?" — surfaces signals like
// recurring certifications, skill clusters, archetype tendencies.
async fn discover_playbook_patterns(
    State(state): State,
    Json(req): Json,
) -> impl IntoResponse {
    match playbook_memory::discover_patterns(
        &state.playbook_memory,
        &state.ai_client,
        &state.catalog,
        &state.bucket_registry,
        &req.query,
        req.top_k_playbooks,
        req.min_trait_frequency,
    ).await {
        Ok(report) => Ok(Json(report)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

#[derive(Deserialize)]
struct MarkFailedRequest {
    /// Operation text, same shape as seed: "fill: Role xN in City, ST"
    operation: String,
    /// Names of workers who didn't deliver on the fill.
    failed_names: Vec,
    /// Short reason (no-show, fired, unreliable). Stored verbatim.
    #[serde(default)]
    reason: String,
}

/// Record workers who failed a fill so future boosts can account for
/// them (see `mark_failures` on the memory).
async fn mark_playbook_failed(
    State(state): State,
    Json(req): Json,
) -> impl IntoResponse {
    // Parse city + state from the operation — mirrors seed's parser.
let after_in = req.operation.split(" in ").nth(1).unwrap_or("");
let mut parts = after_in.splitn(2, ',');
let city = parts.next().map(|s| s.trim().to_string()).filter(|s| !s.is_empty());
let state_ = parts.next().map(|s| s.trim().chars().take_while(|c| c.is_ascii_alphabetic()).collect::() ).filter(|s| !s.is_empty());
// Both parts are mandatory — otherwise 400 with the shape hint.
let (Some(city), Some(state_code)) = (city, state_) else {
    return Err((StatusCode::BAD_REQUEST, "operation must match 'fill: Role xN in City, ST' shape".into()));
};
let ts = chrono::Utc::now().to_rfc3339();
// One FailureRecord per failed name, all stamped with the same
// timestamp + reason.
let records: Vec = req.failed_names.iter()
    .map(|n| playbook_memory::FailureRecord {
        city: city.clone(),
        state: state_code.clone(),
        name: n.clone(),
        reason: req.reason.clone(),
        timestamp: ts.clone(),
    })
    .collect();
match state.playbook_memory.mark_failures(records).await {
    Ok(added) => Ok(Json(serde_json::json!({ "added": added, "city": city, "state": state_code }))),
    Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
}
}

/// Lightweight introspection: entry/endorsement/embedding counts plus
/// the first three entries as a sample.
async fn playbook_memory_stats(
    State(state): State,
) -> impl IntoResponse {
    let entries = state.playbook_memory.snapshot().await;
    Json(serde_json::json!({
        "entries": entries.len(),
        "total_names_endorsed": entries.iter().map(|e| e.endorsed_names.len()).sum::(),
        "entries_with_embeddings": entries.iter().filter(|e| e.embedding.is_some()).count(),
        "sample": entries.iter().take(3).map(|e| serde_json::json!({
            "id": e.playbook_id,
            "operation": e.operation,
            "city": e.city,
            "state": e.state,
            "endorsed": e.endorsed_names,
        })).collect::>(),
    }))
}

#[derive(Deserialize)]
struct RetirePlaybookRequest {
    /// Retire by playbook_id — exact match, single entry. Used for
    /// manual operator retirement via the UI.
    #[serde(default)]
    playbook_id: Option,
    /// Retire by scope — city + state required, with a fingerprint
    /// that entries must match to survive. Fingerprint mismatch → retire.
    /// Use when a schema migration produces a new fingerprint and
    /// historical playbooks need to be auto-retired.
    #[serde(default)]
    city: Option,
    #[serde(default)]
    state: Option,
    #[serde(default)]
    current_schema_fingerprint: Option,
    /// Human-readable reason stored on the retired entry.
    reason: String,
}

/// Phase 25 retirement endpoint. Two modes:
///   {playbook_id, reason} → retire one
///   {city, state, current_schema_fingerprint, reason} → retire all
///   entries in scope whose fingerprint differs
async fn retire_playbook_memory(
    State(state): State,
    Json(req): Json,
) -> impl IntoResponse {
    // Mode 1 wins if both shapes are supplied: by-id is checked first.
    if let Some(id) = &req.playbook_id {
        return match state.playbook_memory.retire_one(id, &req.reason).await {
            Ok(found) => Ok(Json(serde_json::json!({ "mode": "by_id", "retired": if found { 1 } else { 0 } }))),
            Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
        };
    }
    // Mode 2: schema-drift sweep over a (city, state) scope.
    if let (Some(city), Some(state_code), Some(fp)) = (&req.city, &req.state, &req.current_schema_fingerprint) {
        return match state.playbook_memory.retire_on_schema_drift(city, state_code, fp, &req.reason).await {
            Ok(n) => Ok(Json(serde_json::json!({ "mode": "schema_drift", "retired": n, "city": city, "state": state_code }))),
            Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
        };
    }
    Err((StatusCode::BAD_REQUEST, "supply either {playbook_id, reason} or {city, state, current_schema_fingerprint, reason}".into()))
}

/// Phase 27 — request body for `POST /playbook_memory/revise`. Same
/// shape as a seed request minus `append` (revise is always
/// append-semantics for a specific parent) plus `parent_id`. The new
/// version's `playbook_id` is derived deterministically so callers get
/// the same id back from repeated revises with identical content —
/// useful for idempotent retry paths.
/// NOTE(review): the handler hashes the *current timestamp* into the id
/// (see below), so repeated revises do not actually yield the same id —
/// this doc and the code disagree; confirm which is intended.
#[derive(Deserialize)]
struct RevisePlaybookRequest {
    parent_id: String,
    operation: String,
    approach: String,
    context: String,
    endorsed_names: Vec,
    #[serde(default)]
    schema_fingerprint: Option,
    #[serde(default)]
    valid_until: Option,
}

/// Phase 27 — create a new version of an existing playbook. The parent
/// is marked superseded; the new entry inherits the chain via
/// `parent_id` and carries `version = parent.version + 1`. Errors with
/// 400 on a retired or already-superseded parent (must revise the tip
/// of the chain). Embeds the new text through the same shape as
/// `/seed` so cosine similarity stays comparable across rebuild + seed
/// + revise entries.
async fn revise_playbook_memory(
    State(state): State,
    Json(req): Json,
) -> Result, (StatusCode, String)> {
    // Same embed-text shape as /seed and /rebuild.
    let text = format!(
        "{} | {} | {} | fills: {}",
        req.operation, req.approach, req.context, req.endorsed_names.join(", "),
    );
    let resp = state.ai_client.embed(EmbedRequest { texts: vec![text], model: None })
        .await
        .map_err(|e| (StatusCode::BAD_GATEWAY, format!("embed revise: {e}")))?;
    if resp.embeddings.is_empty() {
        return Err((StatusCode::BAD_GATEWAY, "embed returned nothing".into()));
    }
    let emb: Vec = resp.embeddings[0].iter().map(|&x| x as f32).collect();
    // Same "City, ST" parse as /seed — shape gate before any mutation.
    let (city, state_) = {
        let after_in = req.operation.split(" in ").nth(1).unwrap_or("");
        let mut parts = after_in.splitn(2, ',');
        let city = parts.next().map(|s| s.trim().to_string()).filter(|s| !s.is_empty());
        let state = parts.next()
            .map(|s| s.trim().chars().take_while(|c| c.is_ascii_alphabetic()).collect::())
            .filter(|s| !s.is_empty());
        (city, state)
    };
    if city.is_none() || state_.is_none() {
        return Err((StatusCode::BAD_REQUEST, "operation must match 'fill: Role xN in City, ST' shape".into()));
    }
    let ts = chrono::Utc::now().to_rfc3339();
    // Id = "pb-rev-" + first 8 bytes of SHA-256(ts | parent_id | operation),
    // hex-encoded.
    use sha2::{Digest, Sha256};
    let mut h = Sha256::new();
    h.update(ts.as_bytes());
    h.update(b"|");
    h.update(req.parent_id.as_bytes());
    h.update(b"|");
    h.update(req.operation.as_bytes());
    let bytes = h.finalize();
    let pid = format!("pb-rev-{}", bytes.iter().take(8).map(|b| format!("{b:02x}")).collect::());
    let new_entry = playbook_memory::PlaybookEntry {
        playbook_id: pid.clone(),
        operation: req.operation,
        approach: req.approach,
        context: req.context,
        timestamp: ts,
        endorsed_names: req.endorsed_names,
city, state: state_, embedding: Some(emb),
        schema_fingerprint: req.schema_fingerprint,
        valid_until: req.valid_until,
        retired_at: None,
        retirement_reason: None,
        // revise_entry overwrites these from the parent — values here
        // are just placeholders so the struct is well-formed.
        version: 1,
        parent_id: None,
        superseded_at: None,
        superseded_by: None,
    };
    // revise_entry's error (e.g. retired/superseded parent) surfaces
    // as 400.
    let outcome = state.playbook_memory.revise_entry(&req.parent_id, new_entry)
        .await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    Ok(Json(serde_json::json!({
        "outcome": outcome,
        "entries_after": state.playbook_memory.entry_count().await,
    })))
}

/// Phase 27 — return the full version chain containing `playbook_id`,
/// ordered root → tip. 404 if the id isn't present. The walker is
/// cycle-safe by construction (visited set per direction).
async fn playbook_memory_history(
    State(state): State,
    Path(playbook_id): Path,
) -> Result, (StatusCode, String)> {
    let chain = state.playbook_memory.history(&playbook_id).await;
    if chain.is_empty() {
        return Err((StatusCode::NOT_FOUND, format!("no playbook with id '{playbook_id}'")));
    }
    Ok(Json(serde_json::json!({
        "playbook_id": playbook_id,
        "versions": chain.len(),
        "chain": chain,
    })))
}

/// Phase 25 status endpoint — reports retirement counts so dashboards
/// can show "N playbooks retired (12 from 2026-05 schema migration)".
/// Phase 27 added `superseded` as a distinct counter.
async fn playbook_memory_status(
    State(state): State,
) -> impl IntoResponse {
    let (total, retired, superseded, failures) = state.playbook_memory.status_counts().await;
    // `active` = entries eligible for boost. Retired and superseded are
    // distinct exclusion reasons; subtract both. An entry can in principle
    // be both retired AND superseded (e.g. revised then retired) so
    // saturating_sub guards against underflow if that pathological case
    // ever lands.
    // NOTE(review): in that both-flags case the entry is counted twice in
    // `inactive`, so `active` would *under*-report (clamped at 0) rather
    // than be exact; an accurate figure needs a distinct-entry count.
    let inactive = retired + superseded;
    Json(serde_json::json!({
        "total": total,
        "retired": retired,
        "superseded": superseded,
        "active": total.saturating_sub(inactive),
        "failures": failures,
    }))
}

/// Execute a stored harness against the Lance backend and report
/// recall@k plus per-query latency — see `LanceRecallRequest` /
/// `LanceRecallResult` above.
async fn lance_recall_harness(
    State(state): State,
    Path(index_name): Path,
    Json(req): Json,
) -> impl IntoResponse {
    let t0 = std::time::Instant::now();
    // Federation-aware resolution: the harness loads from the index's
    // recorded bucket (see VectorState::harness_store docs).
    let harness_set = state.harness_store.load_for_index(&index_name, &req.harness).await
        .map_err(|e| (StatusCode::NOT_FOUND, format!("harness: {e}")))?;
    if !harness_set.ground_truth_built {
        return Err((StatusCode::BAD_REQUEST, "harness has no ground truth — run a regular /hnsw/trial first to compute it".into()));
    }
    let lance_store = state.lance.store_for(&index_name).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    let k = req.top_k;
    let mut per_query = Vec::with_capacity(harness_set.queries.len());
    let mut latencies: Vec = Vec::with_capacity(harness_set.queries.len());
    let mut recalls: Vec = Vec::with_capacity(harness_set.queries.len());
    for q in &harness_set.queries {
        // Queries missing an embedding or ground truth are silently
        // skipped — the result's `queries` counts only executed ones.
        let qv = match &q.query_embedding { Some(v) => v, None => continue, };
        let gt = match &q.ground_truth { Some(gt) => gt, None => continue, };
        let qt0 = std::time::Instant::now();
        let hits = lance_store.search(
            qv,
            k,
            // Request overrides win; otherwise server defaults.
            Some(req.nprobes.unwrap_or(LANCE_DEFAULT_NPROBES)),
            Some(req.refine_factor.unwrap_or(LANCE_DEFAULT_REFINE_FACTOR)),
        ).await
            .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("search: {e}")))?;
        let lat_us = qt0.elapsed().as_micros() as f32;
        let predicted: Vec = hits.iter().map(|h| h.doc_id.clone()).collect();
        let recall = harness::recall_at_k(&predicted, gt, k);
        per_query.push(LanceRecallQuery { query_id: q.id.clone(), recall, latency_us: lat_us, hits_returned: hits.len(), });
        latencies.push(lat_us);
        recalls.push(recall);
    }
    let mean_recall = if recalls.is_empty() { 0.0 } else { recalls.iter().sum::() / recalls.len() as f32 };
    // NaN-tolerant ascending sort for the percentile closure below.
    latencies.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    // Nearest-rank percentile over the sorted latencies.
    let p = |pct: f32| -> f32
{
    // Empty run → 0.0; otherwise round to the nearest rank and clamp to
    // the last element.
    if latencies.is_empty() { return 0.0; }
    let idx = ((latencies.len() as f32 - 1.0) * pct).round() as usize;
    latencies[idx.min(latencies.len() - 1)]
};
Ok(Json(LanceRecallResult {
    index_name,
    harness: req.harness,
    queries: per_query.len(),
    top_k: k,
    mean_recall,
    per_query,
    latency_p50_us: p(0.50),
    latency_p95_us: p(0.95),
    total_duration_secs: t0.elapsed().as_secs_f32(),
}))
}

/// Build a scalar btree index on a column (typically `doc_id`). Makes
/// filter-pushdown queries O(log N) instead of full-fragment scan.
async fn lance_build_scalar_index(
    State(state): State,
    Path((index_name, column)): Path<(String, String)>,
) -> impl IntoResponse {
    let lance_store = state.lance.store_for(&index_name).await
        .map_err(|e| (StatusCode::BAD_REQUEST, e))?;
    match lance_store.build_scalar_index(&column).await {
        Ok(stats) => Ok(Json(stats)),
        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
    }
}

// Unit tests for the SQL-filter equality extractors defined above.
#[cfg(test)]
mod extractor_tests {
    use super::*;

    #[test]
    fn extract_target_geo_basic() {
        let f = "role = 'Welder' AND city = 'Toledo' AND state = 'OH' AND CAST(availability AS DOUBLE) > 0.5";
        assert_eq!(extract_target_geo(f), Some(("Toledo".into(), "OH".into())));
    }

    #[test]
    fn extract_target_geo_missing_state_returns_none() {
        let f = "role = 'Welder' AND city = 'Toledo'";
        assert_eq!(extract_target_geo(f), None);
    }

    #[test]
    fn extract_target_geo_word_boundary() {
        // "civilian" contains "city" as a substring — must not match.
        // NOTE(review): "civilian_rank" does not actually contain the
        // substring "city", so this fixture never exercises the boundary
        // check; something like "velocity = '1'" (which ends in "city")
        // would be a stricter test.
        let f = "civilian_rank = 1 AND city = 'Toledo' AND state = 'OH'";
        assert_eq!(extract_target_geo(f), Some(("Toledo".into(), "OH".into())));
    }

    #[test]
    fn extract_target_role_basic() {
        let f = "role = 'Welder' AND city = 'Toledo'";
        assert_eq!(extract_target_role(f), Some("Welder".into()));
    }

    #[test]
    fn extract_target_role_none_when_absent() {
        let f = "city = 'Toledo' AND state = 'OH'";
        assert_eq!(extract_target_role(f), None);
    }

    #[test]
    fn extract_target_role_multi_word() {
        let f = "role = 'Warehouse Associate' AND city = 'Chicago'";
        assert_eq!(extract_target_role(f), Some("Warehouse Associate".into()));
    }
}