commit 5b1fcf6d27 Phase 28-36 body of work
Accumulated since a6f12e2 (Phase 21 Rust port + Phase 27 versioning):

- Phase 36: embed_semaphore on VectorState (permits=1) serializes
  seed embed calls — prevents sidecar socket collisions under
  concurrent /seed stress load
- Phase 31+: run_stress.ts 6-task diverse stress scaffolding;
  run_e2e_rated.ts + orchestrator.ts tightening
- Catalog dedupe cleanup: 16 duplicate manifests removed; canonical
  candidates.parquet (10.5MB -> 76KB) + placements.parquet (1.2MB ->
  11KB) regenerated post-dedupe; fresh manifests for active datasets
- vectord: harness EvalSet refinements (+181), agent portfolio
  rotation + ingest triggers (+158), autotune + rag adjustments
- catalogd/storaged/ingestd/mcp-server: misc tightening
- docs: Phase 28-36 PRD entries + DECISIONS ADR additions;
  control-plane pivot banner added to top of docs/PRD.md (pointing
  at docs/CONTROL_PLANE_PRD.md which lands in next commit)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 02:41:15 -05:00

184 lines
7.0 KiB
Rust

/// RAG pipeline: question → embed → search → rerank → generate answer.
///
/// The rerank step (added 2026-04-17) uses the LLM as a cross-encoder
/// between retrieval and generation. This catches cases where the
/// embedding model scores documents as similar but the content isn't
/// actually relevant to the question — a known weakness of small
/// general-purpose embed models on domain-specific text.
use object_store::ObjectStore;
use std::sync::Arc;
use aibridge::client::{AiClient, EmbedRequest, GenerateRequest};
use aibridge::continuation::{generate_continuable, ContinuableOpts, ResponseShape};
use crate::search::{self, SearchResult};
use crate::store;
/// Cross-encoder rerank: ask the LLM to re-sort retrieved chunks by
/// relevance. Falls back to the original order if the model returns
/// garbage (which happens ~5% of the time with 7B models).
///
/// Takes ownership of `results` and returns a permutation of it — every
/// input chunk appears exactly once in the output, ranked ones first.
async fn rerank(
    question: &str,
    results: Vec<SearchResult>,
    ai_client: &AiClient,
) -> Vec<SearchResult> {
    // Zero or one chunk: nothing to reorder.
    if results.len() <= 1 {
        return results;
    }
    // Present each chunk as "[idx] first-200-chars". chars() (not a byte
    // slice) keeps the truncation UTF-8-safe.
    let chunk_list: String = results
        .iter()
        .enumerate()
        .map(|(i, r)| {
            let text: String = r.chunk_text.chars().take(200).collect();
            format!("[{i}] {text}")
        })
        .collect::<Vec<_>>()
        .join("\n");
    let resp = ai_client
        .generate(GenerateRequest {
            prompt: format!(
                "Rank these text chunks by relevance to the question.\n\
                 Return ONLY a comma-separated list of indices, most relevant first.\n\n\
                 Question: {question}\n\n\
                 Chunks:\n{chunk_list}\n\n\
                 Ranking:"
            ),
            model: None,
            system: None,
            temperature: Some(0.0),
            max_tokens: Some(50),
            // Reranker returns a comma-separated int list — pure structured
            // output, zero benefit from hidden reasoning. Opt out to avoid
            // the empty-response failure mode Phase 21 catalogued.
            think: Some(false),
        })
        .await;
    match resp {
        Ok(gen_resp) => {
            let text = gen_resp.text.trim();
            // Accept commas and/or whitespace as separators; silently drop
            // tokens that don't parse or are out of range.
            let indices: Vec<usize> = text
                .split(|c: char| c == ',' || c.is_whitespace())
                .filter_map(|s| s.trim().parse::<usize>().ok())
                .filter(|&i| i < results.len())
                .collect();
            if indices.is_empty() {
                tracing::debug!("reranker returned unparseable output: {text}");
                return results;
            }
            // Move (not clone) each result into ranked order: take() empties
            // the slot, which both avoids a per-chunk clone and deduplicates
            // any repeated indices the model emits.
            let mut slots: Vec<Option<SearchResult>> = results.into_iter().map(Some).collect();
            let total = slots.len();
            let mut reranked: Vec<SearchResult> = Vec::with_capacity(total);
            for &i in &indices {
                if let Some(r) = slots[i].take() {
                    reranked.push(r);
                }
            }
            // Distinct, in-range indices actually honored — indices.len()
            // would over-count when the model repeats an index.
            let reordered = reranked.len();
            // Append any the model didn't mention (preserves all results).
            reranked.extend(slots.into_iter().flatten());
            tracing::info!("reranker reordered {}/{} chunks", reordered, total);
            reranked
        }
        Err(e) => {
            tracing::debug!("reranker failed (using original order): {e}");
            results
        }
    }
}
/// Full RAG answer with provenance.
///
/// Serialized (via serde) as the RAG endpoint's response payload.
#[derive(Debug, Clone, serde::Serialize)]
pub struct RagResponse {
    /// Generated answer text (trimmed; cites sources as [1], [2], …).
    pub answer: String,
    /// Model used for the answer; empty string when no results were found.
    pub model: String,
    /// Retrieved (and reranked) chunks the answer was grounded on.
    pub sources: Vec<SearchResult>,
    /// Token count from the generator, when the backend reports one.
    /// NOTE(review): not currently populated by `query` — always `None` there.
    pub tokens_generated: Option<u64>,
}
/// Execute full RAG: embed question → search index → retrieve context → generate answer.
///
/// Steps: (1) embed the question, (2) vector-search `index_name` for the
/// `top_k` nearest chunks, (3) LLM cross-encoder rerank, (4) build a cited
/// context block, (5) generate the final answer via the Phase 21
/// continuation-aware primitive.
///
/// # Errors
/// Returns `Err` when embedding fails, the embed response is empty, the
/// index cannot be loaded, or answer generation fails. An empty search
/// result is NOT an error — it yields a "No relevant information found."
/// response with no sources.
pub async fn query(
    question: &str,
    index_name: &str,
    top_k: usize,
    object_store: &Arc<dyn ObjectStore>,
    ai_client: &AiClient,
) -> Result<RagResponse, String> {
    // Single source of truth for the answer model — used both for the
    // continuation call and the model reported in RagResponse, so the two
    // can't drift apart.
    const ANSWER_MODEL: &str = "qwen2.5:latest";
    // 1. Embed the question
    tracing::info!("RAG: embedding question");
    let embed_resp = ai_client
        .embed(EmbedRequest {
            texts: vec![question.to_string()],
            model: None,
        })
        .await?;
    if embed_resp.embeddings.is_empty() {
        return Err("no embedding returned for question".into());
    }
    let query_vec: Vec<f32> = embed_resp.embeddings[0].iter().map(|&x| x as f32).collect();
    // 2. Load index and search
    tracing::info!("RAG: searching index '{index_name}'");
    let embeddings = store::load_embeddings(object_store, index_name).await?;
    let results = search::search(&query_vec, &embeddings, top_k);
    if results.is_empty() {
        return Ok(RagResponse {
            answer: "No relevant information found.".into(),
            model: String::new(),
            sources: vec![],
            tokens_generated: None,
        });
    }
    // 3. Rerank: ask the LLM to sort retrieved chunks by relevance.
    // This cross-encoder step catches cases where embedding similarity
    // is high but semantic relevance is low — common with small embed
    // models on domain-specific text.
    let results = rerank(question, results, ai_client).await;
    // 4. Build context from (reranked) top chunks; numbered [1..] so the
    // prompt's citation instruction lines up with these labels.
    let context: String = results
        .iter()
        .enumerate()
        .map(|(i, r)| {
            format!("[{}] (source: {}, doc: {}) {}", i + 1, r.source, r.doc_id, r.chunk_text)
        })
        .collect::<Vec<_>>()
        .join("\n\n");
    // 5. Generate answer
    tracing::info!("RAG: generating answer from {} chunks", results.len());
    let prompt = format!(
        "You are a helpful assistant answering questions about a staffing database.\n\n\
         Use ONLY the following retrieved records to answer. Be specific — cite names,\n\
         numbers, skills, and cities from the records. If the context doesn't contain\n\
         enough information to fully answer, say what you can and note what's missing.\n\
         Cite sources by their number [1], [2], etc.\n\n\
         Context:\n{context}\n\n\
         Question: {question}\n\n\
         Answer:"
    );
    // Route the answer call through Phase 21's generate_continuable so
    // a thinking-model empty-response or a mid-JSON truncation self-
    // repairs instead of silently returning half an answer. Shape is
    // Text (the answer is prose, not JSON), think is Some(false) to
    // opt out of hidden reasoning on the hot path. This is the first
    // production caller of the Phase 21 primitives — see audit finding
    // "Phase 21 Rust primitives are wired but not CALLED by any
    // production surface" from 2026-04-21.
    let mut cont_opts = ContinuableOpts::new(ANSWER_MODEL);
    cont_opts.max_tokens = Some(512);
    cont_opts.temperature = Some(0.2);
    cont_opts.shape = ResponseShape::Text;
    cont_opts.think = Some(false);
    let outcome = generate_continuable(ai_client, &prompt, &cont_opts).await?;
    Ok(RagResponse {
        answer: outcome.text.trim().to_string(),
        // generate_continuable doesn't surface the model name (sidecar
        // echoes whatever Ollama loaded). Use the configured tier model
        // for now; if RAG needs to report the actual resolved model,
        // the runner can add a post-call ps probe later.
        model: ANSWER_MODEL.to_string(),
        sources: results,
        tokens_generated: None,
    })
}