Accumulated since a6f12e2 (Phase 21 Rust port + Phase 27 versioning):

- Phase 36: embed_semaphore on VectorState (permits=1) serializes seed embed calls — prevents sidecar socket collisions under concurrent /seed stress load
- Phase 31+: run_stress.ts 6-task diverse stress scaffolding; run_e2e_rated.ts + orchestrator.ts tightening
- Catalog dedupe cleanup: 16 duplicate manifests removed; canonical candidates.parquet (10.5MB -> 76KB) + placements.parquet (1.2MB -> 11KB) regenerated post-dedupe; fresh manifests for active datasets
- vectord: harness EvalSet refinements (+181), agent portfolio rotation + ingest triggers (+158), autotune + rag adjustments
- catalogd/storaged/ingestd/mcp-server: misc tightening
- docs: Phase 28-36 PRD entries + DECISIONS ADR additions; control-plane pivot banner added to top of docs/PRD.md (pointing at docs/CONTROL_PLANE_PRD.md which lands in next commit)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
184 lines
7.0 KiB
Rust
//! RAG pipeline: question → embed → search → rerank → generate answer.
//!
//! The rerank step (added 2026-04-17) uses the LLM as a cross-encoder
//! between retrieval and generation. This catches cases where the
//! embedding model scores documents as similar but the content isn't
//! actually relevant to the question — a known weakness of small
//! general-purpose embed models on domain-specific text.
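//!
//! A minimal calling sketch (assumes an `AiClient` and an `Arc<dyn ObjectStore>`
//! are already wired up by the caller; the question, index name, and top_k
//! values are illustrative):
//!
//! ```ignore
//! let resp = query("Which candidates in Lyon know Rust?", "candidates", 8, &object_store, &ai_client).await?;
//! println!("{} ({} sources)", resp.answer, resp.sources.len());
//! ```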

use object_store::ObjectStore;
use std::sync::Arc;

use aibridge::client::{AiClient, EmbedRequest, GenerateRequest};
use aibridge::continuation::{generate_continuable, ContinuableOpts, ResponseShape};
use crate::search::{self, SearchResult};
use crate::store;

/// Cross-encoder rerank: ask the LLM to re-sort retrieved chunks by
/// relevance. Falls back to the original order if the model returns
/// garbage (which happens ~5% of the time with 7B models).
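///
/// Illustrative round trip (the chunk text and the ordering are made up;
/// only the output format matters):
///
/// ```text
/// Chunks:
/// [0] Candidate: Amara N., skills: Python, Django ...
/// [1] Candidate: Jonas K., skills: Rust, Tokio ...
/// [2] Candidate: Priya S., skills: Rust, Postgres ...
///
/// model output: "1, 2, 0"   =>  results reordered to [1], [2], [0]
/// ```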
async fn rerank(
    question: &str,
    mut results: Vec<SearchResult>,
    ai_client: &AiClient,
) -> Vec<SearchResult> {
    if results.len() <= 1 {
        return results;
    }

    let chunk_list: String = results.iter().enumerate().map(|(i, r)| {
        let text: String = r.chunk_text.chars().take(200).collect();
        format!("[{i}] {text}")
    }).collect::<Vec<_>>().join("\n");

    let resp = ai_client.generate(GenerateRequest {
        prompt: format!(
            "Rank these text chunks by relevance to the question.\n\
             Return ONLY a comma-separated list of indices, most relevant first.\n\n\
             Question: {question}\n\n\
             Chunks:\n{chunk_list}\n\n\
             Ranking:"
        ),
        model: None,
        system: None,
        temperature: Some(0.0),
        max_tokens: Some(50),
        // Reranker returns a comma-separated int list — pure structured
        // output, zero benefit from hidden reasoning. Opt out to avoid
        // the empty-response failure mode Phase 21 catalogued.
        think: Some(false),
    }).await;

    match resp {
        Ok(gen_resp) => {
            let text = gen_resp.text.trim();
            let indices: Vec<usize> = text
                .split(|c: char| c == ',' || c.is_whitespace())
                .filter_map(|s| s.trim().parse::<usize>().ok())
                .filter(|&i| i < results.len())
                .collect();

            if indices.is_empty() {
                tracing::debug!("reranker returned unparseable output: {text}");
                return results;
            }

            let mut reranked: Vec<SearchResult> = Vec::with_capacity(results.len());
            let mut used = vec![false; results.len()];
            for &i in &indices {
                if !used[i] {
                    reranked.push(results[i].clone());
                    used[i] = true;
                }
            }
            // Append any the model didn't mention (preserves all results).
            for (i, r) in results.drain(..).enumerate() {
                if !used[i] {
                    reranked.push(r);
                }
            }
            tracing::info!("reranker reordered {}/{} chunks", indices.len(), reranked.len());
            reranked
        }
        Err(e) => {
            tracing::debug!("reranker failed (using original order): {e}");
            results
        }
    }
}
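
// Minimal sketch of the index parsing rerank() relies on, run against a
// made-up model reply: split on commas/whitespace, keep only values that
// parse as usize, and drop anything out of range for the result set.
#[cfg(test)]
mod rerank_parse_sketch {
    #[test]
    fn parses_comma_separated_indices() {
        let reply = "2, 0 1 garbage 99";
        let len: usize = 3; // pretend three chunks were retrieved
        let indices: Vec<usize> = reply
            .split(|c: char| c == ',' || c.is_whitespace())
            .filter_map(|s| s.trim().parse::<usize>().ok())
            .filter(|&i| i < len)
            .collect();
        assert_eq!(indices, vec![2, 0, 1]);
    }
}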

/// Full RAG answer with provenance.
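///
/// Serialized with serde_json it looks roughly like this (values are
/// illustrative; `SearchResult`'s own fields are defined in `crate::search`):
///
/// ```json
/// {
///   "answer": "Jonas K. [1] and Priya S. [2] both list Rust ...",
///   "model": "qwen2.5:latest",
///   "sources": [ { "source": "candidates", "doc_id": "cand-00042", "chunk_text": "..." } ],
///   "tokens_generated": null
/// }
/// ```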
#[derive(Debug, Clone, serde::Serialize)]
pub struct RagResponse {
    pub answer: String,
    pub model: String,
    pub sources: Vec<SearchResult>,
    pub tokens_generated: Option<u64>,
}

/// Execute full RAG: embed question → search index → retrieve context → generate answer.
pub async fn query(
    question: &str,
    index_name: &str,
    top_k: usize,
    object_store: &Arc<dyn ObjectStore>,
    ai_client: &AiClient,
) -> Result<RagResponse, String> {
    // 1. Embed the question
    tracing::info!("RAG: embedding question");
    let embed_resp = ai_client.embed(EmbedRequest {
        texts: vec![question.to_string()],
        model: None,
    }).await?;

    if embed_resp.embeddings.is_empty() {
        return Err("no embedding returned for question".into());
    }
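
    // The embed response carries f64 values; narrow to f32 to match the query
    // vector type search::search expects.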
    let query_vec: Vec<f32> = embed_resp.embeddings[0].iter().map(|&x| x as f32).collect();

    // 2. Load index and search
    tracing::info!("RAG: searching index '{index_name}'");
    let embeddings = store::load_embeddings(object_store, index_name).await?;
    let results = search::search(&query_vec, &embeddings, top_k);

    if results.is_empty() {
        return Ok(RagResponse {
            answer: "No relevant information found.".into(),
            model: String::new(),
            sources: vec![],
            tokens_generated: None,
        });
    }

    // 3. Rerank: ask the LLM to sort retrieved chunks by relevance.
    // This cross-encoder step catches cases where embedding similarity
    // is high but semantic relevance is low — common with small embed
    // models on domain-specific text.
    let results = rerank(question, results, ai_client).await;

    // 4. Build context from (reranked) top chunks
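    // Each rendered entry looks roughly like (values illustrative):
    //   [1] (source: candidates, doc: cand-00042) Candidate: Jonas K., Lyon, skills: Rust, Tokio ...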
    let context: String = results.iter().enumerate().map(|(i, r)| {
        format!("[{}] (source: {}, doc: {}) {}", i + 1, r.source, r.doc_id, r.chunk_text)
    }).collect::<Vec<_>>().join("\n\n");

    // 5. Generate answer
    tracing::info!("RAG: generating answer from {} chunks", results.len());
    let prompt = format!(
        "You are a helpful assistant answering questions about a staffing database.\n\n\
         Use ONLY the following retrieved records to answer. Be specific — cite names,\n\
         numbers, skills, and cities from the records. If the context doesn't contain\n\
         enough information to fully answer, say what you can and note what's missing.\n\
         Cite sources by their number [1], [2], etc.\n\n\
         Context:\n{context}\n\n\
         Question: {question}\n\n\
         Answer:"
    );

    // Route the answer call through Phase 21's generate_continuable so
    // a thinking-model empty-response or a mid-JSON truncation self-
    // repairs instead of silently returning half an answer. Shape is
    // Text (the answer is prose, not JSON), think is Some(false) to
    // opt out of hidden reasoning on the hot path. This is the first
    // production caller of the Phase 21 primitives — see audit finding
    // "Phase 21 Rust primitives are wired but not CALLED by any
    // production surface" from 2026-04-21.
    let mut cont_opts = ContinuableOpts::new("qwen2.5:latest");
    cont_opts.max_tokens = Some(512);
    cont_opts.temperature = Some(0.2);
    cont_opts.shape = ResponseShape::Text;
    cont_opts.think = Some(false);
    let outcome = generate_continuable(ai_client, &prompt, &cont_opts).await?;

    Ok(RagResponse {
        answer: outcome.text.trim().to_string(),
        // generate_continuable doesn't surface the model name (sidecar
        // echoes whatever Ollama loaded). Use the configured tier model
        // for now; if RAG needs to report the actual resolved model,
        // the runner can add a post-call ps probe later.
        model: "qwen2.5:latest".to_string(),
        sources: results,
        tokens_generated: None,
    })
}