vectord crate: chunk → embed → store → search → RAG

- chunker: configurable chunk size + overlap, sentence-boundary aware splitting (see the first sketch below)
- store: embeddings as Parquet (binary blob f32 vectors), portable format
- search: brute-force cosine similarity (works up to ~100K vectors; see the second sketch below)
- rag: full pipeline — embed question → search index → retrieve context → LLM answer
- Endpoints: POST /vectors/index, /vectors/search, /vectors/rag
- Gateway wired with vectord service
- Tested: 200 candidate resumes indexed in 5.4s, semantic search + RAG working
- 20 unit tests passing (chunker, search, ingestd, shared)
- AI gives honest "no match found" when context doesn't support an answer

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
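Two quick sketches of the pieces named above, re-derived from the bullet descriptions rather than taken from the crate: `chunk`, `cosine`, and `search` here are illustrative stand-ins, and the real implementations may differ in units (characters vs. tokens) and signatures.

A sentence-boundary-aware chunker with configurable size and overlap might look like:

```rust
/// Split `text` into chunks of at most `chunk_size` characters, breaking only
/// at sentence boundaries and repeating the last `overlap_sentences` sentences
/// of each chunk at the start of the next. Illustrative sketch only.
fn chunk(text: &str, chunk_size: usize, overlap_sentences: usize) -> Vec<String> {
    // Naive sentence split on '.', '!' and '?'; a production splitter would
    // also handle abbreviations, decimals, and quotes.
    let mut sentences: Vec<String> = Vec::new();
    let mut cur = String::new();
    for ch in text.chars() {
        cur.push(ch);
        if matches!(ch, '.' | '!' | '?') {
            sentences.push(cur.trim().to_string());
            cur.clear();
        }
    }
    if !cur.trim().is_empty() {
        sentences.push(cur.trim().to_string());
    }

    // Greedily pack whole sentences into chunks, overlapping the tail.
    let mut chunks = Vec::new();
    let mut start = 0;
    while start < sentences.len() {
        let mut end = start;
        let mut len = 0;
        while end < sentences.len() && len + sentences[end].len() <= chunk_size {
            len += sentences[end].len() + 1; // +1 for the joining space
            end += 1;
        }
        let end = end.max(start + 1); // always take at least one sentence
        chunks.push(sentences[start..end].join(" "));
        if end == sentences.len() {
            break;
        }
        // Step back by the overlap, but always make forward progress.
        start = end.saturating_sub(overlap_sentences).max(start + 1);
    }
    chunks
}
```

Brute-force search is a full scan: one cosine score per stored vector per query, O(n·d), which is why it stays practical only up to roughly 100K vectors before an approximate index becomes worthwhile:

```rust
/// Cosine similarity between two equal-length vectors.
fn cosine(a: &[f32], b: &[f32]) -> f32 {
    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let norm_a = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b = b.iter().map(|x| x * x).sum::<f32>().sqrt();
    if norm_a == 0.0 || norm_b == 0.0 { 0.0 } else { dot / (norm_a * norm_b) }
}

/// Score the query against every stored vector, returning the `top_k`
/// (index, score) pairs, best first.
fn search(query: &[f32], vectors: &[Vec<f32>], top_k: usize) -> Vec<(usize, f32)> {
    let mut scored: Vec<(usize, f32)> = vectors
        .iter()
        .enumerate()
        .map(|(i, v)| (i, cosine(query, v)))
        .collect();
    scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
    scored.truncate(top_k);
    scored
}
```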
85 lines · 2.7 KiB · Rust
//! RAG pipeline: question → embed → search → retrieve → generate answer.

use object_store::ObjectStore;
use std::sync::Arc;

use aibridge::client::{AiClient, EmbedRequest, GenerateRequest};
use crate::search::{self, SearchResult};
use crate::store;
/// Full RAG answer with provenance.
#[derive(Debug, Clone, serde::Serialize)]
pub struct RagResponse {
    pub answer: String,
    pub model: String,
    pub sources: Vec<SearchResult>,
    pub tokens_generated: Option<u64>,
}

/// Execute full RAG: embed question → search index → retrieve context → generate answer.
pub async fn query(
    question: &str,
    index_name: &str,
    top_k: usize,
    object_store: &Arc<dyn ObjectStore>,
    ai_client: &AiClient,
) -> Result<RagResponse, String> {
    // 1. Embed the question
    tracing::info!("RAG: embedding question");
    let embed_resp = ai_client.embed(EmbedRequest {
        texts: vec![question.to_string()],
        model: None,
    }).await?;

    if embed_resp.embeddings.is_empty() {
        return Err("no embedding returned for question".into());
    }

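    // The embedding service returns f64 values; narrow them to f32 to match
    // the f32 vectors stored in the Parquet index.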
    let query_vec: Vec<f32> = embed_resp.embeddings[0].iter().map(|&x| x as f32).collect();

    // 2. Load index and search
    tracing::info!("RAG: searching index '{index_name}'");
    let embeddings = store::load_embeddings(object_store, index_name).await?;
    let results = search::search(&query_vec, &embeddings, top_k);

    if results.is_empty() {
        return Ok(RagResponse {
            answer: "No relevant information found.".into(),
            model: String::new(),
            sources: vec![],
            tokens_generated: None,
        });
    }

    // 3. Build context from retrieved chunks
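    // (numbered so the model can cite each one as [1], [2], … per the prompt below)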
    let context: String = results.iter().enumerate().map(|(i, r)| {
        format!("[{}] (source: {}, doc: {}) {}", i + 1, r.source, r.doc_id, r.chunk_text)
    }).collect::<Vec<_>>().join("\n\n");

    // 4. Generate answer
    tracing::info!("RAG: generating answer from {} chunks", results.len());
    let prompt = format!(
        "You are a helpful assistant answering questions based on retrieved documents from a data system.\n\n\
         Use ONLY the following context to answer. If the context doesn't contain enough information, say so.\n\
         Cite sources by their number [1], [2], etc.\n\n\
         Context:\n{context}\n\n\
         Question: {question}\n\n\
         Answer:"
    );

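    // Low temperature and a modest token cap keep the answer short and close
    // to the retrieved context.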
    let gen_resp = ai_client.generate(GenerateRequest {
        prompt,
        model: None,
        system: None,
        temperature: Some(0.2),
        max_tokens: Some(512),
    }).await?;

    Ok(RagResponse {
        answer: gen_resp.text.trim().to_string(),
        model: gen_resp.model,
        sources: results,
        tokens_generated: gen_resp.tokens_generated,
    })
}
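A hypothetical caller, assuming this module is exposed as `vectord::rag` and that the `AiClient` and object store are constructed elsewhere; the question, the index name "resumes", and `top_k = 5` are example values, not taken from the crate:

```rust
use std::sync::Arc;

use aibridge::client::AiClient;
use object_store::ObjectStore;

async fn demo(object_store: Arc<dyn ObjectStore>, ai_client: AiClient) -> Result<(), String> {
    let resp = vectord::rag::query(
        "Which candidates have Rust experience?",
        "resumes", // example index name
        5,         // example top_k
        &object_store,
        &ai_client,
    )
    .await?;

    // Print the answer plus the provenance carried in `sources`.
    println!("answer ({}): {}", resp.model, resp.answer);
    for (i, src) in resp.sources.iter().enumerate() {
        println!("  [{}] source: {}, doc: {}", i + 1, src.source, src.doc_id);
    }
    Ok(())
}
```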