root 26fc98c885 Phase 7: Vector index + RAG pipeline
- vectord crate: chunk → embed → store → search → RAG
- chunker: configurable chunk size + overlap, sentence-boundary aware splitting
- store: embeddings as Parquet (binary blob f32 vectors), portable format
- search: brute-force cosine similarity (works up to ~100K vectors)
- rag: full pipeline — embed question → search index → retrieve context → LLM answer
- Endpoints: POST /vectors/index, /vectors/search, /vectors/rag
- Gateway wired with vectord service
- Tested: 200 candidate resumes indexed in 5.4s, semantic search + RAG working
- 20 unit tests passing (chunker, search, ingestd, shared)
- AI gives honest "no match found" when context doesn't support an answer

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 08:12:28 -05:00

108 lines
3.0 KiB
Rust

//! Brute-force vector search with cosine similarity.
//! Works well up to ~100K vectors. HNSW index would go here for larger scale.
use crate::store::StoredEmbedding;
/// A search result with score.
///
/// Carries a copy of the matched chunk's identifying fields (taken from the
/// stored embedding) plus the similarity score, ready to serialize in an
/// HTTP response.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchResult {
    /// Source the chunk was ingested from (copied from the stored embedding).
    pub source: String,
    /// Identifier of the parent document.
    pub doc_id: String,
    /// Position of this chunk within the parent document.
    pub chunk_idx: u32,
    /// The chunk's text, returned so callers can build RAG context directly.
    pub chunk_text: String,
    /// Cosine similarity against the query vector (1.0 = identical direction;
    /// 0.0 for orthogonal, dimension-mismatched, or zero-magnitude vectors).
    pub score: f32,
}
/// Search embeddings by cosine similarity. Returns top_k results.
///
/// Scores every embedding against `query_vector` and returns up to `top_k`
/// results sorted by descending score. A zero-magnitude query has no
/// direction to compare against, so it yields an empty result set rather
/// than a division by zero.
pub fn search(
    query_vector: &[f32],
    embeddings: &[StoredEmbedding],
    top_k: usize,
) -> Vec<SearchResult> {
    // The query norm is loop-invariant: compute it once up front.
    let query_norm = norm(query_vector);
    if query_norm == 0.0 || top_k == 0 {
        // top_k == 0 would end in truncate(0) anyway; bail early to avoid
        // cloning every embedding's strings just to throw them all away.
        return vec![];
    }
    let mut scored: Vec<SearchResult> = embeddings
        .iter()
        .map(|emb| SearchResult {
            source: emb.source.clone(),
            doc_id: emb.doc_id.clone(),
            chunk_idx: emb.chunk_idx,
            chunk_text: emb.chunk_text.clone(),
            score: cosine_similarity(query_vector, &emb.vector, query_norm),
        })
        .collect();
    // Sort descending by score. total_cmp imposes a total order on f32
    // (NaN sorts deterministically), unlike partial_cmp + unwrap_or(Equal),
    // which silently treats any NaN score as equal to everything and makes
    // the ranking order nondeterministic. sort_unstable_by is faster and
    // allocation-free; stability is irrelevant for ranked results.
    scored.sort_unstable_by(|a, b| b.score.total_cmp(&a.score));
    scored.truncate(top_k);
    scored
}
/// Cosine similarity between `a` and `b`, with `a`'s norm precomputed by the
/// caller (it is constant across a whole search).
///
/// Degenerate inputs — mismatched dimensions or a zero-magnitude `b` —
/// score 0.0 instead of producing NaN, so they simply never match.
fn cosine_similarity(a: &[f32], b: &[f32], a_norm: f32) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }
    // One pass over the pair accumulates both the dot product and |b|².
    let (dot, b_sq) = a
        .iter()
        .zip(b)
        .fold((0.0f32, 0.0f32), |(d, s), (x, y)| (d + x * y, s + y * y));
    let b_norm = b_sq.sqrt();
    if b_norm == 0.0 {
        0.0
    } else {
        dot / (a_norm * b_norm)
    }
}
/// Euclidean (L2) norm of `v`. Returns 0.0 for an empty slice.
fn norm(v: &[f32]) -> f32 {
    let sum_of_squares = v.iter().fold(0.0f32, |acc, x| acc + x * x);
    sum_of_squares.sqrt()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a minimal embedding fixture; only `doc_id`, `chunk_text`, and
    /// `vector` vary across these tests.
    fn emb(doc_id: &str, text: &str, vector: Vec<f32>) -> StoredEmbedding {
        StoredEmbedding {
            source: "test".into(),
            doc_id: doc_id.into(),
            chunk_idx: 0,
            chunk_text: text.into(),
            vector,
        }
    }

    #[test]
    fn identical_vectors_score_1() {
        let query = vec![1.0, 2.0, 3.0];
        let hits = search(&query, &[emb("1", "hello", query.clone())], 1);
        assert!((hits[0].score - 1.0).abs() < 0.001);
    }

    #[test]
    fn orthogonal_vectors_score_0() {
        let hits = search(&[1.0, 0.0], &[emb("1", "hello", vec![0.0, 1.0])], 1);
        assert!(hits[0].score.abs() < 0.001);
    }

    #[test]
    fn returns_top_k() {
        // Ten docs whose alignment with the query fades from 1.0 down to 0.1,
        // so scores must come back in strictly non-increasing order.
        let corpus: Vec<StoredEmbedding> = (0..10)
            .map(|i| {
                let t = i as f32 * 0.1;
                emb(&format!("{i}"), &format!("doc {i}"), vec![1.0 - t, t, 0.0])
            })
            .collect();
        let hits = search(&[1.0, 0.0, 0.0], &corpus, 3);
        assert_eq!(hits.len(), 3);
        assert!(hits[0].score >= hits[1].score);
        assert!(hits[1].score >= hits[2].score);
    }
}