commit 5b1fcf6d27 Phase 28-36 body of work
Accumulated since a6f12e2 (Phase 21 Rust port + Phase 27 versioning):

- Phase 36: embed_semaphore on VectorState (permits=1) serializes
  seed embed calls — prevents sidecar socket collisions under
  concurrent /seed stress load
- Phase 31+: run_stress.ts 6-task diverse stress scaffolding;
  run_e2e_rated.ts + orchestrator.ts tightening
- Catalog dedupe cleanup: 16 duplicate manifests removed; canonical
  candidates.parquet (10.5MB -> 76KB) + placements.parquet (1.2MB ->
  11KB) regenerated post-dedupe; fresh manifests for active datasets
- vectord: harness EvalSet refinements (+181), agent portfolio
  rotation + ingest triggers (+158), autotune + rag adjustments
- catalogd/storaged/ingestd/mcp-server: misc tightening
- docs: Phase 28-36 PRD entries + DECISIONS ADR additions;
  control-plane pivot banner added to top of docs/PRD.md (pointing
  at docs/CONTROL_PLANE_PRD.md which lands in next commit)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 02:41:15 -05:00

184 lines
7.0 KiB
Rust

/// RAG pipeline: question → embed → search → rerank → generate answer.
///
/// The rerank step (added 2026-04-17) uses the LLM as a cross-encoder
/// between retrieval and generation. This catches cases where the
/// embedding model scores documents as similar but the content isn't
/// actually relevant to the question — a known weakness of small
/// general-purpose embed models on domain-specific text.
use object_store::ObjectStore;
use std::sync::Arc;
use aibridge::client::{AiClient, EmbedRequest, GenerateRequest};
use aibridge::continuation::{generate_continuable, ContinuableOpts, ResponseShape};
use crate::search::{self, SearchResult};
use crate::store;
/// Cross-encoder rerank: ask the LLM to re-sort retrieved chunks by
/// relevance. Falls back to the original order if the model returns
/// garbage (which happens ~5% of the time with 7B models).
///
/// Takes ownership of `results` and returns a permutation of it — every
/// input chunk appears exactly once in the output, ranked ones first.
async fn rerank(
    question: &str,
    results: Vec<SearchResult>,
    ai_client: &AiClient,
) -> Vec<SearchResult> {
    // Zero or one chunk: nothing to reorder.
    if results.len() <= 1 {
        return results;
    }
    // Present each chunk as "[idx] first-200-chars". chars() (not a byte
    // slice) keeps the truncation UTF-8-safe.
    let chunk_list: String = results
        .iter()
        .enumerate()
        .map(|(i, r)| {
            let text: String = r.chunk_text.chars().take(200).collect();
            format!("[{i}] {text}")
        })
        .collect::<Vec<_>>()
        .join("\n");
    let resp = ai_client
        .generate(GenerateRequest {
            prompt: format!(
                "Rank these text chunks by relevance to the question.\n\
                 Return ONLY a comma-separated list of indices, most relevant first.\n\n\
                 Question: {question}\n\n\
                 Chunks:\n{chunk_list}\n\n\
                 Ranking:"
            ),
            model: None,
            system: None,
            temperature: Some(0.0),
            max_tokens: Some(50),
            // Reranker returns a comma-separated int list — pure structured
            // output, zero benefit from hidden reasoning. Opt out to avoid
            // the empty-response failure mode Phase 21 catalogued.
            think: Some(false),
        })
        .await;
    match resp {
        Ok(gen_resp) => {
            let text = gen_resp.text.trim();
            // Accept commas and/or whitespace as separators; silently drop
            // tokens that don't parse or are out of range.
            let indices: Vec<usize> = text
                .split(|c: char| c == ',' || c.is_whitespace())
                .filter_map(|s| s.trim().parse::<usize>().ok())
                .filter(|&i| i < results.len())
                .collect();
            if indices.is_empty() {
                tracing::debug!("reranker returned unparseable output: {text}");
                return results;
            }
            // Move (not clone) each result into ranked order: take() empties
            // the slot, which both avoids a per-chunk clone and deduplicates
            // any repeated indices the model emits.
            let mut slots: Vec<Option<SearchResult>> = results.into_iter().map(Some).collect();
            let total = slots.len();
            let mut reranked: Vec<SearchResult> = Vec::with_capacity(total);
            for &i in &indices {
                if let Some(r) = slots[i].take() {
                    reranked.push(r);
                }
            }
            // Distinct, in-range indices actually honored — indices.len()
            // would over-count when the model repeats an index.
            let reordered = reranked.len();
            // Append any the model didn't mention (preserves all results).
            reranked.extend(slots.into_iter().flatten());
            tracing::info!("reranker reordered {}/{} chunks", reordered, total);
            reranked
        }
        Err(e) => {
            tracing::debug!("reranker failed (using original order): {e}");
            results
        }
    }
}
/// Full RAG answer with provenance.
///
/// Serialized (via serde) as the RAG endpoint's response payload.
#[derive(Debug, Clone, serde::Serialize)]
pub struct RagResponse {
    /// Generated answer text (trimmed; cites sources as [1], [2], …).
    pub answer: String,
    /// Model used for the answer; empty string when no results were found.
    pub model: String,
    /// Retrieved (and reranked) chunks the answer was grounded on.
    pub sources: Vec<SearchResult>,
    /// Token count from the generator, when the backend reports one.
    /// NOTE(review): not currently populated by `query` — always `None` there.
    pub tokens_generated: Option<u64>,
}
/// Execute full RAG: embed question → search index → retrieve context → generate answer.
///
/// Steps: (1) embed the question, (2) vector-search `index_name` for the
/// `top_k` nearest chunks, (3) LLM cross-encoder rerank, (4) build a cited
/// context block, (5) generate the final answer via the Phase 21
/// continuation-aware primitive.
///
/// # Errors
/// Returns `Err` when embedding fails, the embed response is empty, the
/// index cannot be loaded, or answer generation fails. An empty search
/// result is NOT an error — it yields a "No relevant information found."
/// response with no sources.
pub async fn query(
    question: &str,
    index_name: &str,
    top_k: usize,
    object_store: &Arc<dyn ObjectStore>,
    ai_client: &AiClient,
) -> Result<RagResponse, String> {
    // Single source of truth for the answer model — used both for the
    // continuation call and the model reported in RagResponse, so the two
    // can't drift apart.
    const ANSWER_MODEL: &str = "qwen2.5:latest";
    // 1. Embed the question
    tracing::info!("RAG: embedding question");
    let embed_resp = ai_client
        .embed(EmbedRequest {
            texts: vec![question.to_string()],
            model: None,
        })
        .await?;
    if embed_resp.embeddings.is_empty() {
        return Err("no embedding returned for question".into());
    }
    let query_vec: Vec<f32> = embed_resp.embeddings[0].iter().map(|&x| x as f32).collect();
    // 2. Load index and search
    tracing::info!("RAG: searching index '{index_name}'");
    let embeddings = store::load_embeddings(object_store, index_name).await?;
    let results = search::search(&query_vec, &embeddings, top_k);
    if results.is_empty() {
        return Ok(RagResponse {
            answer: "No relevant information found.".into(),
            model: String::new(),
            sources: vec![],
            tokens_generated: None,
        });
    }
    // 3. Rerank: ask the LLM to sort retrieved chunks by relevance.
    // This cross-encoder step catches cases where embedding similarity
    // is high but semantic relevance is low — common with small embed
    // models on domain-specific text.
    let results = rerank(question, results, ai_client).await;
    // 4. Build context from (reranked) top chunks; numbered [1..] so the
    // prompt's citation instruction lines up with these labels.
    let context: String = results
        .iter()
        .enumerate()
        .map(|(i, r)| {
            format!("[{}] (source: {}, doc: {}) {}", i + 1, r.source, r.doc_id, r.chunk_text)
        })
        .collect::<Vec<_>>()
        .join("\n\n");
    // 5. Generate answer
    tracing::info!("RAG: generating answer from {} chunks", results.len());
    let prompt = format!(
        "You are a helpful assistant answering questions about a staffing database.\n\n\
         Use ONLY the following retrieved records to answer. Be specific — cite names,\n\
         numbers, skills, and cities from the records. If the context doesn't contain\n\
         enough information to fully answer, say what you can and note what's missing.\n\
         Cite sources by their number [1], [2], etc.\n\n\
         Context:\n{context}\n\n\
         Question: {question}\n\n\
         Answer:"
    );
    // Route the answer call through Phase 21's generate_continuable so
    // a thinking-model empty-response or a mid-JSON truncation self-
    // repairs instead of silently returning half an answer. Shape is
    // Text (the answer is prose, not JSON), think is Some(false) to
    // opt out of hidden reasoning on the hot path. This is the first
    // production caller of the Phase 21 primitives — see audit finding
    // "Phase 21 Rust primitives are wired but not CALLED by any
    // production surface" from 2026-04-21.
    let mut cont_opts = ContinuableOpts::new(ANSWER_MODEL);
    cont_opts.max_tokens = Some(512);
    cont_opts.temperature = Some(0.2);
    cont_opts.shape = ResponseShape::Text;
    cont_opts.think = Some(false);
    let outcome = generate_continuable(ai_client, &prompt, &cont_opts).await?;
    Ok(RagResponse {
        answer: outcome.text.trim().to_string(),
        // generate_continuable doesn't surface the model name (sidecar
        // echoes whatever Ollama loaded). Use the configured tier model
        // for now; if RAG needs to report the actual resolved model,
        // the runner can add a post-call ps probe later.
        model: ANSWER_MODEL.to_string(),
        sources: results,
        tokens_generated: None,
    })
}