// Tests for the "./relevance" module: symbol extraction, per-chunk relevance
// scoring, chunk filtering, and the tokenize/jaccard helpers.
import { test, expect } from "bun:test";
import {
  scoreRelevance,
  filterChunks,
  extractDefinedSymbols,
  extractImportedSymbols,
  jaccard,
  tokenize,
} from "./relevance";

// Rust-flavoured focus fixture shared by most tests below. It contains three
// `use` statements, one `pub struct`, one `pub async fn` and one `pub fn`.
// NOTE(review): the literal is kept on one line exactly as found; if the
// extractors rely on line anchors, confirm whether it was meant to be multi-line.
const RUST_FOCUS = ` use queryd::context::build_context; use catalogd::Registry; use shared::types::{Tombstone, ModelProfile}; pub struct GatewayState { catalog: Registry, } pub async fn handle_query(state: &GatewayState, sql: &str) -> Result { let ctx = build_context(&state.catalog).await?; ctx.sql(sql).await.map(QueryResponse::from) } pub fn shutdown(state: GatewayState) { drop(state); } `;

// Names introduced by `pub fn` / `pub struct` in the fixture must be reported.
test("extractDefinedSymbols pulls pub fn / struct names", () => {
  const syms = extractDefinedSymbols(RUST_FOCUS);
  expect(syms).toContain("handle_query");
  expect(syms).toContain("shutdown");
  expect(syms).toContain("GatewayState");
});

// Leaf names of the fixture's `use` statements must be reported.
test("extractImportedSymbols pulls names from use statements", () => {
  const syms = extractImportedSymbols(RUST_FOCUS);
  expect(syms).toContain("build_context");
  expect(syms).toContain("Registry");
  expect(syms).toContain("Tombstone");
  expect(syms).toContain("ModelProfile");
  // Should not include keywords
  expect(syms).not.toContain("use");
  expect(syms).not.toContain("crate");
});

// The chunk's doc_id starts with the focus path; its text is unrelated.
test("path_match dominates when chunk encodes focus path", () => {
  const focus = { path: "crates/gateway/src/main.rs", content: RUST_FOCUS };
  const chunk = {
    source: "distilled_factual_v20260423095819",
    doc_id: "crates/gateway/src/main.rs:42",
    text: "Some chunk content unrelated to anything",
    score: 0.5,
  };
  const { score, reasons } = scoreRelevance(focus, chunk);
  expect(score).toBeGreaterThanOrEqual(1.0);
  expect(reasons).toContain("path_match");
});

test("import_only adjacency pollution gets penalized", () => {
  // Chunk talks about queryd::context::build_context (imported by focus)
  // but never mentions any focus-defined symbol — classic pollution.
  const focus = { path: "crates/gateway/src/main.rs", content: RUST_FOCUS };
  const chunk = {
    source: "distilled_procedural_v20260423102847",
    doc_id: "proc_8421",
    text: "When build_context fails the Registry must be invalidated. The Tombstone fields drive the merge-on-read filter — caller should not retry on stale fingerprints.",
    score: 0.65,
  };
  const { score, reasons } = scoreRelevance(focus, chunk);
  expect(reasons.some(r => r.startsWith("import_only("))).toBe(true);
  expect(score).toBeLessThan(0.3); // below default threshold → dropped
});

// The chunk text names symbols the fixture defines (handle_query, GatewayState, shutdown).
test("defined_match keeps a chunk that's actually about the focus", () => {
  const focus = { path: "crates/gateway/src/main.rs", content: RUST_FOCUS };
  const chunk = {
    source: "distilled_factual_v20260423095819",
    doc_id: "fact_12",
    text: "handle_query in GatewayState must return QueryResponse, not anyhow::Error. The shutdown path drops state synchronously.",
    score: 0.4,
  };
  const { score, reasons } = scoreRelevance(focus, chunk);
  expect(reasons.some(r => r.startsWith("defined_match"))).toBe(true);
  expect(score).toBeGreaterThan(0.3); // above threshold → kept
});

// Four chunks, one per expected outcome; the result must split them 2 kept / 2 dropped.
test("filterChunks bucket-sorts kept vs dropped", () => {
  const focus = { path: "crates/gateway/src/main.rs", content: RUST_FOCUS };
  const chunks = [
    { source: "x", doc_id: "crates/gateway/src/main.rs:1", text: "anything", score: 0.5 }, // path_match — kept
    { source: "x", doc_id: "y", text: "build_context Tombstone Registry adjacent only", score: 0.7 }, // import_only — dropped
    { source: "x", doc_id: "z", text: "handle_query and GatewayState are at fault here", score: 0.4 }, // defined_match — kept
    { source: "x", doc_id: "w", text: "completely unrelated content about chicago permits", score: 0.6 }, // nothing — dropped
  ];
  const result = filterChunks(focus, chunks);
  expect(result.kept.length).toBe(2);
  expect(result.dropped.length).toBe(2);
  expect(result.kept.map(c => c.doc_id)).toContain("crates/gateway/src/main.rs:1");
  expect(result.kept.map(c => c.doc_id)).toContain("z");
});

// The same chunk is filtered with an explicit third argument of 0.95 and then 0.1.
test("threshold override changes filter behavior", () => {
  const focus = { path: "crates/queryd/src/x.rs", content: "pub fn foo() {}" };
  const weak = { source: "x", doc_id: "y", text: "foo is referenced here briefly", score: 0.2 };
  const result_strict = filterChunks(focus, [weak], 0.95);
  const result_loose = filterChunks(focus, [weak], 0.1);
  expect(result_strict.kept.length).toBe(0);
  expect(result_loose.kept.length).toBe(1);
});

// Plain-prose focus: only a positive score and a "token_overlap" reason are expected.
test("empty defined/imported gracefully scores by tokens only", () => {
  const focus = { path: "doc.md", content: "This is plain prose about welders in Chicago." };
  const chunk = { source: "x", doc_id: "y", text: "Welders working in Chicago need OSHA certs.", score: 0.5 };
  const { score, reasons } = scoreRelevance(focus, chunk);
  expect(score).toBeGreaterThan(0);
  expect(reasons.some(r => r.startsWith("token_overlap"))).toBe(true);
});

// Two sentences sharing some words: similarity must be strictly between 0 and 1.
test("jaccard / tokenize basic sanity", () => {
  const a = tokenize("the quick brown fox jumps over the lazy dog");
  const b = tokenize("a fast brown wolf runs over a tired dog");
  expect(a.has("the")).toBe(false); // stopword
  expect(a.has("brown")).toBe(true);
  const j = jaccard(a, b);
  expect(j).toBeGreaterThan(0);
  expect(j).toBeLessThan(1);
});