package matrix

import (
	"strings"
	"testing"
)

// TestTokenize checks that Tokenize returns exactly the expected set of
// tokens for each input: the set size must match and every expected token
// must be present as a key.
func TestTokenize(t *testing.T) {
	cases := []struct {
		text string
		// want lists the expected tokens; order is irrelevant because the
		// check below is set membership plus a size comparison.
		want []string
	}{
		{"", nil},
		// Stopwords dropped.
		{"the quick brown fox", []string{"quick", "brown", "fox"}},
		// Lowercased.
		{"hello WORLD", []string{"hello", "world"}},
		// All under 3 chars.
		{"a b c", nil},
		// "struct" is a stopword, identifiers OK.
		{"struct Foo", []string{"foo"}},
		// db.go: "db" is 2 chars, "go" is 2 chars.
		{"crates/queryd/db.go", []string{"crates", "queryd"}},
	}
	for _, c := range cases {
		got := Tokenize(c.text)
		if len(got) != len(c.want) {
			t.Errorf("Tokenize(%q): want %d tokens %v, got %d %v",
				c.text, len(c.want), c.want, len(got), got)
			continue
		}
		for _, w := range c.want {
			if _, ok := got[w]; !ok {
				t.Errorf("Tokenize(%q): missing token %q in %v", c.text, w, got)
			}
		}
	}
}

// TestJaccard checks Jaccard against hand-computed values for empty,
// identical, disjoint and partially overlapping token sets. Each case
// carries its own tolerance.
func TestJaccard(t *testing.T) {
	// mk builds a token set from its arguments. The loop variable is named
	// tok so it does not shadow the enclosing *testing.T.
	mk := func(tokens ...string) map[string]struct{} {
		m := make(map[string]struct{}, len(tokens))
		for _, tok := range tokens {
			m[tok] = struct{}{}
		}
		return m
	}
	cases := []struct {
		name    string
		a, b    map[string]struct{}
		want    float64
		epsilon float64
	}{
		{"both empty", mk(), mk(), 0, 0},
		{"a empty", mk(), mk("x"), 0, 0},
		{"identical", mk("x", "y"), mk("x", "y"), 1, 0},
		{"disjoint", mk("a", "b"), mk("c", "d"), 0, 0},
		{"half overlap", mk("a", "b"), mk("b", "c"), 1.0 / 3.0, 0.001},
	}
	for _, c := range cases {
		got := Jaccard(c.a, c.b)
		if got < c.want-c.epsilon || got > c.want+c.epsilon {
			t.Errorf("%s: want %.3f, got %.3f", c.name, c.want, got)
		}
	}
}

// TestExtractDefinedSymbols feeds Rust-style and TypeScript-style snippets
// to ExtractDefinedSymbols and checks that the pub/export declarations are
// reported while the non-pub/non-export ones are not.
func TestExtractDefinedSymbols(t *testing.T) {
	// One declaration per line; the trailing // notes are part of the
	// fixture text, not Go comments.
	rust := `
pub fn search_chunks(query: &str) -> Vec { todo!() }
pub async fn build_index() {}
pub struct ChunkRegistry {}
pub enum Distance { Cosine, Euclidean }
pub trait Searcher {}
pub const MAX_K: usize = 1000;
pub type ChunkMap = HashMap;

fn private_helper() {} // not pub, must NOT match
struct PrivateOnly {} // not pub, must NOT match
`
	got := ExtractDefinedSymbols(rust)
	want := []string{"search_chunks", "build_index", "ChunkRegistry",
		"Distance", "Searcher", "MAX_K", "ChunkMap"}
	if len(got) != len(want) {
		t.Errorf("Rust extract: want %v, got %v", want, got)
	}
	for _, w := range want {
		if !contains(got, w) {
			t.Errorf("Rust: missing %q in %v", w, got)
		}
	}
	// Negative cases — these should NOT match.
	for _, neg := range []string{"private_helper", "PrivateOnly"} {
		if contains(got, neg) {
			t.Errorf("Rust: should not match %q in %v", neg, got)
		}
	}

	ts := `
export function tokenize(text: string) {}
export async function loadCorpus() {}
export class IndexRegistry {}
export interface FocusFile {}
export const STOPWORDS = new Set();
export let counter = 0;

function privateTs() {} // not export, must NOT match
class Internal {} // not export, must NOT match
`
	got = ExtractDefinedSymbols(ts)
	want = []string{"tokenize", "loadCorpus", "IndexRegistry", "FocusFile", "STOPWORDS", "counter"}
	for _, w := range want {
		if !contains(got, w) {
			t.Errorf("TS: missing %q in %v", w, got)
		}
	}
	for _, neg := range []string{"privateTs", "Internal"} {
		if contains(got, neg) {
			t.Errorf("TS: should not match %q in %v", neg, got)
		}
	}
}

// TestExtractImportedSymbols checks that ExtractImportedSymbols reports the
// identifiers named in Rust `use` lines and TypeScript `import` lines, and
// that the Rust keywords themselves are not reported.
func TestExtractImportedSymbols(t *testing.T) {
	rust := `
use catalogd::Registry;
use vectord::{Index, IndexParams};
use std::collections::HashMap;
`
	got := ExtractImportedSymbols(rust)
	for _, w := range []string{"catalogd", "Registry", "vectord", "Index", "IndexParams", "collections", "HashMap"} {
		if !contains(got, w) {
			t.Errorf("Rust use: missing %q in %v", w, got)
		}
	}
	for _, neg := range []string{"use", "as"} {
		if contains(got, neg) {
			t.Errorf("Rust use: should not match keyword %q in %v", neg, got)
		}
	}

	ts := `
import { tokenize, jaccard } from "./relevance";
import express from "express";
`
	got = ExtractImportedSymbols(ts)
	for _, w := range []string{"tokenize", "jaccard", "express"} {
		if !contains(got, w) {
			t.Errorf("TS import: missing %q in %v", w, got)
		}
	}
}

// TestFilePrefix checks FilePrefix on a deep path, a bare file name, a
// four-segment path and the empty string.
func TestFilePrefix(t *testing.T) {
	cases := []struct {
		path, want string
	}{
		{"crates/queryd/src/foo.rs", "crates/queryd"},
		{"top.rs", "top.rs"},
		{"a/b/c/d", "a/b"},
		{"", ""},
	}
	for _, c := range cases {
		got := FilePrefix(c.path)
		if got != c.want {
			t.Errorf("FilePrefix(%q): want %q, got %q", c.path, c.want, got)
		}
	}
}

// TestScoreRelevance_PathMatch checks that a chunk whose text contains the
// focus path scores at least 1.0 and lists "path_match" among its reasons.
func TestScoreRelevance_PathMatch(t *testing.T) {
	focus := FocusFile{Path: "crates/queryd/db.go"}
	chunk := CandidateChunk{
		Source: "lakehouse_arch_v1",
		DocID:  "phase:queryd",
		Text:   "code at crates/queryd/db.go does X",
	}
	score, reasons := ScoreRelevance(focus, chunk)
	if score < 1.0 {
		t.Errorf("path_match should give >=1.0; got %.2f reasons=%v", score, reasons)
	}
	if !contains(reasons, "path_match") {
		t.Errorf("expected path_match in reasons: %v", reasons)
	}
}

// TestScoreRelevance_ImportPenalty checks the import-only penalty: the focus
// defines Foo while the chunk only mentions Bar (imported), so the reasons
// must include "import_only(1)" and the net score must be negative.
func TestScoreRelevance_ImportPenalty(t *testing.T) {
	focus := FocusFile{
		Path:            "crates/foo/main.go",
		Content:         "pub fn run() {}\npub struct Foo {}\nuse barlib::Bar;\n",
		DefinedSymbols:  []string{"Foo"},
		ImportedSymbols: []string{"Bar"},
	}
	chunk := CandidateChunk{
		Source: "barlib_corpus",
		DocID:  "barlib:Bar:42",
		Text:   "Bar handles the actual lookup logic and returns a Result.",
	}
	score, reasons := ScoreRelevance(focus, chunk)
	if !contains(reasons, "import_only(1)") {
		t.Errorf("expected import_only penalty: reasons=%v score=%.2f", reasons, score)
	}
	if score >= 0 {
		// Without other positive signals, score should be net-negative.
		t.Errorf("expected negative net score; got %.2f reasons=%v", score, reasons)
	}
}

// TestFilterChunks_ThresholdSplitsKeptDropped checks that FilterChunks
// splits its input into Kept and Dropped around the threshold, reports the
// input count in TotalIn and echoes the focus path in FocusPath.
func TestFilterChunks_ThresholdSplitsKeptDropped(t *testing.T) {
	focus := FocusFile{Path: "crates/queryd/db.go"}
	chunks := []CandidateChunk{
		// Path match → kept.
		{Source: "code", DocID: "queryd:db.go", Text: "crates/queryd/db.go is the focus"},
		// No signal → dropped.
		{Source: "elsewhere", DocID: "phase:0", Text: "no match anywhere"},
	}
	res := FilterChunks(focus, chunks, DefaultRelevanceThreshold)
	if len(res.Kept) != 1 || len(res.Dropped) != 1 {
		t.Errorf("split: kept=%d dropped=%d (want 1/1)", len(res.Kept), len(res.Dropped))
	}
	if res.TotalIn != 2 {
		t.Errorf("TotalIn: want 2, got %d", res.TotalIn)
	}
	if res.FocusPath != focus.Path {
		t.Errorf("FocusPath echo: want %q, got %q", focus.Path, res.FocusPath)
	}
	// Sanity: everything in Kept has Relevance >= threshold, and everything
	// in Dropped is below it.
	for _, c := range res.Kept {
		if c.Relevance < DefaultRelevanceThreshold {
			t.Errorf("kept chunk below threshold: %v", c)
		}
	}
	for _, c := range res.Dropped {
		if c.Relevance >= DefaultRelevanceThreshold {
			t.Errorf("dropped chunk at/above threshold: %v", c)
		}
	}
}

// TestFilterChunks_AdjacencyPollutionScenario is the headline test — the
// exact case the filter exists to catch. The focus file is
// crates/queryd/src/db.go, which defines Connector and imports
// catalogd::Registry. The chunk about Connector must be kept and must
// outrank the chunk about catalogd::Registry (adjacency).
func TestFilterChunks_AdjacencyPollutionScenario(t *testing.T) {
	focus := FocusFile{
		Path: "crates/queryd/src/db.go",
		Content: `
package queryd
import "catalogd"
pub struct Connector {}
pub fn open_connector() *Connector { return nil }
use catalogd::Registry;
`,
	}
	chunks := []CandidateChunk{
		{
			Source: "lakehouse_symbols_v1",
			DocID:  "symbol:queryd::struct::Connector",
			Text:   "Connector wraps the DuckDB handle. open_connector creates one.",
		},
		{
			Source: "lakehouse_symbols_v1",
			DocID:  "symbol:catalogd::struct::Registry",
			Text:   "Registry stores manifests. Used by ingestd and queryd.",
		},
	}
	res := FilterChunks(focus, chunks, DefaultRelevanceThreshold)

	// Connector chunk should be kept (defined_match).
	keptIDs := make([]string, len(res.Kept))
	for i, c := range res.Kept {
		keptIDs[i] = c.DocID
	}
	if !contains(keptIDs, "symbol:queryd::struct::Connector") {
		t.Errorf("expected Connector chunk kept; got %v", keptIDs)
	}

	// The Registry chunk MIGHT pass threshold depending on token_overlap
	// noise (queryd appears in its text too). The load-bearing assertion:
	// Connector ranks above Registry.
	var (
		connectorRel, registryRel float64
		sawConnector, sawRegistry bool
	)
	record := func(docID string, rel float64) {
		if strings.Contains(docID, "Connector") {
			connectorRel, sawConnector = rel, true
		}
		if strings.Contains(docID, "Registry") {
			registryRel, sawRegistry = rel, true
		}
	}
	// Walk Kept and Dropped separately rather than append(Kept, Dropped...):
	// append may write into spare capacity of Kept's backing array.
	for _, c := range res.Kept {
		record(c.DocID, c.Relevance)
	}
	for _, c := range res.Dropped {
		record(c.DocID, c.Relevance)
	}
	if !sawConnector || !sawRegistry {
		t.Fatalf("expected both chunks in result: connector found=%v registry found=%v",
			sawConnector, sawRegistry)
	}
	if connectorRel <= registryRel {
		t.Errorf("Connector should outrank Registry: connector=%.2f registry=%.2f",
			connectorRel, registryRel)
	}
}

// contains reports whether needle is an element of haystack.
func contains(haystack []string, needle string) bool {
	for _, h := range haystack {
		if h == needle {
			return true
		}
	}
	return false
}