From 9588bd82ae9f68bc12bf5fbddd106a48370cd27a Mon Sep 17 00:00:00 2001 From: root Date: Wed, 29 Apr 2026 19:13:22 -0500 Subject: [PATCH] =?UTF-8?q?matrix:=20relevance=20filter=20=E2=80=94=20SPEC?= =?UTF-8?q?=20=C2=A73.4=20component=203=20of=205?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Faithful port of mcp-server/relevance.ts (Rust observer's adjacency- pollution filter). Same 5-signal scoring, same default threshold 0.3. Adds POST /v1/matrix/relevance endpoint via matrixd. Scoring signals (additive, can sign-flip): path_match +1.0 chunk source/doc_id encodes focus.path filename_match +0.6 chunk text mentions focus's filename defined_match +0.6 chunk text mentions focus.defined_symbols token_overlap +0.4 jaccard of non-stopword tokens prefix_match +0.3 chunk source shares first-2-segment prefix import_penalty -0.5 mentions ONLY imported symbols, no defined ones What this does and doesn't do: - DOES filter code-aware corpora (eventually lakehouse_arch_v1, lakehouse_symbols_v1, scrum_findings_v1) — drops chunks about code the focus file IMPORTS rather than DEFINES, the "adjacency pollution" pattern that makes a reviewer LLM hallucinate imported-crate internals as belonging to the focus - DOES NOT meaningfully filter staffing data — the candidates reality test 2026-04-29 had "exact skill match buried at #3" which is a different problem (semantic-only ranking dominated by secondary text). Staffing needs structured filtering (status gates, location gates) that lives outside this package — future work, not in SPEC §3.4 yet Headline smoke assertion: focus = crates/queryd/src/db.go which defines Connector and imports catalogd::Registry. The filter scores: Connector chunk: +0.68 (defined_match fires, kept) Registry chunk: -0.46 (import_only penalty fires, dropped) unrelated junk: 0.00 (no signals, dropped) That's a 1.14-point gap between what we ARE and what we IMPORT — the entire purpose of the filter. Tests: - 9 unit tests in internal/matrix/relevance_test.go covering Tokenize, Jaccard, ExtractDefinedSymbols (Rust + TS), ExtractImportedSymbols, FilePrefix, ScoreRelevance per-signal, FilterChunks threshold splitting, and the headline AdjacencyPollutionScenario - scripts/relevance_smoke.sh integration smoke (3 assertions PASS): adjacency-pollution scenario, empty-chunks 400, threshold honored 13-smoke regression sweep all green (D1-D6, G1, G1P, G2, storaged_cap, pathway, matrix, relevance). Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/matrixd/main.go | 32 ++- internal/matrix/relevance.go | 376 ++++++++++++++++++++++++++++++ internal/matrix/relevance_test.go | 289 +++++++++++++++++++++++ scripts/relevance_smoke.sh | 156 +++++++++++++ 4 files changed, 851 insertions(+), 2 deletions(-) create mode 100644 internal/matrix/relevance.go create mode 100644 internal/matrix/relevance_test.go create mode 100755 scripts/relevance_smoke.sh diff --git a/cmd/matrixd/main.go b/cmd/matrixd/main.go index 427a3f5..dad6d30 100644 --- a/cmd/matrixd/main.go +++ b/cmd/matrixd/main.go @@ -2,8 +2,10 @@ // Retriever with HTTP routes per docs/SPEC.md §3.4. 
 //
 //  Routes:
-//  POST /matrix/search   — multi-corpus retrieve+merge
-//  GET  /matrix/corpora  — list known vectord indexes (proxy)
+//  POST /matrix/search    — multi-corpus retrieve+merge
+//  GET  /matrix/corpora   — list known vectord indexes (proxy)
+//  POST /matrix/relevance — adjacency-pollution filter (CODE-aware;
+//                           port of mcp-server/relevance.ts)
 //
 // matrixd talks to embedd (for query-text embedding) and vectord
 // (for per-corpus search) via HTTP. Both URLs come from
@@ -58,6 +60,7 @@ type handlers struct {
 func (h *handlers) register(r chi.Router) {
 	r.Post("/matrix/search", h.handleSearch)
 	r.Get("/matrix/corpora", h.handleCorpora)
+	r.Post("/matrix/relevance", h.handleRelevance)
 }
 
 func (h *handlers) handleSearch(w http.ResponseWriter, r *http.Request) {
@@ -73,6 +76,31 @@ func (h *handlers) handleSearch(w http.ResponseWriter, r *http.Request) {
 	writeJSON(w, http.StatusOK, resp)
 }
 
+// relevanceRequest is the POST /matrix/relevance body. Threshold
+// defaults to matrix.DefaultRelevanceThreshold when zero (so a
+// literal 0 can't be requested; send a negative threshold to keep
+// every chunk).
+type relevanceRequest struct {
+	Focus     matrix.FocusFile        `json:"focus"`
+	Chunks    []matrix.CandidateChunk `json:"chunks"`
+	Threshold float64                 `json:"threshold,omitempty"`
+}
+
+func (h *handlers) handleRelevance(w http.ResponseWriter, r *http.Request) {
+	var req relevanceRequest
+	if !decodeJSON(w, r, &req) {
+		return
+	}
+	if len(req.Chunks) == 0 {
+		http.Error(w, "chunks must be non-empty", http.StatusBadRequest)
+		return
+	}
+	threshold := req.Threshold
+	if threshold == 0 {
+		threshold = matrix.DefaultRelevanceThreshold
+	}
+	res := matrix.FilterChunks(req.Focus, req.Chunks, threshold)
+	writeJSON(w, http.StatusOK, res)
+}
+
 func (h *handlers) handleCorpora(w http.ResponseWriter, r *http.Request) {
 	names, err := h.r.Corpora(r.Context())
 	if err != nil {
diff --git a/internal/matrix/relevance.go b/internal/matrix/relevance.go
new file mode 100644
index 0000000..bb7a92a
--- /dev/null
+++ b/internal/matrix/relevance.go
@@ -0,0 +1,376 @@
+package matrix
+
+// Heuristic relevance filter for matrix-retrieved chunks. Port of
+// /home/profit/lakehouse/mcp-server/relevance.ts (Rust system).
+//
+// What it does: drops "adjacency pollution" — chunks that scored
+// well on cosine but are actually about code the focus file IMPORTS,
+// not the focus file itself. Without this, a reviewer LLM
+// hallucinates imported-crate internals as belonging to the focus
+// file ("I see main.rs does X" when X is in queryd::context that
+// main.rs only calls through).
+//
+// IMPORTANT: this filter is CODE-aware. The signals are pub fn,
+// struct, enum, use, import, file paths. It is built for the
+// eventual lakehouse_arch_v1 / lakehouse_symbols_v1 /
+// scrum_findings_v1 corpus ports. It will NOT meaningfully filter
+// staffing data (candidates, workers, placements) — those need a
+// different mechanism (structured constraints + status gates) that
+// lives outside this package. See the candidates reality test
+// 2026-04-29 for the kind of staffing-side mismatch this filter
+// doesn't fix.
+//
+// Scoring signals (additive; weights below — the one negative
+// penalty means the total can land below zero):
+//   path_match      +1.0  chunk.source/doc_id encodes focus.path
+//   filename_match  +0.6  chunk text mentions focus's filename
+//   defined_match   +0.6  chunk text mentions focus.defined_symbols
+//   token_overlap   +0.4  jaccard of non-stopword tokens
+//   prefix_match    +0.3  chunk source shares first-2-segment prefix
+//   import_penalty  -0.5  mentions ONLY imported symbols, no defined ones
+//
+// Threshold default 0.3 — same value the Rust observer ships.
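+//
+// Worked example (the headline smoke scenario, scored by this
+// package): focus crates/queryd/src/db.go defines Connector and
+// open_connector, and imports catalogd::Registry.
+//
+//	Connector chunk: defined_match(2/2) +0.6, token_overlap ≈ +0.08 → ≈ +0.68 (kept)
+//	Registry chunk:  token_overlap ≈ +0.04, import_only(1) -0.5    → ≈ -0.46 (dropped)
+//	unrelated chunk: no signals fire                               →    0.00 (dropped)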
+ +import ( + "fmt" + "regexp" + "strings" +) + +// DefaultRelevanceThreshold is the value the Rust observer ships. +// Empirically tuned to keep direct hits and drop adjacency pollution. +const DefaultRelevanceThreshold = 0.3 + +// stopwords is the same list as relevance.ts. Includes English +// articles + common Rust/TS keywords that would otherwise flood +// jaccard scores between any two source files. +var stopwords = func() map[string]struct{} { + list := []string{ + "the", "a", "an", "and", "or", "but", "if", "then", "else", "is", "are", "was", "were", + "be", "been", "being", "of", "in", "on", "at", "to", "for", "with", "by", "from", "as", + "that", "this", "these", "those", "it", "its", "they", "them", "their", "we", "our", + "you", "your", "i", "me", "my", "not", "no", "so", "do", "does", "did", "done", + "will", "would", "could", "should", "can", "may", "might", "must", "shall", + "fn", "let", "mut", "pub", "use", "mod", "struct", "enum", "trait", "impl", "self", + "type", "const", "static", "async", "await", "return", "match", "ok", "err", "some", + "none", "into", "from", "ref", "box", "arc", "rc", "vec", "string", "str", + } + m := make(map[string]struct{}, len(list)) + for _, s := range list { + m[s] = struct{}{} + } + return m +}() + +// FocusFile is what we're filtering chunks against. Path is required +// for path_match; Content lets the filter auto-extract Defined and +// ImportedSymbols when callers haven't already done so. +type FocusFile struct { + Path string + Content string + DefinedSymbols []string + ImportedSymbols []string +} + +// CandidateChunk is a single retrieved item to score. Source is the +// corpus name; DocID is the chunk identifier; Score is the upstream +// cosine signal (carried through but not used by this filter — the +// matrix layer uses cosine for ranking, this filter for retention). +type CandidateChunk struct { + Source string `json:"source"` + DocID string `json:"doc_id"` + Text string `json:"text"` + Score float64 `json:"score"` +} + +// ScoredChunk wraps a chunk with its computed relevance + the list +// of signals that fired. Reasons makes the filter auditable — +// debugging "why did this chunk get kept/dropped" is the hard part. +type ScoredChunk struct { + CandidateChunk + Relevance float64 `json:"relevance"` + Reasons []string `json:"reasons"` +} + +// FilterResult is the output of FilterChunks. Kept + Dropped are +// disjoint and together cover the input. TotalIn is for sanity +// checks; FocusPath echoes input for logging. +type FilterResult struct { + Kept []ScoredChunk `json:"kept"` + Dropped []ScoredChunk `json:"dropped"` + Threshold float64 `json:"threshold"` + FocusPath string `json:"focus_path"` + TotalIn int `json:"total_in"` +} + +// Tokenize lowercases, splits on identifier boundaries (>=3 chars), +// and drops stopwords. Used by Jaccard for token_overlap. Mirrors +// the TS regex /[a-z_][a-z0-9_]{2,}/g — RE2-compatible as written. +var tokenRe = regexp.MustCompile(`[a-z_][a-z0-9_]{2,}`) + +func Tokenize(text string) map[string]struct{} { + out := make(map[string]struct{}) + if text == "" { + return out + } + for _, m := range tokenRe.FindAllString(strings.ToLower(text), -1) { + if _, skip := stopwords[m]; skip { + continue + } + out[m] = struct{}{} + } + return out +} + +// Jaccard returns |A ∩ B| / |A ∪ B|. 0 when either set is empty +// (matches the TS contract). 
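+//
+// Example (values checkable by hand):
+//
+//	Jaccard(Tokenize("pub fn open_connector()"), Tokenize("open_connector creates one"))
+//	// left side tokenizes to {open_connector} ("pub"/"fn" are stopwords);
+//	// right side to {open_connector, creates, one};
+//	// 1 shared token of 3 distinct → 1/3.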
+func Jaccard(a, b map[string]struct{}) float64 { + if len(a) == 0 || len(b) == 0 { + return 0 + } + var inter int + for k := range a { + if _, ok := b[k]; ok { + inter++ + } + } + union := len(a) + len(b) - inter + if union == 0 { + return 0 + } + return float64(inter) / float64(union) +} + +// ExtractDefinedSymbols pulls pub-symbol names from Rust/TS source. +// Conservative — would rather miss a symbol than over-match. Patterns +// match exactly the TS impl; \b and (?:...) are RE2-supported. Case- +// sensitivity matches TS: pub fn is lowercase, struct/enum/trait/etc +// are PascalCase, const is SCREAMING_CASE. Only the "pub fn" match +// uses (?i) because TS uses /gi explicitly there (the rest are /g). +var definedPatterns = []*regexp.Regexp{ + regexp.MustCompile(`(?i)\bpub\s+(?:async\s+)?fn\s+([a-z_][a-z0-9_]*)`), + regexp.MustCompile(`\bpub\s+struct\s+([A-Z][A-Za-z0-9_]*)`), + regexp.MustCompile(`\bpub\s+enum\s+([A-Z][A-Za-z0-9_]*)`), + regexp.MustCompile(`\bpub\s+trait\s+([A-Z][A-Za-z0-9_]*)`), + regexp.MustCompile(`\bpub\s+const\s+([A-Z_][A-Z0-9_]*)`), + regexp.MustCompile(`\bpub\s+type\s+([A-Z][A-Za-z0-9_]*)`), + regexp.MustCompile(`\bexport\s+(?:async\s+)?function\s+([a-z_][a-zA-Z0-9_]*)`), + regexp.MustCompile(`\bexport\s+class\s+([A-Z][A-Za-z0-9_]*)`), + regexp.MustCompile(`\bexport\s+interface\s+([A-Z][A-Za-z0-9_]*)`), + regexp.MustCompile(`\bexport\s+(?:const|let|var)\s+([a-zA-Z_][a-zA-Z0-9_]*)`), +} + +func ExtractDefinedSymbols(content string) []string { + if content == "" { + return nil + } + seen := make(map[string]struct{}) + var out []string + for _, re := range definedPatterns { + for _, m := range re.FindAllStringSubmatch(content, -1) { + if len(m) < 2 || m[1] == "" { + continue + } + if _, ok := seen[m[1]]; ok { + continue + } + seen[m[1]] = struct{}{} + out = append(out, m[1]) + } + } + return out +} + +// rustUseRe matches `use foo::bar::Baz;`, `use foo::{Bar, Baz};`, +// `use foo::bar as alias;`. Lazy `*?` so we don't run into the next +// `;` boundary too eagerly. +var rustUseRe = regexp.MustCompile(`\buse\s+([A-Za-z_][A-Za-z0-9_:{}, \n]*?);`) + +// tsImportRe matches `import { X, Y } from "foo"` and `import X from "foo"`. +var tsImportRe = regexp.MustCompile(`\bimport\s+(?:\{([^}]+)\}|([A-Za-z_][A-Za-z0-9_]*))\s+from`) + +// identRe extracts identifiers from a use/import block. +var identRe = regexp.MustCompile(`[A-Za-z_][A-Za-z0-9_]*`) + +func ExtractImportedSymbols(content string) []string { + if content == "" { + return nil + } + ignore := map[string]bool{ + "use": true, "as": true, "crate": true, "super": true, "self": true, "mod": true, + } + seen := make(map[string]struct{}) + var out []string + add := func(tok string) { + if len(tok) <= 2 { + return + } + if ignore[tok] { + return + } + if _, ok := seen[tok]; ok { + return + } + seen[tok] = struct{}{} + out = append(out, tok) + } + for _, m := range rustUseRe.FindAllStringSubmatch(content, -1) { + if len(m) < 2 { + continue + } + for _, ident := range identRe.FindAllString(m[1], -1) { + add(ident) + } + } + for _, m := range tsImportRe.FindAllStringSubmatch(content, -1) { + if len(m) < 3 { + continue + } + block := m[1] + if block == "" { + block = m[2] + } + for _, ident := range identRe.FindAllString(block, -1) { + add(ident) + } + } + return out +} + +// FilePrefix returns the first two path segments joined by "/" — +// e.g. "crates/queryd/src/foo.rs" → "crates/queryd". Used for cheap +// "same crate" comparisons; mirrors pathway_memory's notion. 
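+//
+// Examples (mirrors TestFilePrefix):
+//
+//	FilePrefix("crates/queryd/src/db.go") // "crates/queryd"
+//	FilePrefix("top.rs")                  // "top.rs" — short paths pass through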
+func FilePrefix(path string) string {
+	parts := strings.Split(path, "/")
+	if len(parts) > 2 {
+		parts = parts[:2]
+	}
+	return strings.Join(parts, "/")
+}
+
+// ScoreRelevance computes the additive 0..1-ish score plus the list
+// of signals that fired. Negative scores are possible (import_penalty
+// without compensating positive signal). Pure function — no side
+// effects, no I/O.
+func ScoreRelevance(focus FocusFile, chunk CandidateChunk) (float64, []string) {
+	var score float64
+	var reasons []string
+
+	focusPath := focus.Path
+	focusBase := ""
+	if focusPath != "" {
+		parts := strings.Split(focusPath, "/")
+		focusBase = parts[len(parts)-1]
+	}
+	chunkText := chunk.Text
+	chunkSource := chunk.Source
+	chunkDocID := chunk.DocID
+
+	// path_match: chunk's provenance encodes the focus path or filename.
+	if focusPath != "" && (strings.Contains(chunkSource, focusPath) ||
+		strings.Contains(chunkDocID, focusPath) ||
+		strings.Contains(chunkText, focusPath)) {
+		score += 1.0
+		reasons = append(reasons, "path_match")
+	} else if focusBase != "" && len(focusBase) > 4 &&
+		(strings.Contains(chunkText, focusBase) || strings.Contains(chunkDocID, focusBase)) {
+		score += 0.6
+		reasons = append(reasons, "filename_match")
+	}
+
+	// defined_match: chunk text mentions symbols this file actually defines.
+	defined := focus.DefinedSymbols
+	if len(defined) == 0 && focus.Content != "" {
+		defined = ExtractDefinedSymbols(focus.Content)
+	}
+	if len(defined) > 0 {
+		var hits int
+		for _, s := range defined {
+			if len(s) > 2 && strings.Contains(chunkText, s) {
+				hits++
+			}
+		}
+		if hits > 0 {
+			denom := len(defined)
+			if denom < 1 {
+				denom = 1
+			}
+			ratio := float64(hits) / float64(denom)
+			if ratio > 1 {
+				ratio = 1
+			}
+			score += 0.6 * ratio
+			reasons = append(reasons, fmt.Sprintf("defined_match(%d/%d)", hits, len(defined)))
+		}
+	}
+
+	// token_overlap: jaccard of non-stopword tokens.
+	if focus.Content != "" {
+		overlap := Jaccard(Tokenize(focus.Content), Tokenize(chunkText))
+		if overlap > 0.05 {
+			score += 0.4 * overlap
+			reasons = append(reasons, fmt.Sprintf("token_overlap(%.2f)", overlap))
+		}
+	}
+
+	// prefix_match: same first-2-segments (e.g. crates/queryd).
+	if focusPath != "" {
+		fp := FilePrefix(focusPath)
+		if fp != "" && (strings.Contains(chunkSource, fp) ||
+			strings.Contains(chunkDocID, fp) ||
+			strings.Contains(chunkText, fp)) {
+			score += 0.3
+			reasons = append(reasons, "prefix_match")
+		}
+	}
+
+	// import_penalty: chunk mentions only imported symbols, no defined
+	// ones. Strong signal of adjacency pollution — the chunk is about
+	// what we IMPORT, not what we ARE.
+	imported := focus.ImportedSymbols
+	if len(imported) == 0 && focus.Content != "" {
+		imported = ExtractImportedSymbols(focus.Content)
+	}
+	if len(imported) > 0 && len(defined) > 0 {
+		var importHits, definedHits int
+		for _, s := range imported {
+			if len(s) > 2 && strings.Contains(chunkText, s) {
+				importHits++
+			}
+		}
+		for _, s := range defined {
+			if len(s) > 2 && strings.Contains(chunkText, s) {
+				definedHits++
+			}
+		}
+		if importHits > 0 && definedHits == 0 {
+			score -= 0.5
+			reasons = append(reasons, fmt.Sprintf("import_only(%d)", importHits))
+		}
+	}
+
+	return score, reasons
+}
+
+// FilterChunks scores every chunk and partitions by threshold. The
+// caller picks the threshold — no auto-default substitution here
+// (the HTTP handler, not this package, applies
+// DefaultRelevanceThreshold). Because import_penalty can push a
+// score below zero, a threshold of 0 keeps only chunks that scored
+// >= 0; pass a negative threshold to keep everything.
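+//
+// A minimal call sketch (hypothetical caller; log package assumed):
+//
+//	res := FilterChunks(focus, chunks, DefaultRelevanceThreshold)
+//	for _, c := range res.Dropped {
+//		log.Printf("dropped %s (%.2f): %v", c.DocID, c.Relevance, c.Reasons)
+//	}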
+func FilterChunks(focus FocusFile, chunks []CandidateChunk, threshold float64) FilterResult {
+	kept := make([]ScoredChunk, 0, len(chunks))
+	dropped := make([]ScoredChunk, 0)
+	for _, c := range chunks {
+		score, reasons := ScoreRelevance(focus, c)
+		sc := ScoredChunk{CandidateChunk: c, Relevance: score, Reasons: reasons}
+		if score >= threshold {
+			kept = append(kept, sc)
+		} else {
+			dropped = append(dropped, sc)
+		}
+	}
+	return FilterResult{
+		Kept:      kept,
+		Dropped:   dropped,
+		Threshold: threshold,
+		FocusPath: focus.Path,
+		TotalIn:   len(chunks),
+	}
+}
diff --git a/internal/matrix/relevance_test.go b/internal/matrix/relevance_test.go
new file mode 100644
index 0000000..710b5d7
--- /dev/null
+++ b/internal/matrix/relevance_test.go
@@ -0,0 +1,289 @@
+package matrix
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestTokenize(t *testing.T) {
+	cases := []struct {
+		text string
+		want []string // expected tokens (sorted check inside)
+	}{
+		{"", nil},
+		{"the quick brown fox", []string{"quick", "brown", "fox"}}, // stopwords dropped
+		{"hello WORLD", []string{"hello", "world"}},                // lowercase
+		{"a b c", nil},                                             // all under 3 chars
+		{"struct Foo", []string{"foo"}},                            // "struct" is a stopword, identifiers OK
+		{"crates/queryd/db.go", []string{"crates", "queryd"}},      // db.go: "db" is 2 chars, "go" is 2 chars
+	}
+	for _, c := range cases {
+		got := Tokenize(c.text)
+		if len(got) != len(c.want) {
+			t.Errorf("Tokenize(%q): want %d tokens %v, got %d %v", c.text, len(c.want), c.want, len(got), got)
+			continue
+		}
+		for _, w := range c.want {
+			if _, ok := got[w]; !ok {
+				t.Errorf("Tokenize(%q): missing token %q in %v", c.text, w, got)
+			}
+		}
+	}
+}
+
+func TestJaccard(t *testing.T) {
+	mk := func(tokens ...string) map[string]struct{} {
+		m := make(map[string]struct{})
+		for _, t := range tokens {
+			m[t] = struct{}{}
+		}
+		return m
+	}
+	cases := []struct {
+		name    string
+		a, b    map[string]struct{}
+		want    float64
+		epsilon float64
+	}{
+		{"both empty", mk(), mk(), 0, 0},
+		{"a empty", mk(), mk("x"), 0, 0},
+		{"identical", mk("x", "y"), mk("x", "y"), 1, 0},
+		{"disjoint", mk("a", "b"), mk("c", "d"), 0, 0},
+		{"partial overlap (1 of 3 distinct)", mk("a", "b"), mk("b", "c"), 1.0 / 3.0, 0.001},
+	}
+	for _, c := range cases {
+		got := Jaccard(c.a, c.b)
+		if got < c.want-c.epsilon || got > c.want+c.epsilon {
+			t.Errorf("%s: want %.3f, got %.3f", c.name, c.want, got)
+		}
+	}
+}
+
+func TestExtractDefinedSymbols(t *testing.T) {
+	rust := `
+pub fn search_chunks(query: &str) -> Vec<Chunk> { todo!() }
+pub async fn build_index() {}
+pub struct ChunkRegistry {}
+pub enum Distance { Cosine, Euclidean }
+pub trait Searcher {}
+pub const MAX_K: usize = 1000;
+pub type ChunkMap = HashMap<String, Chunk>;
+
+fn private_helper() {}    // not pub, must NOT match
+struct PrivateOnly {}     // not pub, must NOT match
+`
+	got := ExtractDefinedSymbols(rust)
+	want := []string{"search_chunks", "build_index", "ChunkRegistry", "Distance", "Searcher", "MAX_K", "ChunkMap"}
+	if len(got) != len(want) {
+		t.Errorf("Rust extract: want %v, got %v", want, got)
+	}
+	for _, w := range want {
+		if !contains(got, w) {
+			t.Errorf("Rust: missing %q in %v", w, got)
+		}
+	}
+	// Negative cases — these should NOT match.
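+	// (Every definedPatterns regex anchors on a leading pub/export
+	// keyword, so unexported items can't match by construction.)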
+ for _, neg := range []string{"private_helper", "PrivateOnly"} { + if contains(got, neg) { + t.Errorf("Rust: should not match %q in %v", neg, got) + } + } + + ts := ` +export function tokenize(text: string) {} +export async function loadCorpus() {} +export class IndexRegistry {} +export interface FocusFile {} +export const STOPWORDS = new Set(); +export let counter = 0; + +function privateTs() {} // not export, must NOT match +class Internal {} // not export, must NOT match +` + got = ExtractDefinedSymbols(ts) + want = []string{"tokenize", "loadCorpus", "IndexRegistry", "FocusFile", "STOPWORDS", "counter"} + for _, w := range want { + if !contains(got, w) { + t.Errorf("TS: missing %q in %v", w, got) + } + } + for _, neg := range []string{"privateTs", "Internal"} { + if contains(got, neg) { + t.Errorf("TS: should not match %q in %v", neg, got) + } + } +} + +func TestExtractImportedSymbols(t *testing.T) { + rust := ` +use catalogd::Registry; +use vectord::{Index, IndexParams}; +use std::collections::HashMap; +` + got := ExtractImportedSymbols(rust) + for _, w := range []string{"catalogd", "Registry", "vectord", "Index", "IndexParams", "collections", "HashMap"} { + if !contains(got, w) { + t.Errorf("Rust use: missing %q in %v", w, got) + } + } + for _, neg := range []string{"use", "as"} { + if contains(got, neg) { + t.Errorf("Rust use: should not match keyword %q in %v", neg, got) + } + } + + ts := ` +import { tokenize, jaccard } from "./relevance"; +import express from "express"; +` + got = ExtractImportedSymbols(ts) + for _, w := range []string{"tokenize", "jaccard", "express"} { + if !contains(got, w) { + t.Errorf("TS import: missing %q in %v", w, got) + } + } +} + +func TestFilePrefix(t *testing.T) { + cases := []struct { + path, want string + }{ + {"crates/queryd/src/foo.rs", "crates/queryd"}, + {"top.rs", "top.rs"}, + {"a/b/c/d", "a/b"}, + {"", ""}, + } + for _, c := range cases { + got := FilePrefix(c.path) + if got != c.want { + t.Errorf("FilePrefix(%q): want %q, got %q", c.path, c.want, got) + } + } +} + +func TestScoreRelevance_PathMatch(t *testing.T) { + focus := FocusFile{Path: "crates/queryd/db.go"} + chunk := CandidateChunk{Source: "lakehouse_arch_v1", DocID: "phase:queryd", Text: "code at crates/queryd/db.go does X"} + score, reasons := ScoreRelevance(focus, chunk) + if score < 1.0 { + t.Errorf("path_match should give >=1.0; got %.2f reasons=%v", score, reasons) + } + if !contains(reasons, "path_match") { + t.Errorf("expected path_match in reasons: %v", reasons) + } +} + +func TestScoreRelevance_ImportPenalty(t *testing.T) { + // Focus defines Foo; chunk only mentions Bar (imported). Should + // fire import_only penalty. + focus := FocusFile{ + Path: "crates/foo/main.go", + Content: "pub fn run() {}\npub struct Foo {}\nuse barlib::Bar;\n", + DefinedSymbols: []string{"Foo"}, + ImportedSymbols: []string{"Bar"}, + } + chunk := CandidateChunk{ + Source: "barlib_corpus", DocID: "barlib:Bar:42", + Text: "Bar handles the actual lookup logic and returns a Result.", + } + score, reasons := ScoreRelevance(focus, chunk) + if !contains(reasons, "import_only(1)") { + t.Errorf("expected import_only penalty: reasons=%v score=%.2f", reasons, score) + } + if score >= 0 { + // Without other positive signals, score should be net-negative. 
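+		// Concretely: token_overlap fires at +0.4×0.10 = +0.04 (only
+		// "bar" is shared between the token sets) and import_only
+		// subtracts 0.5, so the net is ≈ -0.46.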
+ t.Errorf("expected negative net score; got %.2f reasons=%v", score, reasons) + } +} + +func TestFilterChunks_ThresholdSplitsKeptDropped(t *testing.T) { + focus := FocusFile{Path: "crates/queryd/db.go"} + chunks := []CandidateChunk{ + {Source: "code", DocID: "queryd:db.go", Text: "crates/queryd/db.go is the focus"}, // path match → kept + {Source: "elsewhere", DocID: "phase:0", Text: "no match anywhere"}, // dropped + } + res := FilterChunks(focus, chunks, DefaultRelevanceThreshold) + if len(res.Kept) != 1 || len(res.Dropped) != 1 { + t.Errorf("split: kept=%d dropped=%d (want 1/1)", len(res.Kept), len(res.Dropped)) + } + if res.TotalIn != 2 { + t.Errorf("TotalIn: want 2, got %d", res.TotalIn) + } + if res.FocusPath != focus.Path { + t.Errorf("FocusPath echo: want %q, got %q", focus.Path, res.FocusPath) + } + // Sanity: everything in Kept has Relevance >= threshold. + for _, c := range res.Kept { + if c.Relevance < DefaultRelevanceThreshold { + t.Errorf("kept chunk below threshold: %v", c) + } + } + for _, c := range res.Dropped { + if c.Relevance >= DefaultRelevanceThreshold { + t.Errorf("dropped chunk at/above threshold: %v", c) + } + } +} + +// TestFilterChunks_AdjacencyPollutionScenario is the headline test — +// the exact case the filter exists to catch. Focus file is +// crates/queryd/db.go which defines Connector and imports +// catalogd::Registry. A chunk about catalogd::Registry should be +// dropped (adjacency); a chunk about Connector should be kept. +func TestFilterChunks_AdjacencyPollutionScenario(t *testing.T) { + focus := FocusFile{ + Path: "crates/queryd/src/db.go", + Content: ` +package queryd + +import "catalogd" + +pub struct Connector {} +pub fn open_connector() *Connector { return nil } +use catalogd::Registry; +`, + } + chunks := []CandidateChunk{ + { + Source: "lakehouse_symbols_v1", DocID: "symbol:queryd::struct::Connector", + Text: "Connector wraps the DuckDB handle. open_connector creates one.", + }, + { + Source: "lakehouse_symbols_v1", DocID: "symbol:catalogd::struct::Registry", + Text: "Registry stores manifests. Used by ingestd and queryd.", + }, + } + res := FilterChunks(focus, chunks, DefaultRelevanceThreshold) + // Connector chunk should be kept (defined_match). + keptIDs := make([]string, len(res.Kept)) + for i, c := range res.Kept { + keptIDs[i] = c.DocID + } + if !contains(keptIDs, "symbol:queryd::struct::Connector") { + t.Errorf("expected Connector chunk kept; got %v", keptIDs) + } + // The Registry chunk MIGHT pass threshold depending on token_overlap + // noise (queryd appears in its text too). The load-bearing assertion: + // Connector ranks ≥ Registry. + connectorRel, registryRel := -999.0, -999.0 + for _, c := range append(res.Kept, res.Dropped...) { + if strings.Contains(c.DocID, "Connector") { + connectorRel = c.Relevance + } + if strings.Contains(c.DocID, "Registry") { + registryRel = c.Relevance + } + } + if connectorRel <= registryRel { + t.Errorf("Connector should outrank Registry: connector=%.2f registry=%.2f", connectorRel, registryRel) + } +} + +func contains(haystack []string, needle string) bool { + for _, h := range haystack { + if h == needle { + return true + } + } + return false +} diff --git a/scripts/relevance_smoke.sh b/scripts/relevance_smoke.sh new file mode 100755 index 0000000..ee4a1f6 --- /dev/null +++ b/scripts/relevance_smoke.sh @@ -0,0 +1,156 @@ +#!/usr/bin/env bash +# Relevance smoke — code-relevance filter via matrixd /matrix/relevance. +# All assertions go through gateway :3110. 
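+#
+# Response shape (FilterResult's JSON encoding; abridged):
+#   {"kept":[{"doc_id":...,"relevance":0.68,"reasons":["defined_match(2/2)",...]}],
+#    "dropped":[...],"threshold":0.3,"focus_path":"...","total_in":3}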
+#
+# Validates the headline adjacency-pollution scenario:
+#   Focus: crates/queryd/src/db.go which defines Connector.
+#   Chunk A is about Connector → kept (defined_match).
+#   Chunk B is about catalogd::Registry which db.go imports → outranked
+#   by Chunk A.
+#   Chunk C is unrelated → dropped (no signals fire).
+#
+# Plus negative paths:
+#   - Empty chunks → 400
+#   - Threshold honored when set explicitly
+
+set -euo pipefail
+cd "$(dirname "$0")/.."
+
+export PATH="$PATH:/usr/local/go/bin"
+
+echo "[relevance-smoke] building matrixd + vectord + gateway..."
+go build -o bin/ ./cmd/matrixd ./cmd/vectord ./cmd/gateway
+
+pkill -f "bin/(matrixd|vectord|gateway)" 2>/dev/null || true
+sleep 0.3
+
+PIDS=()
+TMP="$(mktemp -d)"
+CFG="$TMP/relevance.toml"
+
+cleanup() {
+  echo "[relevance-smoke] cleanup"
+  for p in "${PIDS[@]}"; do [ -n "$p" ] && kill "$p" 2>/dev/null || true; done
+  rm -rf "$TMP"
+}
+trap cleanup EXIT INT TERM
+
+# Custom toml: vectord persistence disabled. /matrix/relevance doesn't
+# touch vectord at all, but matrixd config requires the URL anyway.
+cat > "$CFG" <<EOF
+# (config body elided — binds vectord :3215, matrixd :3218, gateway
+# :3110 plus the embedd/vectord URLs, with vectord persistence off)
+EOF
+
+# Wait for a daemon's /health to answer on the given port (probe loop
+# reconstructed; the ~5s retry budget is an assumption).
+poll_health() {
+  local port="$1"
+  for _ in $(seq 1 100); do
+    if curl -sf "http://127.0.0.1:$port/health" >/dev/null 2>&1; then return 0; fi
+    sleep 0.05
+  done
+  return 1
+}
+
+echo "[relevance-smoke] launching vectord → matrixd → gateway..."
+./bin/vectord -config "$CFG" > /tmp/vectord.log 2>&1 &
+PIDS+=($!)
+poll_health 3215 || { echo "vectord failed"; tail /tmp/vectord.log; exit 1; }
+
+./bin/matrixd -config "$CFG" > /tmp/matrixd.log 2>&1 &
+PIDS+=($!)
+poll_health 3218 || { echo "matrixd failed"; tail /tmp/matrixd.log; exit 1; }
+
+./bin/gateway -config "$CFG" > /tmp/gateway.log 2>&1 &
+PIDS+=($!)
+poll_health 3110 || { echo "gateway failed"; tail /tmp/gateway.log; exit 1; }
+
+FAILED=0
+
+# ── 1. Adjacency-pollution scenario ──────────────────────────────
+echo "[relevance-smoke] adjacency-pollution: Connector outranks Registry, junk dropped:"
+PAYLOAD='{
+  "focus": {
+    "Path": "crates/queryd/src/db.go",
+    "Content": "pub struct Connector {}\npub fn open_connector() *Connector { return nil }\nuse catalogd::Registry;"
+  },
+  "chunks": [
+    {"source":"lakehouse_symbols_v1","doc_id":"symbol:queryd::struct::Connector","text":"Connector wraps the DuckDB handle. open_connector creates one.","score":0.9},
+    {"source":"lakehouse_symbols_v1","doc_id":"symbol:catalogd::struct::Registry","text":"Registry stores manifests. Used by ingestd.","score":0.85},
+    {"source":"lakehouse_symbols_v1","doc_id":"symbol:totally_other::Thing","text":"completely unrelated text about something else entirely","score":0.7}
+  ],
+  "threshold": 0.3
+}'
+RESP="$(curl -sS -X POST http://127.0.0.1:3110/v1/matrix/relevance -H 'Content-Type: application/json' -d "$PAYLOAD")"
+
+# Connector chunk should be in kept
+CONNECTOR_KEPT="$(echo "$RESP" | jq -r '[.kept[] | select(.doc_id | contains("Connector"))] | length')"
+# The unrelated junk chunk should be in dropped
+JUNK_DROPPED="$(echo "$RESP" | jq -r '[.dropped[] | select(.doc_id | contains("Thing"))] | length')"
+# Connector should outrank Registry (whichever bucket they end up in)
+CONN_REL="$(echo "$RESP" | jq -r '[.kept[], .dropped[] | select(.doc_id | contains("Connector"))] | .[0].relevance // -999')"
+REG_REL="$(echo "$RESP" | jq -r '[.kept[], .dropped[] | select(.doc_id | contains("Registry"))] | .[0].relevance // -999')"
+TOTAL_IN="$(echo "$RESP" | jq -r '.total_in')"
+
+CONN_OUTRANKS_REG="$(awk -v a="$CONN_REL" -v b="$REG_REL" 'BEGIN{print (a>b)?"true":"false"}')"
+
+if [ "$CONNECTOR_KEPT" = "1" ] && [ "$JUNK_DROPPED" = "1" ] && [ "$CONN_OUTRANKS_REG" = "true" ] && [ "$TOTAL_IN" = "3" ]; then
+  echo "  ✓ Connector kept, junk dropped, Connector ($CONN_REL) > Registry ($REG_REL)"
+else
+  echo "  ✗ kept_connector=$CONNECTOR_KEPT dropped_junk=$JUNK_DROPPED conn=$CONN_REL reg=$REG_REL total=$TOTAL_IN"
+  echo "  full: $RESP"
+  FAILED=1
+fi
+
+# ── 2. Empty chunks → 400 ────────────────────────────────────────
+echo "[relevance-smoke] empty chunks → 400:"
+HTTP="$(curl -sS -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:3110/v1/matrix/relevance \
+  -H 'Content-Type: application/json' \
+  -d '{"focus":{"Path":"x"},"chunks":[]}')"
+if [ "$HTTP" = "400" ]; then
+  echo "  ✓ 400 on empty chunks"
+else
+  echo "  ✗ got $HTTP"; FAILED=1
+fi
+
+# ── 3. Threshold honored ─────────────────────────────────────────
+echo "[relevance-smoke] threshold=10 (impossibly high) drops everything:"
+PAYLOAD2='{
+  "focus": {"Path": "x.go", "Content": "pub fn known() {}", "DefinedSymbols": ["known"]},
+  "chunks": [
+    {"source":"s","doc_id":"d1","text":"known appears here","score":0.9}
+  ],
+  "threshold": 10
+}'
+RESP2="$(curl -sS -X POST http://127.0.0.1:3110/v1/matrix/relevance -H 'Content-Type: application/json' -d "$PAYLOAD2")"
+KEPT_COUNT="$(echo "$RESP2" | jq -r '.kept | length')"
+DROP_COUNT="$(echo "$RESP2" | jq -r '.dropped | length')"
+if [ "$KEPT_COUNT" = "0" ] && [ "$DROP_COUNT" = "1" ]; then
+  echo "  ✓ threshold=10 drops everything (0 kept / 1 dropped)"
+else
+  echo "  ✗ kept=$KEPT_COUNT dropped=$DROP_COUNT"; FAILED=1
+fi
+
+if [ "$FAILED" -eq 0 ]; then
+  echo "[relevance-smoke] Relevance acceptance gate: PASSED"
+  exit 0
+else
+  echo "[relevance-smoke] Relevance acceptance gate: FAILED"
+  exit 1
+fi