Faithful port of mcp-server/relevance.ts (Rust observer's adjacency-
pollution filter). Same 5-signal scoring, same default threshold 0.3.
Adds POST /v1/matrix/relevance endpoint via matrixd.
Scoring signals (additive, can sign-flip):
path_match +1.0 chunk source/doc_id encodes focus.path
filename_match +0.6 chunk text mentions focus's filename
defined_match +0.6 chunk text mentions focus.defined_symbols
token_overlap +0.4 jaccard of non-stopword tokens
prefix_match +0.3 chunk source shares first-2-segment prefix
import_penalty -0.5 mentions ONLY imported symbols, no defined ones
What this does and doesn't do:
- DOES filter code-aware corpora (eventually lakehouse_arch_v1,
lakehouse_symbols_v1, scrum_findings_v1) — drops chunks about
code the focus file IMPORTS rather than DEFINES, the
"adjacency pollution" pattern that makes a reviewer LLM
hallucinate imported-crate internals as belonging to the focus
- DOES NOT meaningfully filter staffing data — the candidates
reality test 2026-04-29 had "exact skill match buried at #3"
which is a different problem (semantic-only ranking dominated
by secondary text). Staffing needs structured filtering
(status gates, location gates) that lives outside this
package — future work, not in SPEC §3.4 yet
Headline smoke assertion: focus = crates/queryd/src/db.go which
defines Connector and imports catalogd::Registry. The filter
scores:
Connector chunk: +0.68 (defined_match fires, kept)
Registry chunk: -0.46 (import_only penalty fires, dropped)
unrelated junk: 0.00 (no signals, dropped)
That's a 1.14-point gap between what we ARE and what we IMPORT —
the entire purpose of the filter.
Tests:
- 9 unit tests in internal/matrix/relevance_test.go covering
Tokenize, Jaccard, ExtractDefinedSymbols (Rust + TS),
ExtractImportedSymbols, FilePrefix, ScoreRelevance per-signal,
FilterChunks threshold splitting, and the headline
AdjacencyPollutionScenario
- scripts/relevance_smoke.sh integration smoke (3 assertions PASS):
adjacency-pollution scenario, empty-chunks 400, threshold honored
13-smoke regression sweep all green (D1-D6, G1, G1P, G2,
storaged_cap, pathway, matrix, relevance).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
290 lines
8.7 KiB
Go
package matrix
|
|
|
|
import (
|
|
"strings"
|
|
"testing"
|
|
)
|
|
|
|
func TestTokenize(t *testing.T) {
|
|
cases := []struct {
|
|
text string
|
|
want []string // expected tokens (sorted check inside)
|
|
}{
|
|
{"", nil},
|
|
{"the quick brown fox", []string{"quick", "brown", "fox"}}, // stopwords dropped
|
|
{"hello WORLD", []string{"hello", "world"}}, // lowercase
|
|
{"a b c", nil}, // all under 3 chars
|
|
{"struct Foo", []string{"foo"}}, // "struct" is a stopword, identifiers OK
|
|
{"crates/queryd/db.go", []string{"crates", "queryd"}}, // db.go: "db" is 2 chars, "go" is 2 chars
|
|
}
|
|
for _, c := range cases {
|
|
got := Tokenize(c.text)
|
|
if len(got) != len(c.want) {
|
|
t.Errorf("Tokenize(%q): want %d tokens %v, got %d %v", c.text, len(c.want), c.want, len(got), got)
|
|
continue
|
|
}
|
|
for _, w := range c.want {
|
|
if _, ok := got[w]; !ok {
|
|
t.Errorf("Tokenize(%q): missing token %q in %v", c.text, w, got)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestJaccard(t *testing.T) {
|
|
mk := func(tokens ...string) map[string]struct{} {
|
|
m := make(map[string]struct{})
|
|
for _, t := range tokens {
|
|
m[t] = struct{}{}
|
|
}
|
|
return m
|
|
}
|
|
cases := []struct {
|
|
name string
|
|
a, b map[string]struct{}
|
|
want float64
|
|
epsilon float64
|
|
}{
|
|
{"both empty", mk(), mk(), 0, 0},
|
|
{"a empty", mk(), mk("x"), 0, 0},
|
|
{"identical", mk("x", "y"), mk("x", "y"), 1, 0},
|
|
{"disjoint", mk("a", "b"), mk("c", "d"), 0, 0},
|
|
{"half overlap", mk("a", "b"), mk("b", "c"), 1.0 / 3.0, 0.001},
|
|
}
|
|
for _, c := range cases {
|
|
got := Jaccard(c.a, c.b)
|
|
if got < c.want-c.epsilon || got > c.want+c.epsilon {
|
|
t.Errorf("%s: want %.3f, got %.3f", c.name, c.want, got)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestExtractDefinedSymbols(t *testing.T) {
|
|
rust := `
|
|
pub fn search_chunks(query: &str) -> Vec<Chunk> { todo!() }
|
|
pub async fn build_index() {}
|
|
pub struct ChunkRegistry {}
|
|
pub enum Distance { Cosine, Euclidean }
|
|
pub trait Searcher {}
|
|
pub const MAX_K: usize = 1000;
|
|
pub type ChunkMap = HashMap<String, Chunk>;
|
|
|
|
fn private_helper() {} // not pub, must NOT match
|
|
struct PrivateOnly {} // not pub, must NOT match
|
|
`
|
|
got := ExtractDefinedSymbols(rust)
|
|
want := []string{"search_chunks", "build_index", "ChunkRegistry", "Distance", "Searcher", "MAX_K", "ChunkMap"}
|
|
if len(got) != len(want) {
|
|
t.Errorf("Rust extract: want %v, got %v", want, got)
|
|
}
|
|
for _, w := range want {
|
|
if !contains(got, w) {
|
|
t.Errorf("Rust: missing %q in %v", w, got)
|
|
}
|
|
}
|
|
// Negative cases — these should NOT match.
|
|
for _, neg := range []string{"private_helper", "PrivateOnly"} {
|
|
if contains(got, neg) {
|
|
t.Errorf("Rust: should not match %q in %v", neg, got)
|
|
}
|
|
}
|
|
|
|
ts := `
|
|
export function tokenize(text: string) {}
|
|
export async function loadCorpus() {}
|
|
export class IndexRegistry {}
|
|
export interface FocusFile {}
|
|
export const STOPWORDS = new Set();
|
|
export let counter = 0;
|
|
|
|
function privateTs() {} // not export, must NOT match
|
|
class Internal {} // not export, must NOT match
|
|
`
|
|
got = ExtractDefinedSymbols(ts)
|
|
want = []string{"tokenize", "loadCorpus", "IndexRegistry", "FocusFile", "STOPWORDS", "counter"}
|
|
for _, w := range want {
|
|
if !contains(got, w) {
|
|
t.Errorf("TS: missing %q in %v", w, got)
|
|
}
|
|
}
|
|
for _, neg := range []string{"privateTs", "Internal"} {
|
|
if contains(got, neg) {
|
|
t.Errorf("TS: should not match %q in %v", neg, got)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestExtractImportedSymbols(t *testing.T) {
|
|
rust := `
|
|
use catalogd::Registry;
|
|
use vectord::{Index, IndexParams};
|
|
use std::collections::HashMap;
|
|
`
|
|
got := ExtractImportedSymbols(rust)
|
|
for _, w := range []string{"catalogd", "Registry", "vectord", "Index", "IndexParams", "collections", "HashMap"} {
|
|
if !contains(got, w) {
|
|
t.Errorf("Rust use: missing %q in %v", w, got)
|
|
}
|
|
}
|
|
for _, neg := range []string{"use", "as"} {
|
|
if contains(got, neg) {
|
|
t.Errorf("Rust use: should not match keyword %q in %v", neg, got)
|
|
}
|
|
}
|
|
|
|
ts := `
|
|
import { tokenize, jaccard } from "./relevance";
|
|
import express from "express";
|
|
`
|
|
got = ExtractImportedSymbols(ts)
|
|
for _, w := range []string{"tokenize", "jaccard", "express"} {
|
|
if !contains(got, w) {
|
|
t.Errorf("TS import: missing %q in %v", w, got)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestFilePrefix(t *testing.T) {
|
|
cases := []struct {
|
|
path, want string
|
|
}{
|
|
{"crates/queryd/src/foo.rs", "crates/queryd"},
|
|
{"top.rs", "top.rs"},
|
|
{"a/b/c/d", "a/b"},
|
|
{"", ""},
|
|
}
|
|
for _, c := range cases {
|
|
got := FilePrefix(c.path)
|
|
if got != c.want {
|
|
t.Errorf("FilePrefix(%q): want %q, got %q", c.path, c.want, got)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestScoreRelevance_PathMatch(t *testing.T) {
|
|
focus := FocusFile{Path: "crates/queryd/db.go"}
|
|
chunk := CandidateChunk{Source: "lakehouse_arch_v1", DocID: "phase:queryd", Text: "code at crates/queryd/db.go does X"}
|
|
score, reasons := ScoreRelevance(focus, chunk)
|
|
if score < 1.0 {
|
|
t.Errorf("path_match should give >=1.0; got %.2f reasons=%v", score, reasons)
|
|
}
|
|
if !contains(reasons, "path_match") {
|
|
t.Errorf("expected path_match in reasons: %v", reasons)
|
|
}
|
|
}
|
|
|
|
func TestScoreRelevance_ImportPenalty(t *testing.T) {
|
|
// Focus defines Foo; chunk only mentions Bar (imported). Should
|
|
// fire import_only penalty.
|
|
focus := FocusFile{
|
|
Path: "crates/foo/main.go",
|
|
Content: "pub fn run() {}\npub struct Foo {}\nuse barlib::Bar;\n",
|
|
DefinedSymbols: []string{"Foo"},
|
|
ImportedSymbols: []string{"Bar"},
|
|
}
|
|
chunk := CandidateChunk{
|
|
Source: "barlib_corpus", DocID: "barlib:Bar:42",
|
|
Text: "Bar handles the actual lookup logic and returns a Result.",
|
|
}
|
|
score, reasons := ScoreRelevance(focus, chunk)
|
|
if !contains(reasons, "import_only(1)") {
|
|
t.Errorf("expected import_only penalty: reasons=%v score=%.2f", reasons, score)
|
|
}
|
|
if score >= 0 {
|
|
// Without other positive signals, score should be net-negative.
|
|
t.Errorf("expected negative net score; got %.2f reasons=%v", score, reasons)
|
|
}
|
|
}
|
|
|
|
func TestFilterChunks_ThresholdSplitsKeptDropped(t *testing.T) {
|
|
focus := FocusFile{Path: "crates/queryd/db.go"}
|
|
chunks := []CandidateChunk{
|
|
{Source: "code", DocID: "queryd:db.go", Text: "crates/queryd/db.go is the focus"}, // path match → kept
|
|
{Source: "elsewhere", DocID: "phase:0", Text: "no match anywhere"}, // dropped
|
|
}
|
|
res := FilterChunks(focus, chunks, DefaultRelevanceThreshold)
|
|
if len(res.Kept) != 1 || len(res.Dropped) != 1 {
|
|
t.Errorf("split: kept=%d dropped=%d (want 1/1)", len(res.Kept), len(res.Dropped))
|
|
}
|
|
if res.TotalIn != 2 {
|
|
t.Errorf("TotalIn: want 2, got %d", res.TotalIn)
|
|
}
|
|
if res.FocusPath != focus.Path {
|
|
t.Errorf("FocusPath echo: want %q, got %q", focus.Path, res.FocusPath)
|
|
}
|
|
// Sanity: everything in Kept has Relevance >= threshold.
|
|
for _, c := range res.Kept {
|
|
if c.Relevance < DefaultRelevanceThreshold {
|
|
t.Errorf("kept chunk below threshold: %v", c)
|
|
}
|
|
}
|
|
for _, c := range res.Dropped {
|
|
if c.Relevance >= DefaultRelevanceThreshold {
|
|
t.Errorf("dropped chunk at/above threshold: %v", c)
|
|
}
|
|
}
|
|
}
|
|
|
|
// TestFilterChunks_AdjacencyPollutionScenario is the headline test —
// the exact case the filter exists to catch. The focus file
// (crates/queryd/src/db.go) defines Connector and imports
// catalogd::Registry; a chunk describing Registry is adjacency
// pollution and must rank below the chunk describing Connector.
func TestFilterChunks_AdjacencyPollutionScenario(t *testing.T) {
	focus := FocusFile{
		Path: "crates/queryd/src/db.go",
		Content: `
package queryd

import "catalogd"

pub struct Connector {}
pub fn open_connector() *Connector { return nil }
use catalogd::Registry;
`,
	}
	chunks := []CandidateChunk{
		{
			Source: "lakehouse_symbols_v1", DocID: "symbol:queryd::struct::Connector",
			Text: "Connector wraps the DuckDB handle. open_connector creates one.",
		},
		{
			Source: "lakehouse_symbols_v1", DocID: "symbol:catalogd::struct::Registry",
			Text: "Registry stores manifests. Used by ingestd and queryd.",
		},
	}
	res := FilterChunks(focus, chunks, DefaultRelevanceThreshold)

	// The Connector chunk must survive the filter (defined_match).
	var keptIDs []string
	for _, kept := range res.Kept {
		keptIDs = append(keptIDs, kept.DocID)
	}
	if !contains(keptIDs, "symbol:queryd::struct::Connector") {
		t.Errorf("expected Connector chunk kept; got %v", keptIDs)
	}

	// The Registry chunk MIGHT pass threshold depending on token_overlap
	// noise (queryd appears in its text too). The load-bearing assertion:
	// Connector ranks ≥ Registry.
	connectorRel, registryRel := -999.0, -999.0
	for _, scored := range append(res.Kept, res.Dropped...) {
		switch {
		case strings.Contains(scored.DocID, "Connector"):
			connectorRel = scored.Relevance
		case strings.Contains(scored.DocID, "Registry"):
			registryRel = scored.Relevance
		}
	}
	if connectorRel <= registryRel {
		t.Errorf("Connector should outrank Registry: connector=%.2f registry=%.2f", connectorRel, registryRel)
	}
}
|
|
|
|
// contains reports whether needle occurs in haystack.
func contains(haystack []string, needle string) bool {
	for i := range haystack {
		if haystack[i] == needle {
			return true
		}
	}
	return false
}
|