// Package matrix implements a heuristic relevance filter for
// matrix-retrieved chunks. Port of
// /home/profit/lakehouse/mcp-server/relevance.ts (Rust system).
//
// What it does: drops "adjacency pollution" — chunks that scored
// well on cosine but are actually about code the focus file IMPORTS,
// not the focus file itself. Without this, a reviewer LLM
// hallucinates imported-crate internals as belonging to the focus
// file ("I see main.rs does X" when X is in queryd::context that
// main.rs only calls through).
//
// IMPORTANT: this filter is CODE-aware. The signals are pub fn,
// struct, enum, use, import, file paths. It works for the eventual
// lakehouse_arch_v1 / lakehouse_symbols_v1 / scrum_findings_v1
// corpora ports. It will NOT meaningfully filter staffing data
// (candidates, workers, placements) — those need a different
// mechanism (structured constraints + status gates) that lives
// outside this package. See the candidates reality test 2026-04-29
// for the kind of staffing-side mismatch this filter doesn't fix.
//
// Scoring signals (each magnitude is at most 1.0; they are summed,
// and import_penalty can push the total negative):
//
//	path_match      +1.0  chunk.source/doc_id encodes focus.path
//	filename_match  +0.6  chunk text mentions focus's filename
//	defined_match   +0.6  chunk text mentions focus.defined_symbols
//	token_overlap   +0.4  jaccard of non-stopword tokens
//	prefix_match    +0.3  chunk source shares first-2-segment prefix
//	import_penalty  -0.5  mentions ONLY imported symbols, no defined ones
//
// Threshold default 0.3 — same value the Rust observer ships.
package matrix

import (
	"fmt"
	"regexp"
	"strings"
)

// DefaultRelevanceThreshold is the value the Rust observer ships.
// Empirically tuned to keep direct hits and drop adjacency pollution.
const DefaultRelevanceThreshold = 0.3

// stopwords is the same list as relevance.ts. Includes English
// articles + common Rust/TS keywords that would otherwise flood
// jaccard scores between any two source files.
var stopwords = func() map[string]struct{} {
	list := []string{
		"the", "a", "an", "and", "or", "but", "if", "then", "else",
		"is", "are", "was", "were", "be", "been", "being",
		"of", "in", "on", "at", "to", "for", "with", "by", "from", "as",
		"that", "this", "these", "those", "it", "its", "they", "them",
		"their", "we", "our", "you", "your", "i", "me", "my",
		"not", "no", "so", "do", "does", "did", "done",
		"will", "would", "could", "should", "can", "may", "might",
		"must", "shall",
		"fn", "let", "mut", "pub", "use", "mod", "struct", "enum",
		"trait", "impl", "self", "type", "const", "static", "async",
		"await", "return", "match", "ok", "err", "some", "none",
		"into", "from", "ref", "box", "arc", "rc", "vec", "string", "str",
	}
	m := make(map[string]struct{}, len(list))
	for _, s := range list {
		m[s] = struct{}{}
	}
	return m
}()

// FocusFile is what we're filtering chunks against. Path is required
// for path_match; Content lets the filter auto-extract Defined and
// ImportedSymbols when callers haven't already done so.
type FocusFile struct {
	Path            string
	Content         string
	DefinedSymbols  []string
	ImportedSymbols []string
}
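// Illustrative only (hypothetical focus file, not from the TS port):
// what auto-extraction derives when only Content is supplied. Given
//
//	focus := FocusFile{
//		Path:    "crates/queryd/src/main.rs",
//		Content: "use queryd::context::Ctx;\npub fn run() {}\n",
//	}
//
// ExtractDefinedSymbols yields ["run"] and ExtractImportedSymbols
// yields ["queryd", "context", "Ctx"], so a chunk that mentions Ctx
// but never run is a candidate for the import_only penalty below.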
// CandidateChunk is a single retrieved item to score. Source is the
// corpus name; DocID is the chunk identifier; Score is the upstream
// cosine signal (carried through but not used by this filter — the
// matrix layer uses cosine for ranking, this filter for retention).
type CandidateChunk struct {
	Source string  `json:"source"`
	DocID  string  `json:"doc_id"`
	Text   string  `json:"text"`
	Score  float64 `json:"score"`
}

// ScoredChunk wraps a chunk with its computed relevance + the list
// of signals that fired. Reasons makes the filter auditable —
// debugging "why did this chunk get kept/dropped" is the hard part.
type ScoredChunk struct {
	CandidateChunk
	Relevance float64  `json:"relevance"`
	Reasons   []string `json:"reasons"`
}

// FilterResult is the output of FilterChunks. Kept + Dropped are
// disjoint and together cover the input. TotalIn is for sanity
// checks; FocusPath echoes the input for logging.
type FilterResult struct {
	Kept      []ScoredChunk `json:"kept"`
	Dropped   []ScoredChunk `json:"dropped"`
	Threshold float64       `json:"threshold"`
	FocusPath string        `json:"focus_path"`
	TotalIn   int           `json:"total_in"`
}

// tokenRe mirrors the TS regex /[a-z_][a-z0-9_]{2,}/g — RE2-compatible
// as written.
var tokenRe = regexp.MustCompile(`[a-z_][a-z0-9_]{2,}`)

// Tokenize lowercases, splits on identifier boundaries (>=3 chars),
// and drops stopwords. Used by Jaccard for token_overlap.
func Tokenize(text string) map[string]struct{} {
	out := make(map[string]struct{})
	if text == "" {
		return out
	}
	for _, m := range tokenRe.FindAllString(strings.ToLower(text), -1) {
		if _, skip := stopwords[m]; skip {
			continue
		}
		out[m] = struct{}{}
	}
	return out
}

// Jaccard returns |A ∩ B| / |A ∪ B|. 0 when either set is empty
// (matches the TS contract).
func Jaccard(a, b map[string]struct{}) float64 {
	if len(a) == 0 || len(b) == 0 {
		return 0
	}
	var inter int
	for k := range a {
		if _, ok := b[k]; ok {
			inter++
		}
	}
	union := len(a) + len(b) - inter
	if union == 0 {
		return 0
	}
	return float64(inter) / float64(union)
}
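// Illustrative only: how Tokenize and Jaccard combine for
// token_overlap. Stopwords and identifiers shorter than three
// characters never reach the comparison:
//
//	a := Tokenize("pub fn parse_row(input: &str)") // {parse_row, input}
//	b := Tokenize("fn parse_row in the reader")    // {parse_row, reader}
//	Jaccard(a, b)                                  // 1/3 ≈ 0.33
//
// "pub", "str", and "the" drop as stopwords; "fn" and "in" are too
// short for tokenRe to match at all.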
// ExtractDefinedSymbols pulls pub-symbol names from Rust/TS source.
// Conservative — would rather miss a symbol than over-match. Patterns
// match exactly the TS impl; \b and (?:...) are RE2-supported. Case
// sensitivity matches TS: pub fn is lowercase, struct/enum/trait/etc.
// are PascalCase, const is SCREAMING_CASE. Only the "pub fn" match
// uses (?i) because TS uses /gi explicitly there (the rest are /g).
var definedPatterns = []*regexp.Regexp{
	regexp.MustCompile(`(?i)\bpub\s+(?:async\s+)?fn\s+([a-z_][a-z0-9_]*)`),
	regexp.MustCompile(`\bpub\s+struct\s+([A-Z][A-Za-z0-9_]*)`),
	regexp.MustCompile(`\bpub\s+enum\s+([A-Z][A-Za-z0-9_]*)`),
	regexp.MustCompile(`\bpub\s+trait\s+([A-Z][A-Za-z0-9_]*)`),
	regexp.MustCompile(`\bpub\s+const\s+([A-Z_][A-Z0-9_]*)`),
	regexp.MustCompile(`\bpub\s+type\s+([A-Z][A-Za-z0-9_]*)`),
	regexp.MustCompile(`\bexport\s+(?:async\s+)?function\s+([a-z_][a-zA-Z0-9_]*)`),
	regexp.MustCompile(`\bexport\s+class\s+([A-Z][A-Za-z0-9_]*)`),
	regexp.MustCompile(`\bexport\s+interface\s+([A-Z][A-Za-z0-9_]*)`),
	regexp.MustCompile(`\bexport\s+(?:const|let|var)\s+([a-zA-Z_][a-zA-Z0-9_]*)`),
}

func ExtractDefinedSymbols(content string) []string {
	if content == "" {
		return nil
	}
	seen := make(map[string]struct{})
	var out []string
	for _, re := range definedPatterns {
		for _, m := range re.FindAllStringSubmatch(content, -1) {
			if len(m) < 2 || m[1] == "" {
				continue
			}
			if _, ok := seen[m[1]]; ok {
				continue
			}
			seen[m[1]] = struct{}{}
			out = append(out, m[1])
		}
	}
	return out
}

// rustUseRe matches `use foo::bar::Baz;`, `use foo::{Bar, Baz};`, and
// `use foo::bar as alias;`. Lazy `*?` so the match stops at the first
// `;` rather than running on to a later one.
var rustUseRe = regexp.MustCompile(`\buse\s+([A-Za-z_][A-Za-z0-9_:{}, \n]*?);`)

// tsImportRe matches `import { X, Y } from "foo"` and `import X from "foo"`.
var tsImportRe = regexp.MustCompile(`\bimport\s+(?:\{([^}]+)\}|([A-Za-z_][A-Za-z0-9_]*))\s+from`)

// identRe extracts identifiers from a use/import block.
var identRe = regexp.MustCompile(`[A-Za-z_][A-Za-z0-9_]*`)

// ExtractImportedSymbols pulls imported identifiers from Rust `use`
// and TS `import` statements, skipping path keywords and tokens of
// one or two characters.
func ExtractImportedSymbols(content string) []string {
	if content == "" {
		return nil
	}
	ignore := map[string]bool{
		"use": true, "as": true, "crate": true,
		"super": true, "self": true, "mod": true,
	}
	seen := make(map[string]struct{})
	var out []string
	add := func(tok string) {
		if len(tok) <= 2 {
			return
		}
		if ignore[tok] {
			return
		}
		if _, ok := seen[tok]; ok {
			return
		}
		seen[tok] = struct{}{}
		out = append(out, tok)
	}
	for _, m := range rustUseRe.FindAllStringSubmatch(content, -1) {
		if len(m) < 2 {
			continue
		}
		for _, ident := range identRe.FindAllString(m[1], -1) {
			add(ident)
		}
	}
	for _, m := range tsImportRe.FindAllStringSubmatch(content, -1) {
		if len(m) < 3 {
			continue
		}
		block := m[1]
		if block == "" {
			block = m[2]
		}
		for _, ident := range identRe.FindAllString(block, -1) {
			add(ident)
		}
	}
	return out
}

// FilePrefix returns the first two path segments joined by "/" —
// e.g. "crates/queryd/src/foo.rs" → "crates/queryd". Used for cheap
// "same crate" comparisons; mirrors pathway_memory's notion.
func FilePrefix(path string) string {
	parts := strings.Split(path, "/")
	if len(parts) > 2 {
		parts = parts[:2]
	}
	return strings.Join(parts, "/")
}

// ScoreRelevance computes the additive score plus the list of signals
// that fired. The total is not clamped: strong positive signals can
// stack past 1.0, and negative scores are possible (import_penalty
// without a compensating positive signal). Pure function — no side
// effects, no I/O.
func ScoreRelevance(focus FocusFile, chunk CandidateChunk) (float64, []string) {
	var score float64
	var reasons []string

	focusPath := focus.Path
	focusBase := ""
	if focusPath != "" {
		parts := strings.Split(focusPath, "/")
		focusBase = parts[len(parts)-1]
	}
	chunkText := chunk.Text
	chunkSource := chunk.Source
	chunkDocID := chunk.DocID

	// path_match: chunk's provenance encodes the focus path or filename.
	if focusPath != "" &&
		(strings.Contains(chunkSource, focusPath) ||
			strings.Contains(chunkDocID, focusPath) ||
			strings.Contains(chunkText, focusPath)) {
		score += 1.0
		reasons = append(reasons, "path_match")
	} else if focusBase != "" && len(focusBase) > 4 &&
		(strings.Contains(chunkText, focusBase) ||
			strings.Contains(chunkDocID, focusBase)) {
		score += 0.6
		reasons = append(reasons, "filename_match")
	}

	// defined_match: chunk text mentions symbols this file actually defines.
	defined := focus.DefinedSymbols
	if len(defined) == 0 && focus.Content != "" {
		defined = ExtractDefinedSymbols(focus.Content)
	}
	if len(defined) > 0 {
		var hits int
		for _, s := range defined {
			if len(s) > 2 && strings.Contains(chunkText, s) {
				hits++
			}
		}
		if hits > 0 {
			// hits <= len(defined), so the ratio is already in 0..1.
			ratio := float64(hits) / float64(len(defined))
			score += 0.6 * ratio
			reasons = append(reasons, fmt.Sprintf("defined_match(%d/%d)", hits, len(defined)))
		}
	}

	// token_overlap: jaccard of non-stopword tokens.
	if focus.Content != "" {
		overlap := Jaccard(Tokenize(focus.Content), Tokenize(chunkText))
		if overlap > 0.05 {
			score += 0.4 * overlap
			reasons = append(reasons, fmt.Sprintf("token_overlap(%.2f)", overlap))
		}
	}

	// prefix_match: same first-2-segments (e.g. crates/queryd).
	if focusPath != "" {
		fp := FilePrefix(focusPath)
		if fp != "" &&
			(strings.Contains(chunkSource, fp) ||
				strings.Contains(chunkDocID, fp) ||
				strings.Contains(chunkText, fp)) {
			score += 0.3
			reasons = append(reasons, "prefix_match")
		}
	}

	// import_penalty: chunk mentions only imported symbols, no defined
	// ones. Strong signal of adjacency pollution — the chunk is about
	// what we IMPORT, not what we ARE.
	imported := focus.ImportedSymbols
	if len(imported) == 0 && focus.Content != "" {
		imported = ExtractImportedSymbols(focus.Content)
	}
	if len(imported) > 0 && len(defined) > 0 {
		var importHits, definedHits int
		for _, s := range imported {
			if len(s) > 2 && strings.Contains(chunkText, s) {
				importHits++
			}
		}
		for _, s := range defined {
			if len(s) > 2 && strings.Contains(chunkText, s) {
				definedHits++
			}
		}
		if importHits > 0 && definedHits == 0 {
			score -= 0.5
			reasons = append(reasons, fmt.Sprintf("import_only(%d)", importHits))
		}
	}

	return score, reasons
}
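// Illustrative only (hypothetical chunk, not from the TS port): the
// penalty in action. Reusing the focus sketch from above (defines
// "run", imports queryd/context/Ctx):
//
//	chunk := CandidateChunk{
//		Source: "lakehouse_symbols_v1",
//		DocID:  "queryd-context-0007",
//		Text:   "Ctx holds the shared query context",
//	}
//	score, reasons := ScoreRelevance(focus, chunk)
//	// reasons: ["token_overlap(0.29)", "import_only(2)"]
//	// score:   ≈ -0.39 (0.4*0.29 - 0.5), so FilterChunks drops the
//	// chunk at any non-negative threshold.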
// FilterChunks scores every chunk and partitions by threshold. The
// caller picks the threshold, and a literal 0 is honored as-is
// (caller-as-intent contract: no auto-default substitution). Note
// that 0 keeps every chunk with a non-negative score; chunks driven
// negative by import_penalty still drop.
func FilterChunks(focus FocusFile, chunks []CandidateChunk, threshold float64) FilterResult {
	kept := make([]ScoredChunk, 0, len(chunks))
	dropped := make([]ScoredChunk, 0)
	for _, c := range chunks {
		score, reasons := ScoreRelevance(focus, c)
		sc := ScoredChunk{CandidateChunk: c, Relevance: score, Reasons: reasons}
		if score >= threshold {
			kept = append(kept, sc)
		} else {
			dropped = append(dropped, sc)
		}
	}
	return FilterResult{
		Kept:      kept,
		Dropped:   dropped,
		Threshold: threshold,
		FocusPath: focus.Path,
		TotalIn:   len(chunks),
	}
}
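// IllustrativeFilterPass is a usage sketch, not part of the ported TS
// surface: run a retrieval batch through the filter at the shipped
// default threshold and log drops with their audit trail. The focus
// path here is a hypothetical placeholder.
func IllustrativeFilterPass(src string, retrieved []CandidateChunk) FilterResult {
	focus := FocusFile{
		Path:    "crates/queryd/src/main.rs", // hypothetical focus file
		Content: src,                         // Defined/Imported auto-extracted
	}
	res := FilterChunks(focus, retrieved, DefaultRelevanceThreshold)
	for _, d := range res.Dropped {
		fmt.Printf("dropped %s (%.2f): %v\n", d.DocID, d.Relevance, d.Reasons)
	}
	return res
}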