root 9588bd82ae matrix: relevance filter — SPEC §3.4 component 3 of 5
Faithful port of mcp-server/relevance.ts (Rust observer's adjacency-
pollution filter). Same 5-signal scoring, same default threshold 0.3.
Adds POST /v1/matrix/relevance endpoint via matrixd.
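
Hedged wire sketch (chunk/result field names come from the Go JSON
tags in the file below; the top-level request keys are assumptions,
see scripts/relevance_smoke.sh for the real shape):

  POST /v1/matrix/relevance
  {"focus": {"path": "crates/queryd/src/db.rs"},
   "chunks": [{"source": "lakehouse_arch_v1", "doc_id": "a1",
               "text": "Connector wraps the pool", "score": 0.9}],
   "threshold": 0.3}
  → 200 {"kept": [...], "dropped": [...], "threshold": 0.3,
         "focus_path": "crates/queryd/src/db.rs", "total_in": 1}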

Scoring signals (additive, can sign-flip; sketch below):
  path_match     +1.0  chunk source/doc_id encodes focus.path
  filename_match +0.6  chunk text mentions focus's filename
  defined_match  +0.6  chunk text mentions focus.defined_symbols
  token_overlap  +0.4  jaccard of non-stopword tokens
  prefix_match   +0.3  chunk source shares first-2-segment prefix
  import_penalty -0.5  mentions ONLY imported symbols, no defined ones
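
A minimal Go sketch of the scoring call (identifiers match
internal/matrix below; rustSource and chunk are stand-ins):

  focus := matrix.FocusFile{
      Path:    "crates/queryd/src/db.rs",
      Content: rustSource, // DefinedSymbols/ImportedSymbols auto-extracted
  }
  score, reasons := matrix.ScoreRelevance(focus, chunk)
  // reasons names each signal that fired, e.g. "defined_match(1/1)"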

What this does and doesn't do:
  - DOES filter code-aware corpora (eventually lakehouse_arch_v1,
    lakehouse_symbols_v1, scrum_findings_v1) — drops chunks about
    code the focus file IMPORTS rather than DEFINES, the
    "adjacency pollution" pattern that makes a reviewer LLM
    hallucinate imported-crate internals as belonging to the focus
  - DOES NOT meaningfully filter staffing data — the candidates
    reality test 2026-04-29 had "exact skill match buried at #3"
    which is a different problem (semantic-only ranking dominated
    by secondary text). Staffing needs structured filtering
    (status gates, location gates) that lives outside this
    package — future work, not in SPEC §3.4 yet

Headline smoke assertion: focus = crates/queryd/src/db.rs, which
defines Connector and imports catalogd::Registry. The filter
scores:
  Connector chunk: +0.68  (defined_match fires, kept)
  Registry chunk:  -0.46  (import_only penalty fires, dropped)
  unrelated junk:   0.00  (no signals, dropped)

That's a 1.14-point gap between what we ARE and what we IMPORT —
the entire purpose of the filter.
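
A hedged reconstruction of that scenario against the package API
(chunk texts are illustrative, not the exact test fixtures):

  focus := matrix.FocusFile{
      Path: "crates/queryd/src/db.rs",
      Content: "pub struct Connector { /* ... */ }\n" +
          "use catalogd::Registry;\n",
  }
  res := matrix.FilterChunks(focus, []matrix.CandidateChunk{
      {DocID: "a", Text: "Connector pools queryd handles"},
      {DocID: "b", Text: "Registry maps catalog names"},
  }, matrix.DefaultRelevanceThreshold)
  // res.Kept: the Connector chunk (defined_match + token_overlap)
  // res.Dropped: the Registry chunk (import_only penalty outweighs
  // its small token_overlap)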

Tests:
  - 9 unit tests in internal/matrix/relevance_test.go covering
    Tokenize, Jaccard, ExtractDefinedSymbols (Rust + TS),
    ExtractImportedSymbols, FilePrefix, ScoreRelevance per-signal,
    FilterChunks threshold splitting, and the headline
    AdjacencyPollutionScenario
  - scripts/relevance_smoke.sh integration smoke (3 assertions PASS):
    adjacency-pollution scenario, empty-chunks 400, threshold honored

13-smoke regression sweep all green (D1-D6, G1, G1P, G2,
storaged_cap, pathway, matrix, relevance).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 19:13:22 -05:00

package matrix

// Heuristic relevance filter for matrix-retrieved chunks. Port of
// /home/profit/lakehouse/mcp-server/relevance.ts (Rust system).
//
// What it does: drops "adjacency pollution" — chunks that scored
// well on cosine but are actually about code the focus file IMPORTS,
// not the focus file itself. Without this, a reviewer LLM
// hallucinates imported-crate internals as belonging to the focus
// file ("I see main.rs does X" when X is in queryd::context that
// main.rs only calls through).
//
// IMPORTANT: this filter is CODE-aware. The signals are pub fn,
// struct, enum, use, import, file paths. It works for the eventual
// lakehouse_arch_v1 / lakehouse_symbols_v1 / scrum_findings_v1
// corpora ports. It will NOT meaningfully filter staffing data
// (candidates, workers, placements) — those need a different
// mechanism (structured constraints + status gates) that lives
// outside this package. See the candidates reality test 2026-04-29
// for the kind of staffing-side mismatch this filter doesn't fix.
//
// Scoring signals (each capped at its listed weight; additive, and
// the total can sign-flip):
//
//	path_match     +1.0  chunk.source/doc_id encodes focus.path
//	filename_match +0.6  chunk text mentions focus's filename
//	defined_match  +0.6  chunk text mentions focus.defined_symbols
//	token_overlap  +0.4  jaccard of non-stopword tokens
//	prefix_match   +0.3  chunk source shares first-2-segment prefix
//	import_penalty -0.5  mentions ONLY imported symbols, no defined ones
//
// Threshold default 0.3 — same value the Rust observer ships.

import (
	"fmt"
	"regexp"
	"strings"
)

// DefaultRelevanceThreshold is the value the Rust observer ships.
// Empirically tuned to keep direct hits and drop adjacency pollution.
const DefaultRelevanceThreshold = 0.3

// stopwords is the same list as relevance.ts. Includes English
// articles + common Rust/TS keywords that would otherwise flood
// jaccard scores between any two source files.
var stopwords = func() map[string]struct{} {
	list := []string{
		"the", "a", "an", "and", "or", "but", "if", "then", "else", "is", "are", "was", "were",
		"be", "been", "being", "of", "in", "on", "at", "to", "for", "with", "by", "from", "as",
		"that", "this", "these", "those", "it", "its", "they", "them", "their", "we", "our",
		"you", "your", "i", "me", "my", "not", "no", "so", "do", "does", "did", "done",
		"will", "would", "could", "should", "can", "may", "might", "must", "shall",
		"fn", "let", "mut", "pub", "use", "mod", "struct", "enum", "trait", "impl", "self",
		"type", "const", "static", "async", "await", "return", "match", "ok", "err", "some",
		"none", "into", "from", "ref", "box", "arc", "rc", "vec", "string", "str",
	}
	m := make(map[string]struct{}, len(list))
	for _, s := range list {
		m[s] = struct{}{}
	}
	return m
}()

// FocusFile is what we're filtering chunks against. Path is required
// for path_match; Content lets the filter auto-extract Defined and
// ImportedSymbols when callers haven't already done so.
type FocusFile struct {
	Path            string
	Content         string
	DefinedSymbols  []string
	ImportedSymbols []string
}

// CandidateChunk is a single retrieved item to score. Source is the
// corpus name; DocID is the chunk identifier; Score is the upstream
// cosine signal (carried through but not used by this filter — the
// matrix layer uses cosine for ranking, this filter for retention).
type CandidateChunk struct {
	Source string  `json:"source"`
	DocID  string  `json:"doc_id"`
	Text   string  `json:"text"`
	Score  float64 `json:"score"`
}
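
// A serialized CandidateChunk, as a hedged illustration of the JSON
// tags above (values are made up):
//
//	{"source":"lakehouse_arch_v1","doc_id":"arch-042",
//	 "text":"Connector wraps the pool","score":0.83}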

// ScoredChunk wraps a chunk with its computed relevance + the list
// of signals that fired. Reasons makes the filter auditable —
// debugging "why did this chunk get kept/dropped" is the hard part.
type ScoredChunk struct {
	CandidateChunk
	Relevance float64  `json:"relevance"`
	Reasons   []string `json:"reasons"`
}

// FilterResult is the output of FilterChunks. Kept + Dropped are
// disjoint and together cover the input. TotalIn is for sanity
// checks; FocusPath echoes input for logging.
type FilterResult struct {
	Kept      []ScoredChunk `json:"kept"`
	Dropped   []ScoredChunk `json:"dropped"`
	Threshold float64       `json:"threshold"`
	FocusPath string        `json:"focus_path"`
	TotalIn   int           `json:"total_in"`
}

// Tokenize lowercases, splits on identifier boundaries (>=3 chars),
// and drops stopwords. Used by Jaccard for token_overlap. Mirrors
// the TS regex /[a-z_][a-z0-9_]{2,}/g — RE2-compatible as written.
var tokenRe = regexp.MustCompile(`[a-z_][a-z0-9_]{2,}`)

func Tokenize(text string) map[string]struct{} {
	out := make(map[string]struct{})
	if text == "" {
		return out
	}
	for _, m := range tokenRe.FindAllString(strings.ToLower(text), -1) {
		if _, skip := stopwords[m]; skip {
			continue
		}
		out[m] = struct{}{}
	}
	return out
}
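
// Hedged example of Tokenize, derived from the regex and stopword
// list above (not a fixture from the tests):
//
//	Tokenize("pub fn open_connector")
//	// → {"open_connector"}: "pub" is a stopword and "fn" is
//	// shorter than the 3-char minimum.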

// Jaccard returns |A ∩ B| / |A ∪ B|. 0 when either set is empty
// (matches the TS contract).
func Jaccard(a, b map[string]struct{}) float64 {
	if len(a) == 0 || len(b) == 0 {
		return 0
	}
	var inter int
	for k := range a {
		if _, ok := b[k]; ok {
			inter++
		}
	}
	union := len(a) + len(b) - inter
	if union == 0 {
		return 0
	}
	return float64(inter) / float64(union)
}
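
// Worked example: for A = {connector, pool, open} and
// B = {connector, pool, close}, the intersection has 2 elements and
// the union 4, so Jaccard(A, B) = 2.0/4.0 = 0.5.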

// ExtractDefinedSymbols pulls pub-symbol names from Rust/TS source.
// Conservative — would rather miss a symbol than over-match. Patterns
// match exactly the TS impl; \b and (?:...) are RE2-supported. Case-
// sensitivity matches TS: pub fn is lowercase, struct/enum/trait/etc
// are PascalCase, const is SCREAMING_CASE. Only the "pub fn" match
// uses (?i) because TS uses /gi explicitly there (the rest are /g).
var definedPatterns = []*regexp.Regexp{
	regexp.MustCompile(`(?i)\bpub\s+(?:async\s+)?fn\s+([a-z_][a-z0-9_]*)`),
	regexp.MustCompile(`\bpub\s+struct\s+([A-Z][A-Za-z0-9_]*)`),
	regexp.MustCompile(`\bpub\s+enum\s+([A-Z][A-Za-z0-9_]*)`),
	regexp.MustCompile(`\bpub\s+trait\s+([A-Z][A-Za-z0-9_]*)`),
	regexp.MustCompile(`\bpub\s+const\s+([A-Z_][A-Z0-9_]*)`),
	regexp.MustCompile(`\bpub\s+type\s+([A-Z][A-Za-z0-9_]*)`),
	regexp.MustCompile(`\bexport\s+(?:async\s+)?function\s+([a-z_][a-zA-Z0-9_]*)`),
	regexp.MustCompile(`\bexport\s+class\s+([A-Z][A-Za-z0-9_]*)`),
	regexp.MustCompile(`\bexport\s+interface\s+([A-Z][A-Za-z0-9_]*)`),
	regexp.MustCompile(`\bexport\s+(?:const|let|var)\s+([a-zA-Z_][a-zA-Z0-9_]*)`),
}

func ExtractDefinedSymbols(content string) []string {
	if content == "" {
		return nil
	}
	seen := make(map[string]struct{})
	var out []string
	for _, re := range definedPatterns {
		for _, m := range re.FindAllStringSubmatch(content, -1) {
			if len(m) < 2 || m[1] == "" {
				continue
			}
			if _, ok := seen[m[1]]; ok {
				continue
			}
			seen[m[1]] = struct{}{}
			out = append(out, m[1])
		}
	}
	return out
}
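
// Hedged example against the patterns above (illustrative source,
// not a test fixture):
//
//	ExtractDefinedSymbols("pub struct Connector {}\npub fn open() {}")
//	// → ["open", "Connector"]: results follow definedPatterns order
//	// (the pub-fn pattern runs before pub-struct), not source order.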

// rustUseRe matches `use foo::bar::Baz;`, `use foo::{Bar, Baz};`,
// `use foo::bar as alias;`. Lazy `*?` so we don't run into the next
// `;` boundary too eagerly.
var rustUseRe = regexp.MustCompile(`\buse\s+([A-Za-z_][A-Za-z0-9_:{}, \n]*?);`)

// tsImportRe matches `import { X, Y } from "foo"` and `import X from "foo"`.
var tsImportRe = regexp.MustCompile(`\bimport\s+(?:\{([^}]+)\}|([A-Za-z_][A-Za-z0-9_]*))\s+from`)

// identRe extracts identifiers from a use/import block.
var identRe = regexp.MustCompile(`[A-Za-z_][A-Za-z0-9_]*`)

func ExtractImportedSymbols(content string) []string {
	if content == "" {
		return nil
	}
	ignore := map[string]bool{
		"use": true, "as": true, "crate": true, "super": true, "self": true, "mod": true,
	}
	seen := make(map[string]struct{})
	var out []string
	add := func(tok string) {
		if len(tok) <= 2 {
			return
		}
		if ignore[tok] {
			return
		}
		if _, ok := seen[tok]; ok {
			return
		}
		seen[tok] = struct{}{}
		out = append(out, tok)
	}
	for _, m := range rustUseRe.FindAllStringSubmatch(content, -1) {
		if len(m) < 2 {
			continue
		}
		for _, ident := range identRe.FindAllString(m[1], -1) {
			add(ident)
		}
	}
	for _, m := range tsImportRe.FindAllStringSubmatch(content, -1) {
		if len(m) < 3 {
			continue
		}
		block := m[1]
		if block == "" {
			block = m[2]
		}
		for _, ident := range identRe.FindAllString(block, -1) {
			add(ident)
		}
	}
	return out
}
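
// Hedged example against the regexes above (illustrative source):
//
//	ExtractImportedSymbols("use catalogd::Registry;")
//	// → ["catalogd", "Registry"]: both path segments survive; "use"
//	// is on the ignore list and tokens of length <= 2 are skipped.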

// FilePrefix returns the first two path segments joined by "/" —
// e.g. "crates/queryd/src/foo.rs" → "crates/queryd". Used for cheap
// "same crate" comparisons; mirrors pathway_memory's notion.
func FilePrefix(path string) string {
	parts := strings.Split(path, "/")
	if len(parts) > 2 {
		parts = parts[:2]
	}
	return strings.Join(parts, "/")
}

// ScoreRelevance computes the additive 0..1-ish score plus the list
// of signals that fired. Negative scores are possible (import_penalty
// without compensating positive signal). Pure function — no side
// effects, no I/O.
func ScoreRelevance(focus FocusFile, chunk CandidateChunk) (float64, []string) {
	var score float64
	var reasons []string

	focusPath := focus.Path
	focusBase := ""
	if focusPath != "" {
		parts := strings.Split(focusPath, "/")
		focusBase = parts[len(parts)-1]
	}
	chunkText := chunk.Text
	chunkSource := chunk.Source
	chunkDocID := chunk.DocID

	// path_match: chunk's provenance encodes the focus path or filename.
	if focusPath != "" && (strings.Contains(chunkSource, focusPath) ||
		strings.Contains(chunkDocID, focusPath) ||
		strings.Contains(chunkText, focusPath)) {
		score += 1.0
		reasons = append(reasons, "path_match")
	} else if focusBase != "" && len(focusBase) > 4 &&
		(strings.Contains(chunkText, focusBase) || strings.Contains(chunkDocID, focusBase)) {
		score += 0.6
		reasons = append(reasons, "filename_match")
	}

	// defined_match: chunk text mentions symbols this file actually defines.
	defined := focus.DefinedSymbols
	if len(defined) == 0 && focus.Content != "" {
		defined = ExtractDefinedSymbols(focus.Content)
	}
	if len(defined) > 0 {
		var hits int
		for _, s := range defined {
			if len(s) > 2 && strings.Contains(chunkText, s) {
				hits++
			}
		}
		if hits > 0 {
			denom := len(defined)
			if denom < 1 {
				denom = 1
			}
			ratio := float64(hits) / float64(denom)
			if ratio > 1 {
				ratio = 1
			}
			score += 0.6 * ratio
			reasons = append(reasons, fmt.Sprintf("defined_match(%d/%d)", hits, len(defined)))
		}
	}

	// token_overlap: jaccard of non-stopword tokens.
	if focus.Content != "" {
		overlap := Jaccard(Tokenize(focus.Content), Tokenize(chunkText))
		if overlap > 0.05 {
			score += 0.4 * overlap
			reasons = append(reasons, fmt.Sprintf("token_overlap(%.2f)", overlap))
		}
	}

	// prefix_match: same first-2-segments (e.g. crates/queryd).
	if focusPath != "" {
		fp := FilePrefix(focusPath)
		if fp != "" && (strings.Contains(chunkSource, fp) ||
			strings.Contains(chunkDocID, fp) ||
			strings.Contains(chunkText, fp)) {
			score += 0.3
			reasons = append(reasons, "prefix_match")
		}
	}

	// import_penalty: chunk mentions only imported symbols, no defined
	// ones. Strong signal of adjacency pollution — the chunk is about
	// what we IMPORT, not what we ARE.
	imported := focus.ImportedSymbols
	if len(imported) == 0 && focus.Content != "" {
		imported = ExtractImportedSymbols(focus.Content)
	}
	if len(imported) > 0 && len(defined) > 0 {
		var importHits, definedHits int
		for _, s := range imported {
			if len(s) > 2 && strings.Contains(chunkText, s) {
				importHits++
			}
		}
		for _, s := range defined {
			if len(s) > 2 && strings.Contains(chunkText, s) {
				definedHits++
			}
		}
		if importHits > 0 && definedHits == 0 {
			score -= 0.5
			reasons = append(reasons, fmt.Sprintf("import_only(%d)", importHits))
		}
	}

	return score, reasons
}

// FilterChunks scores every chunk and partitions by threshold. The
// caller picks the threshold; pass 0 to keep every chunk that scores
// >= 0 (caller-as-intent contract — no auto-default substitution,
// since a literal 0 is meaningful; note that import-penalized chunks
// can score negative and still drop at threshold 0).
func FilterChunks(focus FocusFile, chunks []CandidateChunk, threshold float64) FilterResult {
	kept := make([]ScoredChunk, 0, len(chunks))
	dropped := make([]ScoredChunk, 0)
	for _, c := range chunks {
		score, reasons := ScoreRelevance(focus, c)
		sc := ScoredChunk{CandidateChunk: c, Relevance: score, Reasons: reasons}
		if score >= threshold {
			kept = append(kept, sc)
		} else {
			dropped = append(dropped, sc)
		}
	}
	return FilterResult{
		Kept:      kept,
		Dropped:   dropped,
		Threshold: threshold,
		FocusPath: focus.Path,
		TotalIn:   len(chunks),
	}
}
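
// Hedged usage sketch (identifiers from this package; focus and
// chunks are whatever the caller retrieved upstream):
//
//	res := FilterChunks(focus, chunks, DefaultRelevanceThreshold)
//	for _, d := range res.Dropped {
//		log.Printf("dropped %s (%.2f): %v", d.DocID, d.Relevance, d.Reasons)
//	}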