LAKEHOUSE/internal/matrix/relevance_test.go
root 9588bd82ae matrix: relevance filter — SPEC §3.4 component 3 of 5
Faithful port of mcp-server/relevance.ts (Rust observer's adjacency-
pollution filter). Same 5-signal scoring, same default threshold 0.3.
Adds POST /v1/matrix/relevance endpoint via matrixd.

Scoring signals (additive, can sign-flip):
  path_match     +1.0  chunk source/doc_id encodes focus.path
  filename_match +0.6  chunk text mentions focus's filename
  defined_match  +0.6  chunk text mentions focus.defined_symbols
  token_overlap  +0.4  jaccard of non-stopword tokens
  prefix_match   +0.3  chunk source shares first-2-segment prefix
  import_penalty -0.5  mentions ONLY imported symbols, no defined ones

What this does and doesn't do:
  - DOES filter code-aware corpora (eventually lakehouse_arch_v1,
    lakehouse_symbols_v1, scrum_findings_v1) — drops chunks about
    code the focus file IMPORTS rather than DEFINES, the
    "adjacency pollution" pattern that makes a reviewer LLM
    hallucinate imported-crate internals as belonging to the focus
  - DOES NOT meaningfully filter staffing data — the candidates
    reality test 2026-04-29 had "exact skill match buried at #3"
    which is a different problem (semantic-only ranking dominated
    by secondary text). Staffing needs structured filtering
    (status gates, location gates) that lives outside this
    package — future work, not in SPEC §3.4 yet

Headline smoke assertion: focus = crates/queryd/src/db.go which
defines Connector and imports catalogd::Registry. The filter
scores:
  Connector chunk: +0.68  (defined_match fires, kept)
  Registry chunk: -0.46  (import_only penalty fires, dropped)
  unrelated junk:  0.00  (no signals, dropped)

That's a 1.14-point gap between what we ARE and what we IMPORT —
the entire purpose of the filter.

Tests:
  - 9 unit tests in internal/matrix/relevance_test.go covering
    Tokenize, Jaccard, ExtractDefinedSymbols (Rust + TS),
    ExtractImportedSymbols, FilePrefix, ScoreRelevance per-signal,
    FilterChunks threshold splitting, and the headline
    AdjacencyPollutionScenario
  - scripts/relevance_smoke.sh integration smoke (3 assertions PASS):
    adjacency-pollution scenario, empty-chunks 400, threshold honored

13-smoke regression sweep all green (D1-D6, G1, G1P, G2,
storaged_cap, pathway, matrix, relevance).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 19:13:22 -05:00

290 lines
8.7 KiB
Go

package matrix
import (
"strings"
"testing"
)
// TestTokenize verifies stopword removal, lowercasing, and the
// minimum-token-length cutoff across representative inputs. Each case
// checks both the token count and membership of every expected token.
func TestTokenize(t *testing.T) {
	cases := []struct {
		text string
		want []string // expected tokens (sorted check inside)
	}{
		{"", nil},
		{"the quick brown fox", []string{"quick", "brown", "fox"}}, // stopwords dropped
		{"hello WORLD", []string{"hello", "world"}},                // lowercase
		{"a b c", nil},                                             // all under 3 chars
		{"struct Foo", []string{"foo"}},                            // "struct" is a stopword, identifiers OK
		{"crates/queryd/db.go", []string{"crates", "queryd"}},      // db.go: "db" is 2 chars, "go" is 2 chars
	}
	for _, tc := range cases {
		tokens := Tokenize(tc.text)
		if len(tokens) != len(tc.want) {
			t.Errorf("Tokenize(%q): want %d tokens %v, got %d %v", tc.text, len(tc.want), tc.want, len(tokens), tokens)
			continue
		}
		for _, expected := range tc.want {
			if _, present := tokens[expected]; !present {
				t.Errorf("Tokenize(%q): missing token %q in %v", tc.text, expected, tokens)
			}
		}
	}
}
// TestJaccard checks Jaccard similarity over token sets at its boundary
// cases: both sets empty, one empty, identical, disjoint, and partial
// overlap (|{b}| / |{a,b,c}| = 1/3, compared within epsilon).
func TestJaccard(t *testing.T) {
	// mk builds a token set from its arguments. The loop variable is
	// named tok — the original `t` shadowed the *testing.T parameter.
	mk := func(tokens ...string) map[string]struct{} {
		m := make(map[string]struct{}, len(tokens))
		for _, tok := range tokens {
			m[tok] = struct{}{}
		}
		return m
	}
	cases := []struct {
		name    string
		a, b    map[string]struct{}
		want    float64
		epsilon float64 // 0 for exact comparisons, small for fractions
	}{
		{"both empty", mk(), mk(), 0, 0},
		{"a empty", mk(), mk("x"), 0, 0},
		{"identical", mk("x", "y"), mk("x", "y"), 1, 0},
		{"disjoint", mk("a", "b"), mk("c", "d"), 0, 0},
		{"half overlap", mk("a", "b"), mk("b", "c"), 1.0 / 3.0, 0.001},
	}
	for _, c := range cases {
		got := Jaccard(c.a, c.b)
		if got < c.want-c.epsilon || got > c.want+c.epsilon {
			t.Errorf("%s: want %.3f, got %.3f", c.name, c.want, got)
		}
	}
}
// TestExtractDefinedSymbols verifies definition extraction for both
// Rust (pub fn/struct/enum/trait/const/type) and TypeScript (export
// function/class/interface/const/let), and that non-pub / non-export
// declarations are excluded. Both sections use an exact-length check
// plus per-symbol membership; the original omitted the length check on
// the TS side, letting unexpected extra symbols pass silently.
func TestExtractDefinedSymbols(t *testing.T) {
	rust := `
pub fn search_chunks(query: &str) -> Vec<Chunk> { todo!() }
pub async fn build_index() {}
pub struct ChunkRegistry {}
pub enum Distance { Cosine, Euclidean }
pub trait Searcher {}
pub const MAX_K: usize = 1000;
pub type ChunkMap = HashMap<String, Chunk>;
fn private_helper() {} // not pub, must NOT match
struct PrivateOnly {} // not pub, must NOT match
`
	got := ExtractDefinedSymbols(rust)
	want := []string{"search_chunks", "build_index", "ChunkRegistry", "Distance", "Searcher", "MAX_K", "ChunkMap"}
	if len(got) != len(want) {
		t.Errorf("Rust extract: want %v, got %v", want, got)
	}
	for _, w := range want {
		if !contains(got, w) {
			t.Errorf("Rust: missing %q in %v", w, got)
		}
	}
	// Negative cases — these should NOT match.
	for _, neg := range []string{"private_helper", "PrivateOnly"} {
		if contains(got, neg) {
			t.Errorf("Rust: should not match %q in %v", neg, got)
		}
	}
	ts := `
export function tokenize(text: string) {}
export async function loadCorpus() {}
export class IndexRegistry {}
export interface FocusFile {}
export const STOPWORDS = new Set();
export let counter = 0;
function privateTs() {} // not export, must NOT match
class Internal {} // not export, must NOT match
`
	got = ExtractDefinedSymbols(ts)
	want = []string{"tokenize", "loadCorpus", "IndexRegistry", "FocusFile", "STOPWORDS", "counter"}
	// Mirror the Rust section: catch unexpected extras, not just misses.
	if len(got) != len(want) {
		t.Errorf("TS extract: want %v, got %v", want, got)
	}
	for _, w := range want {
		if !contains(got, w) {
			t.Errorf("TS: missing %q in %v", w, got)
		}
	}
	for _, neg := range []string{"privateTs", "Internal"} {
		if contains(got, neg) {
			t.Errorf("TS: should not match %q in %v", neg, got)
		}
	}
}
// TestExtractImportedSymbols verifies that Rust `use` statements
// (plain, braced-group, and nested std paths) and TypeScript `import`
// statements (named and default) yield their symbols, while language
// keywords are excluded.
func TestExtractImportedSymbols(t *testing.T) {
	rust := `
use catalogd::Registry;
use vectord::{Index, IndexParams};
use std::collections::HashMap;
`
	got := ExtractImportedSymbols(rust)
	rustWant := []string{"catalogd", "Registry", "vectord", "Index", "IndexParams", "collections", "HashMap"}
	for _, w := range rustWant {
		if !contains(got, w) {
			t.Errorf("Rust use: missing %q in %v", w, got)
		}
	}
	for _, keyword := range []string{"use", "as"} {
		if contains(got, keyword) {
			t.Errorf("Rust use: should not match keyword %q in %v", keyword, got)
		}
	}
	ts := `
import { tokenize, jaccard } from "./relevance";
import express from "express";
`
	got = ExtractImportedSymbols(ts)
	tsWant := []string{"tokenize", "jaccard", "express"}
	for _, w := range tsWant {
		if !contains(got, w) {
			t.Errorf("TS import: missing %q in %v", w, got)
		}
	}
}
// TestFilePrefix verifies first-two-segment prefix extraction,
// including the short-path and empty-path fallbacks.
func TestFilePrefix(t *testing.T) {
	for _, tc := range []struct {
		path, want string
	}{
		{"crates/queryd/src/foo.rs", "crates/queryd"},
		{"top.rs", "top.rs"}, // fewer than two segments: returned unchanged
		{"a/b/c/d", "a/b"},
		{"", ""},
	} {
		if got := FilePrefix(tc.path); got != tc.want {
			t.Errorf("FilePrefix(%q): want %q, got %q", tc.path, tc.want, got)
		}
	}
}
// TestScoreRelevance_PathMatch asserts the strongest positive signal:
// a chunk whose text carries the focus path scores at least +1.0 and
// records "path_match" among its reasons.
func TestScoreRelevance_PathMatch(t *testing.T) {
	focus := FocusFile{Path: "crates/queryd/db.go"}
	score, reasons := ScoreRelevance(focus, CandidateChunk{
		Source: "lakehouse_arch_v1",
		DocID:  "phase:queryd",
		Text:   "code at crates/queryd/db.go does X",
	})
	if score < 1.0 {
		t.Errorf("path_match should give >=1.0; got %.2f reasons=%v", score, reasons)
	}
	if !contains(reasons, "path_match") {
		t.Errorf("expected path_match in reasons: %v", reasons)
	}
}
// TestScoreRelevance_ImportPenalty: a chunk mentioning only a symbol
// the focus IMPORTS (never one it DEFINES) must take the import_only
// penalty and, absent other positive signals, score net-negative.
func TestScoreRelevance_ImportPenalty(t *testing.T) {
	focus := FocusFile{
		Path:            "crates/foo/main.go",
		Content:         "pub fn run() {}\npub struct Foo {}\nuse barlib::Bar;\n",
		DefinedSymbols:  []string{"Foo"},
		ImportedSymbols: []string{"Bar"},
	}
	score, reasons := ScoreRelevance(focus, CandidateChunk{
		Source: "barlib_corpus",
		DocID:  "barlib:Bar:42",
		Text:   "Bar handles the actual lookup logic and returns a Result.",
	})
	if !contains(reasons, "import_only(1)") {
		t.Errorf("expected import_only penalty: reasons=%v score=%.2f", reasons, score)
	}
	if score >= 0 {
		// Without other positive signals, score should be net-negative.
		t.Errorf("expected negative net score; got %.2f reasons=%v", score, reasons)
	}
}
// TestFilterChunks_ThresholdSplitsKeptDropped exercises the
// kept/dropped partition, the echoed metadata (TotalIn, FocusPath),
// and a sanity sweep that every chunk landed on the correct side of
// the threshold.
func TestFilterChunks_ThresholdSplitsKeptDropped(t *testing.T) {
	focus := FocusFile{Path: "crates/queryd/db.go"}
	res := FilterChunks(focus, []CandidateChunk{
		{Source: "code", DocID: "queryd:db.go", Text: "crates/queryd/db.go is the focus"}, // path match → kept
		{Source: "elsewhere", DocID: "phase:0", Text: "no match anywhere"},                // dropped
	}, DefaultRelevanceThreshold)
	if len(res.Kept) != 1 || len(res.Dropped) != 1 {
		t.Errorf("split: kept=%d dropped=%d (want 1/1)", len(res.Kept), len(res.Dropped))
	}
	if res.TotalIn != 2 {
		t.Errorf("TotalIn: want 2, got %d", res.TotalIn)
	}
	if res.FocusPath != focus.Path {
		t.Errorf("FocusPath echo: want %q, got %q", focus.Path, res.FocusPath)
	}
	// Sanity: everything in Kept has Relevance >= threshold.
	for _, kept := range res.Kept {
		if kept.Relevance < DefaultRelevanceThreshold {
			t.Errorf("kept chunk below threshold: %v", kept)
		}
	}
	for _, dropped := range res.Dropped {
		if dropped.Relevance >= DefaultRelevanceThreshold {
			t.Errorf("dropped chunk at/above threshold: %v", dropped)
		}
	}
}
// TestFilterChunks_AdjacencyPollutionScenario is the headline test —
// the exact case the filter exists to catch. Focus file is
// crates/queryd/src/db.go which defines Connector and imports
// catalogd::Registry. A chunk about catalogd::Registry should be
// dropped (adjacency); a chunk about Connector should be kept.
func TestFilterChunks_AdjacencyPollutionScenario(t *testing.T) {
	focus := FocusFile{
		Path: "crates/queryd/src/db.go",
		Content: `
package queryd
import "catalogd"
pub struct Connector {}
pub fn open_connector() *Connector { return nil }
use catalogd::Registry;
`,
	}
	res := FilterChunks(focus, []CandidateChunk{
		{
			Source: "lakehouse_symbols_v1", DocID: "symbol:queryd::struct::Connector",
			Text: "Connector wraps the DuckDB handle. open_connector creates one.",
		},
		{
			Source: "lakehouse_symbols_v1", DocID: "symbol:catalogd::struct::Registry",
			Text: "Registry stores manifests. Used by ingestd and queryd.",
		},
	}, DefaultRelevanceThreshold)
	// Connector chunk should be kept (defined_match).
	var keptIDs []string
	for _, kept := range res.Kept {
		keptIDs = append(keptIDs, kept.DocID)
	}
	if !contains(keptIDs, "symbol:queryd::struct::Connector") {
		t.Errorf("expected Connector chunk kept; got %v", keptIDs)
	}
	// The Registry chunk MIGHT pass threshold depending on token_overlap
	// noise (queryd appears in its text too). The load-bearing assertion:
	// Connector ranks ≥ Registry.
	connectorRel, registryRel := -999.0, -999.0
	record := func(docID string, rel float64) {
		if strings.Contains(docID, "Connector") {
			connectorRel = rel
		}
		if strings.Contains(docID, "Registry") {
			registryRel = rel
		}
	}
	for _, c := range res.Kept {
		record(c.DocID, c.Relevance)
	}
	for _, c := range res.Dropped {
		record(c.DocID, c.Relevance)
	}
	if connectorRel <= registryRel {
		t.Errorf("Connector should outrank Registry: connector=%.2f registry=%.2f", connectorRel, registryRel)
	}
}
// contains reports whether needle appears as an element of haystack.
// A nil haystack is treated as empty.
func contains(haystack []string, needle string) bool {
	for i := range haystack {
		if haystack[i] == needle {
			return true
		}
	}
	return false
}