diff --git a/mcp-server/observer.ts b/mcp-server/observer.ts
index 9984fb5..4df134f 100644
--- a/mcp-server/observer.ts
+++ b/mcp-server/observer.ts
@@ -17,6 +17,8 @@
  * agents do and helps them not repeat mistakes.
  */
 
+import { filterChunks } from "./relevance";
+
 const GATEWAY = process.env.GATEWAY_URL || "http://localhost:3700";
 const LAKEHOUSE = process.env.LAKEHOUSE_URL || "http://localhost:3100";
 const CYCLE_SECS = parseInt(process.env.OBSERVER_CYCLE || "30");
@@ -578,6 +580,24 @@ function startHttpListener() {
       }).catch((e: Error) =>
         new Response(JSON.stringify({ error: e.message }), { status: 400 }));
     }
+    // ─── Relevance filter (2026-04-25) ───
+    // Drops "adjacency pollution" from matrix-retrieved chunks before
+    // they reach the reviewer LLM. Caller (scrum/agent) posts the
+    // focus file + candidate chunks; observer scores via heuristic
+    // (path/symbol/token signals) and returns kept + dropped lists.
+    // Pure function — no I/O, safe to call hot.
+    if (req.method === "POST" && url.pathname === "/relevance") {
+      return req.json().then((body: any) => {
+        const focus = body.focus_file ?? body.focus ?? {};
+        const chunks = body.chunks ?? [];
+        const threshold = typeof body.threshold === "number" ? body.threshold : 0.3;
+        const result = filterChunks(focus, chunks, threshold);
+        return new Response(JSON.stringify(result), {
+          headers: { "content-type": "application/json" },
+        });
+      }).catch((e: Error) =>
+        new Response(JSON.stringify({ error: e.message }), { status: 400 }));
+    }
     return new Response("not found", { status: 404 });
   },
 });
diff --git a/mcp-server/relevance.test.ts b/mcp-server/relevance.test.ts
new file mode 100644
index 0000000..93b91fa
--- /dev/null
+++ b/mcp-server/relevance.test.ts
@@ -0,0 +1,129 @@
+import { test, expect } from "bun:test";
+import {
+  scoreRelevance,
+  filterChunks,
+  extractDefinedSymbols,
+  extractImportedSymbols,
+  jaccard,
+  tokenize,
+} from "./relevance";
+
+const RUST_FOCUS = `
+use queryd::context::build_context;
+use catalogd::Registry;
+use shared::types::{Tombstone, ModelProfile};
+
+pub struct GatewayState {
+    catalog: Registry,
+}
+
+pub async fn handle_query(state: &GatewayState, sql: &str) -> Result<QueryResponse> {
+    let ctx = build_context(&state.catalog).await?;
+    ctx.sql(sql).await.map(QueryResponse::from)
+}
+
+pub fn shutdown(state: GatewayState) {
+    drop(state);
+}
+`;
+
+test("extractDefinedSymbols pulls pub fn / struct names", () => {
+  const syms = extractDefinedSymbols(RUST_FOCUS);
+  expect(syms).toContain("handle_query");
+  expect(syms).toContain("shutdown");
+  expect(syms).toContain("GatewayState");
+});
+
+test("extractImportedSymbols pulls names from use statements", () => {
+  const syms = extractImportedSymbols(RUST_FOCUS);
+  expect(syms).toContain("build_context");
+  expect(syms).toContain("Registry");
+  expect(syms).toContain("Tombstone");
+  expect(syms).toContain("ModelProfile");
+  // Should not include keywords
+  expect(syms).not.toContain("use");
+  expect(syms).not.toContain("crate");
+});
+
+test("path_match dominates when chunk encodes focus path", () => {
+  const focus = { path: "crates/gateway/src/main.rs", content: RUST_FOCUS };
+  const chunk = {
+    source: "distilled_factual_v20260423095819",
+    doc_id: "crates/gateway/src/main.rs:42",
+    text: "Some chunk content unrelated to anything",
+    score: 0.5,
+  };
+  const { score, reasons } = scoreRelevance(focus, chunk);
+  expect(score).toBeGreaterThanOrEqual(1.0);
+  expect(reasons).toContain("path_match");
+});
+
+test("import_only adjacency pollution gets penalized", () => {
+  // Chunk talks about queryd::context::build_context (imported by focus)
+  // but never mentions any focus-defined symbol — classic pollution.
+  const focus = { path: "crates/gateway/src/main.rs", content: RUST_FOCUS };
+  const chunk = {
+    source: "distilled_procedural_v20260423102847",
+    doc_id: "proc_8421",
+    text: "When build_context fails the Registry must be invalidated. The Tombstone fields drive the merge-on-read filter — caller should not retry on stale fingerprints.",
+    score: 0.65,
+  };
+  const { score, reasons } = scoreRelevance(focus, chunk);
+  expect(reasons.some(r => r.startsWith("import_only("))).toBe(true);
+  expect(score).toBeLessThan(0.3); // below default threshold → dropped
+});
+
+test("defined_match keeps a chunk that's actually about the focus", () => {
+  const focus = { path: "crates/gateway/src/main.rs", content: RUST_FOCUS };
+  const chunk = {
+    source: "distilled_factual_v20260423095819",
+    doc_id: "fact_12",
+    text: "handle_query in GatewayState must return QueryResponse, not anyhow::Error. The shutdown path drops state synchronously.",
+    score: 0.4,
+  };
+  const { score, reasons } = scoreRelevance(focus, chunk);
+  expect(reasons.some(r => r.startsWith("defined_match"))).toBe(true);
+  expect(score).toBeGreaterThan(0.3); // above threshold → kept
+});
+
+test("filterChunks bucket-sorts kept vs dropped", () => {
+  const focus = { path: "crates/gateway/src/main.rs", content: RUST_FOCUS };
+  const chunks = [
+    { source: "x", doc_id: "crates/gateway/src/main.rs:1", text: "anything", score: 0.5 }, // path_match — kept
+    { source: "x", doc_id: "y", text: "build_context Tombstone Registry adjacent only", score: 0.7 }, // import_only — dropped
+    { source: "x", doc_id: "z", text: "handle_query and GatewayState are at fault here", score: 0.4 }, // defined_match — kept
+    { source: "x", doc_id: "w", text: "completely unrelated content about chicago permits", score: 0.6 }, // nothing — dropped
+  ];
+  const result = filterChunks(focus, chunks);
+  expect(result.kept.length).toBe(2);
+  expect(result.dropped.length).toBe(2);
+  expect(result.kept.map(c => c.doc_id)).toContain("crates/gateway/src/main.rs:1");
+  expect(result.kept.map(c => c.doc_id)).toContain("z");
+});
+
+test("threshold override changes filter behavior", () => {
+  const focus = { path: "crates/queryd/src/x.rs", content: "pub fn foo() {}" };
+  const weak = { source: "x", doc_id: "y", text: "foo is referenced here briefly", score: 0.2 };
+  const result_strict = filterChunks(focus, [weak], 0.95);
+  const result_loose = filterChunks(focus, [weak], 0.1);
+  expect(result_strict.kept.length).toBe(0);
+  expect(result_loose.kept.length).toBe(1);
+});
+
+test("empty defined/imported gracefully scores by tokens only", () => {
+  const focus = { path: "doc.md", content: "This is plain prose about welders in Chicago." };
+  const chunk = { source: "x", doc_id: "y", text: "Welders working in Chicago need OSHA certs.", score: 0.5 };
+  const { score, reasons } = scoreRelevance(focus, chunk);
+  expect(score).toBeGreaterThan(0);
+  expect(reasons.some(r => r.startsWith("token_overlap"))).toBe(true);
+});
+
+test("jaccard / tokenize basic sanity", () => {
+  const a = tokenize("the quick brown fox jumps over the lazy dog");
+  const b = tokenize("a fast brown wolf runs over a tired dog");
+  expect(a.has("the")).toBe(false); // stopword
+  expect(a.has("brown")).toBe(true);
+  const j = jaccard(a, b);
+  expect(j).toBeGreaterThan(0);
+  expect(j).toBeLessThan(1);
+});
diff --git a/mcp-server/relevance.ts b/mcp-server/relevance.ts
new file mode 100644
index 0000000..f99d985
--- /dev/null
+++ b/mcp-server/relevance.ts
@@ -0,0 +1,246 @@
+/**
+ * Heuristic relevance filter for matrix-retrieved chunks.
+ *
+ * Drops "adjacency pollution" — chunks that scored well on cosine but
+ * are actually about code the focus file IMPORTS, not the focus file
+ * itself. Without this, the reviewer LLM hallucinates imported-crate
+ * internals as belonging to the focus file ("I see main.rs does X"
+ * when X is in queryd::context that main.rs only calls through).
+ *
+ * Pure functions here; HTTP wiring lives in observer.ts.
+ *
+ * Scoring signals (weights summed; total is NOT clamped — can exceed 1 or go negative):
+ *   path_match      +1.0  chunk provenance/text encodes focus.path (+0.6 filename-only)
+ *   defined_match   +0.6  chunk text mentions focus-defined symbols (scaled by hit ratio)
+ *   token_overlap   +0.4  jaccard of non-stopword tokens
+ *   prefix_match    +0.3  chunk source shares first-2-segment prefix
+ *   import_penalty  -0.5  mentions ONLY imported symbols, no defined ones
+ *
+ * Threshold default 0.3 — empirically tuned to keep direct hits and drop
+ * the obvious adjacency cases. Caller can override per-request.
+ */
+
+const STOPWORDS = new Set([
+  "the","a","an","and","or","but","if","then","else","is","are","was","were",
+  "be","been","being","of","in","on","at","to","for","with","by","from","as",
+  "that","this","these","those","it","its","they","them","their","we","our",
+  "you","your","i","me","my","not","no","so","do","does","did","done",
+  "will","would","could","should","can","may","might","must","shall",
+  "fn","let","mut","pub","use","mod","struct","enum","trait","impl","self",
+  "type","const","static","async","await","return","match","ok","err","some",
+  "none","into","from","ref","box","arc","rc","vec","string","str",
+]);
+
+export interface FocusFile {
+  path: string;
+  content?: string;
+  defined_symbols?: string[];
+  imported_symbols?: string[];
+}
+
+export interface CandidateChunk {
+  source: string;  // corpus name or producer file
+  doc_id: string;  // chunk identifier
+  text: string;
+  score: number;   // upstream cosine score
+}
+
+export interface ScoredChunk extends CandidateChunk {
+  relevance: number;
+  reasons: string[];
+}
+
+export interface FilterResult {
+  kept: ScoredChunk[];
+  dropped: ScoredChunk[];
+  threshold: number;
+  focus_path: string;
+  total_in: number;
+}
+
+export function tokenize(text: string): Set<string> {
+  const out = new Set<string>();
+  if (!text) return out;
+  const words = text.toLowerCase().match(/[a-z_][a-z0-9_]{2,}/g) ?? [];
+  for (const w of words) {
+    if (!STOPWORDS.has(w)) out.add(w);
+  }
+  return out;
+}
+
+export function jaccard(a: Set<string>, b: Set<string>): number {
+  if (a.size === 0 || b.size === 0) return 0;
+  let inter = 0;
+  for (const x of a) if (b.has(x)) inter++;
+  const union = a.size + b.size - inter;
+  return union === 0 ? 0 : inter / union;
+}
+
+function collectMatches(content: string, re: RegExp, group: number): string[] {
+  const out: string[] = [];
+  for (const m of content.matchAll(re)) {
+    if (m[group]) out.push(m[group]);
+  }
+  return out;
+}
+
+/**
+ * Extract pub-symbol names from Rust/TS source. Conservative — we'd
+ * rather miss a symbol than over-match on something unrelated.
+ */
+export function extractDefinedSymbols(content: string): string[] {
+  if (!content) return [];
+  const out = new Set<string>();
+  const patterns: Array<[RegExp, number]> = [
+    [/\bpub\s+(?:async\s+)?fn\s+([a-z_][a-z0-9_]*)/gi, 1],
+    [/\bpub\s+struct\s+([A-Z][A-Za-z0-9_]*)/g, 1],
+    [/\bpub\s+enum\s+([A-Z][A-Za-z0-9_]*)/g, 1],
+    [/\bpub\s+trait\s+([A-Z][A-Za-z0-9_]*)/g, 1],
+    [/\bpub\s+const\s+([A-Z_][A-Z0-9_]*)/g, 1],
+    [/\bpub\s+type\s+([A-Z][A-Za-z0-9_]*)/g, 1],
+    [/\bexport\s+(?:async\s+)?function\s+([a-z_][a-zA-Z0-9_]*)/g, 1],
+    [/\bexport\s+class\s+([A-Z][A-Za-z0-9_]*)/g, 1],
+    [/\bexport\s+interface\s+([A-Z][A-Za-z0-9_]*)/g, 1],
+    [/\bexport\s+(?:const|let|var)\s+([a-zA-Z_][a-zA-Z0-9_]*)/g, 1],
+  ];
+  for (const [re, g] of patterns) {
+    for (const sym of collectMatches(content, re, g)) out.add(sym);
+  }
+  return [...out];
+}
+
+/**
+ * Extract imported symbol names from Rust/TS source. Used as the
+ * negative signal — chunks about THESE belong to other files.
+ */
+export function extractImportedSymbols(content: string): string[] {
+  if (!content) return [];
+  const out = new Set<string>();
+  const ignore = new Set(["use","as","crate","super","self","mod"]);
+  // Rust: use foo::bar::Baz, use foo::{Bar, Baz}, use foo::bar as alias.
+  // Character class must include uppercase or paths like
+  // `use catalogd::Registry;` get skipped because the regex backs off
+  // when it can't extend the captured block past the uppercase letter.
+  const useRe = /\buse\s+([A-Za-z_][A-Za-z0-9_:{}, \n]*?);/g;
+  for (const block of collectMatches(content, useRe, 1)) {
+    for (const ident of block.matchAll(/[A-Za-z_][A-Za-z0-9_]*/g)) {
+      const tok = ident[0];
+      if (tok.length > 2 && !ignore.has(tok)) out.add(tok);
+    }
+  }
+  // TS: import { X, Y } from "foo"; import X from "foo";
+  const tsRe = /\bimport\s+(?:\{([^}]+)\}|([A-Za-z_][A-Za-z0-9_]*))\s+from/g;
+  for (const m of content.matchAll(tsRe)) {
+    const block = m[1] || m[2] || "";
+    for (const ident of block.matchAll(/[A-Za-z_][A-Za-z0-9_]*/g)) {
+      const tok = ident[0];
+      if (tok.length > 2 && tok !== "as") out.add(tok);
+    }
+  }
+  return [...out];
+}
+
+/**
+ * First-2-segment prefix used to compare paths cheaply. Mirrors the
+ * pathway_memory file_prefix() so the same "same crate" notion applies.
+ */
+export function filePrefix(path: string): string {
+  return path.split("/").slice(0, 2).join("/");
+}
+
+export function scoreRelevance(
+  focus: FocusFile,
+  chunk: CandidateChunk,
+): { score: number; reasons: string[] } {
+  const reasons: string[] = [];
+  let score = 0;
+
+  const focusPath = focus.path ?? "";
+  const focusBase = focusPath.split("/").pop() ?? "";
+  const chunkText = chunk.text ?? "";
+  const chunkSource = chunk.source ?? "";
+  const chunkDocId = chunk.doc_id ?? "";
+
+  // path_match: chunk's provenance encodes the focus path or filename.
+  if (focusPath && (chunkSource.includes(focusPath) || chunkDocId.includes(focusPath) || chunkText.includes(focusPath))) {
+    score += 1.0;
+    reasons.push("path_match");
+  } else if (focusBase && focusBase.length > 4 && (chunkText.includes(focusBase) || chunkDocId.includes(focusBase))) {
+    score += 0.6;
+    reasons.push("filename_match");
+  }
+
+  // defined_match: chunk text mentions symbols this file actually defines
+  const defined = focus.defined_symbols ?? (focus.content ? extractDefinedSymbols(focus.content) : []);
+  if (defined.length > 0) {
+    let hits = 0;
+    for (const s of defined) {
+      if (s.length > 2 && chunkText.includes(s)) hits++;
+    }
+    if (hits > 0) {
+      const ratio = Math.min(1, hits / Math.max(1, defined.length));
+      const contrib = 0.6 * ratio;
+      score += contrib;
+      reasons.push(`defined_match(${hits}/${defined.length})`);
+    }
+  }
+
+  // token_overlap: jaccard of non-stopword tokens
+  if (focus.content) {
+    const overlap = jaccard(tokenize(focus.content), tokenize(chunkText));
+    if (overlap > 0.05) {
+      const contrib = 0.4 * overlap;
+      score += contrib;
+      reasons.push(`token_overlap(${overlap.toFixed(2)})`);
+    }
+  }
+
+  // prefix_match: same first-2-segments (e.g. crates/queryd)
+  if (focusPath) {
+    const fp = filePrefix(focusPath);
+    if (fp && (chunkSource.includes(fp) || chunkDocId.includes(fp) || chunkText.includes(fp))) {
+      score += 0.3;
+      reasons.push("prefix_match");
+    }
+  }
+
+  // import_penalty: chunk mentions only symbols this file imports, never
+  // any it defines. Strong signal of adjacency pollution.
+  const imported = focus.imported_symbols ?? (focus.content ? extractImportedSymbols(focus.content) : []);
+  if (imported.length > 0 && defined.length > 0) {
+    let importHits = 0;
+    let definedHits = 0;
+    for (const s of imported) {
+      if (s.length > 2 && chunkText.includes(s)) importHits++;
+    }
+    for (const s of defined) {
+      if (s.length > 2 && chunkText.includes(s)) definedHits++;
+    }
+    if (importHits > 0 && definedHits === 0) {
+      score -= 0.5;
+      reasons.push(`import_only(${importHits})`);
+    }
+  }
+
+  return { score, reasons };
+}
+
+export function filterChunks(
+  focus: FocusFile,
+  chunks: CandidateChunk[],
+  threshold = 0.3,
+): FilterResult {
+  const scored: ScoredChunk[] = chunks.map((c) => {
+    const { score, reasons } = scoreRelevance(focus, c);
+    return { ...c, relevance: score, reasons };
+  });
+  const kept = scored.filter((c) => c.relevance >= threshold);
+  const dropped = scored.filter((c) => c.relevance < threshold);
+  return {
+    kept,
+    dropped,
+    threshold,
+    focus_path: focus.path,
+    total_in: chunks.length,
+  };
+}
diff --git a/tests/real-world/scrum_master_pipeline.ts b/tests/real-world/scrum_master_pipeline.ts
index d4b16d6..9e956e6 100644
--- a/tests/real-world/scrum_master_pipeline.ts
+++ b/tests/real-world/scrum_master_pipeline.ts
@@ -499,6 +499,7 @@ async function fetchMatrixContext(
   filePath: string,
   topPerCorpus = 3,
   topOverall = 8,
+  focusContent?: string,
 ): Promise<MatrixContext> {
   const t0 = Date.now();
   const corpora = MATRIX_CORPORA_FOR_TASK[taskClass] ?? [];
@@ -548,9 +549,48 @@
     errors.pathway_memory = e.message;
   }
 
+  // Optional adjacency-pollution filter via observer /relevance.
+  // Drops chunks that scored well on cosine but are about symbols the
+  // focus file only IMPORTS (not defines). Opt-out with
+  // LH_RELEVANCE_FILTER=0 for A/B comparison.
+  const relevanceEnabled = process.env.LH_RELEVANCE_FILTER !== "0";
+  let filteredHits = allHits;
+  let droppedCount = 0;
+  if (relevanceEnabled && allHits.length > 0) {
+    try {
+      const r = await fetch(`${OBSERVER_URL}/relevance`, {
+        method: "POST",
+        headers: { "content-type": "application/json" },
+        body: JSON.stringify({
+          focus_file: { path: filePath, content: focusContent ?? "" },
+          chunks: allHits.map(h => ({
+            source: h.source_corpus,
+            doc_id: h.doc_id,
+            text: h.text,
+            score: h.score,
+          })),
+          threshold: Number(process.env.LH_RELEVANCE_THRESHOLD ?? 0.3),
+        }),
+        signal: AbortSignal.timeout(5000),
+      });
+      if (r.ok) {
+        const fr: any = await r.json();
+        const keptKeys = new Set((fr.kept ?? []).map((c: any) => `${c.source}|${c.doc_id}`));
+        filteredHits = allHits.filter(h => keptKeys.has(`${h.source_corpus}|${h.doc_id}`));
+        droppedCount = (fr.dropped ?? []).length;
+      } else {
+        errors.relevance_filter = `HTTP ${r.status}`;
+      }
+    } catch (e: any) {
+      // Fall-open: filter failure must not block the pipeline.
+      errors.relevance_filter = e.message;
+    }
+  }
+
   // Sort all hits by score desc, take top N
-  allHits.sort((a, b) => b.score - a.score);
-  const topHits = allHits.slice(0, topOverall);
+  filteredHits.sort((a, b) => b.score - a.score);
+  const topHits = filteredHits.slice(0, topOverall);
+  if (droppedCount > 0) byCorpus._relevance_dropped = droppedCount;
 
   return {
     hits: topHits,
@@ -1349,7 +1389,7 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
   // source so retrieval anchors against both the metadata and the
   // actual code being reviewed.
   const matrixQuery = `${taskClass} ${rel} ${content.slice(0, 500)}`;
-  const matrixCtx = await fetchMatrixContext(matrixQuery, taskClass, rel, 3, 8);
+  const matrixCtx = await fetchMatrixContext(matrixQuery, taskClass, rel, 3, 8, content);
   provenApproachesPreamble = buildMatrixPreamble(matrixCtx);
   const corporaSummary = Object.entries(matrixCtx.by_corpus)
     .map(([k, v]) => `${k.split("_v")[0]}=${v}`).join(" ");