/**
 * Heuristic relevance filter for matrix-retrieved chunks.
 *
 * Drops "adjacency pollution" — chunks that scored well on cosine but
 * are actually about code the focus file IMPORTS, not the focus file
 * itself. Without this, the reviewer LLM hallucinates imported-crate
 * internals as belonging to the focus file ("I see main.rs does X"
 * when X is in queryd::context that main.rs only calls through).
 *
 * Pure functions here; HTTP wiring lives in observer.ts.
 *
 * Scoring signals (additive; the sum is returned as-is — no clamping is
 * applied in this file, so it can be negative or exceed 1):
 *   path_match      +1.0        chunk source/doc_id/text contains focus.path
 *   filename_match  +0.6        else: chunk text/doc_id contains the focus
 *                               file name (only for names longer than 4 chars)
 *   defined_match   +0.6*ratio  chunk text mentions focus.defined_symbols
 *                               (ratio = mentioned / total defined)
 *   token_overlap   +0.4*j      j = jaccard of non-stopword tokens, counted
 *                               only when j > 0.05
 *   prefix_match    +0.3        chunk source/doc_id/text contains the
 *                               first-2-segment prefix of focus.path
 *   import_penalty  -0.5        mentions ONLY imported symbols, no defined ones
 *
 * Threshold default 0.3 — empirically tuned to keep direct hits and drop
 * the obvious adjacency cases. Caller can override per-request.
 */

const STOPWORDS = new Set<string>([
  "the", "a", "an", "and", "or", "but", "if", "then", "else", "is", "are", "was", "were",
  "be", "been", "being", "of", "in", "on", "at", "to", "for", "with", "by", "from", "as",
  "that", "this", "these", "those", "it", "its", "they", "them", "their", "we", "our",
  "you", "your", "i", "me", "my", "not", "no", "so", "do", "does", "did", "done",
  "will", "would", "could", "should", "can", "may", "might", "must", "shall",
  "fn", "let", "mut", "pub", "use", "mod", "struct", "enum", "trait", "impl", "self",
  "type", "const", "static", "async", "await", "return", "match", "ok", "err", "some",
  "none", "into", "from", "ref", "box", "arc", "rc", "vec", "string", "str",
]);

/** The file under review; chunks are scored for relevance against it. */
export interface FocusFile {
  path: string;
  /** Raw source text; used for token overlap and as the symbol-extraction fallback. */
  content?: string;
  /** When provided, used instead of extracting defined symbols from `content`. */
  defined_symbols?: string[];
  /** When provided, used instead of extracting imported symbols from `content`. */
  imported_symbols?: string[];
}

export interface CandidateChunk {
  source: string; // corpus name or producer file
  doc_id: string; // chunk identifier
  text: string;
  score: number; // upstream cosine score
}

export interface ScoredChunk extends CandidateChunk {
  /** Heuristic score produced by scoreRelevance (unclamped sum of signals). */
  relevance: number;
  /** Labels of the signals that fired, in evaluation order. */
  reasons: string[];
}

export interface FilterResult {
  kept: ScoredChunk[];
  dropped: ScoredChunk[];
  threshold: number;
  focus_path: string;
  total_in: number;
}

/**
 * Lowercase `text` and return its distinct identifier-like tokens
 * (a letter or underscore followed by at least two more word characters),
 * excluding STOPWORDS. Empty input yields an empty set.
 */
export function tokenize(text: string): Set<string> {
  const out = new Set<string>();
  if (!text) return out;
  const words = text.toLowerCase().match(/[a-z_][a-z0-9_]{2,}/g) ?? [];
  for (const w of words) {
    if (!STOPWORDS.has(w)) out.add(w);
  }
  return out;
}

/**
 * Jaccard similarity |a ∩ b| / |a ∪ b| of two token sets.
 * Returns 0 when either set is empty.
 */
export function jaccard(a: ReadonlySet<string>, b: ReadonlySet<string>): number {
  if (a.size === 0 || b.size === 0) return 0;
  let inter = 0;
  for (const x of a) {
    if (b.has(x)) inter++;
  }
  const union = a.size + b.size - inter;
  return union === 0 ? 0 : inter / union;
}

/** Collect the non-empty capture `group` from every match of global regex `re` in `content`. */
function collectMatches(content: string, re: RegExp, group: number): string[] {
  const out: string[] = [];
  for (const m of content.matchAll(re)) {
    const captured = m[group];
    if (captured) out.push(captured);
  }
  return out;
}

// [pattern, capture group] pairs for public/exported definitions.
// Hoisted so the regexes are built once; matchAll works on a clone of the
// regex, so sharing these global-flag instances between calls is safe.
const DEFINED_SYMBOL_PATTERNS: ReadonlyArray<readonly [RegExp, number]> = [
  [/\bpub\s+(?:async\s+)?fn\s+([a-z_][a-z0-9_]*)/gi, 1],
  [/\bpub\s+struct\s+([A-Z][A-Za-z0-9_]*)/g, 1],
  [/\bpub\s+enum\s+([A-Z][A-Za-z0-9_]*)/g, 1],
  [/\bpub\s+trait\s+([A-Z][A-Za-z0-9_]*)/g, 1],
  [/\bpub\s+const\s+([A-Z_][A-Z0-9_]*)/g, 1],
  [/\bpub\s+type\s+([A-Z][A-Za-z0-9_]*)/g, 1],
  [/\bexport\s+(?:async\s+)?function\s+([a-z_][a-zA-Z0-9_]*)/g, 1],
  [/\bexport\s+class\s+([A-Z][A-Za-z0-9_]*)/g, 1],
  [/\bexport\s+interface\s+([A-Z][A-Za-z0-9_]*)/g, 1],
  [/\bexport\s+(?:const|let|var)\s+([a-zA-Z_][a-zA-Z0-9_]*)/g, 1],
];

/**
 * Extract pub-symbol names from Rust/TS source. Conservative — we'd
 * rather miss a symbol than over-match on something unrelated.
 *
 * @returns Distinct names in first-seen order (patterns are applied in
 *          table order, so Rust-style matches come before TS-style ones).
 */
export function extractDefinedSymbols(content: string): string[] {
  if (!content) return [];
  const out = new Set<string>();
  for (const [re, group] of DEFINED_SYMBOL_PATTERNS) {
    for (const sym of collectMatches(content, re, group)) out.add(sym);
  }
  return [...out];
}

// Rust: use foo::bar::Baz, use foo::{Bar, Baz}, use foo::bar as alias.
// Character class must include uppercase or paths like
// `use catalogd::Registry;` get skipped because the regex backs off
// when it can't extend the captured block past the uppercase letter.
const RUST_USE_RE = /\buse\s+([A-Za-z_][A-Za-z0-9_:{}, \n]*?);/g;

// TS: import { X, Y } from "foo"; import X from "foo";
const TS_IMPORT_RE = /\bimport\s+(?:\{([^}]+)\}|([A-Za-z_][A-Za-z0-9_]*))\s+from/g;

const IDENTIFIER_RE = /[A-Za-z_][A-Za-z0-9_]*/g;

// Path keywords that can appear inside a Rust `use` block but are not symbols.
const RUST_USE_IGNORED = new Set<string>(["use", "as", "crate", "super", "self", "mod"]);

// The TS branch only ever excluded the `as` keyword.
const TS_IMPORT_IGNORED = new Set<string>(["as"]);

/** Add every identifier in `block` longer than 2 chars and not in `ignored` to `out`. */
function addIdentifiers(block: string, ignored: ReadonlySet<string>, out: Set<string>): void {
  for (const ident of block.matchAll(IDENTIFIER_RE)) {
    const tok = ident[0];
    if (tok.length > 2 && !ignored.has(tok)) out.add(tok);
  }
}

/**
 * Extract imported symbol names from Rust/TS source. Used as the
 * negative signal — chunks about THESE belong to other files.
 *
 * Every identifier inside a matched `use ...;` block counts (including
 * intermediate path segments such as the crate name), as does every
 * identifier inside a TS `import { ... } from` / `import X from` clause.
 *
 * @returns Distinct names in first-seen order, Rust matches first.
 */
export function extractImportedSymbols(content: string): string[] {
  if (!content) return [];
  const out = new Set<string>();
  for (const block of collectMatches(content, RUST_USE_RE, 1)) {
    addIdentifiers(block, RUST_USE_IGNORED, out);
  }
  for (const m of content.matchAll(TS_IMPORT_RE)) {
    addIdentifiers(m[1] || m[2] || "", TS_IMPORT_IGNORED, out);
  }
  return [...out];
}

/**
 * First-2-segment prefix used to compare paths cheaply. Mirrors the
 * pathway_memory file_prefix() so the same "same crate" notion applies.
 */
export function filePrefix(path: string): string {
  return path.split("/").slice(0, 2).join("/");
}

/** Per-focus values that do not depend on the chunk being scored. */
interface PreparedFocus {
  path: string;
  /** Last "/"-separated segment of `path`. */
  base: string;
  /** filePrefix(path), or "" when the path is empty. */
  prefix: string;
  defined: string[];
  imported: string[];
  /** Token set of focus.content, or null when there is no content. */
  tokens: Set<string> | null;
}

/** Derive everything scoring needs from the focus file, once. */
function prepareFocus(focus: FocusFile): PreparedFocus {
  const path = focus.path ?? "";
  const content = focus.content;
  return {
    path,
    base: path.split("/").pop() ?? "",
    prefix: path ? filePrefix(path) : "",
    // Caller-supplied symbol lists win; otherwise fall back to extraction.
    defined: focus.defined_symbols ?? (content ? extractDefinedSymbols(content) : []),
    imported: focus.imported_symbols ?? (content ? extractImportedSymbols(content) : []),
    tokens: content ? tokenize(content) : null,
  };
}

/** Count symbols longer than 2 chars that occur in `text` (plain substring test). */
function countMentions(symbols: readonly string[], text: string): number {
  let hits = 0;
  for (const s of symbols) {
    if (s.length > 2 && text.includes(s)) hits++;
  }
  return hits;
}

/** Score one chunk against an already-prepared focus. See scoreRelevance. */
function scorePrepared(
  prepared: PreparedFocus,
  chunk: CandidateChunk,
): { score: number; reasons: string[] } {
  const reasons: string[] = [];
  let score = 0;

  const chunkText = chunk.text ?? "";
  const chunkSource = chunk.source ?? "";
  const chunkDocId = chunk.doc_id ?? "";
  const anywhere = (needle: string): boolean =>
    chunkSource.includes(needle) || chunkDocId.includes(needle) || chunkText.includes(needle);

  // path_match: chunk's provenance encodes the focus path or filename.
  if (prepared.path && anywhere(prepared.path)) {
    score += 1.0;
    reasons.push("path_match");
  } else if (
    prepared.base &&
    prepared.base.length > 4 &&
    // The filename fallback deliberately looks at text and doc_id only.
    (chunkText.includes(prepared.base) || chunkDocId.includes(prepared.base))
  ) {
    score += 0.6;
    reasons.push("filename_match");
  }

  // defined_match: chunk text mentions symbols this file actually defines.
  // Counted once here and reused by the import penalty below.
  const { defined, imported } = prepared;
  const definedHits = countMentions(defined, chunkText);
  if (defined.length > 0 && definedHits > 0) {
    const ratio = Math.min(1, definedHits / Math.max(1, defined.length));
    score += 0.6 * ratio;
    reasons.push(`defined_match(${definedHits}/${defined.length})`);
  }

  // token_overlap: jaccard of non-stopword tokens
  if (prepared.tokens) {
    const overlap = jaccard(prepared.tokens, tokenize(chunkText));
    if (overlap > 0.05) {
      score += 0.4 * overlap;
      reasons.push(`token_overlap(${overlap.toFixed(2)})`);
    }
  }

  // prefix_match: same first-2-segments (e.g. crates/queryd)
  if (prepared.prefix && anywhere(prepared.prefix)) {
    score += 0.3;
    reasons.push("prefix_match");
  }

  // import_penalty: chunk mentions only symbols this file imports, never
  // any it defines. Strong signal of adjacency pollution.
  if (imported.length > 0 && defined.length > 0) {
    const importHits = countMentions(imported, chunkText);
    if (importHits > 0 && definedHits === 0) {
      score -= 0.5;
      reasons.push(`import_only(${importHits})`);
    }
  }

  return { score, reasons };
}

/**
 * Score how likely `chunk` is to be about `focus` itself rather than
 * about code it merely imports.
 *
 * @param focus File under review. `defined_symbols` / `imported_symbols`
 *              are used when present, otherwise extracted from `content`.
 * @param chunk Candidate chunk; its upstream `score` is not consulted.
 * @returns The unclamped sum of the signals listed in the file header and
 *          the labels of the signals that fired, in evaluation order.
 */
export function scoreRelevance(
  focus: FocusFile,
  chunk: CandidateChunk,
): { score: number; reasons: string[] } {
  return scorePrepared(prepareFocus(focus), chunk);
}

/**
 * Score every chunk against `focus` and partition by `threshold`.
 *
 * Focus-derived data (symbol lists, token set, path prefix) is computed
 * once per call instead of once per chunk.
 *
 * @param threshold Minimum relevance (inclusive) for a chunk to be kept.
 * @returns Kept and dropped chunks (input order preserved within each),
 *          plus the threshold, focus path and input count for reporting.
 */
export function filterChunks(
  focus: FocusFile,
  chunks: CandidateChunk[],
  threshold = 0.3,
): FilterResult {
  const prepared = prepareFocus(focus);
  const scored: ScoredChunk[] = chunks.map((c) => {
    const { score, reasons } = scorePrepared(prepared, c);
    return { ...c, relevance: score, reasons };
  });
  const kept = scored.filter((c) => c.relevance >= threshold);
  const dropped = scored.filter((c) => c.relevance < threshold);
  return {
    kept,
    dropped,
    threshold,
    focus_path: focus.path,
    total_in: chunks.length,
  };
}