lakehouse/mcp-server/relevance.ts
root 0115a60072
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
observer: add /relevance heuristic filter for adjacency pollution
Matrix retrieval often surfaces high-cosine chunks that are about
symbols the focus file IMPORTS but doesn't define. The reviewer LLM
then hallucinates those imported-crate internals as in-file content
("I see main.rs does X" when X lives in queryd::context).

mcp-server/relevance.ts — pure scorer with five signals:
  path_match      +1.0  chunk source/doc_id encodes focus path
  defined_match   +0.6  chunk text mentions focus.defined_symbols
  token_overlap   +0.4  jaccard of non-stopword tokens
  prefix_match    +0.3  shared first-2-segment prefix
  import_only    -0.5  mentions only imported symbols (pollution)

Default threshold 0.3 — tuned empirically on the gateway/main.rs case.

Also fixes a regex bug in the import extractor: the character class
was lowercase-only, so `use catalogd::Registry;` silently never
matched (regex backed off when it hit the uppercase R). Caught by
the test suite.

observer.ts — POST /relevance endpoint wraps filterChunks().
scrum_master_pipeline.ts — fetchMatrixContext gains optional
focusContent param; calls /relevance after collecting allHits and
before sort+top. Opt-out via LH_RELEVANCE_FILTER=0; threshold via
LH_RELEVANCE_THRESHOLD. Fall-open on observer failure.

9 unit tests, all green. Live probe on real shape correctly drops
a 0.7-cosine adjacency-pollution chunk while keeping in-focus hits.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-25 23:51:45 -05:00

247 lines
8.5 KiB
TypeScript

/**
* Heuristic relevance filter for matrix-retrieved chunks.
*
* Drops "adjacency pollution" — chunks that scored well on cosine but
* are actually about code the focus file IMPORTS, not the focus file
* itself. Without this, the reviewer LLM hallucinates imported-crate
* internals as belonging to the focus file ("I see main.rs does X"
* when X is in queryd::context that main.rs only calls through).
*
* Pure functions here; HTTP wiring lives in observer.ts.
*
* Scoring signals (additive; the import penalty can drive the total negative):
* path_match +1.0 chunk.source/doc_id encodes focus.path
* defined_match +0.6 chunk text mentions focus.defined_symbols
* token_overlap +0.4 jaccard of non-stopword tokens
* prefix_match +0.3 chunk source shares first-2-segment prefix
* import_penalty -0.5 mentions ONLY imported symbols, no defined ones
*
* Threshold default 0.3 — empirically tuned to keep direct hits and drop
* the obvious adjacency cases. Caller can override per-request.
*/
// Tokens excluded from token_overlap scoring: common English function words
// plus Rust/TS keywords and ubiquitous type names ("vec", "string", ...)
// that appear in nearly every chunk and would otherwise inflate the
// jaccard similarity between unrelated code.
const STOPWORDS = new Set([
"the","a","an","and","or","but","if","then","else","is","are","was","were",
"be","been","being","of","in","on","at","to","for","with","by","from","as",
"that","this","these","those","it","its","they","them","their","we","our",
"you","your","i","me","my","not","no","so","do","does","did","done",
"will","would","could","should","can","may","might","must","shall",
"fn","let","mut","pub","use","mod","struct","enum","trait","impl","self",
"type","const","static","async","await","return","match","ok","err","some",
"none","into","from","ref","box","arc","rc","vec","string","str",
]);
/** The file under review that retrieved chunks are scored against. */
export interface FocusFile {
path: string;
content?: string;
/** Symbols this file defines; derived from `content` when omitted. */
defined_symbols?: string[];
/** Symbols this file imports; derived from `content` when omitted. */
imported_symbols?: string[];
}
/** A matrix-retrieval hit, before relevance scoring. */
export interface CandidateChunk {
source: string; // corpus name or producer file
doc_id: string; // chunk identifier
text: string;
score: number; // upstream cosine score
}
/** A candidate annotated with its heuristic relevance score. */
export interface ScoredChunk extends CandidateChunk {
relevance: number;
/** Labels of the signals that fired, e.g. "path_match". */
reasons: string[];
}
/** Result of filterChunks: threshold partition plus audit metadata. */
export interface FilterResult {
kept: ScoredChunk[];
dropped: ScoredChunk[];
threshold: number;
focus_path: string;
total_in: number;
}
/**
 * Lowercase `text`, pull out identifier-like tokens (letter or underscore
 * start, length >= 3), and return the set of those not in STOPWORDS.
 * Empty or missing input yields an empty set.
 */
export function tokenize(text: string): Set<string> {
  const tokens = new Set<string>();
  if (!text) return tokens;
  const matched = text.toLowerCase().match(/[a-z_][a-z0-9_]{2,}/g);
  for (const word of matched ?? []) {
    if (STOPWORDS.has(word)) continue;
    tokens.add(word);
  }
  return tokens;
}
/**
 * Jaccard similarity |a ∩ b| / |a ∪ b| of two token sets.
 * Returns 0 when either set is empty.
 */
export function jaccard(a: Set<string>, b: Set<string>): number {
  if (a.size === 0 || b.size === 0) return 0;
  // Scan the smaller set for intersection membership.
  const [small, large] = a.size <= b.size ? [a, b] : [b, a];
  let shared = 0;
  small.forEach((tok) => {
    if (large.has(tok)) shared += 1;
  });
  const unionSize = a.size + b.size - shared;
  return unionSize === 0 ? 0 : shared / unionSize;
}
/** Collect capture group `group` from every match of `re` (must be /g) in `content`. */
function collectMatches(content: string, re: RegExp, group: number): string[] {
  const results: string[] = [];
  for (const match of content.matchAll(re)) {
    const captured = match[group];
    if (captured) results.push(captured);
  }
  return results;
}
/**
 * Extract pub/exported symbol names from Rust or TypeScript source.
 * Conservative by design — we'd rather miss a symbol than over-match
 * on something unrelated.
 */
export function extractDefinedSymbols(content: string): string[] {
  if (!content) return [];
  const found = new Set<string>();
  // [pattern, capture-group index] pairs covering Rust `pub` items and
  // TypeScript `export` declarations.
  const patterns: Array<[RegExp, number]> = [
    [/\bpub\s+(?:async\s+)?fn\s+([a-z_][a-z0-9_]*)/gi, 1],
    [/\bpub\s+struct\s+([A-Z][A-Za-z0-9_]*)/g, 1],
    [/\bpub\s+enum\s+([A-Z][A-Za-z0-9_]*)/g, 1],
    [/\bpub\s+trait\s+([A-Z][A-Za-z0-9_]*)/g, 1],
    [/\bpub\s+const\s+([A-Z_][A-Z0-9_]*)/g, 1],
    [/\bpub\s+type\s+([A-Z][A-Za-z0-9_]*)/g, 1],
    [/\bexport\s+(?:async\s+)?function\s+([a-z_][a-zA-Z0-9_]*)/g, 1],
    [/\bexport\s+class\s+([A-Z][A-Za-z0-9_]*)/g, 1],
    [/\bexport\s+interface\s+([A-Z][A-Za-z0-9_]*)/g, 1],
    [/\bexport\s+(?:const|let|var)\s+([a-zA-Z_][a-zA-Z0-9_]*)/g, 1],
  ];
  for (const [re, group] of patterns) {
    for (const match of content.matchAll(re)) {
      const captured = match[group];
      if (captured) found.add(captured);
    }
  }
  return [...found];
}
/**
 * Extract imported symbol names from Rust or TypeScript source. Used as
 * the negative signal — chunks about THESE symbols belong to other files.
 */
export function extractImportedSymbols(content: string): string[] {
  if (!content) return [];
  const found = new Set<string>();
  const ignore = new Set(["use", "as", "crate", "super", "self", "mod"]);
  // Rust: `use foo::bar::Baz;`, `use foo::{Bar, Baz};`, `use foo::bar as alias;`.
  // The captured character class must allow uppercase letters, otherwise
  // paths like `use catalogd::Registry;` never match — the lazy quantifier
  // backs off when it cannot extend the block past the uppercase letter.
  const useRe = /\buse\s+([A-Za-z_][A-Za-z0-9_:{}, \n]*?);/g;
  for (const match of content.matchAll(useRe)) {
    const body = match[1];
    if (!body) continue;
    for (const ident of body.matchAll(/[A-Za-z_][A-Za-z0-9_]*/g)) {
      const tok = ident[0];
      if (tok.length > 2 && !ignore.has(tok)) found.add(tok);
    }
  }
  // TS: `import { X, Y } from "foo";` and `import X from "foo";`.
  const tsRe = /\bimport\s+(?:\{([^}]+)\}|([A-Za-z_][A-Za-z0-9_]*))\s+from/g;
  for (const match of content.matchAll(tsRe)) {
    const names = match[1] || match[2] || "";
    for (const ident of names.matchAll(/[A-Za-z_][A-Za-z0-9_]*/g)) {
      const tok = ident[0];
      if (tok.length > 2 && tok !== "as") found.add(tok);
    }
  }
  return [...found];
}
/**
 * First-2-segment prefix used to compare paths cheaply. Mirrors the
 * pathway_memory file_prefix() so the same "same crate" notion applies.
 * Paths with fewer than two segments are returned whole.
 */
export function filePrefix(path: string): string {
  const segments = path.split("/");
  return segments.slice(0, 2).join("/");
}
/**
 * Score one candidate chunk against the focus file. Signals are additive;
 * the import-only penalty can drive the total negative. Every signal that
 * fires is recorded in `reasons` for observability downstream.
 *
 * When `defined_symbols`/`imported_symbols` are absent they are derived
 * from `focus.content` (when present) via the extractors above.
 */
export function scoreRelevance(
  focus: FocusFile,
  chunk: CandidateChunk,
): { score: number; reasons: string[] } {
  const reasons: string[] = [];
  let total = 0;
  const path = focus.path ?? "";
  const basename = path.split("/").pop() ?? "";
  const text = chunk.text ?? "";
  const source = chunk.source ?? "";
  const docId = chunk.doc_id ?? "";

  // Count how many symbols (longer than 2 chars) appear verbatim in the chunk.
  const countHits = (symbols: string[]): number => {
    let n = 0;
    for (const sym of symbols) {
      if (sym.length > 2 && text.includes(sym)) n += 1;
    }
    return n;
  };

  // path_match: the chunk's provenance (or text) carries the focus path;
  // weaker filename_match fallback ignores short, collision-prone names.
  if (path && (source.includes(path) || docId.includes(path) || text.includes(path))) {
    total += 1.0;
    reasons.push("path_match");
  } else if (basename && basename.length > 4 && (text.includes(basename) || docId.includes(basename))) {
    total += 0.6;
    reasons.push("filename_match");
  }

  // defined_match: chunk mentions symbols this file actually defines,
  // scaled by the fraction of defined symbols hit (capped at 1).
  const defined = focus.defined_symbols ?? (focus.content ? extractDefinedSymbols(focus.content) : []);
  if (defined.length > 0) {
    const hits = countHits(defined);
    if (hits > 0) {
      total += 0.6 * Math.min(1, hits / Math.max(1, defined.length));
      reasons.push(`defined_match(${hits}/${defined.length})`);
    }
  }

  // token_overlap: jaccard of non-stopword tokens; overlaps at or below
  // the 0.05 noise floor contribute nothing.
  if (focus.content) {
    const overlap = jaccard(tokenize(focus.content), tokenize(text));
    if (overlap > 0.05) {
      total += 0.4 * overlap;
      reasons.push(`token_overlap(${overlap.toFixed(2)})`);
    }
  }

  // prefix_match: chunk shares the focus file's first-2-segment prefix
  // (e.g. crates/queryd) — same-crate adjacency.
  if (path) {
    const prefix = filePrefix(path);
    if (prefix && (source.includes(prefix) || docId.includes(prefix) || text.includes(prefix))) {
      total += 0.3;
      reasons.push("prefix_match");
    }
  }

  // import_penalty: chunk mentions ONLY symbols this file imports and none
  // it defines — strong adjacency-pollution signal. Only applied when both
  // symbol lists are non-empty, so the penalty cannot fire on unknowns.
  const imported = focus.imported_symbols ?? (focus.content ? extractImportedSymbols(focus.content) : []);
  if (imported.length > 0 && defined.length > 0) {
    const importHits = countHits(imported);
    const definedHits = countHits(defined);
    if (importHits > 0 && definedHits === 0) {
      total -= 0.5;
      reasons.push(`import_only(${importHits})`);
    }
  }

  return { score: total, reasons };
}
/**
 * Score every candidate against the focus file and partition by threshold.
 * Chunks with relevance >= threshold are kept (input order preserved);
 * the rest are returned as `dropped` so callers can log what was removed.
 */
export function filterChunks(
  focus: FocusFile,
  chunks: CandidateChunk[],
  threshold = 0.3,
): FilterResult {
  const kept: ScoredChunk[] = [];
  const dropped: ScoredChunk[] = [];
  for (const chunk of chunks) {
    const { score, reasons } = scoreRelevance(focus, chunk);
    const scored: ScoredChunk = { ...chunk, relevance: score, reasons };
    (scored.relevance >= threshold ? kept : dropped).push(scored);
  }
  return {
    kept,
    dropped,
    threshold,
    focus_path: focus.path,
    total_in: chunks.length,
  };
}