observer: add /relevance heuristic filter for adjacency pollution
Some checks failed
lakehouse/auditor 1 blocking issue: todo!() macro call in tests/real-world/scrum_master_pipeline.ts
Matrix retrieval often surfaces high-cosine chunks that are about
symbols the focus file IMPORTS but doesn't define. The reviewer LLM
then hallucinates those imported-crate internals as in-file content
("I see main.rs does X" when X lives in queryd::context).
mcp-server/relevance.ts — pure scorer with five signals:
path_match +1.0 chunk source/doc_id encodes focus path
defined_match +0.6 chunk text mentions focus.defined_symbols
token_overlap +0.4 jaccard of non-stopword tokens
prefix_match +0.3 shared first-2-segment prefix
import_only -0.5 mentions only imported symbols (pollution)
Default threshold 0.3 — tuned empirically on the gateway/main.rs case.
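For intuition, the two extremes work out like this (illustrative arithmetic
only; token_overlap shown at a typical jaccard of ~0.1):

    in-focus chunk:   path_match(+1.0)                             =  1.00  >= 0.3 → kept
    pollution chunk:  token_overlap(+0.4×0.1) + import_only(−0.5)  = −0.46  <  0.3 → dropped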
Also fixes a regex bug in the import extractor: the character class
was lowercase-only, so `use catalogd::Registry;` silently never
matched — the lazy capture couldn't extend past the uppercase R,
leaving no way to reach the closing `;`, so the whole match failed.
Caught by the test suite.
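A minimal repro of that bug — the pre-fix pattern isn't shown in this diff,
so the buggy variant below is a reconstruction (assumption); the fixed one is
the pattern shipped in extractImportedSymbols():

// Reconstructed buggy pattern (lowercase-only class — assumption, not in this diff):
const buggy = /\buse\s+([a-z_][a-z0-9_:{}, \n]*?);/g;
console.log("use catalogd::Registry;".match(buggy)); // null — lazy capture stalls at 'R'

// Fixed pattern, as shipped:
const fixed = /\buse\s+([A-Za-z_][A-Za-z0-9_:{}, \n]*?);/g;
console.log("use catalogd::Registry;".match(fixed)); // ["use catalogd::Registry;"]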
observer.ts — POST /relevance endpoint wraps filterChunks().
scrum_master_pipeline.ts — fetchMatrixContext gains an optional
focusContent param and calls /relevance after collecting allHits,
before sort+top. Opt out via LH_RELEVANCE_FILTER=0; tune the cut via
LH_RELEVANCE_THRESHOLD. Fails open on observer failure.
9 unit tests, all green. A live probe against the real data shape
correctly drops a 0.7-cosine adjacency-pollution chunk while keeping
in-focus hits.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
parent 54689d523c · commit 0115a60072
observer.ts

@@ -17,6 +17,8 @@
  * agents do and helps them not repeat mistakes.
  */

+import { filterChunks } from "./relevance";
+
 const GATEWAY = process.env.GATEWAY_URL || "http://localhost:3700";
 const LAKEHOUSE = process.env.LAKEHOUSE_URL || "http://localhost:3100";
 const CYCLE_SECS = parseInt(process.env.OBSERVER_CYCLE || "30");

@@ -578,6 +580,24 @@ function startHttpListener() {
       }).catch((e: Error) =>
         new Response(JSON.stringify({ error: e.message }), { status: 400 }));
     }
+    // ─── Relevance filter (2026-04-25) ───
+    // Drops "adjacency pollution" from matrix-retrieved chunks before
+    // they reach the reviewer LLM. Caller (scrum/agent) posts the
+    // focus file + candidate chunks; observer scores via heuristic
+    // (path/symbol/token signals) and returns kept + dropped lists.
+    // Pure function — no I/O, safe to call hot.
+    if (req.method === "POST" && url.pathname === "/relevance") {
+      return req.json().then((body: any) => {
+        const focus = body.focus_file ?? body.focus ?? {};
+        const chunks = body.chunks ?? [];
+        const threshold = typeof body.threshold === "number" ? body.threshold : 0.3;
+        const result = filterChunks(focus, chunks, threshold);
+        return new Response(JSON.stringify(result), {
+          headers: { "content-type": "application/json" },
+        });
+      }).catch((e: Error) =>
+        new Response(JSON.stringify({ error: e.message }), { status: 400 }));
+    }
     return new Response("not found", { status: 404 });
   },
 });
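A smoke test of the new endpoint from TypeScript — request fields as parsed
above, response shape mirroring FilterResult from relevance.ts; OBSERVER_URL
is deployment-specific, so this is a sketch, not a fixture:

const res = await fetch(`${process.env.OBSERVER_URL}/relevance`, {
  method: "POST",
  headers: { "content-type": "application/json" },
  body: JSON.stringify({
    focus_file: { path: "crates/gateway/src/main.rs", content: "pub fn handle_query() {}" },
    chunks: [{ source: "corpus_x", doc_id: "fact_1", text: "handle_query must not block", score: 0.5 }],
    threshold: 0.3,
  }),
});
const { kept, dropped, total_in } = await res.json() as any;
console.log(`kept ${kept.length}, dropped ${dropped.length} of ${total_in}`);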
mcp-server/relevance.test.ts (new file, 129 lines)

@@ -0,0 +1,129 @@
import { test, expect } from "bun:test";
import {
  scoreRelevance,
  filterChunks,
  extractDefinedSymbols,
  extractImportedSymbols,
  jaccard,
  tokenize,
} from "./relevance";

const RUST_FOCUS = `
use queryd::context::build_context;
use catalogd::Registry;
use shared::types::{Tombstone, ModelProfile};

pub struct GatewayState {
    catalog: Registry,
}

pub async fn handle_query(state: &GatewayState, sql: &str) -> Result<QueryResponse, Error> {
    let ctx = build_context(&state.catalog).await?;
    ctx.sql(sql).await.map(QueryResponse::from)
}

pub fn shutdown(state: GatewayState) {
    drop(state);
}
`;

test("extractDefinedSymbols pulls pub fn / struct names", () => {
  const syms = extractDefinedSymbols(RUST_FOCUS);
  expect(syms).toContain("handle_query");
  expect(syms).toContain("shutdown");
  expect(syms).toContain("GatewayState");
});

test("extractImportedSymbols pulls names from use statements", () => {
  const syms = extractImportedSymbols(RUST_FOCUS);
  expect(syms).toContain("build_context");
  expect(syms).toContain("Registry");
  expect(syms).toContain("Tombstone");
  expect(syms).toContain("ModelProfile");
  // Should not include keywords
  expect(syms).not.toContain("use");
  expect(syms).not.toContain("crate");
});

test("path_match dominates when chunk encodes focus path", () => {
  const focus = { path: "crates/gateway/src/main.rs", content: RUST_FOCUS };
  const chunk = {
    source: "distilled_factual_v20260423095819",
    doc_id: "crates/gateway/src/main.rs:42",
    text: "Some chunk content unrelated to anything",
    score: 0.5,
  };
  const { score, reasons } = scoreRelevance(focus, chunk);
  expect(score).toBeGreaterThanOrEqual(1.0);
  expect(reasons).toContain("path_match");
});

test("import_only adjacency pollution gets penalized", () => {
  // Chunk talks about queryd::context::build_context (imported by focus)
  // but never mentions any focus-defined symbol — classic pollution.
  const focus = { path: "crates/gateway/src/main.rs", content: RUST_FOCUS };
  const chunk = {
    source: "distilled_procedural_v20260423102847",
    doc_id: "proc_8421",
    text: "When build_context fails the Registry must be invalidated. The Tombstone fields drive the merge-on-read filter — caller should not retry on stale fingerprints.",
    score: 0.65,
  };
  const { score, reasons } = scoreRelevance(focus, chunk);
  expect(reasons.some(r => r.startsWith("import_only("))).toBe(true);
  expect(score).toBeLessThan(0.3); // below default threshold → dropped
});

test("defined_match keeps a chunk that's actually about the focus", () => {
  const focus = { path: "crates/gateway/src/main.rs", content: RUST_FOCUS };
  const chunk = {
    source: "distilled_factual_v20260423095819",
    doc_id: "fact_12",
    text: "handle_query in GatewayState must return QueryResponse, not anyhow::Error. The shutdown path drops state synchronously.",
    score: 0.4,
  };
  const { score, reasons } = scoreRelevance(focus, chunk);
  expect(reasons.some(r => r.startsWith("defined_match"))).toBe(true);
  expect(score).toBeGreaterThan(0.3); // above threshold → kept
});

test("filterChunks bucket-sorts kept vs dropped", () => {
  const focus = { path: "crates/gateway/src/main.rs", content: RUST_FOCUS };
  const chunks = [
    { source: "x", doc_id: "crates/gateway/src/main.rs:1", text: "anything", score: 0.5 }, // path_match — kept
    { source: "x", doc_id: "y", text: "build_context Tombstone Registry adjacent only", score: 0.7 }, // import_only — dropped
    { source: "x", doc_id: "z", text: "handle_query and GatewayState are at fault here", score: 0.4 }, // defined_match — kept
    { source: "x", doc_id: "w", text: "completely unrelated content about chicago permits", score: 0.6 }, // nothing — dropped
  ];
  const result = filterChunks(focus, chunks);
  expect(result.kept.length).toBe(2);
  expect(result.dropped.length).toBe(2);
  expect(result.kept.map(c => c.doc_id)).toContain("crates/gateway/src/main.rs:1");
  expect(result.kept.map(c => c.doc_id)).toContain("z");
});

test("threshold override changes filter behavior", () => {
  const focus = { path: "crates/queryd/src/x.rs", content: "pub fn foo() {}" };
  const weak = { source: "x", doc_id: "y", text: "foo is referenced here briefly", score: 0.2 };
  const result_strict = filterChunks(focus, [weak], 0.95);
  const result_loose = filterChunks(focus, [weak], 0.1);
  expect(result_strict.kept.length).toBe(0);
  expect(result_loose.kept.length).toBe(1);
});

test("empty defined/imported gracefully scores by tokens only", () => {
  const focus = { path: "doc.md", content: "This is plain prose about welders in Chicago." };
  const chunk = { source: "x", doc_id: "y", text: "Welders working in Chicago need OSHA certs.", score: 0.5 };
  const { score, reasons } = scoreRelevance(focus, chunk);
  expect(score).toBeGreaterThan(0);
  expect(reasons.some(r => r.startsWith("token_overlap"))).toBe(true);
});

test("jaccard / tokenize basic sanity", () => {
  const a = tokenize("the quick brown fox jumps over the lazy dog");
  const b = tokenize("a fast brown wolf runs over a tired dog");
  expect(a.has("the")).toBe(false); // stopword
  expect(a.has("brown")).toBe(true);
  const j = jaccard(a, b);
  expect(j).toBeGreaterThan(0);
  expect(j).toBeLessThan(1);
});
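The green run reproduces locally with Bun's built-in runner:

bun test mcp-server/relevance.test.ts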
mcp-server/relevance.ts (new file, 246 lines)

@@ -0,0 +1,246 @@
/**
 * Heuristic relevance filter for matrix-retrieved chunks.
 *
 * Drops "adjacency pollution" — chunks that scored well on cosine but
 * are actually about code the focus file IMPORTS, not the focus file
 * itself. Without this, the reviewer LLM hallucinates imported-crate
 * internals as belonging to the focus file ("I see main.rs does X"
 * when X is in queryd::context that main.rs only calls through).
 *
 * Pure functions here; HTTP wiring lives in observer.ts.
 *
 * Scoring signals (weighted, additive):
 *   path_match     +1.0  chunk.source/doc_id encodes focus.path
 *   defined_match  +0.6  chunk text mentions focus.defined_symbols
 *   token_overlap  +0.4  jaccard of non-stopword tokens
 *   prefix_match   +0.3  chunk source shares first-2-segment prefix
 *   import_penalty -0.5  mentions ONLY imported symbols, no defined ones
 *
 * Threshold default 0.3 — empirically tuned to keep direct hits and drop
 * the obvious adjacency cases. Caller can override per-request.
 */

const STOPWORDS = new Set([
  "the","a","an","and","or","but","if","then","else","is","are","was","were",
  "be","been","being","of","in","on","at","to","for","with","by","from","as",
  "that","this","these","those","it","its","they","them","their","we","our",
  "you","your","i","me","my","not","no","so","do","does","did","done",
  "will","would","could","should","can","may","might","must","shall",
  "fn","let","mut","pub","use","mod","struct","enum","trait","impl","self",
  "type","const","static","async","await","return","match","ok","err","some",
  "none","into","from","ref","box","arc","rc","vec","string","str",
]);

export interface FocusFile {
  path: string;
  content?: string;
  defined_symbols?: string[];
  imported_symbols?: string[];
}

export interface CandidateChunk {
  source: string; // corpus name or producer file
  doc_id: string; // chunk identifier
  text: string;
  score: number;  // upstream cosine score
}

export interface ScoredChunk extends CandidateChunk {
  relevance: number;
  reasons: string[];
}

export interface FilterResult {
  kept: ScoredChunk[];
  dropped: ScoredChunk[];
  threshold: number;
  focus_path: string;
  total_in: number;
}

export function tokenize(text: string): Set<string> {
  const out = new Set<string>();
  if (!text) return out;
  const words = text.toLowerCase().match(/[a-z_][a-z0-9_]{2,}/g) ?? [];
  for (const w of words) {
    if (!STOPWORDS.has(w)) out.add(w);
  }
  return out;
}

export function jaccard(a: Set<string>, b: Set<string>): number {
  if (a.size === 0 || b.size === 0) return 0;
  let inter = 0;
  for (const x of a) if (b.has(x)) inter++;
  const union = a.size + b.size - inter;
  return union === 0 ? 0 : inter / union;
}

function collectMatches(content: string, re: RegExp, group: number): string[] {
  const out: string[] = [];
  for (const m of content.matchAll(re)) {
    if (m[group]) out.push(m[group]);
  }
  return out;
}

/**
 * Extract pub-symbol names from Rust/TS source. Conservative — we'd
 * rather miss a symbol than over-match on something unrelated.
 */
export function extractDefinedSymbols(content: string): string[] {
  if (!content) return [];
  const out = new Set<string>();
  const patterns: Array<[RegExp, number]> = [
    [/\bpub\s+(?:async\s+)?fn\s+([a-z_][a-z0-9_]*)/gi, 1],
    [/\bpub\s+struct\s+([A-Z][A-Za-z0-9_]*)/g, 1],
    [/\bpub\s+enum\s+([A-Z][A-Za-z0-9_]*)/g, 1],
    [/\bpub\s+trait\s+([A-Z][A-Za-z0-9_]*)/g, 1],
    [/\bpub\s+const\s+([A-Z_][A-Z0-9_]*)/g, 1],
    [/\bpub\s+type\s+([A-Z][A-Za-z0-9_]*)/g, 1],
    [/\bexport\s+(?:async\s+)?function\s+([a-z_][a-zA-Z0-9_]*)/g, 1],
    [/\bexport\s+class\s+([A-Z][A-Za-z0-9_]*)/g, 1],
    [/\bexport\s+interface\s+([A-Z][A-Za-z0-9_]*)/g, 1],
    [/\bexport\s+(?:const|let|var)\s+([a-zA-Z_][a-zA-Z0-9_]*)/g, 1],
  ];
  for (const [re, g] of patterns) {
    for (const sym of collectMatches(content, re, g)) out.add(sym);
  }
  return [...out];
}

/**
 * Extract imported symbol names from Rust/TS source. Used as the
 * negative signal — chunks about THESE belong to other files.
 */
export function extractImportedSymbols(content: string): string[] {
  if (!content) return [];
  const out = new Set<string>();
  const ignore = new Set(["use","as","crate","super","self","mod"]);
  // Rust: use foo::bar::Baz, use foo::{Bar, Baz}, use foo::bar as alias.
  // The character class must include uppercase, or paths like
  // `use catalogd::Registry;` get skipped: the lazy capture can't
  // extend past the uppercase letter, so the whole match fails.
  const useRe = /\buse\s+([A-Za-z_][A-Za-z0-9_:{}, \n]*?);/g;
  for (const block of collectMatches(content, useRe, 1)) {
    for (const ident of block.matchAll(/[A-Za-z_][A-Za-z0-9_]*/g)) {
      const tok = ident[0];
      if (tok.length > 2 && !ignore.has(tok)) out.add(tok);
    }
  }
  // TS: import { X, Y } from "foo"; import X from "foo";
  const tsRe = /\bimport\s+(?:\{([^}]+)\}|([A-Za-z_][A-Za-z0-9_]*))\s+from/g;
  for (const m of content.matchAll(tsRe)) {
    const block = m[1] || m[2] || "";
    for (const ident of block.matchAll(/[A-Za-z_][A-Za-z0-9_]*/g)) {
      const tok = ident[0];
      if (tok.length > 2 && tok !== "as") out.add(tok);
    }
  }
  return [...out];
}

/**
 * First-2-segment prefix used to compare paths cheaply. Mirrors the
 * pathway_memory file_prefix() so the same "same crate" notion applies.
 */
export function filePrefix(path: string): string {
  return path.split("/").slice(0, 2).join("/");
}

export function scoreRelevance(
  focus: FocusFile,
  chunk: CandidateChunk,
): { score: number; reasons: string[] } {
  const reasons: string[] = [];
  let score = 0;

  const focusPath = focus.path ?? "";
  const focusBase = focusPath.split("/").pop() ?? "";
  const chunkText = chunk.text ?? "";
  const chunkSource = chunk.source ?? "";
  const chunkDocId = chunk.doc_id ?? "";

  // path_match: chunk's provenance encodes the focus path or filename.
  if (focusPath && (chunkSource.includes(focusPath) || chunkDocId.includes(focusPath) || chunkText.includes(focusPath))) {
    score += 1.0;
    reasons.push("path_match");
  } else if (focusBase && focusBase.length > 4 && (chunkText.includes(focusBase) || chunkDocId.includes(focusBase))) {
    score += 0.6;
    reasons.push("filename_match");
  }

  // defined_match: chunk text mentions symbols this file actually defines
  const defined = focus.defined_symbols ?? (focus.content ? extractDefinedSymbols(focus.content) : []);
  if (defined.length > 0) {
    let hits = 0;
    for (const s of defined) {
      if (s.length > 2 && chunkText.includes(s)) hits++;
    }
    if (hits > 0) {
      const ratio = Math.min(1, hits / Math.max(1, defined.length));
      const contrib = 0.6 * ratio;
      score += contrib;
      reasons.push(`defined_match(${hits}/${defined.length})`);
    }
  }

  // token_overlap: jaccard of non-stopword tokens
  if (focus.content) {
    const overlap = jaccard(tokenize(focus.content), tokenize(chunkText));
    if (overlap > 0.05) {
      const contrib = 0.4 * overlap;
      score += contrib;
      reasons.push(`token_overlap(${overlap.toFixed(2)})`);
    }
  }

  // prefix_match: same first-2-segments (e.g. crates/queryd)
  if (focusPath) {
    const fp = filePrefix(focusPath);
    if (fp && (chunkSource.includes(fp) || chunkDocId.includes(fp) || chunkText.includes(fp))) {
      score += 0.3;
      reasons.push("prefix_match");
    }
  }

  // import_penalty: chunk mentions only symbols this file imports, never
  // any it defines. Strong signal of adjacency pollution.
  const imported = focus.imported_symbols ?? (focus.content ? extractImportedSymbols(focus.content) : []);
  if (imported.length > 0 && defined.length > 0) {
    let importHits = 0;
    let definedHits = 0;
    for (const s of imported) {
      if (s.length > 2 && chunkText.includes(s)) importHits++;
    }
    for (const s of defined) {
      if (s.length > 2 && chunkText.includes(s)) definedHits++;
    }
    if (importHits > 0 && definedHits === 0) {
      score -= 0.5;
      reasons.push(`import_only(${importHits})`);
    }
  }

  return { score, reasons };
}

export function filterChunks(
  focus: FocusFile,
  chunks: CandidateChunk[],
  threshold = 0.3,
): FilterResult {
  const scored: ScoredChunk[] = chunks.map((c) => {
    const { score, reasons } = scoreRelevance(focus, c);
    return { ...c, relevance: score, reasons };
  });
  const kept = scored.filter((c) => c.relevance >= threshold);
  const dropped = scored.filter((c) => c.relevance < threshold);
  return {
    kept,
    dropped,
    threshold,
    focus_path: focus.path,
    total_in: chunks.length,
  };
}
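Since the module is pure (no I/O), it can also be exercised directly,
without the observer hop — a minimal sketch with made-up inputs:

import { filterChunks } from "./relevance";

const result = filterChunks(
  { path: "crates/gateway/src/main.rs", content: "pub fn handle_query() {}" },
  [
    { source: "corpus_x", doc_id: "fact_1", text: "handle_query must not block", score: 0.4 },
    { source: "corpus_x", doc_id: "proc_2", text: "unrelated adjacency noise", score: 0.7 },
  ],
);
for (const c of result.kept) console.log("kept", c.doc_id, c.relevance.toFixed(2), c.reasons);
console.log(`dropped ${result.dropped.length} of ${result.total_in}`);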
scrum_master_pipeline.ts

@@ -499,6 +499,7 @@ async function fetchMatrixContext(
   filePath: string,
   topPerCorpus = 3,
   topOverall = 8,
+  focusContent?: string,
 ): Promise<MatrixContext> {
   const t0 = Date.now();
   const corpora = MATRIX_CORPORA_FOR_TASK[taskClass] ?? [];

@@ -548,9 +549,48 @@ async function fetchMatrixContext(
     errors.pathway_memory = e.message;
   }
+
+  // Optional adjacency-pollution filter via observer /relevance.
+  // Drops chunks that scored well on cosine but are about symbols the
+  // focus file only IMPORTS (not defines). Opt-out with
+  // LH_RELEVANCE_FILTER=0 for A/B comparison.
+  const relevanceEnabled = process.env.LH_RELEVANCE_FILTER !== "0";
+  let filteredHits = allHits;
+  let droppedCount = 0;
+  if (relevanceEnabled && allHits.length > 0) {
+    try {
+      const r = await fetch(`${OBSERVER_URL}/relevance`, {
+        method: "POST",
+        headers: { "content-type": "application/json" },
+        body: JSON.stringify({
+          focus_file: { path: filePath, content: focusContent ?? "" },
+          chunks: allHits.map(h => ({
+            source: h.source_corpus,
+            doc_id: h.doc_id,
+            text: h.text,
+            score: h.score,
+          })),
+          threshold: Number(process.env.LH_RELEVANCE_THRESHOLD ?? 0.3),
+        }),
+        signal: AbortSignal.timeout(5000),
+      });
+      if (r.ok) {
+        const fr: any = await r.json();
+        const keptKeys = new Set((fr.kept ?? []).map((c: any) => `${c.source}|${c.doc_id}`));
+        filteredHits = allHits.filter(h => keptKeys.has(`${h.source_corpus}|${h.doc_id}`));
+        droppedCount = (fr.dropped ?? []).length;
+      } else {
+        errors.relevance_filter = `HTTP ${r.status}`;
+      }
+    } catch (e: any) {
+      // Fail open: filter failure must not block the pipeline.
+      errors.relevance_filter = e.message;
+    }
+  }
+
   // Sort all hits by score desc, take top N
-  allHits.sort((a, b) => b.score - a.score);
+  filteredHits.sort((a, b) => b.score - a.score);
-  const topHits = allHits.slice(0, topOverall);
+  const topHits = filteredHits.slice(0, topOverall);
+  if (droppedCount > 0) byCorpus._relevance_dropped = droppedCount;
+
   return {
     hits: topHits,

@@ -1349,7 +1389,7 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
   // source so retrieval anchors against both the metadata and the
   // actual code being reviewed.
   const matrixQuery = `${taskClass} ${rel} ${content.slice(0, 500)}`;
-  const matrixCtx = await fetchMatrixContext(matrixQuery, taskClass, rel, 3, 8);
+  const matrixCtx = await fetchMatrixContext(matrixQuery, taskClass, rel, 3, 8, content);
   provenApproachesPreamble = buildMatrixPreamble(matrixCtx);
   const corporaSummary = Object.entries(matrixCtx.by_corpus)
     .map(([k, v]) => `${k.split("_v")[0]}=${v}`).join(" ");
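Operationally, both knobs added above are env-driven; assuming the pipeline
is launched directly with Bun (the entrypoint shown is the CI path, the
invocation itself is an assumption):

LH_RELEVANCE_FILTER=0 bun tests/real-world/scrum_master_pipeline.ts        # filter off — A/B baseline
LH_RELEVANCE_THRESHOLD=0.5 bun tests/real-world/scrum_master_pipeline.ts   # stricter cut than the 0.3 default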