From b199093d1f7ddba103589c553f12a602b052b9da Mon Sep 17 00:00:00 2001 From: root Date: Wed, 29 Apr 2026 20:08:56 -0500 Subject: [PATCH] =?UTF-8?q?B:=20matrix=20metadata=20filter=20=E2=80=94=20p?= =?UTF-8?q?ost-retrieval=20structured=20gate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the reality-test gap surfaced by the candidates and multi-corpus e2e runs (0d1553c, a97881d): semantic-only retrieval can't gate by status / state / availability. SearchRequest now takes an optional MetadataFilter map; results whose metadata doesn't match every key are dropped before top-K truncation. Filter value semantics: string|number|bool → exact equality (JSON-canonical, so 1 ≡ 1.0); []any → OR within key (any element matching wins); AND across keys: every filter key must match. Missing key in metadata = drop. Malformed metadata = drop. Filter absent or empty = pass through (zero overhead). The response now reports MetadataFilterDropped so callers can see how aggressive the filter was without re-querying. Caveat (also captured in code comment): this is POST-retrieval, not PRE-filtering via SQL. Aggressive filters can shrink the result set below K; caller should bump PerCorpusK to compensate. A queryd-backed pre-filter is a future commit; this lands the user-visible fix today. Tests: - 7 unit tests (internal/matrix/filter_test.go) covering: nil/empty filter pass-through, missing-metadata always-fails, single-value exact match (incl. numeric 5 ≡ 5.0), AND across keys, OR within list, bool match, malformed JSON metadata - matrix_smoke.sh: new assertion #7 — filter label∈{"a near","b near"} drops the 4 mid/far entries from the 6-entry pool, keeping exactly 2 (one per corpus, both with the matching label). Dropped count surfaces in the response. 15-smoke regression all green. vet clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/matrix/filter_test.go | 95 ++++++++++++++++++++++++ internal/matrix/retrieve.go | 127 +++++++++++++++++++++++++++++---- scripts/matrix_smoke.sh | 17 +++++ 3 files changed, 225 insertions(+), 14 deletions(-) create mode 100644 internal/matrix/filter_test.go diff --git a/internal/matrix/filter_test.go b/internal/matrix/filter_test.go new file mode 100644 index 0000000..509f55d --- /dev/null +++ b/internal/matrix/filter_test.go @@ -0,0 +1,95 @@ +package matrix + +import ( + "encoding/json" + "testing" +) + +func TestMatchesMetadataFilter_NoFilter_KeepsAll(t *testing.T) { + meta := json.RawMessage(`{"role":"Forklift Operator","state":"IL"}`) + if !matchesMetadataFilter(meta, nil) { + t.Error("nil filter should match everything") + } + if !matchesMetadataFilter(meta, map[string]any{}) { + t.Error("empty filter should match everything") + } +} + +func TestMatchesMetadataFilter_NoMetadata_AlwaysFails(t *testing.T) { + if matchesMetadataFilter(nil, map[string]any{"x": "y"}) { + t.Error("missing metadata should fail any filter") + } +} + +func TestMatchesMetadataFilter_SingleValueExactMatch(t *testing.T) { + meta := json.RawMessage(`{"state":"IL","status":"active","years":5}`) + cases := []struct { + filter map[string]any + want bool + }{ + {map[string]any{"state": "IL"}, true}, + {map[string]any{"state": "TX"}, false}, + {map[string]any{"status": "active"}, true}, + {map[string]any{"status": "inactive"}, false}, + // JSON normalizes both sides, so 5 matches 5.0 + {map[string]any{"years": 5.0}, true}, + {map[string]any{"years": 5}, true}, + // Missing key = fail + {map[string]any{"city": "Chicago"}, false}, + } + for _, c := range cases { + got := matchesMetadataFilter(meta, c.filter) + if got != c.want { + t.Errorf("filter %v on %s: want %v, got %v", c.filter, meta, c.want, got) + } + } +} + +func TestMatchesMetadataFilter_AllKeysAND(t *testing.T) { + meta := 
json.RawMessage(`{"state":"IL","status":"active","role":"Forklift Operator"}`) + if !matchesMetadataFilter(meta, map[string]any{ + "state": "IL", + "status": "active", + }) { + t.Error("both keys match: should pass") + } + if matchesMetadataFilter(meta, map[string]any{ + "state": "IL", + "status": "inactive", // mismatch + }) { + t.Error("one key mismatches: should fail (AND across keys)") + } +} + +func TestMatchesMetadataFilter_ListValueOR(t *testing.T) { + meta := json.RawMessage(`{"state":"IL"}`) + // state in {"IL","WI","IN"} → match + if !matchesMetadataFilter(meta, map[string]any{ + "state": []any{"IL", "WI", "IN"}, + }) { + t.Error("list with matching element: should pass") + } + // state in {"TX","CA"} → fail + if matchesMetadataFilter(meta, map[string]any{ + "state": []any{"TX", "CA"}, + }) { + t.Error("list with no matching element: should fail") + } +} + +func TestMatchesMetadataFilter_BoolMatch(t *testing.T) { + meta := json.RawMessage(`{"available":true,"placed":false}`) + if !matchesMetadataFilter(meta, map[string]any{"available": true}) { + t.Error("bool true match") + } + if matchesMetadataFilter(meta, map[string]any{"available": false}) { + t.Error("bool true should not match false filter") + } +} + +func TestMatchesMetadataFilter_MalformedMetadataFails(t *testing.T) { + meta := json.RawMessage(`{not valid json}`) + if matchesMetadataFilter(meta, map[string]any{"x": "y"}) { + t.Error("malformed metadata should fail") + } +} diff --git a/internal/matrix/retrieve.go b/internal/matrix/retrieve.go index 96dfd2b..e8c55ce 100644 --- a/internal/matrix/retrieve.go +++ b/internal/matrix/retrieve.go @@ -55,17 +55,29 @@ type Result struct { // DefaultPlaybookTopK. // PlaybookMaxDistance: cosine ceiling for "similar enough"; 0 = // DefaultPlaybookMaxDistance. +// +// Metadata filter (post-retrieval structured gate): +// MetadataFilter: map of metadata-field → expected value. Results +// whose metadata doesn't match every key are dropped. 
Addresses +// the reality-test gap surfaced in the candidates/workers +// experiments — pure semantic retrieval can't gate by status, +// state, etc. Caller can compensate for filter shrinkage by +// requesting larger PerCorpusK. +// Each filter value can be a single value (string|number|bool — +// whatever JSON unmarshals to `any`) or a []any meaning "any +// of these values" (OR semantics within one key, AND across keys). type SearchRequest struct { - QueryText string `json:"query_text,omitempty"` - QueryVector []float32 `json:"query_vector,omitempty"` - Corpora []string `json:"corpora"` - K int `json:"k"` - PerCorpusK int `json:"per_corpus_k,omitempty"` - Model string `json:"model,omitempty"` - UsePlaybook bool `json:"use_playbook,omitempty"` - PlaybookCorpus string `json:"playbook_corpus,omitempty"` - PlaybookTopK int `json:"playbook_top_k,omitempty"` - PlaybookMaxDistance float64 `json:"playbook_max_distance,omitempty"` + QueryText string `json:"query_text,omitempty"` + QueryVector []float32 `json:"query_vector,omitempty"` + Corpora []string `json:"corpora"` + K int `json:"k"` + PerCorpusK int `json:"per_corpus_k,omitempty"` + Model string `json:"model,omitempty"` + UsePlaybook bool `json:"use_playbook,omitempty"` + PlaybookCorpus string `json:"playbook_corpus,omitempty"` + PlaybookTopK int `json:"playbook_top_k,omitempty"` + PlaybookMaxDistance float64 `json:"playbook_max_distance,omitempty"` + MetadataFilter map[string]any `json:"metadata_filter,omitempty"` } // SearchResponse wraps the merged results plus per-corpus return @@ -73,10 +85,13 @@ type SearchRequest struct { // without re-querying. PlaybookBoosted is the count of results that // received a boost from playbook memory; useful for telemetry on // "how much the learning loop influenced this query." +// MetadataFilterDropped is the count of results dropped by the +// post-retrieval structured filter (when set in the request). 
type SearchResponse struct { - Results []Result `json:"results"` - PerCorpusCounts map[string]int `json:"per_corpus_counts"` - PlaybookBoosted int `json:"playbook_boosted,omitempty"` + Results []Result `json:"results"` + PerCorpusCounts map[string]int `json:"per_corpus_counts"` + PlaybookBoosted int `json:"playbook_boosted,omitempty"` + MetadataFilterDropped int `json:"metadata_filter_dropped,omitempty"` } // Retriever holds the HTTP clients to embedd and vectord. Stateless @@ -176,10 +191,32 @@ func (r *Retriever) Search(ctx context.Context, req SearchRequest) (*SearchRespo sort.SliceStable(allHits, func(i, j int) bool { return allHits[i].Distance < allHits[j].Distance }) + + // Metadata filter (component B — staffing-side structured gate). + // Applied BEFORE top-K truncation so the filter doesn't accidentally + // reduce coverage further. Caller can request larger PerCorpusK to + // compensate when filters are aggressive. + var dropped int + if len(req.MetadataFilter) > 0 { + filtered := make([]Result, 0, len(allHits)) + for _, h := range allHits { + if matchesMetadataFilter(h.Metadata, req.MetadataFilter) { + filtered = append(filtered, h) + } else { + dropped++ + } + } + allHits = filtered + } + if len(allHits) > req.K { allHits = allHits[:req.K] } - resp := &SearchResponse{Results: allHits, PerCorpusCounts: perCorpus} + resp := &SearchResponse{ + Results: allHits, + PerCorpusCounts: perCorpus, + MetadataFilterDropped: dropped, + } // Playbook boost (component 5). Reuses the query vector — no // extra embed call. If the playbook corpus doesn't exist (first @@ -352,6 +389,68 @@ func (r *Retriever) addItem(ctx context.Context, corpus, id string, vec []float3 return nil } +// matchesMetadataFilter reports whether a result's metadata satisfies +// the filter. Each filter key must be present in the metadata; the +// value must equal (or for a list filter, contain) the metadata +// value. Missing key = drop. Type mismatches are JSON-equality +// checked (e.g. 
filter wants 1 but metadata has 1.0 → match via +// canonical JSON form). +// +// Filter value semantics: +// string|number|bool → exact equality (after JSON normalization) +// []any → OR within key (any element matching wins) +// +// AND across keys: every filter key must match. +func matchesMetadataFilter(rawMeta json.RawMessage, filter map[string]any) bool { + if len(filter) == 0 { + return true + } + if len(rawMeta) == 0 { + return false // no metadata can't satisfy any filter + } + var meta map[string]any + if err := json.Unmarshal(rawMeta, &meta); err != nil { + return false + } + for k, expected := range filter { + got, present := meta[k] + if !present { + return false + } + if !valueMatches(got, expected) { + return false + } + } + return true +} + +// valueMatches handles single-value and list-value filter semantics. +// JSON-canonical equality so 1 ≡ 1.0 and "true" != true. +func valueMatches(got, expected any) bool { + if list, ok := expected.([]any); ok { + for _, e := range list { + if jsonEqual(got, e) { + return true + } + } + return false + } + return jsonEqual(got, expected) +} + +// jsonEqual marshals both sides and compares the canonical forms. +// Handles the float64-vs-int problem inherent to encoding/json +// (which decodes all numbers as float64) — both sides go through +// the same encoder so 1 == 1.0 if both came in as numbers. +func jsonEqual(a, b any) bool { + ab, errA := json.Marshal(a) + bb, errB := json.Marshal(b) + if errA != nil || errB != nil { + return false + } + return string(ab) == string(bb) +} + // Corpora returns the list of vectord index names. Thin proxy to // GET /vectors/index — exposed at the matrix layer so callers don't // need direct vectord access. diff --git a/scripts/matrix_smoke.sh b/scripts/matrix_smoke.sh index 2cd6f19..a22e0c9 100755 --- a/scripts/matrix_smoke.sh +++ b/scripts/matrix_smoke.sh @@ -204,6 +204,23 @@ else FAILED=1 fi +# ── 7. 
metadata filter (component B — staffing-side structured gate) +echo "[matrix-smoke] metadata_filter drops non-matching results:" +RESP="$(curl -sS -X POST http://127.0.0.1:3110/v1/matrix/search \ + -H 'Content-Type: application/json' \ + -d '{"query_vector":[1,0,0,0],"corpora":["corpus_a","corpus_b"],"k":4,"per_corpus_k":3, + "metadata_filter":{"label":["a near","b near"]}}')" +RESULTS_LEN="$(echo "$RESP" | jq -r '.results | length')" +DROPPED="$(echo "$RESP" | jq -r '.metadata_filter_dropped')" +KEPT_LABELS="$(echo "$RESP" | jq -r '[.results[].metadata.label] | sort | join(",")')" +if [ "$RESULTS_LEN" = "2" ] && [ "$DROPPED" = "4" ] && [ "$KEPT_LABELS" = "a near,b near" ]; then + echo " ✓ filter kept 2 ('a near' + 'b near'), dropped 4 mid/far entries" +else + echo " ✗ len=$RESULTS_LEN dropped=$DROPPED labels=$KEPT_LABELS" + echo " full: $RESP" + FAILED=1 +fi + if [ "$FAILED" -eq 0 ]; then echo "[matrix-smoke] Matrix acceptance gate: PASSED" exit 0