golangLAKEHOUSE/scripts/matrix_smoke.sh
root b199093d1f B: matrix metadata filter — post-retrieval structured gate
Addresses the reality-test gap surfaced by the candidates and
multi-corpus e2e runs (0d1553c, a97881d): semantic-only retrieval
can't gate by status / state / availability. SearchRequest now
takes an optional MetadataFilter map; results whose metadata
doesn't match every key are dropped before top-K truncation.

Filter value semantics:
  string|number|bool → exact equality (JSON-canonical, so 1 ≡ 1.0)
  []any              → OR within key (any element matching wins)
  AND across keys: every filter key must match.

Missing key in metadata = drop. Malformed metadata = drop. Filter
absent or empty = pass through (zero overhead).

The response now reports MetadataFilterDropped so callers can see
how aggressive the filter was without re-querying.

Caveat (also captured in code comment): this is POST-retrieval, not
PRE-filtering via SQL. Aggressive filters can shrink the result set
below K; caller should bump PerCorpusK to compensate. A queryd-
backed pre-filter is a future commit; this lands the user-visible
fix today.

Tests:
  - 7 unit tests (internal/matrix/filter_test.go) covering: nil/
    empty filter pass-through, missing-metadata always-fails,
    single-value exact match (incl. numeric 5 ≡ 5.0), AND across
    keys, OR within list, bool match, malformed JSON metadata
  - matrix_smoke.sh: new assertion #7 — filter
    label∈{"a near","b near"} drops the 4 mid/far entries from the
    6-entry pool, keeping exactly 2 (one per corpus, both with the
    matching label). Dropped count surfaces in the response.

15-smoke regression all green. vet clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 20:08:56 -05:00

231 lines
9.5 KiB
Bash
Executable File

#!/usr/bin/env bash
# Matrix smoke — multi-corpus retrieve+merge via matrixd (SPEC §3.4).
# All assertions go through gateway :3110.
#
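# Validates:
# - Multi-corpus search returns hits from BOTH corpora
# - Each result carries its corpus attribution (load-bearing — losing
# it defeats the matrix's purpose)
# - Merged top-k is ordered by distance across corpora
# - /matrix/corpora lists known indexes
# - Empty corpora list → 400
# - Bad corpus name → 502 (matrix bubbles vectord's 404 as upstream error)
# - Request with neither query text nor query vector → 400
# - Result metadata round-trips through the merge
# - metadata_filter drops non-matching results and reports the dropped count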
#
# Uses query_vector (not query_text) to skip the embedd dependency so
# this smoke runs without Ollama. End-to-end embed→matrix→search has
# its own integration test (next commit).
#
# Usage: ./scripts/matrix_smoke.sh
# Strict mode: stop on any unhandled command failure, on use of an
# unset variable, and when any stage of a pipeline fails.
set -euo pipefail
# Work from the parent of the directory holding this script (presumably
# the repository root) so the relative ./cmd and bin/ paths resolve.
cd "$(dirname "$0")/.."
# Append the default Go install location so `go` is found even when the
# caller's PATH does not include it.
export PATH="$PATH:/usr/local/go/bin"
echo "[matrix-smoke] building matrixd + vectord + gateway..."
go build -o bin/ ./cmd/matrixd ./cmd/vectord ./cmd/gateway
# Best-effort: kill any already-running process whose command line
# matches one of the three binaries. pkill exits non-zero when nothing
# matched, hence the `|| true` under `set -e`.
pkill -f "bin/(matrixd|vectord|gateway)" 2>/dev/null || true
# Short pause after the kill — presumably to let the old processes
# release their listening ports before the new ones bind.
sleep 0.3
PIDS=()
TMP="$(mktemp -d)"
CFG="$TMP/matrix.toml"
cleanup() {
echo "[matrix-smoke] cleanup"
for p in "${PIDS[@]}"; do [ -n "$p" ] && kill "$p" 2>/dev/null || true; done
rm -rf "$TMP"
}
trap cleanup EXIT INT TERM
# Custom toml: vectord persistence disabled (don't pollute storaged
# state with the test corpora). The empty storaged_url under [vectord]
# is presumably what switches persistence off — confirm against
# vectord's config handling.
#
# The gateway listens on 3110 and is given URLs for every backend, but
# this script only launches vectord (3215), matrixd (3218) and the
# gateway itself; the remaining URLs point at services it never starts.
#
# The heredoc delimiter is unquoted, but the body contains no `$` or
# backticks, so the text is written to $CFG verbatim.
cat > "$CFG" <<EOF
[gateway]
bind = "127.0.0.1:3110"
storaged_url = "http://127.0.0.1:3211"
catalogd_url = "http://127.0.0.1:3212"
ingestd_url = "http://127.0.0.1:3213"
queryd_url = "http://127.0.0.1:3214"
vectord_url = "http://127.0.0.1:3215"
embedd_url = "http://127.0.0.1:3216"
pathwayd_url = "http://127.0.0.1:3217"
matrixd_url = "http://127.0.0.1:3218"
[vectord]
bind = "127.0.0.1:3215"
storaged_url = ""
[matrixd]
bind = "127.0.0.1:3218"
embedd_url = "http://127.0.0.1:3216"
vectord_url = "http://127.0.0.1:3215"
EOF
#######################################
# Wait until a local service answers on its /health endpoint.
# Arguments:
#   $1 - TCP port on 127.0.0.1 to probe
#   $2 - optional timeout in whole seconds (default: 5)
# Returns:
#   0 as soon as curl completes a request, 1 once the deadline passes.
#######################################
poll_health() {
  local port="$1" timeout="${2:-5}"
  local deadline=$(( $(date +%s) + timeout ))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    # NOTE(review): curl runs without --fail, so ANY HTTP response —
    # including an error status — counts as "healthy". Tighten only
    # after confirming every service returns 2xx on /health.
    if curl -sS --max-time 1 "http://127.0.0.1:$port/health" >/dev/null 2>&1; then
      return 0
    fi
    sleep 0.05
  done
  return 1
}
echo "[matrix-smoke] launching vectord → matrixd → gateway..."
# Bring the services up one at a time, in the order announced above.
# Each entry is name:port; a service must answer its health probe
# before the next one is started. Output goes to /tmp/<name>.log, whose
# tail is shown if the probe times out.
for entry in vectord:3215 matrixd:3218 gateway:3110; do
  svc="${entry%%:*}"
  svc_port="${entry##*:}"
  "./bin/$svc" -config "$CFG" > "/tmp/$svc.log" 2>&1 &
  PIDS+=("$!")
  if ! poll_health "$svc_port"; then
    echo "$svc failed"
    tail "/tmp/$svc.log"
    exit 1
  fi
done
# Overall verdict flag: any failed check sets it to 1; the final gate
# at the end of the script reads it.
FAILED=0
# Dimension used for both indexes and every test vector.
DIM=4
# Create two corpora — corpus_a and corpus_b — each with a few
# vectors at known distances from a chosen query vector.
echo "[matrix-smoke] create two corpora:"
# Tracks this step only, so the success line is printed just when both
# creations really returned 201.
CREATED_OK=1
for c in corpus_a corpus_b; do
  HTTP="$(curl -sS -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:3110/v1/vectors/index \
    -H 'Content-Type: application/json' \
    -d "{\"name\":\"$c\",\"dimension\":$DIM,\"distance\":\"euclidean\"}")"
  if [ "$HTTP" != "201" ]; then
    # Keep the corpus name and the status visibly separate in the report.
    echo " ✗ create $c → HTTP $HTTP"
    FAILED=1
    CREATED_OK=0
  fi
done
if [ "$CREATED_OK" -eq 1 ]; then
  echo " ✓ corpus_a and corpus_b created"
fi
# Add vectors. Use euclidean distance for predictable arithmetic.
# Query vector will be [1,0,0,0]. Distances from it:
# corpus_a/a-near : [1.1, 0, 0, 0] ≈ 0.1
# corpus_a/a-mid : [1, 0.5, 0, 0] ≈ 0.5
# corpus_a/a-far : [3, 0, 0, 0] ≈ 2.0
# corpus_b/b-near : [1.05, 0, 0, 0] ≈ 0.05 (closest globally)
# corpus_b/b-mid : [1, 0.7, 0, 0] ≈ 0.7
# corpus_b/b-far : [4, 0, 0, 0] ≈ 3.0
echo "[matrix-smoke] add vectors to both corpora:"

#######################################
# POST a batch of items to one index through the gateway.
# Arguments:
#   $1 - index (corpus) name
#   $2 - JSON request body
# Outputs:   a failure line on stdout when the request is not accepted
# Returns:   0 on any 2xx status, 1 otherwise
#######################################
add_items() {
  local corpus="$1" payload="$2" status
  # curl exits 0 on HTTP 4xx/5xx unless told otherwise, so the status
  # code is captured and checked explicitly. `|| true` keeps a transport
  # failure (status "000") on the reporting path instead of tripping -e.
  status="$(curl -sS --max-time 10 -o /dev/null -w '%{http_code}' -X POST \
    "http://127.0.0.1:3110/v1/vectors/index/$corpus/add" \
    -H 'Content-Type: application/json' \
    -d "$payload")" || true
  case "$status" in
    2??) return 0 ;;
  esac
  echo " ✗ add to $corpus → HTTP ${status:-none}"
  return 1
}

LOADED_OK=1
add_items corpus_a '{"items":[
{"id":"a-near","vector":[1.1,0,0,0],"metadata":{"label":"a near"}},
{"id":"a-mid","vector":[1,0.5,0,0],"metadata":{"label":"a mid"}},
{"id":"a-far","vector":[3,0,0,0],"metadata":{"label":"a far"}}
]}' || LOADED_OK=0
add_items corpus_b '{"items":[
{"id":"b-near","vector":[1.05,0,0,0],"metadata":{"label":"b near"}},
{"id":"b-mid","vector":[1,0.7,0,0],"metadata":{"label":"b mid"}},
{"id":"b-far","vector":[4,0,0,0],"metadata":{"label":"b far"}}
]}' || LOADED_OK=0
# Only claim success when both batches were accepted; otherwise fail
# the gate, since every later assertion depends on these vectors.
if [ "$LOADED_OK" -eq 1 ]; then
  echo " ✓ 3 + 3 vectors loaded"
else
  FAILED=1
fi
# ── 1. /matrix/corpora lists both ─────────────────────────────────
echo "[matrix-smoke] /matrix/corpora lists both:"
RESP="$(curl -sS http://127.0.0.1:3110/v1/matrix/corpora)"
# Three facts are read from the listing: the reported count and whether
# each test corpus name appears in .corpora.
listed_count="$(jq -r '.count' <<<"$RESP")"
lists_a="$(jq -r '.corpora | index("corpus_a") != null' <<<"$RESP")"
lists_b="$(jq -r '.corpora | index("corpus_b") != null' <<<"$RESP")"
if [[ "$listed_count" != "2" || "$lists_a" != "true" || "$lists_b" != "true" ]]; then
  echo " ✗ resp: $RESP"
  FAILED=1
else
  echo " ✓ count=2, both corpora listed"
fi
# ── 2. multi-corpus search returns hits from BOTH ─────────────────
echo "[matrix-smoke] /matrix/search multi-corpus retrieve+merge:"
# One search over both corpora with k=4 and per_corpus_k=3. RESP is
# kept in place because the top-hit, metadata and ordering checks that
# follow read the same response.
search_body='{"query_vector":[1,0,0,0],"corpora":["corpus_a","corpus_b"],"k":4,"per_corpus_k":3}'
RESP="$(curl -sS -X POST http://127.0.0.1:3110/v1/matrix/search \
  -H 'Content-Type: application/json' \
  -d "$search_body")"
merged_len="$(jq -r '.results | length' <<<"$RESP")"
from_a="$(jq -r '.per_corpus_counts.corpus_a' <<<"$RESP")"
from_b="$(jq -r '.per_corpus_counts.corpus_b' <<<"$RESP")"
a_hit="$(jq -r '[.results[] | select(.corpus=="corpus_a")] | length > 0' <<<"$RESP")"
b_hit="$(jq -r '[.results[] | select(.corpus=="corpus_b")] | length > 0' <<<"$RESP")"
# Pass only when the merge returned 4 rows, each corpus reported 3
# candidates, and the merged list contains a row from each corpus.
if [[ "$merged_len" == "4" && "$from_a" == "3" && "$from_b" == "3" \
  && "$a_hit" == "true" && "$b_hit" == "true" ]]; then
  echo " ✓ 4 merged results · 3+3 per-corpus · both corpora represented"
else
  echo " ✗ len=$merged_len per_corpus={a:$from_a b:$from_b} a_hit=$a_hit b_hit=$b_hit"
  echo " full: $RESP"
  FAILED=1
fi
# ── 3. distance-merged top-k correct across corpora ───────────────
echo "[matrix-smoke] top hit comes from corpus_b (b-near is globally closest):"
# Reads the first row of the search response captured by the previous
# check; it must be b-near, attributed to corpus_b.
first_id="$(jq -r '.results[0].id' <<<"$RESP")"
first_corpus="$(jq -r '.results[0].corpus' <<<"$RESP")"
if [[ "$first_id" != "b-near" || "$first_corpus" != "corpus_b" ]]; then
  echo " ✗ top: id=$first_id corpus=$first_corpus (expected b-near/corpus_b)"
  FAILED=1
else
  echo " ✓ top hit: id=b-near corpus=corpus_b (closer than corpus_a's a-near)"
fi
# ── 4. item metadata survives the merge ───────────────────────────
echo "[matrix-smoke] metadata preserved on merged results:"
# The top row's metadata.label must equal the label stored with b-near.
first_label="$(jq -r '.results[0].metadata.label' <<<"$RESP")"
if [[ "$first_label" != "b near" ]]; then
  echo " ✗ label=$first_label"
  FAILED=1
else
  echo " ✓ metadata.label round-trips through matrix"
fi
# ── 5. distances ascending in result list ─────────────────────────
echo "[matrix-smoke] results sorted by distance ascending:"
# jq compares the distance list with its own sorted copy, yielding
# "true" only when the rows already arrive in ascending order.
in_order="$(jq -r '[.results[].distance] | . == (sort)' <<<"$RESP")"
if [[ "$in_order" != "true" ]]; then
  echo " ✗ distances not sorted: $(jq -c '[.results[].distance]' <<<"$RESP")"
  FAILED=1
else
  echo " ✓ distances ascending"
fi
# ── 6. negative paths ─────────────────────────────────────────────
# POST the JSON body in $1 to /v1/matrix/search and print only the HTTP
# status code; the response body is discarded.
search_status() {
  curl -sS -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:3110/v1/matrix/search \
    -H 'Content-Type: application/json' \
    -d "$1"
}
echo "[matrix-smoke] empty corpora → 400:"
code_empty="$(search_status '{"query_vector":[1,0,0,0],"corpora":[],"k":4}')"
echo "[matrix-smoke] missing corpus name → 502:"
code_missing="$(search_status '{"query_vector":[1,0,0,0],"corpora":["does_not_exist"],"k":4}')"
echo "[matrix-smoke] no query (empty text and vector) → 400:"
code_noquery="$(search_status '{"corpora":["corpus_a"],"k":4}')"
# All three rejections are judged together and reported on one line.
if [[ "$code_empty" != "400" || "$code_missing" != "502" || "$code_noquery" != "400" ]]; then
  echo " ✗ empty=$code_empty missing=$code_missing noquery=$code_noquery"
  FAILED=1
else
  echo " ✓ empty=400, missing-corpus=502, no-query=400"
fi
# ── 7. metadata filter (component B — staffing-side structured gate)
echo "[matrix-smoke] metadata_filter drops non-matching results:"
# Same query as the multi-corpus search, plus a filter that accepts
# only the two "near" labels. Of the six loaded vectors, two should
# remain and four should be counted as dropped.
filter_body='{"query_vector":[1,0,0,0],"corpora":["corpus_a","corpus_b"],"k":4,"per_corpus_k":3,
"metadata_filter":{"label":["a near","b near"]}}'
RESP="$(curl -sS -X POST http://127.0.0.1:3110/v1/matrix/search \
  -H 'Content-Type: application/json' \
  -d "$filter_body")"
kept_len="$(jq -r '.results | length' <<<"$RESP")"
dropped_n="$(jq -r '.metadata_filter_dropped' <<<"$RESP")"
# Labels are sorted before joining so the comparison ignores row order.
kept_labels="$(jq -r '[.results[].metadata.label] | sort | join(",")' <<<"$RESP")"
if [[ "$kept_len" == "2" && "$dropped_n" == "4" && "$kept_labels" == "a near,b near" ]]; then
  echo " ✓ filter kept 2 ('a near' + 'b near'), dropped 4 mid/far entries"
else
  echo " ✗ len=$kept_len dropped=$dropped_n labels=$kept_labels"
  echo " full: $RESP"
  FAILED=1
fi
# Final verdict: the exit status mirrors the accumulated FAILED flag,
# so a caller can gate on this script directly.
if ! [ "$FAILED" -eq 0 ]; then
  echo "[matrix-smoke] Matrix acceptance gate: FAILED"
  exit 1
fi
echo "[matrix-smoke] Matrix acceptance gate: PASSED"
exit 0