Addresses the reality-test gap surfaced by the candidates and
multi-corpus e2e runs (0d1553c, a97881d): semantic-only retrieval
can't gate by status / state / availability. SearchRequest now
takes an optional MetadataFilter map; results whose metadata
doesn't match every key are dropped before top-K truncation.
Filter value semantics:
  - string|number|bool → exact equality (JSON-canonical, so 1 ≡ 1.0)
  - []any → OR within key (any element matching wins)
  - AND across keys: every filter key must match.
  - Missing key in metadata = drop. Malformed metadata = drop.
  - Filter absent or empty = pass through (zero overhead).
The response now reports MetadataFilterDropped so callers can see
how aggressive the filter was without re-querying.
Caveat (also captured in code comment): this is POST-retrieval, not
PRE-filtering via SQL. Aggressive filters can shrink the result set
below K; caller should bump PerCorpusK to compensate. A queryd-
backed pre-filter is a future commit; this lands the user-visible
fix today.
Tests:
- 7 unit tests (internal/matrix/filter_test.go) covering: nil/
empty filter pass-through, missing-metadata always-fails,
single-value exact match (incl. numeric 5 ≡ 5.0), AND across
keys, OR within list, bool match, malformed JSON metadata
- matrix_smoke.sh: new assertion #7 — filter
label∈{"a near","b near"} drops the 4 mid/far entries from the
6-entry pool, keeping exactly 2 (one per corpus, both with the
matching label). Dropped count surfaces in the response.
15-smoke regression all green. vet clean.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
231 lines
9.5 KiB
Bash
Executable File
231 lines
9.5 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# Matrix smoke — multi-corpus retrieve+merge via matrixd (SPEC §3.4).
|
|
# All assertions go through gateway :3110.
|
|
#
|
|
# Validates:
#   - Multi-corpus search returns hits from BOTH corpora
#   - Each result carries its corpus attribution (load-bearing — losing
#     it defeats the matrix's purpose)
#   - Merged top-k is ordered by distance across corpora
#   - /matrix/corpora lists known indexes
#   - Empty corpora list → 400
#   - Bad corpus name → 502 (matrix bubbles vectord's 404 as upstream error)
#   - Missing query (no query_text and no query_vector) → 400
#   - metadata_filter drops non-matching results and reports the count
#     in metadata_filter_dropped
|
|
#
|
|
# Uses query_vector (not query_text) to skip the embedd dependency so
|
|
# this smoke runs without Ollama. End-to-end embed→matrix→search has
|
|
# its own integration test (next commit).
|
|
#
|
|
# Usage: ./scripts/matrix_smoke.sh
|
|
|
|
# Strict mode: stop on command failure, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Run from the parent of the directory that holds this script, so the
# relative ./cmd and bin/ paths below resolve.
smoke_root="$(dirname "$0")/.."
cd "$smoke_root"

# Make the Go toolchain reachable when it lives in /usr/local/go/bin.
PATH="$PATH:/usr/local/go/bin"
export PATH

echo "[matrix-smoke] building matrixd + vectord + gateway..."
go build -o bin/ \
  ./cmd/matrixd \
  ./cmd/vectord \
  ./cmd/gateway

# Best-effort: stop instances left over from an earlier run. A non-zero
# pkill status is deliberately ignored.
if ! pkill -f "bin/(matrixd|vectord|gateway)" 2>/dev/null; then
  :
fi
sleep 0.3
|
|
|
|
# PIDs of the background daemons launched by this script.
PIDS=()
# Scratch directory for the generated config; removed by cleanup.
TMP="$(mktemp -d)"
CFG="$TMP/matrix.toml"

#######################################
# Stop every recorded daemon and delete the scratch directory.
# Globals:   PIDS (read), TMP (read)
# Arguments: none
# Outputs:   one progress line on stdout
# Returns:   0; kill failures (e.g. already-exited PIDs) are ignored
#######################################
cleanup() {
  local pid
  echo "[matrix-smoke] cleanup"
  # The ${PIDS[@]+...} form expands to nothing for an empty array, so
  # `set -u` on bash < 4.4 does not abort here before the rm below.
  for pid in ${PIDS[@]+"${PIDS[@]}"}; do
    if [ -n "$pid" ]; then
      kill "$pid" 2>/dev/null || true
    fi
  done
  if [ -n "${TMP:-}" ]; then
    rm -rf -- "$TMP"
  fi
}

# cleanup runs exactly once, from the EXIT trap. INT/TERM must terminate
# the script: a handler that merely returns would let execution resume
# after the daemons and $TMP are already gone.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
|
|
|
|
# Throwaway config for this run. vectord's storaged_url is left empty so
# its persistence is disabled and the test corpora do not pollute
# storaged state. The delimiter is quoted: the body is literal TOML and
# needs no shell expansion.
cat > "$CFG" <<'TOML'
[gateway]
bind = "127.0.0.1:3110"
storaged_url = "http://127.0.0.1:3211"
catalogd_url = "http://127.0.0.1:3212"
ingestd_url = "http://127.0.0.1:3213"
queryd_url = "http://127.0.0.1:3214"
vectord_url = "http://127.0.0.1:3215"
embedd_url = "http://127.0.0.1:3216"
pathwayd_url = "http://127.0.0.1:3217"
matrixd_url = "http://127.0.0.1:3218"

[vectord]
bind = "127.0.0.1:3215"
storaged_url = ""

[matrixd]
bind = "127.0.0.1:3218"
embedd_url = "http://127.0.0.1:3216"
vectord_url = "http://127.0.0.1:3215"
TOML
|
|
|
|
#######################################
# Poll http://127.0.0.1:<port>/health until curl succeeds or the
# deadline passes.
# Arguments: $1 - TCP port to probe on 127.0.0.1
#            $2 - optional timeout in whole seconds (default 5)
# Returns:   0 as soon as one probe succeeds, 1 on timeout
# Note:      curl is run without --fail, so any HTTP response counts
#            as success; only transport-level errors keep it polling.
#######################################
poll_health() {
  local port="$1"
  local timeout="${2:-5}"
  # SECONDS is bash's elapsed-seconds counter; it avoids forking `date`
  # on every iteration of the loop.
  local deadline=$(( SECONDS + timeout ))
  while (( SECONDS < deadline )); do
    if curl -sS --max-time 1 "http://127.0.0.1:${port}/health" >/dev/null 2>&1; then
      return 0
    fi
    sleep 0.05
  done
  return 1
}
|
|
|
|
echo "[matrix-smoke] launching vectord → matrixd → gateway..."
# Start each daemon in the order vectord, matrixd, gateway. Each entry is
# "<binary>:<health port>". Every daemon must answer its health probe
# before the next one is started; on failure the tail of its log is
# shown and the run aborts.
for daemon_spec in vectord:3215 matrixd:3218 gateway:3110; do
  daemon="${daemon_spec%%:*}"
  daemon_port="${daemon_spec##*:}"
  daemon_log="/tmp/${daemon}.log"

  "./bin/${daemon}" -config "$CFG" > "$daemon_log" 2>&1 &
  PIDS+=($!)

  if ! poll_health "$daemon_port"; then
    echo "${daemon} failed"
    tail "$daemon_log"
    exit 1
  fi
done
|
|
|
|
# Overall verdict flag: any failed check below sets it to 1.
FAILED=0
# Dimension used for both test indexes and every test vector.
DIM=4

# Create two corpora — corpus_a and corpus_b — each with a few
# vectors at known distances from a chosen query vector.
echo "[matrix-smoke] create two corpora:"
CREATED_OK=1
for c in corpus_a corpus_b; do
  HTTP="$(curl -sS -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:3110/v1/vectors/index \
    -H 'Content-Type: application/json' \
    -d "{\"name\":\"$c\",\"dimension\":$DIM,\"distance\":\"euclidean\"}")"
  if [ "$HTTP" != "201" ]; then
    echo " ✗ create $c → $HTTP"
    FAILED=1
    CREATED_OK=0
  fi
done
# Report success only when every create call returned 201; previously
# the ✓ line was printed even after a ✗.
if [ "$CREATED_OK" -eq 1 ]; then
  echo " ✓ corpus_a and corpus_b created"
fi
|
|
|
|
# Add vectors. Use euclidean distance for predictable arithmetic.
# Query vector will be [1,0,0,0]. Distances from it:
#   corpus_a/a-near : [1.1, 0, 0, 0]  ≈ 0.1
#   corpus_a/a-mid  : [1, 0.5, 0, 0]  ≈ 0.5
#   corpus_a/a-far  : [3, 0, 0, 0]    ≈ 2.0
#   corpus_b/b-near : [1.05, 0, 0, 0] ≈ 0.05 (closest globally)
#   corpus_b/b-mid  : [1, 0.7, 0, 0]  ≈ 0.7
#   corpus_b/b-far  : [4, 0, 0, 0]    ≈ 3.0
echo "[matrix-smoke] add vectors to both corpora:"
LOADED_OK=1
for c in corpus_a corpus_b; do
  case "$c" in
    corpus_a)
      ITEMS='{"items":[
{"id":"a-near","vector":[1.1,0,0,0],"metadata":{"label":"a near"}},
{"id":"a-mid","vector":[1,0.5,0,0],"metadata":{"label":"a mid"}},
{"id":"a-far","vector":[3,0,0,0],"metadata":{"label":"a far"}}
]}'
      ;;
    corpus_b)
      ITEMS='{"items":[
{"id":"b-near","vector":[1.05,0,0,0],"metadata":{"label":"b near"}},
{"id":"b-mid","vector":[1,0.7,0,0],"metadata":{"label":"b mid"}},
{"id":"b-far","vector":[4,0,0,0],"metadata":{"label":"b far"}}
]}'
      ;;
  esac
  # Capture the status code: without it a 4xx/5xx from the add endpoint
  # would go unnoticed and only surface later as confusing search
  # failures.
  HTTP="$(curl -sS -o /dev/null -w '%{http_code}' \
    -X POST "http://127.0.0.1:3110/v1/vectors/index/$c/add" \
    -H 'Content-Type: application/json' \
    -d "$ITEMS")"
  case "$HTTP" in
    2??) ;;
    *)
      echo " ✗ add to $c → $HTTP"
      FAILED=1
      LOADED_OK=0
      ;;
  esac
done
if [ "$LOADED_OK" -eq 1 ]; then
  echo " ✓ 3 + 3 vectors loaded"
fi
|
|
|
|
# ── 1. /matrix/corpora lists both ─────────────────────────────────
# Expect .count == 2 and both corpus names present in .corpora.
echo "[matrix-smoke] /matrix/corpora lists both:"
RESP="$(curl -sS http://127.0.0.1:3110/v1/matrix/corpora)"
COUNT="$(jq -r '.count' <<<"$RESP")"
HAS_A="$(jq -r '.corpora | index("corpus_a") != null' <<<"$RESP")"
HAS_B="$(jq -r '.corpora | index("corpus_b") != null' <<<"$RESP")"
if [[ "$COUNT" != "2" || "$HAS_A" != "true" || "$HAS_B" != "true" ]]; then
  echo " ✗ resp: $RESP"
  FAILED=1
else
  echo " ✓ count=2, both corpora listed"
fi
|
|
|
|
# ── 2. multi-corpus search returns hits from BOTH ─────────────────
# k=4 over two corpora of three vectors each: expect 4 merged results,
# per-corpus counts of 3 and 3, and at least one result from each corpus.
# RESP is reused by checks 3–5 below.
echo "[matrix-smoke] /matrix/search multi-corpus retrieve+merge:"
RESP="$(curl -sS -X POST http://127.0.0.1:3110/v1/matrix/search \
  -H 'Content-Type: application/json' \
  -d '{"query_vector":[1,0,0,0],"corpora":["corpus_a","corpus_b"],"k":4,"per_corpus_k":3}')"
RESULTS_LEN="$(jq -r '.results | length' <<<"$RESP")"
A_COUNT="$(jq -r '.per_corpus_counts.corpus_a' <<<"$RESP")"
B_COUNT="$(jq -r '.per_corpus_counts.corpus_b' <<<"$RESP")"
HAS_A_RESULT="$(jq -r '[.results[] | select(.corpus=="corpus_a")] | length > 0' <<<"$RESP")"
HAS_B_RESULT="$(jq -r '[.results[] | select(.corpus=="corpus_b")] | length > 0' <<<"$RESP")"
if [[ "$RESULTS_LEN" != "4" || "$A_COUNT" != "3" || "$B_COUNT" != "3" \
      || "$HAS_A_RESULT" != "true" || "$HAS_B_RESULT" != "true" ]]; then
  echo " ✗ len=$RESULTS_LEN per_corpus={a:$A_COUNT b:$B_COUNT} a_hit=$HAS_A_RESULT b_hit=$HAS_B_RESULT"
  echo " full: $RESP"
  FAILED=1
else
  echo " ✓ 4 merged results · 3+3 per-corpus · both corpora represented"
fi
|
|
|
|
# ── 3. distance-merged top-k correct across corpora ───────────────
# Reads RESP from the multi-corpus search: the first merged result must
# be b-near from corpus_b.
echo "[matrix-smoke] top hit comes from corpus_b (b-near is globally closest):"
TOP_ID="$(jq -r '.results[0].id' <<<"$RESP")"
TOP_CORPUS="$(jq -r '.results[0].corpus' <<<"$RESP")"
if [[ "$TOP_ID" != "b-near" || "$TOP_CORPUS" != "corpus_b" ]]; then
  echo " ✗ top: id=$TOP_ID corpus=$TOP_CORPUS (expected b-near/corpus_b)"
  FAILED=1
else
  echo " ✓ top hit: id=b-near corpus=corpus_b (closer than corpus_a's a-near)"
fi
|
|
|
|
# ── 4. corpus attribution preserved in metadata ───────────────────
# Reads RESP from the multi-corpus search: the top result must still
# carry the label it was stored with.
echo "[matrix-smoke] metadata preserved on merged results:"
TOP_LABEL="$(jq -r '.results[0].metadata.label' <<<"$RESP")"
if [[ "$TOP_LABEL" != "b near" ]]; then
  echo " ✗ label=$TOP_LABEL"
  FAILED=1
else
  echo " ✓ metadata.label round-trips through matrix"
fi
|
|
|
|
# ── 5. distances ascending in result list ─────────────────────────
# Reads RESP from the multi-corpus search: the list of result distances
# must equal its own sorted copy.
echo "[matrix-smoke] results sorted by distance ascending:"
ASCENDING="$(jq -r '[.results[].distance] | . == (sort)' <<<"$RESP")"
if [[ "$ASCENDING" != "true" ]]; then
  echo " ✗ distances not sorted: $(jq -c '[.results[].distance]' <<<"$RESP")"
  FAILED=1
else
  echo " ✓ distances ascending"
fi
|
|
|
|
# ── 6. negative paths ─────────────────────────────────────────────
# Three malformed searches; only the HTTP status code of each is kept.
# The curl arguments shared by all three are collected once.
SEARCH_STATUS_ARGS=(
  -sS -o /dev/null -w '%{http_code}'
  -X POST http://127.0.0.1:3110/v1/matrix/search
  -H 'Content-Type: application/json'
)

echo "[matrix-smoke] empty corpora → 400:"
HTTP_400="$(curl "${SEARCH_STATUS_ARGS[@]}" \
  -d '{"query_vector":[1,0,0,0],"corpora":[],"k":4}')"

echo "[matrix-smoke] missing corpus name → 502:"
HTTP_502="$(curl "${SEARCH_STATUS_ARGS[@]}" \
  -d '{"query_vector":[1,0,0,0],"corpora":["does_not_exist"],"k":4}')"

echo "[matrix-smoke] no query (empty text and vector) → 400:"
HTTP_400b="$(curl "${SEARCH_STATUS_ARGS[@]}" \
  -d '{"corpora":["corpus_a"],"k":4}')"

if [[ "$HTTP_400" != "400" || "$HTTP_502" != "502" || "$HTTP_400b" != "400" ]]; then
  echo " ✗ empty=$HTTP_400 missing=$HTTP_502 noquery=$HTTP_400b"
  FAILED=1
else
  echo " ✓ empty=400, missing-corpus=502, no-query=400"
fi
|
|
|
|
# ── 7. metadata filter (component B — staffing-side structured gate)
# Same search as check 2 plus a filter on label ∈ {"a near","b near"}:
# expect 2 results kept, metadata_filter_dropped == 4, and exactly those
# two labels in the output.
echo "[matrix-smoke] metadata_filter drops non-matching results:"
RESP="$(curl -sS -X POST http://127.0.0.1:3110/v1/matrix/search \
  -H 'Content-Type: application/json' \
  -d '{"query_vector":[1,0,0,0],"corpora":["corpus_a","corpus_b"],"k":4,"per_corpus_k":3,
"metadata_filter":{"label":["a near","b near"]}}')"
RESULTS_LEN="$(jq -r '.results | length' <<<"$RESP")"
DROPPED="$(jq -r '.metadata_filter_dropped' <<<"$RESP")"
KEPT_LABELS="$(jq -r '[.results[].metadata.label] | sort | join(",")' <<<"$RESP")"
if [[ "$RESULTS_LEN" != "2" || "$DROPPED" != "4" || "$KEPT_LABELS" != "a near,b near" ]]; then
  echo " ✗ len=$RESULTS_LEN dropped=$DROPPED labels=$KEPT_LABELS"
  echo " full: $RESP"
  FAILED=1
else
  echo " ✓ filter kept 2 ('a near' + 'b near'), dropped 4 mid/far entries"
fi
|
|
|
|
# Final verdict: exit 0 only when no check above set FAILED.
if ! [ "$FAILED" -eq 0 ]; then
  echo "[matrix-smoke] Matrix acceptance gate: FAILED"
  exit 1
fi
echo "[matrix-smoke] Matrix acceptance gate: PASSED"
exit 0
|