Lands the matrix indexer's first piece per docs/SPEC.md §3.4:
multi-corpus retrieve+merge with corpus attribution per result.
Future components (relevance filter, downgrade gate, learning-loop
integration) layer on top of this surface.
Architecture:
- internal/matrix/retrieve.go — Retriever takes (query, corpora,
k, per_corpus_k), parallel-fans across vectord indexes, merges
by distance ascending, preserves corpus origin per hit
- cmd/matrixd — HTTP service on :3218, fronts /v1/matrix/*
- gateway proxy + [matrixd] config + lakehouse.toml entry
- Either query_text (matrix calls embedd) or query_vector
(caller pre-embedded) — vector takes precedence if both set
Error policy: fail-loud on any corpus error. Silent partial returns
would lie about coverage, defeating the matrix's whole purpose.
Bubbles vectord errors as 502 (upstream), validation as 400.
Smoke (scripts/matrix_smoke.sh, 6 assertions PASS first try):
- /matrix/corpora lists indexes
- Multi-corpus search returns hits from BOTH corpora
- Top hit is the globally-closest across all corpora
(b-near beats a-near at distance 0.05 vs 0.1 — proves merge)
- Metadata round-trips through the merge
- Distances ascending in result list
- Negative paths: empty corpora → 400, missing corpus → 502,
no query → 400
12-smoke regression sweep all green (D1-D6, G1, G1P, G2,
storaged_cap, pathway, matrix).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
214 lines
8.6 KiB
Bash
Executable File
214 lines
8.6 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# Matrix smoke — multi-corpus retrieve+merge via matrixd (SPEC §3.4).
|
|
# All assertions go through gateway :3110.
|
|
#
|
|
# Validates:
|
|
# - Multi-corpus search returns hits from BOTH corpora
|
|
# - Each result carries its corpus attribution (load-bearing — losing
|
|
# it defeats the matrix's purpose)
|
|
# - Merged top-k is ordered by distance across corpora
|
|
# - /matrix/corpora lists known indexes
|
|
# - Empty corpora list → 400
|
|
# - Bad corpus name → 502 (matrix bubbles vectord's 404 as upstream error)
|
|
#
|
|
# Uses query_vector (not query_text) to skip the embedd dependency so
|
|
# this smoke runs without Ollama. End-to-end embed→matrix→search has
|
|
# its own integration test (next commit).
|
|
#
|
|
# Usage: ./scripts/matrix_smoke.sh
|
|
|
|
# Strict mode: stop on a failing command, an unset variable, or a failing
# pipeline stage.
set -o errexit -o nounset -o pipefail

# Work from the parent of this script's directory so the relative bin/ and
# ./cmd paths below resolve.
cd "$(dirname "$0")/.."

# Append /usr/local/go/bin so `go` is found when the caller's PATH lacks it.
export PATH="$PATH:/usr/local/go/bin"

echo "[matrix-smoke] building matrixd + vectord + gateway..."
go build -o bin/ ./cmd/matrixd ./cmd/vectord ./cmd/gateway

# Best-effort: signal any already-running matching service processes, then
# pause briefly before the fresh ones are launched.
pkill -f "bin/(matrixd|vectord|gateway)" 2>/dev/null || true
sleep 0.3

# PIDS collects the PIDs of the services started below; TMP is a scratch
# directory and CFG the config file generated inside it.
PIDS=()
TMP="$(mktemp -d)"
CFG="$TMP/matrix.toml"
|
|
|
|
# cleanup — stop every service this script started and delete the scratch
# directory.
# Globals: PIDS (read), TMP (read)
# Outputs: one progress line on stdout
cleanup() {
  echo "[matrix-smoke] cleanup"
  local pid
  # ${PIDS[@]+...} expands to nothing for an empty array; a bare
  # "${PIDS[@]}" is an unbound-variable error under `set -u` on bash < 4.4.
  for pid in ${PIDS[@]+"${PIDS[@]}"}; do
    [ -n "$pid" ] || continue
    # Best-effort: the process may already be gone.
    kill "$pid" 2>/dev/null || true
    # Reap the child so it has fully exited before this script returns.
    wait "$pid" 2>/dev/null || true
  done
  rm -rf -- "$TMP"
}
trap cleanup EXIT
# A trapped signal does not end the shell: once the handler returns,
# execution resumes where it left off. Turn INT/TERM into an exit so the
# EXIT trap runs cleanup exactly once and the script stops with the
# conventional 128+signal status.
trap 'exit 130' INT
trap 'exit 143' TERM
|
|
|
|
# Throwaway config for this run. vectord persistence is disabled so the
# test corpora don't pollute storaged state. The delimiter is quoted because
# the body is literal text with nothing to expand.
cat > "$CFG" <<'TOML'
[gateway]
bind = "127.0.0.1:3110"
storaged_url = "http://127.0.0.1:3211"
catalogd_url = "http://127.0.0.1:3212"
ingestd_url = "http://127.0.0.1:3213"
queryd_url = "http://127.0.0.1:3214"
vectord_url = "http://127.0.0.1:3215"
embedd_url = "http://127.0.0.1:3216"
pathwayd_url = "http://127.0.0.1:3217"
matrixd_url = "http://127.0.0.1:3218"

[vectord]
bind = "127.0.0.1:3215"
storaged_url = ""

[matrixd]
bind = "127.0.0.1:3218"
embedd_url = "http://127.0.0.1:3216"
vectord_url = "http://127.0.0.1:3215"
TOML
|
|
|
|
# poll_health PORT [TIMEOUT_SECS]
# Poll http://127.0.0.1:PORT/health until curl succeeds or the deadline
# passes.
# Arguments:
#   $1 - TCP port to probe on 127.0.0.1
#   $2 - optional overall timeout in whole seconds (default 5); with 0 no
#        probe is attempted
# Returns: 0 as soon as one probe succeeds, 1 once the deadline has passed.
# Note: curl runs without --fail, so any HTTP response counts as success.
poll_health() {
  local port="$1"
  local timeout="${2:-5}"
  # Declared separately from the assignment so `local` cannot mask a
  # failing command substitution.
  local deadline
  deadline=$(( $(date +%s) + timeout ))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    if curl -sS --max-time 1 "http://127.0.0.1:$port/health" >/dev/null 2>&1; then
      return 0
    fi
    sleep 0.05
  done
  return 1
}
|
|
|
|
echo "[matrix-smoke] launching vectord → matrixd → gateway..."

# start_service NAME PORT — run ./bin/NAME in the background with the
# generated config, send its output to /tmp/NAME.log, record its PID in
# PIDS, then wait for its /health endpoint. If the service never answers,
# print a failure line and the log tail, then exit 1.
start_service() {
  local svc="$1" port="$2"
  ./bin/"$svc" -config "$CFG" > "/tmp/$svc.log" 2>&1 &
  PIDS+=($!)
  if ! poll_health "$port"; then
    echo "$svc failed"
    tail "/tmp/$svc.log"
    exit 1
  fi
}

# Same start order as the banner above.
start_service vectord 3215
start_service matrixd 3218
start_service gateway 3110
|
|
|
|
# FAILED flips to 1 when any check fails; the final gate reads it.
# DIM is the dimension used for both test indexes.
FAILED=0
DIM=4

# create_index NAME — ask the gateway to create a euclidean index of
# dimension DIM. Returns 0 only on HTTP 201; otherwise prints a ✗ line
# carrying the status and returns 1.
create_index() {
  local name="$1" code
  # `|| true`: on a transport error curl still writes 000 through -w, so
  # let the status comparison below report it.
  code="$(curl -sS -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:3110/v1/vectors/index \
    -H 'Content-Type: application/json' \
    -d "{\"name\":\"$name\",\"dimension\":$DIM,\"distance\":\"euclidean\"}")" || true
  if [ "$code" != "201" ]; then
    echo " ✗ create $name → $code"
    return 1
  fi
  return 0
}

# Create two corpora — corpus_a and corpus_b — each of which will hold a few
# vectors at known distances from a chosen query vector.
echo "[matrix-smoke] create two corpora:"
CREATE_OK=1
for c in corpus_a corpus_b; do
  create_index "$c" || { CREATE_OK=0; FAILED=1; }
done
# Print the success line only when every create call returned 201; it used
# to print unconditionally, right after a ✗ line.
if [ "$CREATE_OK" -eq 1 ]; then
  echo " ✓ corpus_a and corpus_b created"
fi
|
|
|
|
# Add vectors. Euclidean distance keeps the arithmetic predictable.
# The query vector will be [1,0,0,0]. Distances from it:
#   corpus_a/a-near : [1.1, 0, 0, 0]   ≈ 0.1
#   corpus_a/a-mid  : [1, 0.5, 0, 0]   ≈ 0.5
#   corpus_a/a-far  : [3, 0, 0, 0]     ≈ 2.0
#   corpus_b/b-near : [1.05, 0, 0, 0]  ≈ 0.05 (closest globally)
#   corpus_b/b-mid  : [1, 0.7, 0, 0]   ≈ 0.7
#   corpus_b/b-far  : [4, 0, 0, 0]     ≈ 3.0
echo "[matrix-smoke] add vectors to both corpora:"

# add_items CORPUS JSON — POST JSON to the corpus's /add endpoint through
# the gateway. Returns 0 on any 2xx status; otherwise prints a ✗ line with
# the status and returns 1. The status used to be discarded, so a rejected
# load still reported success.
add_items() {
  local corpus="$1" payload="$2" code
  # `|| true`: on a transport error curl still writes 000 through -w, so
  # let the status check below report it.
  code="$(curl -sS -o /dev/null -w '%{http_code}' -X POST \
    "http://127.0.0.1:3110/v1/vectors/index/$corpus/add" \
    -H 'Content-Type: application/json' \
    -d "$payload")" || true
  case "$code" in
    2??) return 0 ;;
  esac
  echo " ✗ add to $corpus → $code"
  return 1
}

LOAD_OK=1
add_items corpus_a '{"items":[
{"id":"a-near","vector":[1.1,0,0,0],"metadata":{"label":"a near"}},
{"id":"a-mid","vector":[1,0.5,0,0],"metadata":{"label":"a mid"}},
{"id":"a-far","vector":[3,0,0,0],"metadata":{"label":"a far"}}
]}' || LOAD_OK=0
add_items corpus_b '{"items":[
{"id":"b-near","vector":[1.05,0,0,0],"metadata":{"label":"b near"}},
{"id":"b-mid","vector":[1,0.7,0,0],"metadata":{"label":"b mid"}},
{"id":"b-far","vector":[4,0,0,0],"metadata":{"label":"b far"}}
]}' || LOAD_OK=0
if [ "$LOAD_OK" -eq 1 ]; then
  echo " ✓ 3 + 3 vectors loaded"
else
  FAILED=1
fi
|
|
|
|
# ── 1. /matrix/corpora lists both ─────────────────────────────────
|
|
echo "[matrix-smoke] /matrix/corpora lists both:"
|
|
RESP="$(curl -sS http://127.0.0.1:3110/v1/matrix/corpora)"
|
|
COUNT="$(echo "$RESP" | jq -r '.count')"
|
|
HAS_A="$(echo "$RESP" | jq -r '.corpora | index("corpus_a") != null')"
|
|
HAS_B="$(echo "$RESP" | jq -r '.corpora | index("corpus_b") != null')"
|
|
if [ "$COUNT" = "2" ] && [ "$HAS_A" = "true" ] && [ "$HAS_B" = "true" ]; then
|
|
echo " ✓ count=2, both corpora listed"
|
|
else
|
|
echo " ✗ resp: $RESP"; FAILED=1
|
|
fi
|
|
|
|
# ── 2. multi-corpus search returns hits from BOTH ─────────────────
echo "[matrix-smoke] /matrix/search multi-corpus retrieve+merge:"

# resp_field FILTER — print the value FILTER extracts from RESP, or "?"
# when jq cannot evaluate it (for example a non-JSON body). Without this
# fallback a jq failure would trip `set -e` at the first extraction, before
# the ✗ line and the full-response dump below could be printed.
resp_field() {
  jq -r "$1" <<<"$RESP" 2>/dev/null || echo "?"
}

# `|| true`: let the comparisons below report a transport error.
# RESP is left set because the checks that follow read it.
RESP="$(curl -sS -X POST http://127.0.0.1:3110/v1/matrix/search \
  -H 'Content-Type: application/json' \
  -d '{"query_vector":[1,0,0,0],"corpora":["corpus_a","corpus_b"],"k":4,"per_corpus_k":3}')" || true
RESULTS_LEN="$(resp_field '.results | length')"
A_COUNT="$(resp_field '.per_corpus_counts.corpus_a')"
B_COUNT="$(resp_field '.per_corpus_counts.corpus_b')"
HAS_A_RESULT="$(resp_field '[.results[] | select(.corpus=="corpus_a")] | length > 0')"
HAS_B_RESULT="$(resp_field '[.results[] | select(.corpus=="corpus_b")] | length > 0')"
# Expect k=4 merged hits, per_corpus_k=3 counted for each corpus, and at
# least one merged hit attributed to each corpus.
if [ "$RESULTS_LEN" = "4" ] && [ "$A_COUNT" = "3" ] && [ "$B_COUNT" = "3" ] \
  && [ "$HAS_A_RESULT" = "true" ] && [ "$HAS_B_RESULT" = "true" ]; then
  echo " ✓ 4 merged results · 3+3 per-corpus · both corpora represented"
else
  echo " ✗ len=$RESULTS_LEN per_corpus={a:$A_COUNT b:$B_COUNT} a_hit=$HAS_A_RESULT b_hit=$HAS_B_RESULT"
  echo " full: $RESP"
  FAILED=1
fi
|
|
|
|
# ── 3. distance-merged top-k correct across corpora ───────────────
# Reads RESP from the multi-corpus search above: the first merged result
# must be b-near, attributed to corpus_b.
echo "[matrix-smoke] top hit comes from corpus_b (b-near is globally closest):"
TOP_ID="$(echo "$RESP" | jq -r '.results[0].id')"
TOP_CORPUS="$(echo "$RESP" | jq -r '.results[0].corpus')"
if [ "$TOP_ID" != "b-near" ] || [ "$TOP_CORPUS" != "corpus_b" ]; then
  echo " ✗ top: id=$TOP_ID corpus=$TOP_CORPUS (expected b-near/corpus_b)"
  FAILED=1
else
  echo " ✓ top hit: id=b-near corpus=corpus_b (closer than corpus_a's a-near)"
fi
|
|
|
|
# ── 4. metadata round-trips through the merge ─────────────────────
# Reads RESP from the multi-corpus search above: the top hit must still
# carry the label stored with b-near.
echo "[matrix-smoke] metadata preserved on merged results:"
TOP_LABEL="$(echo "$RESP" | jq -r '.results[0].metadata.label')"
case "$TOP_LABEL" in
  "b near")
    echo " ✓ metadata.label round-trips through matrix"
    ;;
  *)
    echo " ✗ label=$TOP_LABEL"
    FAILED=1
    ;;
esac
|
|
|
|
# ── 5. distances ascending in result list ─────────────────────────
# Reads RESP from the multi-corpus search above: the list of distances must
# equal its own sorted copy.
echo "[matrix-smoke] results sorted by distance ascending:"
ASCENDING="$(echo "$RESP" | jq -r '[.results[].distance] | . == (sort)')"
if [ "$ASCENDING" != "true" ]; then
  echo " ✗ distances not sorted: $(echo "$RESP" | jq -c '[.results[].distance]')"
  FAILED=1
else
  echo " ✓ distances ascending"
fi
|
|
|
|
# ── 6. negative paths ─────────────────────────────────────────────
# search_status JSON — POST JSON to /v1/matrix/search through the gateway
# and print only the HTTP status code; the response body is discarded.
search_status() {
  curl -sS -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:3110/v1/matrix/search \
    -H 'Content-Type: application/json' \
    -d "$1"
}

echo "[matrix-smoke] empty corpora → 400:"
HTTP_400="$(search_status '{"query_vector":[1,0,0,0],"corpora":[],"k":4}')"
echo "[matrix-smoke] missing corpus name → 502:"
HTTP_502="$(search_status '{"query_vector":[1,0,0,0],"corpora":["does_not_exist"],"k":4}')"
echo "[matrix-smoke] no query (empty text and vector) → 400:"
HTTP_400b="$(search_status '{"corpora":["corpus_a"],"k":4}')"

# All three statuses must match for this check to pass.
if [ "$HTTP_400" != "400" ] || [ "$HTTP_502" != "502" ] || [ "$HTTP_400b" != "400" ]; then
  echo " ✗ empty=$HTTP_400 missing=$HTTP_502 noquery=$HTTP_400b"
  FAILED=1
else
  echo " ✓ empty=400, missing-corpus=502, no-query=400"
fi
|
|
|
|
# Final verdict: exit 1 unless FAILED is numerically 0.
if ! [ "$FAILED" -eq 0 ]; then
  echo "[matrix-smoke] Matrix acceptance gate: FAILED"
  exit 1
fi
echo "[matrix-smoke] Matrix acceptance gate: PASSED"
exit 0
|