golangLAKEHOUSE/scripts/matrix_smoke.sh
root c1d96b7b60 matrixd: multi-corpus retrieve+merge — SPEC §3.4 component 2 of 5
Lands the matrix indexer's first piece per docs/SPEC.md §3.4:
multi-corpus retrieve+merge with corpus attribution per result.
Future components (relevance filter, downgrade gate, learning-loop
integration) layer on top of this surface.

Architecture:
  - internal/matrix/retrieve.go — Retriever takes (query, corpora,
    k, per_corpus_k), parallel-fans across vectord indexes, merges
    by distance ascending, preserves corpus origin per hit
  - cmd/matrixd — HTTP service on :3217, fronts /v1/matrix/*
  - gateway proxy + [matrixd] config + lakehouse.toml entry
  - Either query_text (matrix calls embedd) or query_vector
    (caller pre-embedded) — vector takes precedence if both set

Error policy: fail-loud on any corpus error. Silent partial returns
would lie about coverage, defeating the matrix's whole purpose.
Bubbles vectord errors as 502 (upstream), validation as 400.

Smoke (scripts/matrix_smoke.sh, 6 assertions PASS first try):
  - /matrix/corpora lists indexes
  - Multi-corpus search returns hits from BOTH corpora
  - Top hit is the globally-closest across all corpora
    (b-near beats a-near at distance 0.05 vs 0.1 — proves merge)
  - Metadata round-trips through the merge
  - Distances ascending in result list
  - Negative paths: empty corpora → 400, missing corpus → 502,
    no query → 400

12-smoke regression sweep all green (D1-D6, G1, G1P, G2,
storaged_cap, pathway, matrix).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 18:39:17 -05:00

214 lines
8.6 KiB
Bash
Executable File

#!/usr/bin/env bash
# Matrix smoke — multi-corpus retrieve+merge via matrixd (SPEC §3.4).
# All assertions go through gateway :3110.
#
# Validates:
# - Multi-corpus search returns hits from BOTH corpora
# - Each result carries its corpus attribution (load-bearing — losing
# it defeats the matrix's purpose)
# - Merged top-k is ordered by distance across corpora
# - /matrix/corpora lists known indexes
# - Empty corpora list → 400
# - Bad corpus name → 502 (matrix bubbles vectord's 404 as upstream error)
#
# Uses query_vector (not query_text) to skip the embedd dependency so
# this smoke runs without Ollama. End-to-end embed→matrix→search has
# its own integration test (next commit).
#
# Usage: ./scripts/matrix_smoke.sh
# Strict mode: abort on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Work from the parent of this script's directory so the relative
# ./cmd and bin/ paths below resolve.
cd "$(dirname "$0")/.."

# Append /usr/local/go/bin so `go` can be found when it is not already
# on PATH.
PATH="$PATH:/usr/local/go/bin"
export PATH

echo "[matrix-smoke] building matrixd + vectord + gateway..."
go build -o bin/ ./cmd/matrixd ./cmd/vectord ./cmd/gateway

# Best effort: signal any running process whose command line matches one
# of the three binaries. "Nothing matched" is not an error here.
pkill -f "bin/(matrixd|vectord|gateway)" 2>/dev/null || :
sleep 0.3

# PIDS: PIDs of the services launched by this run.
# TMP:  private scratch directory for this run.
# CFG:  path of the config file generated inside TMP.
declare -a PIDS=()
TMP=$(mktemp -d)
CFG="${TMP}/matrix.toml"
cleanup() {
echo "[matrix-smoke] cleanup"
for p in "${PIDS[@]}"; do [ -n "$p" ] && kill "$p" 2>/dev/null || true; done
rm -rf "$TMP"
}
trap cleanup EXIT INT TERM
# Throwaway config for this run. vectord gets an empty storaged_url so
# persistence is disabled and the test corpora never reach storaged state.
# The delimiter is quoted because the body is literal TOML with nothing
# to expand.
cat <<'TOML' > "$CFG"
[gateway]
bind = "127.0.0.1:3110"
storaged_url = "http://127.0.0.1:3211"
catalogd_url = "http://127.0.0.1:3212"
ingestd_url = "http://127.0.0.1:3213"
queryd_url = "http://127.0.0.1:3214"
vectord_url = "http://127.0.0.1:3215"
embedd_url = "http://127.0.0.1:3216"
pathwayd_url = "http://127.0.0.1:3217"
matrixd_url = "http://127.0.0.1:3218"
[vectord]
bind = "127.0.0.1:3215"
storaged_url = ""
[matrixd]
bind = "127.0.0.1:3218"
embedd_url = "http://127.0.0.1:3216"
vectord_url = "http://127.0.0.1:3215"
TOML
#######################################
# Wait for a local service to answer on its /health endpoint.
# Arguments: $1 - TCP port on 127.0.0.1
# Returns:   0 as soon as curl completes a request to /health,
#            1 once roughly five seconds (whole-second clock) have passed.
# Note: curl runs without --fail, so any HTTP response counts as success.
#######################################
poll_health() {
  local port="$1"
  local url="http://127.0.0.1:${port}/health"
  local deadline
  deadline=$(( $(date +%s) + 5 ))
  until (( $(date +%s) >= deadline )); do
    curl -sS --max-time 1 "$url" >/dev/null 2>&1 && return 0
    sleep 0.05
  done
  return 1
}
echo "[matrix-smoke] launching vectord → matrixd → gateway..."
# Start the services one at a time, in this order, and do not move on
# until the one just started answers /health. Each entry is name:port.
# Every PID is recorded in PIDS so cleanup can stop it later.
for entry in vectord:3215 matrixd:3218 gateway:3110; do
  svc="${entry%%:*}"
  svc_port="${entry##*:}"
  "./bin/${svc}" -config "$CFG" > "/tmp/${svc}.log" 2>&1 &
  PIDS+=($!)
  if ! poll_health "$svc_port"; then
    echo "${svc} failed"
    tail "/tmp/${svc}.log"
    exit 1
  fi
done
# FAILED accumulates assertion failures for the final gate; DIM is the
# vector dimension used for both test corpora.
FAILED=0
DIM=4

#######################################
# Create one vector index through the gateway.
# Globals:   DIM (read)
# Arguments: $1 - index name
# Outputs:   a failure line on stdout when the status is not 201
# Returns:   0 when the gateway answers 201, 1 otherwise
#######################################
create_corpus() {
  local name="$1" status
  # A transport error leaves curl's "000" in status; it is reported as a
  # failed creation below rather than ending the function early.
  status="$(curl -sS -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:3110/v1/vectors/index \
    -H 'Content-Type: application/json' \
    -d "{\"name\":\"$name\",\"dimension\":$DIM,\"distance\":\"euclidean\"}")" || true
  if [ "$status" != "201" ]; then
    echo " ✗ create $name → HTTP $status"
    return 1
  fi
}

# Create two corpora — corpus_a and corpus_b — each of which will hold a
# few vectors at known distances from a chosen query vector.
echo "[matrix-smoke] create two corpora:"
CREATED=1
for c in corpus_a corpus_b; do
  create_corpus "$c" || { FAILED=1; CREATED=0; }
done
# Report success only when every creation really returned 201.
if [ "$CREATED" -eq 1 ]; then
  echo " ✓ corpus_a and corpus_b created"
fi
# Add vectors. Use euclidean distance for predictable arithmetic.
# Query vector will be [1,0,0,0]. Distances from it:
#   corpus_a/a-near : [1.1, 0, 0, 0]   ≈ 0.1
#   corpus_a/a-mid  : [1, 0.5, 0, 0]   ≈ 0.5
#   corpus_a/a-far  : [3, 0, 0, 0]     ≈ 2.0
#   corpus_b/b-near : [1.05, 0, 0, 0]  ≈ 0.05 (closest globally)
#   corpus_b/b-mid  : [1, 0.7, 0, 0]   ≈ 0.7
#   corpus_b/b-far  : [4, 0, 0, 0]     ≈ 3.0

#######################################
# Add a batch of items to one index through the gateway.
# Arguments: $1 - index name
#            $2 - JSON request body ({"items":[...]})
# Outputs:   a failure line on stdout when the status is not 2xx
# Returns:   0 on any 2xx status, 1 otherwise
#######################################
add_vectors() {
  local corpus="$1" payload="$2" status
  # curl -sS without --fail exits 0 on HTTP 4xx/5xx, so the status code is
  # captured and checked explicitly. A transport error yields "000".
  status="$(curl -sS -o /dev/null -w '%{http_code}' -X POST "http://127.0.0.1:3110/v1/vectors/index/$corpus/add" \
    -H 'Content-Type: application/json' \
    -d "$payload")" || true
  case "$status" in
    2??) return 0 ;;
    *)
      echo " ✗ add to $corpus → HTTP $status"
      return 1
      ;;
  esac
}

echo "[matrix-smoke] add vectors to both corpora:"
LOADED=1
add_vectors corpus_a '{"items":[
  {"id":"a-near","vector":[1.1,0,0,0],"metadata":{"label":"a near"}},
  {"id":"a-mid","vector":[1,0.5,0,0],"metadata":{"label":"a mid"}},
  {"id":"a-far","vector":[3,0,0,0],"metadata":{"label":"a far"}}
]}' || LOADED=0
add_vectors corpus_b '{"items":[
  {"id":"b-near","vector":[1.05,0,0,0],"metadata":{"label":"b near"}},
  {"id":"b-mid","vector":[1,0.7,0,0],"metadata":{"label":"b mid"}},
  {"id":"b-far","vector":[4,0,0,0],"metadata":{"label":"b far"}}
]}' || LOADED=0
# Claim the load only when both requests were accepted; otherwise record
# the failure so the final gate cannot pass on partially loaded corpora.
if [ "$LOADED" -eq 1 ]; then
  echo " ✓ 3 + 3 vectors loaded"
else
  FAILED=1
fi
# ── 1. /matrix/corpora lists both ─────────────────────────────────
# Expect exactly two corpora, and both test names present in the list.
echo "[matrix-smoke] /matrix/corpora lists both:"
RESP="$(curl -sS http://127.0.0.1:3110/v1/matrix/corpora)"
LISTED_TOTAL="$(echo "$RESP" | jq -r '.count')"
LISTS_A="$(echo "$RESP" | jq -r '.corpora | index("corpus_a") != null')"
LISTS_B="$(echo "$RESP" | jq -r '.corpora | index("corpus_b") != null')"
# Failure branch first: any single mismatch dumps the raw response.
if [[ "$LISTED_TOTAL" != "2" || "$LISTS_A" != "true" || "$LISTS_B" != "true" ]]; then
  echo " ✗ resp: $RESP"
  FAILED=1
else
  echo " ✓ count=2, both corpora listed"
fi
# ── 2. multi-corpus search returns hits from BOTH ─────────────────
# One search over both corpora with k=4 and per_corpus_k=3. RESP is left
# in place because the checks that follow read the same response.
echo "[matrix-smoke] /matrix/search multi-corpus retrieve+merge:"
RESP="$(curl -sS -X POST http://127.0.0.1:3110/v1/matrix/search \
  -H 'Content-Type: application/json' \
  -d '{"query_vector":[1,0,0,0],"corpora":["corpus_a","corpus_b"],"k":4,"per_corpus_k":3}')"
MERGED_LEN="$(echo "$RESP" | jq -r '.results | length')"
FROM_A="$(echo "$RESP" | jq -r '.per_corpus_counts.corpus_a')"
FROM_B="$(echo "$RESP" | jq -r '.per_corpus_counts.corpus_b')"
SEEN_A="$(echo "$RESP" | jq -r '[.results[] | select(.corpus=="corpus_a")] | length > 0')"
SEEN_B="$(echo "$RESP" | jq -r '[.results[] | select(.corpus=="corpus_b")] | length > 0')"
# Pass needs all of: 4 merged results, 3 reported per corpus, and at
# least one merged result attributed to each corpus.
if [[ "$MERGED_LEN" == "4" && "$FROM_A" == "3" && "$FROM_B" == "3" && \
      "$SEEN_A" == "true" && "$SEEN_B" == "true" ]]; then
  echo " ✓ 4 merged results · 3+3 per-corpus · both corpora represented"
else
  echo " ✗ len=$MERGED_LEN per_corpus={a:$FROM_A b:$FROM_B} a_hit=$SEEN_A b_hit=$SEEN_B"
  echo " full: $RESP"
  FAILED=1
fi
# ── 3. distance-merged top-k correct across corpora ───────────────
# b-near is the closest vector overall, so it must be first in the
# merged list and attributed to corpus_b.
echo "[matrix-smoke] top hit comes from corpus_b (b-near is globally closest):"
FIRST_ID="$(echo "$RESP" | jq -r '.results[0].id')"
FIRST_CORPUS="$(echo "$RESP" | jq -r '.results[0].corpus')"
if [[ "$FIRST_ID" != "b-near" || "$FIRST_CORPUS" != "corpus_b" ]]; then
  echo " ✗ top: id=$FIRST_ID corpus=$FIRST_CORPUS (expected b-near/corpus_b)"
  FAILED=1
else
  echo " ✓ top hit: id=b-near corpus=corpus_b (closer than corpus_a's a-near)"
fi
# ── 4. metadata preserved on merged results ───────────────────────
# The label stored with b-near must come back unchanged on the top hit.
echo "[matrix-smoke] metadata preserved on merged results:"
FIRST_LABEL="$(echo "$RESP" | jq -r '.results[0].metadata.label')"
case "$FIRST_LABEL" in
  "b near")
    echo " ✓ metadata.label round-trips through matrix"
    ;;
  *)
    echo " ✗ label=$FIRST_LABEL"
    FAILED=1
    ;;
esac
# ── 5. distances ascending in result list ─────────────────────────
# jq compares the distance list with its own sorted copy; equality means
# the merged results are already in ascending order.
echo "[matrix-smoke] results sorted by distance ascending:"
IN_ORDER="$(echo "$RESP" | jq -r '[.results[].distance] | . == (sort)')"
if [[ "$IN_ORDER" != "true" ]]; then
  echo " ✗ distances not sorted: $(echo "$RESP" | jq -c '[.results[].distance]')"
  FAILED=1
else
  echo " ✓ distances ascending"
fi
# ── 6. negative paths ─────────────────────────────────────────────
# search_status BODY — POST BODY to /v1/matrix/search through the gateway
# and print only the HTTP status code.
search_status() {
  curl -sS -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:3110/v1/matrix/search \
    -H 'Content-Type: application/json' \
    -d "$1"
}

echo "[matrix-smoke] empty corpora → 400:"
HTTP_400="$(search_status '{"query_vector":[1,0,0,0],"corpora":[],"k":4}')"

echo "[matrix-smoke] missing corpus name → 502:"
HTTP_502="$(search_status '{"query_vector":[1,0,0,0],"corpora":["does_not_exist"],"k":4}')"

echo "[matrix-smoke] no query (empty text and vector) → 400:"
HTTP_400b="$(search_status '{"corpora":["corpus_a"],"k":4}')"

# All three statuses are judged together and reported on a single line.
if [[ "$HTTP_400" != "400" || "$HTTP_502" != "502" || "$HTTP_400b" != "400" ]]; then
  echo " ✗ empty=$HTTP_400 missing=$HTTP_502 noquery=$HTTP_400b"
  FAILED=1
else
  echo " ✓ empty=400, missing-corpus=502, no-query=400"
fi
# Final verdict: the gate passes only when no check above set FAILED.
# The exit status mirrors the printed verdict (0 = PASSED, 1 = FAILED).
if [ "$FAILED" -eq 0 ]; then
  GATE_VERDICT="PASSED"
  GATE_STATUS=0
else
  GATE_VERDICT="FAILED"
  GATE_STATUS=1
fi
echo "[matrix-smoke] Matrix acceptance gate: $GATE_VERDICT"
exit "$GATE_STATUS"