#!/usr/bin/env bash # Multi-corpus reality test — first deep-field test with TWO real # staffing corpora composed via /v1/matrix/search. # # Pipeline: # - Bring up the Go stack (storaged, embedd, vectord, matrixd, gateway) # - Ingest workers (5000 rows from workers_500k.parquet) # - Ingest candidates (1000 rows from candidates.parquet) # - Run a real query through /v1/matrix/search with both corpora # - Print the merged top-k with corpus attribution # # Headline assertion: results include hits from BOTH corpora (the # whole point of multi-corpus matrix retrieval). # # Requires: Ollama on :11434 with nomic-embed-text loaded. Skips # (exit 0) when Ollama is absent. # # Usage: ./scripts/multi_corpus_e2e.sh # ./scripts/multi_corpus_e2e.sh "your custom query" set -euo pipefail cd "$(dirname "$0")/.." export PATH="$PATH:/usr/local/go/bin" QUERY="${1:-Forklift operator with OSHA-30 certification, warehouse experience}" if ! curl -sS --max-time 3 http://localhost:11434/api/tags >/dev/null 2>&1; then echo "[multi-corpus-e2e] Ollama not reachable on :11434 — skipping" exit 0 fi echo "[multi-corpus-e2e] building binaries..." go build -o bin/ ./cmd/storaged ./cmd/embedd ./cmd/vectord ./cmd/matrixd ./cmd/gateway \ ./scripts/staffing_workers ./scripts/staffing_candidates pkill -f "bin/(storaged|embedd|vectord|matrixd|gateway)" 2>/dev/null || true sleep 0.3 PIDS=() TMP="$(mktemp -d)" CFG="$TMP/e2e.toml" cleanup() { echo "[multi-corpus-e2e] cleanup" for p in "${PIDS[@]}"; do [ -n "$p" ] && kill "$p" 2>/dev/null || true; done rm -rf "$TMP" } trap cleanup EXIT INT TERM # Ephemeral mode (vectord storaged_url=""); same rationale as # candidates_e2e — don't pollute MinIO _vectors/ between runs. cat > "$CFG" </dev/null 2>&1; then return 0; fi sleep 0.05 done return 1 } echo "[multi-corpus-e2e] launching stack..." ./bin/storaged -config "$CFG" > /tmp/storaged.log 2>&1 & PIDS+=($!) 
# --- Remaining stack bring-up -------------------------------------------------
# storaged was launched just above; wait for it, then start the other services
# one at a time. Each PID is appended to PIDS so the cleanup trap can kill it,
# and each launch is gated on poll_health succeeding before the next one starts.
poll_health 3211 || { echo "storaged failed"; exit 1; }

./bin/embedd -config "$CFG" > /tmp/embedd.log 2>&1 &
PIDS+=($!)
poll_health 3216 || { echo "embedd failed"; exit 1; }

./bin/vectord -config "$CFG" > /tmp/vectord.log 2>&1 &
PIDS+=($!)
poll_health 3215 || { echo "vectord failed"; exit 1; }

./bin/matrixd -config "$CFG" > /tmp/matrixd.log 2>&1 &
PIDS+=($!)
poll_health 3218 || { echo "matrixd failed"; exit 1; }

./bin/gateway -config "$CFG" > /tmp/gateway.log 2>&1 &
PIDS+=($!)
poll_health 3110 || { echo "gateway failed"; exit 1; }

# --- Ingest both corpora ------------------------------------------------------
echo
echo "[multi-corpus-e2e] ingest workers (limit=5000)..."
./bin/staffing_workers -limit 5000

echo
echo "[multi-corpus-e2e] ingest candidates..."
# The grep only trims noisy lines from the ingest output. Its "no lines
# selected" status (1) is neutralised inside the group so that, with pipefail,
# the pipeline status reflects staffing_candidates itself. The step stays
# non-fatal (the headline assertion below catches a missing corpus), but a
# non-zero exit is reported instead of being silently discarded.
ingest_rc=0
./bin/staffing_candidates -skip-populate=false -query "$QUERY" 2>&1 \
  | { grep -v "^\[candidates\]\(matrix\|reality\)" || true; } \
  || ingest_rc=$?
if (( ingest_rc != 0 )); then
  echo "[multi-corpus-e2e] WARN: staffing_candidates exited with status $ingest_rc" >&2
fi

echo
echo "[multi-corpus-e2e] /matrix/corpora — confirm both registered:"
# Explicit '.' filter: older jq releases require a program argument.
curl -sS http://127.0.0.1:3110/v1/matrix/corpora | jq -c .

# --- Multi-corpus query -------------------------------------------------------
echo
echo "[multi-corpus-e2e] multi-corpus query: $QUERY"
# Build the request body with jq so that quotes, backslashes and other special
# characters in a user-supplied query are JSON-escaped rather than spliced raw
# into the document.
PAYLOAD="$(jq -cn --arg q "$QUERY" \
  '{query_text: $q, corpora: ["workers", "candidates"], k: 8, per_corpus_k: 6}')"
RESP="$(curl -sS -X POST http://127.0.0.1:3110/v1/matrix/search \
  -H 'Content-Type: application/json' \
  -d "$PAYLOAD")"

# Fail with the raw response when the body is not the expected shape (e.g. an
# error object or non-JSON text) instead of a cryptic jq iteration error.
if ! jq -e '.results | type == "array"' <<<"$RESP" >/dev/null 2>&1; then
  echo "[multi-corpus-e2e] FAIL: unexpected /v1/matrix/search response:" >&2
  printf '%s\n' "$RESP" >&2
  exit 1
fi

# --- Sanity / headline assertions ---------------------------------------------
WORKER_HITS="$(jq -r '[.results[] | select(.corpus=="workers")] | length' <<<"$RESP")"
CAND_HITS="$(jq -r '[.results[] | select(.corpus=="candidates")] | length' <<<"$RESP")"
TOTAL="$(jq -r '.results | length' <<<"$RESP")"

echo
echo "[multi-corpus-e2e] merged top-$TOTAL: workers=$WORKER_HITS candidates=$CAND_HITS"
# One line per hit: first letter of the corpus, truncated distance, id, and the
# role (or skills) metadata when present.
jq -r '.results[] | " \(.corpus | .[0:1]) d=\(.distance | tostring | .[0:6]) \(.id) \(.metadata.role // .metadata.skills // "n/a")"' <<<"$RESP"

# Headline assertion: the merged result list must contain hits from BOTH corpora.
if (( WORKER_HITS > 0 && CAND_HITS > 0 )); then
  echo
  echo "[multi-corpus-e2e] PASS: both corpora represented in merged top-$TOTAL"
  exit 0
else
  echo
  echo "[multi-corpus-e2e] FAIL: corpus mix was workers=$WORKER_HITS candidates=$CAND_HITS"
  exit 1
fi