LAKEHOUSE/scripts/multi_corpus_e2e.sh
root 31b408882b multi_corpus_e2e: WORKERS_LIMIT env knob — and the embed-text-not-sample-size finding
Adds WORKERS_LIMIT env override (default 5000) so the e2e can be
re-run at different sample sizes. Tiny change; the interesting part
is the FINDING that motivated the run.
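The knob itself is one line of parameter expansion; a minimal sketch of the default-vs-override behaviour (the 20000 value is just an example, not a recommended size):

```shell
# Same default-with-override pattern the script uses.
unset WORKERS_LIMIT                      # pretend no override is set
WORKERS_LIMIT="${WORKERS_LIMIT:-5000}"
echo "default: $WORKERS_LIMIT"           # prints "default: 5000"

WORKERS_LIMIT=20000                      # hypothetical larger re-run
echo "override: $WORKERS_LIMIT"          # prints "override: 20000"
```

Against the real script that looks like `WORKERS_LIMIT=20000 ./scripts/multi_corpus_e2e.sh`.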

Investigation: a97881d's reality test put zero Forklift Operators in
the top-6 for "Forklift operator with OSHA-30 certification,
warehouse experience" — instead returned Production Worker / Machine
Operator / Assembler.

Hypothesis tested: maybe the 5000-row sample didn't contain
forklift operators in retrievable density.

Result: hypothesis falsified. Direct probe of workers_500k.parquet:

  All 500K rows         → 55,349 Forklift Operators (11.07%)
                       → 150,328 with "forklift" in certs
                       → 74,852 with OSHA-30 specifically
  First 5K rows         → 569 Forklift Operators (11.38%)
                       → distribution matches global, no ordering bias
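The two rates quoted above can be recomputed directly from the raw counts (the counts themselves come from the parquet probe):

```shell
# Cross-check: global vs first-5K Forklift Operator rate.
awk 'BEGIN {
  printf "global: %.2f%%\n", 55349 / 500000 * 100   # 11.07%
  printf "sample: %.2f%%\n", 569 / 5000 * 100       # 11.38%
}'
```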

So 569 forklift operators were IN the corpus the matrix indexer
searched and STILL didn't surface in top-6. That means the bottleneck
isn't sample size — it's nomic-embed-text + our embed-text template
ranking "Production Worker" / "Machine Operator" / "Assembler" as
semantically nearer to the query than literal "Forklift Operator".

The reality test exposed this faithfully. Three real follow-ups, none
in scope of this commit:

  1. Embed text design — front-loading role + certs (currently
     "Worker role: <role>" then skills then certs) might anchor
     retrieval on the role better. Worth A/B-testing.
  2. Hybrid SQL+semantic — pre-filter by role/certs via queryd
     before semantic ranking. Not in SPEC §3.4 today; would address
     the "available" / "Chicago" gap from the candidates reality
     test (0d1553c) too.
  3. Playbook-memory boost — SPEC §3.4 component 5. When a query
     "Forklift OSHA-30" was answered with worker w-X in the past,
     boost w-X's score for similar future queries. The retrieval
     gap CAN be bridged by the learning loop without changing the
     base embedder.
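A minimal sketch of follow-up 1, the embed-text A/B. The field values and the exact template wording are hypothetical (only the "Worker role: <role>" then skills then certs ordering is from the current template); the point is purely the field order fed to the embedder:

```shell
# Hypothetical worker fields; real ones come from the parquet rows.
role="Forklift Operator"
skills="pallet jack, inventory, shipping/receiving"
certs="OSHA-30"

# Current layout: role leads, but certs trail the (often long) skills list.
current="Worker role: $role. Skills: $skills. Certs: $certs."

# Candidate layout: front-load role + certs so they dominate the vector.
frontloaded="$role, certified $certs. Skills: $skills."

printf '%s\n%s\n' "$current" "$frontloaded"
```

An A/B run would index the same rows under both templates and compare top-6 overlap on queries like the forklift one above.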

Commits the env knob; the finding lives in the commit body so future
sessions don't re-run the sample-size hypothesis.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 19:26:32 -05:00


#!/usr/bin/env bash
# Multi-corpus reality test — first deep-field test with TWO real
# staffing corpora composed via /v1/matrix/search.
#
# Pipeline:
# - Bring up the Go stack (storaged, embedd, vectord, matrixd, gateway)
# - Ingest workers (5000 rows from workers_500k.parquet)
# - Ingest candidates (1000 rows from candidates.parquet)
# - Run a real query through /v1/matrix/search with both corpora
# - Print the merged top-k with corpus attribution
#
# Headline assertion: results include hits from BOTH corpora (the
# whole point of multi-corpus matrix retrieval).
#
# Requires: Ollama on :11434 with nomic-embed-text loaded. Skips
# (exit 0) when Ollama is absent.
#
# Usage: ./scripts/multi_corpus_e2e.sh
#        ./scripts/multi_corpus_e2e.sh "your custom query"
set -euo pipefail
cd "$(dirname "$0")/.."
export PATH="$PATH:/usr/local/go/bin"
QUERY="${1:-Forklift operator with OSHA-30 certification, warehouse experience}"
WORKERS_LIMIT="${WORKERS_LIMIT:-5000}"
if ! curl -sS --max-time 3 http://localhost:11434/api/tags >/dev/null 2>&1; then
  echo "[multi-corpus-e2e] Ollama not reachable on :11434 — skipping"
  exit 0
fi
echo "[multi-corpus-e2e] building binaries..."
go build -o bin/ ./cmd/storaged ./cmd/embedd ./cmd/vectord ./cmd/matrixd ./cmd/gateway \
  ./scripts/staffing_workers ./scripts/staffing_candidates
pkill -f "bin/(storaged|embedd|vectord|matrixd|gateway)" 2>/dev/null || true
sleep 0.3
PIDS=()
TMP="$(mktemp -d)"
CFG="$TMP/e2e.toml"
cleanup() {
  echo "[multi-corpus-e2e] cleanup"
  for p in "${PIDS[@]}"; do [ -n "$p" ] && kill "$p" 2>/dev/null || true; done
  rm -rf "$TMP"
}
trap cleanup EXIT INT TERM
# Ephemeral mode (vectord storaged_url=""); same rationale as
# candidates_e2e — don't pollute MinIO _vectors/ between runs.
cat > "$CFG" <<EOF
[gateway]
bind = "127.0.0.1:3110"
storaged_url = "http://127.0.0.1:3211"
catalogd_url = "http://127.0.0.1:3212"
ingestd_url = "http://127.0.0.1:3213"
queryd_url = "http://127.0.0.1:3214"
vectord_url = "http://127.0.0.1:3215"
embedd_url = "http://127.0.0.1:3216"
pathwayd_url = "http://127.0.0.1:3217"
matrixd_url = "http://127.0.0.1:3218"
[vectord]
bind = "127.0.0.1:3215"
storaged_url = ""
[matrixd]
bind = "127.0.0.1:3218"
embedd_url = "http://127.0.0.1:3216"
vectord_url = "http://127.0.0.1:3215"
EOF
poll_health() {
  local port="$1" deadline=$(($(date +%s) + 5))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    if curl -sS --max-time 1 "http://127.0.0.1:$port/health" >/dev/null 2>&1; then return 0; fi
    sleep 0.05
  done
  return 1
}
echo "[multi-corpus-e2e] launching stack..."
./bin/storaged -config "$CFG" > /tmp/storaged.log 2>&1 & PIDS+=($!)
poll_health 3211 || { echo "storaged failed"; exit 1; }
./bin/embedd -config "$CFG" > /tmp/embedd.log 2>&1 & PIDS+=($!)
poll_health 3216 || { echo "embedd failed"; exit 1; }
./bin/vectord -config "$CFG" > /tmp/vectord.log 2>&1 & PIDS+=($!)
poll_health 3215 || { echo "vectord failed"; exit 1; }
./bin/matrixd -config "$CFG" > /tmp/matrixd.log 2>&1 & PIDS+=($!)
poll_health 3218 || { echo "matrixd failed"; exit 1; }
./bin/gateway -config "$CFG" > /tmp/gateway.log 2>&1 & PIDS+=($!)
poll_health 3110 || { echo "gateway failed"; exit 1; }
echo
echo "[multi-corpus-e2e] ingest workers (limit=$WORKERS_LIMIT)..."
./bin/staffing_workers -limit "$WORKERS_LIMIT"
echo
echo "[multi-corpus-e2e] ingest candidates..."
./bin/staffing_candidates -skip-populate=false -query "$QUERY" 2>&1 | grep -v "^\[candidates\]\(matrix\|reality\)" || true
echo
echo "[multi-corpus-e2e] /matrix/corpora — confirm both registered:"
curl -sS http://127.0.0.1:3110/v1/matrix/corpora | jq -c
echo
echo "[multi-corpus-e2e] multi-corpus query: $QUERY"
RESP="$(curl -sS -X POST http://127.0.0.1:3110/v1/matrix/search \
  -H 'Content-Type: application/json' \
  -d "{\"query_text\":\"$QUERY\",\"corpora\":[\"workers\",\"candidates\"],\"k\":8,\"per_corpus_k\":6}")"
# Sanity / headline assertions
WORKER_HITS="$(echo "$RESP" | jq -r '[.results[] | select(.corpus=="workers")] | length')"
CAND_HITS="$(echo "$RESP" | jq -r '[.results[] | select(.corpus=="candidates")] | length')"
TOTAL="$(echo "$RESP" | jq -r '.results | length')"
echo
echo "[multi-corpus-e2e] merged top-$TOTAL: workers=$WORKER_HITS candidates=$CAND_HITS"
echo "$RESP" | jq -r '.results[] | " \(.corpus | .[0:1]) d=\(.distance | tostring | .[0:6]) \(.id) \(.metadata.role // .metadata.skills // "n/a")"'
if [ "$WORKER_HITS" -gt 0 ] && [ "$CAND_HITS" -gt 0 ]; then
  echo
  echo "[multi-corpus-e2e] PASS: both corpora represented in merged top-$TOTAL"
  exit 0
else
  echo
  echo "[multi-corpus-e2e] FAIL: corpus mix was workers=$WORKER_HITS candidates=$CAND_HITS"
  exit 1
fi