Phase 1 had two known gaps: (1) the 3 contracts had zero shared role names, so same-role-across-contracts Jaccard was vacuous (n=0); (2) the verbatim handover at 100% was the trivial case, not the hard learning test (paraphrased queries against another coord's playbook). Both are fixed in this commit.

Contract redesign: all 3 contracts now share warehouse worker / admin assistant / heavy equipment operator roles, plus a unique specialist per contract (industrial electrician / bilingual safety coord / drone surveyor — the "specialist not on the standard roster" case from J's spec). Counts and skill mixes vary per region.

New driver phase 4b, paraphrase handover: Bob runs qwen2.5-paraphrased versions of Alice's contract queries against Alice's playbook namespace. This tests whether institutional memory propagates across coordinators AND across the natural wording variation Bob would introduce when running Alice's contract.

Run #002 results (5K workers + 10K ethereal_workers, 4 demand × 3 coords + paraphrase handover):

Diversity (the question J asked: locking or cycling?):
- Same-role-across-contracts Jaccard = 0.119 (n=9) → 88% of workers DIFFER across regions for the same role name. Milwaukee warehouse vs Indianapolis warehouse vs Chicago warehouse pull mostly distinct top-K from the same population. The system locks into geo+cert+skill context; it is not cycling.
- Different-roles-same-contract Jaccard = 0.004 (n=18) → role-specific retrieval works (unchanged from Phase 1).

Determinism: Jaccard = 1.000 (n=12), unchanged.

Learning:
- Verbatim handover 4/4 = 100% (trivial case, expected)
- Paraphrase handover 4/4 = 100% (HARD case — passes!)

Of those 4 paraphrase recoveries:
- 2 used boost (Alice's recording was already in Bob's paraphrase top-K; ApplyPlaybookBoost re-ranked it to top-1)
- 2 used Shape B inject (the recording wasn't in Bob's paraphrase top-K; InjectPlaybookMisses brought it in)

The boost/inject mix is healthy: both paths are exercised and both produce correct top-1s. Multi-coord institutional memory propagation is empirically working under wording variation.

Sample warehouse worker top-1s across contracts (proves diversity):
- alice / Milwaukee → w-713
- bob / Indianapolis → e-8447
- carol / Chicago → e-7145

Three different workers from the same 15K-person population, selected on geo+cert+skill context.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
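The diversity numbers above are mean pairwise Jaccard over top-K ID sets. A minimal sketch of the per-pair computation, using hypothetical worker IDs (the real harness reads them from the run JSON's `.events[].top_k[].id`):

```shell
#!/usr/bin/env bash
# Jaccard = |A ∩ B| / |A ∪ B| over two top-K ID sets.
# The IDs below are hypothetical stand-ins for one role under two contracts.
a=$'w-713\nw-102\nw-556'
b=$'e-8447\nw-102\ne-7145'
inter=$(comm -12 <(sort <<<"$a") <(sort <<<"$b") | wc -l)   # IDs shared by both sets
union=$(printf '%s\n%s\n' "$a" "$b" | sort -u | wc -l)      # distinct IDs overall
awk -v i="$inter" -v u="$union" 'BEGIN { printf "jaccard=%.3f\n", i/u }'  # prints jaccard=0.200
```

With 1 shared ID out of 5 distinct, this pair scores 0.200; the run's 0.119 is the same computation averaged over the n=9 same-role pairs.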
283 lines · 11 KiB · Bash · Executable File
#!/usr/bin/env bash
# Multi-coordinator stress harness — Phase 1 of the 48-hour mock.
#
# Three coordinators (Alice / Bob / Carol) own three distinct contracts
# (Milwaukee distribution, Indianapolis manufacturing, Chicago
# construction). The driver fires phases:
#   1. baseline — each coord runs their contract's role queries
#   2. surge    — each contract's demand doubles (URGENT phrasing)
#   3. merge    — alpha + beta combined under alice
#   4. handover — bob takes alpha, USING alice's playbook namespace
#   5. split    — alpha surge re-distributed across all 3 coords
#   6. reissue  — non-determinism check: same baselines reissued
#   7. analysis — diversity + determinism + learning metrics
#
# Phase 1 deliberately skips the 48-hour clock, email/SMS endpoints,
# and Langfuse wiring — those are Phase 2/3.
#
# Usage:
#   ./scripts/multi_coord_stress.sh               # run #001
#   RUN_ID=002 ./scripts/multi_coord_stress.sh
#   K=12 ./scripts/multi_coord_stress.sh

set -euo pipefail
cd "$(dirname "$0")/.."

export PATH="$PATH:/usr/local/go/bin"

RUN_ID="${RUN_ID:-001}"
WORKERS_LIMIT="${WORKERS_LIMIT:-5000}"
ETHEREAL_LIMIT="${ETHEREAL_LIMIT:-0}"
CORPORA="${CORPORA:-workers,ethereal_workers}"
K="${K:-8}"

OUT_JSON="reports/reality-tests/multi_coord_stress_${RUN_ID}.json"
OUT_MD="reports/reality-tests/multi_coord_stress_${RUN_ID}.md"

if ! curl -sS --max-time 3 http://localhost:11434/api/tags >/dev/null 2>&1; then
  echo "[stress] Ollama not reachable on :11434 — skipping (need it for embeddings)"
  exit 0
fi

echo "[stress] building binaries..."
go build -o bin/ ./cmd/storaged ./cmd/catalogd ./cmd/ingestd ./cmd/queryd \
  ./cmd/embedd ./cmd/vectord ./cmd/pathwayd ./cmd/observerd \
  ./cmd/matrixd ./cmd/gateway \
  ./scripts/staffing_workers ./scripts/multi_coord_stress

pkill -f "bin/(storaged|catalogd|ingestd|queryd|embedd|vectord|pathwayd|observerd|matrixd|gateway)" 2>/dev/null || true
sleep 0.3

PIDS=()
TMP="$(mktemp -d)"
CFG="$TMP/stress.toml"

cleanup() {
  echo "[stress] cleanup"
  for p in "${PIDS[@]:-}"; do [ -n "${p:-}" ] && kill "$p" 2>/dev/null || true; done
  rm -rf "$TMP"
}
trap cleanup EXIT INT TERM

cat > "$CFG" <<EOF
[s3]
endpoint = "http://localhost:9000"
region = "us-east-1"
bucket = "lakehouse-go-primary"
use_path_style = true

[gateway]
bind = "127.0.0.1:3110"
storaged_url = "http://127.0.0.1:3211"
catalogd_url = "http://127.0.0.1:3212"
ingestd_url = "http://127.0.0.1:3213"
queryd_url = "http://127.0.0.1:3214"
vectord_url = "http://127.0.0.1:3215"
embedd_url = "http://127.0.0.1:3216"
pathwayd_url = "http://127.0.0.1:3217"
matrixd_url = "http://127.0.0.1:3218"
observerd_url = "http://127.0.0.1:3219"

[storaged]
bind = "127.0.0.1:3211"

[catalogd]
bind = "127.0.0.1:3212"
storaged_url = "http://127.0.0.1:3211"

[ingestd]
bind = "127.0.0.1:3213"
storaged_url = "http://127.0.0.1:3211"
catalogd_url = "http://127.0.0.1:3212"
max_ingest_bytes = 268435456

[queryd]
bind = "127.0.0.1:3214"
catalogd_url = "http://127.0.0.1:3212"
secrets_path = "/etc/lakehouse/secrets-go.toml"
refresh_every = "1s"

[embedd]
bind = "127.0.0.1:3216"
provider_url = "http://localhost:11434"
default_model = "nomic-embed-text"

[vectord]
bind = "127.0.0.1:3215"
storaged_url = ""

[pathwayd]
bind = "127.0.0.1:3217"
persist_path = ""

[observerd]
bind = "127.0.0.1:3219"
persist_path = ""

[matrixd]
bind = "127.0.0.1:3218"
embedd_url = "http://127.0.0.1:3216"
vectord_url = "http://127.0.0.1:3215"
EOF

poll_health() {
  local port="$1" deadline=$(($(date +%s) + 5))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    if curl -sS --max-time 1 "http://127.0.0.1:$port/health" >/dev/null 2>&1; then return 0; fi
    sleep 0.05
  done
  return 1
}

echo "[stress] launching stack..."
./bin/storaged -config "$CFG" > /tmp/stress_storaged.log 2>&1 & PIDS+=($!); poll_health 3211 || { echo "storaged failed"; exit 1; }
./bin/catalogd -config "$CFG" > /tmp/stress_catalogd.log 2>&1 & PIDS+=($!); poll_health 3212 || { echo "catalogd failed"; exit 1; }
./bin/ingestd -config "$CFG" > /tmp/stress_ingestd.log 2>&1 & PIDS+=($!); poll_health 3213 || { echo "ingestd failed"; exit 1; }
./bin/queryd -config "$CFG" > /tmp/stress_queryd.log 2>&1 & PIDS+=($!); poll_health 3214 || { echo "queryd failed"; exit 1; }
./bin/embedd -config "$CFG" > /tmp/stress_embedd.log 2>&1 & PIDS+=($!); poll_health 3216 || { echo "embedd failed"; exit 1; }
./bin/vectord -config "$CFG" > /tmp/stress_vectord.log 2>&1 & PIDS+=($!); poll_health 3215 || { echo "vectord failed"; exit 1; }
./bin/pathwayd -config "$CFG" > /tmp/stress_pathwayd.log 2>&1 & PIDS+=($!); poll_health 3217 || { echo "pathwayd failed"; exit 1; }
./bin/observerd -config "$CFG" > /tmp/stress_observerd.log 2>&1 & PIDS+=($!); poll_health 3219 || { echo "observerd failed"; exit 1; }
./bin/matrixd -config "$CFG" > /tmp/stress_matrixd.log 2>&1 & PIDS+=($!); poll_health 3218 || { echo "matrixd failed"; exit 1; }
./bin/gateway -config "$CFG" > /tmp/stress_gateway.log 2>&1 & PIDS+=($!); poll_health 3110 || { echo "gateway failed"; exit 1; }

echo
echo "[stress] ingest workers (limit=$WORKERS_LIMIT) into 'workers' corpus..."
./bin/staffing_workers -limit "$WORKERS_LIMIT"

echo
echo "[stress] ingest ethereal_workers (limit=$ETHEREAL_LIMIT, 0=all) into 'ethereal_workers' corpus..."
./bin/staffing_workers \
  -parquet "/home/profit/lakehouse/data/datasets/ethereal_workers.parquet" \
  -index-name ethereal_workers \
  -id-prefix "e-" \
  -limit "$ETHEREAL_LIMIT"

echo
echo "[stress] running multi-coord stress driver..."
EXTRA_FLAGS=""
if [ "${WITH_PARAPHRASE_HANDOVER:-1}" = "1" ]; then
  EXTRA_FLAGS="$EXTRA_FLAGS -with-paraphrase-handover"
fi
./bin/multi_coord_stress \
  -gateway "http://127.0.0.1:3110" \
  -contracts tests/reality/contracts \
  -corpora "$CORPORA" \
  -k "$K" \
  -out "$OUT_JSON" \
  -ollama "http://localhost:11434" \
  -judge "${JUDGE_MODEL:-qwen2.5:latest}" \
  $EXTRA_FLAGS  # intentionally unquoted so the flags word-split

echo
echo "[stress] generating markdown report → $OUT_MD"

# Render compact markdown from the JSON. Same shape as the lift harness
# reports so reviewers can compare format.
total=$(jq -r '.events | length' "$OUT_JSON")
gen_at=$(jq -r '.generated_at' "$OUT_JSON")
div_role=$(jq -r '.diversity.same_role_across_contracts_mean_jaccard' "$OUT_JSON")
div_role_n=$(jq -r '.diversity.num_pairs_same_role_across_contracts' "$OUT_JSON")
div_xrole=$(jq -r '.diversity.different_roles_same_contract_mean_jaccard' "$OUT_JSON")
div_xrole_n=$(jq -r '.diversity.num_pairs_different_roles_same_contract' "$OUT_JSON")
det_jacc=$(jq -r '.determinism.mean_jaccard' "$OUT_JSON")
det_n=$(jq -r '.determinism.num_reissued_pairs' "$OUT_JSON")
hand_run=$(jq -r '.learning.handover_queries_run' "$OUT_JSON")
hand_top1=$(jq -r '.learning.recorded_answers_top1_count' "$OUT_JSON")
hand_topk=$(jq -r '.learning.recorded_answers_topk_count' "$OUT_JSON")
hand_rate=$(jq -r '.learning.handover_hit_rate' "$OUT_JSON")
ph_run=$(jq -r '.learning.paraphrase_handover_run // 0' "$OUT_JSON")
ph_top1=$(jq -r '.learning.paraphrase_top1_count // 0' "$OUT_JSON")
ph_topk=$(jq -r '.learning.paraphrase_topk_count // 0' "$OUT_JSON")
ph_rate=$(jq -r '.learning.paraphrase_handover_hit_rate // 0' "$OUT_JSON")

cat > "$OUT_MD" <<MDEOF
# Multi-Coordinator Stress Test — Run ${RUN_ID}

**Generated:** ${gen_at}
**Coordinators:** alice / bob / carol (each with own playbook namespace: \`playbook_alice\` / \`playbook_bob\` / \`playbook_carol\`)
**Contracts:** $(jq -r '.contracts | join(" / ")' "$OUT_JSON")
**Corpora:** \`${CORPORA}\`
**K per query:** ${K}
**Total events captured:** ${total}
**Evidence:** \`${OUT_JSON}\`

---

## Diversity — is the system locking into scenarios or cycling?

| Metric | Mean Jaccard | n pairs | Interpretation |
|---|---:|---:|---|
| Same role across different contracts | ${div_role} | ${div_role_n} | Lower = more diverse (different region/cert mix → different workers) |
| Different roles within same contract | ${div_xrole} | ${div_xrole_n} | Should be near-zero (different roles = different worker pools) |

**Healthy ranges:**
- Same role across contracts: < 0.30 means the system is genuinely picking different workers per region/contract.
- Different roles same contract: < 0.10 means role-specific retrieval is working.
- If either is > 0.50, the system is "cycling" the same handful of workers regardless of query intent.

---

## Determinism — same query reissued, top-K stability

| Metric | Value |
|---|---:|
| Mean Jaccard on retrieval-only reissue | ${det_jacc} |
| Number of reissue pairs | ${det_n} |

**Interpretation:**
- ≥ 0.95: HNSW retrieval is highly deterministic; reissues land on near-identical top-K. Good — system locks into a stable view of "best workers for this query."
- 0.80 – 0.95: Some HNSW or embed variance, acceptable.
- < 0.80: Retrieval is unstable — reissues see substantially different results, suggesting either embed nondeterminism (Ollama returning slightly different vectors) or vectord nondeterminism (HNSW insertion order affecting recall).

---

## Learning — handover hit rate

Bob takes Alice's contract using Alice's playbook namespace. Did Alice's recorded answers surface in Bob's results?

| Metric | Value |
|---|---:|
| Verbatim handover queries run | ${hand_run} |
| Alice's recorded answer at Bob's top-1 (verbatim) | ${hand_top1} |
| Alice's recorded answer in Bob's top-K (verbatim) | ${hand_topk} |
| **Verbatim handover hit rate (top-1)** | **${hand_rate}** |
| Paraphrase handover queries run | ${ph_run} |
| Alice's recorded answer at Bob's top-1 (paraphrase) | ${ph_top1} |
| Alice's recorded answer in Bob's top-K (paraphrase) | ${ph_topk} |
| **Paraphrase handover hit rate (top-1)** | **${ph_rate}** |

**Interpretation:**
- Verbatim hit rate ≈ 1.0: trivial case — Bob runs identical queries; should always hit.
- Paraphrase hit rate ≥ 0.5: institutional memory survives wording change — the harder learning property.
- Paraphrase hit rate ≈ 0.0: Bob's paraphrases drift past the inject threshold, so Alice's recordings don't activate. Same caveat as the playbook_lift paraphrase pass.

---

## Per-event capture

All matrix.search responses live in the JSON — top-K with worker IDs, distances, and per-corpus counts. Search by phase:

\`\`\`bash
jq '.events[] | select(.phase == "merge")' ${OUT_JSON}
jq '.events[] | select(.coordinator == "alice" and .phase == "baseline")' ${OUT_JSON}
jq '.events[] | select(.role == "warehouse worker") | {phase, contract, top_k_ids: [.top_k[].id]}' ${OUT_JSON}
\`\`\`

---

## What's NOT in this run (Phase 1 deliberately defers)

- **48-hour clock.** Events fire as discrete steps, not on a timeline.
- **Email / SMS ingest.** No endpoints exist on the Go side yet.
- **New-resume injection mid-run.** The corpus is fixed at the start.
- **Langfuse traces.** Need Go-side wiring.

These are Phase 2/3. The Phase 1 substrate is what the time-based runner will mount on top of.
MDEOF

echo
echo "[stress] DONE"
echo "[stress] evidence: $OUT_JSON"
echo "[stress] report: $OUT_MD"
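Every knob in the script above uses bash's `${VAR:-default}` expansion, so any run parameter can be overridden with a plain env prefix. A quick demonstration of the pattern (run ids shown are hypothetical):

```shell
# ${VAR:-default} yields the default only when VAR is unset or empty.
unset RUN_ID
echo "RUN_ID=${RUN_ID:-001}"    # prints RUN_ID=001 (default applies)
RUN_ID=002
echo "RUN_ID=${RUN_ID:-001}"    # prints RUN_ID=002 (override wins)

# So a re-run with the paraphrase phase disabled is just an env prefix:
#   WITH_PARAPHRASE_HANDOVER=0 RUN_ID=003 ./scripts/multi_coord_stress.sh
```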