#!/usr/bin/env bash
# golangLAKEHOUSE/scripts/playbook_lift.sh
#
# Playbook-lift reality test — measure whether the 5-loop substrate
# (matrix retrieve+merge + playbook + small-model judge) actually beats
# raw cosine on staffing queries.
#
# Pipeline:
#   1. Boot the full Go HTTP stack (storaged, catalogd, ingestd, queryd,
#      embedd, vectord, pathwayd, observerd, matrixd, gateway). Earlier
#      versions booted only the 5 daemons matrix.search needs, which
#      gave a falsely clean "everything works" signal — we now exercise
#      the prod-realistic daemon graph so daemons that observe (observerd)
#      or persist (pathwayd) are actually in the loop.
#   2. SQL surface probe — ingest a 3-row CSV via /v1/ingest (ingestd
#      → catalogd → queryd refresh), assert SELECT COUNT(*)=3. Proves the
#      ingestd→catalogd→queryd path is wired even though the lift driver
#      itself is vector-only retrieval.
#   3. Ingest workers (default 5000) + ethereal_workers corpora into vectord
#   4. Run the playbook_lift driver: cold pass → judge → record →
#      warm pass → measure
#   5. Generate markdown report from the JSON evidence
#
# Output:
#   reports/reality-tests/playbook_lift_<N>.json    — raw evidence
#   reports/reality-tests/playbook_lift_<N>.md      — human report
#
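# Requires: Ollama on :11434 with nomic-embed-text + the judge model
# loaded. Skips (exit 0) if Ollama is absent; exits 1 if Ollama is up
# but the judge model is missing. Also needs curl, jq and the Go toolchain.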
#
# Usage:
#   ./scripts/playbook_lift.sh                      # run #001 with defaults
#   RUN_ID=002 ./scripts/playbook_lift.sh           # explicit run id
#   JUDGE_MODEL=qwen2.5:latest ./scripts/playbook_lift.sh
#   WORKERS_LIMIT=2000 ./scripts/playbook_lift.sh

set -euo pipefail
cd "$(dirname "$0")/.."

export PATH="$PATH:/usr/local/go/bin"

# Tunables — every one can be overridden from the environment. `:=`
# assigns the default only when the variable is unset or empty.
: "${RUN_ID:=001}"
# JUDGE_MODEL stays empty unless the caller sets it; empty means "let
# the Go driver resolve from lakehouse.toml [models].local_judge".
: "${JUDGE_MODEL:=}"
: "${WORKERS_LIMIT:=5000}"
: "${QUERIES_FILE:=tests/reality/playbook_lift_queries.txt}"
: "${CORPORA:=workers,ethereal_workers}"
: "${K:=10}"
: "${CONFIG_PATH:=lakehouse.toml}"

# Both artefacts of a run share one stem, differing only in extension.
report_stem="reports/reality-tests/playbook_lift_${RUN_ID}"
OUT_JSON="${report_stem}.json"
OUT_MD="${report_stem}.md"

# No Ollama means nothing to measure — treat it as a skip, not a failure.
curl -sS --max-time 3 http://localhost:11434/api/tags >/dev/null 2>&1 || {
  echo "[lift] Ollama not reachable on :11434 — skipping"
  exit 0
}

# Resolve judge from config when not set explicitly — needed for the
# Ollama model-presence check below. Mirrors the Go driver's priority.

#######################################
# Print the first quoted local_judge value found in a TOML file.
# Prints nothing (and still succeeds) when the file is missing or has
# no such key, so callers can fall back to a default.
# Arguments: $1 - path to the TOML config
# Outputs:   the model name on stdout, or nothing
# Returns:   always 0
#######################################
resolve_local_judge() {
  local cfg="$1"
  [ -f "$cfg" ] || return 0
  # `sed -n … p` emits only lines where the capture succeeded, so an
  # unquoted value is skipped rather than echoed back verbatim. The
  # trailing `|| true` matters: under `set -e -o pipefail` a non-zero
  # pipeline inside an assignment aborts the whole script, which would
  # make the built-in fallback unreachable.
  sed -nE 's/^[[:space:]]*local_judge[[:space:]]*=[[:space:]]*"([^"]+)".*/\1/p' "$cfg" \
    | head -n 1 || true
}

EFFECTIVE_JUDGE="${JUDGE_MODEL:-}"
if [ -z "$EFFECTIVE_JUDGE" ]; then
  EFFECTIVE_JUDGE="$(resolve_local_judge "${CONFIG_PATH:-}")"
fi
# Last resort when neither the env override nor the config names a judge.
EFFECTIVE_JUDGE="${EFFECTIVE_JUDGE:-qwen3.5:latest}"

# Verify the judge is actually available before spending minutes on
# builds and ingest. Ollama lists models as name:tag and treats a bare
# name as name:latest, so accept either spelling. --max-time keeps a
# wedged Ollama from hanging the run here.
if ! curl -sS --max-time 5 http://localhost:11434/api/tags \
    | jq -e --arg m "$EFFECTIVE_JUDGE" \
        '.models[] | select(.name == $m or .name == ($m + ":latest"))' \
        >/dev/null 2>&1; then
  echo "[lift] judge model '$EFFECTIVE_JUDGE' not loaded in Ollama — pull it first"
  exit 1
fi
# Compute a single string for "where did the judge come from" so the
# log line + the markdown report don't have to chain :+/:- substitutions
# (those silently fuse "env JUDGE_MODEL" + the value into "env JUDGE_MODELx"
# without a separator — the bug Opus caught on lift_001's report).
# Three cases: explicit env override, a local_judge key present in the
# config file, or neither (the script's built-in default was used).
if [ -n "$JUDGE_MODEL" ]; then
  JUDGE_SOURCE="env JUDGE_MODEL=${JUDGE_MODEL}"
elif [ -f "$CONFIG_PATH" ] \
    && grep -Eq '^[[:space:]]*local_judge[[:space:]]*=' "$CONFIG_PATH"; then
  JUDGE_SOURCE="config [models].local_judge"
else
  JUDGE_SOURCE="built-in default"
fi
echo "[lift] judge resolved to: $EFFECTIVE_JUDGE (from $JUDGE_SOURCE)"

echo "[lift] building binaries..."
go build -o bin/ ./cmd/storaged ./cmd/catalogd ./cmd/ingestd ./cmd/queryd \
                 ./cmd/embedd ./cmd/vectord ./cmd/pathwayd ./cmd/observerd \
                 ./cmd/matrixd ./cmd/gateway \
                 ./scripts/staffing_workers ./scripts/staffing_candidates \
                 ./scripts/playbook_lift

# Kill leftovers from a previous run. pkill -f matches the pattern
# against the whole command line (binary path plus its arguments), so
# the name is anchored on both sides: a path boundary before "bin/" and
# a space or end-of-line after the daemon name. A bare "$" would never
# match because the daemons are started with "-config <file>" after the
# name. chatd is deliberately absent (independent of retrieval, stays up).
# pkill exits 1 when nothing matched; that is not an error here.
pkill -f '(^|/)bin/(storaged|catalogd|ingestd|queryd|embedd|vectord|pathwayd|observerd|matrixd|gateway)( |$)' 2>/dev/null || true
sleep 0.3

# PIDS collects the daemons started by this run; TMP holds the generated
# config and probe CSV. Both are consumed by cleanup().
PIDS=()
TMP="$(mktemp -d)"
CFG="$TMP/lift.toml"

cleanup() {
  echo "[lift] cleanup"
  for p in "${PIDS[@]:-}"; do [ -n "${p:-}" ] && kill "$p" 2>/dev/null || true; done
  rm -rf "$TMP"
}
trap cleanup EXIT INT TERM

# Write the throwaway TOML config that every daemon in the launch
# section receives via -config. All binds are loopback-only; port map:
#   gateway 3110 · storaged 3211 · catalogd 3212 · ingestd 3213 · queryd 3214
#   vectord 3215 · embedd 3216 · pathwayd 3217 · matrixd 3218 · observerd 3219
# These ports must stay in sync with the poll_health calls that follow
# each daemon launch. The heredoc delimiter is unquoted, so a `$` or
# backtick added to the body would be expanded by the shell — the body
# currently contains neither.
# NOTE(review): vectord/pathwayd/observerd get empty storaged_url /
# persist_path — presumably to keep this run in-memory; confirm.
cat > "$CFG" <<EOF
# [s3] tells storaged which bucket to talk to. Without it, defaults
# resolve to "lakehouse-primary" (no -go-) which doesn't exist on this
# box and catalogd's rehydrate fails with NoSuchBucket. Access keys
# come from the secrets file (storaged -secrets defaults to
# /etc/lakehouse/secrets-go.toml), not this temp toml.
[s3]
endpoint        = "http://localhost:9000"
region          = "us-east-1"
bucket          = "lakehouse-go-primary"
use_path_style  = true

[gateway]
bind = "127.0.0.1:3110"
storaged_url = "http://127.0.0.1:3211"
catalogd_url = "http://127.0.0.1:3212"
ingestd_url  = "http://127.0.0.1:3213"
queryd_url   = "http://127.0.0.1:3214"
vectord_url  = "http://127.0.0.1:3215"
embedd_url   = "http://127.0.0.1:3216"
pathwayd_url = "http://127.0.0.1:3217"
matrixd_url  = "http://127.0.0.1:3218"
observerd_url = "http://127.0.0.1:3219"

[storaged]
bind = "127.0.0.1:3211"

[catalogd]
bind = "127.0.0.1:3212"
storaged_url = "http://127.0.0.1:3211"

[ingestd]
bind = "127.0.0.1:3213"
storaged_url = "http://127.0.0.1:3211"
catalogd_url = "http://127.0.0.1:3212"
max_ingest_bytes = 268435456

[queryd]
bind = "127.0.0.1:3214"
catalogd_url = "http://127.0.0.1:3212"
secrets_path = "/etc/lakehouse/secrets-go.toml"
# Aggressive refresh so the SQL probe table appears within ~1s of
# ingestd registering it, instead of the prod default 30s.
refresh_every = "1s"

[embedd]
bind = "127.0.0.1:3216"
provider_url  = "http://localhost:11434"
default_model = "nomic-embed-text"

[vectord]
bind = "127.0.0.1:3215"
storaged_url = ""

[pathwayd]
bind = "127.0.0.1:3217"
persist_path = ""

[observerd]
bind = "127.0.0.1:3219"
persist_path = ""

[matrixd]
bind = "127.0.0.1:3218"
embedd_url  = "http://127.0.0.1:3216"
vectord_url = "http://127.0.0.1:3215"
EOF

#######################################
# Wait until a daemon answers on its /health endpoint.
# Any HTTP response counts as "up": curl runs without --fail, so only
# transport errors (refused, timeout) are treated as "not yet".
# Arguments: $1 - TCP port on 127.0.0.1
#            $2 - optional timeout in whole seconds (default 5)
# Returns:   0 once the endpoint answered, 1 if the deadline passed
#######################################
poll_health() {
  local port="$1" timeout="${2:-5}"
  local deadline
  deadline=$(( $(date +%s) + timeout ))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    if curl -sS --max-time 1 "http://127.0.0.1:${port}/health" >/dev/null 2>&1; then
      return 0
    fi
    sleep 0.05
  done
  return 1
}

echo "[lift] launching stack (10 daemons; chatd stays up independently)..."
# The list below is ordered by dependency and must be walked front to
# back: storaged → catalogd (needs storaged) → ingestd (needs
# storaged+catalogd) → queryd (needs catalogd) → embedd → vectord →
# pathwayd → observerd → matrixd (needs embedd+vectord) → gateway
# (needs all of them). Each entry is <daemon>:<health port>.
for svc in \
    storaged:3211 catalogd:3212 ingestd:3213 queryd:3214 embedd:3216 \
    vectord:3215 pathwayd:3217 observerd:3219 matrixd:3218 gateway:3110; do
  svc_name="${svc%%:*}"
  svc_port="${svc##*:}"
  # Start in the background, remember the PID for cleanup, and only
  # move on to the next daemon once this one answers /health.
  "./bin/${svc_name}" -config "$CFG" > "/tmp/${svc_name}.log" 2>&1 &
  PIDS+=($!)
  if ! poll_health "$svc_port"; then
    echo "${svc_name} failed"
    exit 1
  fi
done

echo
echo "[lift] SQL surface probe — ingest 3-row CSV, assert SELECT COUNT(*)=3..."
PROBE_CSV="$TMP/sql_probe.csv"
cat > "$PROBE_CSV" <<CSVEOF
id,name,role
1,Alice,Forklift Operator
2,Bob,Production Worker
3,Charlie,Warehouse Associate
CSVEOF
INGEST_RESP="$(curl -sS -F "file=@$PROBE_CSV" "http://127.0.0.1:3110/v1/ingest?name=lift_sql_probe")"
echo "[lift]   ingest response: $INGEST_RESP"
# Poll up to 5s for queryd to discover the manifest. refresh_every=1s
# is a lower bound; under load or slow disks the manifest may not be
# visible in a fixed sleep, which would 4xx the SQL probe spuriously.
PROBE_COUNT=ERR
SQL_RESP=""
deadline=$(($(date +%s) + 5))
while [ "$(date +%s)" -lt "$deadline" ]; do
  # `|| true` keeps a transport-level curl failure from tripping
  # `set -e` mid-loop — it should be retried like any other miss.
  # --max-time bounds a single attempt so the 5s deadline is honoured,
  # and 2>&1 keeps curl's own error text for the failure message.
  SQL_RESP="$(curl -sS --max-time 2 -X POST http://127.0.0.1:3110/v1/sql \
      -H 'content-type: application/json' \
      -d '{"sql":"SELECT COUNT(*) FROM lift_sql_probe"}' 2>&1 || true)"
  # Anything that is not JSON with .rows[0][0] collapses to "ERR".
  PROBE_COUNT="$(jq -r '.rows[0][0] // "ERR"' <<<"$SQL_RESP" 2>/dev/null || echo "ERR")"
  [ "$PROBE_COUNT" = "3" ] && break
  sleep 0.25
done
if [ "$PROBE_COUNT" = "3" ]; then
  echo "[lift]   ✓ SQL surface probe passed (rowcount=3)"
else
  echo "[lift]   ✗ SQL surface probe FAILED after 5s (got: $SQL_RESP)"
  exit 1
fi

echo
echo "[lift] ingest workers (limit=$WORKERS_LIMIT)..."
./bin/staffing_workers -limit "$WORKERS_LIMIT"

echo
echo "[lift] ingest ethereal_workers (10K, second staffing-domain corpus)..."
# ethereal_workers is the right second corpus for staffing-domain reality
# tests: same schema as workers_500k but a different population (Material
# Handlers, Admin Assistants, etc.) so the matrix layer's multi-corpus
# retrieve+merge actually has TWO relevant corpora to compose against.
# Earlier versions used scripts/staffing_candidates against the SWE-tech
# candidates parquet (Swift/iOS, Scala/Spark, Rust/DataFusion) — wrong
# domain for staffing queries; effectively dead-corpus noise.
# id-prefix "e-" prevents collisions with workers' "w-" since both files
# count worker_id from 1.
# ETHEREAL_PARQUET lets the run be pointed at a copy of the dataset on
# another machine; the default is the previously hard-coded location,
# so existing invocations behave exactly as before.
ETHEREAL_PARQUET="${ETHEREAL_PARQUET:-/home/profit/lakehouse/data/datasets/ethereal_workers.parquet}"
./bin/staffing_workers \
  -parquet "$ETHEREAL_PARQUET" \
  -index-name ethereal_workers \
  -id-prefix "e-" \
  -limit 0

echo
echo "[lift] running driver — judge=$EFFECTIVE_JUDGE · queries=$QUERIES_FILE · k=$K"
# -judge "$JUDGE_MODEL" passes either the explicit env override or
# the empty string. The Go driver treats empty -judge as "not set"
# and runs its own resolution chain (env → config → fallback). When
# JUDGE_MODEL IS set, the explicit -judge wins inside the Go driver
# regardless of what its env-lookup would find — flag wins by design.
./bin/playbook_lift \
  -config  "$CONFIG_PATH" \
  -gateway "http://127.0.0.1:3110" \
  -ollama  "http://localhost:11434" \
  -queries "$QUERIES_FILE" \
  -corpora "$CORPORA" \
  -judge   "$JUDGE_MODEL" \
  -k       "$K" \
  -out     "$OUT_JSON"

echo
echo "[lift] generating markdown report → $OUT_MD"
#######################################
# Render the human-readable markdown report from the driver's JSON.
# Globals:   RUN_ID, EFFECTIVE_JUDGE, JUDGE_SOURCE, CORPORA,
#            WORKERS_LIMIT, QUERIES_FILE, K (all read)
# Arguments: $1 - path to the evidence JSON (reads .summary and .runs)
#            $2 - path of the markdown file to (over)write
# Outputs:   writes $2; nothing on stdout
# Returns:   non-zero if jq cannot read the JSON
#######################################
generate_md() {
  local json="$1" md="$2"
  local total discovery lift no_change boosted mean_delta gen_at
  total=$(jq -r '.summary.total' "$json")
  discovery=$(jq -r '.summary.with_discovery' "$json")
  lift=$(jq -r '.summary.lift_count' "$json")
  no_change=$(jq -r '.summary.no_change' "$json")
  boosted=$(jq -r '.summary.playbook_boosted_total' "$json")
  mean_delta=$(jq -r '.summary.mean_top1_delta_distance' "$json")
  gen_at=$(jq -r '.summary.generated_at' "$json")

  # Unquoted heredoc: ${...} is expanded, literal backticks are escaped.
  # The Evidence line names the file actually read ($json), not a global.
  cat > "$md" <<MDEOF
# Playbook-Lift Reality Test — Run ${RUN_ID}

**Generated:** ${gen_at}
**Judge:** \`${EFFECTIVE_JUDGE}\` (Ollama, resolved from ${JUDGE_SOURCE})
**Corpora:** \`${CORPORA}\`
**Workers limit:** ${WORKERS_LIMIT}
**Queries:** \`${QUERIES_FILE}\` (${total} executed)
**K per pass:** ${K}
**Evidence:** \`${json}\`

---

## Headline

| Metric | Value |
|---|---:|
| Total queries run | ${total} |
| Cold-pass discoveries (judge-best ≠ top-1) | ${discovery} |
| Warm-pass lifts (recorded playbook → top-1) | ${lift} |
| No change (judge-best already top-1, no playbook needed) | ${no_change} |
| Playbook boosts triggered (warm pass) | ${boosted} |
| Mean Δ top-1 distance (warm − cold) | ${mean_delta} |

**Lift rate:** ${lift} of ${discovery} discoveries became top-1 after warm pass.

---

## Per-query results

| # | Query | Cold top-1 | Cold judge-best (rank/rating) | Recorded? | Warm top-1 | Judge-best warm rank | Lift |
|---|---|---|---|---|---|---|---|
MDEOF

  # One table row per run. The query is cut to 60 chars and any "|" in
  # it is backslash-escaped so it cannot split the markdown cell.
  jq -r '.runs | to_entries[] |
    [
      (.key + 1 | tostring),
      ((.value.query // "") | .[0:60] | split("|") | join("\\|")),
      .value.cold_top1_id,
      ((.value.cold_judge_best_rank | tostring) + "/" + (.value.cold_judge_best_rating | tostring)),
      (if .value.playbook_recorded then "✓ " + (.value.playbook_target_id // "") else "—" end),
      .value.warm_top1_id,
      (.value.warm_judge_best_rank | tostring),
      (if .value.lift then "**YES**" else "no" end)
    ] | "| " + join(" | ") + " |"
  ' "$json" >> "$md"

  # Caveat 1 must name EFFECTIVE_JUDGE: JUDGE_MODEL is empty whenever
  # the judge came from the config, which is the default path.
  cat >> "$md" <<MDEOF

---

## Honesty caveats

1. **Judge IS the ground truth proxy.** Without human-labeled relevance, the LLM
   judge's verdict is what defines "best." If \`${EFFECTIVE_JUDGE}\` rates badly,
   the lift number is meaningless. To validate the judge itself, sample 5–10
   verdicts manually and check agreement.
2. **Score-1.0 boost = distance halved.** Playbook math is
   \`distance' = distance × (1 - 0.5 × score)\`. Lift requires the judge-best
   result's pre-boost distance to be ≤ 2× the cold top-1's distance, otherwise
   even halving doesn't promote it. Tight clusters → little visible lift.
3. **Same-query replay is the cheap case.** Real lift comes from *similar but
   not identical* queries hitting a recorded playbook. This run only tests
   verbatim replay. A v2 should add paraphrase queries.
4. **Multi-corpus skew.** Default corpora=\`${CORPORA}\` — if all judge-best
   results land in one corpus, the matrix layer's purpose isn't being tested.
   Check per-corpus distribution in the JSON.
5. **Judge resolution.** This run used \`${EFFECTIVE_JUDGE}\` from
   ${JUDGE_SOURCE}.
   Bumping the judge for run #N+1 means editing one line in lakehouse.toml.

## Next moves

- If lift rate ≥ 50% of discoveries: matrix layer + playbook is doing real
  work. Move to paraphrase queries + tag-based boost (currently ignored).
- If lift rate < 20%: investigate why — judge variance, distance gap too
  wide, or playbook math too gentle. The score=1.0 / 0.5× formula may need
  retuning.
- If discovery rate (cold judge-best ≠ top-1) is itself low: cosine is
  already close to optimal on this query distribution. Either the corpus
  is too narrow or the queries are too easy.
MDEOF
}

# Render the report, then tell the operator where both artefacts landed.
generate_md "$OUT_JSON" "$OUT_MD"

printf '\n%s\n' "[lift] DONE"
printf '%s\n' \
  "[lift]   evidence:  $OUT_JSON" \
  "[lift]   report:    $OUT_MD"