golangLAKEHOUSE/scripts/playbook_lift.sh

#!/usr/bin/env bash
# Playbook-lift reality test — measure whether the 5-loop substrate
# (matrix retrieve+merge + playbook + small-model judge) actually beats
# raw cosine on staffing queries.
#
# Pipeline:
#   1. Boot the Go stack (storaged, embedd, vectord, matrixd, gateway)
#   2. Ingest workers (default 5000) + candidates corpora
#   3. Run the playbook_lift driver: cold pass → judge → record →
#      warm pass → measure
#   4. Generate markdown report from the JSON evidence
#
# Output:
#   reports/reality-tests/playbook_lift_<N>.json    — raw evidence
#   reports/reality-tests/playbook_lift_<N>.md      — human report
#
# Requires: go, curl and jq on PATH, plus Ollama on :11434 with
# nomic-embed-text + the judge model loaded. Skips (exit 0) if Ollama is
# unreachable; exits 1 if the judge model is not loaded in Ollama.
#
# Usage:
#   ./scripts/playbook_lift.sh                      # run #001 with defaults
#   RUN_ID=002 ./scripts/playbook_lift.sh           # explicit run id
#   JUDGE_MODEL=qwen2.5:latest ./scripts/playbook_lift.sh
#   WORKERS_LIMIT=2000 ./scripts/playbook_lift.sh

set -euo pipefail
cd "$(dirname "$0")/.."

export PATH="$PATH:/usr/local/go/bin"

# Tunables. Every one of these can be supplied through the environment;
# the value after := is used only when the variable is unset or empty.
: "${RUN_ID:=001}"
# Leaving JUDGE_MODEL empty hands judge selection to the Go driver
# (lakehouse.toml [models].local_judge). A non-empty value is an override.
: "${JUDGE_MODEL:=}"
: "${WORKERS_LIMIT:=5000}"
: "${QUERIES_FILE:=tests/reality/playbook_lift_queries.txt}"
: "${CORPORA:=workers,candidates}"
: "${K:=10}"
: "${CONFIG_PATH:=lakehouse.toml}"

# Evidence (JSON) and report (markdown) share one stem keyed by the run id.
OUT_JSON="reports/reality-tests/playbook_lift_${RUN_ID}.json"
OUT_MD="${OUT_JSON%.json}.md"

# Preflight 1 — Ollama reachability. An unreachable Ollama is a soft skip
# (exit 0), not a failure. The /api/tags payload is kept so the
# model-presence check below does not need a second, un-timed request.
if ! OLLAMA_TAGS="$(curl -sS --max-time 3 http://localhost:11434/api/tags 2>/dev/null)"; then
  echo "[lift] Ollama not reachable on :11434 — skipping"
  exit 0
fi

# jq is needed both for the model check here and for the report later;
# fail with an accurate message instead of a misleading "model not loaded".
if ! command -v jq >/dev/null 2>&1; then
  echo "[lift] jq not found on PATH — install it first"
  exit 1
fi

# Preflight 2 — resolve the judge model name for the presence check.
# Priority: JUDGE_MODEL env override → first double-quoted local_judge = "…"
# line in $CONFIG_PATH → built-in default. (Intended to mirror the Go
# driver's own resolution.)
#
# The config is scanned in pure bash on purpose: a `grep | head | sed`
# pipeline returns non-zero when the key is absent, which under
# `set -e -o pipefail` would abort the script before the default applies.
EFFECTIVE_JUDGE="$JUDGE_MODEL"
judge_source="env"
if [ -z "$EFFECTIVE_JUDGE" ] && [ -f "$CONFIG_PATH" ]; then
  judge_re='^[[:space:]]*local_judge[[:space:]]*=[[:space:]]*"([^"]+)"'
  # `|| [ -n … ]` keeps a final line that lacks a trailing newline.
  while IFS= read -r cfg_line || [ -n "$cfg_line" ]; do
    if [[ "$cfg_line" =~ $judge_re ]]; then
      EFFECTIVE_JUDGE="${BASH_REMATCH[1]}"
      judge_source="config"
      break
    fi
  done < "$CONFIG_PATH"
fi
if [ -z "$EFFECTIVE_JUDGE" ]; then
  EFFECTIVE_JUDGE="qwen3.5:latest"
  judge_source="built-in default"
fi

# jq -e exits non-zero when the select() produces no output, i.e. when no
# entry in .models[] has exactly this name.
if ! jq -e --arg m "$EFFECTIVE_JUDGE" '.models[] | select(.name == $m)' \
    <<<"$OLLAMA_TAGS" >/dev/null 2>&1; then
  echo "[lift] judge model '$EFFECTIVE_JUDGE' not loaded in Ollama — pull it first"
  exit 1
fi
# A dedicated label variable is used here: ${VAR:+env}${VAR:-config} would
# print "env" immediately followed by the model name when VAR is set.
echo "[lift] judge resolved to: $EFFECTIVE_JUDGE (from $judge_source)"

printf '%s\n' "[lift] building binaries..."
# Everything the run needs: the five daemons, the two ingest tools and the
# lift driver. All executables land in bin/.
build_targets=(
  ./cmd/storaged
  ./cmd/embedd
  ./cmd/vectord
  ./cmd/matrixd
  ./cmd/gateway
  ./scripts/staffing_workers
  ./scripts/staffing_candidates
  ./scripts/playbook_lift
)
go build -o bin/ "${build_targets[@]}"

# Best-effort: stop daemons left over from a previous run, then pause
# briefly before the new stack is launched. pkill exits non-zero when
# nothing matched, hence the `|| true`.
pkill -f "bin/(storaged|embedd|vectord|matrixd|gateway)" 2>/dev/null || true
sleep 0.3

# PIDs of the daemons launched by this script (appended to at launch time).
PIDS=()
# Private scratch directory; holds the generated stack config.
TMP="$(mktemp -d)"
CFG="$TMP/lift.toml"

#######################################
# Stop every launched daemon and remove the scratch directory.
# Safe to call more than once.
# Globals: PIDS (read), TMP (read)
#######################################
cleanup() {
  echo "[lift] cleanup"
  local pid
  # "${PIDS[@]:-}" yields a single empty word when the array is empty, which
  # keeps `set -u` happy on older bash; the -n test skips that placeholder.
  for pid in "${PIDS[@]:-}"; do
    if [ -n "$pid" ]; then
      kill "$pid" 2>/dev/null || true
    fi
  done
  # :? aborts the expansion if TMP were ever empty, so this can never
  # turn into `rm -rf ""`-style surprises.
  rm -rf -- "${TMP:?}"
}
# cleanup runs exactly once, on EXIT. INT/TERM only convert the signal into
# an exit (128+signo): a handler that merely ran cleanup would return and
# let the script carry on with its daemons killed and $TMP gone.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Emit the throw-away TOML config shared by all daemons of this run.
# The heredoc delimiter is quoted: the body contains no expansions, so it is
# written out literally.
emit_stack_config() {
  cat <<'EOF'
[gateway]
bind = "127.0.0.1:3110"
storaged_url = "http://127.0.0.1:3211"
catalogd_url = "http://127.0.0.1:3212"
ingestd_url  = "http://127.0.0.1:3213"
queryd_url   = "http://127.0.0.1:3214"
vectord_url  = "http://127.0.0.1:3215"
embedd_url   = "http://127.0.0.1:3216"
pathwayd_url = "http://127.0.0.1:3217"
matrixd_url  = "http://127.0.0.1:3218"

[vectord]
bind = "127.0.0.1:3215"
storaged_url = ""

[matrixd]
bind = "127.0.0.1:3218"
embedd_url  = "http://127.0.0.1:3216"
vectord_url = "http://127.0.0.1:3215"
EOF
}
emit_stack_config > "$CFG"

#######################################
# Poll http://127.0.0.1:<port>/health until it answers or a deadline passes.
# Arguments:
#   $1 - TCP port of the service on 127.0.0.1
#   $2 - optional timeout in whole seconds (default 5)
# Returns:
#   0 as soon as one request succeeds, 1 if the deadline passes first.
#######################################
poll_health() {
  local port="$1" timeout="${2:-5}"
  local deadline
  # Wall-clock seconds; declared separately so a failing `date` is not
  # masked by `local`'s own exit status.
  deadline=$(( $(date +%s) + timeout ))
  while (( $(date +%s) < deadline )); do
    # Each probe is capped at 1s so one hung connection cannot eat the
    # whole budget.
    if curl -sS --max-time 1 "http://127.0.0.1:$port/health" >/dev/null 2>&1; then
      return 0
    fi
    sleep 0.05
  done
  return 1
}

echo "[lift] launching stack..."

#######################################
# Start one daemon in the background and wait for its /health endpoint.
# Globals:   CFG (read), PIDS (appended)
# Arguments: $1 - binary name under ./bin (also used for /tmp/<name>.log)
#            $2 - port its /health endpoint listens on
# Exits the script with status 1 if the service never becomes healthy,
# after printing the tail of its log to stderr.
#######################################
start_service() {
  local name="$1" port="$2"
  local log="/tmp/$name.log"
  ./bin/"$name" -config "$CFG" > "$log" 2>&1 &
  # Record the PID immediately so the EXIT cleanup can kill the daemon even
  # when the health check below fails.
  PIDS+=("$!")
  if ! poll_health "$port"; then
    echo "$name failed"
    echo "[lift] $name did not answer /health on :$port — tail of $log:" >&2
    tail -n 20 "$log" >&2 || true
    exit 1
  fi
}

# Started one at a time, each gated on the previous service's health check.
start_service storaged 3211
start_service embedd   3216
start_service vectord  3215
start_service matrixd  3218
start_service gateway  3110

echo
echo "[lift] ingest workers (limit=$WORKERS_LIMIT)..."
./bin/staffing_workers -limit "$WORKERS_LIMIT"

echo
echo "[lift] ingest candidates..."
# Only grep's exit status is neutralised (grep -v returns 1 when it prints
# nothing). Wrapping the whole pipeline in `|| true` would, with pipefail,
# also hide a failed ingest and let the run continue on a partial corpus.
# NOTE(review): the filter expects "[candidates]" to be followed directly by
# "matrix"/"reality" with no space — confirm against the tool's log format.
./bin/staffing_candidates -skip-populate=false -query "warmup" 2>&1 \
  | { grep -v "^\[candidates\]\(matrix\|reality\)" || true; }

# Create the report directory up front so a fresh checkout cannot fail on
# the output path after the ingest has already been done.
mkdir -p -- "$(dirname -- "$OUT_JSON")"

echo
echo "[lift] running driver — judge=$EFFECTIVE_JUDGE · queries=$QUERIES_FILE · k=$K"
# -judge receives JUDGE_MODEL verbatim: either the explicit env override or
# the empty string. Per the driver's contract an empty -judge means "not
# set", in which case it runs its own resolution chain (env → config →
# fallback); a non-empty -judge wins over anything that chain would find.
./bin/playbook_lift \
  -config  "$CONFIG_PATH" \
  -gateway "http://127.0.0.1:3110" \
  -ollama  "http://localhost:11434" \
  -queries "$QUERIES_FILE" \
  -corpora "$CORPORA" \
  -judge   "$JUDGE_MODEL" \
  -k       "$K" \
  -out     "$OUT_JSON"

echo
echo "[lift] generating markdown report → $OUT_MD"
#######################################
# Render the human-readable markdown report from the driver's JSON evidence.
# Globals (read): RUN_ID, EFFECTIVE_JUDGE, JUDGE_MODEL, CORPORA,
#                 WORKERS_LIMIT, QUERIES_FILE, K, OUT_JSON
# Arguments:
#   $1 - path of the JSON evidence file (reads .summary and .runs)
#   $2 - path of the markdown file to write (overwritten)
# Returns:
#   non-zero if the evidence file is missing/empty or jq fails.
#######################################
generate_md() {
  local json="$1" md="$2"
  local total discovery lift no_change boosted mean_delta gen_at
  local judge_origin judge_origin_long

  if [ ! -s "$json" ]; then
    echo "[lift] evidence file missing or empty: $json" >&2
    return 1
  fi

  # Where the judge name came from. Explicit labels are required here:
  # ${JUDGE_MODEL:+env}${JUDGE_MODEL:-config} prints "env" immediately
  # followed by the model name whenever the override is set.
  if [ -n "${JUDGE_MODEL:-}" ]; then
    judge_origin="env JUDGE_MODEL"
    judge_origin_long="env JUDGE_MODEL override"
  else
    judge_origin="config [models].local_judge"
    judge_origin_long="the lakehouse.toml [models].local_judge tier"
  fi

  mkdir -p -- "$(dirname -- "$md")"

  total=$(jq -r '.summary.total' "$json")
  discovery=$(jq -r '.summary.with_discovery' "$json")
  lift=$(jq -r '.summary.lift_count' "$json")
  no_change=$(jq -r '.summary.no_change' "$json")
  boosted=$(jq -r '.summary.playbook_boosted_total' "$json")
  mean_delta=$(jq -r '.summary.mean_top1_delta_distance' "$json")
  gen_at=$(jq -r '.summary.generated_at' "$json")

  # Unquoted heredoc: ${…} is expanded, \` yields a literal backtick.
  cat > "$md" <<MDEOF
# Playbook-Lift Reality Test — Run ${RUN_ID}

**Generated:** ${gen_at}
**Judge:** \`${EFFECTIVE_JUDGE}\` (Ollama, resolved from ${judge_origin})
**Corpora:** \`${CORPORA}\`
**Workers limit:** ${WORKERS_LIMIT}
**Queries:** \`${QUERIES_FILE}\` (${total} executed)
**K per pass:** ${K}
**Evidence:** \`${OUT_JSON}\`

---

## Headline

| Metric | Value |
|---|---:|
| Total queries run | ${total} |
| Cold-pass discoveries (judge-best ≠ top-1) | ${discovery} |
| Warm-pass lifts (recorded playbook → top-1) | ${lift} |
| No change (judge-best already top-1, no playbook needed) | ${no_change} |
| Playbook boosts triggered (warm pass) | ${boosted} |
| Mean Δ top-1 distance (warm − cold) | ${mean_delta} |

**Lift rate:** ${lift} of ${discovery} discoveries became top-1 after warm pass.

---

## Per-query results

| # | Query | Cold top-1 | Cold judge-best (rank/rating) | Recorded? | Warm top-1 | Judge-best warm rank | Lift |
|---|---|---|---|---|---|---|---|
MDEOF

  # One table row per run. The query cell is truncated to 60 characters,
  # then newlines are flattened and "|" is escaped so free-text queries
  # cannot break the markdown table. split/join is used instead of gsub so
  # no regex support is required from jq.
  jq -r '.runs | to_entries[] |
    [
      (.key + 1 | tostring),
      ((.value.query // "") | .[0:60]
        | split("\n") | join(" ")
        | split("|") | join("\\|")),
      .value.cold_top1_id,
      ((.value.cold_judge_best_rank | tostring) + "/" + (.value.cold_judge_best_rating | tostring)),
      (if .value.playbook_recorded then "✓ " + (.value.playbook_target_id // "") else "—" end),
      .value.warm_top1_id,
      (.value.warm_judge_best_rank | tostring),
      (if .value.lift then "**YES**" else "no" end)
    ] | "| " + join(" | ") + " |"
  ' "$json" >> "$md"

  # Caveat 1 names EFFECTIVE_JUDGE: JUDGE_MODEL is empty whenever the judge
  # was resolved from config, which would render as an empty code span.
  cat >> "$md" <<MDEOF

---

## Honesty caveats

1. **Judge IS the ground truth proxy.** Without human-labeled relevance, the LLM
   judge's verdict is what defines "best." If \`${EFFECTIVE_JUDGE}\` rates badly,
   the lift number is meaningless. To validate the judge itself, sample 5–10
   verdicts manually and check agreement.
2. **Score-1.0 boost = distance halved.** Playbook math is
   \`distance' = distance × (1 - 0.5 × score)\`. Lift requires the judge-best
   result's pre-boost distance to be ≤ 2× the cold top-1's distance, otherwise
   even halving doesn't promote it. Tight clusters → little visible lift.
3. **Same-query replay is the cheap case.** Real lift comes from *similar but
   not identical* queries hitting a recorded playbook. This run only tests
   verbatim replay. A v2 should add paraphrase queries.
4. **Multi-corpus skew.** Default corpora=\`${CORPORA}\` — if all judge-best
   results land in one corpus, the matrix layer's purpose isn't being tested.
   Check per-corpus distribution in the JSON.
5. **Judge resolution.** This run used \`${EFFECTIVE_JUDGE}\` from
   ${judge_origin_long}.
   Bumping the judge for run #N+1 means editing one line in lakehouse.toml.

## Next moves

- If lift rate ≥ 50% of discoveries: matrix layer + playbook is doing real
  work. Move to paraphrase queries + tag-based boost (currently ignored).
- If lift rate < 20%: investigate why — judge variance, distance gap too
  wide, or playbook math too gentle. The score=1.0 / 0.5× formula may need
  retuning.
- If discovery rate (cold judge-best ≠ top-1) is itself low: cosine is
  already close to optimal on this query distribution. Either the corpus
  is too narrow or the queries are too easy.
MDEOF
}

# Render the report from the evidence the driver just wrote.
generate_md "$OUT_JSON" "$OUT_MD"

# Closing summary: blank separator line, then where the artefacts live.
printf '\n%s\n' "[lift] DONE"
printf '%s\n' \
  "[lift]   evidence:  $OUT_JSON" \
  "[lift]   report:    $OUT_MD"