The rank-based "lift" metric (warm-top-1 == cold-judge-best) doesn't
distinguish "Shape B surfaced a strictly-better answer" from "Shape B
shuffled ranks but quality is unchanged" from "Shape B replaced a good
answer with a wrong one." This commit adds Pass 4: judge the warm top-1
with the same prompt as the cold ratings, then bucket the comparison.

Implementation:
- New --with-rejudge driver flag (default off).
- New WITH_REJUDGE harness env (default 1, on for prod runs).
- queryRun gains WarmTop1Metadata (cached during Pass 2 for the rejudge
  call) + WarmTop1Rating *int (nil-distinguishable; nil = no rejudge,
  0..5 = rating).
- summary gains RejudgeAttempted, QualityLifted, QualityNeutral,
  QualityRegressed (counts of warm-rating > / == / < cold-rating).
- Markdown headline gains a Quality block when rejudge ran.
- ~21 extra judge calls (~30s on qwen2.5).

Run #005 result (split inject threshold 0.20 + paraphrase + rejudge):

  Quality lifted     5 / 21 (24%) — 3× +2 rating, 2× +1 rating
  Quality neutral   13 / 21 (62%) — includes OOD queries holding 1
  Quality regressed  3 / 21 (14%)
  Net rating delta  +3 across 21 queries (+0.14 average)

The 5 lifts all replaced a rating-2 cold answer with a rating-3 or
rating-4 warm one — Shape B took mediocre matches and substituted
substantively better ones. Two of the three regressions were small
(-1, -1); the third (-3) is the cautionary tale: Q11's cold top-1
"production line worker" (rating 4) got replaced by Q1's recorded
"forklift OSHA-30 operator" e-5729 (rating 1). That is adjacent-domain
cross-pollination — production worker and forklift operator embed
within 0.20 cosine because both are warehouse-adjacent staffing
queries, even though the judge correctly distinguishes them. The
split-threshold defense (0.5 boost / 0.20 inject) catches OOD
cross-pollination (Q19/Q20/Q21 all stayed neutral at rating 1) but
not adjacent-domain cross-pollination.
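The Pass 4 buckets can be recomputed straight from the evidence JSON. A minimal jq sketch, assuming illustrative per-query keys (`cold_rating`, `warm_rating`) rather than the driver's actual field names:

```shell
# Bucket per-query rating deltas into lifted / neutral / regressed and
# sum the net delta. Sample data only; a real run's evidence JSON lives
# under reports/reality-tests/.
cat > /tmp/rejudge_sample.json <<'EOF'
{"runs":[{"cold_rating":2,"warm_rating":4},
         {"cold_rating":2,"warm_rating":3},
         {"cold_rating":4,"warm_rating":1},
         {"cold_rating":1,"warm_rating":1}]}
EOF
jq -r '[.runs[] | .warm_rating - .cold_rating]
  | "lifted=\(map(select(. > 0)) | length) neutral=\(map(select(. == 0)) | length) regressed=\(map(select(. < 0)) | length) net=\(add)"' \
  /tmp/rejudge_sample.json
```

On this sample it prints `lifted=2 neutral=1 regressed=1 net=0`; pointed at a real run's JSON (with the driver's actual keys substituted), it should reproduce the summary counts.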
Net product verdict: working, net-positive on quality, but the worst
case (Q11 4→1) is customer-visible and warrants a tighter inject
threshold OR an additional gate beyond cosine distance. Filed in
STATE_OF_PLAY OPEN as a follow-up.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
469 lines
19 KiB
Bash
Executable File
#!/usr/bin/env bash
# Playbook-lift reality test — measure whether the 5-loop substrate
# (matrix retrieve+merge + playbook + small-model judge) actually beats
# raw cosine on staffing queries.
#
# Pipeline:
#   1. Boot the full Go HTTP stack (storaged, catalogd, ingestd, queryd,
#      embedd, vectord, pathwayd, observerd, matrixd, gateway). Earlier
#      versions booted only the 5 daemons matrix.search needs, which
#      gave a falsely clean "everything works" signal — we now exercise
#      the prod-realistic daemon graph so daemons that observe (observerd)
#      or persist (pathwayd) are actually in the loop.
#   2. SQL surface probe — ingest a 3-row CSV via /v1/ingest (catalogd
#      → ingestd → queryd refresh), assert SELECT COUNT(*)=3. Proves the
#      ingestd→catalogd→queryd path is wired even though the lift driver
#      itself is vector-only retrieval.
#   3. Ingest workers (default 5000) + candidates corpora into vectord
#   4. Run the playbook_lift driver: cold pass → judge → record →
#      warm pass → measure
#   5. Generate markdown report from the JSON evidence
#
# Output:
#   reports/reality-tests/playbook_lift_<N>.json — raw evidence
#   reports/reality-tests/playbook_lift_<N>.md   — human report
#
# Requires: Ollama on :11434 with nomic-embed-text + the judge model
# loaded. Skips (exit 0) if Ollama is absent.
#
# Usage:
#   ./scripts/playbook_lift.sh                # run #001 with defaults
#   RUN_ID=002 ./scripts/playbook_lift.sh     # explicit run id
#   JUDGE_MODEL=qwen2.5:latest ./scripts/playbook_lift.sh
#   WORKERS_LIMIT=2000 ./scripts/playbook_lift.sh

set -euo pipefail
cd "$(dirname "$0")/.."

export PATH="$PATH:/usr/local/go/bin"

RUN_ID="${RUN_ID:-001}"
# JUDGE_MODEL: empty means "let the Go driver resolve from
# lakehouse.toml [models].local_judge". Set explicitly to override.
JUDGE_MODEL="${JUDGE_MODEL:-}"
WORKERS_LIMIT="${WORKERS_LIMIT:-5000}"
QUERIES_FILE="${QUERIES_FILE:-tests/reality/playbook_lift_queries.txt}"
CORPORA="${CORPORA:-workers,ethereal_workers}"
K="${K:-10}"
CONFIG_PATH="${CONFIG_PATH:-lakehouse.toml}"
# WITH_PARAPHRASE=1 (default) adds a Pass 3 — for each query whose
# Pass 1 cold pass recorded a playbook, generate a paraphrase via the
# judge and re-query with playbook=true. The paraphrase pass is the
# actual learning-property test (does cosine on paraphrase find the
# recorded entry?). Set WITH_PARAPHRASE=0 for a faster verbatim-only run.
WITH_PARAPHRASE="${WITH_PARAPHRASE:-1}"
# WITH_REJUDGE=1 (default) adds a Pass 4 — judge warm top-1 to measure
# quality lift (warm rating vs cold rating). Catches cases where Shape B
# surfaces a different-but-equally-good answer (which the rank-based
# lift metric misses). +21 judge calls (~30s on qwen2.5).
WITH_REJUDGE="${WITH_REJUDGE:-1}"

OUT_JSON="reports/reality-tests/playbook_lift_${RUN_ID}.json"
OUT_MD="reports/reality-tests/playbook_lift_${RUN_ID}.md"

if ! curl -sS --max-time 3 http://localhost:11434/api/tags >/dev/null 2>&1; then
  echo "[lift] Ollama not reachable on :11434 — skipping"
  exit 0
fi

# Resolve judge from config when not set explicitly — needed for the
# Ollama model-presence check below. Mirrors the Go driver's priority.
EFFECTIVE_JUDGE="$JUDGE_MODEL"
if [ -z "$EFFECTIVE_JUDGE" ] && [ -f "$CONFIG_PATH" ]; then
  EFFECTIVE_JUDGE="$(grep -E '^local_judge\s*=' "$CONFIG_PATH" | head -1 | sed -E 's/.*=\s*"([^"]+)".*/\1/')"
fi
EFFECTIVE_JUDGE="${EFFECTIVE_JUDGE:-qwen2.5:latest}"

if ! curl -sS http://localhost:11434/api/tags | jq -e --arg m "$EFFECTIVE_JUDGE" \
    '.models[] | select(.name == $m)' >/dev/null 2>&1; then
  echo "[lift] judge model '$EFFECTIVE_JUDGE' not loaded in Ollama — pull it first"
  exit 1
fi
# Compute a single string for "where did the judge come from" so the
# log line + the markdown report don't have to chain :+/:- substitutions
# (those silently fuse "env JUDGE_MODEL" + the value into "env JUDGE_MODELx"
# without a separator — the bug Opus caught on lift_001's report).
if [ -n "$JUDGE_MODEL" ]; then
  JUDGE_SOURCE="env JUDGE_MODEL=${JUDGE_MODEL}"
else
  JUDGE_SOURCE="config [models].local_judge"
fi
echo "[lift] judge resolved to: $EFFECTIVE_JUDGE (from $JUDGE_SOURCE)"
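# Illustration of the pitfall above (hypothetical values, not from any run):
# chaining :+/:- fuses label and value with no separator, e.g.
#   "${JUDGE_MODEL:+env JUDGE_MODEL}${JUDGE_MODEL:-config [models].local_judge}"
# with JUDGE_MODEL=qwen2.5:latest expands to "env JUDGE_MODELqwen2.5:latest",
# which is the lift_001 report bug the explicit if/else avoids.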

echo "[lift] building binaries..."
go build -o bin/ ./cmd/storaged ./cmd/catalogd ./cmd/ingestd ./cmd/queryd \
  ./cmd/embedd ./cmd/vectord ./cmd/pathwayd ./cmd/observerd \
  ./cmd/matrixd ./cmd/gateway \
  ./scripts/staffing_workers ./scripts/staffing_candidates \
  ./scripts/playbook_lift

# Anchor pkill to bin/<name>$ so we don't accidentally hit unrelated
# binaries — and exclude chatd (independent of retrieval, stays up).
pkill -f "bin/(storaged|catalogd|ingestd|queryd|embedd|vectord|pathwayd|observerd|matrixd|gateway)" 2>/dev/null || true
sleep 0.3

PIDS=()
TMP="$(mktemp -d)"
CFG="$TMP/lift.toml"

cleanup() {
  echo "[lift] cleanup"
  for p in "${PIDS[@]:-}"; do [ -n "${p:-}" ] && kill "$p" 2>/dev/null || true; done
  rm -rf "$TMP"
}
trap cleanup EXIT INT TERM

cat > "$CFG" <<EOF
# [s3] tells storaged which bucket to talk to. Without it, defaults
# resolve to "lakehouse-primary" (no -go-) which doesn't exist on this
# box and catalogd's rehydrate fails with NoSuchBucket. Access keys
# come from the secrets file (storaged -secrets defaults to
# /etc/lakehouse/secrets-go.toml), not this temp toml.
[s3]
endpoint = "http://localhost:9000"
region = "us-east-1"
bucket = "lakehouse-go-primary"
use_path_style = true

[gateway]
bind = "127.0.0.1:3110"
storaged_url = "http://127.0.0.1:3211"
catalogd_url = "http://127.0.0.1:3212"
ingestd_url = "http://127.0.0.1:3213"
queryd_url = "http://127.0.0.1:3214"
vectord_url = "http://127.0.0.1:3215"
embedd_url = "http://127.0.0.1:3216"
pathwayd_url = "http://127.0.0.1:3217"
matrixd_url = "http://127.0.0.1:3218"
observerd_url = "http://127.0.0.1:3219"

[storaged]
bind = "127.0.0.1:3211"

[catalogd]
bind = "127.0.0.1:3212"
storaged_url = "http://127.0.0.1:3211"

[ingestd]
bind = "127.0.0.1:3213"
storaged_url = "http://127.0.0.1:3211"
catalogd_url = "http://127.0.0.1:3212"
max_ingest_bytes = 268435456

[queryd]
bind = "127.0.0.1:3214"
catalogd_url = "http://127.0.0.1:3212"
secrets_path = "/etc/lakehouse/secrets-go.toml"
# Aggressive refresh so the SQL probe table appears within ~1s of
# ingestd registering it, instead of the prod default 30s.
refresh_every = "1s"

[embedd]
bind = "127.0.0.1:3216"
provider_url = "http://localhost:11434"
default_model = "nomic-embed-text"

[vectord]
bind = "127.0.0.1:3215"
storaged_url = ""

[pathwayd]
bind = "127.0.0.1:3217"
persist_path = ""

[observerd]
bind = "127.0.0.1:3219"
persist_path = ""

[matrixd]
bind = "127.0.0.1:3218"
embedd_url = "http://127.0.0.1:3216"
vectord_url = "http://127.0.0.1:3215"
EOF

poll_health() {
  local port="$1" deadline=$(($(date +%s) + 5))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    if curl -sS --max-time 1 "http://127.0.0.1:$port/health" >/dev/null 2>&1; then return 0; fi
    sleep 0.05
  done
  return 1
}

echo "[lift] launching stack (10 daemons; chatd stays up independently)..."
# Order respects dependencies: storaged → catalogd (needs storaged) →
# ingestd (needs storaged+catalogd) → queryd (needs catalogd) → embedd →
# vectord → pathwayd → observerd → matrixd (needs embedd+vectord) →
# gateway (needs all of them).
./bin/storaged -config "$CFG" > /tmp/storaged.log 2>&1 & PIDS+=($!)
poll_health 3211 || { echo "storaged failed"; exit 1; }
./bin/catalogd -config "$CFG" > /tmp/catalogd.log 2>&1 & PIDS+=($!)
poll_health 3212 || { echo "catalogd failed"; exit 1; }
./bin/ingestd -config "$CFG" > /tmp/ingestd.log 2>&1 & PIDS+=($!)
poll_health 3213 || { echo "ingestd failed"; exit 1; }
./bin/queryd -config "$CFG" > /tmp/queryd.log 2>&1 & PIDS+=($!)
poll_health 3214 || { echo "queryd failed"; exit 1; }
./bin/embedd -config "$CFG" > /tmp/embedd.log 2>&1 & PIDS+=($!)
poll_health 3216 || { echo "embedd failed"; exit 1; }
./bin/vectord -config "$CFG" > /tmp/vectord.log 2>&1 & PIDS+=($!)
poll_health 3215 || { echo "vectord failed"; exit 1; }
./bin/pathwayd -config "$CFG" > /tmp/pathwayd.log 2>&1 & PIDS+=($!)
poll_health 3217 || { echo "pathwayd failed"; exit 1; }
./bin/observerd -config "$CFG" > /tmp/observerd.log 2>&1 & PIDS+=($!)
poll_health 3219 || { echo "observerd failed"; exit 1; }
./bin/matrixd -config "$CFG" > /tmp/matrixd.log 2>&1 & PIDS+=($!)
poll_health 3218 || { echo "matrixd failed"; exit 1; }
./bin/gateway -config "$CFG" > /tmp/gateway.log 2>&1 & PIDS+=($!)
poll_health 3110 || { echo "gateway failed"; exit 1; }

echo
echo "[lift] SQL surface probe — ingest 3-row CSV, assert SELECT COUNT(*)=3..."
PROBE_CSV="$TMP/sql_probe.csv"
cat > "$PROBE_CSV" <<CSVEOF
id,name,role
1,Alice,Forklift Operator
2,Bob,Production Worker
3,Charlie,Warehouse Associate
CSVEOF
INGEST_RESP="$(curl -sS -F "file=@$PROBE_CSV" "http://127.0.0.1:3110/v1/ingest?name=lift_sql_probe")"
echo "[lift] ingest response: $INGEST_RESP"
# Poll up to 5s for queryd to discover the manifest. refresh_every=1s
# is a lower bound; under load or slow disks the manifest may not be
# visible in a fixed sleep, which would 4xx the SQL probe spuriously.
PROBE_COUNT=ERR
SQL_RESP=""
deadline=$(($(date +%s) + 5))
while [ "$(date +%s)" -lt "$deadline" ]; do
  SQL_RESP="$(curl -sS -X POST http://127.0.0.1:3110/v1/sql \
    -H 'content-type: application/json' \
    -d '{"sql":"SELECT COUNT(*) FROM lift_sql_probe"}')"
  PROBE_COUNT="$(echo "$SQL_RESP" | jq -r '.rows[0][0] // "ERR"' 2>/dev/null || echo "ERR")"
  [ "$PROBE_COUNT" = "3" ] && break
  sleep 0.25
done
if [ "$PROBE_COUNT" = "3" ]; then
  echo "[lift] ✓ SQL surface probe passed (rowcount=3)"
else
  echo "[lift] ✗ SQL surface probe FAILED after 5s (got: $SQL_RESP)"
  exit 1
fi

echo
echo "[lift] ingest workers (limit=$WORKERS_LIMIT)..."
./bin/staffing_workers -limit "$WORKERS_LIMIT"

echo
echo "[lift] ingest ethereal_workers (10K, second staffing-domain corpus)..."
# ethereal_workers is the right second corpus for staffing-domain reality
# tests: same schema as workers_500k but a different population (Material
# Handlers, Admin Assistants, etc.) so the matrix layer's multi-corpus
# retrieve+merge actually has TWO relevant corpora to compose against.
# Earlier versions used scripts/staffing_candidates against the SWE-tech
# candidates parquet (Swift/iOS, Scala/Spark, Rust/DataFusion) — wrong
# domain for staffing queries; effectively dead-corpus noise.
# id-prefix "e-" prevents collisions with workers' "w-" since both files
# count worker_id from 1.
./bin/staffing_workers \
  -parquet "/home/profit/lakehouse/data/datasets/ethereal_workers.parquet" \
  -index-name ethereal_workers \
  -id-prefix "e-" \
  -limit 0

echo
echo "[lift] running driver — judge=$EFFECTIVE_JUDGE · queries=$QUERIES_FILE · k=$K"
# -judge "$JUDGE_MODEL" passes either the explicit env override or
# the empty string. The Go driver treats empty -judge as "not set"
# and runs its own resolution chain (env → config → fallback). When
# JUDGE_MODEL IS set, the explicit -judge wins inside the Go driver
# regardless of what its env-lookup would find — flag wins by design.
EXTRA_FLAGS=""
if [ "$WITH_PARAPHRASE" = "1" ]; then
  EXTRA_FLAGS="$EXTRA_FLAGS -with-paraphrase"
fi
if [ "$WITH_REJUDGE" = "1" ]; then
  EXTRA_FLAGS="$EXTRA_FLAGS -with-rejudge"
fi
./bin/playbook_lift \
  -config "$CONFIG_PATH" \
  -gateway "http://127.0.0.1:3110" \
  -ollama "http://localhost:11434" \
  -queries "$QUERIES_FILE" \
  -corpora "$CORPORA" \
  -judge "$JUDGE_MODEL" \
  -k "$K" \
  -out "$OUT_JSON" \
  $EXTRA_FLAGS

echo
echo "[lift] generating markdown report → $OUT_MD"
generate_md() {
  local json="$1" md="$2"
  local total discovery lift no_change boosted mean_delta gen_at
  local p_attempted p_top1 p_anyrank p_block rj_attempted q_lifted q_neutral q_regressed rj_block
  total=$(jq -r '.summary.total' "$json")
  discovery=$(jq -r '.summary.with_discovery' "$json")
  lift=$(jq -r '.summary.lift_count' "$json")
  no_change=$(jq -r '.summary.no_change' "$json")
  boosted=$(jq -r '.summary.playbook_boosted_total' "$json")
  mean_delta=$(jq -r '.summary.mean_top1_delta_distance' "$json")
  gen_at=$(jq -r '.summary.generated_at' "$json")
  p_attempted=$(jq -r '.summary.paraphrase_attempted // 0' "$json")
  p_top1=$(jq -r '.summary.paraphrase_top1_lifts // 0' "$json")
  p_anyrank=$(jq -r '.summary.paraphrase_any_rank_hits // 0' "$json")
  rj_attempted=$(jq -r '.summary.rejudge_attempted // 0' "$json")
  q_lifted=$(jq -r '.summary.quality_lifted // 0' "$json")
  q_neutral=$(jq -r '.summary.quality_neutral // 0' "$json")
  q_regressed=$(jq -r '.summary.quality_regressed // 0' "$json")

  # Only emit the paraphrase block when --with-paraphrase actually ran
  # (i.e. .summary.paraphrase_attempted > 0). For verbatim-only runs we
  # leave the headline clean.
  p_block=""
  if [ "$p_attempted" != "0" ] && [ "$p_attempted" != "null" ]; then
    p_block="| **Paraphrase pass — recorded answer at rank 0 (top-1)** | **${p_top1} / ${p_attempted}** |
| Paraphrase pass — recorded answer at any rank in top-K | ${p_anyrank} / ${p_attempted} |"
  fi

  rj_block=""
  if [ "$rj_attempted" != "0" ] && [ "$rj_attempted" != "null" ]; then
    rj_block="| **Quality lift** (warm top-1 rating > cold top-1 rating) | **${q_lifted} / ${rj_attempted}** |
| Quality neutral (warm top-1 rating = cold top-1 rating) | ${q_neutral} / ${rj_attempted} |
| Quality regressed (warm top-1 rating < cold top-1 rating) | ${q_regressed} / ${rj_attempted} |"
  fi

  cat > "$md" <<MDEOF
# Playbook-Lift Reality Test — Run ${RUN_ID}

**Generated:** ${gen_at}
**Judge:** \`${EFFECTIVE_JUDGE}\` (Ollama, resolved from ${JUDGE_SOURCE})
**Corpora:** \`${CORPORA}\`
**Workers limit:** ${WORKERS_LIMIT}
**Queries:** \`${QUERIES_FILE}\` (${total} executed)
**K per pass:** ${K}
**Paraphrase pass:** $([ "$WITH_PARAPHRASE" = "1" ] && echo "ENABLED" || echo "disabled")
**Re-judge pass:** $([ "$WITH_REJUDGE" = "1" ] && echo "ENABLED" || echo "disabled")
**Evidence:** \`${OUT_JSON}\`

---

## Headline

| Metric | Value |
|---|---:|
| Total queries run | ${total} |
| Cold-pass discoveries (judge-best ≠ top-1) | ${discovery} |
| Warm-pass lifts (recorded playbook → top-1) | ${lift} |
| No change (judge-best already top-1, no playbook needed) | ${no_change} |
| Playbook boosts triggered (warm pass) | ${boosted} |
| Mean Δ top-1 distance (warm − cold) | ${mean_delta} |
${p_block}
${rj_block}

**Verbatim lift rate:** ${lift} of ${discovery} discoveries became top-1 after warm pass.

---

## Per-query results

| # | Query | Cold top-1 | Cold judge-best (rank/rating) | Recorded? | Warm top-1 | Judge-best warm rank | Lift |
|---|---|---|---|---|---|---|---|
MDEOF

  jq -r '.runs | to_entries[] |
    [
      (.key + 1 | tostring),
      (.value.query | .[0:60]),
      .value.cold_top1_id,
      ((.value.cold_judge_best_rank | tostring) + "/" + (.value.cold_judge_best_rating | tostring)),
      (if .value.playbook_recorded then "✓ " + (.value.playbook_target_id // "") else "—" end),
      .value.warm_top1_id,
      (.value.warm_judge_best_rank | tostring),
      (if .value.lift then "**YES**" else "no" end)
    ] | "| " + join(" | ") + " |"
  ' "$json" >> "$md"

  # Paraphrase per-query table — only emit when the pass ran, and only
  # for queries where Pass 1 recorded a playbook (others have no
  # paraphrase_query field).
  if [ "$p_attempted" != "0" ] && [ "$p_attempted" != "null" ]; then
    cat >> "$md" <<MDEOF

---

## Paraphrase pass — does the playbook help similar-but-different queries?

For each query whose Pass 1 cold pass recorded a playbook entry, the
judge model rephrased the query, and the rephrased version was sent
through warm matrix.search. The recorded answer ID's rank in those
results tests whether cosine on the embedded paraphrase finds the
recorded query's vector.

| # | Original (≤40c) | Paraphrase (≤60c) | Recorded answer | Paraphrase top-1 | Recorded rank | Paraphrase lift |
|---|---|---|---|---|---|---|
MDEOF
    jq -r '.runs | to_entries[] |
      select(.value.playbook_recorded == true and (.value.paraphrase_query // "") != "") |
      [
        (.key + 1 | tostring),
        (.value.query | .[0:40]),
        ((.value.paraphrase_query // "") | .[0:60]),
        (.value.playbook_target_id // "—"),
        (.value.paraphrase_top1_id // "—"),
        (.value.paraphrase_recorded_rank | tostring),
        (if .value.paraphrase_lift then "**YES**" else "no" end)
      ] | "| " + join(" | ") + " |"
    ' "$json" >> "$md"
  fi

  cat >> "$md" <<MDEOF

---

## Honesty caveats

1. **Judge IS the ground truth proxy.** Without human-labeled relevance, the LLM
   judge's verdict is what defines "best." If \`${EFFECTIVE_JUDGE}\` rates badly,
   the lift number is meaningless. To validate the judge itself, sample 5–10
   verdicts manually and check agreement.
2. **Score-1.0 boost = distance halved.** Playbook math is
   \`distance' = distance × (1 - 0.5 × score)\`. Lift requires the judge-best
   result's pre-boost distance to be ≤ 2× the cold top-1's distance, otherwise
   even halving doesn't promote it (e.g. with cold top-1 at distance 0.30, a
   judge-best at 0.55 halves to 0.275 and is promoted, while one at 0.65 only
   reaches 0.325 and stays behind). Tight clusters → little visible lift.
3. **Verbatim vs paraphrase.** The verbatim lift rate (above) is the cheap
   case — same query, recorded playbook, expected boost. The paraphrase
   pass (when enabled) is the actual learning property: similar-but-different
   queries hitting a recorded playbook. Compare verbatim and paraphrase
   lift rates — paraphrase should be lower (semantic-distance gates some
   playbook hits) but non-zero is the meaningful signal.
4. **Multi-corpus skew.** Default corpora=\`${CORPORA}\` — if all judge-best
   results land in one corpus, the matrix layer's purpose isn't being tested.
   Check per-corpus distribution in the JSON.
5. **Judge resolution.** This run used \`${EFFECTIVE_JUDGE}\` from
   ${JUDGE_SOURCE}.
   Bumping the judge for run #N+1 means editing one line in lakehouse.toml.
6. **Paraphrase generation also uses the judge.** The same model that rates
   relevance also rephrases queries. A judge that's bad at rating staffing
   queries is probably also bad at rephrasing them. Worth sanity-checking
   a sample of \`paraphrase_query\` values in the JSON before trusting the
   paraphrase lift number.

## Next moves

- If lift rate ≥ 50% of discoveries: matrix layer + playbook is doing real
  work. Move to paraphrase queries + tag-based boost (currently ignored).
- If lift rate < 20%: investigate why — judge variance, distance gap too
  wide, or playbook math too gentle. The score=1.0 / 0.5× formula may need
  retuning.
- If discovery rate (cold judge-best ≠ top-1) is itself low: cosine is
  already close to optimal on this query distribution. Either the corpus
  is too narrow or the queries are too easy.
MDEOF
}

generate_md "$OUT_JSON" "$OUT_MD"

echo
echo "[lift] DONE"
echo "[lift] evidence: $OUT_JSON"
echo "[lift] report: $OUT_MD"