#!/usr/bin/env bash
# Playbook-lift reality test — measure whether the 5-loop substrate
# (matrix retrieve+merge + playbook + small-model judge) actually beats
# raw cosine on staffing queries.
#
# Pipeline:
#   1. Boot the full Go HTTP stack (storaged, catalogd, ingestd, queryd,
#      embedd, vectord, pathwayd, observerd, matrixd, gateway). Earlier
#      versions booted only the 5 daemons matrix.search needs, which
#      gave a falsely clean "everything works" signal — we now exercise
#      the prod-realistic daemon graph so daemons that observe (observerd)
#      or persist (pathwayd) are actually in the loop.
#   2. SQL surface probe — ingest a 3-row CSV via /v1/ingest (catalogd
#      → ingestd → queryd refresh), assert SELECT COUNT(*)=3. Proves the
#      ingestd→catalogd→queryd path is wired even though the lift driver
#      itself is vector-only retrieval.
#   3. Ingest workers (default 5000) + ethereal_workers corpora into vectord
#   4. Run the playbook_lift driver: cold pass → judge → record →
#      warm pass → measure
#   5. Generate markdown report from the JSON evidence
#
# Output:
#   reports/reality-tests/playbook_lift_${RUN_ID}.json — raw evidence
#   reports/reality-tests/playbook_lift_${RUN_ID}.md   — human report
#
# Requires: Ollama on :11434 with nomic-embed-text + the judge model
# loaded. Skips (exit 0) if Ollama is absent.
#
# Usage:
#   ./scripts/playbook_lift.sh                             # run #001 with defaults
#   RUN_ID=002 ./scripts/playbook_lift.sh                  # explicit run id
#   JUDGE_MODEL=qwen2.5:latest ./scripts/playbook_lift.sh
#   WORKERS_LIMIT=2000 ./scripts/playbook_lift.sh

# Strict mode: abort on command failure, on use of an unset variable, and
# on a failure in any stage of a pipeline.
set -euo pipefail

# Run from the directory one level above the script's own directory, so the
# relative paths used further down (bin/, cmd/, scripts/, reports/) resolve
# no matter where the script was invoked from.
cd "$(dirname "$0")/.."

# Make the Go toolchain in /usr/local/go/bin reachable; appended last so a
# `go` already on PATH keeps priority.
export PATH="$PATH:/usr/local/go/bin"

# Run identifier embedded in the report file names; defaults to 001.
RUN_ID="${RUN_ID:-001}"

# JUDGE_MODEL: empty means "let the Go driver resolve from
# lakehouse.toml [models].local_judge". Set explicitly to override.
# Run settings, preflight checks, build and stack bootstrap.
#   - JUDGE_MODEL, WORKERS_LIMIT, QUERIES_FILE, CORPORA, K, CONFIG_PATH,
#     WITH_PARAPHRASE and WITH_REJUDGE take their value from the environment
#     and fall back to the defaults written here; OUT_JSON / OUT_MD are
#     derived from RUN_ID.
#   - Exit 0 when Ollama does not answer on :11434; exit 1 when the resolved
#     judge model (JUDGE_MODEL, else local_judge from $CONFIG_PATH, else
#     qwen3.5:latest) is not listed by Ollama's /api/tags.
#   - Build the daemons and helper binaries into bin/, pkill already-running
#     daemons matching the bin/ names, create a temp dir holding the
#     generated config, and register cleanup() on EXIT/INT/TERM to kill
#     every PID collected in PIDS and remove that temp dir.
#   - Launch storaged in the background, logging to /tmp/storaged.log.
# NOTE(review): after `cat > "$CFG" <` the text continues with the tail of
# a health-poll loop, so the here-document body and the opening of
# poll_health() look to be missing here — confirm against version control.
# NOTE(review): commands below share a physical line with `#` comments, so
# the shell would read them as comment text — confirm the original line
# breaks before relying on this section.
JUDGE_MODEL="${JUDGE_MODEL:-}" WORKERS_LIMIT="${WORKERS_LIMIT:-5000}" QUERIES_FILE="${QUERIES_FILE:-tests/reality/playbook_lift_queries.txt}" CORPORA="${CORPORA:-workers,ethereal_workers}" K="${K:-10}" CONFIG_PATH="${CONFIG_PATH:-lakehouse.toml}" # WITH_PARAPHRASE=1 (default) adds a Pass 3 — for each query whose # Pass 1 cold pass recorded a playbook, generate a paraphrase via the # judge and re-query with playbook=true. The paraphrase pass is the # actual learning-property test (does cosine on paraphrase find the # recorded entry?). Set WITH_PARAPHRASE=0 for a faster verbatim-only run. WITH_PARAPHRASE="${WITH_PARAPHRASE:-1}" # WITH_REJUDGE=1 (default) adds a Pass 4 — judge warm top-1 to measure # quality lift (warm rating vs cold rating). Catches cases where Shape B # surfaces a different-but-equally-good answer (which the rank-based # lift metric misses). +21 judge calls (~30s on qwen2.5). WITH_REJUDGE="${WITH_REJUDGE:-1}" OUT_JSON="reports/reality-tests/playbook_lift_${RUN_ID}.json" OUT_MD="reports/reality-tests/playbook_lift_${RUN_ID}.md" if ! curl -sS --max-time 3 http://localhost:11434/api/tags >/dev/null 2>&1; then echo "[lift] Ollama not reachable on :11434 — skipping" exit 0 fi # Resolve judge from config when not set explicitly — needed for the # Ollama model-presence check below. Mirrors the Go driver's priority. EFFECTIVE_JUDGE="$JUDGE_MODEL" if [ -z "$EFFECTIVE_JUDGE" ] && [ -f "$CONFIG_PATH" ]; then EFFECTIVE_JUDGE="$(grep -E '^local_judge\s*=' "$CONFIG_PATH" | head -1 | sed -E 's/.*=\s*"([^"]+)".*/\1/')" fi EFFECTIVE_JUDGE="${EFFECTIVE_JUDGE:-qwen3.5:latest}" if ! 
curl -sS http://localhost:11434/api/tags | jq -e --arg m "$EFFECTIVE_JUDGE" \ '.models[] | select(.name == $m)' >/dev/null 2>&1; then echo "[lift] judge model '$EFFECTIVE_JUDGE' not loaded in Ollama — pull it first" exit 1 fi # Compute a single string for "where did the judge come from" so the # log line + the markdown report don't have to chain :+/:- substitutions # (those silently fuse "env JUDGE_MODEL" + the value into "env JUDGE_MODELx" # without a separator — the bug Opus caught on lift_001's report). if [ -n "$JUDGE_MODEL" ]; then JUDGE_SOURCE="env JUDGE_MODEL=${JUDGE_MODEL}" else JUDGE_SOURCE="config [models].local_judge" fi echo "[lift] judge resolved to: $EFFECTIVE_JUDGE (from $JUDGE_SOURCE)" echo "[lift] building binaries..." go build -o bin/ ./cmd/storaged ./cmd/catalogd ./cmd/ingestd ./cmd/queryd \ ./cmd/embedd ./cmd/vectord ./cmd/pathwayd ./cmd/observerd \ ./cmd/matrixd ./cmd/gateway \ ./scripts/staffing_workers ./scripts/staffing_candidates \ ./scripts/playbook_lift # Anchor pkill to bin/$ so we don't accidentally hit unrelated # binaries — and exclude chatd (independent of retrieval, stays up). pkill -f "bin/(storaged|catalogd|ingestd|queryd|embedd|vectord|pathwayd|observerd|matrixd|gateway)" 2>/dev/null || true sleep 0.3 PIDS=() TMP="$(mktemp -d)" CFG="$TMP/lift.toml" cleanup() { echo "[lift] cleanup" for p in "${PIDS[@]:-}"; do [ -n "${p:-}" ] && kill "$p" 2>/dev/null || true; done rm -rf "$TMP" } trap cleanup EXIT INT TERM cat > "$CFG" </dev/null 2>&1; then return 0; fi sleep 0.05 done return 1 } echo "[lift] launching stack (10 daemons; chatd stays up independently)..." # Order respects dependencies: storaged → catalogd (needs storaged) → # ingestd (needs storaged+catalogd) → queryd (needs catalogd) → embedd → # vectord → pathwayd → observerd → matrixd (needs embedd+vectord) → # gateway (needs all of them). ./bin/storaged -config "$CFG" > /tmp/storaged.log 2>&1 & PIDS+=($!) 
# Daemon health checks and launches, SQL surface probe, corpus ingest and
# driver run.
#   - After the health check for storaged (port 3211), catalogd, ingestd,
#     queryd, embedd, vectord, pathwayd, observerd, matrixd and gateway are
#     each started in the background with output in /tmp/NAME.log and their
#     PID appended to PIDS; the script exits 1 as soon as poll_health fails
#     for one of them.
#   - The SQL probe passes only when PROBE_COUNT equals "3"; otherwise the
#     failure is reported together with SQL_RESP and the script exits 1.
#   - bin/staffing_workers ingests the workers corpus (-limit
#     $WORKERS_LIMIT) and then ethereal_workers (-limit 0, -id-prefix "e-").
#   - bin/playbook_lift is pointed at the gateway on 127.0.0.1:3110 and is
#     passed -out $OUT_JSON; -with-paraphrase / -with-rejudge are added when
#     WITH_PARAPHRASE / WITH_REJUDGE equal "1".
#   - The definition of generate_md begins at the end of this span and
#     continues past it.
# NOTE(review): after `cat > "$PROBE_CSV" <` the here-document body, the
# ingest request and the head of the polling loop look to be missing here —
# confirm against version control.
# NOTE(review): the ethereal_workers parquet path is absolute
# (/home/profit/...), presumably specific to one machine — consider an
# environment override.
# NOTE(review): $EXTRA_FLAGS is expanded unquoted so that it word-splits
# into separate flags; an array would make that explicit.
# NOTE(review): commands below share a physical line with `#` comments, so
# the shell would read them as comment text — confirm the original line
# breaks before relying on this section.
poll_health 3211 || { echo "storaged failed"; exit 1; } ./bin/catalogd -config "$CFG" > /tmp/catalogd.log 2>&1 & PIDS+=($!) poll_health 3212 || { echo "catalogd failed"; exit 1; } ./bin/ingestd -config "$CFG" > /tmp/ingestd.log 2>&1 & PIDS+=($!) poll_health 3213 || { echo "ingestd failed"; exit 1; } ./bin/queryd -config "$CFG" > /tmp/queryd.log 2>&1 & PIDS+=($!) poll_health 3214 || { echo "queryd failed"; exit 1; } ./bin/embedd -config "$CFG" > /tmp/embedd.log 2>&1 & PIDS+=($!) poll_health 3216 || { echo "embedd failed"; exit 1; } ./bin/vectord -config "$CFG" > /tmp/vectord.log 2>&1 & PIDS+=($!) poll_health 3215 || { echo "vectord failed"; exit 1; } ./bin/pathwayd -config "$CFG" > /tmp/pathwayd.log 2>&1 & PIDS+=($!) poll_health 3217 || { echo "pathwayd failed"; exit 1; } ./bin/observerd -config "$CFG" > /tmp/observerd.log 2>&1 & PIDS+=($!) poll_health 3219 || { echo "observerd failed"; exit 1; } ./bin/matrixd -config "$CFG" > /tmp/matrixd.log 2>&1 & PIDS+=($!) poll_health 3218 || { echo "matrixd failed"; exit 1; } ./bin/gateway -config "$CFG" > /tmp/gateway.log 2>&1 & PIDS+=($!) poll_health 3110 || { echo "gateway failed"; exit 1; } echo echo "[lift] SQL surface probe — ingest 3-row CSV, assert SELECT COUNT(*)=3..." PROBE_CSV="$TMP/sql_probe.csv" cat > "$PROBE_CSV" </dev/null || echo "ERR")" [ "$PROBE_COUNT" = "3" ] && break sleep 0.25 done if [ "$PROBE_COUNT" = "3" ]; then echo "[lift] ✓ SQL surface probe passed (rowcount=3)" else echo "[lift] ✗ SQL surface probe FAILED after 5s (got: $SQL_RESP)" exit 1 fi echo echo "[lift] ingest workers (limit=$WORKERS_LIMIT)..." ./bin/staffing_workers -limit "$WORKERS_LIMIT" echo echo "[lift] ingest ethereal_workers (10K, second staffing-domain corpus)..." # ethereal_workers is the right second corpus for staffing-domain reality # tests: same schema as workers_500k but a different population (Material # Handlers, Admin Assistants, etc.) 
so the matrix layer's multi-corpus # retrieve+merge actually has TWO relevant corpora to compose against. # Earlier versions used scripts/staffing_candidates against the SWE-tech # candidates parquet (Swift/iOS, Scala/Spark, Rust/DataFusion) — wrong # domain for staffing queries; effectively dead-corpus noise. # id-prefix "e-" prevents collisions with workers' "w-" since both files # count worker_id from 1. ./bin/staffing_workers \ -parquet "/home/profit/lakehouse/data/datasets/ethereal_workers.parquet" \ -index-name ethereal_workers \ -id-prefix "e-" \ -limit 0 echo echo "[lift] running driver — judge=$EFFECTIVE_JUDGE · queries=$QUERIES_FILE · k=$K" # -judge "$JUDGE_MODEL" passes either the explicit env override or # the empty string. The Go driver treats empty -judge as "not set" # and runs its own resolution chain (env → config → fallback). When # JUDGE_MODEL IS set, the explicit -judge wins inside the Go driver # regardless of what its env-lookup would find — flag wins by design. EXTRA_FLAGS="" if [ "$WITH_PARAPHRASE" = "1" ]; then EXTRA_FLAGS="$EXTRA_FLAGS -with-paraphrase" fi if [ "$WITH_REJUDGE" = "1" ]; then EXTRA_FLAGS="$EXTRA_FLAGS -with-rejudge" fi ./bin/playbook_lift \ -config "$CONFIG_PATH" \ -gateway "http://127.0.0.1:3110" \ -ollama "http://localhost:11434" \ -queries "$QUERIES_FILE" \ -corpora "$CORPORA" \ -judge "$JUDGE_MODEL" \ -k "$K" \ -out "$OUT_JSON" \ $EXTRA_FLAGS echo echo "[lift] generating markdown report → $OUT_MD" generate_md() { local json="$1" md="$2" local total discovery lift no_change boosted mean_delta gen_at local p_attempted p_top1 p_anyrank p_block total=$(jq -r '.summary.total' "$json") discovery=$(jq -r '.summary.with_discovery' "$json") lift=$(jq -r '.summary.lift_count' "$json") no_change=$(jq -r '.summary.no_change' "$json") boosted=$(jq -r '.summary.playbook_boosted_total' "$json") mean_delta=$(jq -r '.summary.mean_top1_delta_distance' "$json") gen_at=$(jq -r 
'.summary.paraphrase_attempted // 0' "$json") p_top1=$(jq -r '.summary.paraphrase_top1_lifts // 0' "$json") p_anyrank=$(jq -r '.summary.paraphrase_any_rank_hits // 0' "$json") rj_attempted=$(jq -r '.summary.rejudge_attempted // 0' "$json") q_lifted=$(jq -r '.summary.quality_lifted // 0' "$json") q_neutral=$(jq -r '.summary.quality_neutral // 0' "$json") q_regressed=$(jq -r '.summary.quality_regressed // 0' "$json") # Only emit the paraphrase block when --with-paraphrase actually ran # (i.e. .summary.paraphrase_attempted > 0). For verbatim-only runs we # leave the headline clean. p_block="" if [ "$p_attempted" != "0" ] && [ "$p_attempted" != "null" ]; then p_block="| **Paraphrase pass — recorded answer at rank 0 (top-1)** | **${p_top1} / ${p_attempted}** | | Paraphrase pass — recorded answer at any rank in top-K | ${p_anyrank} / ${p_attempted} |" fi rj_block="" if [ "$rj_attempted" != "0" ] && [ "$rj_attempted" != "null" ]; then rj_block="| **Quality lift** (warm top-1 rating > cold top-1 rating) | **${q_lifted} / ${rj_attempted}** | | Quality neutral (warm top-1 rating = cold top-1 rating) | ${q_neutral} / ${rj_attempted} | | Quality regressed (warm top-1 rating < cold top-1 rating) | ${q_regressed} / ${rj_attempted} |" fi cat > "$md" <> "$md" # Paraphrase per-query table — only emit when the pass ran, and only # for queries where Pass 1 recorded a playbook (others have no # paraphrase_query field). if [ "$p_attempted" != "0" ] && [ "$p_attempted" != "null" ]; then cat >> "$md" <> "$md" fi cat >> "$md" <