diff --git a/reports/reality-tests/playbook_lift_005.md b/reports/reality-tests/playbook_lift_005.md new file mode 100644 index 0000000..e76318e --- /dev/null +++ b/reports/reality-tests/playbook_lift_005.md @@ -0,0 +1,120 @@ +# Playbook-Lift Reality Test — Run 005 + +**Generated:** 2026-04-30T12:40:48.475901847Z +**Judge:** `qwen2.5:latest` (Ollama, resolved from env JUDGE_MODEL=qwen2.5:latest) +**Corpora:** `workers,ethereal_workers` +**Workers limit:** 5000 +**Queries:** `tests/reality/playbook_lift_queries.txt` (21 executed) +**K per pass:** 10 +**Paraphrase pass:** ENABLED +**Re-judge pass:** ENABLED +**Evidence:** `reports/reality-tests/playbook_lift_005.json` + +--- + +## Headline + +| Metric | Value | +|---|---:| +| Total queries run | 21 | +| Cold-pass discoveries (judge-best ≠ top-1) | 7 | +| Warm-pass lifts (recorded playbook → top-1) | 5 | +| No change (judge-best already top-1, no playbook needed) | 16 | +| Playbook boosts triggered (warm pass) | 9 | +| Mean Δ top-1 distance (warm − cold) | -0.076170966 | +| **Paraphrase pass — recorded answer at rank 0 (top-1)** | **5 / 7** | +| Paraphrase pass — recorded answer at any rank in top-K | 5 / 7 | +| **Quality lift** (warm top-1 rating > cold top-1 rating) | **5 / 21** | +| Quality neutral (warm top-1 rating = cold top-1 rating) | 13 / 21 | +| Quality regressed (warm top-1 rating < cold top-1 rating) | 3 / 21 | + +**Verbatim lift rate:** 5 of 7 discoveries became top-1 after warm pass. + +--- + +## Per-query results + +| # | Query | Cold top-1 | Cold judge-best (rank/rating) | Recorded? 
| Warm top-1 | Judge-best warm rank | Lift | +|---|---|---|---|---|---|---|---| +| 1 | Forklift operator with OSHA-30, warehouse experience, day sh | e-5670 | 2/4 | ✓ e-5729 | e-5729 | 0 | **YES** | +| 2 | OSHA-30 certified forklift operator in Wisconsin, cold stora | e-6293 | 7/3 | — | w-1566 | 8 | no | +| 3 | Production worker with confined-space cert and hazmat traini | w-602 | 0/2 | — | w-3575 | 1 | no | +| 4 | CDL Class A driver, clean record, willing to do regional 4-d | w-3854 | 0/1 | — | w-3854 | 0 | no | +| 5 | Warehouse lead with current OSHA-30 certification, NOT OSHA- | w-1807 | 6/3 | — | w-1807 | 6 | no | +| 6 | Forklift-certified loader, certification must be active, dis | w-1807 | 3/4 | ✓ w-205 | w-4257 | 1 | no | +| 7 | Hazmat-certified warehouse worker comfortable with cold stor | e-4910 | 2/4 | ✓ w-4257 | w-205 | 1 | no | +| 8 | Bilingual production worker with team-lead experience and tr | w-4988 | 0/4 | — | w-4988 | 0 | no | +| 9 | Inventory specialist with confined-space cert and compliance | w-388 | 3/4 | ✓ w-3575 | w-3575 | 0 | **YES** | +| 10 | Warehouse worker who can run inventory cycles and lead a sma | e-3011 | 0/4 | — | e-3011 | 0 | no | +| 11 | Production line worker comfortable filling in as line superv | w-1387 | 0/4 | — | e-5729 | 1 | no | +| 12 | Customer service rep willing to cross-train into dispatch or | w-1451 | 0/2 | — | w-1451 | 0 | no | +| 13 | Reliable production line lead with strong attendance and lea | e-7360 | 5/4 | ✓ w-2886 | w-2886 | 0 | **YES** | +| 14 | Highly responsive forklift operator available for last-minut | e-6108 | 5/4 | ✓ w-1566 | w-1566 | 0 | **YES** | +| 15 | Engaged warehouse associate with strong safety compliance re | e-2743 | 2/4 | ✓ w-49 | w-49 | 0 | **YES** | +| 16 | CDL-A driver based in IL or WI, willing to run regional 4-da | w-2486 | 5/2 | — | w-2486 | 5 | no | +| 17 | Bilingual customer service rep in Indianapolis or Cincinnati | e-9749 | 9/2 | — | e-9749 | 9 | no | +| 18 | Production 
For each query whose cold pass (Pass 1) recorded a playbook entry, the
 Bumping the judge for run #N+1 means changing JUDGE_MODEL in the
+   environment — env overrides lakehouse.toml, so editing the config line
+   only takes effect once the env var is unset.
The score=1.0 / 0.5× formula may need + retuning. +- If discovery rate (cold judge-best ≠ top-1) is itself low: cosine is + already close to optimal on this query distribution. Either the corpus + is too narrow or the queries are too easy. diff --git a/scripts/playbook_lift.sh b/scripts/playbook_lift.sh index a73dafa..170d451 100755 --- a/scripts/playbook_lift.sh +++ b/scripts/playbook_lift.sh @@ -52,6 +52,11 @@ CONFIG_PATH="${CONFIG_PATH:-lakehouse.toml}" # actual learning-property test (does cosine on paraphrase find the # recorded entry?). Set WITH_PARAPHRASE=0 for a faster verbatim-only run. WITH_PARAPHRASE="${WITH_PARAPHRASE:-1}" +# WITH_REJUDGE=1 (default) adds a Pass 4 — judge warm top-1 to measure +# quality lift (warm rating vs cold rating). Catches cases where Shape B +# surfaces a different-but-equally-good answer (which the rank-based +# lift metric misses). +21 judge calls (~30s on qwen2.5). +WITH_REJUDGE="${WITH_REJUDGE:-1}" OUT_JSON="reports/reality-tests/playbook_lift_${RUN_ID}.json" OUT_MD="reports/reality-tests/playbook_lift_${RUN_ID}.md" @@ -271,9 +276,12 @@ echo "[lift] running driver — judge=$EFFECTIVE_JUDGE · queries=$QUERIES_FILE # and runs its own resolution chain (env → config → fallback). When # JUDGE_MODEL IS set, the explicit -judge wins inside the Go driver # regardless of what its env-lookup would find — flag wins by design. 
-PARAPHRASE_FLAG="" +EXTRA_FLAGS="" if [ "$WITH_PARAPHRASE" = "1" ]; then - PARAPHRASE_FLAG="-with-paraphrase" + EXTRA_FLAGS="$EXTRA_FLAGS -with-paraphrase" +fi +if [ "$WITH_REJUDGE" = "1" ]; then + EXTRA_FLAGS="$EXTRA_FLAGS -with-rejudge" fi ./bin/playbook_lift \ -config "$CONFIG_PATH" \ @@ -284,7 +292,7 @@ fi -judge "$JUDGE_MODEL" \ -k "$K" \ -out "$OUT_JSON" \ - $PARAPHRASE_FLAG + $EXTRA_FLAGS echo echo "[lift] generating markdown report → $OUT_MD" @@ -302,6 +310,10 @@ generate_md() { p_attempted=$(jq -r '.summary.paraphrase_attempted // 0' "$json") p_top1=$(jq -r '.summary.paraphrase_top1_lifts // 0' "$json") p_anyrank=$(jq -r '.summary.paraphrase_any_rank_hits // 0' "$json") + rj_attempted=$(jq -r '.summary.rejudge_attempted // 0' "$json") + q_lifted=$(jq -r '.summary.quality_lifted // 0' "$json") + q_neutral=$(jq -r '.summary.quality_neutral // 0' "$json") + q_regressed=$(jq -r '.summary.quality_regressed // 0' "$json") # Only emit the paraphrase block when --with-paraphrase actually ran # (i.e. .summary.paraphrase_attempted > 0). 
For verbatim-only runs we @@ -312,6 +324,13 @@ generate_md() { | Paraphrase pass — recorded answer at any rank in top-K | ${p_anyrank} / ${p_attempted} |" fi + rj_block="" + if [ "$rj_attempted" != "0" ] && [ "$rj_attempted" != "null" ]; then + rj_block="| **Quality lift** (warm top-1 rating > cold top-1 rating) | **${q_lifted} / ${rj_attempted}** | +| Quality neutral (warm top-1 rating = cold top-1 rating) | ${q_neutral} / ${rj_attempted} | +| Quality regressed (warm top-1 rating < cold top-1 rating) | ${q_regressed} / ${rj_attempted} |" + fi + cat > "$md" < cold-top-1 rating + QualityNeutral int `json:"quality_neutral,omitempty"` // ratings equal (could be same or different item) + QualityRegressed int `json:"quality_regressed,omitempty"` // warm-top-1 rating < cold-top-1 rating + GeneratedAt time.Time `json:"generated_at"` } @@ -128,6 +146,7 @@ func main() { k := flag.Int("k", 10, "top-k from matrix.search per pass") out := flag.String("out", "reports/reality-tests/playbook_lift_001.json", "output JSONL path") withParaphrase := flag.Bool("with-paraphrase", false, "after warm pass, generate a paraphrase via the judge model and re-query with playbook=true to test the learning property") + withRejudge := flag.Bool("with-rejudge", false, "after warm pass, judge warm top-1 to measure QUALITY lift (vs cold top-1 rating), not just rank-of-cold-judge-best") flag.Parse() // Judge resolution priority: explicit flag > $JUDGE_MODEL env > @@ -225,6 +244,7 @@ func main() { } runs[i].WarmTop1ID = resp.Results[0].ID runs[i].WarmTop1Distance = resp.Results[0].Distance + runs[i].WarmTop1Metadata = resp.Results[0].Metadata // cache for Pass 4 rejudge runs[i].WarmBoostedCount = resp.PlaybookBoosted playbookBoostedTotal += resp.PlaybookBoosted @@ -304,6 +324,47 @@ func main() { } } + // Pass 4 (warm-rejudge) — opt-in via --with-rejudge. Judge warm + // top-1 against the same prompt as cold ratings, then compare to + // cold top-1 rating. 
This measures QUALITY lift (did the playbook + // produce a better candidate?) rather than just rank-of-cold-judge- + // best lift (did the recorded answer move to top-1, even if cold's + // top-1 was already good?). See STATE_OF_PLAY OPEN — added because + // run #003's verbatim 2/6 didn't tell us whether Shape B was + // surfacing better OR same-quality alternatives. + rejudgeAttempted := 0 + qualityLifted := 0 + qualityNeutral := 0 + qualityRegressed := 0 + if *withRejudge { + log.Printf("[lift] warm-rejudge pass: measuring quality lift (warm top-1 rating vs cold top-1 rating)") + for i := range runs { + if runs[i].WarmTop1ID == "" || len(runs[i].WarmTop1Metadata) == 0 { + continue // warm pass didn't complete for this query + } + rejudgeAttempted++ + result := matrixResult{ + ID: runs[i].WarmTop1ID, + Distance: runs[i].WarmTop1Distance, + Metadata: runs[i].WarmTop1Metadata, + } + warmRating := judgeRate(hc, *ollama, *judge, runs[i].Query, result) + runs[i].WarmTop1Rating = &warmRating + coldRating := 0 + if len(runs[i].ColdRatings) > 0 { + coldRating = runs[i].ColdRatings[0] + } + switch { + case warmRating > coldRating: + qualityLifted++ + case warmRating < coldRating: + qualityRegressed++ + default: + qualityNeutral++ + } + } + } + sum := summary{ Total: len(runs), WithDiscovery: withDiscovery, @@ -314,6 +375,10 @@ func main() { ParaphraseAttempted: paraphraseAttempted, ParaphraseTop1Lifts: paraphraseTop1Lifts, ParaphraseAnyRankHits: paraphraseAnyRankHits, + RejudgeAttempted: rejudgeAttempted, + QualityLifted: qualityLifted, + QualityNeutral: qualityNeutral, + QualityRegressed: qualityRegressed, GeneratedAt: time.Now().UTC(), } if len(runs) > 0 { @@ -323,11 +388,11 @@ func main() { if err := writeJSON(*out, runs, sum); err != nil { log.Fatalf("write %s: %v", *out, err) } - if *withParaphrase { - log.Printf("[lift] DONE — %d queries · discovery=%d · lift=%d · boosted=%d · meanΔdist=%.4f · paraphrase=%d/%d→top1, %d/%d→anyrank", + if *withParaphrase || 
*withRejudge { + log.Printf("[lift] DONE — %d queries · discovery=%d · lift=%d · boosted=%d · meanΔdist=%.4f · paraphrase=%d/%d→top1 · quality=lifted%d/neutral%d/regressed%d", sum.Total, sum.WithDiscovery, sum.LiftCount, sum.PlaybookBoostedTotal, sum.MeanTop1DeltaDistance, sum.ParaphraseTop1Lifts, sum.ParaphraseAttempted, - sum.ParaphraseAnyRankHits, sum.ParaphraseAttempted) + sum.QualityLifted, sum.QualityNeutral, sum.QualityRegressed) } else { log.Printf("[lift] DONE — %d queries · discovery=%d · lift=%d · boosted=%d · meanΔdist=%.4f", sum.Total, sum.WithDiscovery, sum.LiftCount, sum.PlaybookBoostedTotal, sum.MeanTop1DeltaDistance)