From e9822f025de14bd2cc57b3917ffad3049ba5ea93 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 30 Apr 2026 06:47:41 -0500 Subject: [PATCH] playbook_lift v2: paraphrase pass + run #002 finds boost-only limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an opt-in Pass 3 to the lift driver: for each query whose Pass 1 recorded a playbook, ask the judge to rephrase the query, then re-query with playbook=true and check whether the recorded answer surfaces in top-K. This is the test the v1 report's caveat #3 explicitly flagged as the actual learning-property gate (not the cheap verbatim case). Implementation: - New flag --with-paraphrase on the driver (default off). - New WITH_PARAPHRASE env in the harness (default 1, on for prod runs). - New paraphrase_* fields on queryRun + summary, // 0 fallback in jq so re-rendering verbatim-only evidence stays clean. - generateParaphrase() calls the same judge model with format=json and a tight schema; temperature=0.5 for variance without domain drift. - Markdown report adds a paraphrase per-query table (only when the pass ran) and an honesty caveat about judge-also-rephrases coupling. Run #002 result (reports/reality-tests/playbook_lift_002.{json,md}): Verbatim lift 2/2 (100% — Q7 + Q13, both stable from v1) Paraphrase top-1 0/2 Paraphrase any-rank in K 0/2 Both paraphrases dropped the recorded answer OUT of top-K entirely (rank=-1). This isn't a paraphrase-quality problem — qwen2.5's outputs preserved intent ("Hazmat-certified warehouse worker comfortable with cold storage" → "Warehouse worker with Hazmat certification and experience in cold storage"). It's the v0 boost-only stance documented in internal/matrix/playbook.go:22-27: the boost only re-ranks results that ALREADY surfaced from regular retrieval. If paraphrase's cosine retrieval doesn't include the recorded answer in top-K, no boost can promote it. 
The "Shape B" upgrade mentioned in the playbook.go comment — inject playbook hits directly even when they weren't in the top-K — is what would close this gap. The reality test surfaced exactly the gap the docs warned about. Worth filing as the next product gate. Run-to-run variance also visible: v1 had 8 discoveries, v2 had 2. HNSW insertion order + judge variance both contribute. Stability of Q7 and Q13 across both runs (lifted in v1 AND v2) is the most reliable signal in the dataset. Co-Authored-By: Claude Opus 4.7 (1M context) --- reports/reality-tests/playbook_lift_002.md | 111 ++++++++++++++++ scripts/playbook_lift.sh | 77 ++++++++++- scripts/playbook_lift/main.go | 142 ++++++++++++++++++++- 3 files changed, 322 insertions(+), 8 deletions(-) create mode 100644 reports/reality-tests/playbook_lift_002.md diff --git a/reports/reality-tests/playbook_lift_002.md b/reports/reality-tests/playbook_lift_002.md new file mode 100644 index 0000000..fdaca0e --- /dev/null +++ b/reports/reality-tests/playbook_lift_002.md @@ -0,0 +1,111 @@ +# Playbook-Lift Reality Test — Run 002 + +**Generated:** 2026-04-30T11:46:28.335370797Z +**Judge:** `qwen2.5:latest` (Ollama, resolved from env JUDGE_MODEL=qwen2.5:latest) +**Corpora:** `workers,ethereal_workers` +**Workers limit:** 5000 +**Queries:** `tests/reality/playbook_lift_queries.txt` (21 executed) +**K per pass:** 10 +**Paraphrase pass:** ENABLED +**Evidence:** `reports/reality-tests/playbook_lift_002.json` + +--- + +## Headline + +| Metric | Value | +|---|---:| +| Total queries run | 21 | +| Cold-pass discoveries (judge-best ≠ top-1) | 2 | +| Warm-pass lifts (recorded playbook → top-1) | 2 | +| No change (judge-best already top-1, no playbook needed) | 19 | +| Playbook boosts triggered (warm pass) | 2 | +| Mean Δ top-1 distance (warm − cold) | -0.011403477 | +| **Paraphrase pass — recorded answer at rank 0 (top-1)** | **0 / 2** | +| Paraphrase pass — recorded answer at any rank in top-K | 0 / 2 | + +**Verbatim lift rate:** 2 of 2 
discoveries became top-1 after warm pass. + +--- + +## Per-query results + +| # | Query | Cold top-1 | Cold judge-best (rank/rating) | Recorded? | Warm top-1 | Judge-best warm rank | Lift | +|---|---|---|---|---|---|---|---| +| 1 | Forklift operator with OSHA-30, warehouse experience, day sh | e-8290 | 0/4 | — | e-8290 | 0 | no | +| 2 | OSHA-30 certified forklift operator in Wisconsin, cold stora | e-2580 | 7/3 | — | e-2580 | 7 | no | +| 3 | Production worker with confined-space cert and hazmat traini | w-943 | 0/2 | — | w-943 | 0 | no | +| 4 | CDL Class A driver, clean record, willing to do regional 4-d | w-2486 | 0/1 | — | w-2486 | 0 | no | +| 5 | Warehouse lead with current OSHA-30 certification, NOT OSHA- | w-4278 | 2/2 | — | w-4278 | 2 | no | +| 6 | Forklift-certified loader, certification must be active, dis | e-3143 | 0/2 | — | e-3143 | 0 | no | +| 7 | Hazmat-certified warehouse worker comfortable with cold stor | e-898 | 2/4 | ✓ e-665 | e-665 | 0 | **YES** | +| 8 | Bilingual production worker with team-lead experience and tr | w-4115 | 0/4 | — | w-4115 | 0 | no | +| 9 | Inventory specialist with confined-space cert and compliance | w-1971 | 2/3 | — | w-1971 | 2 | no | +| 10 | Warehouse worker who can run inventory cycles and lead a sma | e-8132 | 0/4 | — | e-8132 | 0 | no | +| 11 | Production line worker comfortable filling in as line superv | w-2558 | 0/3 | — | w-2558 | 0 | no | +| 12 | Customer service rep willing to cross-train into dispatch or | e-1349 | 1/2 | — | e-1349 | 1 | no | +| 13 | Reliable production line lead with strong attendance and lea | e-6006 | 5/4 | ✓ e-5778 | e-5778 | 0 | **YES** | +| 14 | Highly responsive forklift operator available for last-minut | e-6198 | 0/4 | — | e-6198 | 0 | no | +| 15 | Engaged warehouse associate with strong safety compliance re | w-2008 | 0/4 | — | w-2008 | 0 | no | +| 16 | CDL-A driver based in IL or WI, willing to run regional 4-da | w-542 | 6/2 | — | w-542 | 6 | no | +| 17 | Bilingual customer service rep 
in Indianapolis or Cincinnati | e-4545 | 0/1 | — | e-4545 | 0 | no | +| 18 | Production supervisor open to Midwest relocation for permane | e-3001 | 7/2 | — | e-3001 | 7 | no | +| 19 | Dental hygienist with three years experience, Indianapolis a | e-7086 | 0/1 | — | e-7086 | 0 | no | +| 20 | Registered nurse with ICU experience, willing to take per-di | w-4936 | 0/1 | — | w-4936 | 0 | no | +| 21 | Software engineer with React and TypeScript, three years exp | w-334 | 0/1 | — | w-334 | 0 | no | + +--- + +## Paraphrase pass — does the playbook help similar-but-different queries? + +For each query whose Pass 1 cold pass recorded a playbook entry, the +judge model rephrased the query, and the rephrased version was sent +through warm matrix.search. The recorded answer ID's rank in those +results tests whether cosine on the embedded paraphrase finds the +recorded query's vector. + +| # | Original (≤40c) | Paraphrase (≤60c) | Recorded answer | Paraphrase top-1 | Recorded rank | Paraphrase lift | +|---|---|---|---|---|---|---| +| 7 | Hazmat-certified warehouse worker comfor | Warehouse worker with Hazmat certification and experience in | e-665 | e-4910 | -1 | no | +| 13 | Reliable production line lead with stron | Experienced production line supervisor with excellent punctu | e-5778 | w-1950 | -1 | no | + +--- + +## Honesty caveats + +1. **Judge IS the ground truth proxy.** Without human-labeled relevance, the LLM + judge's verdict is what defines "best." If `qwen2.5:latest` rates badly, + the lift number is meaningless. To validate the judge itself, sample 5–10 + verdicts manually and check agreement. +2. **Score-1.0 boost = distance halved.** Playbook math is + `distance' = distance × (1 - 0.5 × score)`. Lift requires the judge-best + result's pre-boost distance to be ≤ 2× the cold top-1's distance, otherwise + even halving doesn't promote it. Tight clusters → little visible lift. +3. 
**Verbatim vs paraphrase.** The verbatim lift rate (above) is the cheap + case — same query, recorded playbook, expected boost. The paraphrase + pass (when enabled) is the actual learning property: similar-but-different + queries hitting a recorded playbook. Compare verbatim and paraphrase + lift rates — paraphrase should be lower (semantic-distance gates some + playbook hits) but non-zero is the meaningful signal. +4. **Multi-corpus skew.** Default corpora=`workers,ethereal_workers` — if all judge-best + results land in one corpus, the matrix layer's purpose isn't being tested. + Check per-corpus distribution in the JSON. +5. **Judge resolution.** This run used `qwen2.5:latest` from + env JUDGE_MODEL=qwen2.5:latest. + Bumping the judge for run #N+1 means editing one line in lakehouse.toml. +6. **Paraphrase generation also uses the judge.** The same model that rates + relevance also rephrases queries. A judge that's bad at rating staffing + queries is probably also bad at rephrasing them. Worth sanity-checking + a sample of `paraphrase_query` values in the JSON before trusting the + paraphrase lift number. + +## Next moves + +- If lift rate ≥ 50% of discoveries: matrix layer + playbook is doing real + work. Move to paraphrase queries + tag-based boost (currently ignored). +- If lift rate < 20%: investigate why — judge variance, distance gap too + wide, or playbook math too gentle. The score=1.0 / 0.5× formula may need + retuning. +- If discovery rate (cold judge-best ≠ top-1) is itself low: cosine is + already close to optimal on this query distribution. Either the corpus + is too narrow or the queries are too easy. 
diff --git a/scripts/playbook_lift.sh b/scripts/playbook_lift.sh index 04fc251..a73dafa 100755 --- a/scripts/playbook_lift.sh +++ b/scripts/playbook_lift.sh @@ -46,6 +46,12 @@ QUERIES_FILE="${QUERIES_FILE:-tests/reality/playbook_lift_queries.txt}" CORPORA="${CORPORA:-workers,ethereal_workers}" K="${K:-10}" CONFIG_PATH="${CONFIG_PATH:-lakehouse.toml}" +# WITH_PARAPHRASE=1 (default) adds a Pass 3 — for each query whose +# Pass 1 cold pass recorded a playbook, generate a paraphrase via the +# judge and re-query with playbook=true. The paraphrase pass is the +# actual learning-property test (does cosine on paraphrase find the +# recorded entry?). Set WITH_PARAPHRASE=0 for a faster verbatim-only run. +WITH_PARAPHRASE="${WITH_PARAPHRASE:-1}" OUT_JSON="reports/reality-tests/playbook_lift_${RUN_ID}.json" OUT_MD="reports/reality-tests/playbook_lift_${RUN_ID}.md" @@ -265,6 +271,10 @@ echo "[lift] running driver — judge=$EFFECTIVE_JUDGE · queries=$QUERIES_FILE # and runs its own resolution chain (env → config → fallback). When # JUDGE_MODEL IS set, the explicit -judge wins inside the Go driver # regardless of what its env-lookup would find — flag wins by design. 
+PARAPHRASE_FLAG="" +if [ "$WITH_PARAPHRASE" = "1" ]; then + PARAPHRASE_FLAG="-with-paraphrase" +fi ./bin/playbook_lift \ -config "$CONFIG_PATH" \ -gateway "http://127.0.0.1:3110" \ @@ -273,13 +283,15 @@ echo "[lift] running driver — judge=$EFFECTIVE_JUDGE · queries=$QUERIES_FILE -corpora "$CORPORA" \ -judge "$JUDGE_MODEL" \ -k "$K" \ - -out "$OUT_JSON" + -out "$OUT_JSON" \ + $PARAPHRASE_FLAG echo echo "[lift] generating markdown report → $OUT_MD" generate_md() { local json="$1" md="$2" local total discovery lift no_change boosted mean_delta gen_at + local p_attempted p_top1 p_anyrank p_block total=$(jq -r '.summary.total' "$json") discovery=$(jq -r '.summary.with_discovery' "$json") lift=$(jq -r '.summary.lift_count' "$json") @@ -287,6 +299,18 @@ generate_md() { boosted=$(jq -r '.summary.playbook_boosted_total' "$json") mean_delta=$(jq -r '.summary.mean_top1_delta_distance' "$json") gen_at=$(jq -r '.summary.generated_at' "$json") + p_attempted=$(jq -r '.summary.paraphrase_attempted // 0' "$json") + p_top1=$(jq -r '.summary.paraphrase_top1_lifts // 0' "$json") + p_anyrank=$(jq -r '.summary.paraphrase_any_rank_hits // 0' "$json") + + # Only emit the paraphrase block when --with-paraphrase actually ran + # (i.e. .summary.paraphrase_attempted > 0). For verbatim-only runs we + # leave the headline clean. + p_block="" + if [ "$p_attempted" != "0" ] && [ "$p_attempted" != "null" ]; then + p_block="| **Paraphrase pass — recorded answer at rank 0 (top-1)** | **${p_top1} / ${p_attempted}** | +| Paraphrase pass — recorded answer at any rank in top-K | ${p_anyrank} / ${p_attempted} |" + fi cat > "$md" <> "$md" + # Paraphrase per-query table — only emit when the pass ran, and only + # for queries where Pass 1 recorded a playbook (others have no + # paraphrase_query field). 
+ if [ "$p_attempted" != "0" ] && [ "$p_attempted" != "null" ]; then + cat >> "$md" <> "$md" + fi + cat >> "$md" < $JUDGE_MODEL env > @@ -226,6 +242,59 @@ func main() { totalDelta += runs[i].WarmTop1Distance - runs[i].ColdTop1Distance } + // Pass 3 (paraphrase) — opt-in via --with-paraphrase. For each + // query where a playbook was recorded in Pass 1, generate a + // paraphrase via the judge model and run it through warm + // matrix.search. The expectation: if the playbook's learning + // property holds (cosine on embed(paraphrase) finds the recorded + // embed(query) within DefaultPlaybookMaxDistance), the recorded + // answer should appear at top-1 for the paraphrase too. This is + // the claim from the report's caveat #3 that v1 didn't test. + paraphraseAttempted := 0 + paraphraseTop1Lifts := 0 + paraphraseAnyRankHits := 0 + if *withParaphrase { + log.Printf("[lift] paraphrase pass: testing playbook learning property") + for i := range runs { + if !runs[i].PlaybookRecorded { + continue + } + paraphraseAttempted++ + paraphrase, err := generateParaphrase(hc, *ollama, *judge, runs[i].Query) + if err != nil { + log.Printf(" (%d) paraphrase generation failed: %v", i+1, err) + runs[i].Note = appendNote(runs[i].Note, "paraphrase gen failed: "+err.Error()) + continue + } + runs[i].ParaphraseQuery = paraphrase + log.Printf("[lift] (%d/%d paraphrase) %s → %s", i+1, len(runs), + abbrev(runs[i].Query, 40), abbrev(paraphrase, 40)) + + resp, err := matrixSearch(hc, *gw, paraphrase, corpora, *k, true) + if err != nil || len(resp.Results) == 0 { + runs[i].Note = appendNote(runs[i].Note, fmt.Sprintf("paraphrase search failed: %v", err)) + runs[i].ParaphraseRecordedRank = -1 + continue + } + runs[i].ParaphraseTop1ID = resp.Results[0].ID + recordedRank := -1 + for j, r := range resp.Results { + if r.ID == runs[i].PlaybookID { + recordedRank = j + break + } + } + runs[i].ParaphraseRecordedRank = recordedRank + if recordedRank == 0 { + runs[i].ParaphraseLift = true + 
paraphraseTop1Lifts++ + paraphraseAnyRankHits++ + } else if recordedRank > 0 { + paraphraseAnyRankHits++ + } + } + } + sum := summary{ Total: len(runs), WithDiscovery: withDiscovery, @@ -233,6 +302,9 @@ func main() { NoChange: noChange, MeanTop1DeltaDistance: 0, PlaybookBoostedTotal: playbookBoostedTotal, + ParaphraseAttempted: paraphraseAttempted, + ParaphraseTop1Lifts: paraphraseTop1Lifts, + ParaphraseAnyRankHits: paraphraseAnyRankHits, GeneratedAt: time.Now().UTC(), } if len(runs) > 0 { @@ -242,11 +314,75 @@ func main() { if err := writeJSON(*out, runs, sum); err != nil { log.Fatalf("write %s: %v", *out, err) } - log.Printf("[lift] DONE — %d queries · discovery=%d · lift=%d · boosted=%d · meanΔdist=%.4f", - sum.Total, sum.WithDiscovery, sum.LiftCount, sum.PlaybookBoostedTotal, sum.MeanTop1DeltaDistance) + if *withParaphrase { + log.Printf("[lift] DONE — %d queries · discovery=%d · lift=%d · boosted=%d · meanΔdist=%.4f · paraphrase=%d/%d→top1, %d/%d→anyrank", + sum.Total, sum.WithDiscovery, sum.LiftCount, sum.PlaybookBoostedTotal, sum.MeanTop1DeltaDistance, + sum.ParaphraseTop1Lifts, sum.ParaphraseAttempted, + sum.ParaphraseAnyRankHits, sum.ParaphraseAttempted) + } else { + log.Printf("[lift] DONE — %d queries · discovery=%d · lift=%d · boosted=%d · meanΔdist=%.4f", + sum.Total, sum.WithDiscovery, sum.LiftCount, sum.PlaybookBoostedTotal, sum.MeanTop1DeltaDistance) + } log.Printf("[lift] results → %s", *out) } +// generateParaphrase asks the judge model to rephrase a staffing query +// while preserving intent. Used in the paraphrase pass to test whether +// the playbook's recorded embedding survives wording variation. +// +// temperature=0.5 — enough variance to make the paraphrase actually +// different, but not so high that it drifts off the staffing domain. +// format=json + a tight schema makes parsing deterministic. 
// generateParaphrase asks the judge model to rephrase a staffing query
// while preserving its intent. It is used by the paraphrase pass to test
// whether a recorded playbook entry still surfaces when the wording of
// the query changes.
//
// Parameters:
//   - hc:        HTTP client used for the request.
//   - ollamaURL: base URL of the Ollama server; "/api/chat" is appended
//     (a trailing "/" on the base URL is tolerated).
//   - model:     model name sent in the request body.
//   - query:     the original query, sent as the user message.
//
// The request is non-streaming, asks for format=json, and uses
// temperature=0.5.
//
// Returns the "paraphrase" field of the JSON document the model emitted,
// exactly as emitted (it is not trimmed). An error is returned when the
// request cannot be built or sent, the server answers with a non-2xx
// status, the body cannot be read or decoded, or the paraphrase is empty
// or whitespace-only.
func generateParaphrase(hc *http.Client, ollamaURL, model, query string) (string, error) {
	// NOTE(review): the schema example below carries an empty string as
	// its value — confirm a placeholder was not lost from the prompt.
	system := `You rephrase staffing queries while preserving intent.
Output JSON only: {"paraphrase": ""}.
Rules:
- Keep the same role, certifications, geography, and constraints.
- Vary the wording (synonyms, reordered clauses, different sentence shape).
- Do NOT add or remove requirements.
- Do NOT explain — just emit the JSON.`

	payload, err := json.Marshal(map[string]any{
		"model":  model,
		"stream": false,
		"format": "json",
		"messages": []map[string]string{
			{"role": "system", "content": system},
			{"role": "user", "content": query},
		},
		"options": map[string]any{"temperature": 0.5},
	})
	if err != nil {
		return "", fmt.Errorf("encode ollama request: %w", err)
	}

	// Trim trailing slashes so a base URL such as "http://host:11434/"
	// does not produce a "//api/chat" path.
	endpoint := strings.TrimRight(ollamaURL, "/") + "/api/chat"

	// NewRequest fails on a malformed URL and then returns a nil request;
	// bail out here instead of dereferencing it below.
	req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(payload))
	if err != nil {
		return "", fmt.Errorf("build ollama request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := hc.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode/100 != 2 {
		return "", fmt.Errorf("ollama chat: HTTP %d", resp.StatusCode)
	}

	// Surface read failures directly; otherwise a truncated body would be
	// reported as a confusing JSON decode error.
	rb, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("read ollama response: %w", err)
	}

	// The chat envelope wraps the model output; the output itself is a
	// second JSON document carried as a string in message.content.
	var ollamaResp struct {
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
	}
	if err := json.Unmarshal(rb, &ollamaResp); err != nil {
		return "", fmt.Errorf("decode ollama envelope: %w", err)
	}
	var out struct {
		Paraphrase string `json:"paraphrase"`
	}
	if err := json.Unmarshal([]byte(ollamaResp.Message.Content), &out); err != nil {
		return "", fmt.Errorf("decode paraphrase JSON: %w (content=%q)", err, ollamaResp.Message.Content)
	}
	if strings.TrimSpace(out.Paraphrase) == "" {
		return "", fmt.Errorf("empty paraphrase (content=%q)", ollamaResp.Message.Content)
	}
	return out.Paraphrase, nil
}