root 0331288641 playbook_lift: LLM-based role extractor closes shorthand bleed (real_004)
real_003 left a known gap: shorthand-style queries
("{count} {role} {city} {state} ...") have no separator between
role and city, so a regex can't reliably extract — leaving the
cross-role gate disabled when both record AND query are shorthand.

This commit adds a roleExtractor with regex-first + LLM fallback:

- Regex first (fast, deterministic) — handles need + client_first +
  looking from real_003b. ~75% of styles, no LLM cost paid.
- LLM fallback when regex returns empty AND model is configured —
  Ollama-shape /api/chat with format=json, schema-tight prompt,
  temperature 0. ~1-3s on local qwen2.5.
- Per-process cache — paraphrase + rejudge passes reuse the same
  query 4× per run; cache prevents 4× LLM cost.
- Off-by-default — opt-in via -llm-role-extract flag (CLI) and
  LLM_ROLE_EXTRACT=1 env var (harness wrapper). real_003b shipping
  config unchanged unless explicitly enabled.

8 new tests in scripts/playbook_lift/main_test.go:
- TestRoleExtractor_RegexFirst: LLM not called when regex matches
- TestRoleExtractor_LLMFallback: shorthand goes to LLM
- TestRoleExtractor_LLMOffLeavesEmpty: opt-in default preserved
- TestRoleExtractor_Cache: 3 calls = 1 LLM hit
- TestRoleExtractor_NilSafe: nil receiver runs regex only
- TestExtractRoleViaLLM_HTTPError + _BadJSON: failure paths
- TestRoleExtractor_ClosesCrossRoleShorthandBleed: synthetic
  witness for the real_003 scenario — both record + query are
  shorthand, regex returns "" for both, LLM produces DIFFERENT
  role tokens for CNC vs Forklift, so matrix gate's cross-role
  rejection (locked separately in
  TestInjectPlaybookMisses_RoleGateRejectsCrossRole) fires
  correctly. This is the load-bearing verification.

Reality test real_004 ran the same 40-query stress as real_003 with
LLM extraction on. Cross-style same-role boosts fired correctly
across all 4 styles for Loaders + Packers + Shipping Clerk clusters
(including shorthand → other-style transfer). No cross-role bleed
observed. The reality test alone can't be a clean "with vs without"
comparison (HNSW build is non-deterministic across runs, and
real_004 stochastics didn't trigger a shorthand recording at all),
which is why the unit-test witness exists.

Production note (in real_004_findings.md): LLM extraction is for
reality-test coverage of arbitrary query shapes. Production should
extract role at INGEST time (when the inbox parser already runs an
LLM) and pass already-resolved role through requests — same shape
as multi_coord_stress's existing Demand{Role: ...} model. The hot
path should never need the harness extractor's per-query LLM cost.
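Under that model the hot path carries an already-resolved role and never invokes an extractor. A minimal sketch of the shape — only Demand{Role: ...} echoes the multi_coord_stress model; the field set and helper below are hypothetical:

```go
package main

import "fmt"

// Demand carries a role resolved once at ingest (where the inbox
// parser already runs an LLM), so requests never re-extract it.
type Demand struct {
	Role  string // resolved at ingest; illustrative fields below
	Count int
	City  string
}

// searchBody builds a matrix.search payload from an already-resolved
// Demand — no per-query extractor call on the hot path.
func searchBody(q string, d Demand) map[string]any {
	body := map[string]any{"query_text": q, "use_playbook": true}
	if d.Role != "" {
		body["query_role"] = d.Role
	}
	return body
}

func main() {
	d := Demand{Role: "Forklift Operator", Count: 5, City: "Detroit"}
	fmt.Println(searchBody("5 Forklift Operator Detroit MI", d)["query_role"])
}
```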

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 22:51:27 -05:00


// Playbook-lift reality test driver. Two-pass design:
//
// Pass 1 (cold): for each query → matrix.search use_playbook=false →
// LLM judge rates top-K → record playbook entry pointing
// at the highest-rated result (which may NOT be top-1
// by distance — that's the discovery worth boosting).
//
// Pass 2 (warm): same queries → use_playbook=true → measure how the
// ranking shifted.
//
// Lift = real if pass-2 brings the LLM-judged-best result into top-1
// more often than pass-1. If lift ≈ 0, the playbook is just confirming
// what cosine already said and the 5-loop thesis is unproven.
//
// Honest about what this measures: with no human-labeled ground truth,
// the LLM judge IS the ground truth proxy. That's the small-model
// pipeline thesis itself — the same model class that runs the inner
// loop is also what we trust to evaluate it. If you don't trust the
// judge, the lift number is meaningless; that's a separate problem
// for ground-truth labeling.
//
// Usage (driven by scripts/playbook_lift.sh):
// playbook_lift -gateway http://127.0.0.1:3110 \
// -queries tests/reality/playbook_lift_queries.txt \
// -judge qwen3.5:latest \
// -corpora workers,candidates \
// -k 10 \
// -out reports/reality-tests/playbook_lift_001.json
package main
import (
"bytes"
"encoding/json"
"flag"
"fmt"
"io"
"log"
"net/http"
"os"
"regexp"
"sort"
"strings"
"time"
"git.agentview.dev/profit/golangLAKEHOUSE/internal/shared"
)
type matrixResult struct {
ID string `json:"id"`
Distance float32 `json:"distance"`
Corpus string `json:"corpus"`
Metadata json.RawMessage `json:"metadata,omitempty"`
}
type matrixResp struct {
Results []matrixResult `json:"results"`
PerCorpusCounts map[string]int `json:"per_corpus_counts"`
PlaybookBoosted int `json:"playbook_boosted,omitempty"`
}
type judgeVerdict struct {
Rating int `json:"rating"`
Reason string `json:"reason"`
}
type queryRun struct {
Query string `json:"query"`
ColdTop1ID string `json:"cold_top1_id"`
ColdTop1Distance float32 `json:"cold_top1_distance"`
ColdJudgeBestID string `json:"cold_judge_best_id"`
ColdJudgeBestRank int `json:"cold_judge_best_rank"`
ColdJudgeBestRating int `json:"cold_judge_best_rating"`
ColdRatings []int `json:"cold_ratings"`
PlaybookRecorded bool `json:"playbook_recorded"`
PlaybookID string `json:"playbook_target_id,omitempty"`
WarmTop1ID string `json:"warm_top1_id"`
WarmTop1Distance float32 `json:"warm_top1_distance"`
WarmBoostedCount int `json:"warm_boosted_count"`
WarmJudgeBestRank int `json:"warm_judge_best_rank"` // rank of cold judge-best in warm — NOT the warm pass's own judge-best
WarmTop1Metadata json.RawMessage `json:"-"` // cached for Pass 4 rejudge; not emitted
// WarmTop1Rating: only populated when --with-rejudge. Compare to
// ColdRatings[0] (== cold top-1 rating) to measure quality lift.
// *int so absence (no rejudge pass) and a 0-rating verdict are
// distinguishable.
WarmTop1Rating *int `json:"warm_top1_rating,omitempty"`
Lift bool `json:"lift"` // judge-best ranked below top-1 in the cold pass but at top-1 warm
// Paraphrase pass — only populated when --with-paraphrase. Tests
// the playbook's actual learning property: does a recorded entry
// for query Q help a similar-but-different query Q'?
//
// ParaphraseRecordedRank semantics:
// nil = paraphrase pass didn't run for this query (no playbook
// was recorded in cold pass, so nothing to test)
// 0 = recorded answer landed at top-1
// 1..K-1 = recorded answer present in top-K at that rank
// -1 = recorded answer absent from top-K
// Pointer (not int) so nil and rank-0 are distinguishable in JSON.
ParaphraseQuery string `json:"paraphrase_query,omitempty"`
ParaphraseTop1ID string `json:"paraphrase_top1_id,omitempty"`
ParaphraseRecordedRank *int `json:"paraphrase_recorded_rank,omitempty"`
ParaphraseLift bool `json:"paraphrase_lift,omitempty"` // recorded answer at rank 0 for paraphrase
Note string `json:"note,omitempty"`
}
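The *int tri-state used by WarmTop1Rating and ParaphraseRecordedRank can be seen in isolation. A small standalone sketch (not the driver's own types) of why a plain int can't carry it: with omitempty, a nil pointer is omitted from the JSON while a pointer to 0 is still emitted.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// probe mirrors the tri-state fields above: nil = "pass didn't run",
// &0 = "pass ran and produced 0" — distinguishable only via a pointer.
type probe struct {
	Rank *int `json:"rank,omitempty"`
}

func encode(p probe) string {
	bs, _ := json.Marshal(p)
	return string(bs)
}

func main() {
	zero := 0
	fmt.Println(encode(probe{}))            // {}
	fmt.Println(encode(probe{Rank: &zero})) // {"rank":0}
}
```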
type summary struct {
Total int `json:"total"`
WithDiscovery int `json:"with_discovery"` // judge-best != cold top-1
LiftCount int `json:"lift_count"` // top-1 changed warm → judge-best
NoChange int `json:"no_change"`
MeanTop1DeltaDistance float32 `json:"mean_top1_delta_distance"`
PlaybookBoostedTotal int `json:"playbook_boosted_total"`
// Paraphrase pass aggregates — only populated when --with-paraphrase.
ParaphraseAttempted int `json:"paraphrase_attempted,omitempty"` // queries with playbook recorded that ran a paraphrase
ParaphraseTop1Lifts int `json:"paraphrase_top1_lifts,omitempty"` // recorded answer surfaced at rank 0
ParaphraseAnyRankHits int `json:"paraphrase_any_rank_hits,omitempty"` // recorded answer surfaced at any rank in top-K
// Re-judge pass aggregates — only populated when --with-rejudge.
// Measures QUALITY lift (warm top-1 rating vs cold top-1 rating)
// rather than rank-of-cold-judge-best lift. The latter conflates
// "warm surfaced a different but equally-good result" with "warm
// shuffled ranks but the answer was the same"; quality lift
// disambiguates them.
RejudgeAttempted int `json:"rejudge_attempted,omitempty"` // queries that ran the rejudge pass
QualityLifted int `json:"quality_lifted,omitempty"` // warm-top-1 rating > cold-top-1 rating
QualityNeutral int `json:"quality_neutral,omitempty"` // ratings equal (could be same or different item)
QualityRegressed int `json:"quality_regressed,omitempty"` // warm-top-1 rating < cold-top-1 rating
GeneratedAt time.Time `json:"generated_at"`
}
func main() {
configPath := flag.String("config", "lakehouse.toml", "path to TOML config (provides judge default from [models].local_judge)")
gw := flag.String("gateway", "http://127.0.0.1:3110", "Go gateway base URL")
ollama := flag.String("ollama", "http://127.0.0.1:11434", "Ollama base URL for LLM judge")
queries := flag.String("queries", "tests/reality/playbook_lift_queries.txt", "query corpus path")
corporaCSV := flag.String("corpora", "workers,candidates", "comma-separated matrix corpora")
// Empty default — resolved below from (priority): flag > env > config > hardcoded.
judge := flag.String("judge", "", "Ollama model for relevance judging (empty = read from config [models].local_judge)")
k := flag.Int("k", 10, "top-k from matrix.search per pass")
out := flag.String("out", "reports/reality-tests/playbook_lift_001.json", "output JSONL path")
withParaphrase := flag.Bool("with-paraphrase", false, "after warm pass, generate a paraphrase via the judge model and re-query with playbook=true to test the learning property")
withRejudge := flag.Bool("with-rejudge", false, "after warm pass, judge warm top-1 to measure QUALITY lift (vs cold top-1 rating), not just rank-of-cold-judge-best")
llmRoleExtract := flag.Bool("llm-role-extract", false, "fall back to LLM (qwen2.5 format=json) when the regex extractor returns empty — closes the shorthand-style cross-role bleed surfaced in real_003 at the cost of ~1-3s/query")
llmRoleModel := flag.String("llm-role-model", "qwen2.5:latest", "Ollama model used for LLM role extraction; ignored when -llm-role-extract is off")
flag.Parse()
// Judge resolution priority: explicit flag > $JUDGE_MODEL env >
// cfg.Models.LocalJudge > hardcoded fallback. Phase 3 wired this
// up so model bumps land in lakehouse.toml, not in this driver.
if *judge == "" {
if env := strings.TrimSpace(os.Getenv("JUDGE_MODEL")); env != "" {
*judge = env
} else if cfg, err := shared.LoadConfig(*configPath); err == nil && cfg.Models.LocalJudge != "" {
*judge = cfg.Models.LocalJudge
} else {
*judge = "qwen3.5:latest"
log.Printf("[lift] warn: no judge model from flag/env/config; falling back to %q", *judge)
}
}
corpora := strings.Split(*corporaCSV, ",")
qs, err := loadQueries(*queries)
if err != nil {
log.Fatalf("load queries: %v", err)
}
if len(qs) == 0 {
log.Fatalf("no queries in %s", *queries)
}
log.Printf("[lift] %d queries · corpora=%v · k=%d · judge=%s", len(qs), corpora, *k, *judge)
hc := &http.Client{Timeout: 60 * time.Second}
// Package-global role extractor — used by matrixSearch +
// playbookRecord. Off-by-default so the existing harness behavior
// (regex-only extraction) is unchanged unless -llm-role-extract.
{
mdl := ""
if *llmRoleExtract {
mdl = *llmRoleModel
log.Printf("[lift] llm role extraction ON (model=%s) — shorthand queries get LLM fallback", mdl)
}
globalRoleExtractor = &roleExtractor{
hc: hc,
ollamaURL: *ollama,
model: mdl,
}
}
runs := make([]queryRun, 0, len(qs))
totalDelta := float32(0)
playbookBoostedTotal := 0
withDiscovery := 0
liftCount := 0
noChange := 0
// Pass 1 (cold) + record playbooks based on judge verdicts.
for i, q := range qs {
log.Printf("[lift] (%d/%d cold) %s", i+1, len(qs), abbrev(q, 60))
resp, err := matrixSearch(hc, *gw, q, corpora, *k, false)
if err != nil {
log.Printf(" cold search failed: %v — skipping", err)
continue
}
if len(resp.Results) == 0 {
log.Printf(" cold returned 0 results — skipping")
continue
}
ratings := make([]int, len(resp.Results))
bestRank := 0
bestRating := -1
for j, r := range resp.Results {
rating := judgeRate(hc, *ollama, *judge, q, r)
ratings[j] = rating
if rating > bestRating {
bestRating = rating
bestRank = j
}
}
run := queryRun{
Query: q,
ColdTop1ID: resp.Results[0].ID,
ColdTop1Distance: resp.Results[0].Distance,
ColdJudgeBestID: resp.Results[bestRank].ID,
ColdJudgeBestRank: bestRank,
ColdJudgeBestRating: bestRating,
ColdRatings: ratings,
}
// Record a playbook only if the judge best is not already top-1
// (otherwise we're boosting something cosine already crowned).
if bestRank > 0 && bestRating >= 4 {
withDiscovery++
if err := playbookRecord(hc, *gw, q, resp.Results[bestRank].ID, resp.Results[bestRank].Corpus, 1.0); err != nil {
log.Printf(" playbook record failed: %v", err)
run.Note = "playbook record failed: " + err.Error()
} else {
run.PlaybookRecorded = true
run.PlaybookID = resp.Results[bestRank].ID
}
} else if bestRank == 0 {
run.Note = "judge-best already top-1 cold — no playbook needed"
} else {
run.Note = fmt.Sprintf("judge-best rating %d below threshold (4) — no playbook", bestRating)
}
runs = append(runs, run)
}
// Pass 2 (warm) on the same queries.
for i := range runs {
q := runs[i].Query
log.Printf("[lift] (%d/%d warm) %s", i+1, len(runs), abbrev(q, 60))
resp, err := matrixSearch(hc, *gw, q, corpora, *k, true)
if err != nil || len(resp.Results) == 0 {
runs[i].Note = appendNote(runs[i].Note, fmt.Sprintf("warm search failed or returned 0 results: %v", err))
continue
}
runs[i].WarmTop1ID = resp.Results[0].ID
runs[i].WarmTop1Distance = resp.Results[0].Distance
runs[i].WarmTop1Metadata = resp.Results[0].Metadata // cache for Pass 4 rejudge
runs[i].WarmBoostedCount = resp.PlaybookBoosted
playbookBoostedTotal += resp.PlaybookBoosted
// Find where the cold judge-best ID landed in the warm ranking.
warmRank := -1
for j, r := range resp.Results {
if r.ID == runs[i].ColdJudgeBestID {
warmRank = j
break
}
}
runs[i].WarmJudgeBestRank = warmRank
switch {
case runs[i].PlaybookRecorded && warmRank == 0:
runs[i].Lift = true
liftCount++
case !runs[i].PlaybookRecorded:
noChange++
default:
noChange++
}
totalDelta += runs[i].WarmTop1Distance - runs[i].ColdTop1Distance
}
// Pass 3 (paraphrase) — opt-in via --with-paraphrase. For each
// query where a playbook was recorded in Pass 1, generate a
// paraphrase via the judge model and run it through warm
// matrix.search. The expectation: if the playbook's learning
// property holds (cosine on embed(paraphrase) finds the recorded
// embed(query) within DefaultPlaybookMaxDistance), the recorded
// answer should appear at top-1 for the paraphrase too. This is
// the claim from the report's caveat #3 that v1 didn't test.
paraphraseAttempted := 0
paraphraseTop1Lifts := 0
paraphraseAnyRankHits := 0
if *withParaphrase {
log.Printf("[lift] paraphrase pass: testing playbook learning property")
for i := range runs {
if !runs[i].PlaybookRecorded {
continue
}
paraphraseAttempted++
paraphrase, err := generateParaphrase(hc, *ollama, *judge, runs[i].Query)
if err != nil {
log.Printf(" (%d) paraphrase generation failed: %v", i+1, err)
runs[i].Note = appendNote(runs[i].Note, "paraphrase gen failed: "+err.Error())
continue
}
runs[i].ParaphraseQuery = paraphrase
log.Printf("[lift] (%d/%d paraphrase) %s → %s", i+1, len(runs),
abbrev(runs[i].Query, 40), abbrev(paraphrase, 40))
resp, err := matrixSearch(hc, *gw, paraphrase, corpora, *k, true)
if err != nil || len(resp.Results) == 0 {
runs[i].Note = appendNote(runs[i].Note, fmt.Sprintf("paraphrase search failed or returned 0 results: %v", err))
missed := -1
runs[i].ParaphraseRecordedRank = &missed
continue
}
runs[i].ParaphraseTop1ID = resp.Results[0].ID
recordedRank := -1
for j, r := range resp.Results {
if r.ID == runs[i].PlaybookID {
recordedRank = j
break
}
}
runs[i].ParaphraseRecordedRank = &recordedRank
if recordedRank == 0 {
runs[i].ParaphraseLift = true
paraphraseTop1Lifts++
paraphraseAnyRankHits++
} else if recordedRank > 0 {
paraphraseAnyRankHits++
}
}
}
// Pass 4 (warm-rejudge) — opt-in via --with-rejudge. Judge warm
// top-1 against the same prompt as cold ratings, then compare to
// cold top-1 rating. This measures QUALITY lift (did the playbook
// produce a better candidate?) rather than just rank-of-cold-judge-
// best lift (did the recorded answer move to top-1, even if cold's
// top-1 was already good?). See STATE_OF_PLAY OPEN — added because
// run #003's verbatim 2/6 didn't tell us whether Shape B was
// surfacing better OR same-quality alternatives.
rejudgeAttempted := 0
qualityLifted := 0
qualityNeutral := 0
qualityRegressed := 0
if *withRejudge {
log.Printf("[lift] warm-rejudge pass: measuring quality lift (warm top-1 rating vs cold top-1 rating)")
for i := range runs {
if runs[i].WarmTop1ID == "" || len(runs[i].WarmTop1Metadata) == 0 {
continue // warm pass didn't complete for this query
}
rejudgeAttempted++
result := matrixResult{
ID: runs[i].WarmTop1ID,
Distance: runs[i].WarmTop1Distance,
Metadata: runs[i].WarmTop1Metadata,
}
warmRating := judgeRate(hc, *ollama, *judge, runs[i].Query, result)
runs[i].WarmTop1Rating = &warmRating
coldRating := 0
if len(runs[i].ColdRatings) > 0 {
coldRating = runs[i].ColdRatings[0]
}
switch {
case warmRating > coldRating:
qualityLifted++
case warmRating < coldRating:
qualityRegressed++
default:
qualityNeutral++
}
}
}
sum := summary{
Total: len(runs),
WithDiscovery: withDiscovery,
LiftCount: liftCount,
NoChange: noChange,
MeanTop1DeltaDistance: 0,
PlaybookBoostedTotal: playbookBoostedTotal,
ParaphraseAttempted: paraphraseAttempted,
ParaphraseTop1Lifts: paraphraseTop1Lifts,
ParaphraseAnyRankHits: paraphraseAnyRankHits,
RejudgeAttempted: rejudgeAttempted,
QualityLifted: qualityLifted,
QualityNeutral: qualityNeutral,
QualityRegressed: qualityRegressed,
GeneratedAt: time.Now().UTC(),
}
if len(runs) > 0 {
sum.MeanTop1DeltaDistance = totalDelta / float32(len(runs))
}
if err := writeJSON(*out, runs, sum); err != nil {
log.Fatalf("write %s: %v", *out, err)
}
if *withParaphrase || *withRejudge {
log.Printf("[lift] DONE — %d queries · discovery=%d · lift=%d · boosted=%d · meanΔdist=%.4f · paraphrase=%d/%d→top1 · quality=lifted%d/neutral%d/regressed%d",
sum.Total, sum.WithDiscovery, sum.LiftCount, sum.PlaybookBoostedTotal, sum.MeanTop1DeltaDistance,
sum.ParaphraseTop1Lifts, sum.ParaphraseAttempted,
sum.QualityLifted, sum.QualityNeutral, sum.QualityRegressed)
} else {
log.Printf("[lift] DONE — %d queries · discovery=%d · lift=%d · boosted=%d · meanΔdist=%.4f",
sum.Total, sum.WithDiscovery, sum.LiftCount, sum.PlaybookBoostedTotal, sum.MeanTop1DeltaDistance)
}
log.Printf("[lift] results → %s", *out)
}
// generateParaphrase asks the judge model to rephrase a staffing query
// while preserving intent. Used in the paraphrase pass to test whether
// the playbook's recorded embedding survives wording variation.
//
// temperature=0.5 — enough variance to make the paraphrase actually
// different, but not so high that it drifts off the staffing domain.
// format=json + a tight schema makes parsing deterministic.
func generateParaphrase(hc *http.Client, ollamaURL, model, query string) (string, error) {
system := `You rephrase staffing queries while preserving intent.
Output JSON only: {"paraphrase": "<rephrased query>"}.
Rules:
- Keep the same role, certifications, geography, and constraints.
- Vary the wording (synonyms, reordered clauses, different sentence shape).
- Do NOT add or remove requirements.
- Do NOT explain — just emit the JSON.`
body := map[string]any{
"model": model,
"stream": false,
"format": "json",
"messages": []map[string]string{
{"role": "system", "content": system},
{"role": "user", "content": query},
},
"options": map[string]any{"temperature": 0.5},
}
bs, _ := json.Marshal(body)
req, _ := http.NewRequest("POST", ollamaURL+"/api/chat", bytes.NewReader(bs))
req.Header.Set("Content-Type", "application/json")
resp, err := hc.Do(req)
if err != nil {
return "", err
}
defer resp.Body.Close()
if resp.StatusCode/100 != 2 {
return "", fmt.Errorf("ollama chat: HTTP %d", resp.StatusCode)
}
rb, _ := io.ReadAll(resp.Body)
var ollamaResp struct {
Message struct {
Content string `json:"content"`
} `json:"message"`
}
if err := json.Unmarshal(rb, &ollamaResp); err != nil {
return "", fmt.Errorf("decode ollama envelope: %w", err)
}
var out struct {
Paraphrase string `json:"paraphrase"`
}
if err := json.Unmarshal([]byte(ollamaResp.Message.Content), &out); err != nil {
return "", fmt.Errorf("decode paraphrase JSON: %w (content=%q)", err, ollamaResp.Message.Content)
}
if strings.TrimSpace(out.Paraphrase) == "" {
return "", fmt.Errorf("empty paraphrase (content=%q)", ollamaResp.Message.Content)
}
return out.Paraphrase, nil
}
func loadQueries(path string) ([]string, error) {
bs, err := os.ReadFile(path)
if err != nil {
return nil, err
}
var out []string
for _, line := range strings.Split(string(bs), "\n") {
s := strings.TrimSpace(line)
if s == "" || strings.HasPrefix(s, "#") {
continue
}
out = append(out, s)
}
return out, nil
}
func matrixSearch(hc *http.Client, gw, query string, corpora []string, k int, usePlaybook bool) (*matrixResp, error) {
body := map[string]any{
"query_text": query,
"corpora": corpora,
"k": k,
"per_corpus_k": k,
"use_playbook": usePlaybook,
}
// Role extraction (real_001 + real_003 cross-role bleed fixes).
// Goes through globalRoleExtractor so shorthand-style queries get
// LLM fallback when -llm-role-extract is on. Empty result leaves
// the gate disabled — harness preserves current behavior on
// truly-unparseable shapes.
if role := globalRoleExtractor.extract(query); role != "" {
body["query_role"] = role
}
bs, _ := json.Marshal(body)
req, _ := http.NewRequest("POST", gw+"/v1/matrix/search", bytes.NewReader(bs))
req.Header.Set("Content-Type", "application/json")
resp, err := hc.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
rb, _ := io.ReadAll(resp.Body)
if resp.StatusCode/100 != 2 {
return nil, fmt.Errorf("status %d: %s", resp.StatusCode, string(rb))
}
var out matrixResp
if err := json.Unmarshal(rb, &out); err != nil {
return nil, fmt.Errorf("unmarshal: %w (body=%s)", err, abbrev(string(rb), 200))
}
return &out, nil
}
func playbookRecord(hc *http.Client, gw, query, answerID, answerCorpus string, score float64) error {
body := map[string]any{
"query_text": query,
"answer_id": answerID,
"answer_corpus": answerCorpus,
"score": score,
"tags": []string{"reality-test", "playbook-lift-001"},
}
// Same extractor as matrixSearch — shared cache, same LLM fallback
// rules. Recorded role lets retrieve-time gate fire on cross-role
// queries (real_001 + real_003 fixes).
if role := globalRoleExtractor.extract(query); role != "" {
body["role"] = role
}
bs, _ := json.Marshal(body)
req, _ := http.NewRequest("POST", gw+"/v1/matrix/playbooks/record", bytes.NewReader(bs))
req.Header.Set("Content-Type", "application/json")
resp, err := hc.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode/100 != 2 {
rb, _ := io.ReadAll(resp.Body)
return fmt.Errorf("status %d: %s", resp.StatusCode, string(rb))
}
return nil
}
// judgeRate calls Ollama's /api/chat directly and asks for a 1-5 rating
// of the result against the query. Returns 0 on any failure (treated as
// "couldn't judge, exclude from best-of consideration").
func judgeRate(hc *http.Client, ollamaURL, model, query string, r matrixResult) int {
system := `You rate retrieval results for a staffing co-pilot.
Rate the result 1-5 against the query:
5 = perfect match (this person/job IS what was asked for)
4 = strong match (right field, right level, minor mismatches)
3 = adjacent match (related field or partial overlap)
2 = weak/tangential match
1 = irrelevant
Output JSON only: {"rating": N, "reason": "<one sentence>"}.`
user := fmt.Sprintf("Query: %q\n\nResult corpus: %s\nResult ID: %s\nResult metadata:\n%s",
query, r.Corpus, r.ID, string(r.Metadata))
body := map[string]any{
"model": model,
"stream": false,
"format": "json",
"messages": []map[string]string{
{"role": "system", "content": system},
{"role": "user", "content": user},
},
"options": map[string]any{"temperature": 0},
}
bs, _ := json.Marshal(body)
req, _ := http.NewRequest("POST", ollamaURL+"/api/chat", bytes.NewReader(bs))
req.Header.Set("Content-Type", "application/json")
resp, err := hc.Do(req)
if err != nil {
return 0
}
defer resp.Body.Close()
if resp.StatusCode/100 != 2 {
return 0
}
rb, _ := io.ReadAll(resp.Body)
var ollamaResp struct {
Message struct {
Content string `json:"content"`
} `json:"message"`
}
if err := json.Unmarshal(rb, &ollamaResp); err != nil {
return 0
}
var v judgeVerdict
if err := json.Unmarshal([]byte(ollamaResp.Message.Content), &v); err != nil {
return 0
}
if v.Rating < 1 || v.Rating > 5 {
return 0
}
return v.Rating
}
func writeJSON(path string, runs []queryRun, sum summary) error {
if err := os.MkdirAll(filepath_dir(path), 0o755); err != nil {
return err
}
out := struct {
Summary summary `json:"summary"`
Runs []queryRun `json:"runs"`
}{Summary: sum, Runs: runs}
bs, err := json.MarshalIndent(out, "", " ")
if err != nil {
return err
}
return os.WriteFile(path, bs, 0o644)
}
// filepath_dir is a minimal, POSIX-only stand-in for path/filepath's
// Dir, kept local so the driver doesn't add an import for one call site.
func filepath_dir(p string) string {
if i := strings.LastIndex(p, "/"); i >= 0 {
return p[:i]
}
return "."
}
func abbrev(s string, n int) string {
if len(s) <= n {
return s
}
// Truncate on a rune boundary so a multi-byte character isn't split
// mid-sequence before the ellipsis.
rs := []rune(s)
if len(rs) <= n {
return s
}
return string(rs[:n]) + "…"
}
func appendNote(existing, add string) string {
if existing == "" {
return add
}
return existing + "; " + add
}
// sort is imported but not otherwise used yet; this blank reference
// keeps the build green (an unused import is a compile error in Go,
// not a warning). Harmless for now.
var _ = sort.Slice
// globalRoleExtractor is set in main() and read by matrixSearch +
// playbookRecord. nil-safe: a nil receiver on roleExtractor.extract
// degrades to regex-only behavior, so the field is checked once at
// startup and never re-validated per call.
var globalRoleExtractor *roleExtractor
// roleExtractor combines the fast-path regex with an optional LLM
// fallback so callers can pay for shorthand coverage only when they
// need it. Per real_003_findings.md: shorthand-style queries
// ("N {role} {city} {state} {at} {client}") have no separator
// between role and city, so a regex can't reliably extract — but a
// small LLM with a tight format=json prompt can. Cost is ~1-3s per
// extraction on local qwen2.5; cached per-process so paraphrase
// passes don't pay twice for the same query.
//
// Empty cache + off-by-default LLM = the existing real_003b behavior
// is unchanged unless callers explicitly enable LLM mode.
type roleExtractor struct {
hc *http.Client
ollamaURL string
model string // "" disables LLM fallback
cache map[string]string
}
// extract returns the role for a query. Tries regex first (fast,
// deterministic); if regex misses and LLM is configured, calls Ollama;
// caches the result either way. Returns "" when both miss — the
// caller's gate stays disabled, preserving current behavior on
// truly-unparseable shapes.
func (r *roleExtractor) extract(query string) string {
if r == nil {
return extractRoleFromNeed(query)
}
if cached, ok := r.cache[query]; ok {
return cached
}
role := extractRoleFromNeed(query)
if role == "" && r.model != "" {
if v, err := extractRoleViaLLM(r.hc, r.ollamaURL, r.model, query); err == nil {
role = v
} else {
log.Printf("[lift] llm-role-extract failed (%v) — falling back to empty for %q", err, abbrev(query, 60))
}
}
if r.cache == nil {
r.cache = make(map[string]string)
}
r.cache[query] = role
return role
}
// extractRoleViaLLM asks the Ollama-shape /api/chat to identify the
// staffing role in a free-form query. Tight schema + format=json so
// parsing is deterministic. Empty role string is a valid response —
// the model may decline to extract when the query has no clean role
// (e.g. lift-suite multi-constraint queries).
func extractRoleViaLLM(hc *http.Client, ollamaURL, model, query string) (string, error) {
system := `You are a staffing-domain role extractor.
Output JSON only: {"role": "<role string or empty>"}.
Rules:
- Identify the staffing role (job title) the query is asking for.
- Return only the role noun phrase — e.g. "Forklift Operator", "Pickers".
- Preserve plurality from the query (don't singularize).
- Strip qualifiers ("OSHA-30 certified") — we want the bare role.
- If the query has no clean staffing role, return "" (empty string).
- Do NOT explain. Just emit the JSON.`
body, _ := json.Marshal(map[string]any{
"model": model,
"stream": false,
"format": "json",
"messages": []map[string]string{
{"role": "system", "content": system},
{"role": "user", "content": query},
},
"options": map[string]any{"temperature": 0.0},
})
req, _ := http.NewRequest("POST", ollamaURL+"/api/chat", bytes.NewReader(body))
req.Header.Set("Content-Type", "application/json")
resp, err := hc.Do(req)
if err != nil {
return "", err
}
defer resp.Body.Close()
if resp.StatusCode/100 != 2 {
return "", fmt.Errorf("ollama chat: HTTP %d", resp.StatusCode)
}
rb, _ := io.ReadAll(resp.Body)
var ollamaResp struct {
Message struct {
Content string `json:"content"`
} `json:"message"`
}
if err := json.Unmarshal(rb, &ollamaResp); err != nil {
return "", err
}
var out struct {
Role string `json:"role"`
}
if err := json.Unmarshal([]byte(ollamaResp.Message.Content), &out); err != nil {
return "", fmt.Errorf("decode role: %w (content=%q)", err, ollamaResp.Message.Content)
}
return strings.TrimSpace(out.Role), nil
}
// extractRoleFromNeed pulls the role out of staffing-shape queries.
// Returns "" for any query that doesn't match a known anchor pattern
// (free-form lift-suite queries + shorthand-style fall back to empty,
// leaving the cross-role gate disabled).
//
// Patterns covered (in priority order):
// need: "Need N {role}{s} in {city} ..."
// client_first: "{client} needs N {role}{s} in {city} ..."
// looking: "Looking for N {role}{s} at {client} in {city} ..."
//
// Pattern explicitly NOT covered:
// shorthand: "N {role}{s} {city} {state} {at} {client}"
// Because there's no separator between role and city in shorthand
// ("Forklift Operator Detroit" is shape-indistinguishable from
// "Forklift" + "Operator Detroit"), a regex can't reliably extract
// role here. real_003 confirmed shorthand-vs-shorthand cross-role
// bleed: a CNC Operator shorthand recording leaked w-2404 onto a
// Forklift Operator shorthand query within the same Beacon Freight
// Detroit cluster. Closing that requires either an LLM extractor at
// record+query time or a known-cities lookup table.
//
// Lives here (not in internal/matrix) because role extraction from
// free-form text is a caller concern; matrix only consumes the
// already-resolved Role string. The roleExtractor's LLM fallback now
// layers on top of this regex path without changing matrix's gate logic.
func extractRoleFromNeed(query string) string {
for _, re := range roleExtractRegexes {
if m := re.FindStringSubmatch(query); len(m) >= 2 {
return strings.TrimSpace(m[1])
}
}
return ""
}
// roleExtractRegexes is ordered most-specific first: the anchored
// "Need" and "Looking for" prefixes run before the loose client_first
// pattern, whose leading .+? would otherwise swallow any query that
// happens to contain the word "needs". Compiled once at package init
// via MustCompile.
var roleExtractRegexes = []*regexp.Regexp{
// "Need N {role} in ..." — the original real_001 form.
regexp.MustCompile(`(?i)^Need\s+\d+\s+(.+?)\s+in\s+`),
// "Looking for N {role} at ..." — the looking style. Anchor on
// "at" because the role is followed by client (preceded by "at"),
// not by city directly.
regexp.MustCompile(`(?i)^Looking\s+for\s+\d+\s+(.+?)\s+at\s+`),
// "{client} needs N {role} in ..." — the client_first style.
// Greedy on the client side via .+?, then "needs", then count,
// then role, then "in".
regexp.MustCompile(`(?i)^.+?\s+needs\s+\d+\s+(.+?)\s+in\s+`),
}
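For reference, the three covered anchors and the shorthand miss behave like this in isolation — a standalone copy of the patterns above, with illustrative query strings:

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// Standalone copy of the three anchor patterns above.
var anchors = []*regexp.Regexp{
	regexp.MustCompile(`(?i)^Need\s+\d+\s+(.+?)\s+in\s+`),          // need
	regexp.MustCompile(`(?i)^Looking\s+for\s+\d+\s+(.+?)\s+at\s+`), // looking
	regexp.MustCompile(`(?i)^.+?\s+needs\s+\d+\s+(.+?)\s+in\s+`),   // client_first
}

func roleOf(q string) string {
	for _, re := range anchors {
		if m := re.FindStringSubmatch(q); len(m) >= 2 {
			return strings.TrimSpace(m[1])
		}
	}
	return "" // shorthand and free-form shapes fall through
}

func main() {
	fmt.Println(roleOf("Need 5 Forklift Operators in Detroit MI"))           // Forklift Operators
	fmt.Println(roleOf("Looking for 3 Pickers at Beacon Freight in Toledo")) // Pickers
	fmt.Println(roleOf("Beacon Freight needs 2 CNC Operators in Detroit"))   // CNC Operators
	// Shorthand: no separator between role and city, so no anchor fires.
	fmt.Println(roleOf("5 Forklift Operators Detroit MI at Beacon Freight") == "") // true
}
```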