// Playbook-lift reality test driver. Two-pass design: // // Pass 1 (cold): for each query → matrix.search use_playbook=false → // LLM judge rates top-K → record playbook entry pointing // at the highest-rated result (which may NOT be top-1 // by distance — that's the discovery worth boosting). // // Pass 2 (warm): same queries → use_playbook=true → measure how the // ranking shifted. // // Lift = real if pass-2 brings the LLM-judged-best result into top-1 // more often than pass-1. If lift ≈ 0, the playbook is just confirming // what cosine already said and the 5-loop thesis is unproven. // // Honest about what this measures: with no human-labeled ground truth, // the LLM judge IS the ground truth proxy. That's the small-model // pipeline thesis itself — the same model class that runs the inner // loop is also what we trust to evaluate it. If you don't trust the // judge, the lift number is meaningless; that's a separate problem // for ground-truth labeling. // // Usage (driven by scripts/playbook_lift.sh): // playbook_lift -gateway http://127.0.0.1:3110 \ // -queries tests/reality/playbook_lift_queries.txt \ // -judge qwen3.5:latest \ // -corpora workers,candidates \ // -k 10 \ // -out reports/reality-tests/playbook_lift_001.json package main import ( "bytes" "encoding/json" "flag" "fmt" "io" "log" "net/http" "os" "sort" "strings" "time" "git.agentview.dev/profit/golangLAKEHOUSE/internal/shared" ) type matrixResult struct { ID string `json:"id"` Distance float32 `json:"distance"` Corpus string `json:"corpus"` Metadata json.RawMessage `json:"metadata,omitempty"` } type matrixResp struct { Results []matrixResult `json:"results"` PerCorpusCounts map[string]int `json:"per_corpus_counts"` PlaybookBoosted int `json:"playbook_boosted,omitempty"` } type judgeVerdict struct { Rating int `json:"rating"` Reason string `json:"reason"` } type queryRun struct { Query string `json:"query"` ColdTop1ID string `json:"cold_top1_id"` ColdTop1Distance float32 `json:"cold_top1_distance"` ColdJudgeBestID string `json:"cold_judge_best_id"` ColdJudgeBestRank int `json:"cold_judge_best_rank"` ColdJudgeBestRating int `json:"cold_judge_best_rating"` ColdRatings []int `json:"cold_ratings"` PlaybookRecorded bool `json:"playbook_recorded"` PlaybookID string `json:"playbook_target_id,omitempty"` WarmTop1ID string `json:"warm_top1_id"` WarmTop1Distance float32 `json:"warm_top1_distance"` WarmBoostedCount int `json:"warm_boosted_count"` WarmJudgeBestRank int `json:"warm_judge_best_rank"` Lift bool `json:"lift"` // judge-best was below top-1 cold, but top-1 warm Note string `json:"note,omitempty"` } type summary struct { Total int `json:"total"` WithDiscovery int `json:"with_discovery"` // judge-best != cold top-1 LiftCount int `json:"lift_count"` // top-1 changed warm→ judge-best NoChange int `json:"no_change"` MeanTop1DeltaDistance float32 `json:"mean_top1_delta_distance"` PlaybookBoostedTotal int `json:"playbook_boosted_total"` GeneratedAt time.Time `json:"generated_at"` } func main() { configPath := flag.String("config", "lakehouse.toml", "path to TOML config (provides judge default from [models].local_judge)") gw := flag.String("gateway", "http://127.0.0.1:3110", "Go gateway base URL") ollama := flag.String("ollama", "http://127.0.0.1:11434", "Ollama base URL for LLM judge") queries := flag.String("queries", "tests/reality/playbook_lift_queries.txt", "query corpus path") corporaCSV := flag.String("corpora", "workers,candidates", "comma-separated matrix corpora") // Empty default — resolved below from (priority): flag > env > config > hardcoded. judge := flag.String("judge", "", "Ollama model for relevance judging (empty = read from config [models].local_judge)") k := flag.Int("k", 10, "top-k from matrix.search per pass") out := flag.String("out", "reports/reality-tests/playbook_lift_001.json", "output JSONL path") flag.Parse() // Judge resolution priority: explicit flag > $JUDGE_MODEL env > // cfg.Models.LocalJudge > hardcoded fallback. Phase 3 wired this // up so model bumps land in lakehouse.toml, not in this driver. if *judge == "" { if env := strings.TrimSpace(os.Getenv("JUDGE_MODEL")); env != "" { *judge = env } else if cfg, err := shared.LoadConfig(*configPath); err == nil && cfg.Models.LocalJudge != "" { *judge = cfg.Models.LocalJudge } else { *judge = "qwen3.5:latest" log.Printf("[lift] warn: no judge model from flag/env/config; falling back to %q", *judge) } } corpora := strings.Split(*corporaCSV, ",") qs, err := loadQueries(*queries) if err != nil { log.Fatalf("load queries: %v", err) } if len(qs) == 0 { log.Fatalf("no queries in %s", *queries) } log.Printf("[lift] %d queries · corpora=%v · k=%d · judge=%s", len(qs), corpora, *k, *judge) hc := &http.Client{Timeout: 60 * time.Second} runs := make([]queryRun, 0, len(qs)) totalDelta := float32(0) playbookBoostedTotal := 0 withDiscovery := 0 liftCount := 0 noChange := 0 // Pass 1 (cold) + record playbooks based on judge verdicts. for i, q := range qs { log.Printf("[lift] (%d/%d cold) %s", i+1, len(qs), abbrev(q, 60)) resp, err := matrixSearch(hc, *gw, q, corpora, *k, false) if err != nil { log.Printf(" cold search failed: %v — skipping", err) continue } if len(resp.Results) == 0 { log.Printf(" cold returned 0 results — skipping") continue } ratings := make([]int, len(resp.Results)) bestRank := 0 bestRating := -1 for j, r := range resp.Results { rating := judgeRate(hc, *ollama, *judge, q, r) ratings[j] = rating if rating > bestRating { bestRating = rating bestRank = j } } run := queryRun{ Query: q, ColdTop1ID: resp.Results[0].ID, ColdTop1Distance: resp.Results[0].Distance, ColdJudgeBestID: resp.Results[bestRank].ID, ColdJudgeBestRank: bestRank, ColdJudgeBestRating: bestRating, ColdRatings: ratings, } // Record a playbook only if the judge best is not already top-1 // (otherwise we're boosting something cosine already crowned). if bestRank > 0 && bestRating >= 4 { withDiscovery++ if err := playbookRecord(hc, *gw, q, resp.Results[bestRank].ID, resp.Results[bestRank].Corpus, 1.0); err != nil { log.Printf(" playbook record failed: %v", err) run.Note = "playbook record failed: " + err.Error() } else { run.PlaybookRecorded = true run.PlaybookID = resp.Results[bestRank].ID } } else if bestRank == 0 { run.Note = "judge-best already top-1 cold — no playbook needed" } else { run.Note = fmt.Sprintf("judge-best rating %d below threshold (4) — no playbook", bestRating) } runs = append(runs, run) } // Pass 2 (warm) on the same queries. for i := range runs { q := runs[i].Query log.Printf("[lift] (%d/%d warm) %s", i+1, len(runs), abbrev(q, 60)) resp, err := matrixSearch(hc, *gw, q, corpora, *k, true) if err != nil || len(resp.Results) == 0 { runs[i].Note = appendNote(runs[i].Note, fmt.Sprintf("warm search failed: %v", err)) continue } runs[i].WarmTop1ID = resp.Results[0].ID runs[i].WarmTop1Distance = resp.Results[0].Distance runs[i].WarmBoostedCount = resp.PlaybookBoosted playbookBoostedTotal += resp.PlaybookBoosted // Find where the cold judge-best ID landed in the warm ranking. warmRank := -1 for j, r := range resp.Results { if r.ID == runs[i].ColdJudgeBestID { warmRank = j break } } runs[i].WarmJudgeBestRank = warmRank switch { case runs[i].PlaybookRecorded && warmRank == 0: runs[i].Lift = true liftCount++ case !runs[i].PlaybookRecorded: noChange++ default: noChange++ } totalDelta += runs[i].WarmTop1Distance - runs[i].ColdTop1Distance } sum := summary{ Total: len(runs), WithDiscovery: withDiscovery, LiftCount: liftCount, NoChange: noChange, MeanTop1DeltaDistance: 0, PlaybookBoostedTotal: playbookBoostedTotal, GeneratedAt: time.Now().UTC(), } if len(runs) > 0 { sum.MeanTop1DeltaDistance = totalDelta / float32(len(runs)) } if err := writeJSON(*out, runs, sum); err != nil { log.Fatalf("write %s: %v", *out, err) } log.Printf("[lift] DONE — %d queries · discovery=%d · lift=%d · boosted=%d · meanΔdist=%.4f", sum.Total, sum.WithDiscovery, sum.LiftCount, sum.PlaybookBoostedTotal, sum.MeanTop1DeltaDistance) log.Printf("[lift] results → %s", *out) } func loadQueries(path string) ([]string, error) { bs, err := os.ReadFile(path) if err != nil { return nil, err } var out []string for _, line := range strings.Split(string(bs), "\n") { s := strings.TrimSpace(line) if s == "" || strings.HasPrefix(s, "#") { continue } out = append(out, s) } return out, nil } func matrixSearch(hc *http.Client, gw, query string, corpora []string, k int, usePlaybook bool) (*matrixResp, error) { body := map[string]any{ "query_text": query, "corpora": corpora, "k": k, "per_corpus_k": k, "use_playbook": usePlaybook, } bs, _ := json.Marshal(body) req, _ := http.NewRequest("POST", gw+"/v1/matrix/search", bytes.NewReader(bs)) req.Header.Set("Content-Type", "application/json") resp, err := hc.Do(req) if err != nil { return nil, err } defer resp.Body.Close() rb, _ := io.ReadAll(resp.Body) if resp.StatusCode/100 != 2 { return nil, fmt.Errorf("status %d: %s", resp.StatusCode, string(rb)) } var out matrixResp if err := json.Unmarshal(rb, &out); err != nil { return nil, fmt.Errorf("unmarshal: %w (body=%s)", err, abbrev(string(rb), 200)) } return &out, nil } func playbookRecord(hc *http.Client, gw, query, answerID, answerCorpus string, score float64) error { body := map[string]any{ "query": query, "answer_id": answerID, "answer_corpus": answerCorpus, "score": score, "tags": []string{"reality-test", "playbook-lift-001"}, } bs, _ := json.Marshal(body) req, _ := http.NewRequest("POST", gw+"/v1/matrix/playbooks/record", bytes.NewReader(bs)) req.Header.Set("Content-Type", "application/json") resp, err := hc.Do(req) if err != nil { return err } defer resp.Body.Close() if resp.StatusCode/100 != 2 { rb, _ := io.ReadAll(resp.Body) return fmt.Errorf("status %d: %s", resp.StatusCode, string(rb)) } return nil } // judgeRate calls Ollama's /api/chat directly and asks for a 1-5 rating // of the result against the query. Returns 0 on any failure (treated as // "couldn't judge, exclude from best-of consideration"). func judgeRate(hc *http.Client, ollamaURL, model, query string, r matrixResult) int { system := `You rate retrieval results for a staffing co-pilot. Rate the result 1-5 against the query: 5 = perfect match (this person/job IS what was asked for) 4 = strong match (right field, right level, minor mismatches) 3 = adjacent match (related field or partial overlap) 2 = weak/tangential match 1 = irrelevant Output JSON only: {"rating": N, "reason": ""}.` user := fmt.Sprintf("Query: %q\n\nResult corpus: %s\nResult ID: %s\nResult metadata:\n%s", query, r.Corpus, r.ID, string(r.Metadata)) body := map[string]any{ "model": model, "stream": false, "format": "json", "messages": []map[string]string{ {"role": "system", "content": system}, {"role": "user", "content": user}, }, "options": map[string]any{"temperature": 0}, } bs, _ := json.Marshal(body) req, _ := http.NewRequest("POST", ollamaURL+"/api/chat", bytes.NewReader(bs)) req.Header.Set("Content-Type", "application/json") resp, err := hc.Do(req) if err != nil { return 0 } defer resp.Body.Close() if resp.StatusCode/100 != 2 { return 0 } rb, _ := io.ReadAll(resp.Body) var ollamaResp struct { Message struct { Content string `json:"content"` } `json:"message"` } if err := json.Unmarshal(rb, &ollamaResp); err != nil { return 0 } var v judgeVerdict if err := json.Unmarshal([]byte(ollamaResp.Message.Content), &v); err != nil { return 0 } if v.Rating < 1 || v.Rating > 5 { return 0 } return v.Rating } func writeJSON(path string, runs []queryRun, sum summary) error { if err := os.MkdirAll(filepath_dir(path), 0o755); err != nil { return err } out := struct { Summary summary `json:"summary"` Runs []queryRun `json:"runs"` }{Summary: sum, Runs: runs} bs, err := json.MarshalIndent(out, "", " ") if err != nil { return err } return os.WriteFile(path, bs, 0o644) } func filepath_dir(p string) string { if i := strings.LastIndex(p, "/"); i >= 0 { return p[:i] } return "." } func abbrev(s string, n int) string { if len(s) <= n { return s } return s[:n] + "…" } func appendNote(existing, add string) string { if existing == "" { return add } return existing + "; " + add } // Suppress unused-import warning when sort isn't used in a future // refactor; harmless for now. var _ = sort.Slice