// Playbook-lift reality test driver. Two-pass design: // // Pass 1 (cold): for each query → matrix.search use_playbook=false → // LLM judge rates top-K → record playbook entry pointing // at the highest-rated result (which may NOT be top-1 // by distance — that's the discovery worth boosting). // // Pass 2 (warm): same queries → use_playbook=true → measure how the // ranking shifted. // // Lift = real if pass-2 brings the LLM-judged-best result into top-1 // more often than pass-1. If lift ≈ 0, the playbook is just confirming // what cosine already said and the 5-loop thesis is unproven. // // Honest about what this measures: with no human-labeled ground truth, // the LLM judge IS the ground truth proxy. That's the small-model // pipeline thesis itself — the same model class that runs the inner // loop is also what we trust to evaluate it. If you don't trust the // judge, the lift number is meaningless; that's a separate problem // for ground-truth labeling. // // Usage (driven by scripts/playbook_lift.sh): // playbook_lift -gateway http://127.0.0.1:3110 \ // -queries tests/reality/playbook_lift_queries.txt \ // -judge qwen3.5:latest \ // -corpora workers,candidates \ // -k 10 \ // -out reports/reality-tests/playbook_lift_001.json package main import ( "bytes" "encoding/json" "flag" "fmt" "io" "log" "net/http" "os" "regexp" "sort" "strings" "time" "git.agentview.dev/profit/golangLAKEHOUSE/internal/shared" ) type matrixResult struct { ID string `json:"id"` Distance float32 `json:"distance"` Corpus string `json:"corpus"` Metadata json.RawMessage `json:"metadata,omitempty"` } type matrixResp struct { Results []matrixResult `json:"results"` PerCorpusCounts map[string]int `json:"per_corpus_counts"` PlaybookBoosted int `json:"playbook_boosted,omitempty"` } type judgeVerdict struct { Rating int `json:"rating"` Reason string `json:"reason"` } type queryRun struct { Query string `json:"query"` ColdTop1ID string `json:"cold_top1_id"` ColdTop1Distance float32 `json:"cold_top1_distance"` ColdJudgeBestID string `json:"cold_judge_best_id"` ColdJudgeBestRank int `json:"cold_judge_best_rank"` ColdJudgeBestRating int `json:"cold_judge_best_rating"` ColdRatings []int `json:"cold_ratings"` PlaybookRecorded bool `json:"playbook_recorded"` PlaybookID string `json:"playbook_target_id,omitempty"` WarmTop1ID string `json:"warm_top1_id"` WarmTop1Distance float32 `json:"warm_top1_distance"` WarmBoostedCount int `json:"warm_boosted_count"` WarmJudgeBestRank int `json:"warm_judge_best_rank"` // rank of cold judge-best in warm — NOT the warm pass's own judge-best WarmTop1Metadata json.RawMessage `json:"-"` // cached for Pass 4 rejudge; not emitted // WarmTop1Rating: only populated when --with-rejudge. Compare to // ColdRatings[0] (== cold top-1 rating) to measure quality lift. // *int so absence (no rejudge pass) and a 0-rating verdict are // distinguishable. WarmTop1Rating *int `json:"warm_top1_rating,omitempty"` Lift bool `json:"lift"` // judge-best was below top-1 cold, but top-1 warm // Paraphrase pass — only populated when --with-paraphrase. Tests // the playbook's actual learning property: does a recorded entry // for query Q help a similar-but-different query Q'? // // ParaphraseRecordedRank semantics: // nil = paraphrase pass didn't run for this query (no playbook // was recorded in cold pass, so nothing to test) // 0 = recorded answer landed at top-1 // 1..K-1 = recorded answer present in top-K at that rank // -1 = recorded answer absent from top-K // Pointer (not int) so nil and rank-0 are distinguishable in JSON. ParaphraseQuery string `json:"paraphrase_query,omitempty"` ParaphraseTop1ID string `json:"paraphrase_top1_id,omitempty"` ParaphraseRecordedRank *int `json:"paraphrase_recorded_rank,omitempty"` ParaphraseLift bool `json:"paraphrase_lift,omitempty"` // recorded answer at rank 0 for paraphrase Note string `json:"note,omitempty"` } type summary struct { Total int `json:"total"` WithDiscovery int `json:"with_discovery"` // judge-best != cold top-1 LiftCount int `json:"lift_count"` // top-1 changed warm→ judge-best NoChange int `json:"no_change"` MeanTop1DeltaDistance float32 `json:"mean_top1_delta_distance"` PlaybookBoostedTotal int `json:"playbook_boosted_total"` // Paraphrase pass aggregates — only populated when --with-paraphrase. ParaphraseAttempted int `json:"paraphrase_attempted,omitempty"` // queries with playbook recorded that ran a paraphrase ParaphraseTop1Lifts int `json:"paraphrase_top1_lifts,omitempty"` // recorded answer surfaced at rank 0 ParaphraseAnyRankHits int `json:"paraphrase_any_rank_hits,omitempty"` // recorded answer surfaced at any rank in top-K // Re-judge pass aggregates — only populated when --with-rejudge. // Measures QUALITY lift (warm top-1 rating vs cold top-1 rating) // rather than rank-of-cold-judge-best lift. The latter conflates // "warm surfaced a different but equally-good result" with "warm // shuffled ranks but the answer was the same"; quality lift // disambiguates them. RejudgeAttempted int `json:"rejudge_attempted,omitempty"` // queries that ran the rejudge pass QualityLifted int `json:"quality_lifted,omitempty"` // warm-top-1 rating > cold-top-1 rating QualityNeutral int `json:"quality_neutral,omitempty"` // ratings equal (could be same or different item) QualityRegressed int `json:"quality_regressed,omitempty"` // warm-top-1 rating < cold-top-1 rating GeneratedAt time.Time `json:"generated_at"` } func main() { configPath := flag.String("config", "lakehouse.toml", "path to TOML config (provides judge default from [models].local_judge)") gw := flag.String("gateway", "http://127.0.0.1:3110", "Go gateway base URL") ollama := flag.String("ollama", "http://127.0.0.1:11434", "Ollama base URL for LLM judge") queries := flag.String("queries", "tests/reality/playbook_lift_queries.txt", "query corpus path") corporaCSV := flag.String("corpora", "workers,candidates", "comma-separated matrix corpora") // Empty default — resolved below from (priority): flag > env > config > hardcoded. judge := flag.String("judge", "", "Ollama model for relevance judging (empty = read from config [models].local_judge)") k := flag.Int("k", 10, "top-k from matrix.search per pass") out := flag.String("out", "reports/reality-tests/playbook_lift_001.json", "output JSONL path") withParaphrase := flag.Bool("with-paraphrase", false, "after warm pass, generate a paraphrase via the judge model and re-query with playbook=true to test the learning property") withRejudge := flag.Bool("with-rejudge", false, "after warm pass, judge warm top-1 to measure QUALITY lift (vs cold top-1 rating), not just rank-of-cold-judge-best") llmRoleExtract := flag.Bool("llm-role-extract", false, "fall back to LLM (qwen2.5 format=json) when the regex extractor returns empty — closes the shorthand-style cross-role bleed surfaced in real_003 at the cost of ~1-3s/query") llmRoleModel := flag.String("llm-role-model", "qwen2.5:latest", "Ollama model used for LLM role extraction; ignored when -llm-role-extract is off") flag.Parse() // Judge resolution priority: explicit flag > $JUDGE_MODEL env > // cfg.Models.LocalJudge > hardcoded fallback. Phase 3 wired this // up so model bumps land in lakehouse.toml, not in this driver. if *judge == "" { if env := strings.TrimSpace(os.Getenv("JUDGE_MODEL")); env != "" { *judge = env } else if cfg, err := shared.LoadConfig(*configPath); err == nil && cfg.Models.LocalJudge != "" { *judge = cfg.Models.LocalJudge } else { *judge = "qwen3.5:latest" log.Printf("[lift] warn: no judge model from flag/env/config; falling back to %q", *judge) } } corpora := strings.Split(*corporaCSV, ",") qs, err := loadQueries(*queries) if err != nil { log.Fatalf("load queries: %v", err) } if len(qs) == 0 { log.Fatalf("no queries in %s", *queries) } log.Printf("[lift] %d queries · corpora=%v · k=%d · judge=%s", len(qs), corpora, *k, *judge) hc := &http.Client{Timeout: 60 * time.Second} // Package-global role extractor — used by matrixSearch + // playbookRecord. Off-by-default so the existing harness behavior // (regex-only extraction) is unchanged unless -llm-role-extract. { mdl := "" if *llmRoleExtract { mdl = *llmRoleModel log.Printf("[lift] llm role extraction ON (model=%s) — shorthand queries get LLM fallback", mdl) } globalRoleExtractor = &roleExtractor{ hc: hc, ollamaURL: *ollama, model: mdl, } } runs := make([]queryRun, 0, len(qs)) totalDelta := float32(0) playbookBoostedTotal := 0 withDiscovery := 0 liftCount := 0 noChange := 0 // Pass 1 (cold) + record playbooks based on judge verdicts. for i, q := range qs { log.Printf("[lift] (%d/%d cold) %s", i+1, len(qs), abbrev(q, 60)) resp, err := matrixSearch(hc, *gw, q, corpora, *k, false) if err != nil { log.Printf(" cold search failed: %v — skipping", err) continue } if len(resp.Results) == 0 { log.Printf(" cold returned 0 results — skipping") continue } ratings := make([]int, len(resp.Results)) bestRank := 0 bestRating := -1 for j, r := range resp.Results { rating := judgeRate(hc, *ollama, *judge, q, r) ratings[j] = rating if rating > bestRating { bestRating = rating bestRank = j } } run := queryRun{ Query: q, ColdTop1ID: resp.Results[0].ID, ColdTop1Distance: resp.Results[0].Distance, ColdJudgeBestID: resp.Results[bestRank].ID, ColdJudgeBestRank: bestRank, ColdJudgeBestRating: bestRating, ColdRatings: ratings, } // Record a playbook only if the judge best is not already top-1 // (otherwise we're boosting something cosine already crowned). if bestRank > 0 && bestRating >= 4 { withDiscovery++ if err := playbookRecord(hc, *gw, q, resp.Results[bestRank].ID, resp.Results[bestRank].Corpus, 1.0); err != nil { log.Printf(" playbook record failed: %v", err) run.Note = "playbook record failed: " + err.Error() } else { run.PlaybookRecorded = true run.PlaybookID = resp.Results[bestRank].ID } } else if bestRank == 0 { run.Note = "judge-best already top-1 cold — no playbook needed" } else { run.Note = fmt.Sprintf("judge-best rating %d below threshold (4) — no playbook", bestRating) } runs = append(runs, run) } // Pass 2 (warm) on the same queries. for i := range runs { q := runs[i].Query log.Printf("[lift] (%d/%d warm) %s", i+1, len(runs), abbrev(q, 60)) resp, err := matrixSearch(hc, *gw, q, corpora, *k, true) if err != nil || len(resp.Results) == 0 { runs[i].Note = appendNote(runs[i].Note, fmt.Sprintf("warm search failed: %v", err)) continue } runs[i].WarmTop1ID = resp.Results[0].ID runs[i].WarmTop1Distance = resp.Results[0].Distance runs[i].WarmTop1Metadata = resp.Results[0].Metadata // cache for Pass 4 rejudge runs[i].WarmBoostedCount = resp.PlaybookBoosted playbookBoostedTotal += resp.PlaybookBoosted // Find where the cold judge-best ID landed in the warm ranking. warmRank := -1 for j, r := range resp.Results { if r.ID == runs[i].ColdJudgeBestID { warmRank = j break } } runs[i].WarmJudgeBestRank = warmRank switch { case runs[i].PlaybookRecorded && warmRank == 0: runs[i].Lift = true liftCount++ case !runs[i].PlaybookRecorded: noChange++ default: noChange++ } totalDelta += runs[i].WarmTop1Distance - runs[i].ColdTop1Distance } // Pass 3 (paraphrase) — opt-in via --with-paraphrase. For each // query where a playbook was recorded in Pass 1, generate a // paraphrase via the judge model and run it through warm // matrix.search. The expectation: if the playbook's learning // property holds (cosine on embed(paraphrase) finds the recorded // embed(query) within DefaultPlaybookMaxDistance), the recorded // answer should appear at top-1 for the paraphrase too. This is // the claim from the report's caveat #3 that v1 didn't test. paraphraseAttempted := 0 paraphraseTop1Lifts := 0 paraphraseAnyRankHits := 0 if *withParaphrase { log.Printf("[lift] paraphrase pass: testing playbook learning property") for i := range runs { if !runs[i].PlaybookRecorded { continue } paraphraseAttempted++ paraphrase, err := generateParaphrase(hc, *ollama, *judge, runs[i].Query) if err != nil { log.Printf(" (%d) paraphrase generation failed: %v", i+1, err) runs[i].Note = appendNote(runs[i].Note, "paraphrase gen failed: "+err.Error()) continue } runs[i].ParaphraseQuery = paraphrase log.Printf("[lift] (%d/%d paraphrase) %s → %s", i+1, len(runs), abbrev(runs[i].Query, 40), abbrev(paraphrase, 40)) resp, err := matrixSearch(hc, *gw, paraphrase, corpora, *k, true) if err != nil || len(resp.Results) == 0 { runs[i].Note = appendNote(runs[i].Note, fmt.Sprintf("paraphrase search failed: %v", err)) missed := -1 runs[i].ParaphraseRecordedRank = &missed continue } runs[i].ParaphraseTop1ID = resp.Results[0].ID recordedRank := -1 for j, r := range resp.Results { if r.ID == runs[i].PlaybookID { recordedRank = j break } } runs[i].ParaphraseRecordedRank = &recordedRank if recordedRank == 0 { runs[i].ParaphraseLift = true paraphraseTop1Lifts++ paraphraseAnyRankHits++ } else if recordedRank > 0 { paraphraseAnyRankHits++ } } } // Pass 4 (warm-rejudge) — opt-in via --with-rejudge. Judge warm // top-1 against the same prompt as cold ratings, then compare to // cold top-1 rating. This measures QUALITY lift (did the playbook // produce a better candidate?) rather than just rank-of-cold-judge- // best lift (did the recorded answer move to top-1, even if cold's // top-1 was already good?). See STATE_OF_PLAY OPEN — added because // run #003's verbatim 2/6 didn't tell us whether Shape B was // surfacing better OR same-quality alternatives. rejudgeAttempted := 0 qualityLifted := 0 qualityNeutral := 0 qualityRegressed := 0 if *withRejudge { log.Printf("[lift] warm-rejudge pass: measuring quality lift (warm top-1 rating vs cold top-1 rating)") for i := range runs { if runs[i].WarmTop1ID == "" || len(runs[i].WarmTop1Metadata) == 0 { continue // warm pass didn't complete for this query } rejudgeAttempted++ result := matrixResult{ ID: runs[i].WarmTop1ID, Distance: runs[i].WarmTop1Distance, Metadata: runs[i].WarmTop1Metadata, } warmRating := judgeRate(hc, *ollama, *judge, runs[i].Query, result) runs[i].WarmTop1Rating = &warmRating coldRating := 0 if len(runs[i].ColdRatings) > 0 { coldRating = runs[i].ColdRatings[0] } switch { case warmRating > coldRating: qualityLifted++ case warmRating < coldRating: qualityRegressed++ default: qualityNeutral++ } } } sum := summary{ Total: len(runs), WithDiscovery: withDiscovery, LiftCount: liftCount, NoChange: noChange, MeanTop1DeltaDistance: 0, PlaybookBoostedTotal: playbookBoostedTotal, ParaphraseAttempted: paraphraseAttempted, ParaphraseTop1Lifts: paraphraseTop1Lifts, ParaphraseAnyRankHits: paraphraseAnyRankHits, RejudgeAttempted: rejudgeAttempted, QualityLifted: qualityLifted, QualityNeutral: qualityNeutral, QualityRegressed: qualityRegressed, GeneratedAt: time.Now().UTC(), } if len(runs) > 0 { sum.MeanTop1DeltaDistance = totalDelta / float32(len(runs)) } if err := writeJSON(*out, runs, sum); err != nil { log.Fatalf("write %s: %v", *out, err) } if *withParaphrase || *withRejudge { log.Printf("[lift] DONE — %d queries · discovery=%d · lift=%d · boosted=%d · meanΔdist=%.4f · paraphrase=%d/%d→top1 · quality=lifted%d/neutral%d/regressed%d", sum.Total, sum.WithDiscovery, sum.LiftCount, sum.PlaybookBoostedTotal, sum.MeanTop1DeltaDistance, sum.ParaphraseTop1Lifts, sum.ParaphraseAttempted, sum.QualityLifted, sum.QualityNeutral, sum.QualityRegressed) } else { log.Printf("[lift] DONE — %d queries · discovery=%d · lift=%d · boosted=%d · meanΔdist=%.4f", sum.Total, sum.WithDiscovery, sum.LiftCount, sum.PlaybookBoostedTotal, sum.MeanTop1DeltaDistance) } log.Printf("[lift] results → %s", *out) } // generateParaphrase asks the judge model to rephrase a staffing query // while preserving intent. Used in the paraphrase pass to test whether // the playbook's recorded embedding survives wording variation. // // temperature=0.5 — enough variance to make the paraphrase actually // different, but not so high that it drifts off the staffing domain. // format=json + a tight schema makes parsing deterministic. func generateParaphrase(hc *http.Client, ollamaURL, model, query string) (string, error) { system := `You rephrase staffing queries while preserving intent. Output JSON only: {"paraphrase": ""}. Rules: - Keep the same role, certifications, geography, and constraints. - Vary the wording (synonyms, reordered clauses, different sentence shape). - Do NOT add or remove requirements. - Do NOT explain — just emit the JSON.` body := map[string]any{ "model": model, "stream": false, "format": "json", "messages": []map[string]string{ {"role": "system", "content": system}, {"role": "user", "content": query}, }, "options": map[string]any{"temperature": 0.5}, } bs, _ := json.Marshal(body) req, _ := http.NewRequest("POST", ollamaURL+"/api/chat", bytes.NewReader(bs)) req.Header.Set("Content-Type", "application/json") resp, err := hc.Do(req) if err != nil { return "", err } defer resp.Body.Close() if resp.StatusCode/100 != 2 { return "", fmt.Errorf("ollama chat: HTTP %d", resp.StatusCode) } rb, _ := io.ReadAll(resp.Body) var ollamaResp struct { Message struct { Content string `json:"content"` } `json:"message"` } if err := json.Unmarshal(rb, &ollamaResp); err != nil { return "", fmt.Errorf("decode ollama envelope: %w", err) } var out struct { Paraphrase string `json:"paraphrase"` } if err := json.Unmarshal([]byte(ollamaResp.Message.Content), &out); err != nil { return "", fmt.Errorf("decode paraphrase JSON: %w (content=%q)", err, ollamaResp.Message.Content) } if strings.TrimSpace(out.Paraphrase) == "" { return "", fmt.Errorf("empty paraphrase (content=%q)", ollamaResp.Message.Content) } return out.Paraphrase, nil } func loadQueries(path string) ([]string, error) { bs, err := os.ReadFile(path) if err != nil { return nil, err } var out []string for _, line := range strings.Split(string(bs), "\n") { s := strings.TrimSpace(line) if s == "" || strings.HasPrefix(s, "#") { continue } out = append(out, s) } return out, nil } func matrixSearch(hc *http.Client, gw, query string, corpora []string, k int, usePlaybook bool) (*matrixResp, error) { body := map[string]any{ "query_text": query, "corpora": corpora, "k": k, "per_corpus_k": k, "use_playbook": usePlaybook, } // Role extraction (real_001 + real_003 cross-role bleed fixes). // Goes through globalRoleExtractor so shorthand-style queries get // LLM fallback when -llm-role-extract is on. Empty result leaves // the gate disabled — harness preserves current behavior on // truly-unparseable shapes. if role := globalRoleExtractor.extract(query); role != "" { body["query_role"] = role } bs, _ := json.Marshal(body) req, _ := http.NewRequest("POST", gw+"/v1/matrix/search", bytes.NewReader(bs)) req.Header.Set("Content-Type", "application/json") resp, err := hc.Do(req) if err != nil { return nil, err } defer resp.Body.Close() rb, _ := io.ReadAll(resp.Body) if resp.StatusCode/100 != 2 { return nil, fmt.Errorf("status %d: %s", resp.StatusCode, string(rb)) } var out matrixResp if err := json.Unmarshal(rb, &out); err != nil { return nil, fmt.Errorf("unmarshal: %w (body=%s)", err, abbrev(string(rb), 200)) } return &out, nil } func playbookRecord(hc *http.Client, gw, query, answerID, answerCorpus string, score float64) error { body := map[string]any{ "query_text": query, "answer_id": answerID, "answer_corpus": answerCorpus, "score": score, "tags": []string{"reality-test", "playbook-lift-001"}, } // Same extractor as matrixSearch — shared cache, same LLM fallback // rules. Recorded role lets retrieve-time gate fire on cross-role // queries (real_001 + real_003 fixes). if role := globalRoleExtractor.extract(query); role != "" { body["role"] = role } bs, _ := json.Marshal(body) req, _ := http.NewRequest("POST", gw+"/v1/matrix/playbooks/record", bytes.NewReader(bs)) req.Header.Set("Content-Type", "application/json") resp, err := hc.Do(req) if err != nil { return err } defer resp.Body.Close() if resp.StatusCode/100 != 2 { rb, _ := io.ReadAll(resp.Body) return fmt.Errorf("status %d: %s", resp.StatusCode, string(rb)) } return nil } // judgeRate calls Ollama's /api/chat directly and asks for a 1-5 rating // of the result against the query. Returns 0 on any failure (treated as // "couldn't judge, exclude from best-of consideration"). func judgeRate(hc *http.Client, ollamaURL, model, query string, r matrixResult) int { system := `You rate retrieval results for a staffing co-pilot. Rate the result 1-5 against the query: 5 = perfect match (this person/job IS what was asked for) 4 = strong match (right field, right level, minor mismatches) 3 = adjacent match (related field or partial overlap) 2 = weak/tangential match 1 = irrelevant Output JSON only: {"rating": N, "reason": ""}.` user := fmt.Sprintf("Query: %q\n\nResult corpus: %s\nResult ID: %s\nResult metadata:\n%s", query, r.Corpus, r.ID, string(r.Metadata)) body := map[string]any{ "model": model, "stream": false, "format": "json", "messages": []map[string]string{ {"role": "system", "content": system}, {"role": "user", "content": user}, }, "options": map[string]any{"temperature": 0}, } bs, _ := json.Marshal(body) req, _ := http.NewRequest("POST", ollamaURL+"/api/chat", bytes.NewReader(bs)) req.Header.Set("Content-Type", "application/json") resp, err := hc.Do(req) if err != nil { return 0 } defer resp.Body.Close() if resp.StatusCode/100 != 2 { return 0 } rb, _ := io.ReadAll(resp.Body) var ollamaResp struct { Message struct { Content string `json:"content"` } `json:"message"` } if err := json.Unmarshal(rb, &ollamaResp); err != nil { return 0 } var v judgeVerdict if err := json.Unmarshal([]byte(ollamaResp.Message.Content), &v); err != nil { return 0 } if v.Rating < 1 || v.Rating > 5 { return 0 } return v.Rating } func writeJSON(path string, runs []queryRun, sum summary) error { if err := os.MkdirAll(filepath_dir(path), 0o755); err != nil { return err } out := struct { Summary summary `json:"summary"` Runs []queryRun `json:"runs"` }{Summary: sum, Runs: runs} bs, err := json.MarshalIndent(out, "", " ") if err != nil { return err } return os.WriteFile(path, bs, 0o644) } func filepath_dir(p string) string { if i := strings.LastIndex(p, "/"); i >= 0 { return p[:i] } return "." } func abbrev(s string, n int) string { if len(s) <= n { return s } return s[:n] + "…" } func appendNote(existing, add string) string { if existing == "" { return add } return existing + "; " + add } // Suppress unused-import warning when sort isn't used in a future // refactor; harmless for now. var _ = sort.Slice // globalRoleExtractor is set in main() and read by matrixSearch + // playbookRecord. nil-safe: a nil receiver on roleExtractor.extract // degrades to regex-only behavior, so the field is checked once at // startup and never re-validated per call. var globalRoleExtractor *roleExtractor // roleExtractor combines the fast-path regex with an optional LLM // fallback so callers can pay for shorthand coverage only when they // need it. Per real_003_findings.md: shorthand-style queries // ("N {role} {city} {state} {at} {client}") have no separator // between role and city, so a regex can't reliably extract — but a // small LLM with a tight format=json prompt can. Cost is ~1-3s per // extraction on local qwen2.5; cached per-process so paraphrase // passes don't pay twice for the same query. // // Empty cache + off-by-default LLM = the existing real_003b behavior // is unchanged unless callers explicitly enable LLM mode. type roleExtractor struct { hc *http.Client ollamaURL string model string // "" disables LLM fallback cache map[string]string } // extract returns the role for a query. Tries regex first (fast, // deterministic); if regex misses and LLM is configured, calls Ollama; // caches the result either way. Returns "" when both miss — the // caller's gate stays disabled, preserving current behavior on // truly-unparseable shapes. func (r *roleExtractor) extract(query string) string { if r == nil { return extractRoleFromNeed(query) } if cached, ok := r.cache[query]; ok { return cached } role := extractRoleFromNeed(query) if role == "" && r.model != "" { if v, err := extractRoleViaLLM(r.hc, r.ollamaURL, r.model, query); err == nil { role = v } else { log.Printf("[lift] llm-role-extract failed (%v) — falling back to empty for %q", err, abbrev(query, 60)) } } if r.cache == nil { r.cache = make(map[string]string) } r.cache[query] = role return role } // extractRoleViaLLM asks the Ollama-shape /api/chat to identify the // staffing role in a free-form query. Tight schema + format=json so // parsing is deterministic. Empty role string is a valid response — // the model may decline to extract when the query has no clean role // (e.g. lift-suite multi-constraint queries). func extractRoleViaLLM(hc *http.Client, ollamaURL, model, query string) (string, error) { system := `You are a staffing-domain role extractor. Output JSON only: {"role": ""}. Rules: - Identify the staffing role (job title) the query is asking for. - Return only the role noun phrase — e.g. "Forklift Operator", "Pickers". - Preserve plurality from the query (don't singularize). - Strip qualifiers ("OSHA-30 certified") — we want the bare role. - If the query has no clean staffing role, return "" (empty string). - Do NOT explain. Just emit the JSON.` body, _ := json.Marshal(map[string]any{ "model": model, "stream": false, "format": "json", "messages": []map[string]string{ {"role": "system", "content": system}, {"role": "user", "content": query}, }, "options": map[string]any{"temperature": 0.0}, }) req, _ := http.NewRequest("POST", ollamaURL+"/api/chat", bytes.NewReader(body)) req.Header.Set("Content-Type", "application/json") resp, err := hc.Do(req) if err != nil { return "", err } defer resp.Body.Close() if resp.StatusCode/100 != 2 { return "", fmt.Errorf("ollama chat: HTTP %d", resp.StatusCode) } rb, _ := io.ReadAll(resp.Body) var ollamaResp struct { Message struct { Content string `json:"content"` } `json:"message"` } if err := json.Unmarshal(rb, &ollamaResp); err != nil { return "", err } var out struct { Role string `json:"role"` } if err := json.Unmarshal([]byte(ollamaResp.Message.Content), &out); err != nil { return "", fmt.Errorf("decode role: %w (content=%q)", err, ollamaResp.Message.Content) } return strings.TrimSpace(out.Role), nil } // extractRoleFromNeed pulls the role out of staffing-shape queries. // Returns "" for any query that doesn't match a known anchor pattern // (free-form lift-suite queries + shorthand-style fall back to empty, // leaving the cross-role gate disabled). // // Patterns covered (in priority order): // need: "Need N {role}{s} in {city} ..." // client_first: "{client} needs N {role}{s} in {city} ..." // looking: "Looking for N {role}{s} at {client} in {city} ..." // // Pattern explicitly NOT covered: // shorthand: "N {role}{s} {city} {state} {at} {client}" // Because there's no separator between role and city in shorthand // ("Forklift Operator Detroit" is shape-indistinguishable from // "Forklift" + "Operator Detroit"), a regex can't reliably extract // role here. real_003 confirmed shorthand-vs-shorthand cross-role // bleed: a CNC Operator shorthand recording leaked w-2404 onto a // Forklift Operator shorthand query within the same Beacon Freight // Detroit cluster. Closing that requires either an LLM extractor at // record+query time or a known-cities lookup table. // // Lives here (not in internal/matrix) because role extraction from // free-form text is a caller concern; matrix only consumes the // already-resolved Role string. A future LLM-based extractor would // replace this function without changing matrix's gate logic. func extractRoleFromNeed(query string) string { for _, re := range roleExtractRegexes { if m := re.FindStringSubmatch(query); len(m) >= 2 { return strings.TrimSpace(m[1]) } } return "" } // roleExtractRegexes is ordered: more-specific anchors first so a // "Looking for ..." query doesn't accidentally land in the "Need" // pattern (impossible given the prefix, but guards against future // pattern additions). Compiled once at package init via MustCompile. var roleExtractRegexes = []*regexp.Regexp{ // "Need N {role} in ..." — the original real_001 form. regexp.MustCompile(`(?i)^Need\s+\d+\s+(.+?)\s+in\s+`), // "Looking for N {role} at ..." — the looking style. Anchor on // "at" because the role is followed by client (preceded by "at"), // not by city directly. regexp.MustCompile(`(?i)^Looking\s+for\s+\d+\s+(.+?)\s+at\s+`), // "{client} needs N {role} in ..." — the client_first style. // Greedy on the client side via .+?, then "needs", then count, // then role, then "in". regexp.MustCompile(`(?i)^.+?\s+needs\s+\d+\s+(.+?)\s+in\s+`), }