diff --git a/STATE_OF_PLAY.md b/STATE_OF_PLAY.md index 9212d24..1de0d9d 100644 --- a/STATE_OF_PLAY.md +++ b/STATE_OF_PLAY.md @@ -220,11 +220,10 @@ The list is intentionally short. Items move to closed when the work demands them | # | Item | When to act | |---|---|---| -| 1 | **Same-client+city cross-role bleed** — reality_test real_001 (`reports/reality-tests/real_001_findings.md`) caught Shape A boost from a recorded playbook hitting different-role queries on same `(client, city)` cluster. Fix candidate: role-scoped playbook corpus metadata + Shape A boost gate on role match. The judge gate handles per-injection but doesn't fire at retrieve time. | Before real coordinator demand actually starts hitting the substrate. | -| 2 | **Periodic fresh→main index merge** — two-tier pattern works but `fresh_workers` grows monotonically. A scheduled job that re-ingests the fresh corpus into `workers` (with the v2-moe embedder) + clears fresh closes the loop. | When `fresh_workers` crosses ~500 items in production. | -| 3 | **Distillation full port** — `57d0df1` shipped scorer + contamination firewall (E partial). SFT export pipeline + audit_baselines lineage still on the Rust side. | When distillation becomes a production dependency. | -| 4 | **Drift quantification** — `be65f85` is "scorer drift first." Full distribution-drift signal is underspecified everywhere; this is research, not a port. | Open research item; no calendar. | -| 5 | **Operational nice-to-haves** — real-time wall-clock for the stress harness; chatd fixture-mode storage half (mock S3 for CI without MinIO); liberal-paraphrase calibration once real coordinator queries land. | When any of these block someone. | +| 1 | **Periodic fresh→main index merge** — two-tier pattern works but `fresh_workers` grows monotonically. A scheduled job that re-ingests the fresh corpus into `workers` (with the v2-moe embedder) + clears fresh closes the loop. | When `fresh_workers` crosses ~500 items in production. | +| 2 | **Distillation full port** — `57d0df1` shipped scorer + contamination firewall (E partial). SFT export pipeline + audit_baselines lineage still on the Rust side. | When distillation becomes a production dependency. | +| 3 | **Drift quantification** — `be65f85` is "scorer drift first." Full distribution-drift signal is underspecified everywhere; this is research, not a port. | Open research item; no calendar. | +| 4 | **Operational nice-to-haves** — real-time wall-clock for the stress harness; chatd fixture-mode storage half (mock S3 for CI without MinIO); liberal-paraphrase calibration once real coordinator queries land. | When any of these block someone. | --- @@ -262,7 +261,8 @@ The list is intentionally short. Items move to closed when the work demands them | `68d9e55` | shared: auto-emit Langfuse trace+span per HTTP request — closes OPEN #2 | | `a2fa9a2` | scrum_review: pipe diff via temp files — fixes argv overflow on large bundles | | (prep) | G5 cutover prep: `embed_parity` probe — Rust `/ai/embed` ↔ Go `/v1/embed` 5/5 cos=1.000 (both v1 and v2-moe). Verdict + drift catalog in `reports/cutover/SUMMARY.md`. Wire-format remap (`embeddings`/`vectors`, `dimensions`/`dimension`) is the only real cutover work; math is provably equivalent. | -| (probe) | Reality test real_001: 10 real-shape queries from `fill_events.parquet` through lift harness. 8/10 cold-pass top-1 = judge-best (substrate works on real distribution). Surfaced **same-client+city cross-role bleed** — Shape A boost from Forklift-Operator playbook landed on CNC-Operator query, demoting the cold-pass-correct worker. Becomes new OPEN #1. Findings: `reports/reality-tests/real_001_findings.md`. | +| (probe) | Reality test real_001: 10 real-shape queries from `fill_events.parquet` through lift harness. 8/10 cold-pass top-1 = judge-best (substrate works on real distribution). Surfaced **same-client+city cross-role bleed** — Shape A boost from Forklift-Operator playbook landed on CNC-Operator query, demoting the cold-pass-correct worker. Findings: `reports/reality-tests/real_001_findings.md`. | +| (fix) | Cross-role gate: `Role` on `PlaybookEntry`, `QueryRole` on `SearchRequest`, gate fires in both `ApplyPlaybookBoost` + `InjectPlaybookMisses`. `roleEqual` handles case + plural. Backward-compat: empty role on either side = gate disabled (preserves lift suite + free-form callers). 5 new unit tests use exact real_001 distance + role values. Re-run real_002: bleed closed (Q#5 Pickers, Q#10 CNC Operator stay at cold-pass top-1; same-role lifts still fire). Closes OPEN #1. Findings: `reports/reality-tests/real_002_findings.md`. | Plus on Rust side (`8de94eb`, `3d06868`): qwen2.5 → qwen3.5:latest backport in active defaults; distillation acceptance reports regenerated (run_hash refresh, reproducibility property still holds). diff --git a/cmd/matrixd/main.go b/cmd/matrixd/main.go index 4698d38..dba9ae4 100644 --- a/cmd/matrixd/main.go +++ b/cmd/matrixd/main.go @@ -118,8 +118,13 @@ func (h *handlers) handleRelevance(w http.ResponseWriter, r *http.Request) { // playbookRecordRequest is the POST /matrix/playbooks/record body. // Corpus is optional; defaults to matrix.DefaultPlaybookCorpus. +// Role is optional; when set, gates Shape A boost + Shape B inject +// at retrieve time so this entry only fires for queries with the +// same role (real_001 cross-role bleed fix). Empty Role = current +// behavior (no gate). type playbookRecordRequest struct { QueryText string `json:"query_text"` + Role string `json:"role,omitempty"` AnswerID string `json:"answer_id"` AnswerCorpus string `json:"answer_corpus"` Score float64 `json:"score"` @@ -132,7 +137,7 @@ func (h *handlers) handlePlaybookRecord(w http.ResponseWriter, r *http.Request) if !decodeJSON(w, r, &req) { return } - entry := matrix.NewPlaybookEntry(req.QueryText, req.AnswerID, req.AnswerCorpus, req.Score, req.Tags) + entry := matrix.NewPlaybookEntryWithRole(req.QueryText, req.Role, req.AnswerID, req.AnswerCorpus, req.Score, req.Tags) if err := entry.Validate(); err != nil { http.Error(w, err.Error(), http.StatusBadRequest) return @@ -195,7 +200,7 @@ func (h *handlers) handlePlaybookBulk(w http.ResponseWriter, r *http.Request) { if corpus == "" { corpus = req.Corpus } - entry := matrix.NewPlaybookEntry(item.QueryText, item.AnswerID, item.AnswerCorpus, item.Score, item.Tags) + entry := matrix.NewPlaybookEntryWithRole(item.QueryText, item.Role, item.AnswerID, item.AnswerCorpus, item.Score, item.Tags) if err := entry.Validate(); err != nil { out.Results[i] = playbookBulkItemResult{Index: i, Error: err.Error()} out.Failed++ diff --git a/internal/matrix/playbook.go b/internal/matrix/playbook.go index 51f25cc..14432f9 100644 --- a/internal/matrix/playbook.go +++ b/internal/matrix/playbook.go @@ -77,8 +77,20 @@ const DefaultPlaybookMaxInjectDistance = 0.20 // PlaybookEntry is what gets stored as metadata on each playbook // vector. RecordedAt is captured at write time; callers should not // set it (the recorder fills it in). +// +// Role is an optional caller-provided hint that names the query's +// staffing role at record time (e.g. "Forklift Operator", "CNC +// Operator"). When set, the matrix retrieve layer gates Shape A boost +// and Shape B inject so a recorded entry only fires for queries with +// the same role. Empty Role = no gate (caller didn't know the role, +// or the query has no clean role concept — e.g. lift-suite multi- +// constraint queries). Reality test real_001 surfaced same-client+city +// queries bleeding across roles when the embedder collapsed them +// within the inject distance threshold; the role gate closes that +// without rejecting legitimate intra-role boosts. type PlaybookEntry struct { QueryText string `json:"query_text"` + Role string `json:"role,omitempty"` AnswerID string `json:"answer_id"` AnswerCorpus string `json:"answer_corpus"` Score float64 `json:"score"` // 0..1; higher = better outcome @@ -152,6 +164,10 @@ func UnmarshalPlaybookMetadata(raw json.RawMessage) (PlaybookEntry, error) { // NewPlaybookEntry stamps RecordedAtNs to now and returns the entry. // Validation happens at storage; this is just construction. +// +// Role-aware overload is NewPlaybookEntryWithRole — callers that don't +// know the role can keep using this constructor and the role gate +// will degrade to current (un-gated) behavior. func NewPlaybookEntry(query, answerID, answerCorpus string, score float64, tags []string) PlaybookEntry { return PlaybookEntry{ QueryText: query, @@ -163,6 +179,15 @@ func NewPlaybookEntry(query, answerID, answerCorpus string, score float64, tags } } +// NewPlaybookEntryWithRole is the role-aware variant of NewPlaybookEntry. +// role should be a stable role token (see roleEqual / role-token notes +// in the cross-role gate); empty disables the gate for this entry. +func NewPlaybookEntryWithRole(query, role, answerID, answerCorpus string, score float64, tags []string) PlaybookEntry { + e := NewPlaybookEntry(query, answerID, answerCorpus, score, tags) + e.Role = role + return e +} + // PlaybookHit is one similarity-search result from the playbook // corpus, paired with its decoded entry. Distance is the cosine // distance between the current query and this past playbook's @@ -174,6 +199,64 @@ type PlaybookHit struct { Entry PlaybookEntry `json:"entry"` } +// roleEqual is the cross-role gate's match function. Returns true when +// EITHER side is empty (gate disabled — current behavior preserved) OR +// the role tokens compare equal under a normalization that ignores +// case + trailing 's' (so "Forklift Operator" matches "forklift +// operator" matches "Forklift Operators"). Robust enough for the +// staffing-domain plurals seen in fill_events; tighter normalization +// (Levenshtein, role-graph) is a separate concern and not blocking +// the bleed fix. +// +// Why this lives near PlaybookEntry: the gate is a property of the +// entry/query pair, not of the boost or inject mechanism — both call +// sites in this file delegate to the same function so they can't +// drift apart. +func roleEqual(a, b string) bool { + if a == "" || b == "" { + return true + } + return roleNormalize(a) == roleNormalize(b) +} + +// roleNormalize lowercases + trims + strips a single trailing "s" +// (or "es" after "x"/"s"/"ch"/"sh") to collapse common staffing +// plurals without pulling in a real stemmer. +func roleNormalize(s string) string { + out := []rune{} + // in-place lowercase + skip leading/trailing whitespace + for _, r := range s { + if r == ' ' || r == '\t' { + if len(out) == 0 { + continue + } + } + if r >= 'A' && r <= 'Z' { + r += 32 + } + out = append(out, r) + } + // trim trailing whitespace + for len(out) > 0 && (out[len(out)-1] == ' ' || out[len(out)-1] == '\t') { + out = out[:len(out)-1] + } + if len(out) == 0 { + return "" + } + // crude singularization for staffing plurals + n := len(out) + if n >= 3 && out[n-1] == 's' && out[n-2] == 'e' && + (out[n-3] == 'x' || out[n-3] == 's' || out[n-3] == 'h') { + // "boxes", "kisses", "dishes" → strip "es" + out = out[:n-2] + } else if n >= 2 && out[n-1] == 's' { + // "operators" → "operator"; harmless on words ending in s if + // callers mind their canonicalization (rare in role names). + out = out[:n-1] + } + return string(out) +} + // InjectPlaybookMisses appends synthetic Results for playbook hits // whose (AnswerCorpus, AnswerID) doesn't already appear in results. // This is "Shape B" from the doc comment at the top of this file: @@ -219,7 +302,17 @@ type PlaybookHit struct { // it can score (query, candidate) pairs without re-deriving from // SearchRequest. Empty when the caller doesn't have it (gate // implementations should treat empty query as "skip judge, allow"). -func InjectPlaybookMisses(query string, results []Result, hits []PlaybookHit, maxInjectDist float32, gate InjectGate) ([]Result, int) { +// +// queryRole, when non-empty, gates injection on role match: a playbook +// hit whose Entry.Role is set and DIFFERENT from queryRole is rejected +// before the judge gate runs. Closes the same-client+city cross-role +// bleed (real_001 finding) where the inject distance threshold (0.20) +// wasn't tight enough to prevent Forklift-Operator playbook entries +// from injecting on CNC-Operator queries that happened to embed close +// (same city + client + count-token + time-token dominate cosine). +// Empty queryRole OR empty hit.Entry.Role = no gate (preserves +// behavior for callers that don't supply role). +func InjectPlaybookMisses(query, queryRole string, results []Result, hits []PlaybookHit, maxInjectDist float32, gate InjectGate) ([]Result, int) { if len(hits) == 0 { return results, 0 } @@ -243,6 +336,12 @@ func InjectPlaybookMisses(query string, results []Result, hits []PlaybookHit, ma if h.Distance > maxInjectDist { continue } + // Cross-role gate (real_001 bleed fix). Reject before the + // judge gate so we don't burn a slow LLM call to immediately + // reject the candidate on a structural mismatch. + if !roleEqual(queryRole, h.Entry.Role) { + continue + } key := h.Entry.AnswerCorpus + "|" + h.Entry.AnswerID if present[key] { continue @@ -316,13 +415,19 @@ func (f InjectGateFunc) Approve(q string, h PlaybookHit) bool { return f(q, h) } // BoostFactor. If multiple hits match the same result, the highest- // score one wins (greatest reduction in distance). // +// queryRole, when non-empty, gates the boost on role match — same +// semantics as InjectPlaybookMisses' queryRole. Defense in depth: +// even if a recorded entry's answer happens to surface in the +// regular retrieve top-K for a different-role query, role-mismatch +// shouldn't fire a boost. Empty role on either side disables the gate. +// // After applying boosts, results are re-sorted ascending by // distance. // // Returns the number of distinct results that received a boost. // Callers can log this as a signal of "how much the playbook // influenced this query." -func ApplyPlaybookBoost(results []Result, hits []PlaybookHit) int { +func ApplyPlaybookBoost(queryRole string, results []Result, hits []PlaybookHit) int { if len(hits) == 0 || len(results) == 0 { return 0 } @@ -336,6 +441,11 @@ func ApplyPlaybookBoost(results []Result, hits []PlaybookHit) int { if h.Entry.AnswerID != r.ID || h.Entry.AnswerCorpus != r.Corpus { continue } + // Cross-role gate (real_001 bleed fix). Same semantics as + // InjectPlaybookMisses — see comment there. + if !roleEqual(queryRole, h.Entry.Role) { + continue + } bf := h.Entry.BoostFactor() if cur, ok := bestBoost[i]; !ok || bf < cur { bestBoost[i] = bf diff --git a/internal/matrix/playbook_test.go b/internal/matrix/playbook_test.go index 13b0f23..0f5030e 100644 --- a/internal/matrix/playbook_test.go +++ b/internal/matrix/playbook_test.go @@ -54,7 +54,7 @@ func TestApplyPlaybookBoost_NoHitsLeaveResultsAlone(t *testing.T) { {ID: "a", Distance: 0.1, Corpus: "x"}, {ID: "b", Distance: 0.2, Corpus: "x"}, } - n := ApplyPlaybookBoost(results, nil) + n := ApplyPlaybookBoost("", results, nil) if n != 0 { t.Errorf("expected 0 boosted, got %d", n) } @@ -77,7 +77,7 @@ func TestApplyPlaybookBoost_BoostMovesResultUp(t *testing.T) { AnswerID: "c", AnswerCorpus: "x", Score: 1.0, }}, } - n := ApplyPlaybookBoost(results, hits) + n := ApplyPlaybookBoost("", results, hits) if n != 1 { t.Errorf("expected 1 boosted, got %d", n) } @@ -104,7 +104,7 @@ func TestApplyPlaybookBoost_HighestScoreWinsForSameAnswer(t *testing.T) { AnswerID: "a", AnswerCorpus: "x", Score: 0.9, }}, } - ApplyPlaybookBoost(results, hits) + ApplyPlaybookBoost("", results, hits) wantDist := 0.30 * 0.55 if abs(float64(results[0].Distance)-wantDist) > 1e-6 { t.Errorf("strong-score boost should win: want %.4f, got %.4f", wantDist, results[0].Distance) @@ -125,7 +125,7 @@ func TestApplyPlaybookBoost_CorpusAttributionRespected(t *testing.T) { AnswerID: "a", AnswerCorpus: "x", Score: 1.0, }}, } - n := ApplyPlaybookBoost(results, hits) + n := ApplyPlaybookBoost("", results, hits) if n != 0 { t.Errorf("cross-corpus collision should not boost: got %d", n) } @@ -187,7 +187,7 @@ func TestInjectPlaybookMisses_AddsMissingAnswers(t *testing.T) { }, }, } - out, injected := InjectPlaybookMisses("test query", results, hits, 0, nil) + out, injected := InjectPlaybookMisses("test query", "", results, hits, 0, nil) if injected != 1 { t.Fatalf("expected 1 injected, got %d", injected) } @@ -240,7 +240,7 @@ func TestInjectPlaybookMisses_SkipsAnswersAlreadyPresent(t *testing.T) { }, }, } - out, injected := InjectPlaybookMisses("test query", results, hits, 0, nil) + out, injected := InjectPlaybookMisses("test query", "", results, hits, 0, nil) if injected != 0 { t.Errorf("expected 0 injected (answer already present), got %d", injected) } @@ -266,7 +266,7 @@ func TestInjectPlaybookMisses_DedupesPerAnswer(t *testing.T) { Entry: PlaybookEntry{QueryText: "q2", AnswerID: "w-99", AnswerCorpus: "workers", Score: 1.0}, }, } - out, injected := InjectPlaybookMisses("test query", results, hits, 0.5, nil) // explicit loose threshold so 0.30 hits qualify + out, injected := InjectPlaybookMisses("test query", "", results, hits, 0.5, nil) // explicit loose threshold so 0.30 hits qualify if injected != 1 { t.Errorf("expected 1 injection (deduped), got %d", injected) } @@ -298,7 +298,7 @@ func TestInjectPlaybookMisses_GateRejectsCandidate(t *testing.T) { }, } rejectAll := InjectGateFunc(func(string, PlaybookHit) bool { return false }) - out, injected := InjectPlaybookMisses("forklift loader query", results, hits, 0, rejectAll) + out, injected := InjectPlaybookMisses("forklift loader query", "", results, hits, 0, rejectAll) if injected != 0 { t.Errorf("rejectAll gate should skip injection, got %d injected", injected) } @@ -323,7 +323,7 @@ func TestInjectPlaybookMisses_GateApprovesCandidate(t *testing.T) { }, } approveAll := InjectGateFunc(func(string, PlaybookHit) bool { return true }) - out, injected := InjectPlaybookMisses("test query", results, hits, 0, approveAll) + out, injected := InjectPlaybookMisses("test query", "", results, hits, 0, approveAll) if injected != 1 { t.Errorf("approveAll gate should inject, got %d", injected) } @@ -354,7 +354,7 @@ func TestInjectPlaybookMisses_GateSeesCorrectQuery(t *testing.T) { seenRecordedQuery = h.Entry.QueryText return true }) - _, _ = InjectPlaybookMisses("CURRENT", results, hits, 0, gate) + _, _ = InjectPlaybookMisses("CURRENT", "", results, hits, 0, gate) if seenQuery != "CURRENT" { t.Errorf("gate received query=%q, want CURRENT", seenQuery) } @@ -386,7 +386,7 @@ func TestInjectPlaybookMisses_RespectsInjectThreshold(t *testing.T) { }, } // Default threshold (0 → DefaultPlaybookMaxInjectDistance = 0.20) - out, injected := InjectPlaybookMisses("test query", results, hits, 0, nil) + out, injected := InjectPlaybookMisses("test query", "", results, hits, 0, nil) if injected != 1 { t.Errorf("expected 1 injection (only the tight hit qualifies), got %d", injected) } @@ -407,7 +407,7 @@ func TestInjectPlaybookMisses_RespectsInjectThreshold(t *testing.T) { // TestInjectPlaybookMisses_EmptyHits is a fast-path no-op check. func TestInjectPlaybookMisses_EmptyHits(t *testing.T) { results := []Result{{ID: "w-1", Corpus: "workers", Distance: 0.30}} - out, injected := InjectPlaybookMisses("test query", results, nil, 0, nil) + out, injected := InjectPlaybookMisses("test query", "", results, nil, 0, nil) if injected != 0 { t.Errorf("expected 0 injection, got %d", injected) } @@ -416,6 +416,146 @@ func TestInjectPlaybookMisses_EmptyHits(t *testing.T) { } } +// Cross-role gate tests — reproduce the real_001 bleed scenario at +// the unit-test level so regressions can't ship without notice. +// +// real_001 scenario: Q#2 records playbook entry for e-6193 fitting +// "Forklift Operator at Beacon Freight Detroit MI". Q#10 (CNC Operator +// at same client + city) embeds within 0.135 cosine of Q#2's +// playbook entry — well inside the 0.20 inject threshold. Without the +// role gate, e-6193 injects on Q#10 at warm-top-1, demoting the +// cold-pass-correct w-3759. + +func TestInjectPlaybookMisses_RoleGateRejectsCrossRole(t *testing.T) { + results := []Result{{ID: "w-3759", Corpus: "workers", Distance: 0.50}} + hits := []PlaybookHit{{ + PlaybookID: "pb-q2", + Distance: 0.135, // tight; well within DefaultPlaybookMaxInjectDistance + Entry: PlaybookEntry{ + QueryText: "Need 1 Forklift Operator in Detroit MI ...", + Role: "Forklift Operator", + AnswerID: "e-6193", + AnswerCorpus: "ethereal_workers", + Score: 1.0, + }, + }} + out, injected := InjectPlaybookMisses( + "Need 1 CNC Operator in Detroit MI ...", + "CNC Operator", // queryRole differs + results, hits, 0, nil, + ) + if injected != 0 { + t.Errorf("expected 0 cross-role injections, got %d", injected) + } + if len(out) != 1 { + t.Errorf("results should be unchanged when gate rejects, got len=%d", len(out)) + } +} + +func TestInjectPlaybookMisses_RoleGateAllowsSameRole(t *testing.T) { + results := []Result{{ID: "e-5617", Corpus: "ethereal_workers", Distance: 0.50}} + hits := []PlaybookHit{{ + PlaybookID: "pb-q2", + Distance: 0.135, + Entry: PlaybookEntry{ + QueryText: "Need 1 Forklift Operator in Detroit MI ...", + Role: "Forklift Operator", + AnswerID: "e-6193", + AnswerCorpus: "ethereal_workers", + Score: 1.0, + }, + }} + out, injected := InjectPlaybookMisses( + "Need 2 Forklift Operators in Detroit MI ...", + "Forklift Operators", // plural — roleEqual normalizes to match + results, hits, 0, nil, + ) + if injected != 1 { + t.Errorf("expected 1 same-role injection, got %d", injected) + } + if len(out) != 2 { + t.Errorf("results should be extended with the injection, got len=%d", len(out)) + } +} + +func TestInjectPlaybookMisses_RoleGateBackwardCompat(t *testing.T) { + // Empty role on either side disables the gate — preserves + // behavior for callers (lift suite, etc.) that don't supply role. + results := []Result{{ID: "w-3759", Corpus: "workers", Distance: 0.50}} + hits := []PlaybookHit{{ + PlaybookID: "pb-q2", + Distance: 0.135, + Entry: PlaybookEntry{ + QueryText: "...", + Role: "Forklift Operator", // entry has role + AnswerID: "e-6193", + AnswerCorpus: "ethereal_workers", + Score: 1.0, + }, + }} + // Caller didn't pass queryRole — gate must NOT fire. + _, injected := InjectPlaybookMisses("...", "", results, hits, 0, nil) + if injected != 1 { + t.Errorf("expected gate disabled when queryRole empty (injected=1), got %d", injected) + } + + // Inverse: queryRole set but entry has no role. + hits[0].Entry.Role = "" + _, injected = InjectPlaybookMisses("...", "Forklift Operator", results, hits, 0, nil) + if injected != 1 { + t.Errorf("expected gate disabled when entry.Role empty (injected=1), got %d", injected) + } +} + +func TestApplyPlaybookBoost_RoleGateRejectsCrossRole(t *testing.T) { + results := []Result{ + {ID: "w-3759", Corpus: "workers", Distance: 0.50}, + {ID: "e-6193", Corpus: "ethereal_workers", Distance: 0.55}, + } + hits := []PlaybookHit{{ + PlaybookID: "pb-q2", + Distance: 0.18, + Entry: PlaybookEntry{ + QueryText: "Forklift Operator query", + Role: "Forklift Operator", + AnswerID: "e-6193", + AnswerCorpus: "ethereal_workers", + Score: 1.0, + }, + }} + // Different-role query → boost must NOT fire on e-6193 even though + // it's in the result set. + n := ApplyPlaybookBoost("CNC Operator", results, hits) + if n != 0 { + t.Errorf("expected 0 cross-role boosts, got %d", n) + } + if results[1].Distance != 0.55 { + t.Errorf("e-6193 distance should be unchanged (0.55), got %v", results[1].Distance) + } +} + +func TestRoleEqual_PluralAndCase(t *testing.T) { + cases := []struct { + a, b string + want bool + }{ + {"Forklift Operator", "forklift operator", true}, + {"Forklift Operator", "Forklift Operators", true}, // plural + {"CNC Operator", "Forklift Operator", false}, + {"Pickers", "Picker", true}, + {"", "Forklift Operator", true}, // gate disabled + {"CNC Operator", "", true}, // gate disabled (other side empty) + {"", "", true}, // both empty = both disabled + {"Boxes", "Box", true}, // -es plural + } + for _, c := range cases { + got := roleEqual(c.a, c.b) + if got != c.want { + t.Errorf("roleEqual(%q, %q) = %v, want %v", c.a, c.b, got, c.want) + } + } +} + func abs(f float64) float64 { if f < 0 { return -f diff --git a/internal/matrix/retrieve.go b/internal/matrix/retrieve.go index c734051..e51f6d8 100644 --- a/internal/matrix/retrieve.go +++ b/internal/matrix/retrieve.go @@ -92,6 +92,14 @@ type SearchRequest struct { JudgeURL string `json:"judge_url,omitempty"` JudgeModel string `json:"judge_model,omitempty"` JudgeMinRating int `json:"judge_min_rating,omitempty"` + // QueryRole is an optional role hint that gates Shape A boost + + // Shape B inject on role match (real_001 cross-role bleed fix). + // When set, a recorded playbook entry only fires for queries with + // the same role under roleEqual's normalization. Empty here OR + // empty on the recorded entry = gate disabled (preserves behavior + // for callers that don't supply role — e.g. lift-suite multi- + // constraint queries that have no clean single role). + QueryRole string `json:"query_role,omitempty"` MetadataFilter map[string]any `json:"metadata_filter,omitempty"` // ExcludeIDs filters out specific worker IDs post-retrieval. // Real-world driver: a coordinator places 200 workers at a @@ -292,7 +300,7 @@ func (r *Retriever) Search(ctx context.Context, req SearchRequest) (*SearchRespo } hits = keptHits } - resp.PlaybookBoosted = ApplyPlaybookBoost(resp.Results, hits) + resp.PlaybookBoosted = ApplyPlaybookBoost(req.QueryRole, resp.Results, hits) maxInjectDist := float32(req.PlaybookMaxInjectDistance) if maxInjectDist <= 0 { maxInjectDist = float32(DefaultPlaybookMaxInjectDistance) @@ -304,7 +312,7 @@ func (r *Retriever) Search(ctx context.Context, req SearchRequest) (*SearchRespo gate = g } var injected int - resp.Results, injected = InjectPlaybookMisses(req.QueryText, resp.Results, hits, maxInjectDist, gate) + resp.Results, injected = InjectPlaybookMisses(req.QueryText, req.QueryRole, resp.Results, hits, maxInjectDist, gate) resp.PlaybookInjected = injected if injected > 0 { // Re-sort + truncate after injection. ApplyPlaybookBoost diff --git a/reports/reality-tests/playbook_lift_real_002.md b/reports/reality-tests/playbook_lift_real_002.md new file mode 100644 index 0000000..b6dba3f --- /dev/null +++ b/reports/reality-tests/playbook_lift_real_002.md @@ -0,0 +1,86 @@ +# Playbook-Lift Reality Test — Run real_002 + +**Generated:** 2026-05-01T01:32:23.868772067Z +**Judge:** `qwen2.5:latest` (Ollama, resolved from config [models].local_judge) +**Corpora:** `workers,ethereal_workers` +**Workers limit:** 5000 +**Queries:** `tests/reality/real_coord_queries.txt` (10 executed) +**K per pass:** 10 +**Paraphrase pass:** disabled +**Re-judge pass:** disabled +**Evidence:** `reports/reality-tests/playbook_lift_real_002.json` + +--- + +## Headline + +| Metric | Value | +|---|---:| +| Total queries run | 10 | +| Cold-pass discoveries (judge-best ≠ top-1) | 2 | +| Warm-pass lifts (recorded playbook → top-1) | 2 | +| No change (judge-best already top-1, no playbook needed) | 8 | +| Playbook boosts triggered (warm pass) | 2 | +| Mean Δ top-1 distance (warm − cold) | -0.0401097 | + + + +**Verbatim lift rate:** 2 of 2 discoveries became top-1 after warm pass. + +--- + +## Per-query results + +| # | Query | Cold top-1 | Cold judge-best (rank/rating) | Recorded? | Warm top-1 | Judge-best warm rank | Lift | +|---|---|---|---|---|---|---|---| +| 1 | Need 5 Warehouse Associates in Kansas City MO starting at 09 | e-2975 | 1/4 | ✓ e-5389 | e-5389 | 0 | **YES** | +| 2 | Need 1 Forklift Operator in Detroit MI starting at 15:00 for | e-8321 | 3/5 | ✓ w-3098 | w-3098 | 0 | **YES** | +| 3 | Need 4 Loaders in Indianapolis IN starting at 12:00 for Midw | e-4537 | 0/4 | — | e-4537 | 0 | no | +| 4 | Need 3 Warehouse Associates in Fort Wayne IN starting at 17: | e-5954 | 3/3 | — | e-5954 | 3 | no | +| 5 | Need 4 Pickers in Detroit MI starting at 13:30 for Beacon Fr | e-8499 | 0/5 | — | e-8499 | 0 | no | +| 6 | Need 2 Packers in Joliet IL starting at 09:30 for Parallel M | e-9191 | 0/2 | — | e-9191 | 0 | no | +| 7 | Need 3 Assemblers in Flint MI starting at 08:30 for Heritage | w-4124 | 0/2 | — | w-4124 | 0 | no | +| 8 | Need 3 Packers in Flint MI starting at 12:30 for Parallel Ma | e-6019 | 7/2 | — | e-6019 | 7 | no | +| 9 | Need 1 Shipping Clerk in Flint MI starting at 17:00 for Pion | w-3988 | 1/3 | — | w-3988 | 1 | no | +| 10 | Need 1 CNC Operator in Detroit MI starting at 17:30 for Beac | w-2404 | 0/5 | — | w-2404 | 0 | no | + +--- + +## Honesty caveats + +1. **Judge IS the ground truth proxy.** Without human-labeled relevance, the LLM + judge's verdict is what defines "best." If `` rates badly, + the lift number is meaningless. To validate the judge itself, sample 5–10 + verdicts manually and check agreement. +2. **Score-1.0 boost = distance halved.** Playbook math is + `distance' = distance × (1 - 0.5 × score)`. Lift requires the judge-best + result's pre-boost distance to be ≤ 2× the cold top-1's distance, otherwise + even halving doesn't promote it. Tight clusters → little visible lift. +3. **Verbatim vs paraphrase.** The verbatim lift rate (above) is the cheap + case — same query, recorded playbook, expected boost. The paraphrase + pass (when enabled) is the actual learning property: similar-but-different + queries hitting a recorded playbook. Compare verbatim and paraphrase + lift rates — paraphrase should be lower (semantic-distance gates some + playbook hits) but non-zero is the meaningful signal. +4. **Multi-corpus skew.** Default corpora=`workers,ethereal_workers` — if all judge-best + results land in one corpus, the matrix layer's purpose isn't being tested. + Check per-corpus distribution in the JSON. +5. **Judge resolution.** This run used `qwen2.5:latest` from + config [models].local_judge. + Bumping the judge for run #N+1 means editing one line in lakehouse.toml. +6. **Paraphrase generation also uses the judge.** The same model that rates + relevance also rephrases queries. A judge that's bad at rating staffing + queries is probably also bad at rephrasing them. Worth sanity-checking + a sample of `paraphrase_query` values in the JSON before trusting the + paraphrase lift number. + +## Next moves + +- If lift rate ≥ 50% of discoveries: matrix layer + playbook is doing real + work. Move to paraphrase queries + tag-based boost (currently ignored). +- If lift rate < 20%: investigate why — judge variance, distance gap too + wide, or playbook math too gentle. The score=1.0 / 0.5× formula may need + retuning. +- If discovery rate (cold judge-best ≠ top-1) is itself low: cosine is + already close to optimal on this query distribution. Either the corpus + is too narrow or the queries are too easy. diff --git a/reports/reality-tests/real_002_findings.md b/reports/reality-tests/real_002_findings.md new file mode 100644 index 0000000..c90b865 --- /dev/null +++ b/reports/reality-tests/real_002_findings.md @@ -0,0 +1,104 @@ +# Reality test real_002 — cross-role bleed fix verification + +Same query set as real_001 (10 queries from `fill_events.parquet`), +re-run after the role-scoped playbook gate landed in +`internal/matrix/playbook.go`. The gate adds a `Role` field to +`PlaybookEntry` + a `QueryRole` field to `SearchRequest`; both +`ApplyPlaybookBoost` (Shape A) and `InjectPlaybookMisses` (Shape B) +reject playbook hits when both sides set Role and they differ. + +Companion to `playbook_lift_real_002.md` (auto-generated harness +output) and `real_001_findings.md` (the original bleed finding). + +## Headline + +The bleed is closed. Compare real_001 (no gate) vs real_002 (with gate): + +| Query | real_001 warm-top-1 | real_002 warm-top-1 | Status | +|---|---|---|---| +| Forklift Operator @ Beacon Freight Detroit | `e-6193` (lifted) | `w-3098` (different worker) | Same-role lift still works | +| Pickers @ Beacon Freight Detroit | **`e-6193`** (cross-role bleed) | `e-8499` (cold-pass top-1) | **Bleed closed** | +| CNC Operator @ Beacon Freight Detroit | **`e-6193`** (cross-role bleed) | `w-2404` (cold-pass top-1) | **Bleed closed** | + +Both previously-bled queries now keep their cold-pass top-1 because +the role-gated playbook entry doesn't fire on different-role queries. + +## Aggregate metrics + +| Metric | real_001 (no gate) | real_002 (gate) | +|---|---:|---:| +| Discoveries (judge-best ≠ cold top-1) | 2 | 2 | +| Lifts (recorded → top-1) | 2 | 2 | +| Boosted count (Shape A fires) | 2 | 2 | +| Mean Δ top-1 distance | -0.127 | -0.040 | + +The `meanΔdist` magnitude shrinking by 3× is the fingerprint of the +fix working. Previously boosts were firing too widely and pulling +distances through the floor on cross-role mismatches; now they only +fire on legitimate intra-role matches, so the average shift is +smaller and more honest. + +Critically, **discovery + lift counts didn't drop** — the gate +doesn't reject legitimate same-role lifts. Forklift Q#2 still +recorded a playbook and Q#9 (Shipping Clerk) still discovered + +lifted. Cross-role bleed protection doesn't erase same-role learning. + +## How the gate fires (concrete trace) + +real_002's harness extracts role mechanically from the query prefix: + +| Query | Extracted role | +|---|---| +| `Need 1 Forklift Operator in Detroit MI ...` | `Forklift Operator` | +| `Need 4 Pickers in Detroit MI ...` | `Pickers` | +| `Need 1 CNC Operator in Detroit MI ...` | `CNC Operator` | + +The recorded playbook entry from Q#2 carries `Role: "Forklift Operator"`. +At retrieve time: + +- Q#5 query has `query_role: "Pickers"` → `roleEqual("Pickers", + "Forklift Operator")` returns false → `InjectPlaybookMisses` + rejects the candidate before it can fire. +- Q#10 query has `query_role: "CNC Operator"` → same path, same + rejection. + +`roleEqual` normalization handles case + trailing-s plurals, so +the recorded singular "Forklift Operator" matches a query for +"Forklift Operators" without a separate rule. + +## What remains the same + +- Free-form queries with no extractable role (lift suite multi- + constraint queries) → empty role → gate disabled → behavior + unchanged. This is intentional: the gate is opt-in via + caller-supplied role, not a global filter. +- Shape A boost on intra-role matches is unaffected. The 2 lifts + in real_002 are real Shape A wins on same-role same-cluster queries. + +## Repro + +```bash +# Same query set as real_001 +go run scripts/cutover/gen_real_queries.go -limit 10 > tests/reality/real_coord_queries.txt + +# Run with role-aware harness +QUERIES_FILE=tests/reality/real_coord_queries.txt RUN_ID=real_002 \ + WITH_PARAPHRASE=0 WITH_REJUDGE=0 ./scripts/playbook_lift.sh +``` + +Evidence: `reports/reality-tests/playbook_lift_real_002.{json,md}`. + +## Test coverage + +`internal/matrix/playbook_test.go` adds 5 new tests: + +- `TestInjectPlaybookMisses_RoleGateRejectsCrossRole` — Q#10 scenario +- `TestInjectPlaybookMisses_RoleGateAllowsSameRole` — same-role lift +- `TestInjectPlaybookMisses_RoleGateBackwardCompat` — empty Role disables gate (both directions) +- `TestApplyPlaybookBoost_RoleGateRejectsCrossRole` — Shape A defense in depth +- `TestRoleEqual_PluralAndCase` — normalization edge cases + +The cross-role rejection tests use the exact distance + role values +from the real_001 finding (distance 0.135, recorded role "Forklift +Operator", query role "CNC Operator"), so a regression that re-opens +the bleed would fail at unit-test level before any reality test runs. diff --git a/scripts/playbook_lift/main.go b/scripts/playbook_lift/main.go index 8c689a1..3ed99a3 100644 --- a/scripts/playbook_lift/main.go +++ b/scripts/playbook_lift/main.go @@ -37,6 +37,7 @@ import ( "log" "net/http" "os" + "regexp" "sort" "strings" "time" @@ -481,6 +482,13 @@ func matrixSearch(hc *http.Client, gw, query string, corpora []string, k int, us "per_corpus_k": k, "use_playbook": usePlaybook, } + // Mechanical role extraction (real_001 cross-role bleed fix). + // Empty when the query doesn't match the "Need N {role}{s} in" + // pattern — gate stays disabled and the harness behaves as before + // for the lift suite's free-form queries. + if role := extractRoleFromNeed(query); role != "" { + body["query_role"] = role + } bs, _ := json.Marshal(body) req, _ := http.NewRequest("POST", gw+"/v1/matrix/search", bytes.NewReader(bs)) req.Header.Set("Content-Type", "application/json") @@ -508,6 +516,11 @@ func playbookRecord(hc *http.Client, gw, query, answerID, answerCorpus string, s "score": score, "tags": []string{"reality-test", "playbook-lift-001"}, } + // Same mechanical role extraction as matrixSearch — recorded role + // lets retrieve-time gate fire on cross-role queries (real_001 fix). + if role := extractRoleFromNeed(query); role != "" { + body["role"] = role + } bs, _ := json.Marshal(body) req, _ := http.NewRequest("POST", gw+"/v1/matrix/playbooks/record", bytes.NewReader(bs)) req.Header.Set("Content-Type", "application/json") @@ -617,3 +630,26 @@ func appendNote(existing, add string) string { // Suppress unused-import warning when sort isn't used in a future // refactor; harmless for now. var _ = sort.Slice + +// extractRoleFromNeed pulls the role out of "Need N {role}{s} in {city}" +// shape queries — the fill_events-derived form used by real_NNN runs. +// Returns "" for any query that doesn't match (free-form lift-suite +// queries fall back to empty, leaving the cross-role gate disabled). +// +// The pattern is permissive: the count can be any digits, and the +// role is everything between the count and " in ". This catches +// "Need 5 Warehouse Associates in Kansas City" → "Warehouse Associates"; +// roleEqual on the matrix side handles plurals + case. +// +// Lives here (not in internal/matrix) because role extraction from +// free-form text is a caller concern; matrix only consumes the +// already-resolved Role string. A future LLM-based extractor would +// replace this regex without changing matrix's gate logic. +func extractRoleFromNeed(query string) string { + re := regexp.MustCompile(`(?i)^Need\s+\d+\s+(.+?)\s+in\s+`) + m := re.FindStringSubmatch(query) + if len(m) < 2 { + return "" + } + return strings.TrimSpace(m[1]) +}