root 0331288641 playbook_lift: LLM-based role extractor closes shorthand bleed (real_004)
real_003 left a known weak spot: shorthand-style queries
("{count} {role} {city} {state} ...") have no separator between
role and city, so a regex can't reliably extract the role — leaving
the cross-role gate disabled when both record AND query are shorthand.

This commit adds a roleExtractor with regex-first + LLM fallback:

- Regex first (fast, deterministic) — handles need + client_first +
  looking from real_003b. ~75% of styles, no LLM cost paid.
- LLM fallback when regex returns empty AND model is configured —
  Ollama-shape /api/chat with format=json, schema-tight prompt,
  temperature 0. ~1-3s on local qwen2.5.
- Per-process cache — paraphrase + rejudge passes reuse the same
  query 4× per run; cache prevents 4× LLM cost.
- Off-by-default — opt-in via -llm-role-extract flag (CLI) and
  LLM_ROLE_EXTRACT=1 env var (harness wrapper). real_003b shipping
  config unchanged unless explicitly enabled.

8 new tests in scripts/playbook_lift/main_test.go:
- TestRoleExtractor_RegexFirst: LLM not called when regex matches
- TestRoleExtractor_LLMFallback: shorthand goes to LLM
- TestRoleExtractor_LLMOffLeavesEmpty: opt-in default preserved
- TestRoleExtractor_Cache: 3 calls = 1 LLM hit
- TestRoleExtractor_NilSafe: nil receiver runs regex only
- TestExtractRoleViaLLM_HTTPError + _BadJSON: failure paths
- TestRoleExtractor_ClosesCrossRoleShorthandBleed: synthetic
  witness for the real_003 scenario — both record + query are
  shorthand, regex returns "" for both, LLM produces DIFFERENT
  role tokens for CNC vs Forklift, so matrix gate's cross-role
  rejection (locked separately in
  TestInjectPlaybookMisses_RoleGateRejectsCrossRole) fires
  correctly. This is the load-bearing verification.

Reality test real_004 ran the same 40-query stress as real_003 with
LLM extraction on. Cross-style same-role boosts fired correctly
across all 4 styles for Loaders + Packers + Shipping Clerk clusters
(including shorthand → other-style transfer). No cross-role bleed
observed. The reality test alone can't be a clean "with vs without"
comparison (HNSW build is non-deterministic across runs, and
real_004 stochastics didn't trigger a shorthand recording at all),
which is why the unit-test witness exists.

Production note (in real_004_findings.md): LLM extraction is for
reality-test coverage of arbitrary query shapes. Production should
extract role at INGEST time (when the inbox parser already runs an
LLM) and pass already-resolved role through requests — same shape
as multi_coord_stress's existing Demand{Role: ...} model. The hot
path should never need the harness extractor's per-query LLM cost.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 22:51:27 -05:00

284 lines
9.6 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package main
import (
"encoding/json"
"io"
"net/http"
"net/http/httptest"
"strings"
"sync/atomic"
"testing"
)
// TestExtractRoleFromNeed pins the output of extractRoleFromNeed for
// each query shape recorded in real_003_findings.md, so a regex change
// cannot silently drop a production-shape style. The `need`,
// `client_first` and `looking` styles must yield the role; `shorthand`
// (no separator between role and city) and the free-form lift-suite
// queries are expected to yield the empty string.
func TestExtractRoleFromNeed(t *testing.T) {
	type roleCase struct {
		label string // sub-test name
		input string // query handed to the extractor
		role  string // expected extraction ("" = nothing extracted)
	}
	table := []roleCase{
		{
			label: "need style — original real_001 form",
			input: "Need 1 Forklift Operator in Detroit MI starting at 15:00 for Beacon Freight",
			role:  "Forklift Operator",
		},
		{
			label: "need with deadline trailer",
			input: "Need 4 Pickers in Detroit MI starting at 13:30 for Beacon Freight, deadline 2026-05-28",
			role:  "Pickers",
		},
		{
			label: "client_first style — added in real_003b",
			input: "Beacon Freight needs 1 Forklift Operator in Detroit MI at 15:00",
			role:  "Forklift Operator",
		},
		{
			label: "client_first with multi-word client",
			input: "Parallel Machining needs 5 Warehouse Associates in Kansas City MO at 09:00",
			role:  "Warehouse Associates",
		},
		{
			label: "looking style — added in real_003b",
			input: "Looking for 1 Forklift Operator at Beacon Freight in Detroit MI for 15:00 shift",
			role:  "Forklift Operator",
		},
		{
			label: "looking with multi-word role + 4-digit count",
			input: "Looking for 1234 Senior Production Supervisors at Heritage Foods in Flint MI for 08:30 shift",
			role:  "Senior Production Supervisors",
		},
		{
			label: "shorthand — known limitation, returns empty",
			input: "1 Forklift Operator Detroit MI 15:00 Beacon Freight",
			role:  "",
		},
		{
			label: "shorthand multi-word city — also empty",
			input: "5 Warehouse Associates Kansas City MO 09:00 Parallel Machining",
			role:  "",
		},
		{
			label: "lift-suite multi-constraint — no clean role, returns empty",
			input: "Forklift operator with OSHA-30, warehouse experience, day shift availability",
			role:  "",
		},
		{
			label: "OOD honesty signal — lift-suite, returns empty",
			input: "Dental hygienist with three years experience, Indianapolis area",
			role:  "",
		},
	}

	for i := range table {
		tc := table[i]
		t.Run(tc.label, func(t *testing.T) {
			if got := extractRoleFromNeed(tc.input); got != tc.role {
				t.Errorf("extractRoleFromNeed(%q) = %q, want %q", tc.input, got, tc.role)
			}
		})
	}
}
// fakeOllama starts an httptest server that mimics the Ollama
// /api/chat endpoint and always answers with the given role. The
// returned counter is incremented (atomically) once per /api/chat
// request so caching tests can assert on how often the "LLM" was hit.
// Any other path gets a 404 and is not counted. The caller owns the
// server and must Close it.
func fakeOllama(role string) (*httptest.Server, *int64) {
	// Reply envelope: the model's output sits under message.content as
	// a JSON *string*, and that string is itself the {"role":"..."}
	// document the extractor parses.
	type chatMessage struct {
		Content string `json:"content"`
	}
	type chatReply struct {
		Message chatMessage `json:"message"`
	}

	// The reply depends only on role, so encode it once up front.
	rolePayload, _ := json.Marshal(map[string]string{"role": role})
	reply, _ := json.Marshal(chatReply{Message: chatMessage{Content: string(rolePayload)}})

	counter := new(int64)
	handler := func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/api/chat" {
			http.NotFound(w, r)
			return
		}
		atomic.AddInt64(counter, 1)
		// Drain the request body; its contents are irrelevant here.
		_, _ = io.ReadAll(r.Body)
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write(reply)
	}
	return httptest.NewServer(http.HandlerFunc(handler)), counter
}
// TestRoleExtractor_RegexFirst locks the priority order: when the
// regex matches, the LLM must NOT be called even if configured. Same
// behavior the real_003b shipping config relies on (don't pay LLM
// cost on need/client_first/looking queries).
func TestRoleExtractor_RegexFirst(t *testing.T) {
	srv, hits := fakeOllama("LLM-WONT-BE-CALLED")
	defer srv.Close()

	rx := &roleExtractor{
		hc:        srv.Client(),
		ollamaURL: srv.URL,
		model:     "test-model", // LLM is configured...
	}

	got := rx.extract("Need 5 Forklift Operators in Detroit MI starting at 09:00 for ACME")
	if got != "Forklift Operators" {
		t.Errorf("regex should win on Need-form query, got %q", got)
	}

	// fakeOllama bumps the counter with atomic.AddInt64 from the
	// server's goroutine, so read it atomically as well rather than
	// mixing atomic writes with a plain load.
	if n := atomic.LoadInt64(hits); n != 0 {
		t.Errorf("LLM should not be called when regex matched, got %d hits", n)
	}
}
// TestRoleExtractor_LLMFallback locks the shorthand-coverage path:
// when regex returns empty AND LLM is configured, the LLM is called
// and its result is used. Closes the real_003 shorthand bleed at the
// extraction layer.
func TestRoleExtractor_LLMFallback(t *testing.T) {
	srv, hits := fakeOllama("CNC Operator")
	defer srv.Close()

	rx := &roleExtractor{
		hc:        srv.Client(),
		ollamaURL: srv.URL,
		model:     "test-model",
	}

	got := rx.extract("1 CNC Operator Detroit MI 17:30 Beacon Freight")
	if got != "CNC Operator" {
		t.Errorf("LLM fallback should fire on shorthand, got %q", got)
	}

	// The counter is written with atomic.AddInt64 on the server's
	// goroutine; load it atomically so the access pair is consistent.
	if n := atomic.LoadInt64(hits); n != 1 {
		t.Errorf("expected exactly 1 LLM hit, got %d", n)
	}
}
// TestRoleExtractor_LLMOffLeavesEmpty guards the opt-in default: an
// extractor with no model set must leave shorthand queries unresolved
// (empty role), preserving the real_003b shipping config where no LLM
// cost is paid unless explicitly enabled.
func TestRoleExtractor_LLMOffLeavesEmpty(t *testing.T) {
	const shorthand = "1 CNC Operator Detroit MI 17:30 Beacon Freight"

	// Zero-value extractor: model == "" means the LLM is disabled.
	rx := new(roleExtractor)

	if got := rx.extract(shorthand); got != "" {
		t.Errorf("with LLM off, shorthand should return empty, got %q", got)
	}
}
// TestRoleExtractor_Cache locks the per-process cache: re-extracting
// the same query must NOT call the LLM twice. Paraphrase passes hit
// the same query under cold + warm + paraphrase + rejudge — without
// the cache that's 4× LLM cost per query.
func TestRoleExtractor_Cache(t *testing.T) {
	srv, hits := fakeOllama("Pickers")
	defer srv.Close()

	rx := &roleExtractor{
		hc:        srv.Client(),
		ollamaURL: srv.URL,
		model:     "test-model",
	}

	q := "4 Pickers Detroit MI 13:30 Beacon Freight"
	for i := 0; i < 3; i++ {
		if got := rx.extract(q); got != "Pickers" {
			t.Errorf("call %d: got %q, want Pickers", i, got)
		}
	}

	// The counter is incremented with atomic.AddInt64 on the server's
	// goroutine; read it with an atomic load instead of a plain
	// dereference so writes and reads use the same access discipline.
	if n := atomic.LoadInt64(hits); n != 1 {
		t.Errorf("expected 1 LLM hit (rest cached), got %d", n)
	}
}
// TestRoleExtractor_NilSafe pins the nil-receiver contract so call
// sites in matrixSearch + playbookRecord can skip a nil guard: a nil
// *roleExtractor is expected to behave as a regex-only extractor —
// regex hits still resolve, regex misses come back empty.
func TestRoleExtractor_NilSafe(t *testing.T) {
	var rx *roleExtractor

	steps := []struct {
		query   string
		want    string
		failFmt string
	}{
		{
			query:   "Need 1 Loader in Indianapolis IN starting at 12:00 for Midway",
			want:    "Loader",
			failFmt: "nil receiver should still run regex, got %q",
		},
		{
			query:   "1 CNC Operator Detroit MI 17:30 Beacon",
			want:    "",
			failFmt: "nil receiver should return empty when regex misses (no LLM), got %q",
		},
	}

	for _, s := range steps {
		if got := rx.extract(s.query); got != s.want {
			t.Errorf(s.failFmt, got)
		}
	}
}
// TestExtractRoleViaLLM_HTTPError pins the failure path: a non-2xx
// response from the chat endpoint must come back as an error whose
// text mentions "HTTP 500", so the caller can fall back to an empty
// role cleanly.
func TestExtractRoleViaLLM_HTTPError(t *testing.T) {
	// Every request, regardless of path, gets a 500.
	alwaysFail := func(w http.ResponseWriter, _ *http.Request) {
		http.Error(w, "internal", http.StatusInternalServerError)
	}
	srv := httptest.NewServer(http.HandlerFunc(alwaysFail))
	defer srv.Close()

	_, err := extractRoleViaLLM(srv.Client(), srv.URL, "test-model", "anything")
	if err != nil && strings.Contains(err.Error(), "HTTP 500") {
		return // expected failure surfaced
	}
	t.Errorf("expected HTTP 500 error, got %v", err)
}
// TestRoleExtractor_ClosesCrossRoleShorthandBleed is the synthetic
// witness for the real_003 finding. When both the record and the query
// are shorthand-style (no anchor between role and city) the regex
// yields "" for both; with the LLM off that leaves the gate disabled
// and cross-role bleed possible. With the LLM on, each side should
// extract a non-empty role, so a role mismatch can be caught at the
// matrix gate.
//
// Only the EXTRACTION layer is exercised here (does the harness
// produce the right role token?). The matrix gate's handling of those
// tokens is locked separately in internal/matrix/playbook_test.go's
// TestInjectPlaybookMisses_RoleGateRejectsCrossRole.
func TestRoleExtractor_ClosesCrossRoleShorthandBleed(t *testing.T) {
	// pickRole chooses the fake model's answer from the raw request
	// body, which carries the user message.
	pickRole := func(reqBody string) string {
		switch {
		case strings.Contains(reqBody, "CNC"):
			return "CNC Operator"
		case strings.Contains(reqBody, "Forklift"):
			return "Forklift Operator"
		default:
			return ""
		}
	}

	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		raw, _ := io.ReadAll(r.Body)
		rolePayload, _ := json.Marshal(map[string]string{"role": pickRole(string(raw))})
		envelope, _ := json.Marshal(map[string]any{
			"message": map[string]string{"content": string(rolePayload)},
		})
		_, _ = w.Write(envelope)
	}))
	defer srv.Close()

	rx := &roleExtractor{
		hc:        srv.Client(),
		ollamaURL: srv.URL,
		model:     "test-model",
	}

	// Both queries are shorthand, so the regex is expected to miss and
	// the LLM must supply a distinct role for each.
	cnc := rx.extract("1 CNC Operator Detroit MI 17:30 Beacon Freight")
	fork := rx.extract("1 Forklift Operator Detroit MI 15:00 Beacon Freight")

	if cnc != "CNC Operator" {
		t.Errorf("CNC shorthand should extract via LLM, got %q", cnc)
	}
	if fork != "Forklift Operator" {
		t.Errorf("Forklift shorthand should extract via LLM, got %q", fork)
	}
	if cnc == fork {
		t.Errorf("cross-role shorthand should produce DIFFERENT role tokens (cnc=%q fork=%q)", cnc, fork)
	}
}
// TestExtractRoleViaLLM_BadJSON pins model-output validation: when the
// chat envelope is well-formed but message.content is not JSON, the
// call must return an error mentioning "decode role" instead of
// silently treating it as an empty role.
func TestExtractRoleViaLLM_BadJSON(t *testing.T) {
	garbage := func(w http.ResponseWriter, _ *http.Request) {
		// Valid outer envelope, invalid inner payload.
		envelope, _ := json.Marshal(map[string]any{
			"message": map[string]string{"content": "this is not json"},
		})
		_, _ = w.Write(envelope)
	}
	srv := httptest.NewServer(http.HandlerFunc(garbage))
	defer srv.Close()

	_, err := extractRoleViaLLM(srv.Client(), srv.URL, "test-model", "anything")
	if err != nil && strings.Contains(err.Error(), "decode role") {
		return // decode failure surfaced as expected
	}
	t.Errorf("expected decode error, got %v", err)
}