real_003 left a known-weak hole: shorthand-style queries
("{count} {role} {city} {state} ...") have no separator between
role and city, so a regex can't reliably extract — leaving the
cross-role gate disabled when both record AND query are shorthand.
This commit adds a roleExtractor with regex-first + LLM fallback:
- Regex first (fast, deterministic) — handles need + client_first +
looking from real_003b. ~75% of styles, no LLM cost paid.
- LLM fallback when regex returns empty AND model is configured —
Ollama-shape /api/chat with format=json, schema-tight prompt,
temperature 0. ~1-3s on local qwen2.5.
- Per-process cache — paraphrase + rejudge passes reuse the same
query 4× per run; cache prevents 4× LLM cost.
- Off-by-default — opt-in via -llm-role-extract flag (CLI) and
LLM_ROLE_EXTRACT=1 env var (harness wrapper). real_003b shipping
config unchanged unless explicitly enabled.
8 new tests in scripts/playbook_lift/main_test.go:
- TestRoleExtractor_RegexFirst: LLM not called when regex matches
- TestRoleExtractor_LLMFallback: shorthand goes to LLM
- TestRoleExtractor_LLMOffLeavesEmpty: opt-in default preserved
- TestRoleExtractor_Cache: 3 calls = 1 LLM hit
- TestRoleExtractor_NilSafe: nil receiver runs regex only
- TestExtractRoleViaLLM_HTTPError + _BadJSON: failure paths
- TestRoleExtractor_ClosesCrossRoleShorthandBleed: synthetic
witness for the real_003 scenario — both record + query are
shorthand, regex returns "" for both, LLM produces DIFFERENT
role tokens for CNC vs Forklift, so matrix gate's cross-role
rejection (locked separately in
TestInjectPlaybookMisses_RoleGateRejectsCrossRole) fires
correctly. This is the load-bearing verification.
Reality test real_004 ran the same 40-query stress as real_003 with
LLM extraction on. Cross-style same-role boosts fired correctly
across all 4 styles for Loaders + Packers + Shipping Clerk clusters
(including shorthand → other-style transfer). No cross-role bleed
observed. The reality test alone can't be a clean "with vs without"
comparison (HNSW build is non-deterministic across runs, and
real_004 stochastics didn't trigger a shorthand recording at all),
which is why the unit-test witness exists.
Production note (in real_004_findings.md): LLM extraction is for
reality-test coverage of arbitrary query shapes. Production should
extract role at INGEST time (when the inbox parser already runs an
LLM) and pass already-resolved role through requests — same shape
as multi_coord_stress's existing Demand{Role: ...} model. The hot
path should never need the harness extractor's per-query LLM cost.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
284 lines
9.6 KiB
Go
284 lines
9.6 KiB
Go
package main
|
||
|
||
import (
|
||
"encoding/json"
|
||
"io"
|
||
"net/http"
|
||
"net/http/httptest"
|
||
"strings"
|
||
"sync/atomic"
|
||
"testing"
|
||
)
|
||
|
||
// TestExtractRoleFromNeed locks the four query-shape patterns documented
|
||
// in real_003_findings.md so a future change to the regex can't silently
|
||
// drop coverage of any production-shape style. Real_001 used `need`-only;
|
||
// real_003 confirmed `shorthand` cross-role bleed; the extended
|
||
// extractor in real_003b covers `client_first` + `looking` and leaves
|
||
// `shorthand` as a known limitation (no separator between role and city).
|
||
func TestExtractRoleFromNeed(t *testing.T) {
|
||
cases := []struct {
|
||
name string
|
||
query string
|
||
want string
|
||
}{
|
||
{
|
||
"need style — original real_001 form",
|
||
"Need 1 Forklift Operator in Detroit MI starting at 15:00 for Beacon Freight",
|
||
"Forklift Operator",
|
||
},
|
||
{
|
||
"need with deadline trailer",
|
||
"Need 4 Pickers in Detroit MI starting at 13:30 for Beacon Freight, deadline 2026-05-28",
|
||
"Pickers",
|
||
},
|
||
{
|
||
"client_first style — added in real_003b",
|
||
"Beacon Freight needs 1 Forklift Operator in Detroit MI at 15:00",
|
||
"Forklift Operator",
|
||
},
|
||
{
|
||
"client_first with multi-word client",
|
||
"Parallel Machining needs 5 Warehouse Associates in Kansas City MO at 09:00",
|
||
"Warehouse Associates",
|
||
},
|
||
{
|
||
"looking style — added in real_003b",
|
||
"Looking for 1 Forklift Operator at Beacon Freight in Detroit MI for 15:00 shift",
|
||
"Forklift Operator",
|
||
},
|
||
{
|
||
"looking with multi-word role + 4-digit count",
|
||
"Looking for 1234 Senior Production Supervisors at Heritage Foods in Flint MI for 08:30 shift",
|
||
"Senior Production Supervisors",
|
||
},
|
||
{
|
||
"shorthand — known limitation, returns empty",
|
||
"1 Forklift Operator Detroit MI 15:00 Beacon Freight",
|
||
"",
|
||
},
|
||
{
|
||
"shorthand multi-word city — also empty",
|
||
"5 Warehouse Associates Kansas City MO 09:00 Parallel Machining",
|
||
"",
|
||
},
|
||
{
|
||
"lift-suite multi-constraint — no clean role, returns empty",
|
||
"Forklift operator with OSHA-30, warehouse experience, day shift availability",
|
||
"",
|
||
},
|
||
{
|
||
"OOD honesty signal — lift-suite, returns empty",
|
||
"Dental hygienist with three years experience, Indianapolis area",
|
||
"",
|
||
},
|
||
}
|
||
for _, c := range cases {
|
||
t.Run(c.name, func(t *testing.T) {
|
||
got := extractRoleFromNeed(c.query)
|
||
if got != c.want {
|
||
t.Errorf("extractRoleFromNeed(%q) = %q, want %q", c.query, got, c.want)
|
||
}
|
||
})
|
||
}
|
||
}
|
||
|
||
// fakeOllama starts an httptest server that imitates the Ollama /api/chat
// endpoint and always answers with the given role.
//
// The response body is {"message":{"content":"<json>"}}, where <json> is the
// string form of {"role": role} — i.e. the model output is itself a JSON
// document embedded as a string. Requests to any other path get a 404 and
// are not counted.
//
// It returns the server (the caller must Close it) and a pointer to a hit
// counter that is incremented atomically once per /api/chat request, so
// caching tests can assert on the number of calls.
func fakeOllama(role string) (*httptest.Server, *int64) {
	counter := new(int64)

	// The reply never varies for a given role, so build it once up front.
	// Marshalling plain string maps cannot fail, hence the discarded errors.
	modelOutput, _ := json.Marshal(map[string]string{"role": role})
	reply, _ := json.Marshal(map[string]any{
		"message": map[string]string{"content": string(modelOutput)},
	})

	handler := func(w http.ResponseWriter, req *http.Request) {
		if req.URL.Path != "/api/chat" {
			http.NotFound(w, req)
			return
		}
		atomic.AddInt64(counter, 1)
		// Drain the request body; its contents are irrelevant to the fake.
		_, _ = io.ReadAll(req.Body)
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write(reply)
	}

	return httptest.NewServer(http.HandlerFunc(handler)), counter
}
|
||
|
||
// TestRoleExtractor_RegexFirst locks the priority order: when the
|
||
// regex matches, the LLM must NOT be called even if configured. Same
|
||
// behavior the real_003b shipping config relies on (don't pay LLM
|
||
// cost on need/client_first/looking queries).
|
||
func TestRoleExtractor_RegexFirst(t *testing.T) {
|
||
srv, hits := fakeOllama("LLM-WONT-BE-CALLED")
|
||
defer srv.Close()
|
||
rx := &roleExtractor{
|
||
hc: srv.Client(),
|
||
ollamaURL: srv.URL,
|
||
model: "test-model", // LLM is configured...
|
||
}
|
||
got := rx.extract("Need 5 Forklift Operators in Detroit MI starting at 09:00 for ACME")
|
||
if got != "Forklift Operators" {
|
||
t.Errorf("regex should win on Need-form query, got %q", got)
|
||
}
|
||
if *hits != 0 {
|
||
t.Errorf("LLM should not be called when regex matched, got %d hits", *hits)
|
||
}
|
||
}
|
||
|
||
// TestRoleExtractor_LLMFallback locks the shorthand-coverage path:
|
||
// when regex returns empty AND LLM is configured, the LLM is called
|
||
// and its result is used. Closes the real_003 shorthand bleed at the
|
||
// extraction layer.
|
||
func TestRoleExtractor_LLMFallback(t *testing.T) {
|
||
srv, hits := fakeOllama("CNC Operator")
|
||
defer srv.Close()
|
||
rx := &roleExtractor{
|
||
hc: srv.Client(),
|
||
ollamaURL: srv.URL,
|
||
model: "test-model",
|
||
}
|
||
got := rx.extract("1 CNC Operator Detroit MI 17:30 Beacon Freight")
|
||
if got != "CNC Operator" {
|
||
t.Errorf("LLM fallback should fire on shorthand, got %q", got)
|
||
}
|
||
if *hits != 1 {
|
||
t.Errorf("expected exactly 1 LLM hit, got %d", *hits)
|
||
}
|
||
}
|
||
|
||
// TestRoleExtractor_LLMOffLeavesEmpty locks the opt-in default: with
|
||
// LLM model unset, shorthand queries return empty (preserves
|
||
// real_003b shipping config — no LLM cost paid by default).
|
||
func TestRoleExtractor_LLMOffLeavesEmpty(t *testing.T) {
|
||
rx := &roleExtractor{} // model = "" disables LLM
|
||
got := rx.extract("1 CNC Operator Detroit MI 17:30 Beacon Freight")
|
||
if got != "" {
|
||
t.Errorf("with LLM off, shorthand should return empty, got %q", got)
|
||
}
|
||
}
|
||
|
||
// TestRoleExtractor_Cache locks the per-process cache: re-extracting
|
||
// the same query must NOT call the LLM twice. Paraphrase passes hit
|
||
// the same query under cold + warm + paraphrase + rejudge — without
|
||
// the cache that's 4× LLM cost per query.
|
||
func TestRoleExtractor_Cache(t *testing.T) {
|
||
srv, hits := fakeOllama("Pickers")
|
||
defer srv.Close()
|
||
rx := &roleExtractor{
|
||
hc: srv.Client(),
|
||
ollamaURL: srv.URL,
|
||
model: "test-model",
|
||
}
|
||
q := "4 Pickers Detroit MI 13:30 Beacon Freight"
|
||
for i := 0; i < 3; i++ {
|
||
got := rx.extract(q)
|
||
if got != "Pickers" {
|
||
t.Errorf("call %d: got %q, want Pickers", i, got)
|
||
}
|
||
}
|
||
if *hits != 1 {
|
||
t.Errorf("expected 1 LLM hit (rest cached), got %d", *hits)
|
||
}
|
||
}
|
||
|
||
// TestRoleExtractor_NilSafe locks the nil-receiver behavior so call
|
||
// sites in matrixSearch + playbookRecord don't need a guard. nil
|
||
// extractor degrades to regex-only.
|
||
func TestRoleExtractor_NilSafe(t *testing.T) {
|
||
var rx *roleExtractor
|
||
got := rx.extract("Need 1 Loader in Indianapolis IN starting at 12:00 for Midway")
|
||
if got != "Loader" {
|
||
t.Errorf("nil receiver should still run regex, got %q", got)
|
||
}
|
||
got = rx.extract("1 CNC Operator Detroit MI 17:30 Beacon")
|
||
if got != "" {
|
||
t.Errorf("nil receiver should return empty when regex misses (no LLM), got %q", got)
|
||
}
|
||
}
|
||
|
||
// TestExtractRoleViaLLM_HTTPError locks the failure path: HTTP non-2xx
|
||
// surfaces as an error so the caller can fall back to empty cleanly.
|
||
func TestExtractRoleViaLLM_HTTPError(t *testing.T) {
|
||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||
http.Error(w, "internal", http.StatusInternalServerError)
|
||
}))
|
||
defer srv.Close()
|
||
_, err := extractRoleViaLLM(srv.Client(), srv.URL, "test-model", "anything")
|
||
if err == nil || !strings.Contains(err.Error(), "HTTP 500") {
|
||
t.Errorf("expected HTTP 500 error, got %v", err)
|
||
}
|
||
}
|
||
|
||
// TestRoleExtractor_ClosesCrossRoleShorthandBleed is the synthetic
|
||
// witness for the real_003 finding: when both record AND query are
|
||
// shorthand-style (no anchor between role and city), the regex
|
||
// returns "" for both. With LLM off, both sides go empty → gate
|
||
// disabled → cross-role bleed possible. With LLM on, both sides
|
||
// extract a non-empty role → role-mismatch is caught at the matrix
|
||
// gate.
|
||
//
|
||
// This test isolates the EXTRACTION layer (does the harness produce
|
||
// the right role token?). The matrix gate's behavior on those tokens
|
||
// is locked separately in internal/matrix/playbook_test.go's
|
||
// TestInjectPlaybookMisses_RoleGateRejectsCrossRole.
|
||
func TestRoleExtractor_ClosesCrossRoleShorthandBleed(t *testing.T) {
|
||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||
body, _ := io.ReadAll(r.Body)
|
||
// Determine which role to return based on the user message.
|
||
role := ""
|
||
if strings.Contains(string(body), "CNC") {
|
||
role = "CNC Operator"
|
||
} else if strings.Contains(string(body), "Forklift") {
|
||
role = "Forklift Operator"
|
||
}
|
||
inner, _ := json.Marshal(map[string]string{"role": role})
|
||
out, _ := json.Marshal(map[string]any{
|
||
"message": map[string]string{"content": string(inner)},
|
||
})
|
||
_, _ = w.Write(out)
|
||
}))
|
||
defer srv.Close()
|
||
|
||
rx := &roleExtractor{
|
||
hc: srv.Client(),
|
||
ollamaURL: srv.URL,
|
||
model: "test-model",
|
||
}
|
||
|
||
// Both queries are shorthand — regex would return "" for both.
|
||
// LLM extracts a real role for each, and they differ.
|
||
cnc := rx.extract("1 CNC Operator Detroit MI 17:30 Beacon Freight")
|
||
fork := rx.extract("1 Forklift Operator Detroit MI 15:00 Beacon Freight")
|
||
|
||
if cnc != "CNC Operator" {
|
||
t.Errorf("CNC shorthand should extract via LLM, got %q", cnc)
|
||
}
|
||
if fork != "Forklift Operator" {
|
||
t.Errorf("Forklift shorthand should extract via LLM, got %q", fork)
|
||
}
|
||
if cnc == fork {
|
||
t.Errorf("cross-role shorthand should produce DIFFERENT role tokens (cnc=%q fork=%q)", cnc, fork)
|
||
}
|
||
}
|
||
|
||
// TestExtractRoleViaLLM_BadJSON locks the model-output validation: if
|
||
// the LLM returns non-JSON content, the error is surfaced (don't
|
||
// silently treat it as empty role).
|
||
func TestExtractRoleViaLLM_BadJSON(t *testing.T) {
|
||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||
out, _ := json.Marshal(map[string]any{
|
||
"message": map[string]string{"content": "this is not json"},
|
||
})
|
||
_, _ = w.Write(out)
|
||
}))
|
||
defer srv.Close()
|
||
_, err := extractRoleViaLLM(srv.Client(), srv.URL, "test-model", "anything")
|
||
if err == nil || !strings.Contains(err.Error(), "decode role") {
|
||
t.Errorf("expected decode error, got %v", err)
|
||
}
|
||
}
|