golangLAKEHOUSE/internal/matrix/playbook_test.go
root 154a72ea5e matrix: Shape B — inject playbook misses + 6/6 paraphrase recovery
The v0 boost-only stance documented in internal/matrix/playbook.go:22-27
("the boost only re-ranks results that ALREADY surfaced from the regular
retrieval") couldn't promote recorded answers that dropped out of a
paraphrase's top-K. playbook_lift_002 surfaced exactly that gap: 0/2
paraphrase recoveries because the recorded answers weren't in regular
retrieval at all (rank=-1).

Shape B: when warm-pass retrieval doesn't surface a playbook hit's
answer, inject a synthetic Result for it directly. Distance =
playbook_hit_distance × BoostFactor — same formula as the boost path so
injections land in comparable distance space. Caller re-sorts +
truncates after both boost and inject have run.

Result on playbook_lift_003 (Shape B + paraphrase pass):

  Verbatim discovery        6
  Verbatim lift             2 / 6
  **Paraphrase top-1**      **6 / 6**
  Paraphrase any-rank in K  6 / 6
  Mean Δ top-1 distance     -0.1637 (warm closer than cold)

Every paraphrase the judge generated landed the v1-recorded answer at
top-1 of the new query's results. The learning property holds — cosine
on embed(paraphrase) finds the recorded query's vector within
DefaultPlaybookMaxDistance (0.5), and Shape B injects the answer.

Verbatim lift dropped from v1's 7/8 because Shape B cross-pollinates
recorded answers across queries. w-4435 (Q2's recording) appears as
warm top-1 for several other queries because their embeddings are
within the playbook hit threshold of "OSHA-30 forklift Wisconsin." This
is a feature, not a bug — the matrix layer's purpose is to share
knowledge across queries — but the lift metric only counts "warm top-1
== cold judge best," so cross-pollinated lifts don't register. A v3
metric would re-judge the warm pass to measure true judge improvement.

Tests:
- TestInjectPlaybookMisses_AddsMissingAnswers — primary claim
- TestInjectPlaybookMisses_SkipsAnswersAlreadyPresent — no double-inject
- TestInjectPlaybookMisses_DedupesPerAnswer — multi-hit same answer
- TestInjectPlaybookMisses_EmptyHits — fast-path no-op

Driver fix: ParaphraseRecordedRank int → *int. The `omitempty` int
silently dropped rank=0 (top-1, the WANTED value) from JSON, making the
v003 report show "null" instead of "0" for every successful recovery.
Pointer keeps nil/rank-0 distinguishable.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 07:06:13 -05:00

309 lines
9.7 KiB
Go

package matrix
import (
"encoding/json"
"testing"
)
// TestPlaybookEntry_Validate checks that a fully populated entry passes
// Validate and that every malformed variant in the table is rejected.
// Each table row runs as its own subtest so a failure is reported under
// the case name and can be selected individually with -run.
func TestPlaybookEntry_Validate(t *testing.T) {
	good := PlaybookEntry{
		QueryText: "x", AnswerID: "y", AnswerCorpus: "z", Score: 0.5,
	}
	if err := good.Validate(); err != nil {
		t.Errorf("good entry should validate: %v", err)
	}

	// Each case removes one populated field or uses a score that the
	// test expects Validate to refuse (1.5 and -0.1).
	cases := []struct {
		name  string
		entry PlaybookEntry
	}{
		{"empty query", PlaybookEntry{AnswerID: "y", AnswerCorpus: "z", Score: 0.5}},
		{"empty answer id", PlaybookEntry{QueryText: "x", AnswerCorpus: "z", Score: 0.5}},
		{"empty corpus", PlaybookEntry{QueryText: "x", AnswerID: "y", Score: 0.5}},
		{"score too high", PlaybookEntry{QueryText: "x", AnswerID: "y", AnswerCorpus: "z", Score: 1.5}},
		{"score negative", PlaybookEntry{QueryText: "x", AnswerID: "y", AnswerCorpus: "z", Score: -0.1}},
	}
	for _, c := range cases {
		// The subtest body runs synchronously (no t.Parallel), so
		// capturing the loop variable is safe on any Go version.
		t.Run(c.name, func(t *testing.T) {
			if err := c.entry.Validate(); err == nil {
				t.Errorf("%s: expected validation error, got nil", c.name)
			}
		})
	}
}
// TestPlaybookEntry_BoostFactor pins the score-to-boost-factor mapping at
// a handful of sample scores, including two that the test expects to be
// clamped (-0.1 and 1.5).
func TestPlaybookEntry_BoostFactor(t *testing.T) {
	const tolerance = 1e-9

	table := []struct {
		score float64
		want  float64
	}{
		{score: 0.0, want: 1.0},
		{score: 0.5, want: 0.75},
		{score: 1.0, want: 0.5},
		{score: -0.1, want: 1.0}, // clamped
		{score: 1.5, want: 0.5},  // clamped
	}

	for _, row := range table {
		entry := PlaybookEntry{Score: row.score}
		got := entry.BoostFactor()
		if diff := abs(got - row.want); diff > tolerance {
			t.Errorf("BoostFactor(score=%.2f): want %.4f, got %.4f", row.score, row.want, got)
		}
	}
}
// TestApplyPlaybookBoost_NoHitsLeaveResultsAlone verifies that calling
// ApplyPlaybookBoost with a nil hit list reports zero boosted results and
// leaves the result order as it was.
func TestApplyPlaybookBoost_NoHitsLeaveResultsAlone(t *testing.T) {
	results := []Result{
		{ID: "a", Distance: 0.1, Corpus: "x"},
		{ID: "b", Distance: 0.2, Corpus: "x"},
	}

	if boosted := ApplyPlaybookBoost(results, nil); boosted != 0 {
		t.Errorf("expected 0 boosted, got %d", boosted)
	}

	orderKept := results[0].ID == "a" && results[1].ID == "b"
	if !orderKept {
		t.Errorf("results reordered without hits: %v", results)
	}
}
// TestApplyPlaybookBoost_BoostMovesResultUp verifies that a boosted
// result both gets a smaller distance and moves up in the slice.
func TestApplyPlaybookBoost_BoostMovesResultUp(t *testing.T) {
	// Starting ranking: a (0.10), then b (0.20), then c (0.30).
	// The single hit names answer c with score 1.0, so the test expects
	// c's distance to become 0.30 * 0.5 = 0.15 and the order to end up
	// as a, c, b.
	ranked := []Result{
		{ID: "a", Distance: 0.10, Corpus: "x"},
		{ID: "b", Distance: 0.20, Corpus: "x"},
		{ID: "c", Distance: 0.30, Corpus: "x"},
	}
	target := PlaybookEntry{AnswerID: "c", AnswerCorpus: "x", Score: 1.0}
	hits := []PlaybookHit{
		{PlaybookID: "p1", Distance: 0.05, Entry: target},
	}

	boosted := ApplyPlaybookBoost(ranked, hits)
	if boosted != 1 {
		t.Errorf("expected 1 boosted, got %d", boosted)
	}

	first, second, third := ranked[0].ID, ranked[1].ID, ranked[2].ID
	if !(first == "a" && second == "c" && third == "b") {
		t.Errorf("expected order a,c,b after boost; got %v", idsOf(ranked))
	}

	if gap := abs(float64(ranked[1].Distance) - 0.15); gap > 1e-6 {
		t.Errorf("expected c distance 0.15 after boost; got %.4f", ranked[1].Distance)
	}
}
func TestApplyPlaybookBoost_HighestScoreWinsForSameAnswer(t *testing.T) {
results := []Result{
{ID: "a", Distance: 0.30, Corpus: "x"},
}
// Two playbook hits both pointing at "a". Score=0.4 (weak boost)
// + Score=0.9 (strong boost). Strong should win — distance gets
// multiplied by 1-0.5*0.9 = 0.55, not by 1-0.5*0.4 = 0.80.
hits := []PlaybookHit{
{PlaybookID: "p_weak", Distance: 0.05, Entry: PlaybookEntry{
AnswerID: "a", AnswerCorpus: "x", Score: 0.4,
}},
{PlaybookID: "p_strong", Distance: 0.05, Entry: PlaybookEntry{
AnswerID: "a", AnswerCorpus: "x", Score: 0.9,
}},
}
ApplyPlaybookBoost(results, hits)
wantDist := 0.30 * 0.55
if abs(float64(results[0].Distance)-wantDist) > 1e-6 {
t.Errorf("strong-score boost should win: want %.4f, got %.4f", wantDist, results[0].Distance)
}
}
// TestApplyPlaybookBoost_CorpusAttributionRespected verifies that a hit
// only boosts a result when both the answer ID and the corpus match.
func TestApplyPlaybookBoost_CorpusAttributionRespected(t *testing.T) {
	// The playbook hit names answer "a" in corpus "x", while the only
	// result is answer "a" in corpus "y". The test expects no boost:
	// the join key is the (id, corpus) pair, not the id alone, so an ID
	// that collides across corpora must not produce a false positive.
	const originalDistance = 0.30

	results := []Result{
		{ID: "a", Distance: originalDistance, Corpus: "y"},
	}
	otherCorpus := PlaybookEntry{AnswerID: "a", AnswerCorpus: "x", Score: 1.0}
	hits := []PlaybookHit{
		{PlaybookID: "p1", Distance: 0.05, Entry: otherCorpus},
	}

	if boosted := ApplyPlaybookBoost(results, hits); boosted != 0 {
		t.Errorf("cross-corpus collision should not boost: got %d", boosted)
	}

	// The 1e-6 tolerance absorbs the float32→float64 conversion; what
	// matters is that the distance is unchanged from the input.
	drift := abs(float64(results[0].Distance) - originalDistance)
	if drift > 1e-6 {
		t.Errorf("distance should be unchanged: got %.6f", results[0].Distance)
	}
}
// TestPlaybookEntry_RoundTripJSON builds an entry with NewPlaybookEntry,
// serializes it with MarshalMetadata, decodes it with
// UnmarshalPlaybookMetadata, and checks that the query text, answer ID,
// answer corpus, score, and both tags come back as they went in. It also
// checks that RecordedAtNs is non-zero after the round trip.
func TestPlaybookEntry_RoundTripJSON(t *testing.T) {
	e := NewPlaybookEntry("forklift query", "w-12345", "workers", 0.85, []string{"chicago", "verified"})

	raw, err := e.MarshalMetadata()
	if err != nil {
		t.Fatalf("marshal: %v", err)
	}
	got, err := UnmarshalPlaybookMetadata(raw)
	if err != nil {
		t.Fatalf("unmarshal: %v", err)
	}

	if got.QueryText != e.QueryText || got.AnswerID != e.AnswerID ||
		got.AnswerCorpus != e.AnswerCorpus || got.Score != e.Score {
		t.Errorf("round-trip mismatch: want %+v, got %+v", e, got)
	}

	// Compare every tag, not just the first: a round trip that kept the
	// count but corrupted the second tag must also fail. The length
	// check comes first so the index expressions cannot panic.
	if len(got.Tags) != 2 || got.Tags[0] != "chicago" || got.Tags[1] != "verified" {
		t.Errorf("tags lost in round-trip: %v", got.Tags)
	}

	if got.RecordedAtNs == 0 {
		t.Error("RecordedAtNs not set by NewPlaybookEntry")
	}
}
// TestUnmarshalPlaybookMetadata_RejectsEmpty expects
// UnmarshalPlaybookMetadata to return an error for a zero-length
// json.RawMessage.
func TestUnmarshalPlaybookMetadata_RejectsEmpty(t *testing.T) {
	empty := json.RawMessage{}
	_, err := UnmarshalPlaybookMetadata(empty)
	if err != nil {
		return
	}
	t.Error("empty metadata should error")
}
// TestInjectPlaybookMisses_AddsMissingAnswers pins Shape B's central
// behavior: if the answer named by a playbook hit is absent from the
// regular retrieval results, InjectPlaybookMisses adds a synthetic
// Result for it. The motivation is reality test playbook_lift_002, which
// showed 0/2 paraphrase recoveries because the v0 boost-only approach
// could not promote answers that had fallen out of the paraphrase's
// top-K.
func TestInjectPlaybookMisses_AddsMissingAnswers(t *testing.T) {
	retrieved := []Result{
		{ID: "w-1", Corpus: "workers", Distance: 0.30},
		{ID: "w-2", Corpus: "workers", Distance: 0.35},
	}
	recorded := PlaybookEntry{
		QueryText:    "recorded query",
		AnswerID:     "w-99", // NOT in results
		AnswerCorpus: "workers",
		Score:        1.0, // strong outcome → boost factor 0.5
	}
	hits := []PlaybookHit{
		{
			PlaybookID: "pb-x",
			Distance:   0.20, // current query is close to recorded query
			Entry:      recorded,
		},
	}

	out, injected := InjectPlaybookMisses(retrieved, hits)
	if injected != 1 {
		t.Fatalf("expected 1 injected, got %d", injected)
	}
	if len(out) != 3 {
		t.Fatalf("expected len=3, got %d (%v)", len(out), idsOf(out))
	}

	// Locate the injected entry by ID; it must exist and must carry the
	// playbook provenance markers in its metadata.
	pos := -1
	for i := range out {
		if out[i].ID == "w-99" {
			pos = i
			break
		}
	}
	if pos < 0 {
		t.Fatal("w-99 not present in output")
	}
	synthetic := &out[pos]

	// Expected distance is 0.20 * 0.5 = 0.10, which puts the entry near
	// the top once the caller re-sorts.
	if synthetic.Distance < 0.099 || synthetic.Distance > 0.101 {
		t.Errorf("expected injected distance ~0.10, got %f", synthetic.Distance)
	}

	var meta map[string]any
	if err := json.Unmarshal(synthetic.Metadata, &meta); err != nil {
		t.Fatalf("decode meta: %v", err)
	}
	// A failed type assertion yields the zero value, which the checks
	// below treat the same as a missing or wrong marker.
	flagged, _ := meta["playbook_injected"].(bool)
	if !flagged {
		t.Errorf("expected playbook_injected=true marker, got %v", meta)
	}
	queryText, _ := meta["playbook_query_text"].(string)
	if queryText != "recorded query" {
		t.Errorf("expected recorded query in meta, got %v", queryText)
	}
}
// TestInjectPlaybookMisses_SkipsAnswersAlreadyPresent pins the
// boost-only-when-present rule: an answer that is ALREADY among the
// results must not be injected a second time, because that case belongs
// to ApplyPlaybookBoost's in-place re-rank.
func TestInjectPlaybookMisses_SkipsAnswersAlreadyPresent(t *testing.T) {
	present := []Result{
		{ID: "w-1", Corpus: "workers", Distance: 0.30},
		{ID: "w-99", Corpus: "workers", Distance: 0.40}, // ALREADY HERE
	}
	entry := PlaybookEntry{
		QueryText: "x", AnswerID: "w-99", AnswerCorpus: "workers", Score: 1.0,
	}
	hits := []PlaybookHit{
		{PlaybookID: "pb-x", Distance: 0.20, Entry: entry},
	}

	out, injected := InjectPlaybookMisses(present, hits)

	if injected != 0 {
		t.Errorf("expected 0 injected (answer already present), got %d", injected)
	}
	if got := len(out); got != 2 {
		t.Errorf("expected results unchanged at len=2, got %d", got)
	}
}
// TestInjectPlaybookMisses_DedupesPerAnswer pins the dedupe rule: several
// playbook hits that all name the same missing answer collapse into ONE
// injected result, and the highest-scoring hit determines its distance.
func TestInjectPlaybookMisses_DedupesPerAnswer(t *testing.T) {
	results := []Result{{ID: "w-1", Corpus: "workers", Distance: 0.30}}
	hits := []PlaybookHit{
		{
			PlaybookID: "pb-low",
			Distance:   0.30,
			Entry:      PlaybookEntry{QueryText: "q1", AnswerID: "w-99", AnswerCorpus: "workers", Score: 0.4},
		},
		{
			PlaybookID: "pb-high",
			Distance:   0.30,
			Entry:      PlaybookEntry{QueryText: "q2", AnswerID: "w-99", AnswerCorpus: "workers", Score: 1.0},
		},
	}

	out, injected := InjectPlaybookMisses(results, hits)
	if injected != 1 {
		t.Errorf("expected 1 injection (deduped), got %d", injected)
	}

	// Score=1.0 (the high one) wins → boost factor 0.5 → distance 0.15.
	// Count the matching entries as well: without the count, the distance
	// check would pass vacuously when w-99 is absent, and a duplicate
	// entry would go unnoticed.
	matches := 0
	for _, r := range out {
		if r.ID != "w-99" {
			continue
		}
		matches++
		if r.Distance < 0.149 || r.Distance > 0.151 {
			t.Errorf("expected distance from highest-score hit (~0.15), got %f", r.Distance)
		}
	}
	if matches != 1 {
		t.Errorf("expected exactly 1 w-99 entry in output, got %d (%v)", matches, idsOf(out))
	}
}
// TestInjectPlaybookMisses_EmptyHits checks the no-op path: with a nil
// hit list nothing is injected and the result count stays the same.
func TestInjectPlaybookMisses_EmptyHits(t *testing.T) {
	only := []Result{
		{ID: "w-1", Corpus: "workers", Distance: 0.30},
	}

	out, injected := InjectPlaybookMisses(only, nil)

	if injected != 0 {
		t.Errorf("expected 0 injection, got %d", injected)
	}
	if n := len(out); n != 1 {
		t.Errorf("results should be unchanged, got len=%d", n)
	}
}
// abs returns the magnitude of f. Values that are not strictly negative
// are returned as they are; strictly negative values are negated.
func abs(f float64) float64 {
	if f >= 0 {
		return f
	}
	return -f
}
// idsOf returns the ID field of every result in rs, in the same order.
// The returned slice is always non-nil and has len(rs) elements.
func idsOf(rs []Result) []string {
	ids := make([]string, 0, len(rs))
	for i := range rs {
		ids = append(ids, rs[i].ID)
	}
	return ids
}