root b2e45f7f26 playbook_lift: harness expansion + reality test #001 (7/8 lift, 87.5%)
The 5-loop substrate's load-bearing gate is verified — playbook +
matrix indexer give the results we're looking for. Per the report's
rubric, lift ≥ 50% of discoveries means matrix is doing real work;
7/8 = 87.5% blew through that.

Harness was structurally hiding bugs behind a 5-daemon stripped boot.
Expanding to the full 10-daemon prod stack surfaced 7 fixes in cascade:

1. driver→matrixd: {"query": ...} → {"query_text": ...} field name
2. harness temp toml missing [s3] → wrong default bucket → catalogd
   rehydrate 500 on first call
3. harness→queryd SQL probe: {"q": ...} → {"sql": ...} field name
4. expand boot from 5 → 10 daemons in dep-ordered launch
5. add SQL surface probe (3-row CSV ingest → COUNT(*)=3 assertion)
6. candidates corpus was synthetic SWE-tech (Swift/iOS, Scala/Spark) —
   wrong domain for staffing queries; replaced with ethereal_workers
   (10K rows, real staffing schema, "e-" id prefix to avoid collision
   with workers' "w-"). staffing_workers driver gains -index-name +
   -id-prefix flags so the same binary serves both corpora
7. local_judge qwen3.5:latest is a vision-SSM 256K-ctx build running
   ~30s per judge call against the lift loop; reverted to
   qwen2.5:latest (~1s/call, 30× faster, held lift theory)

Each contract drift (1, 3) is now locked into a cmd/<bin>/main_test.go
so future drift fires in `go test`, not in a reality run. R-005 closed:

- cmd/matrixd/main_test.go (new) — playbook record drift detector +
  score bounds + 6 routes mounted
- cmd/queryd/main_test.go — wrong-field-name drift detector
- cmd/pathwayd/main_test.go (new) — 9 routes + add round-trip + retire
- cmd/observerd/main_test.go (new) — 4 routes + invalid-op + unknown-mode

`go test ./cmd/{matrixd,queryd,pathwayd,observerd}` all green.

Reality test results (reports/reality-tests/playbook_lift_001.{json,md}):
  Queries              21 (staffing-domain, 7 categories)
  Discoveries          8 (judge ≠ cosine top-1)
  Lifts                7/8 (87.5%)
  Boosts triggered     9
  Mean Δ distance      -0.053 (warm closer than cold)
  OOD honesty          dental/RN/SWE rated 1, no fake matches
  Cross-corpus boosts  confirmed (e- ↔ w- swaps in lifts)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 06:22:21 -05:00

410 lines
13 KiB
Go

// Playbook-lift reality test driver. Two-pass design:
//
// Pass 1 (cold): for each query → matrix.search use_playbook=false →
// LLM judge rates top-K → record playbook entry pointing
// at the highest-rated result (which may NOT be top-1
// by distance — that's the discovery worth boosting).
//
// Pass 2 (warm): same queries → use_playbook=true → measure how the
// ranking shifted.
//
// Lift = real if pass-2 brings the LLM-judged-best result into top-1
// more often than pass-1. If lift ≈ 0, the playbook is just confirming
// what cosine already said and the 5-loop thesis is unproven.
//
// Honest about what this measures: with no human-labeled ground truth,
// the LLM judge IS the ground truth proxy. That's the small-model
// pipeline thesis itself — the same model class that runs the inner
// loop is also what we trust to evaluate it. If you don't trust the
// judge, the lift number is meaningless; that's a separate problem
// for ground-truth labeling.
//
// Usage (driven by scripts/playbook_lift.sh):
//
//	playbook_lift -gateway http://127.0.0.1:3110 \
//	    -queries tests/reality/playbook_lift_queries.txt \
//	    -judge qwen3.5:latest \
//	    -corpora workers,candidates \
//	    -k 10 \
//	    -out reports/reality-tests/playbook_lift_001.json
package main
import (
"bytes"
"encoding/json"
"flag"
"fmt"
"io"
"log"
"net/http"
"os"
"sort"
"strings"
"time"
"git.agentview.dev/profit/golangLAKEHOUSE/internal/shared"
)
// matrixResult is one hit decoded from a /v1/matrix/search response.
type matrixResult struct {
	// ID identifies the hit; it is what gets recorded as a playbook answer.
	ID string `json:"id"`
	// Distance is the search distance reported for the hit.
	Distance float32 `json:"distance"`
	// Corpus names the corpus the hit came from.
	Corpus string `json:"corpus"`
	// Metadata stays undecoded; it is pasted verbatim into the judge prompt.
	Metadata json.RawMessage `json:"metadata,omitempty"`
}
// matrixResp is the decoded body of a /v1/matrix/search response.
type matrixResp struct {
	// Results holds the hits in the order the service returned them; the
	// driver treats index 0 as top-1.
	Results []matrixResult `json:"results"`
	// PerCorpusCounts is decoded from "per_corpus_counts"; this driver does
	// not read it.
	PerCorpusCounts map[string]int `json:"per_corpus_counts"`
	// PlaybookBoosted is the "playbook_boosted" count from the response; the
	// driver sums it across warm searches.
	PlaybookBoosted int `json:"playbook_boosted,omitempty"`
}
// judgeVerdict is the JSON object the LLM judge is instructed to emit.
type judgeVerdict struct {
	// Rating is the 1-5 relevance score; values outside that range are
	// rejected by judgeRate.
	Rating int `json:"rating"`
	// Reason is the judge's one-sentence justification.
	Reason string `json:"reason"`
}
// queryRun is the per-query record written to the report. Field order is
// significant: it fixes the key order of the emitted JSON. Ranks are 0-based
// indexes into the search results.
type queryRun struct {
	Query string `json:"query"`

	// Cold pass (use_playbook=false).
	ColdTop1ID          string  `json:"cold_top1_id"`
	ColdTop1Distance    float32 `json:"cold_top1_distance"`
	ColdJudgeBestID     string  `json:"cold_judge_best_id"`
	ColdJudgeBestRank   int     `json:"cold_judge_best_rank"`
	ColdJudgeBestRating int     `json:"cold_judge_best_rating"`
	ColdRatings         []int   `json:"cold_ratings"` // one judge rating per cold result, in result order

	// Playbook entry recorded after the cold pass, if any.
	PlaybookRecorded bool   `json:"playbook_recorded"`
	PlaybookID       string `json:"playbook_target_id,omitempty"`

	// Warm pass (use_playbook=true).
	WarmTop1ID        string  `json:"warm_top1_id"`
	WarmTop1Distance  float32 `json:"warm_top1_distance"`
	WarmBoostedCount  int     `json:"warm_boosted_count"`
	WarmJudgeBestRank int     `json:"warm_judge_best_rank"` // where the cold judge-best ID landed; -1 if absent from the warm results

	// Lift is true when a playbook was recorded for this query (judge-best
	// was below top-1 cold) and that result is top-1 in the warm pass.
	Lift bool   `json:"lift"`
	Note string `json:"note,omitempty"`
}
// summary aggregates the whole run; it is written next to the per-query
// records under the "summary" key of the report.
type summary struct {
	// Total is the number of queries that produced a cold-pass record.
	Total int `json:"total"`
	// WithDiscovery counts queries where the judge-best result was not the
	// cold top-1 and was rated at or above the playbook threshold.
	WithDiscovery int `json:"with_discovery"`
	// LiftCount counts queries whose recorded judge-best became warm top-1.
	LiftCount int `json:"lift_count"`
	// NoChange counts queries that completed the warm pass without a lift.
	NoChange int `json:"no_change"`
	// MeanTop1DeltaDistance averages warm top-1 distance minus cold top-1
	// distance; negative means the warm top-1 is closer.
	MeanTop1DeltaDistance float32 `json:"mean_top1_delta_distance"`
	// PlaybookBoostedTotal sums the boosted counts reported by warm searches.
	PlaybookBoostedTotal int `json:"playbook_boosted_total"`
	// GeneratedAt is the time the summary was assembled.
	GeneratedAt time.Time `json:"generated_at"`
}
// main runs the two-pass playbook-lift reality test and writes a JSON report.
//
// Pass 1 (cold) searches every query with the playbook disabled, has the LLM
// judge rate each result, and records a playbook entry when the judge's
// favourite is rated >= 4 and is not already top-1. Pass 2 (warm) repeats the
// searches with the playbook enabled and counts a lift when a recorded
// favourite is now top-1. Queries whose cold search fails or returns nothing
// are dropped from the report entirely.
func main() {
	configPath := flag.String("config", "lakehouse.toml", "path to TOML config (provides judge default from [models].local_judge)")
	gw := flag.String("gateway", "http://127.0.0.1:3110", "Go gateway base URL")
	ollama := flag.String("ollama", "http://127.0.0.1:11434", "Ollama base URL for LLM judge")
	queries := flag.String("queries", "tests/reality/playbook_lift_queries.txt", "query corpus path")
	corporaCSV := flag.String("corpora", "workers,candidates", "comma-separated matrix corpora")
	// Empty default: resolved below by priority flag > env > config > hardcoded.
	judge := flag.String("judge", "", "Ollama model for relevance judging (empty = read from config [models].local_judge)")
	k := flag.Int("k", 10, "top-k from matrix.search per pass")
	out := flag.String("out", "reports/reality-tests/playbook_lift_001.json", "output JSON path")
	flag.Parse()

	// Judge resolution priority: explicit flag > $JUDGE_MODEL env >
	// cfg.Models.LocalJudge > hardcoded fallback, so model bumps land in
	// lakehouse.toml rather than in this driver.
	if *judge == "" {
		*judge = strings.TrimSpace(os.Getenv("JUDGE_MODEL"))
	}
	if *judge == "" {
		cfg, err := shared.LoadConfig(*configPath)
		if err != nil {
			// Surface the reason instead of silently dropping to the fallback.
			log.Printf("[lift] warn: load config %s: %v", *configPath, err)
		} else {
			*judge = cfg.Models.LocalJudge
		}
	}
	if *judge == "" {
		// NOTE(review): the configured local_judge was reportedly reverted to
		// qwen2.5:latest for speed; confirm whether this fallback should follow.
		*judge = "qwen3.5:latest"
		log.Printf("[lift] warn: no judge model from flag/env/config; falling back to %q", *judge)
	}

	// Tolerate "workers, candidates" and stray commas.
	var corpora []string
	for _, c := range strings.Split(*corporaCSV, ",") {
		if c = strings.TrimSpace(c); c != "" {
			corpora = append(corpora, c)
		}
	}
	if len(corpora) == 0 {
		log.Fatalf("no corpora in -corpora %q", *corporaCSV)
	}

	qs, err := loadQueries(*queries)
	if err != nil {
		log.Fatalf("load queries: %v", err)
	}
	if len(qs) == 0 {
		log.Fatalf("no queries in %s", *queries)
	}
	log.Printf("[lift] %d queries · corpora=%v · k=%d · judge=%s", len(qs), corpora, *k, *judge)

	hc := &http.Client{Timeout: 60 * time.Second}
	runs := make([]queryRun, 0, len(qs))
	var (
		totalDelta           float32
		warmCompared         int // runs that have both a cold and a warm top-1
		playbookBoostedTotal int
		withDiscovery        int
		liftCount            int
		noChange             int
	)

	// Pass 1 (cold) + record playbooks based on judge verdicts.
	for i, q := range qs {
		log.Printf("[lift] (%d/%d cold) %s", i+1, len(qs), abbrev(q, 60))
		resp, err := matrixSearch(hc, *gw, q, corpora, *k, false)
		if err != nil {
			log.Printf("  cold search failed: %v — skipping", err)
			continue
		}
		if len(resp.Results) == 0 {
			log.Printf("  cold returned 0 results — skipping")
			continue
		}

		// judgeRate yields 0 on failure, so a usable rating (>= 1) always
		// beats a failed one; ties keep the earlier (closer) result.
		ratings := make([]int, len(resp.Results))
		bestRank, bestRating := 0, -1
		for j, r := range resp.Results {
			ratings[j] = judgeRate(hc, *ollama, *judge, q, r)
			if ratings[j] > bestRating {
				bestRating, bestRank = ratings[j], j
			}
		}
		best := resp.Results[bestRank]
		run := queryRun{
			Query:               q,
			ColdTop1ID:          resp.Results[0].ID,
			ColdTop1Distance:    resp.Results[0].Distance,
			ColdJudgeBestID:     best.ID,
			ColdJudgeBestRank:   bestRank,
			ColdJudgeBestRating: bestRating,
			ColdRatings:         ratings,
		}

		switch {
		case bestRating < 1:
			// Every judge call failed; rank 0 here is a default, not a verdict.
			run.Note = "judge produced no usable rating for any result — no playbook"
		case bestRank == 0:
			run.Note = "judge-best already top-1 cold — no playbook needed"
		case bestRating < 4:
			run.Note = fmt.Sprintf("judge-best rating %d below threshold (4) — no playbook", bestRating)
		default:
			// Record a playbook only when the judge best is not already top-1
			// (otherwise we'd boost something cosine already crowned).
			withDiscovery++
			if err := playbookRecord(hc, *gw, q, best.ID, best.Corpus, 1.0); err != nil {
				log.Printf("  playbook record failed: %v", err)
				run.Note = "playbook record failed: " + err.Error()
			} else {
				run.PlaybookRecorded = true
				run.PlaybookID = best.ID
			}
		}
		runs = append(runs, run)
	}

	// Pass 2 (warm) on the same queries.
	for i := range runs {
		run := &runs[i]
		log.Printf("[lift] (%d/%d warm) %s", i+1, len(runs), abbrev(run.Query, 60))
		resp, err := matrixSearch(hc, *gw, run.Query, corpora, *k, true)
		if err != nil || len(resp.Results) == 0 {
			reason := "warm search returned 0 results"
			if err != nil {
				reason = fmt.Sprintf("warm search failed: %v", err)
			}
			log.Printf("  %s", reason)
			run.Note = appendNote(run.Note, reason)
			// Leave no doubt in the report: 0 would read as "top-1".
			run.WarmJudgeBestRank = -1
			continue
		}
		run.WarmTop1ID = resp.Results[0].ID
		run.WarmTop1Distance = resp.Results[0].Distance
		run.WarmBoostedCount = resp.PlaybookBoosted
		playbookBoostedTotal += resp.PlaybookBoosted

		// Find where the cold judge-best ID landed in the warm ranking.
		warmRank := -1
		for j, r := range resp.Results {
			if r.ID == run.ColdJudgeBestID {
				warmRank = j
				break
			}
		}
		run.WarmJudgeBestRank = warmRank

		if run.PlaybookRecorded && warmRank == 0 {
			run.Lift = true
			liftCount++
		} else {
			noChange++
		}
		totalDelta += run.WarmTop1Distance - run.ColdTop1Distance
		warmCompared++
	}

	sum := summary{
		Total:                len(runs),
		WithDiscovery:        withDiscovery,
		LiftCount:            liftCount,
		NoChange:             noChange,
		PlaybookBoostedTotal: playbookBoostedTotal,
		GeneratedAt:          time.Now().UTC(),
	}
	// Average only over runs that contributed a delta; failed warm searches
	// would otherwise pull the mean toward zero.
	if warmCompared > 0 {
		sum.MeanTop1DeltaDistance = totalDelta / float32(warmCompared)
	}

	if err := writeJSON(*out, runs, sum); err != nil {
		log.Fatalf("write %s: %v", *out, err)
	}
	log.Printf("[lift] DONE — %d queries · discovery=%d · lift=%d · boosted=%d · meanΔdist=%.4f",
		sum.Total, sum.WithDiscovery, sum.LiftCount, sum.PlaybookBoostedTotal, sum.MeanTop1DeltaDistance)
	log.Printf("[lift] results → %s", *out)
}
// loadQueries reads the query corpus at path and returns one query per
// non-blank line, trimmed of surrounding whitespace. Lines whose first
// non-space character is '#' are comments and are skipped. The result is nil
// when the file holds no queries.
func loadQueries(path string) ([]string, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	var queries []string
	// Walk the text one newline-delimited segment at a time.
	for rest := string(raw); rest != ""; {
		var line string
		line, rest, _ = strings.Cut(rest, "\n")
		line = strings.TrimSpace(line)
		if line != "" && !strings.HasPrefix(line, "#") {
			queries = append(queries, line)
		}
	}
	return queries, nil
}
// matrixSearch POSTs a search to the gateway's /v1/matrix/search endpoint and
// decodes the response. k is sent as both "k" and "per_corpus_k"; usePlaybook
// selects the cold (false) or warm (true) pass.
//
// It returns an error when the request cannot be built or sent, when the
// gateway answers with a non-2xx status (the body is included in the error),
// or when the body is not valid JSON for matrixResp.
func matrixSearch(hc *http.Client, gw, query string, corpora []string, k int, usePlaybook bool) (*matrixResp, error) {
	payload, err := json.Marshal(map[string]any{
		"query_text":   query,
		"corpora":      corpora,
		"k":            k,
		"per_corpus_k": k,
		"use_playbook": usePlaybook,
	})
	if err != nil {
		return nil, fmt.Errorf("marshal request: %w", err)
	}
	// A malformed gateway URL yields a nil request; bail out before touching it.
	req, err := http.NewRequest(http.MethodPost, gw+"/v1/matrix/search", bytes.NewReader(payload))
	if err != nil {
		return nil, fmt.Errorf("build request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := hc.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("read response: %w", err)
	}
	if resp.StatusCode/100 != 2 {
		return nil, fmt.Errorf("status %d: %s", resp.StatusCode, string(raw))
	}
	var out matrixResp
	if err := json.Unmarshal(raw, &out); err != nil {
		return nil, fmt.Errorf("unmarshal: %w (body=%s)", err, abbrev(string(raw), 200))
	}
	return &out, nil
}
// playbookRecord POSTs a playbook entry to the gateway's
// /v1/matrix/playbooks/record endpoint, binding query to the answer
// identified by answerID in answerCorpus with the given score. Entries are
// tagged "reality-test" and "playbook-lift-001".
//
// It returns an error when the request cannot be built or sent, or when the
// gateway answers with a non-2xx status (the body is included in the error).
func playbookRecord(hc *http.Client, gw, query, answerID, answerCorpus string, score float64) error {
	payload, err := json.Marshal(map[string]any{
		"query_text":    query,
		"answer_id":     answerID,
		"answer_corpus": answerCorpus,
		"score":         score,
		"tags":          []string{"reality-test", "playbook-lift-001"},
	})
	if err != nil {
		return fmt.Errorf("marshal request: %w", err)
	}
	// A malformed gateway URL yields a nil request; bail out before touching it.
	req, err := http.NewRequest(http.MethodPost, gw+"/v1/matrix/playbooks/record", bytes.NewReader(payload))
	if err != nil {
		return fmt.Errorf("build request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := hc.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode/100 != 2 {
		// Best effort: the body only decorates the error message.
		raw, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("status %d: %s", resp.StatusCode, string(raw))
	}
	// Drain the body so the transport can reuse the connection.
	_, _ = io.Copy(io.Discard, resp.Body)
	return nil
}
// judgeRate calls Ollama's /api/chat directly and asks for a 1-5 rating
// of the result against the query. Returns 0 on any failure (treated as
// "couldn't judge, exclude from best-of consideration"); the cause is logged
// so a down or misconfigured judge is visible in the run output.
func judgeRate(hc *http.Client, ollamaURL, model, query string, r matrixResult) int {
	rating, err := askJudge(hc, ollamaURL, model, query, r)
	if err != nil {
		log.Printf("  judge failed for %s/%s: %v", r.Corpus, r.ID, err)
		return 0
	}
	return rating
}

// askJudge performs one judge round-trip and returns the parsed rating, or an
// error describing which step failed (request build, transport, HTTP status,
// envelope decode, verdict decode, or rating outside 1-5).
func askJudge(hc *http.Client, ollamaURL, model, query string, r matrixResult) (int, error) {
	const system = `You rate retrieval results for a staffing co-pilot.
Rate the result 1-5 against the query:
5 = perfect match (this person/job IS what was asked for)
4 = strong match (right field, right level, minor mismatches)
3 = adjacent match (related field or partial overlap)
2 = weak/tangential match
1 = irrelevant
Output JSON only: {"rating": N, "reason": "<one sentence>"}.`
	user := fmt.Sprintf("Query: %q\n\nResult corpus: %s\nResult ID: %s\nResult metadata:\n%s",
		query, r.Corpus, r.ID, string(r.Metadata))

	payload, err := json.Marshal(map[string]any{
		"model":  model,
		"stream": false,
		"format": "json",
		"messages": []map[string]string{
			{"role": "system", "content": system},
			{"role": "user", "content": user},
		},
		// Temperature 0 keeps repeated runs as comparable as the model allows.
		"options": map[string]any{"temperature": 0},
	})
	if err != nil {
		return 0, fmt.Errorf("marshal request: %w", err)
	}
	// A malformed Ollama URL yields a nil request; bail out before touching it.
	req, err := http.NewRequest(http.MethodPost, ollamaURL+"/api/chat", bytes.NewReader(payload))
	if err != nil {
		return 0, fmt.Errorf("build request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := hc.Do(req)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return 0, fmt.Errorf("read response: %w", err)
	}
	if resp.StatusCode/100 != 2 {
		return 0, fmt.Errorf("status %d: %s", resp.StatusCode, abbrev(string(raw), 200))
	}

	// The chat envelope carries the model's reply as a string, which is
	// itself the JSON verdict requested by the system prompt.
	var envelope struct {
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
	}
	if err := json.Unmarshal(raw, &envelope); err != nil {
		return 0, fmt.Errorf("decode chat response: %w", err)
	}
	var v judgeVerdict
	if err := json.Unmarshal([]byte(envelope.Message.Content), &v); err != nil {
		return 0, fmt.Errorf("decode verdict: %w", err)
	}
	if v.Rating < 1 || v.Rating > 5 {
		return 0, fmt.Errorf("rating %d outside 1-5", v.Rating)
	}
	return v.Rating, nil
}
// writeJSON serialises the report to path as a single indented JSON object
// with a "summary" key followed by a "runs" key, creating the parent
// directory first if needed. It returns the first error from directory
// creation, encoding, or the file write.
func writeJSON(path string, runs []queryRun, sum summary) error {
	type report struct {
		Summary summary    `json:"summary"`
		Runs    []queryRun `json:"runs"`
	}

	parent := filepath_dir(path)
	if mkErr := os.MkdirAll(parent, 0o755); mkErr != nil {
		return mkErr
	}

	encoded, encErr := json.MarshalIndent(report{Summary: sum, Runs: runs}, "", " ")
	if encErr != nil {
		return encErr
	}
	return os.WriteFile(path, encoded, 0o644)
}
// filepath_dir returns the directory part of a slash-separated path: "." when
// p has no slash, "/" when the only directory is the root, and otherwise
// everything before the last slash with trailing slashes removed.
//
// Only "/" is treated as a separator. NOTE(review): path/filepath.Dir is the
// standard replacement if this driver ever needs OS-specific separators.
func filepath_dir(p string) string {
	i := strings.LastIndex(p, "/")
	if i < 0 {
		return "."
	}
	dir := strings.TrimRight(p[:i], "/")
	if dir == "" {
		// The path is rooted ("/out.json"); an empty string here would make
		// os.MkdirAll fail.
		return "/"
	}
	return dir
}
// abbrev shortens s for log output. Strings of at most n bytes are returned
// unchanged; longer ones are cut to at most n bytes and suffixed with "…".
// The cut backs off to the nearest rune boundary so a multi-byte UTF-8
// character is never split, and a negative n is treated as 0.
func abbrev(s string, n int) string {
	if n < 0 {
		n = 0
	}
	if len(s) <= n {
		return s
	}
	// Ranging over a string yields the byte offset of each rune start; keep
	// the last one that does not exceed the budget.
	cut := 0
	for i := range s {
		if i > n {
			break
		}
		cut = i
	}
	return s[:cut] + "…"
}
// appendNote joins a new note onto an existing one with "; ". When there is
// no existing note, the new note is returned on its own.
func appendNote(existing, add string) string {
	if existing != "" {
		return existing + "; " + add
	}
	return add
}
// Keeps the "sort" import compiling: nothing else visible in this file
// references the package, so this blank assignment is its only use. Remove
// this line together with the import once sort is confirmed unused.
var _ = sort.Slice