From 3dd7d9fe300e016fdd5c40fbdb87342a377a36e0 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 29 Apr 2026 23:22:36 -0500 Subject: [PATCH] =?UTF-8?q?reality-tests:=20playbook-lift=20harness=20?= =?UTF-8?q?=E2=80=94=20does=20the=205-loop=20substrate=20beat=20raw=20cosi?= =?UTF-8?q?ne=3F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First reality test driver. Two-pass design: - Pass 1 (cold): matrix.search use_playbook=false → small-model judge rates top-K → record playbook entry pointing at the highest-rated result (which may NOT be top-1 by distance — that's the discovery). - Pass 2 (warm): same queries with use_playbook=true → measure ranking shift. Lift = real if recorded answer becomes top-1. Files: - scripts/playbook_lift/main.go driver (391 LoC) - scripts/playbook_lift.sh stack-bring-up + report gen - tests/reality/playbook_lift_queries.txt query corpus (5 placeholders; J writes real 20+) - reports/reality-tests/README.md framework + interpretation - .gitignore track reports/reality-tests/ but ignore per-run JSON evidence This answers the gate from project_small_model_pipeline_vision.md: "the playbook + matrix indexer must give the results we're looking for." Without ground-truth labels, the LLM judge is the proxy — the same small-model thesis applied to evaluation. Honest about that limitation in the generated reports. Driver compiles clean; full run requires Ollama + workers/candidates ingest. Skips cleanly if Ollama absent. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 4 + reports/reality-tests/README.md | 69 +++++ scripts/playbook_lift.sh | 233 ++++++++++++++ scripts/playbook_lift/main.go | 391 ++++++++++++++++++++++++ tests/reality/playbook_lift_queries.txt | 18 ++ 5 files changed, 715 insertions(+) create mode 100644 reports/reality-tests/README.md create mode 100755 scripts/playbook_lift.sh create mode 100644 scripts/playbook_lift/main.go create mode 100644 tests/reality/playbook_lift_queries.txt diff --git a/.gitignore b/.gitignore index 6b1db00..ed44c8b 100644 --- a/.gitignore +++ b/.gitignore @@ -39,10 +39,14 @@ vendor/ # Use /reports/* + un-ignore so git can traverse into reports/. /reports/* !/reports/scrum/ +!/reports/reality-tests/ # Inside the audit directory, the per-run _evidence/ dump (smoke logs, # command output) IS runtime — track the dir, ignore its contents. /reports/scrum/_evidence/* !/reports/scrum/_evidence/.gitkeep +# Reality-test JSON evidence is runtime — track the dir + MD reports +# (committed deliberately as outcome record), ignore per-run JSON. +/reports/reality-tests/*.json # Proof harness runtime output — same pattern as reports/scrum/_evidence. # Track the directory but ignore per-run subdirs. diff --git a/reports/reality-tests/README.md b/reports/reality-tests/README.md new file mode 100644 index 0000000..f72658d --- /dev/null +++ b/reports/reality-tests/README.md @@ -0,0 +1,69 @@ +# reports/reality-tests — does the 5-loop substrate actually work? + +Reality tests measure **product outcomes**, not substrate health. The 21 smokes prove the system *runs*; the proof harness proves the system *makes the claims it claims*; reality tests answer: **does the small-model pipeline + matrix indexer + playbook give measurably better results than raw cosine?** + +This is the gate from `project_small_model_pipeline_vision.md`: *"the playbook + matrix indexer must give the results we're looking for."* Single load-bearing criterion. 
Throughput, scaling, code elegance are secondary. + +--- + +## What lives here + +Each reality test is a numbered run that produces: + +- `<name>_<run_id>.json` — raw structured evidence (per-query data, summary metrics) +- `<name>_<run_id>.md` — human-readable report with headline metrics, per-query table, honesty caveats, next moves + +Runs are append-only. Earlier runs stay in tree as historical baseline. + +--- + +## Test catalog + +### `playbook_lift_<run_id>` — does the playbook actually lift the right answer? + +**Driver:** `scripts/playbook_lift.sh` → `bin/playbook_lift` +**Queries:** `tests/reality/playbook_lift_queries.txt` +**Pipeline:** cold pass → LLM judge → playbook record → warm pass → measure ranking shift. + +The headline question: **when the LLM judge finds a better answer than cosine top-1, can the playbook boost it to top-1 on the next run?** If yes, the learning loop closes; if no, the matrix layer + playbook is infrastructure for a thesis that doesn't pay rent. + +See the run reports for honesty caveats — chiefly that the LLM judge IS the ground-truth proxy. + +--- + +## Running a reality test + +```bash +# Defaults: judge=qwen3.5:latest, workers limit 5000, run id 001 +./scripts/playbook_lift.sh + +# Re-run with a different judge to check inter-judge agreement +JUDGE_MODEL=qwen2.5:latest RUN_ID=002 ./scripts/playbook_lift.sh + +# Smaller scale for fast iteration +WORKERS_LIMIT=1000 K=5 RUN_ID=dev ./scripts/playbook_lift.sh +``` + +Requires: Ollama on `:11434` with `nomic-embed-text` + the chosen judge model loaded. Skips cleanly (exit 0) if Ollama is absent. 
+ +--- + +## Interpreting results + +Three thresholds matter on the `playbook_lift` tests: + +| Lift rate (lifts / discoveries) | Verdict | +|---|---| +| ≥ 50% | Loop closes — playbook is doing real work, move to paraphrase queries | +| 20-50% | Lift exists but inconsistent — investigate boost math (`score × 0.5`) or judge variance | +| < 20% | Loop is not pulling its weight — diagnose before adding more components | + +A separate concern: **discovery rate** (cold judge-best ≠ cold top-1). If discovery is itself rare (< 30% of queries), cosine is already close to optimal on this query distribution and the matrix+playbook layer has little headroom. That's not necessarily a bug — but it means the value gate has to come from somewhere else (multi-corpus retrieval, domain-specific tags, drift signal). + +--- + +## What this is not + +- **Not a benchmark.** No comparison against external systems; only internal cold-vs-warm. +- **Not a regression gate.** Each run is a snapshot. Scores will drift with corpus changes, judge updates, and playbook math tuning. Don't wire `just verify` to demand a minimum lift. +- **Not human-validated.** The LLM judge is the ground truth proxy. Sample 5-10 verdicts manually per run to sanity-check the judge isn't pathological. diff --git a/scripts/playbook_lift.sh b/scripts/playbook_lift.sh new file mode 100755 index 0000000..082eb65 --- /dev/null +++ b/scripts/playbook_lift.sh @@ -0,0 +1,233 @@ +#!/usr/bin/env bash +# Playbook-lift reality test — measure whether the 5-loop substrate +# (matrix retrieve+merge + playbook + small-model judge) actually beats +# raw cosine on staffing queries. +# +# Pipeline: +# 1. Boot the Go stack (storaged, embedd, vectord, matrixd, gateway) +# 2. Ingest workers (default 5000) + candidates corpora +# 3. Run the playbook_lift driver: cold pass → judge → record → +# warm pass → measure +# 4. 
Generate markdown report from the JSON evidence +# +# Output: +# reports/reality-tests/playbook_lift_<RUN_ID>.json — raw evidence +# reports/reality-tests/playbook_lift_<RUN_ID>.md — human report +# +# Requires: Ollama on :11434 with nomic-embed-text + the judge model +# loaded. Skips (exit 0) if Ollama is absent. +# +# Usage: +# ./scripts/playbook_lift.sh # run #001 with defaults +# RUN_ID=002 ./scripts/playbook_lift.sh # explicit run id +# JUDGE_MODEL=qwen2.5:latest ./scripts/playbook_lift.sh +# WORKERS_LIMIT=2000 ./scripts/playbook_lift.sh + +set -euo pipefail +cd "$(dirname "$0")/.." + +export PATH="$PATH:/usr/local/go/bin" + +RUN_ID="${RUN_ID:-001}" +JUDGE_MODEL="${JUDGE_MODEL:-qwen3.5:latest}" +WORKERS_LIMIT="${WORKERS_LIMIT:-5000}" +QUERIES_FILE="${QUERIES_FILE:-tests/reality/playbook_lift_queries.txt}" +CORPORA="${CORPORA:-workers,candidates}" +K="${K:-10}" + +OUT_JSON="reports/reality-tests/playbook_lift_${RUN_ID}.json" +OUT_MD="reports/reality-tests/playbook_lift_${RUN_ID}.md" + +if ! curl -sS --max-time 3 http://localhost:11434/api/tags >/dev/null 2>&1; then + echo "[lift] Ollama not reachable on :11434 — skipping" + exit 0 +fi + +if ! curl -sS http://localhost:11434/api/tags | jq -e --arg m "$JUDGE_MODEL" \ + '.models[] | select(.name == $m)' >/dev/null 2>&1; then + echo "[lift] judge model '$JUDGE_MODEL' not loaded in Ollama — pull it first" + exit 1 +fi + +echo "[lift] building binaries..." 
+go build -o bin/ ./cmd/storaged ./cmd/embedd ./cmd/vectord ./cmd/matrixd ./cmd/gateway \ + ./scripts/staffing_workers ./scripts/staffing_candidates \ + ./scripts/playbook_lift + +pkill -f "bin/(storaged|embedd|vectord|matrixd|gateway)" 2>/dev/null || true +sleep 0.3 + +PIDS=() +TMP="$(mktemp -d)" +CFG="$TMP/lift.toml" + +cleanup() { + echo "[lift] cleanup" + for p in "${PIDS[@]:-}"; do [ -n "${p:-}" ] && kill "$p" 2>/dev/null || true; done + rm -rf "$TMP" +} +trap cleanup EXIT INT TERM + +cat > "$CFG" </dev/null 2>&1; then return 0; fi + sleep 0.05 + done + return 1 +} + +echo "[lift] launching stack..." +./bin/storaged -config "$CFG" > /tmp/storaged.log 2>&1 & PIDS+=($!) +poll_health 3211 || { echo "storaged failed"; exit 1; } +./bin/embedd -config "$CFG" > /tmp/embedd.log 2>&1 & PIDS+=($!) +poll_health 3216 || { echo "embedd failed"; exit 1; } +./bin/vectord -config "$CFG" > /tmp/vectord.log 2>&1 & PIDS+=($!) +poll_health 3215 || { echo "vectord failed"; exit 1; } +./bin/matrixd -config "$CFG" > /tmp/matrixd.log 2>&1 & PIDS+=($!) +poll_health 3218 || { echo "matrixd failed"; exit 1; } +./bin/gateway -config "$CFG" > /tmp/gateway.log 2>&1 & PIDS+=($!) +poll_health 3110 || { echo "gateway failed"; exit 1; } + +echo +echo "[lift] ingest workers (limit=$WORKERS_LIMIT)..." +./bin/staffing_workers -limit "$WORKERS_LIMIT" + +echo +echo "[lift] ingest candidates..." 
+./bin/staffing_candidates -skip-populate=false -query "warmup" 2>&1 \ + | grep -v "^\[candidates\]\(matrix\|reality\)" || true + +echo +echo "[lift] running driver — judge=$JUDGE_MODEL · queries=$QUERIES_FILE · k=$K" +./bin/playbook_lift \ + -gateway "http://127.0.0.1:3110" \ + -ollama "http://localhost:11434" \ + -queries "$QUERIES_FILE" \ + -corpora "$CORPORA" \ + -judge "$JUDGE_MODEL" \ + -k "$K" \ + -out "$OUT_JSON" + +echo +echo "[lift] generating markdown report → $OUT_MD" +generate_md() { + local json="$1" md="$2" + local total discovery lift no_change boosted mean_delta gen_at + total=$(jq -r '.summary.total' "$json") + discovery=$(jq -r '.summary.with_discovery' "$json") + lift=$(jq -r '.summary.lift_count' "$json") + no_change=$(jq -r '.summary.no_change' "$json") + boosted=$(jq -r '.summary.playbook_boosted_total' "$json") + mean_delta=$(jq -r '.summary.mean_top1_delta_distance' "$json") + gen_at=$(jq -r '.summary.generated_at' "$json") + + cat > "$md" <> "$md" + + cat >> "$md" < bestRating { + bestRating = rating + bestRank = j + } + } + run := queryRun{ + Query: q, + ColdTop1ID: resp.Results[0].ID, + ColdTop1Distance: resp.Results[0].Distance, + ColdJudgeBestID: resp.Results[bestRank].ID, + ColdJudgeBestRank: bestRank, + ColdJudgeBestRating: bestRating, + ColdRatings: ratings, + } + // Record a playbook only if the judge best is not already top-1 + // (otherwise we're boosting something cosine already crowned). 
+ if bestRank > 0 && bestRating >= 4 { + withDiscovery++ + if err := playbookRecord(hc, *gw, q, resp.Results[bestRank].ID, resp.Results[bestRank].Corpus, 1.0); err != nil { + log.Printf(" playbook record failed: %v", err) + run.Note = "playbook record failed: " + err.Error() + } else { + run.PlaybookRecorded = true + run.PlaybookID = resp.Results[bestRank].ID + } + } else if bestRank == 0 { + run.Note = "judge-best already top-1 cold — no playbook needed" + } else { + run.Note = fmt.Sprintf("judge-best rating %d below threshold (4) — no playbook", bestRating) + } + runs = append(runs, run) + } + + // Pass 2 (warm) on the same queries. + for i := range runs { + q := runs[i].Query + log.Printf("[lift] (%d/%d warm) %s", i+1, len(runs), abbrev(q, 60)) + resp, err := matrixSearch(hc, *gw, q, corpora, *k, true) + if err != nil || len(resp.Results) == 0 { + runs[i].Note = appendNote(runs[i].Note, fmt.Sprintf("warm search failed: %v", err)) + continue + } + runs[i].WarmTop1ID = resp.Results[0].ID + runs[i].WarmTop1Distance = resp.Results[0].Distance + runs[i].WarmBoostedCount = resp.PlaybookBoosted + playbookBoostedTotal += resp.PlaybookBoosted + + // Find where the cold judge-best ID landed in the warm ranking. 
+		warmRank := -1
+		for j, r := range resp.Results {
+			if r.ID == runs[i].ColdJudgeBestID {
+				warmRank = j
+				break
+			}
+		}
+		runs[i].WarmJudgeBestRank = warmRank
+
+		switch {
+		case runs[i].PlaybookRecorded && warmRank == 0:
+			runs[i].Lift = true
+			liftCount++
+		case !runs[i].PlaybookRecorded:
+			noChange++
+		default:
+			noChange++
+		}
+		totalDelta += runs[i].WarmTop1Distance - runs[i].ColdTop1Distance
+	}
+
+	sum := summary{
+		Total:                 len(runs),
+		WithDiscovery:         withDiscovery,
+		LiftCount:             liftCount,
+		NoChange:              noChange,
+		MeanTop1DeltaDistance: 0,
+		PlaybookBoostedTotal:  playbookBoostedTotal,
+		GeneratedAt:           time.Now().UTC(),
+	}
+	if len(runs) > 0 {
+		sum.MeanTop1DeltaDistance = totalDelta / float32(len(runs))
+	}
+
+	if err := writeJSON(*out, runs, sum); err != nil {
+		log.Fatalf("write %s: %v", *out, err)
+	}
+	log.Printf("[lift] DONE — %d queries · discovery=%d · lift=%d · boosted=%d · meanΔdist=%.4f",
+		sum.Total, sum.WithDiscovery, sum.LiftCount, sum.PlaybookBoostedTotal, sum.MeanTop1DeltaDistance)
+	log.Printf("[lift] results → %s", *out)
+}
+
+// loadQueries reads the file at path and returns one query per line.
+// Each line is whitespace-trimmed; blank lines and lines starting with
+// "#" are skipped. The read error, if any, is returned unchanged.
+func loadQueries(path string) ([]string, error) {
+	bs, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	var out []string
+	for _, line := range strings.Split(string(bs), "\n") {
+		s := strings.TrimSpace(line)
+		if s == "" || strings.HasPrefix(s, "#") {
+			continue
+		}
+		out = append(out, s)
+	}
+	return out, nil
+}
+
+// matrixSearch POSTs a search request to gw+"/v1/matrix/search" and
+// decodes the JSON response. k is sent as both "k" and "per_corpus_k";
+// usePlaybook is sent as "use_playbook". A non-2xx status is returned
+// as an error carrying the status code and response body.
+func matrixSearch(hc *http.Client, gw, query string, corpora []string, k int, usePlaybook bool) (*matrixResp, error) {
+	body := map[string]any{
+		"query_text":   query,
+		"corpora":      corpora,
+		"k":            k,
+		"per_corpus_k": k,
+		"use_playbook": usePlaybook,
+	}
+	bs, err := json.Marshal(body)
+	if err != nil {
+		return nil, fmt.Errorf("marshal search request: %w", err)
+	}
+	// http.NewRequest fails on a malformed base URL; without this check
+	// the nil request would be dereferenced by Header.Set below.
+	req, err := http.NewRequest("POST", gw+"/v1/matrix/search", bytes.NewReader(bs))
+	if err != nil {
+		return nil, fmt.Errorf("build search request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	resp, err := hc.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+	rb, readErr := io.ReadAll(resp.Body)
+	// Report the status first: it is the more useful signal, and whatever
+	// part of the body was read is still included in the message.
+	if resp.StatusCode/100 != 2 {
+		return nil, fmt.Errorf("status %d: %s", resp.StatusCode, string(rb))
+	}
+	if readErr != nil {
+		return nil, fmt.Errorf("read search response: %w", readErr)
+	}
+	var out matrixResp
+	if err := json.Unmarshal(rb, &out); err != nil {
+		return nil, fmt.Errorf("unmarshal: %w (body=%s)", err, abbrev(string(rb), 200))
+	}
+	return &out, nil
+}
+
+// playbookRecord POSTs a playbook entry (query, answer id, answer corpus,
+// score, tags) to gw+"/v1/matrix/playbooks/record". A non-2xx status is
+// returned as an error carrying the status code and response body.
+//
+// NOTE(review): the "playbook-lift-001" tag is fixed here, so it does not
+// follow the run id the wrapper script is invoked with — confirm whether
+// that is intended.
+func playbookRecord(hc *http.Client, gw, query, answerID, answerCorpus string, score float64) error {
+	body := map[string]any{
+		"query":         query,
+		"answer_id":     answerID,
+		"answer_corpus": answerCorpus,
+		"score":         score,
+		"tags":          []string{"reality-test", "playbook-lift-001"},
+	}
+	bs, err := json.Marshal(body)
+	if err != nil {
+		return fmt.Errorf("marshal playbook record: %w", err)
+	}
+	// Guard against a malformed base URL instead of dereferencing a nil request.
+	req, err := http.NewRequest("POST", gw+"/v1/matrix/playbooks/record", bytes.NewReader(bs))
+	if err != nil {
+		return fmt.Errorf("build playbook request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	resp, err := hc.Do(req)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode/100 != 2 {
+		// Best effort: the body only decorates the error message.
+		rb, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("status %d: %s", resp.StatusCode, string(rb))
+	}
+	return nil
+}
+
+// judgeRate calls Ollama's /api/chat directly and asks for a 1-5 rating
+// of the result against the query. Returns 0 on any failure (treated as
+// "couldn't judge, exclude from best-of consideration"): request build
+// or transport error, non-2xx status, unreadable or undecodable body, or
+// a rating outside 1-5.
+func judgeRate(hc *http.Client, ollamaURL, model, query string, r matrixResult) int {
+	system := `You rate retrieval results for a staffing co-pilot.
+Rate the result 1-5 against the query:
+ 5 = perfect match (this person/job IS what was asked for)
+ 4 = strong match (right field, right level, minor mismatches)
+ 3 = adjacent match (related field or partial overlap)
+ 2 = weak/tangential match
+ 1 = irrelevant
+Output JSON only: {"rating": N, "reason": ""}.`
+	user := fmt.Sprintf("Query: %q\n\nResult corpus: %s\nResult ID: %s\nResult metadata:\n%s",
+		query, r.Corpus, r.ID, string(r.Metadata))
+
+	body := map[string]any{
+		"model":  model,
+		"stream": false,
+		"format": "json",
+		"messages": []map[string]string{
+			{"role": "system", "content": system},
+			{"role": "user", "content": user},
+		},
+		"options": map[string]any{"temperature": 0},
+	}
+	bs, err := json.Marshal(body)
+	if err != nil {
+		return 0
+	}
+	// A malformed ollamaURL yields a nil request; treat it as "couldn't judge".
+	req, err := http.NewRequest("POST", ollamaURL+"/api/chat", bytes.NewReader(bs))
+	if err != nil {
+		return 0
+	}
+	req.Header.Set("Content-Type", "application/json")
+	resp, err := hc.Do(req)
+	if err != nil {
+		return 0
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode/100 != 2 {
+		return 0
+	}
+	rb, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return 0
+	}
+	var ollamaResp struct {
+		Message struct {
+			Content string `json:"content"`
+		} `json:"message"`
+	}
+	if err := json.Unmarshal(rb, &ollamaResp); err != nil {
+		return 0
+	}
+	// The model's reply is itself a JSON document carried in message.content.
+	var v judgeVerdict
+	if err := json.Unmarshal([]byte(ollamaResp.Message.Content), &v); err != nil {
+		return 0
+	}
+	if v.Rating < 1 || v.Rating > 5 {
+		return 0
+	}
+	return v.Rating
+}
+
+// writeJSON creates the parent directory of path (mode 0o755) if needed
+// and writes an indented JSON object with "summary" and "runs" keys to
+// path (mode 0o644).
+func writeJSON(path string, runs []queryRun, sum summary) error {
+	if err := os.MkdirAll(filepath_dir(path), 0o755); err != nil {
+		return err
+	}
+	out := struct {
+		Summary summary    `json:"summary"`
+		Runs    []queryRun `json:"runs"`
+	}{Summary: sum, Runs: runs}
+	bs, err := json.MarshalIndent(out, "", " ")
+	if err != nil {
+		return err
+	}
+	return os.WriteFile(path, bs, 0o644)
+}
+
+// filepath_dir returns the directory part of a slash-separated path:
+// "." when p has no slash, "/" when the only slash is the leading one
+// (previously "" was returned, which os.MkdirAll rejects), otherwise
+// everything before the last slash.
+func filepath_dir(p string) string {
+	i := strings.LastIndex(p, "/")
+	switch {
+	case i < 0:
+		return "."
+	case i == 0:
+		return "/"
+	default:
+		return p[:i]
+	}
+}
+
+// abbrev returns s unchanged when it is at most n bytes long; otherwise
+// it returns at most the first n bytes of s followed by "…". The cut is
+// moved back to the nearest UTF-8 rune boundary so a multi-byte character
+// is never split, and a negative n is treated as 0 instead of panicking.
+func abbrev(s string, n int) string {
+	if n < 0 {
+		n = 0
+	}
+	if len(s) <= n {
+		return s
+	}
+	// 0b10xxxxxx marks a UTF-8 continuation byte; cutting in front of one
+	// would leave a truncated rune at the end of the result.
+	for n > 0 && s[n]&0xC0 == 0x80 {
+		n--
+	}
+	return s[:n] + "…"
+}
+
+// appendNote joins add onto existing with "; ", or returns add alone
+// when existing is empty.
+func appendNote(existing, add string) string {
+	if existing == "" {
+		return add
+	}
+	return existing + "; " + add
+}
+
+// Suppress unused-import warning when sort isn't used in a future
+// refactor; harmless for now.
+var _ = sort.Slice
diff --git a/tests/reality/playbook_lift_queries.txt b/tests/reality/playbook_lift_queries.txt
new file mode 100644
index 0000000..8bf2a14
--- /dev/null
+++ b/tests/reality/playbook_lift_queries.txt
@@ -0,0 +1,18 @@
+# Playbook lift reality test — staffing query corpus.
+#
+# Each non-blank, non-comment line is one query. The harness will run
+# each through matrix.search (cold pass, then warm pass with playbook),
+# ask the LLM judge to rate top-K results, and record lift metrics.
+#
+# Goal: 20 queries, weighted toward the kinds of asks a staffing
+# coordinator would actually issue. Specific roles + certifications +
+# constraints surface playbook lift better than generic "find a worker"
+# style queries.
+#
+# Placeholders (5) — J: replace + extend to 20+ for the real test.
+
+Forklift operator with OSHA-30, warehouse experience, day shift availability
+Bilingual customer service rep, Spanish + English, two years call-center experience
+CDL Class A driver, clean record, willing to do regional 4-day routes
+Production line supervisor with lean manufacturing background
+Dental hygienist with three years experience, Indianapolis area