From 848cbf5fefc25acd8b9a0056674e533e4476a3d6 Mon Sep 17 00:00:00 2001
From: root <root@island37.com>
Date: Wed, 29 Apr 2026 23:57:28 -0500
Subject: [PATCH] phase 3: playbook_lift harness reads judge from config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

migrate the reality-test harness's judge-model default from a
hardcoded "qwen3.5:latest" string to cfg.Models.LocalJudge.

resolution priority: explicit -judge flag > $JUDGE_MODEL env >
cfg.Models.LocalJudge from lakehouse.toml > hardcoded fallback.

bumping the judge for run #N+1 now means editing one line in
lakehouse.toml [models].local_judge — no Go file or shell script
edits required.

changes:
- scripts/playbook_lift/main.go: -config flag added, judge default
  flips to "" so resolution chain runs. Imports internal/shared for
  config loader.
- scripts/playbook_lift.sh: JUDGE_MODEL no longer defaulted in bash;
  EFFECTIVE_JUDGE is resolved by mirroring the Go chain (env > config
  key lookup > qwen3.5:latest fallback) and is used for the Ollama
  presence check + report header. The pre-flight lookup is a plain
  text match on lakehouse.toml, so no TOML parser is required (jq,
  which the script already uses for the Ollama check, cannot read
  TOML).
- reports/reality-tests/README.md: documents the 4-step priority
  chain.

verified all 4 paths produce the expected judge:
- config (no env): qwen3.5:latest (from lakehouse.toml)
- env override:    env wins
- flag override:   flag wins over env
- missing config:  DefaultConfig fallback still gives qwen3.5:latest

`just verify` passes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 reports/reality-tests/README.md | 19 ++++++++++++++++---
 scripts/playbook_lift.sh        | 29 ++++++++++++++++++++++++-----
 scripts/playbook_lift/main.go   | 20 +++++++++++++++++++-
 3 files changed, 59 insertions(+), 9 deletions(-)

diff --git a/reports/reality-tests/README.md b/reports/reality-tests/README.md
index f72658d..76f0ef5 100644
--- a/reports/reality-tests/README.md
+++ b/reports/reality-tests/README.md
@@ -34,17 +34,30 @@ See the run reports for honesty caveats — chiefly that the LLM judge IS the gr
 ## Running a reality test
 
 ```bash
-# Defaults: judge=qwen3.5:latest, workers limit 5000, run id 001
+# Defaults: judge resolved from lakehouse.toml [models].local_judge,
+# workers limit 5000, run id 001
 ./scripts/playbook_lift.sh
 
 # Re-run with a different judge to check inter-judge agreement
-JUDGE_MODEL=qwen2.5:latest RUN_ID=002 ./scripts/playbook_lift.sh
+# (env JUDGE_MODEL overrides the config tier)
+JUDGE_MODEL=qwen3:latest RUN_ID=002 ./scripts/playbook_lift.sh
 
 # Smaller scale for fast iteration
 WORKERS_LIMIT=1000 K=5 RUN_ID=dev ./scripts/playbook_lift.sh
 ```
 
-Requires: Ollama on `:11434` with `nomic-embed-text` + the chosen judge model loaded. Skips cleanly (exit 0) if Ollama is absent.
+**Judge resolution priority** (Phase 3, 2026-04-29):
+1. `-judge` flag on the Go driver (explicit override)
+2. `JUDGE_MODEL` env var (operator override)
+3. `lakehouse.toml [models].local_judge` (default)
+4. Hardcoded `qwen3.5:latest` (last-resort fallback if config missing)
+
+This means model bumps land in `lakehouse.toml`, not in
+`scripts/playbook_lift.sh` or the Go driver. Bumping `local_judge` to a
+stronger local model (e.g. when qwen4 ships) is a one-line change.
+
+Requires: Ollama on `:11434` with `nomic-embed-text` + the resolved judge
+model loaded. Skips cleanly (exit 0) if Ollama is absent.
 
 ---
 
diff --git a/scripts/playbook_lift.sh b/scripts/playbook_lift.sh
index 082eb65..170a73a 100755
--- a/scripts/playbook_lift.sh
+++ b/scripts/playbook_lift.sh
@@ -29,11 +29,14 @@ cd "$(dirname "$0")/.."
 export PATH="$PATH:/usr/local/go/bin"
 
 RUN_ID="${RUN_ID:-001}"
-JUDGE_MODEL="${JUDGE_MODEL:-qwen3.5:latest}"
+# JUDGE_MODEL: empty means "let the Go driver resolve from
+# lakehouse.toml [models].local_judge". Set explicitly to override.
+JUDGE_MODEL="${JUDGE_MODEL:-}"
 WORKERS_LIMIT="${WORKERS_LIMIT:-5000}"
 QUERIES_FILE="${QUERIES_FILE:-tests/reality/playbook_lift_queries.txt}"
 CORPORA="${CORPORA:-workers,candidates}"
 K="${K:-10}"
+CONFIG_PATH="${CONFIG_PATH:-lakehouse.toml}"
 
 OUT_JSON="reports/reality-tests/playbook_lift_${RUN_ID}.json"
 OUT_MD="reports/reality-tests/playbook_lift_${RUN_ID}.md"
@@ -43,11 +46,20 @@ if ! curl -sS --max-time 3 http://localhost:11434/api/tags >/dev/null 2>&1; then
   exit 0
 fi
 
-if ! curl -sS http://localhost:11434/api/tags | jq -e --arg m "$JUDGE_MODEL" \
+# Resolve the judge for the Ollama model-presence check below: env JUDGE_MODEL,
+# else the first quoted local_judge value in $CONFIG_PATH, else the fallback.
+EFFECTIVE_JUDGE="$JUDGE_MODEL"
+if [ -z "$EFFECTIVE_JUDGE" ] && [ -f "$CONFIG_PATH" ]; then
+  EFFECTIVE_JUDGE="$(sed -nE 's/^[[:space:]]*local_judge[[:space:]]*=[[:space:]]*["'\'']([^"'\'']+)["'\''].*/\1/p' "$CONFIG_PATH" | head -n 1)"
+fi
+EFFECTIVE_JUDGE="${EFFECTIVE_JUDGE:-qwen3.5:latest}"
+
+if ! curl -sS http://localhost:11434/api/tags | jq -e --arg m "$EFFECTIVE_JUDGE" \
     '.models[] | select(.name == $m)' >/dev/null 2>&1; then
-  echo "[lift] judge model '$JUDGE_MODEL' not loaded in Ollama — pull it first"
+  echo "[lift] judge model '$EFFECTIVE_JUDGE' not loaded in Ollama — pull it first"
   exit 1
 fi
+echo "[lift] judge resolved to: $EFFECTIVE_JUDGE (from $([ -n "$JUDGE_MODEL" ] && echo env || echo config))"  # not ${VAR:+a}${VAR:-b}: ':-' expands to the value itself when set
 
 echo "[lift] building binaries..."
 go build -o bin/ ./cmd/storaged ./cmd/embedd ./cmd/vectord ./cmd/matrixd ./cmd/gateway \
@@ -121,8 +133,12 @@ echo "[lift] ingest candidates..."
   | grep -v "^\[candidates\]\(matrix\|reality\)" || true
 
 echo
-echo "[lift] running driver — judge=$JUDGE_MODEL · queries=$QUERIES_FILE · k=$K"
+echo "[lift] running driver — judge=$EFFECTIVE_JUDGE · queries=$QUERIES_FILE · k=$K"
+# Pass empty -judge so the Go driver runs its config resolution chain
+# unless the operator explicitly set JUDGE_MODEL (in which case we
+# pass it through, taking priority over config).
 ./bin/playbook_lift \
+  -config  "$CONFIG_PATH" \
   -gateway "http://127.0.0.1:3110" \
   -ollama  "http://localhost:11434" \
   -queries "$QUERIES_FILE" \
@@ -148,7 +164,7 @@ generate_md() {
 # Playbook-Lift Reality Test — Run ${RUN_ID}
 
 **Generated:** ${gen_at}
-**Judge:** \`${JUDGE_MODEL}\` (Ollama)
+**Judge:** \`${EFFECTIVE_JUDGE}\` (Ollama, resolved from $([ -n "${JUDGE_MODEL}" ] && echo "env JUDGE_MODEL" || echo "config [models].local_judge"))
 **Corpora:** \`${CORPORA}\`
 **Workers limit:** ${WORKERS_LIMIT}
 **Queries:** \`${QUERIES_FILE}\` (${total} executed)
@@ -211,6 +227,9 @@ MDEOF
 4. **Multi-corpus skew.** Default corpora=\`${CORPORA}\` — if all judge-best
    results land in one corpus, the matrix layer's purpose isn't being tested.
    Check per-corpus distribution in the JSON.
+5. **Judge resolution.** This run used \`${EFFECTIVE_JUDGE}\` from
+   $([ -n "${JUDGE_MODEL}" ] && echo "env JUDGE_MODEL override" || echo "the lakehouse.toml [models].local_judge tier").
+   Bumping the judge for run #N+1 means editing one line in lakehouse.toml.
 
 ## Next moves
 
diff --git a/scripts/playbook_lift/main.go b/scripts/playbook_lift/main.go
index 383cb20..7eab84f 100644
--- a/scripts/playbook_lift/main.go
+++ b/scripts/playbook_lift/main.go
@@ -40,6 +40,8 @@ import (
 	"sort"
 	"strings"
 	"time"
+
+	"git.agentview.dev/profit/golangLAKEHOUSE/internal/shared"
 )
 
 type matrixResult struct {
@@ -93,15 +95,31 @@ type summary struct {
 }
 
 func main() {
+	configPath := flag.String("config", "lakehouse.toml", "path to TOML config (provides judge default from [models].local_judge)")
 	gw := flag.String("gateway", "http://127.0.0.1:3110", "Go gateway base URL")
 	ollama := flag.String("ollama", "http://127.0.0.1:11434", "Ollama base URL for LLM judge")
 	queries := flag.String("queries", "tests/reality/playbook_lift_queries.txt", "query corpus path")
 	corporaCSV := flag.String("corpora", "workers,candidates", "comma-separated matrix corpora")
-	judge := flag.String("judge", "qwen3.5:latest", "Ollama model for relevance judging")
+	// Empty default — resolved below from (priority): flag > env > config > hardcoded.
+	judge := flag.String("judge", "", "Ollama model for relevance judging (empty = read from config [models].local_judge)")
 	k := flag.Int("k", 10, "top-k from matrix.search per pass")
 	out := flag.String("out", "reports/reality-tests/playbook_lift_001.json", "output JSONL path")
 	flag.Parse()
 
+	// Judge resolution priority: explicit flag > $JUDGE_MODEL env >
+	// cfg.Models.LocalJudge > hardcoded fallback. Phase 3 wired this
+	// up so model bumps land in lakehouse.toml, not in this driver.
+	if *judge == "" {
+		if env := strings.TrimSpace(os.Getenv("JUDGE_MODEL")); env != "" {
+			*judge = env
+		} else if cfg, err := shared.LoadConfig(*configPath); err == nil && cfg.Models.LocalJudge != "" {
+			*judge = cfg.Models.LocalJudge
+		} else {
+			*judge = "qwen3.5:latest"
+			log.Printf("[lift] warn: no judge model from flag/env/config; falling back to %q", *judge)
+		}
+	}
+
 	corpora := strings.Split(*corporaCSV, ",")
 
 	qs, err := loadQueries(*queries)