diff --git a/reports/reality-tests/README.md b/reports/reality-tests/README.md index f72658d..76f0ef5 100644 --- a/reports/reality-tests/README.md +++ b/reports/reality-tests/README.md @@ -34,17 +34,30 @@ See the run reports for honesty caveats — chiefly that the LLM judge IS the gr ## Running a reality test ```bash -# Defaults: judge=qwen3.5:latest, workers limit 5000, run id 001 +# Defaults: judge resolved from lakehouse.toml [models].local_judge, +# workers limit 5000, run id 001 ./scripts/playbook_lift.sh # Re-run with a different judge to check inter-judge agreement -JUDGE_MODEL=qwen2.5:latest RUN_ID=002 ./scripts/playbook_lift.sh +# (env JUDGE_MODEL overrides the config tier) +JUDGE_MODEL=qwen3:latest RUN_ID=002 ./scripts/playbook_lift.sh # Smaller scale for fast iteration WORKERS_LIMIT=1000 K=5 RUN_ID=dev ./scripts/playbook_lift.sh ``` -Requires: Ollama on `:11434` with `nomic-embed-text` + the chosen judge model loaded. Skips cleanly (exit 0) if Ollama is absent. +**Judge resolution priority** (Phase 3, 2026-04-29): +1. `-judge` flag on the Go driver (explicit override) +2. `JUDGE_MODEL` env var (operator override) +3. `lakehouse.toml [models].local_judge` (default) +4. Hardcoded `qwen3.5:latest` (last-resort fallback if config missing) + +This means model bumps land in `lakehouse.toml`, not in this script or +the Go driver. Bumping `local_judge` to a stronger local model (e.g. +when qwen4 ships) takes one line. + +Requires: Ollama on `:11434` with `nomic-embed-text` + the resolved judge +model loaded. Skips cleanly (exit 0) if Ollama is absent. --- diff --git a/scripts/playbook_lift.sh b/scripts/playbook_lift.sh index 082eb65..170a73a 100755 --- a/scripts/playbook_lift.sh +++ b/scripts/playbook_lift.sh @@ -29,11 +29,14 @@ cd "$(dirname "$0")/.." export PATH="$PATH:/usr/local/go/bin" RUN_ID="${RUN_ID:-001}" -JUDGE_MODEL="${JUDGE_MODEL:-qwen3.5:latest}" +# JUDGE_MODEL: empty means "let the Go driver resolve from +# lakehouse.toml [models].local_judge". 
Set explicitly to override. +JUDGE_MODEL="${JUDGE_MODEL:-}" WORKERS_LIMIT="${WORKERS_LIMIT:-5000}" QUERIES_FILE="${QUERIES_FILE:-tests/reality/playbook_lift_queries.txt}" CORPORA="${CORPORA:-workers,candidates}" K="${K:-10}" +CONFIG_PATH="${CONFIG_PATH:-lakehouse.toml}" OUT_JSON="reports/reality-tests/playbook_lift_${RUN_ID}.json" OUT_MD="reports/reality-tests/playbook_lift_${RUN_ID}.md" @@ -43,11 +46,20 @@ if ! curl -sS --max-time 3 http://localhost:11434/api/tags >/dev/null 2>&1; then exit 0 fi -if ! curl -sS http://localhost:11434/api/tags | jq -e --arg m "$JUDGE_MODEL" \ +# Resolve judge from config when not set explicitly — needed for the +# Ollama model-presence check below. Mirrors the Go driver's priority. +EFFECTIVE_JUDGE="$JUDGE_MODEL" +if [ -z "$EFFECTIVE_JUDGE" ] && [ -f "$CONFIG_PATH" ]; then + EFFECTIVE_JUDGE="$(grep -E '^local_judge\s*=' "$CONFIG_PATH" | head -1 | sed -E 's/.*=\s*"([^"]+)".*/\1/')" +fi +EFFECTIVE_JUDGE="${EFFECTIVE_JUDGE:-qwen3.5:latest}" + +if ! curl -sS http://localhost:11434/api/tags | jq -e --arg m "$EFFECTIVE_JUDGE" \ '.models[] | select(.name == $m)' >/dev/null 2>&1; then - echo "[lift] judge model '$JUDGE_MODEL' not loaded in Ollama — pull it first" + echo "[lift] judge model '$EFFECTIVE_JUDGE' not loaded in Ollama — pull it first" exit 1 fi +echo "[lift] judge resolved to: $EFFECTIVE_JUDGE (from $([ -n "$JUDGE_MODEL" ] && echo env || echo config))" echo "[lift] building binaries..." go build -o bin/ ./cmd/storaged ./cmd/embedd ./cmd/vectord ./cmd/matrixd ./cmd/gateway \ @@ -121,8 +133,12 @@ echo "[lift] ingest candidates..." | grep -v "^\[candidates\]\(matrix\|reality\)" || true echo -echo "[lift] running driver — judge=$JUDGE_MODEL · queries=$QUERIES_FILE · k=$K" +echo "[lift] running driver — judge=$EFFECTIVE_JUDGE · queries=$QUERIES_FILE · k=$K" +# Pass empty -judge so the Go driver runs its config resolution chain +# unless the operator explicitly set JUDGE_MODEL (in which case we +# pass it through, taking priority over config). 
./bin/playbook_lift \ + -config "$CONFIG_PATH" \ -gateway "http://127.0.0.1:3110" \ -ollama "http://localhost:11434" \ -queries "$QUERIES_FILE" \ @@ -148,7 +164,7 @@ generate_md() { # Playbook-Lift Reality Test — Run ${RUN_ID} **Generated:** ${gen_at} -**Judge:** \`${JUDGE_MODEL}\` (Ollama) +**Judge:** \`${EFFECTIVE_JUDGE}\` (Ollama, resolved from $([ -n "$JUDGE_MODEL" ] && echo "env JUDGE_MODEL" || echo "config [models].local_judge")) **Corpora:** \`${CORPORA}\` **Workers limit:** ${WORKERS_LIMIT} **Queries:** \`${QUERIES_FILE}\` (${total} executed) @@ -211,6 +227,9 @@ MDEOF 4. **Multi-corpus skew.** Default corpora=\`${CORPORA}\` — if all judge-best results land in one corpus, the matrix layer's purpose isn't being tested. Check per-corpus distribution in the JSON. +5. **Judge resolution.** This run used \`${EFFECTIVE_JUDGE}\` from + $([ -n "$JUDGE_MODEL" ] && echo "env JUDGE_MODEL override" || echo "the lakehouse.toml [models].local_judge tier"). + Bumping the judge for run #N+1 means editing one line in lakehouse.toml. 
## Next moves diff --git a/scripts/playbook_lift/main.go b/scripts/playbook_lift/main.go index 383cb20..7eab84f 100644 --- a/scripts/playbook_lift/main.go +++ b/scripts/playbook_lift/main.go @@ -40,6 +40,8 @@ import ( "sort" "strings" "time" + + "git.agentview.dev/profit/golangLAKEHOUSE/internal/shared" ) type matrixResult struct { @@ -93,15 +95,31 @@ type summary struct { } func main() { + configPath := flag.String("config", "lakehouse.toml", "path to TOML config (provides judge default from [models].local_judge)") gw := flag.String("gateway", "http://127.0.0.1:3110", "Go gateway base URL") ollama := flag.String("ollama", "http://127.0.0.1:11434", "Ollama base URL for LLM judge") queries := flag.String("queries", "tests/reality/playbook_lift_queries.txt", "query corpus path") corporaCSV := flag.String("corpora", "workers,candidates", "comma-separated matrix corpora") - judge := flag.String("judge", "qwen3.5:latest", "Ollama model for relevance judging") + // Empty default — resolved below from (priority): flag > env > config > hardcoded. + judge := flag.String("judge", "", "Ollama model for relevance judging (empty = read from config [models].local_judge)") k := flag.Int("k", 10, "top-k from matrix.search per pass") out := flag.String("out", "reports/reality-tests/playbook_lift_001.json", "output JSONL path") flag.Parse() + // Judge resolution priority: explicit flag > $JUDGE_MODEL env > + // cfg.Models.LocalJudge > hardcoded fallback. Phase 3 wired this + // up so model bumps land in lakehouse.toml, not in this driver. 
+ if *judge == "" { + if env := strings.TrimSpace(os.Getenv("JUDGE_MODEL")); env != "" { + *judge = env + } else if cfg, err := shared.LoadConfig(*configPath); err == nil && cfg.Models.LocalJudge != "" { + *judge = cfg.Models.LocalJudge + } else { + *judge = "qwen3.5:latest" + log.Printf("[lift] warn: no judge model from flag/env/config; falling back to %q", *judge) + } + } + corpora := strings.Split(*corporaCSV, ",") qs, err := loadQueries(*queries)