The rank-based "lift" metric (warm-top-1 == cold-judge-best) doesn't distinguish "Shape B surfaced a strictly-better answer" from "Shape B shuffled ranks but quality is unchanged" from "Shape B replaced a good answer with a wrong one." This commit adds Pass 4: judge warm top-1 with the same prompt as cold ratings, then bucket the comparison. Implementation: - New --with-rejudge driver flag (default off). - New WITH_REJUDGE harness env (default 1, on for prod runs). - queryRun gains WarmTop1Metadata (cached during Pass 2 for the rejudge call) + WarmTop1Rating *int (nil-distinguishable; nil = no rejudge, 0..5 = rating). - summary gains RejudgeAttempted, QualityLifted, QualityNeutral, QualityRegressed (counts of warm-rating > / == / < cold-rating). - Markdown headline gains a Quality block when rejudge ran. - ~21 extra judge calls (~30s on qwen2.5). Run #005 result (split inject threshold 0.20 + paraphrase + rejudge): Quality lifted 5 / 21 (24%) — 3× +2 rating, 2× +1 rating Quality neutral 13 / 21 (62%) — includes OOD queries holding 1 Quality regressed 3 / 21 (14%) Net rating delta +3 across 21 queries (+0.14 average) The 5 lifts were all rating-2 cold replaced with rating-3 or rating-4 warm — Shape B took mediocre matches and substituted substantively better ones. The 3 regressions were small (-1, -1, -3). Q11 is the cautionary tale: cold top-1 "production line worker" (rating 4) got replaced by Q1's recorded "forklift OSHA-30 operator" e-5729 (rating 1). Adjacent-domain cross-pollination — production worker and forklift operator embed within 0.20 cosine because both are warehouse-adjacent staffing queries, even though the judge correctly distinguishes them. The split-threshold defense (0.5 boost / 0.20 inject) catches OOD cross-pollination (Q19/Q20/Q21 all stayed neutral at rating 1) but not adjacent-domain cross-pollination. Net product verdict: working, net-positive on quality, but the worst case (Q11 4→1) is customer-visible and warrants a tighter inject threshold OR an additional gate beyond cosine distance. Filed in STATE_OF_PLAY OPEN as a follow-up. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
620 lines
22 KiB
Go
620 lines
22 KiB
Go
// Playbook-lift reality test driver. Two-pass design:
|
|
//
|
|
// Pass 1 (cold): for each query → matrix.search use_playbook=false →
|
|
// LLM judge rates top-K → record playbook entry pointing
|
|
// at the highest-rated result (which may NOT be top-1
|
|
// by distance — that's the discovery worth boosting).
|
|
//
|
|
// Pass 2 (warm): same queries → use_playbook=true → measure how the
|
|
// ranking shifted.
|
|
//
|
|
// Lift = real if pass-2 brings the LLM-judged-best result into top-1
|
|
// more often than pass-1. If lift ≈ 0, the playbook is just confirming
|
|
// what cosine already said and the 5-loop thesis is unproven.
|
|
//
|
|
// Honest about what this measures: with no human-labeled ground truth,
|
|
// the LLM judge IS the ground truth proxy. That's the small-model
|
|
// pipeline thesis itself — the same model class that runs the inner
|
|
// loop is also what we trust to evaluate it. If you don't trust the
|
|
// judge, the lift number is meaningless; that's a separate problem
|
|
// for ground-truth labeling.
|
|
//
|
|
// Usage (driven by scripts/playbook_lift.sh):
|
|
// playbook_lift -gateway http://127.0.0.1:3110 \
|
|
// -queries tests/reality/playbook_lift_queries.txt \
|
|
// -judge qwen3.5:latest \
|
|
// -corpora workers,candidates \
|
|
// -k 10 \
|
|
// -out reports/reality-tests/playbook_lift_001.json
|
|
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
"git.agentview.dev/profit/golangLAKEHOUSE/internal/shared"
|
|
)
|
|
|
|
type matrixResult struct {
|
|
ID string `json:"id"`
|
|
Distance float32 `json:"distance"`
|
|
Corpus string `json:"corpus"`
|
|
Metadata json.RawMessage `json:"metadata,omitempty"`
|
|
}
|
|
|
|
type matrixResp struct {
|
|
Results []matrixResult `json:"results"`
|
|
PerCorpusCounts map[string]int `json:"per_corpus_counts"`
|
|
PlaybookBoosted int `json:"playbook_boosted,omitempty"`
|
|
}
|
|
|
|
type judgeVerdict struct {
|
|
Rating int `json:"rating"`
|
|
Reason string `json:"reason"`
|
|
}
|
|
|
|
type queryRun struct {
|
|
Query string `json:"query"`
|
|
|
|
ColdTop1ID string `json:"cold_top1_id"`
|
|
ColdTop1Distance float32 `json:"cold_top1_distance"`
|
|
ColdJudgeBestID string `json:"cold_judge_best_id"`
|
|
ColdJudgeBestRank int `json:"cold_judge_best_rank"`
|
|
ColdJudgeBestRating int `json:"cold_judge_best_rating"`
|
|
ColdRatings []int `json:"cold_ratings"`
|
|
|
|
PlaybookRecorded bool `json:"playbook_recorded"`
|
|
PlaybookID string `json:"playbook_target_id,omitempty"`
|
|
|
|
WarmTop1ID string `json:"warm_top1_id"`
|
|
WarmTop1Distance float32 `json:"warm_top1_distance"`
|
|
WarmBoostedCount int `json:"warm_boosted_count"`
|
|
WarmJudgeBestRank int `json:"warm_judge_best_rank"` // rank of cold judge-best in warm — NOT the warm pass's own judge-best
|
|
WarmTop1Metadata json.RawMessage `json:"-"` // cached for Pass 4 rejudge; not emitted
|
|
|
|
// WarmTop1Rating: only populated when --with-rejudge. Compare to
|
|
// ColdRatings[0] (== cold top-1 rating) to measure quality lift.
|
|
// *int so absence (no rejudge pass) and a 0-rating verdict are
|
|
// distinguishable.
|
|
WarmTop1Rating *int `json:"warm_top1_rating,omitempty"`
|
|
|
|
Lift bool `json:"lift"` // judge-best was below top-1 cold, but top-1 warm
|
|
|
|
// Paraphrase pass — only populated when --with-paraphrase. Tests
|
|
// the playbook's actual learning property: does a recorded entry
|
|
// for query Q help a similar-but-different query Q'?
|
|
//
|
|
// ParaphraseRecordedRank semantics:
|
|
// nil = paraphrase pass didn't run for this query (no playbook
|
|
// was recorded in cold pass, so nothing to test)
|
|
// 0 = recorded answer landed at top-1
|
|
// 1..K-1 = recorded answer present in top-K at that rank
|
|
// -1 = recorded answer absent from top-K
|
|
// Pointer (not int) so nil and rank-0 are distinguishable in JSON.
|
|
ParaphraseQuery string `json:"paraphrase_query,omitempty"`
|
|
ParaphraseTop1ID string `json:"paraphrase_top1_id,omitempty"`
|
|
ParaphraseRecordedRank *int `json:"paraphrase_recorded_rank,omitempty"`
|
|
ParaphraseLift bool `json:"paraphrase_lift,omitempty"` // recorded answer at rank 0 for paraphrase
|
|
|
|
Note string `json:"note,omitempty"`
|
|
}
|
|
|
|
type summary struct {
|
|
Total int `json:"total"`
|
|
WithDiscovery int `json:"with_discovery"` // judge-best != cold top-1
|
|
LiftCount int `json:"lift_count"` // top-1 changed warm→ judge-best
|
|
NoChange int `json:"no_change"`
|
|
MeanTop1DeltaDistance float32 `json:"mean_top1_delta_distance"`
|
|
PlaybookBoostedTotal int `json:"playbook_boosted_total"`
|
|
|
|
// Paraphrase pass aggregates — only populated when --with-paraphrase.
|
|
ParaphraseAttempted int `json:"paraphrase_attempted,omitempty"` // queries with playbook recorded that ran a paraphrase
|
|
ParaphraseTop1Lifts int `json:"paraphrase_top1_lifts,omitempty"` // recorded answer surfaced at rank 0
|
|
ParaphraseAnyRankHits int `json:"paraphrase_any_rank_hits,omitempty"` // recorded answer surfaced at any rank in top-K
|
|
|
|
// Re-judge pass aggregates — only populated when --with-rejudge.
|
|
// Measures QUALITY lift (warm top-1 rating vs cold top-1 rating)
|
|
// rather than rank-of-cold-judge-best lift. The latter conflates
|
|
// "warm surfaced a different but equally-good result" with "warm
|
|
// shuffled ranks but the answer was the same"; quality lift
|
|
// disambiguates them.
|
|
RejudgeAttempted int `json:"rejudge_attempted,omitempty"` // queries that ran the rejudge pass
|
|
QualityLifted int `json:"quality_lifted,omitempty"` // warm-top-1 rating > cold-top-1 rating
|
|
QualityNeutral int `json:"quality_neutral,omitempty"` // ratings equal (could be same or different item)
|
|
QualityRegressed int `json:"quality_regressed,omitempty"` // warm-top-1 rating < cold-top-1 rating
|
|
|
|
GeneratedAt time.Time `json:"generated_at"`
|
|
}
|
|
|
|
func main() {
|
|
configPath := flag.String("config", "lakehouse.toml", "path to TOML config (provides judge default from [models].local_judge)")
|
|
gw := flag.String("gateway", "http://127.0.0.1:3110", "Go gateway base URL")
|
|
ollama := flag.String("ollama", "http://127.0.0.1:11434", "Ollama base URL for LLM judge")
|
|
queries := flag.String("queries", "tests/reality/playbook_lift_queries.txt", "query corpus path")
|
|
corporaCSV := flag.String("corpora", "workers,candidates", "comma-separated matrix corpora")
|
|
// Empty default — resolved below from (priority): flag > env > config > hardcoded.
|
|
judge := flag.String("judge", "", "Ollama model for relevance judging (empty = read from config [models].local_judge)")
|
|
k := flag.Int("k", 10, "top-k from matrix.search per pass")
|
|
out := flag.String("out", "reports/reality-tests/playbook_lift_001.json", "output JSONL path")
|
|
withParaphrase := flag.Bool("with-paraphrase", false, "after warm pass, generate a paraphrase via the judge model and re-query with playbook=true to test the learning property")
|
|
withRejudge := flag.Bool("with-rejudge", false, "after warm pass, judge warm top-1 to measure QUALITY lift (vs cold top-1 rating), not just rank-of-cold-judge-best")
|
|
flag.Parse()
|
|
|
|
// Judge resolution priority: explicit flag > $JUDGE_MODEL env >
|
|
// cfg.Models.LocalJudge > hardcoded fallback. Phase 3 wired this
|
|
// up so model bumps land in lakehouse.toml, not in this driver.
|
|
if *judge == "" {
|
|
if env := strings.TrimSpace(os.Getenv("JUDGE_MODEL")); env != "" {
|
|
*judge = env
|
|
} else if cfg, err := shared.LoadConfig(*configPath); err == nil && cfg.Models.LocalJudge != "" {
|
|
*judge = cfg.Models.LocalJudge
|
|
} else {
|
|
*judge = "qwen3.5:latest"
|
|
log.Printf("[lift] warn: no judge model from flag/env/config; falling back to %q", *judge)
|
|
}
|
|
}
|
|
|
|
corpora := strings.Split(*corporaCSV, ",")
|
|
|
|
qs, err := loadQueries(*queries)
|
|
if err != nil {
|
|
log.Fatalf("load queries: %v", err)
|
|
}
|
|
if len(qs) == 0 {
|
|
log.Fatalf("no queries in %s", *queries)
|
|
}
|
|
log.Printf("[lift] %d queries · corpora=%v · k=%d · judge=%s", len(qs), corpora, *k, *judge)
|
|
|
|
hc := &http.Client{Timeout: 60 * time.Second}
|
|
runs := make([]queryRun, 0, len(qs))
|
|
totalDelta := float32(0)
|
|
playbookBoostedTotal := 0
|
|
withDiscovery := 0
|
|
liftCount := 0
|
|
noChange := 0
|
|
|
|
// Pass 1 (cold) + record playbooks based on judge verdicts.
|
|
for i, q := range qs {
|
|
log.Printf("[lift] (%d/%d cold) %s", i+1, len(qs), abbrev(q, 60))
|
|
resp, err := matrixSearch(hc, *gw, q, corpora, *k, false)
|
|
if err != nil {
|
|
log.Printf(" cold search failed: %v — skipping", err)
|
|
continue
|
|
}
|
|
if len(resp.Results) == 0 {
|
|
log.Printf(" cold returned 0 results — skipping")
|
|
continue
|
|
}
|
|
ratings := make([]int, len(resp.Results))
|
|
bestRank := 0
|
|
bestRating := -1
|
|
for j, r := range resp.Results {
|
|
rating := judgeRate(hc, *ollama, *judge, q, r)
|
|
ratings[j] = rating
|
|
if rating > bestRating {
|
|
bestRating = rating
|
|
bestRank = j
|
|
}
|
|
}
|
|
run := queryRun{
|
|
Query: q,
|
|
ColdTop1ID: resp.Results[0].ID,
|
|
ColdTop1Distance: resp.Results[0].Distance,
|
|
ColdJudgeBestID: resp.Results[bestRank].ID,
|
|
ColdJudgeBestRank: bestRank,
|
|
ColdJudgeBestRating: bestRating,
|
|
ColdRatings: ratings,
|
|
}
|
|
// Record a playbook only if the judge best is not already top-1
|
|
// (otherwise we're boosting something cosine already crowned).
|
|
if bestRank > 0 && bestRating >= 4 {
|
|
withDiscovery++
|
|
if err := playbookRecord(hc, *gw, q, resp.Results[bestRank].ID, resp.Results[bestRank].Corpus, 1.0); err != nil {
|
|
log.Printf(" playbook record failed: %v", err)
|
|
run.Note = "playbook record failed: " + err.Error()
|
|
} else {
|
|
run.PlaybookRecorded = true
|
|
run.PlaybookID = resp.Results[bestRank].ID
|
|
}
|
|
} else if bestRank == 0 {
|
|
run.Note = "judge-best already top-1 cold — no playbook needed"
|
|
} else {
|
|
run.Note = fmt.Sprintf("judge-best rating %d below threshold (4) — no playbook", bestRating)
|
|
}
|
|
runs = append(runs, run)
|
|
}
|
|
|
|
// Pass 2 (warm) on the same queries.
|
|
for i := range runs {
|
|
q := runs[i].Query
|
|
log.Printf("[lift] (%d/%d warm) %s", i+1, len(runs), abbrev(q, 60))
|
|
resp, err := matrixSearch(hc, *gw, q, corpora, *k, true)
|
|
if err != nil || len(resp.Results) == 0 {
|
|
runs[i].Note = appendNote(runs[i].Note, fmt.Sprintf("warm search failed: %v", err))
|
|
continue
|
|
}
|
|
runs[i].WarmTop1ID = resp.Results[0].ID
|
|
runs[i].WarmTop1Distance = resp.Results[0].Distance
|
|
runs[i].WarmTop1Metadata = resp.Results[0].Metadata // cache for Pass 4 rejudge
|
|
runs[i].WarmBoostedCount = resp.PlaybookBoosted
|
|
playbookBoostedTotal += resp.PlaybookBoosted
|
|
|
|
// Find where the cold judge-best ID landed in the warm ranking.
|
|
warmRank := -1
|
|
for j, r := range resp.Results {
|
|
if r.ID == runs[i].ColdJudgeBestID {
|
|
warmRank = j
|
|
break
|
|
}
|
|
}
|
|
runs[i].WarmJudgeBestRank = warmRank
|
|
|
|
switch {
|
|
case runs[i].PlaybookRecorded && warmRank == 0:
|
|
runs[i].Lift = true
|
|
liftCount++
|
|
case !runs[i].PlaybookRecorded:
|
|
noChange++
|
|
default:
|
|
noChange++
|
|
}
|
|
totalDelta += runs[i].WarmTop1Distance - runs[i].ColdTop1Distance
|
|
}
|
|
|
|
// Pass 3 (paraphrase) — opt-in via --with-paraphrase. For each
|
|
// query where a playbook was recorded in Pass 1, generate a
|
|
// paraphrase via the judge model and run it through warm
|
|
// matrix.search. The expectation: if the playbook's learning
|
|
// property holds (cosine on embed(paraphrase) finds the recorded
|
|
// embed(query) within DefaultPlaybookMaxDistance), the recorded
|
|
// answer should appear at top-1 for the paraphrase too. This is
|
|
// the claim from the report's caveat #3 that v1 didn't test.
|
|
paraphraseAttempted := 0
|
|
paraphraseTop1Lifts := 0
|
|
paraphraseAnyRankHits := 0
|
|
if *withParaphrase {
|
|
log.Printf("[lift] paraphrase pass: testing playbook learning property")
|
|
for i := range runs {
|
|
if !runs[i].PlaybookRecorded {
|
|
continue
|
|
}
|
|
paraphraseAttempted++
|
|
paraphrase, err := generateParaphrase(hc, *ollama, *judge, runs[i].Query)
|
|
if err != nil {
|
|
log.Printf(" (%d) paraphrase generation failed: %v", i+1, err)
|
|
runs[i].Note = appendNote(runs[i].Note, "paraphrase gen failed: "+err.Error())
|
|
continue
|
|
}
|
|
runs[i].ParaphraseQuery = paraphrase
|
|
log.Printf("[lift] (%d/%d paraphrase) %s → %s", i+1, len(runs),
|
|
abbrev(runs[i].Query, 40), abbrev(paraphrase, 40))
|
|
|
|
resp, err := matrixSearch(hc, *gw, paraphrase, corpora, *k, true)
|
|
if err != nil || len(resp.Results) == 0 {
|
|
runs[i].Note = appendNote(runs[i].Note, fmt.Sprintf("paraphrase search failed: %v", err))
|
|
missed := -1
|
|
runs[i].ParaphraseRecordedRank = &missed
|
|
continue
|
|
}
|
|
runs[i].ParaphraseTop1ID = resp.Results[0].ID
|
|
recordedRank := -1
|
|
for j, r := range resp.Results {
|
|
if r.ID == runs[i].PlaybookID {
|
|
recordedRank = j
|
|
break
|
|
}
|
|
}
|
|
runs[i].ParaphraseRecordedRank = &recordedRank
|
|
if recordedRank == 0 {
|
|
runs[i].ParaphraseLift = true
|
|
paraphraseTop1Lifts++
|
|
paraphraseAnyRankHits++
|
|
} else if recordedRank > 0 {
|
|
paraphraseAnyRankHits++
|
|
}
|
|
}
|
|
}
|
|
|
|
// Pass 4 (warm-rejudge) — opt-in via --with-rejudge. Judge warm
|
|
// top-1 against the same prompt as cold ratings, then compare to
|
|
// cold top-1 rating. This measures QUALITY lift (did the playbook
|
|
// produce a better candidate?) rather than just rank-of-cold-judge-
|
|
// best lift (did the recorded answer move to top-1, even if cold's
|
|
// top-1 was already good?). See STATE_OF_PLAY OPEN — added because
|
|
// run #003's verbatim 2/6 didn't tell us whether Shape B was
|
|
// surfacing better OR same-quality alternatives.
|
|
rejudgeAttempted := 0
|
|
qualityLifted := 0
|
|
qualityNeutral := 0
|
|
qualityRegressed := 0
|
|
if *withRejudge {
|
|
log.Printf("[lift] warm-rejudge pass: measuring quality lift (warm top-1 rating vs cold top-1 rating)")
|
|
for i := range runs {
|
|
if runs[i].WarmTop1ID == "" || len(runs[i].WarmTop1Metadata) == 0 {
|
|
continue // warm pass didn't complete for this query
|
|
}
|
|
rejudgeAttempted++
|
|
result := matrixResult{
|
|
ID: runs[i].WarmTop1ID,
|
|
Distance: runs[i].WarmTop1Distance,
|
|
Metadata: runs[i].WarmTop1Metadata,
|
|
}
|
|
warmRating := judgeRate(hc, *ollama, *judge, runs[i].Query, result)
|
|
runs[i].WarmTop1Rating = &warmRating
|
|
coldRating := 0
|
|
if len(runs[i].ColdRatings) > 0 {
|
|
coldRating = runs[i].ColdRatings[0]
|
|
}
|
|
switch {
|
|
case warmRating > coldRating:
|
|
qualityLifted++
|
|
case warmRating < coldRating:
|
|
qualityRegressed++
|
|
default:
|
|
qualityNeutral++
|
|
}
|
|
}
|
|
}
|
|
|
|
sum := summary{
|
|
Total: len(runs),
|
|
WithDiscovery: withDiscovery,
|
|
LiftCount: liftCount,
|
|
NoChange: noChange,
|
|
MeanTop1DeltaDistance: 0,
|
|
PlaybookBoostedTotal: playbookBoostedTotal,
|
|
ParaphraseAttempted: paraphraseAttempted,
|
|
ParaphraseTop1Lifts: paraphraseTop1Lifts,
|
|
ParaphraseAnyRankHits: paraphraseAnyRankHits,
|
|
RejudgeAttempted: rejudgeAttempted,
|
|
QualityLifted: qualityLifted,
|
|
QualityNeutral: qualityNeutral,
|
|
QualityRegressed: qualityRegressed,
|
|
GeneratedAt: time.Now().UTC(),
|
|
}
|
|
if len(runs) > 0 {
|
|
sum.MeanTop1DeltaDistance = totalDelta / float32(len(runs))
|
|
}
|
|
|
|
if err := writeJSON(*out, runs, sum); err != nil {
|
|
log.Fatalf("write %s: %v", *out, err)
|
|
}
|
|
if *withParaphrase || *withRejudge {
|
|
log.Printf("[lift] DONE — %d queries · discovery=%d · lift=%d · boosted=%d · meanΔdist=%.4f · paraphrase=%d/%d→top1 · quality=lifted%d/neutral%d/regressed%d",
|
|
sum.Total, sum.WithDiscovery, sum.LiftCount, sum.PlaybookBoostedTotal, sum.MeanTop1DeltaDistance,
|
|
sum.ParaphraseTop1Lifts, sum.ParaphraseAttempted,
|
|
sum.QualityLifted, sum.QualityNeutral, sum.QualityRegressed)
|
|
} else {
|
|
log.Printf("[lift] DONE — %d queries · discovery=%d · lift=%d · boosted=%d · meanΔdist=%.4f",
|
|
sum.Total, sum.WithDiscovery, sum.LiftCount, sum.PlaybookBoostedTotal, sum.MeanTop1DeltaDistance)
|
|
}
|
|
log.Printf("[lift] results → %s", *out)
|
|
}
|
|
|
|
// generateParaphrase asks the judge model to rephrase a staffing query
|
|
// while preserving intent. Used in the paraphrase pass to test whether
|
|
// the playbook's recorded embedding survives wording variation.
|
|
//
|
|
// temperature=0.5 — enough variance to make the paraphrase actually
|
|
// different, but not so high that it drifts off the staffing domain.
|
|
// format=json + a tight schema makes parsing deterministic.
|
|
func generateParaphrase(hc *http.Client, ollamaURL, model, query string) (string, error) {
|
|
system := `You rephrase staffing queries while preserving intent.
|
|
Output JSON only: {"paraphrase": "<rephrased query>"}.
|
|
Rules:
|
|
- Keep the same role, certifications, geography, and constraints.
|
|
- Vary the wording (synonyms, reordered clauses, different sentence shape).
|
|
- Do NOT add or remove requirements.
|
|
- Do NOT explain — just emit the JSON.`
|
|
body := map[string]any{
|
|
"model": model,
|
|
"stream": false,
|
|
"format": "json",
|
|
"messages": []map[string]string{
|
|
{"role": "system", "content": system},
|
|
{"role": "user", "content": query},
|
|
},
|
|
"options": map[string]any{"temperature": 0.5},
|
|
}
|
|
bs, _ := json.Marshal(body)
|
|
req, _ := http.NewRequest("POST", ollamaURL+"/api/chat", bytes.NewReader(bs))
|
|
req.Header.Set("Content-Type", "application/json")
|
|
resp, err := hc.Do(req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode/100 != 2 {
|
|
return "", fmt.Errorf("ollama chat: HTTP %d", resp.StatusCode)
|
|
}
|
|
rb, _ := io.ReadAll(resp.Body)
|
|
var ollamaResp struct {
|
|
Message struct {
|
|
Content string `json:"content"`
|
|
} `json:"message"`
|
|
}
|
|
if err := json.Unmarshal(rb, &ollamaResp); err != nil {
|
|
return "", fmt.Errorf("decode ollama envelope: %w", err)
|
|
}
|
|
var out struct {
|
|
Paraphrase string `json:"paraphrase"`
|
|
}
|
|
if err := json.Unmarshal([]byte(ollamaResp.Message.Content), &out); err != nil {
|
|
return "", fmt.Errorf("decode paraphrase JSON: %w (content=%q)", err, ollamaResp.Message.Content)
|
|
}
|
|
if strings.TrimSpace(out.Paraphrase) == "" {
|
|
return "", fmt.Errorf("empty paraphrase (content=%q)", ollamaResp.Message.Content)
|
|
}
|
|
return out.Paraphrase, nil
|
|
}
|
|
|
|
func loadQueries(path string) ([]string, error) {
|
|
bs, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
var out []string
|
|
for _, line := range strings.Split(string(bs), "\n") {
|
|
s := strings.TrimSpace(line)
|
|
if s == "" || strings.HasPrefix(s, "#") {
|
|
continue
|
|
}
|
|
out = append(out, s)
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
func matrixSearch(hc *http.Client, gw, query string, corpora []string, k int, usePlaybook bool) (*matrixResp, error) {
|
|
body := map[string]any{
|
|
"query_text": query,
|
|
"corpora": corpora,
|
|
"k": k,
|
|
"per_corpus_k": k,
|
|
"use_playbook": usePlaybook,
|
|
}
|
|
bs, _ := json.Marshal(body)
|
|
req, _ := http.NewRequest("POST", gw+"/v1/matrix/search", bytes.NewReader(bs))
|
|
req.Header.Set("Content-Type", "application/json")
|
|
resp, err := hc.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
rb, _ := io.ReadAll(resp.Body)
|
|
if resp.StatusCode/100 != 2 {
|
|
return nil, fmt.Errorf("status %d: %s", resp.StatusCode, string(rb))
|
|
}
|
|
var out matrixResp
|
|
if err := json.Unmarshal(rb, &out); err != nil {
|
|
return nil, fmt.Errorf("unmarshal: %w (body=%s)", err, abbrev(string(rb), 200))
|
|
}
|
|
return &out, nil
|
|
}
|
|
|
|
func playbookRecord(hc *http.Client, gw, query, answerID, answerCorpus string, score float64) error {
|
|
body := map[string]any{
|
|
"query_text": query,
|
|
"answer_id": answerID,
|
|
"answer_corpus": answerCorpus,
|
|
"score": score,
|
|
"tags": []string{"reality-test", "playbook-lift-001"},
|
|
}
|
|
bs, _ := json.Marshal(body)
|
|
req, _ := http.NewRequest("POST", gw+"/v1/matrix/playbooks/record", bytes.NewReader(bs))
|
|
req.Header.Set("Content-Type", "application/json")
|
|
resp, err := hc.Do(req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode/100 != 2 {
|
|
rb, _ := io.ReadAll(resp.Body)
|
|
return fmt.Errorf("status %d: %s", resp.StatusCode, string(rb))
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// judgeRate calls Ollama's /api/chat directly and asks for a 1-5 rating
|
|
// of the result against the query. Returns 0 on any failure (treated as
|
|
// "couldn't judge, exclude from best-of consideration").
|
|
func judgeRate(hc *http.Client, ollamaURL, model, query string, r matrixResult) int {
|
|
system := `You rate retrieval results for a staffing co-pilot.
|
|
Rate the result 1-5 against the query:
|
|
5 = perfect match (this person/job IS what was asked for)
|
|
4 = strong match (right field, right level, minor mismatches)
|
|
3 = adjacent match (related field or partial overlap)
|
|
2 = weak/tangential match
|
|
1 = irrelevant
|
|
Output JSON only: {"rating": N, "reason": "<one sentence>"}.`
|
|
user := fmt.Sprintf("Query: %q\n\nResult corpus: %s\nResult ID: %s\nResult metadata:\n%s",
|
|
query, r.Corpus, r.ID, string(r.Metadata))
|
|
|
|
body := map[string]any{
|
|
"model": model,
|
|
"stream": false,
|
|
"format": "json",
|
|
"messages": []map[string]string{
|
|
{"role": "system", "content": system},
|
|
{"role": "user", "content": user},
|
|
},
|
|
"options": map[string]any{"temperature": 0},
|
|
}
|
|
bs, _ := json.Marshal(body)
|
|
req, _ := http.NewRequest("POST", ollamaURL+"/api/chat", bytes.NewReader(bs))
|
|
req.Header.Set("Content-Type", "application/json")
|
|
resp, err := hc.Do(req)
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode/100 != 2 {
|
|
return 0
|
|
}
|
|
rb, _ := io.ReadAll(resp.Body)
|
|
var ollamaResp struct {
|
|
Message struct {
|
|
Content string `json:"content"`
|
|
} `json:"message"`
|
|
}
|
|
if err := json.Unmarshal(rb, &ollamaResp); err != nil {
|
|
return 0
|
|
}
|
|
var v judgeVerdict
|
|
if err := json.Unmarshal([]byte(ollamaResp.Message.Content), &v); err != nil {
|
|
return 0
|
|
}
|
|
if v.Rating < 1 || v.Rating > 5 {
|
|
return 0
|
|
}
|
|
return v.Rating
|
|
}
|
|
|
|
func writeJSON(path string, runs []queryRun, sum summary) error {
|
|
if err := os.MkdirAll(filepath_dir(path), 0o755); err != nil {
|
|
return err
|
|
}
|
|
out := struct {
|
|
Summary summary `json:"summary"`
|
|
Runs []queryRun `json:"runs"`
|
|
}{Summary: sum, Runs: runs}
|
|
bs, err := json.MarshalIndent(out, "", " ")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return os.WriteFile(path, bs, 0o644)
|
|
}
|
|
|
|
func filepath_dir(p string) string {
|
|
if i := strings.LastIndex(p, "/"); i >= 0 {
|
|
return p[:i]
|
|
}
|
|
return "."
|
|
}
|
|
|
|
func abbrev(s string, n int) string {
|
|
if len(s) <= n {
|
|
return s
|
|
}
|
|
return s[:n] + "…"
|
|
}
|
|
|
|
func appendNote(existing, add string) string {
|
|
if existing == "" {
|
|
return add
|
|
}
|
|
return existing + "; " + add
|
|
}
|
|
|
|
// Suppress unused-import warning when sort isn't used in a future
|
|
// refactor; harmless for now.
|
|
var _ = sort.Slice
|