Lift suite run #004 left two unresolved tail issues:

- Q6 ("Forklift loader") ↔ Q7 ("Hazmat warehouse, cold storage") swap recordings as warm top-1 because their embeddings are within 0.20 cosine of each other. The distance gate can't tell them apart.
- Q9 + Q15 lose paraphrase recovery when qwen2.5 rephrases past the 0.20 threshold. Distance says "drift too far"; sometimes the drift is real (skip), sometimes the paraphrase is still on-domain (don't want to skip).

Multi-coord run #008's judge re-rating proved the LLM can distinguish: the Q3 crane case landed at distance 0.23 (looks tight) but rating 1 (irrelevant). The judge sees domain mismatch the embedder doesn't.

This commit lifts that pattern into the matrix substrate. Shape B inject now optionally routes every candidate through a judge gate before the rank insert lands. Distance + judge BOTH have to approve.

internal/matrix/playbook.go:
- InjectPlaybookMisses signature gains a query string + an optional InjectGate. A nil gate preserves pre-judge-gating behavior (current tests already pass with nil).
- New InjectGate interface + InjectGateFunc adapter for tests and non-LLM callers.
- Per-candidate gate.Approve(query, hit) call inserted between the dedup and the inject. Rejected candidates skip silently; the injected count reflects the post-gate decision.

internal/matrix/judge.go (new, ~140 lines):
- LLMJudgeGate calls an Ollama-shape /api/chat endpoint with the same 1-5 staffing-rubric prompt that worked in multi_coord run #008. Fail-closed on HTTP/JSON errors (don't inject if the judge can't speak — better a miss than a wrong-domain hit).
- NewLLMJudgeGate returns nil when URL or Model is empty, matching InjectGate's nil-means-no-judge semantics.

internal/matrix/retrieve.go:
- SearchRequest gains JudgeURL, JudgeModel, JudgeMinRating fields. Run() builds an LLMJudgeGate when set; passes nil otherwise. Backward compatible — existing callers see no behavior change.
Tests:
- TestInjectPlaybookMisses_GateRejectsCandidate (rejectAll → 0 injected, even with tight distance)
- TestInjectPlaybookMisses_GateApprovesCandidate (approveAll → same as nil-gate behavior)
- TestInjectPlaybookMisses_GateSeesCorrectQuery (gate receives CURRENT query + RECORDED query separately so it can score the (current, candidate) pair)
- All 5 existing inject tests updated to the new signature

go test ./internal/matrix → all 8 inject tests pass.
go test ./internal/matrix ./internal/shared ./cmd/{matrixd,queryd,pathwayd,observerd} → all green.

STATE_OF_PLAY:
- OPEN item #1 (judge-gated injection) closed.
- DO NOT RELITIGATE adds the substrate-level judge-gate lock.
- OPEN list now 5 rows (was 6).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
153 lines
5.0 KiB
Go
153 lines
5.0 KiB
Go
package matrix
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// LLMJudgeGate is the LLM-backed InjectGate implementation. For each
// (query, candidate) pair it asks a chat endpoint for a rating on a
// 1-5 rubric and approves the injection iff rating >= MinRating.
//
// The HTTP path is deliberately generic: any endpoint that speaks
// Ollama's /api/chat JSON shape works — bare Ollama, chatd's /v1/chat,
// or anything else honoring the same request/response envelope.
// Every judge call runs under a fixed 10s context deadline in
// addition to whatever timeout HTTPClient itself carries.
//
// Posture is fail-closed: a judge call that fails for any reason
// (network, non-2xx, JSON decode) does not approve. When the judge
// can't speak, don't inject — better a silent miss than a confident
// wrong-domain hit.
//
// Usage from retrieve.go:
//
//	gate := matrix.NewLLMJudgeGate(req.JudgeURL, req.JudgeModel,
//		req.JudgeMinRating, hc)
//	results, injected = matrix.InjectPlaybookMisses(req.QueryText,
//		results, hits, maxInjectDist, gate)
type LLMJudgeGate struct {
	// URL is the full chat endpoint, path included.
	URL string
	// Model is the model name sent in the request body.
	Model string
	// MinRating is the lowest rubric rating that approves an injection.
	MinRating int
	// HTTPClient performs the judge request.
	HTTPClient *http.Client
}
|
|
|
|
// NewLLMJudgeGate is the constructor. Defaults: minRating 3, 10s
|
|
// HTTP timeout. URL must include the path (e.g.
|
|
// "http://localhost:11434/api/chat" for bare Ollama). Returns nil
|
|
// when URL or Model is empty — caller treats nil InjectGate as
|
|
// "no judge configured, default-approve" per InjectPlaybookMisses
|
|
// contract.
|
|
func NewLLMJudgeGate(url, model string, minRating int, hc *http.Client) *LLMJudgeGate {
|
|
if url == "" || model == "" {
|
|
return nil
|
|
}
|
|
if minRating <= 0 {
|
|
minRating = 3
|
|
}
|
|
if hc == nil {
|
|
hc = &http.Client{Timeout: 10 * time.Second}
|
|
}
|
|
return &LLMJudgeGate{
|
|
URL: url,
|
|
Model: model,
|
|
MinRating: minRating,
|
|
HTTPClient: hc,
|
|
}
|
|
}
|
|
|
|
// Approve calls the LLM judge with a query+candidate prompt; returns
|
|
// true iff the judge's rating meets MinRating. Errors return false
|
|
// (fail-closed — see type doc).
|
|
func (g *LLMJudgeGate) Approve(query string, hit PlaybookHit) bool {
|
|
if g == nil || query == "" {
|
|
// No judge or no query to judge against — treat as approve.
|
|
// Empty-query case mirrors InjectPlaybookMisses' contract:
|
|
// callers without a query string can't usefully judge.
|
|
return true
|
|
}
|
|
rating := g.rate(query, hit)
|
|
return rating >= g.MinRating
|
|
}
|
|
|
|
func (g *LLMJudgeGate) rate(query string, hit PlaybookHit) int {
|
|
system := `You rate retrieval results for a staffing co-pilot.
|
|
Rate the result 1-5 against the query:
|
|
5 = perfect match (this person/role IS what was asked for)
|
|
4 = strong match (right field, right level, minor mismatches)
|
|
3 = adjacent match (related field or partial overlap)
|
|
2 = weak/tangential match
|
|
1 = irrelevant
|
|
Output JSON only: {"rating": N, "reason": "<one sentence>"}.`
|
|
// We pass the recorded query text + answer ID to give the judge
|
|
// minimal context. Production might also fetch the answer's
|
|
// metadata, but that requires a second HTTP hop; the recorded
|
|
// query is usually enough to sniff wrong-domain matches.
|
|
user := fmt.Sprintf("Query: %q\n\nCandidate playbook entry:\n recorded_query: %q\n answer_id: %s\n answer_corpus: %s\n recorded_score: %.2f",
|
|
query, hit.Entry.QueryText, hit.Entry.AnswerID, hit.Entry.AnswerCorpus, hit.Entry.Score)
|
|
|
|
body, _ := json.Marshal(map[string]any{
|
|
"model": g.Model,
|
|
"stream": false,
|
|
"format": "json",
|
|
"messages": []map[string]string{
|
|
{"role": "system", "content": system},
|
|
{"role": "user", "content": user},
|
|
},
|
|
"options": map[string]any{"temperature": 0},
|
|
})
|
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer cancel()
|
|
req, err := http.NewRequestWithContext(ctx, "POST", g.URL, bytes.NewReader(body))
|
|
if err != nil {
|
|
slog.Warn("matrix.judge: build request", "err", err)
|
|
return 0
|
|
}
|
|
req.Header.Set("Content-Type", "application/json")
|
|
resp, err := g.HTTPClient.Do(req)
|
|
if err != nil {
|
|
slog.Warn("matrix.judge: HTTP", "err", err, "url", g.URL)
|
|
return 0
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode/100 != 2 {
|
|
slog.Warn("matrix.judge: non-2xx", "status", resp.StatusCode, "url", g.URL)
|
|
return 0
|
|
}
|
|
rb, _ := io.ReadAll(resp.Body)
|
|
var ollamaResp struct {
|
|
Message struct {
|
|
Content string `json:"content"`
|
|
} `json:"message"`
|
|
}
|
|
if err := json.Unmarshal(rb, &ollamaResp); err != nil {
|
|
slog.Warn("matrix.judge: decode envelope", "err", err)
|
|
return 0
|
|
}
|
|
var v struct {
|
|
Rating int `json:"rating"`
|
|
}
|
|
// Some chat endpoints wrap content in markdown code fences even
|
|
// with format=json. Strip leading/trailing whitespace + fences.
|
|
content := strings.TrimSpace(ollamaResp.Message.Content)
|
|
content = strings.TrimPrefix(content, "```json")
|
|
content = strings.TrimPrefix(content, "```")
|
|
content = strings.TrimSuffix(content, "```")
|
|
content = strings.TrimSpace(content)
|
|
if err := json.Unmarshal([]byte(content), &v); err != nil {
|
|
slog.Warn("matrix.judge: decode rating", "err", err, "content", content)
|
|
return 0
|
|
}
|
|
if v.Rating < 1 || v.Rating > 5 {
|
|
return 0
|
|
}
|
|
return v.Rating
|
|
}
|