root be65f85f17 F: drift quantification — scorer drift first
PRD's 5-loop substrate names "drift" as loop 5: quantify when
historical decisions stop matching current reality. Distinct from
the rating+distillation loop because drift is MEASUREMENT, not
LEARNING. The learning loop says "this match worked, remember it";
the drift loop says "this 4-month-old playbook entry — does it
still match what the substrate would surface today?"

First-shipped drift shape: SCORER drift. When the deterministic
scorer's ScorerVersion bumps, historical ScoredRuns may no longer
match what the current scorer produces on the same EvidenceRecord.

internal/drift/drift.go:
  - ScorerDriftInput  — (EvidenceRecord, persisted_category) pair
  - ScorerDriftEntry  — one mismatch with current reasons attached
  - CategoryShift     — (from, to, count) cell in the shift matrix
  - ScorerDriftReport — summary + sorted shift matrix + optional entries
  - ComputeScorerDrift(inputs, includeEntries) — pure function;
    re-runs ScoreRecord over each input and reports mismatches

Why this matters: without a drift quantifier, a scorer-rule change
silently invalidates the historical training data feeding the
learning loop. With drift quantification, a rule change surfaces
a concrete number ("847 of 4701 historical ScoredRuns now
disagree") that triggers a re-score-and-retrain cycle rather than
letting the substrate quietly rot.

Tests (6/6 PASS):
  - No-drift: all 3 inputs match → 100% matched
  - Shift detected: 5 inputs, 3 drift cases, drift_rate=0.6,
    shift matrix shows accepted→partially_accepted x3
  - Multiple shifts sorted by count desc
  - includeEntries=false skips the per-mismatch list
  - Empty input → all-zero report (no division-by-zero)
  - ScorerVersion stamped on every report

Future drift shapes (deferred to follow-ups, named in package doc):
  - PLAYBOOK drift: re-run playbook queries through current
    matrix-search; recorded answer not in top-K = drift
  - EMBEDDING drift: KS-test on vector distribution at T1 vs T2
  - AUDIT BASELINE drift: matches Rust audit_baselines.jsonl
    longitudinal signal

Pure compute. Materialization layer (read scored-runs jsonl + their
matching evidence jsonl + feed into ComputeScorerDrift) lands with
the distillation materialization commit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 20:06:17 -05:00

156 lines
5.7 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package drift
import (
"testing"
"git.agentview.dev/profit/golangLAKEHOUSE/internal/distillation"
)
// mkInput builds a ScorerDriftInput fixture for the drift tests. Only the
// provenance source file, the success markers, and the persisted category
// vary between callers; every other field is a fixed placeholder value.
func mkInput(sourceFile string, persisted distillation.ScoreCategory, succ []string) ScorerDriftInput {
	prov := distillation.Provenance{
		SourceFile: sourceFile,
		SigHash:    "abc",
		RecordedAt: "2026-01-01T00:00:01Z",
	}
	rec := distillation.EvidenceRecord{
		RunID:          "run-x",
		TaskID:         "task-x",
		Timestamp:      "2026-01-01T00:00:00Z",
		SchemaVersion:  distillation.EvidenceSchemaVersion,
		Provenance:     prov,
		SuccessMarkers: succ,
	}
	return ScorerDriftInput{Record: rec, PersistedCategory: persisted}
}
// TestComputeScorerDrift_NoDrift feeds three identical inputs whose persisted
// category is expected to agree with the current scorer, and asserts that the
// report counts all of them as matched, has a zero drift rate, and carries no
// per-mismatch entries.
func TestComputeScorerDrift_NoDrift(t *testing.T) {
	// Each input is persisted=accepted with the accepted_on_attempt_1 marker;
	// the test expects the current scrum_review scoring to still yield accepted.
	const copies = 3
	inputs := make([]ScorerDriftInput, 0, copies)
	for i := 0; i < copies; i++ {
		inputs = append(inputs,
			mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}))
	}

	report := ComputeScorerDrift(inputs, true)

	allMatched := report.TotalChecked == 3 && report.Matched == 3 && report.Drifted == 0
	if !allMatched {
		t.Errorf("no-drift case: total=%d matched=%d drifted=%d",
			report.TotalChecked, report.Matched, report.Drifted)
	}
	if rate := report.DriftRate; rate != 0 {
		t.Errorf("drift_rate: want 0, got %v", rate)
	}
	if n := len(report.Entries); n != 0 {
		t.Errorf("entries: want 0, got %d", n)
	}
}
// TestComputeScorerDrift_ShiftDetected simulates historical labels that marked
// later-attempt acceptances as "accepted" and expects the current scorer to
// categorize them as "partially_accepted". Of five inputs, two should match and
// three should drift, all landing in one accepted → partially_accepted cell.
func TestComputeScorerDrift_ShiftDetected(t *testing.T) {
	// Every input is persisted as accepted; only the success marker differs.
	markers := []string{
		// Expected to match: attempt 1 is still accepted.
		"accepted_on_attempt_1",
		"accepted_on_attempt_1",
		// Expected to drift: later attempts are scored as partial today.
		"accepted_on_attempt_2",
		"accepted_on_attempt_3",
		// Expected to drift: attempt 5 is scored as partial (high-cost).
		"accepted_on_attempt_5",
	}
	inputs := make([]ScorerDriftInput, 0, len(markers))
	for _, m := range markers {
		inputs = append(inputs,
			mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{m}))
	}

	report := ComputeScorerDrift(inputs, true)

	if got := report.TotalChecked; got != 5 {
		t.Errorf("total: want 5, got %d", got)
	}
	if got := report.Matched; got != 2 {
		t.Errorf("matched: want 2, got %d", got)
	}
	if got := report.Drifted; got != 3 {
		t.Errorf("drifted: want 3, got %d", got)
	}

	// Compare the rate within a small tolerance rather than exactly.
	wantRate := 3.0 / 5.0
	lo, hi := wantRate-1e-9, wantRate+1e-9
	if report.DriftRate < lo || report.DriftRate > hi {
		t.Errorf("drift_rate: want %v, got %v", wantRate, report.DriftRate)
	}

	if n := len(report.Entries); n != 3 {
		t.Errorf("entries: want 3 mismatches, got %d", n)
	}

	// The shift matrix should hold exactly one cell:
	// accepted → partially_accepted with count 3.
	if len(report.ShiftMatrix) != 1 {
		t.Errorf("shift matrix: want 1 shift, got %d (%+v)", len(report.ShiftMatrix), report.ShiftMatrix)
		return
	}
	cell := report.ShiftMatrix[0]
	expected := cell.From == distillation.CategoryAccepted &&
		cell.To == distillation.CategoryPartiallyAccepted &&
		cell.Count == 3
	if !expected {
		t.Errorf("shift: got %+v", cell)
	}
}
// TestComputeScorerDrift_MultipleShiftsSortedByCount feeds inputs that should
// produce two distinct category shifts with different counts and asserts that
// the shift matrix lists the higher-count shift first.
func TestComputeScorerDrift_MultipleShiftsSortedByCount(t *testing.T) {
	inputs := []ScorerDriftInput{
		// 3× accepted→partial
		mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
		mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
		mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
		// 1× rejected→needs_human (no marker)
		{
			Record: distillation.EvidenceRecord{
				RunID: "r1", TaskID: "t1",
				Timestamp:     "2026-01-01T00:00:00Z",
				SchemaVersion: distillation.EvidenceSchemaVersion,
				Provenance: distillation.Provenance{
					SourceFile: "data/_kb/scrum_reviews.jsonl",
					SigHash:    "x", RecordedAt: "2026-01-01T00:00:01Z",
				},
				// no markers → needs_human_review
			},
			PersistedCategory: distillation.CategoryRejected,
		},
	}
	r := ComputeScorerDrift(inputs, false)
	if r.Drifted != 4 {
		t.Errorf("drifted: want 4, got %d", r.Drifted)
	}
	// Fatal, not Error: the ordering check below indexes [0] and [1], so
	// continuing with a shorter matrix would panic with index-out-of-range
	// instead of reporting a clean test failure.
	if len(r.ShiftMatrix) != 2 {
		t.Fatalf("shift matrix: want 2 distinct shifts, got %d", len(r.ShiftMatrix))
	}
	// Sorted by count desc, so accepted→partial (3) before rejected→needs_human (1)
	if r.ShiftMatrix[0].Count != 3 || r.ShiftMatrix[1].Count != 1 {
		t.Errorf("shift order wrong: got %+v", r.ShiftMatrix)
	}
}
func TestComputeScorerDrift_IncludeEntriesFalse(t *testing.T) {
inputs := []ScorerDriftInput{
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
}
r := ComputeScorerDrift(inputs, false)
if r.Drifted != 1 {
t.Errorf("drifted: want 1, got %d", r.Drifted)
}
if len(r.Entries) != 0 {
t.Errorf("entries: want 0 when includeEntries=false, got %d", len(r.Entries))
}
}
// TestComputeScorerDrift_EmptyInput asserts that a nil input slice yields a
// report whose counters and drift rate are all zero.
func TestComputeScorerDrift_EmptyInput(t *testing.T) {
	report := ComputeScorerDrift(nil, true)

	countersZero := report.TotalChecked == 0 && report.Drifted == 0 && report.Matched == 0
	if !countersZero {
		t.Errorf("empty: want all-zero, got %+v", report)
	}
	if rate := report.DriftRate; rate != 0 {
		t.Errorf("drift_rate on empty: want 0, got %v", rate)
	}
}
// TestComputeScorerDrift_ScorerVersionStamped asserts that the report carries
// distillation.ScorerVersion even when there are no inputs.
func TestComputeScorerDrift_ScorerVersionStamped(t *testing.T) {
	stamped := ComputeScorerDrift(nil, false).ScorerVersion
	if stamped == distillation.ScorerVersion {
		return
	}
	t.Errorf("scorer_version: want %q, got %q", distillation.ScorerVersion, stamped)
}