F: drift quantification — scorer drift first

PRD's 5-loop substrate names "drift" as loop 5: quantify when
historical decisions stop matching current reality. Distinct from
the rating+distillation loop because drift is MEASUREMENT, not
LEARNING. The learning loop says "this match worked, remember it";
the drift loop says "this 4-month-old playbook entry — does it
still match what the substrate would surface today?"

First-shipped drift shape: SCORER drift. When the deterministic
scorer's ScorerVersion bumps, historical ScoredRuns may no longer
match what the current scorer produces on the same EvidenceRecord.

internal/drift/drift.go:
  - ScorerDriftInput  — (EvidenceRecord, persisted_category) pair
  - ScorerDriftEntry  — one mismatch with current reasons attached
  - CategoryShift     — (from, to, count) cell in the shift matrix
  - ScorerDriftReport — summary + sorted shift matrix + optional entries
  - ComputeScorerDrift(inputs, includeEntries) — pure function;
    re-runs ScoreRecord over each input and reports mismatches

Why this matters: without a drift quantifier, a scorer-rule change
silently invalidates the historical training data feeding the
learning loop. With drift quantification, a rule change surfaces
a concrete number ("847 of 4701 historical ScoredRuns now
disagree") that triggers a re-score-and-retrain cycle rather than
letting the substrate quietly rot.
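Because the report is machine-readable, that trigger is easy to automate downstream. A minimal sketch — the struct below mirrors the JSON field names from ScorerDriftReport's tags, and the threshold policy is an assumption, not something this commit ships:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// driftReport mirrors ScorerDriftReport's JSON shape (field names
// copied from the struct tags in internal/drift/drift.go).
type driftReport struct {
	ScorerVersion string  `json:"scorer_version"`
	TotalChecked  int     `json:"total_checked"`
	Drifted       int     `json:"drifted"`
	DriftRate     float64 `json:"drift_rate"`
}

// shouldRetrain applies a hypothetical alerting policy: trigger the
// re-score-and-retrain cycle when more than maxRate of historical
// runs disagree with the current scorer.
func shouldRetrain(raw []byte, maxRate float64) (bool, error) {
	var r driftReport
	if err := json.Unmarshal(raw, &r); err != nil {
		return false, err
	}
	return r.DriftRate > maxRate, nil
}

func main() {
	raw := []byte(`{"scorer_version":"v2","total_checked":4701,"drifted":847,"drift_rate":0.1802}`)
	retrain, err := shouldRetrain(raw, 0.05)
	if err != nil {
		panic(err)
	}
	fmt.Println(retrain) // 0.1802 > 0.05, so: true
}
```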

Tests (6/6 PASS):
  - No-drift: all 3 inputs match → 100% matched
  - Shift detected: 5 inputs, 3 drift cases, drift_rate=0.6,
    shift matrix shows accepted→partially_accepted x3
  - Multiple shifts sorted by count desc
  - includeEntries=false skips the per-mismatch list
  - Empty input → all-zero report (no division-by-zero)
  - ScorerVersion stamped on every report

Future drift shapes (deferred to follow-ups, named in package doc):
  - PLAYBOOK drift: re-run playbook queries through current
    matrix-search; recorded answer not in top-K = drift
  - EMBEDDING drift: KS-test on vector distribution at T1 vs T2
  - AUDIT BASELINE drift: matches Rust audit_baselines.jsonl
    longitudinal signal
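For the deferred EMBEDDING-drift shape, the core computation would be a two-sample Kolmogorov-Smirnov statistic. A self-contained sketch — note the KS test is one-dimensional, so the T1/T2 vectors would first be reduced to scalars (per-vector norm is one choice); that projection, like everything here, is an assumption since this loop is not in this commit:

```go
package main

import (
	"fmt"
	"sort"
)

// ksStatistic returns the two-sample Kolmogorov-Smirnov statistic:
// the maximum absolute gap between the empirical CDFs of a and b.
// 0 means the samples look identical; values near 1 mean the
// distributions have moved apart materially.
func ksStatistic(a, b []float64) float64 {
	sa := append([]float64(nil), a...)
	sb := append([]float64(nil), b...)
	sort.Float64s(sa)
	sort.Float64s(sb)
	var d float64
	i, j := 0, 0
	for i < len(sa) && j < len(sb) {
		x, y := sa[i], sb[j]
		// Advance past ties on both sides before comparing CDFs,
		// otherwise equal values inflate the gap.
		if x <= y {
			for i < len(sa) && sa[i] == x {
				i++
			}
		}
		if y <= x {
			for j < len(sb) && sb[j] == y {
				j++
			}
		}
		gap := float64(i)/float64(len(sa)) - float64(j)/float64(len(sb))
		if gap < 0 {
			gap = -gap
		}
		if gap > d {
			d = gap
		}
	}
	return d
}

func main() {
	t1 := []float64{0.9, 1.0, 1.1, 1.0, 0.95}      // e.g. vector norms at T1
	t2 := []float64{1.4, 1.5, 1.6, 1.5, 1.45}      // norms at T2, shifted corpus
	fmt.Printf("same=%.2f shifted=%.2f\n", ksStatistic(t1, t1), ksStatistic(t1, t2))
	// prints: same=0.00 shifted=1.00
}
```

The statistic alone is enough to threshold on; converting it to a p-value would need the KS distribution and is left out of the sketch.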

Pure compute. Materialization layer (read scored-runs jsonl + their
matching evidence jsonl + feed into ComputeScorerDrift) lands with
the distillation materialization commit.
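The shape of that materialization layer would be a join on run ID across the two JSONL streams. A hedged sketch with made-up minimal line shapes — the real EvidenceRecord and scored-run schemas live in internal/distillation, and the real layer lands in the later commit:

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"strings"
)

// Assumed minimal line shapes for illustration only; the real
// schemas are defined in internal/distillation.
type scoredLine struct {
	RunID    string `json:"run_id"`
	Category string `json:"category"`
}

type evidenceLine struct {
	RunID string `json:"run_id"`
}

// pairByRunID joins scored-run JSONL with evidence JSONL on run_id.
// In the real layer each hit would become a ScorerDriftInput{Record,
// PersistedCategory} fed to ComputeScorerDrift; here we just return
// the run_id → persisted category pairs that matched.
func pairByRunID(scoredJSONL, evidenceJSONL string) (map[string]string, error) {
	byRun := map[string]string{}
	sc := bufio.NewScanner(strings.NewReader(scoredJSONL))
	for sc.Scan() {
		var s scoredLine
		if err := json.Unmarshal(sc.Bytes(), &s); err != nil {
			return nil, err
		}
		byRun[s.RunID] = s.Category
	}
	pairs := map[string]string{}
	ev := bufio.NewScanner(strings.NewReader(evidenceJSONL))
	for ev.Scan() {
		var e evidenceLine
		if err := json.Unmarshal(ev.Bytes(), &e); err != nil {
			return nil, err
		}
		if cat, ok := byRun[e.RunID]; ok {
			pairs[e.RunID] = cat
		}
	}
	return pairs, nil
}

func main() {
	scored := `{"run_id":"r1","category":"accepted"}
{"run_id":"r2","category":"rejected"}`
	evidence := `{"run_id":"r1"}
{"run_id":"r3"}`
	pairs, err := pairByRunID(scored, evidence)
	if err != nil {
		panic(err)
	}
	fmt.Println(len(pairs), pairs["r1"]) // only r1 has both halves: 1 accepted
}
```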

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
root 2026-04-29 20:06:17 -05:00
parent 57d0df125d
commit be65f85f17
2 changed files with 306 additions and 0 deletions

internal/drift/drift.go

@@ -0,0 +1,151 @@
// Package drift quantifies when historical decisions stop matching
// current reality. Per the PRD's 5-loop substrate, this is loop 5
// (drift) — distinct from the rating+distillation loop because
// drift is about MEASUREMENT, not learning. The learning loop says
// "this match worked, remember it"; the drift loop says "the
// playbook entry from 4 months ago — does it still match what the
// substrate would surface today?"
//
// First-shipped drift shape: SCORER drift. When the deterministic
// scorer's logic changes (ScorerVersion bumped), historical
// ScoredRuns may no longer match what the current scorer would
// produce on the same EvidenceRecord. ComputeScorerDrift re-runs
// the current scorer over a slice of (EvidenceRecord, persisted
// category) pairs and reports mismatches.
//
// Why this matters: the rating+distillation loop only learns
// forward. Without a drift quantifier, a scorer-rule change
// silently invalidates the historical training data feeding the
// loop. With drift quantification, a rule change surfaces a
// concrete number ("847 of 4701 historical ScoredRuns now
// disagree") that triggers a re-score-and-retrain cycle rather
// than letting the substrate quietly rot.
//
// Future drift shapes (not in this commit):
// - PLAYBOOK drift: for each playbook entry, re-run its query
// through current matrix-search; if the recorded answer is no
// longer in top-K, the world has moved.
// - EMBEDDING drift: KS-test on the distribution of embedding
// vectors at T1 vs T2; large shifts = the corpus has changed
// materially.
// - AUDIT BASELINE drift: track how PR audit verdicts shift over
// scorer/auditor versions; matches the Rust audit_baselines.jsonl
// longitudinal signal.
package drift

import (
	"sort"

	"git.agentview.dev/profit/golangLAKEHOUSE/internal/distillation"
)

// ScorerDriftEntry is one mismatch — a historical (record, category)
// pair where the current scorer disagrees with the persisted
// verdict. Reasons captures the current scorer's explanation so
// operators can see WHY the verdict changed.
type ScorerDriftEntry struct {
EvidenceRunID string `json:"evidence_run_id"`
EvidenceTaskID string `json:"evidence_task_id"`
PersistedCategory distillation.ScoreCategory `json:"persisted_category"`
CurrentCategory distillation.ScoreCategory `json:"current_category"`
CurrentReasons []string `json:"current_reasons"`
SourceFile string `json:"source_file"`
}

// CategoryShift is one cell in the drift matrix — "X persisted
// records that NOW classify as Y." e.g. "12 records that were
// 'rejected' yesterday are 'partially_accepted' today."
type CategoryShift struct {
From distillation.ScoreCategory `json:"from"`
To distillation.ScoreCategory `json:"to"`
Count int `json:"count"`
}

// ScorerDriftReport is the summary returned by ComputeScorerDrift.
// The shape is intentionally machine-readable so a downstream
// dashboard / alerting layer can threshold on Drifted / TotalChecked
// without parsing the entries list.
type ScorerDriftReport struct {
ScorerVersion string `json:"scorer_version"` // current scorer's version
TotalChecked int `json:"total_checked"`
Matched int `json:"matched"` // current == persisted
Drifted int `json:"drifted"` // current != persisted
DriftRate float64 `json:"drift_rate"` // Drifted / TotalChecked
ShiftMatrix []CategoryShift `json:"shift_matrix,omitempty"`
Entries []ScorerDriftEntry `json:"entries,omitempty"` // mismatches only
}

// ScorerDriftInput is one (record, persisted_category) pair to check.
// Caller is responsible for materializing these from disk; this
// package is pure compute.
type ScorerDriftInput struct {
Record distillation.EvidenceRecord
PersistedCategory distillation.ScoreCategory
}

// ComputeScorerDrift re-runs distillation.ScoreRecord over each
// input and reports mismatches. Pure function — no I/O. The caller
// supplies the inputs (typically by reading a directory of
// scored-runs JSONL alongside the corresponding evidence JSONL).
//
// IncludeEntries controls whether the per-mismatch detail list is
// populated. For large corpora (e.g. 4,701 fill events) the
// summary numbers may be all the caller needs; setting this to
// false avoids allocating the entries slice.
func ComputeScorerDrift(inputs []ScorerDriftInput, includeEntries bool) ScorerDriftReport {
report := ScorerDriftReport{
ScorerVersion: distillation.ScorerVersion,
TotalChecked: len(inputs),
}
shiftCounts := make(map[[2]distillation.ScoreCategory]int)
for _, in := range inputs {
out := distillation.ScoreRecord(in.Record)
if out.Category == in.PersistedCategory {
report.Matched++
continue
}
report.Drifted++
shiftCounts[[2]distillation.ScoreCategory{in.PersistedCategory, out.Category}]++
if includeEntries {
report.Entries = append(report.Entries, ScorerDriftEntry{
EvidenceRunID: in.Record.RunID,
EvidenceTaskID: in.Record.TaskID,
PersistedCategory: in.PersistedCategory,
CurrentCategory: out.Category,
CurrentReasons: out.Reasons,
SourceFile: in.Record.Provenance.SourceFile,
})
}
}
if report.TotalChecked > 0 {
report.DriftRate = float64(report.Drifted) / float64(report.TotalChecked)
}
if len(shiftCounts) > 0 {
report.ShiftMatrix = make([]CategoryShift, 0, len(shiftCounts))
for k, v := range shiftCounts {
report.ShiftMatrix = append(report.ShiftMatrix, CategoryShift{
From: k[0], To: k[1], Count: v,
})
}
// Sort: largest shifts first, then alphabetical-ish for ties.
// Stable ordering matters for downstream display and JSON
// determinism in tests.
sort.Slice(report.ShiftMatrix, func(i, j int) bool {
a, b := report.ShiftMatrix[i], report.ShiftMatrix[j]
if a.Count != b.Count {
return a.Count > b.Count
}
if a.From != b.From {
return string(a.From) < string(b.From)
}
return string(a.To) < string(b.To)
})
}
return report
}


@@ -0,0 +1,155 @@
package drift

import (
	"testing"

	"git.agentview.dev/profit/golangLAKEHOUSE/internal/distillation"
)

func mkInput(sourceFile string, persisted distillation.ScoreCategory, succ []string) ScorerDriftInput {
return ScorerDriftInput{
Record: distillation.EvidenceRecord{
RunID: "run-x",
TaskID: "task-x",
Timestamp: "2026-01-01T00:00:00Z",
SchemaVersion: distillation.EvidenceSchemaVersion,
Provenance: distillation.Provenance{
SourceFile: sourceFile,
SigHash: "abc",
RecordedAt: "2026-01-01T00:00:01Z",
},
SuccessMarkers: succ,
},
PersistedCategory: persisted,
}
}

func TestComputeScorerDrift_NoDrift(t *testing.T) {
// All inputs have persisted=accepted matching what the current
// scrum_review scorer produces on accepted_on_attempt_1.
inputs := []ScorerDriftInput{
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}),
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}),
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}),
}
r := ComputeScorerDrift(inputs, true)
if r.TotalChecked != 3 || r.Matched != 3 || r.Drifted != 0 {
t.Errorf("no-drift case: total=%d matched=%d drifted=%d",
r.TotalChecked, r.Matched, r.Drifted)
}
if r.DriftRate != 0 {
t.Errorf("drift_rate: want 0, got %v", r.DriftRate)
}
if len(r.Entries) != 0 {
t.Errorf("entries: want 0, got %d", len(r.Entries))
}
}

func TestComputeScorerDrift_ShiftDetected(t *testing.T) {
// Simulate a historical labeling where the persisted scorer
// thought attempt-2 acceptances were "accepted" but the current
// scorer (this code) categorizes them as "partially_accepted".
// Drift should fire on those.
inputs := []ScorerDriftInput{
// Match: attempt 1 → accepted (still)
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}),
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}),
// Drift: persisted thought attempt-2 was accepted, today's scorer says partial
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_3"}),
// Drift: persisted thought attempt-5 was accepted, today's scorer says partial (high-cost)
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_5"}),
}
r := ComputeScorerDrift(inputs, true)
if r.TotalChecked != 5 {
t.Errorf("total: want 5, got %d", r.TotalChecked)
}
if r.Matched != 2 {
t.Errorf("matched: want 2, got %d", r.Matched)
}
if r.Drifted != 3 {
t.Errorf("drifted: want 3, got %d", r.Drifted)
}
wantRate := 3.0 / 5.0
if r.DriftRate < wantRate-1e-9 || r.DriftRate > wantRate+1e-9 {
t.Errorf("drift_rate: want %v, got %v", wantRate, r.DriftRate)
}
if len(r.Entries) != 3 {
t.Errorf("entries: want 3 mismatches, got %d", len(r.Entries))
}
// Shift matrix should show one shift: accepted → partially_accepted, count=3
if len(r.ShiftMatrix) != 1 {
t.Errorf("shift matrix: want 1 shift, got %d (%+v)", len(r.ShiftMatrix), r.ShiftMatrix)
} else {
s := r.ShiftMatrix[0]
if s.From != distillation.CategoryAccepted ||
s.To != distillation.CategoryPartiallyAccepted ||
s.Count != 3 {
t.Errorf("shift: got %+v", s)
}
}
}

func TestComputeScorerDrift_MultipleShiftsSortedByCount(t *testing.T) {
inputs := []ScorerDriftInput{
// 3× accepted→partial
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
// 1× rejected→needs_human (no marker)
{
Record: distillation.EvidenceRecord{
RunID: "r1", TaskID: "t1",
Timestamp: "2026-01-01T00:00:00Z",
SchemaVersion: distillation.EvidenceSchemaVersion,
Provenance: distillation.Provenance{
SourceFile: "data/_kb/scrum_reviews.jsonl",
SigHash: "x", RecordedAt: "2026-01-01T00:00:01Z",
},
// no markers → needs_human_review
},
PersistedCategory: distillation.CategoryRejected,
},
}
r := ComputeScorerDrift(inputs, false)
if r.Drifted != 4 {
t.Errorf("drifted: want 4, got %d", r.Drifted)
}
if len(r.ShiftMatrix) != 2 {
t.Errorf("shift matrix: want 2 distinct shifts, got %d", len(r.ShiftMatrix))
}
// Sorted by count desc, so accepted→partial (3) before rejected→needs_human (1)
if r.ShiftMatrix[0].Count != 3 || r.ShiftMatrix[1].Count != 1 {
t.Errorf("shift order wrong: got %+v", r.ShiftMatrix)
}
}

func TestComputeScorerDrift_IncludeEntriesFalse(t *testing.T) {
inputs := []ScorerDriftInput{
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
}
r := ComputeScorerDrift(inputs, false)
if r.Drifted != 1 {
t.Errorf("drifted: want 1, got %d", r.Drifted)
}
if len(r.Entries) != 0 {
t.Errorf("entries: want 0 when includeEntries=false, got %d", len(r.Entries))
}
}

func TestComputeScorerDrift_EmptyInput(t *testing.T) {
r := ComputeScorerDrift(nil, true)
if r.TotalChecked != 0 || r.Drifted != 0 || r.Matched != 0 {
t.Errorf("empty: want all-zero, got %+v", r)
}
if r.DriftRate != 0 {
t.Errorf("drift_rate on empty: want 0, got %v", r.DriftRate)
}
}

func TestComputeScorerDrift_ScorerVersionStamped(t *testing.T) {
r := ComputeScorerDrift(nil, false)
if r.ScorerVersion != distillation.ScorerVersion {
t.Errorf("scorer_version: want %q, got %q", distillation.ScorerVersion, r.ScorerVersion)
}
}