diff --git a/internal/drift/drift.go b/internal/drift/drift.go new file mode 100644 index 0000000..b902e3e --- /dev/null +++ b/internal/drift/drift.go @@ -0,0 +1,151 @@ +// Package drift quantifies when historical decisions stop matching +// current reality. Per the PRD's 5-loop substrate, this is loop 5 +// (drift) — distinct from the rating+distillation loop because +// drift is about MEASUREMENT, not learning. The learning loop says +// "this match worked, remember it"; the drift loop says "the +// playbook entry from 4 months ago — does it still match what the +// substrate would surface today?" +// +// First-shipped drift shape: SCORER drift. When the deterministic +// scorer's logic changes (ScorerVersion bumped), historical +// ScoredRuns may no longer match what the current scorer would +// produce on the same EvidenceRecord. ComputeScorerDrift re-runs +// the current scorer over a slice of (EvidenceRecord, persisted +// category) pairs and reports mismatches. +// +// Why this matters: the rating+distillation loop only learns +// forward. Without a drift quantifier, a scorer-rule change +// silently invalidates the historical training data feeding the +// loop. With drift quantification, a rule change surfaces a +// concrete number ("847 of 4701 historical scoredruns now +// disagree") that triggers a re-score-and-retrain cycle rather +// than letting the substrate quietly rot. +// +// Future drift shapes (not in this commit): +// - PLAYBOOK drift: for each playbook entry, re-run its query +// through current matrix-search; if the recorded answer is no +// longer in top-K, the world has moved. +// - EMBEDDING drift: KS-test on the distribution of embedding +// vectors at T1 vs T2; large shifts = the corpus has changed +// materially. +// - AUDIT BASELINE drift: track how PR audit verdicts shift over +// scorer/auditor versions; matches the Rust audit_baselines.jsonl +// longitudinal signal. + +package drift + +import ( + "sort" + + "git.agentview.dev/profit/golangLAKEHOUSE/internal/distillation" +) + +// ScorerDriftEntry is one mismatch — a historical (record, category) +// pair where the current scorer disagrees with the persisted +// verdict. Reasons captures the current scorer's explanation so +// operators can see WHY the verdict changed. +type ScorerDriftEntry struct { + EvidenceRunID string `json:"evidence_run_id"` + EvidenceTaskID string `json:"evidence_task_id"` + PersistedCategory distillation.ScoreCategory `json:"persisted_category"` + CurrentCategory distillation.ScoreCategory `json:"current_category"` + CurrentReasons []string `json:"current_reasons"` + SourceFile string `json:"source_file"` +} + +// CategoryShift is one cell in the drift matrix — "X persisted +// records that NOW classify as Y." e.g. "12 records that were +// 'rejected' yesterday are 'partially_accepted' today." +type CategoryShift struct { + From distillation.ScoreCategory `json:"from"` + To distillation.ScoreCategory `json:"to"` + Count int `json:"count"` +} + +// ScorerDriftReport is the summary returned by ComputeScorerDrift. +// The shape is intentionally machine-readable so a downstream +// dashboard / alerting layer can threshold on Drifted / TotalChecked +// without parsing the entries list. +type ScorerDriftReport struct { + ScorerVersion string `json:"scorer_version"` // current scorer's version + TotalChecked int `json:"total_checked"` + Matched int `json:"matched"` // current == persisted + Drifted int `json:"drifted"` // current != persisted + DriftRate float64 `json:"drift_rate"` // Drifted / TotalChecked + ShiftMatrix []CategoryShift `json:"shift_matrix,omitempty"` + Entries []ScorerDriftEntry `json:"entries,omitempty"` // mismatches only +} + +// ScorerDriftInput is one (record, persisted_category) pair to check. +// Caller is responsible for materializing these from disk; this +// package is pure compute. +type ScorerDriftInput struct { + Record distillation.EvidenceRecord + PersistedCategory distillation.ScoreCategory +} + +// ComputeScorerDrift re-runs distillation.ScoreRecord over each +// input and reports mismatches. Pure function — no I/O. The caller +// supplies the inputs (typically by reading a directory of +// scored-runs JSONL alongside the corresponding evidence JSONL). +// +// IncludeEntries controls whether the per-mismatch detail list is +// populated. For large corpora (e.g. 4,701 fill events) the +// summary numbers may be all the caller needs; setting this to +// false avoids allocating the entries slice. +func ComputeScorerDrift(inputs []ScorerDriftInput, includeEntries bool) ScorerDriftReport { + report := ScorerDriftReport{ + ScorerVersion: distillation.ScorerVersion, + TotalChecked: len(inputs), + } + + shiftCounts := make(map[[2]distillation.ScoreCategory]int) + + for _, in := range inputs { + out := distillation.ScoreRecord(in.Record) + if out.Category == in.PersistedCategory { + report.Matched++ + continue + } + report.Drifted++ + shiftCounts[[2]distillation.ScoreCategory{in.PersistedCategory, out.Category}]++ + if includeEntries { + report.Entries = append(report.Entries, ScorerDriftEntry{ + EvidenceRunID: in.Record.RunID, + EvidenceTaskID: in.Record.TaskID, + PersistedCategory: in.PersistedCategory, + CurrentCategory: out.Category, + CurrentReasons: out.Reasons, + SourceFile: in.Record.Provenance.SourceFile, + }) + } + } + + if report.TotalChecked > 0 { + report.DriftRate = float64(report.Drifted) / float64(report.TotalChecked) + } + + if len(shiftCounts) > 0 { + report.ShiftMatrix = make([]CategoryShift, 0, len(shiftCounts)) + for k, v := range shiftCounts { + report.ShiftMatrix = append(report.ShiftMatrix, CategoryShift{ + From: k[0], To: k[1], Count: v, + }) + } + // Sort: largest shifts first, then alphabetical-ish for ties. + // Stable ordering matters for downstream display and JSON + // determinism in tests. + sort.Slice(report.ShiftMatrix, func(i, j int) bool { + a, b := report.ShiftMatrix[i], report.ShiftMatrix[j] + if a.Count != b.Count { + return a.Count > b.Count + } + if a.From != b.From { + return string(a.From) < string(b.From) + } + return string(a.To) < string(b.To) + }) + } + + return report +} diff --git a/internal/drift/drift_test.go b/internal/drift/drift_test.go new file mode 100644 index 0000000..349ce22 --- /dev/null +++ b/internal/drift/drift_test.go @@ -0,0 +1,155 @@ +package drift + +import ( + "testing" + + "git.agentview.dev/profit/golangLAKEHOUSE/internal/distillation" +) + +func mkInput(sourceFile string, persisted distillation.ScoreCategory, succ []string) ScorerDriftInput { + return ScorerDriftInput{ + Record: distillation.EvidenceRecord{ + RunID: "run-x", + TaskID: "task-x", + Timestamp: "2026-01-01T00:00:00Z", + SchemaVersion: distillation.EvidenceSchemaVersion, + Provenance: distillation.Provenance{ + SourceFile: sourceFile, + SigHash: "abc", + RecordedAt: "2026-01-01T00:00:01Z", + }, + SuccessMarkers: succ, + }, + PersistedCategory: persisted, + } +} + +func TestComputeScorerDrift_NoDrift(t *testing.T) { + // All inputs have persisted=accepted matching what the current + // scrum_review scorer produces on accepted_on_attempt_1. + inputs := []ScorerDriftInput{ + mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}), + mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}), + mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}), + } + r := ComputeScorerDrift(inputs, true) + if r.TotalChecked != 3 || r.Matched != 3 || r.Drifted != 0 { + t.Errorf("no-drift case: total=%d matched=%d drifted=%d", + r.TotalChecked, r.Matched, r.Drifted) + } + if r.DriftRate != 0 { + t.Errorf("drift_rate: want 0, got %v", r.DriftRate) + } + if len(r.Entries) != 0 { + t.Errorf("entries: want 0, got %d", len(r.Entries)) + } +} + +func TestComputeScorerDrift_ShiftDetected(t *testing.T) { + // Simulate a historical labeling where the persisted scorer + // thought attempt-2 acceptances were "accepted" but the current + // scorer (this code) categorizes them as "partially_accepted". + // Drift should fire on those. + inputs := []ScorerDriftInput{ + // Match: attempt 1 → accepted (still) + mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}), + mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}), + // Drift: persisted thought attempt-2 was accepted, today's scorer says partial + mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}), + mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_3"}), + // Drift: persisted thought attempt-5 was accepted, today's scorer says partial (high-cost) + mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_5"}), + } + r := ComputeScorerDrift(inputs, true) + if r.TotalChecked != 5 { + t.Errorf("total: want 5, got %d", r.TotalChecked) + } + if r.Matched != 2 { + t.Errorf("matched: want 2, got %d", r.Matched) + } + if r.Drifted != 3 { + t.Errorf("drifted: want 3, got %d", r.Drifted) + } + wantRate := 3.0 / 5.0 + if r.DriftRate < wantRate-1e-9 || r.DriftRate > wantRate+1e-9 { + t.Errorf("drift_rate: want %v, got %v", wantRate, r.DriftRate) + } + if len(r.Entries) != 3 { + t.Errorf("entries: want 3 mismatches, got %d", len(r.Entries)) + } + // Shift matrix should show one shift: accepted → partially_accepted, count=3 + if len(r.ShiftMatrix) != 1 { + t.Errorf("shift matrix: want 1 shift, got %d (%+v)", len(r.ShiftMatrix), r.ShiftMatrix) + } else { + s := r.ShiftMatrix[0] + if s.From != distillation.CategoryAccepted || + s.To != distillation.CategoryPartiallyAccepted || + s.Count != 3 { + t.Errorf("shift: got %+v", s) + } + } +} + +func TestComputeScorerDrift_MultipleShiftsSortedByCount(t *testing.T) { + inputs := []ScorerDriftInput{ + // 3× accepted→partial + mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}), + mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}), + mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}), + // 1× rejected→needs_human (no marker) + { + Record: distillation.EvidenceRecord{ + RunID: "r1", TaskID: "t1", + Timestamp: "2026-01-01T00:00:00Z", + SchemaVersion: distillation.EvidenceSchemaVersion, + Provenance: distillation.Provenance{ + SourceFile: "data/_kb/scrum_reviews.jsonl", + SigHash: "x", RecordedAt: "2026-01-01T00:00:01Z", + }, + // no markers → needs_human_review + }, + PersistedCategory: distillation.CategoryRejected, + }, + } + r := ComputeScorerDrift(inputs, false) + if r.Drifted != 4 { + t.Errorf("drifted: want 4, got %d", r.Drifted) + } + if len(r.ShiftMatrix) != 2 { + t.Errorf("shift matrix: want 2 distinct shifts, got %d", len(r.ShiftMatrix)) + } + // Sorted by count desc, so accepted→partial (3) before rejected→needs_human (1) + if r.ShiftMatrix[0].Count != 3 || r.ShiftMatrix[1].Count != 1 { + t.Errorf("shift order wrong: got %+v", r.ShiftMatrix) + } +} + +func TestComputeScorerDrift_IncludeEntriesFalse(t *testing.T) { + inputs := []ScorerDriftInput{ + mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}), + } + r := ComputeScorerDrift(inputs, false) + if r.Drifted != 1 { + t.Errorf("drifted: want 1, got %d", r.Drifted) + } + if len(r.Entries) != 0 { + t.Errorf("entries: want 0 when includeEntries=false, got %d", len(r.Entries)) + } +} + +func TestComputeScorerDrift_EmptyInput(t *testing.T) { + r := ComputeScorerDrift(nil, true) + if r.TotalChecked != 0 || r.Drifted != 0 || r.Matched != 0 { + t.Errorf("empty: want all-zero, got %+v", r) + } + if r.DriftRate != 0 { + t.Errorf("drift_rate on empty: want 0, got %v", r.DriftRate) + } +} + +func TestComputeScorerDrift_ScorerVersionStamped(t *testing.T) { + r := ComputeScorerDrift(nil, false) + if r.ScorerVersion != distillation.ScorerVersion { + t.Errorf("scorer_version: want %q, got %q", distillation.ScorerVersion, r.ScorerVersion) + } +}