// Package drift quantifies when historical decisions stop matching // current reality. Per the PRD's 5-loop substrate, this is loop 5 // (drift) — distinct from the rating+distillation loop because // drift is about MEASUREMENT, not learning. The learning loop says // "this match worked, remember it"; the drift loop says "the // playbook entry from 4 months ago — does it still match what the // substrate would surface today?" // // First-shipped drift shape: SCORER drift. When the deterministic // scorer's logic changes (ScorerVersion bumped), historical // ScoredRuns may no longer match what the current scorer would // produce on the same EvidenceRecord. ComputeScorerDrift re-runs // the current scorer over a slice of (EvidenceRecord, persisted // category) pairs and reports mismatches. // // Why this matters: the rating+distillation loop only learns // forward. Without a drift quantifier, a scorer-rule change // silently invalidates the historical training data feeding the // loop. With drift quantification, a rule change surfaces a // concrete number ("847 of 4701 historical scoredruns now // disagree") that triggers a re-score-and-retrain cycle rather // than letting the substrate quietly rot. // // Future drift shapes (not in this commit): // - PLAYBOOK drift: for each playbook entry, re-run its query // through current matrix-search; if the recorded answer is no // longer in top-K, the world has moved. // - EMBEDDING drift: KS-test on the distribution of embedding // vectors at T1 vs T2; large shifts = the corpus has changed // materially. // - AUDIT BASELINE drift: track how PR audit verdicts shift over // scorer/auditor versions; matches the Rust audit_baselines.jsonl // longitudinal signal. package drift import ( "math" "sort" "git.agentview.dev/profit/golangLAKEHOUSE/internal/distillation" ) // ScorerDriftEntry is one mismatch — a historical (record, category) // pair where the current scorer disagrees with the persisted // verdict. 
Reasons captures the current scorer's explanation so // operators can see WHY the verdict changed. type ScorerDriftEntry struct { EvidenceRunID string `json:"evidence_run_id"` EvidenceTaskID string `json:"evidence_task_id"` PersistedCategory distillation.ScoreCategory `json:"persisted_category"` CurrentCategory distillation.ScoreCategory `json:"current_category"` CurrentReasons []string `json:"current_reasons"` SourceFile string `json:"source_file"` } // CategoryShift is one cell in the drift matrix — "X persisted // records that NOW classify as Y." e.g. "12 records that were // 'rejected' yesterday are 'partially_accepted' today." type CategoryShift struct { From distillation.ScoreCategory `json:"from"` To distillation.ScoreCategory `json:"to"` Count int `json:"count"` } // ScorerDriftReport is the summary returned by ComputeScorerDrift. // The shape is intentionally machine-readable so a downstream // dashboard / alerting layer can threshold on Drifted / TotalChecked // without parsing the entries list. type ScorerDriftReport struct { ScorerVersion string `json:"scorer_version"` // current scorer's version TotalChecked int `json:"total_checked"` Matched int `json:"matched"` // current == persisted Drifted int `json:"drifted"` // current != persisted DriftRate float64 `json:"drift_rate"` // Drifted / TotalChecked ShiftMatrix []CategoryShift `json:"shift_matrix,omitempty"` Entries []ScorerDriftEntry `json:"entries,omitempty"` // mismatches only } // ScorerDriftInput is one (record, persisted_category) pair to check. // Caller is responsible for materializing these from disk; this // package is pure compute. type ScorerDriftInput struct { Record distillation.EvidenceRecord PersistedCategory distillation.ScoreCategory } // ComputeScorerDrift re-runs distillation.ScoreRecord over each // input and reports mismatches. Pure function — no I/O. 
The caller // supplies the inputs (typically by reading a directory of // scored-runs JSONL alongside the corresponding evidence JSONL). // // IncludeEntries controls whether the per-mismatch detail list is // populated. For large corpora (e.g. 4,701 fill events) the // summary numbers may be all the caller needs; setting this to // false avoids allocating the entries slice. func ComputeScorerDrift(inputs []ScorerDriftInput, includeEntries bool) ScorerDriftReport { report := ScorerDriftReport{ ScorerVersion: distillation.ScorerVersion, TotalChecked: len(inputs), } shiftCounts := make(map[[2]distillation.ScoreCategory]int) for _, in := range inputs { out := distillation.ScoreRecord(in.Record) if out.Category == in.PersistedCategory { report.Matched++ continue } report.Drifted++ shiftCounts[[2]distillation.ScoreCategory{in.PersistedCategory, out.Category}]++ if includeEntries { report.Entries = append(report.Entries, ScorerDriftEntry{ EvidenceRunID: in.Record.RunID, EvidenceTaskID: in.Record.TaskID, PersistedCategory: in.PersistedCategory, CurrentCategory: out.Category, CurrentReasons: out.Reasons, SourceFile: in.Record.Provenance.SourceFile, }) } } if report.TotalChecked > 0 { report.DriftRate = float64(report.Drifted) / float64(report.TotalChecked) } if len(shiftCounts) > 0 { report.ShiftMatrix = make([]CategoryShift, 0, len(shiftCounts)) for k, v := range shiftCounts { report.ShiftMatrix = append(report.ShiftMatrix, CategoryShift{ From: k[0], To: k[1], Count: v, }) } // Sort: largest shifts first, then alphabetical-ish for ties. // Stable ordering matters for downstream display and JSON // determinism in tests. 
sort.Slice(report.ShiftMatrix, func(i, j int) bool { a, b := report.ShiftMatrix[i], report.ShiftMatrix[j] if a.Count != b.Count { return a.Count > b.Count } if a.From != b.From { return string(a.From) < string(b.From) } return string(a.To) < string(b.To) }) } return report } // ───────────────────────────────────────────────────────────────── // Distribution drift via Population Stability Index (PSI). // // Closes OPEN #3's "concrete distribution-drift signal" — pairs with // ComputeScorerDrift's "categorical drift" to give callers a // continuous-value drift metric for things like cosine distances, // judge ratings, query top-1 distance distributions over time. // // Why PSI: standard in finance/risk for distribution-shift monitoring, // well-defined verdict tiers (< 0.1 stable, 0.1-0.25 minor, > 0.25 // major), tolerant of moderate sample sizes, no assumption about // distribution shape. Alternatives (KS-test, KL-divergence) all // have failure modes when one bucket has zero observations; PSI's // epsilon-clamping handles that gracefully. // // Math: PSI = Σᵢ (actualᵢ - expectedᵢ) × ln(actualᵢ / expectedᵢ) // where actualᵢ and expectedᵢ are the proportion of observations // falling into bucket i (zero values are clamped to a small // epsilon so the log doesn't blow up). // // First-shipped use case: comparing the distribution of cosine top-1 // distances at T0 vs T1 to detect when the embedder's behavior // against the corpus has shifted materially (e.g. after a model // upgrade or a corpus refresh). // ───────────────────────────────────────────────────────────────── // DistributionDriftTier is a verdict bucket for the PSI value. // Standard PSI thresholds from finance/risk; documented in the // function-level comment for ComputeDistributionDrift. 
type DistributionDriftTier string

const (
	DriftTierStable DistributionDriftTier = "stable" // PSI < 0.1
	DriftTierMinor  DistributionDriftTier = "minor"  // 0.1 ≤ PSI < 0.25
	DriftTierMajor  DistributionDriftTier = "major"  // PSI ≥ 0.25
)

// DistributionDriftBucket is one bucket's contribution to the PSI sum.
// Useful for drilldown — operators can see WHICH part of the
// distribution shifted (e.g. "the [0.3, 0.4) bucket gained 12% of
// observations from T0 to T1").
type DistributionDriftBucket struct {
	Lower         float64 `json:"lower"`
	Upper         float64 `json:"upper"`
	BaselineCount int     `json:"baseline_count"`
	CurrentCount  int     `json:"current_count"`
	BaselineRatio float64 `json:"baseline_ratio"` // [0, 1]
	CurrentRatio  float64 `json:"current_ratio"`  // [0, 1]
	PSIPart       float64 `json:"psi_part"`       // contribution to PSI
}

// DistributionDriftReport is the aggregate output of a PSI run.
type DistributionDriftReport struct {
	PSI        float64                   `json:"psi"`
	Tier       DistributionDriftTier     `json:"tier"`
	Buckets    []DistributionDriftBucket `json:"buckets"`
	BaselineN  int                       `json:"baseline_n"`
	CurrentN   int                       `json:"current_n"`
	NumBuckets int                       `json:"num_buckets"`
	Min        float64                   `json:"min"`
	Max        float64                   `json:"max"`
}

// DistributionDriftInput is the input shape for ComputeDistributionDrift.
// Baseline is the reference distribution (T0); Current is the new
// distribution (T1). NumBuckets defaults to 10 when zero, capped at
// 100 to keep the per-bucket math stable on small samples.
type DistributionDriftInput struct {
	Baseline   []float64
	Current    []float64
	NumBuckets int
}

// driftEpsilon clamps zero-bucket ratios so log doesn't blow up.
// 1e-4 is the standard PSI choice — represents "less than 0.01% of
// observations." Anything smaller would let one rare-bucket shift
// dominate the PSI sum unfairly.
const driftEpsilon = 1e-4

// ComputeDistributionDrift returns a PSI-based drift report between
// the baseline and current distributions. Buckets span [min, max] of
// the COMBINED data (so neither side falls outside the range) and
// are equal-WIDTH. (Equal-frequency bucketing on the baseline would
// be more robust on skewed distributions but was deliberately not
// used — see the in-body comment for the trade-off.)
//
// Empty inputs return PSI=0, tier=stable. Caller should also check
// BaselineN/CurrentN before trusting the verdict — PSI on tiny
// samples is statistical noise.
//
// Thresholds (industry-standard, citable):
//   - PSI < 0.10 → stable: distributions are operationally equivalent
//   - 0.10 ≤ PSI < 0.25 → minor: investigate but not alarming
//   - PSI ≥ 0.25 → major: distributions have shifted materially;
//     downstream decisions trained on baseline may be invalid
func ComputeDistributionDrift(in DistributionDriftInput) DistributionDriftReport {
	report := DistributionDriftReport{
		BaselineN: len(in.Baseline),
		CurrentN:  len(in.Current),
	}
	if report.BaselineN == 0 || report.CurrentN == 0 {
		// Nothing to compare — report stable rather than guessing.
		report.Tier = DriftTierStable
		return report
	}
	n := in.NumBuckets
	if n <= 0 {
		n = 10 // documented default
	}
	if n > 100 {
		n = 100 // documented cap
	}
	report.NumBuckets = n
	// Combined min/max so both distributions fit the bucket range.
	minV, maxV := in.Baseline[0], in.Baseline[0]
	for _, v := range in.Baseline {
		if v < minV {
			minV = v
		}
		if v > maxV {
			maxV = v
		}
	}
	for _, v := range in.Current {
		if v < minV {
			minV = v
		}
		if v > maxV {
			maxV = v
		}
	}
	report.Min = minV
	report.Max = maxV
	if maxV == minV {
		// All values identical — trivially stable, no meaningful PSI.
		report.Tier = DriftTierStable
		return report
	}
	// Equal-WIDTH bucketing. Equal-frequency would be more robust on
	// skewed distributions, but it complicates the math (need to
	// quantile-sort the baseline) for marginal gain on the symmetric
	// distributions PSI is most useful for. Operators with skewed
	// data can pre-bucket their inputs into normalized scores.
	width := (maxV - minV) / float64(n)
	bucketIdx := func(v float64) int {
		if v >= maxV {
			return n - 1 // right-edge inclusive
		}
		idx := int((v - minV) / width)
		if idx < 0 {
			idx = 0
		}
		if idx >= n {
			idx = n - 1 // guard float rounding at the top edge
		}
		return idx
	}
	bCounts := make([]int, n)
	cCounts := make([]int, n)
	for _, v := range in.Baseline {
		bCounts[bucketIdx(v)]++
	}
	for _, v := range in.Current {
		cCounts[bucketIdx(v)]++
	}
	report.Buckets = make([]DistributionDriftBucket, 0, n)
	bN := float64(report.BaselineN)
	cN := float64(report.CurrentN)
	psi := 0.0
	for i := 0; i < n; i++ {
		rawB := float64(bCounts[i]) / bN
		rawC := float64(cCounts[i]) / cN
		// Epsilon clamp so empty buckets don't blow up the log. The
		// clamped values feed the PSI math only; the reported ratios
		// below stay unclamped so the JSON reflects reality.
		bRatio, cRatio := rawB, rawC
		if bRatio < driftEpsilon {
			bRatio = driftEpsilon
		}
		if cRatio < driftEpsilon {
			cRatio = driftEpsilon
		}
		part := (cRatio - bRatio) * math.Log(cRatio/bRatio)
		psi += part
		report.Buckets = append(report.Buckets, DistributionDriftBucket{
			Lower:         minV + float64(i)*width,
			Upper:         minV + float64(i+1)*width,
			BaselineCount: bCounts[i],
			CurrentCount:  cCounts[i],
			BaselineRatio: rawB,
			CurrentRatio:  rawC,
			PSIPart:       part,
		})
	}
	report.PSI = psi
	switch {
	case psi < 0.10:
		report.Tier = DriftTierStable
	case psi < 0.25:
		report.Tier = DriftTierMinor
	default:
		report.Tier = DriftTierMajor
	}
	return report
}