Substantial wave addressing all 4 prior OPEN items. Three closed in full, one partially (the speculative half deliberately deferred). OPEN #1 — Periodic fresh→main index merge (FULL): - POST /v1/vectors/index/{src}/merge with {dest, clear_source} - Idempotent on re-runs (existing-in-dest items skipped) - internal/vectord/index.go: new Index.IDs() snapshot method + i.ids tracker field as canonical ID set, independent of meta map's nil-vs-{} sparseness (was a real bug — IDs() backed by meta alone missed items added with nil metadata) - 4 cmd-level integration tests (happy path drain+clear, dim mismatch, dest not found, self-merge rejection) + 1 unit test - DecodeIndex backward-compat: old envelopes restore i.ids from meta keys (best effort; new items going forward use the tracker) OPEN #2 — Distillation SFT export (SUBSTRATE): - internal/distillation/sft_export.go ports the load-bearing half: IsSftNever predicate + ListScoredRunFiles (data/scored-runs/YYYY/MM/DD walk) + LoadScoredRunsFromFile + partial ExportSft. - Synthesis (instruction/input/response generation) deferred to a separate wave — too big for this session, but the substrate makes the next wave a port-not-design exercise. - TestSftNever_PinsExpectedSet locks the contamination firewall set: if a future commit adds/removes from SftNever, this test fails — forcing the change through review. - 5 new tests; firewall fires end-to-end through the partial port. OPEN #3 — Distribution drift via PSI (FULL): - internal/drift/drift.go: ComputeDistributionDrift via Population Stability Index. Standard finance/risk metric, well-defined verdict tiers (stable < 0.10, minor 0.10–0.25, major ≥ 0.25). - Equal-width bucketing over combined min/max so neither dist falls outside; epsilon-clamping for empty buckets so log doesn't blow up. Per-bucket breakdown for drilldown. - Pairs with the existing ComputeScorerDrift: scorer drift is categorical, distribution drift is continuous. Different shapes, same package.
- 7 new tests covering identical-is-stable, hard-shift-is-major, moderate-detected-not-stable, empty-inputs-safe, all-identical-safe, bucket-counts-conserved, num-buckets-clamping. OPEN #4 — Ops nice-to-haves (PARTIAL — wall-clock done, others deferred): - (a) Real-time wall-clock for stress harness: per-phase elapsed time logged to stdout as it runs (`[stress] phase NAME starting (T+12.3s)` + `[stress] phase NAME done — 8.5s (T+20.8s)`). Output.PhaseTimings + Output.TotalElapsedMs in JSON. - (b) chatd fixture-mode S3 mock + (c) liberal-paraphrase calibration: not actioned — no fired trigger, would be speculative. Documented as deferred-until-need rather than ignored. Per the project's discipline ("don't add features beyond what the task requires"). OPEN list now empty / steady-state. Future items will land as production triggers fire. Build + vet + tests green; 18 new tests across the 4 closures. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
290 lines
11 KiB
Go
290 lines
11 KiB
Go
package drift
|
||
|
||
import (
|
||
"testing"
|
||
|
||
"git.agentview.dev/profit/golangLAKEHOUSE/internal/distillation"
|
||
)
|
||
|
||
func mkInput(sourceFile string, persisted distillation.ScoreCategory, succ []string) ScorerDriftInput {
|
||
return ScorerDriftInput{
|
||
Record: distillation.EvidenceRecord{
|
||
RunID: "run-x",
|
||
TaskID: "task-x",
|
||
Timestamp: "2026-01-01T00:00:00Z",
|
||
SchemaVersion: distillation.EvidenceSchemaVersion,
|
||
Provenance: distillation.Provenance{
|
||
SourceFile: sourceFile,
|
||
SigHash: "abc",
|
||
RecordedAt: "2026-01-01T00:00:01Z",
|
||
},
|
||
SuccessMarkers: succ,
|
||
},
|
||
PersistedCategory: persisted,
|
||
}
|
||
}
|
||
|
||
func TestComputeScorerDrift_NoDrift(t *testing.T) {
|
||
// All inputs have persisted=accepted matching what the current
|
||
// scrum_review scorer produces on accepted_on_attempt_1.
|
||
inputs := []ScorerDriftInput{
|
||
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}),
|
||
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}),
|
||
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}),
|
||
}
|
||
r := ComputeScorerDrift(inputs, true)
|
||
if r.TotalChecked != 3 || r.Matched != 3 || r.Drifted != 0 {
|
||
t.Errorf("no-drift case: total=%d matched=%d drifted=%d",
|
||
r.TotalChecked, r.Matched, r.Drifted)
|
||
}
|
||
if r.DriftRate != 0 {
|
||
t.Errorf("drift_rate: want 0, got %v", r.DriftRate)
|
||
}
|
||
if len(r.Entries) != 0 {
|
||
t.Errorf("entries: want 0, got %d", len(r.Entries))
|
||
}
|
||
}
|
||
|
||
func TestComputeScorerDrift_ShiftDetected(t *testing.T) {
|
||
// Simulate a historical labeling where the persisted scorer
|
||
// thought attempt-2 acceptances were "accepted" but the current
|
||
// scorer (this code) categorizes them as "partially_accepted".
|
||
// Drift should fire on those.
|
||
inputs := []ScorerDriftInput{
|
||
// Match: attempt 1 → accepted (still)
|
||
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}),
|
||
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}),
|
||
// Drift: persisted thought attempt-2 was accepted, today's scorer says partial
|
||
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
|
||
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_3"}),
|
||
// Drift: persisted thought attempt-5 was accepted, today's scorer says partial (high-cost)
|
||
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_5"}),
|
||
}
|
||
r := ComputeScorerDrift(inputs, true)
|
||
if r.TotalChecked != 5 {
|
||
t.Errorf("total: want 5, got %d", r.TotalChecked)
|
||
}
|
||
if r.Matched != 2 {
|
||
t.Errorf("matched: want 2, got %d", r.Matched)
|
||
}
|
||
if r.Drifted != 3 {
|
||
t.Errorf("drifted: want 3, got %d", r.Drifted)
|
||
}
|
||
wantRate := 3.0 / 5.0
|
||
if r.DriftRate < wantRate-1e-9 || r.DriftRate > wantRate+1e-9 {
|
||
t.Errorf("drift_rate: want %v, got %v", wantRate, r.DriftRate)
|
||
}
|
||
if len(r.Entries) != 3 {
|
||
t.Errorf("entries: want 3 mismatches, got %d", len(r.Entries))
|
||
}
|
||
// Shift matrix should show one shift: accepted → partially_accepted, count=3
|
||
if len(r.ShiftMatrix) != 1 {
|
||
t.Errorf("shift matrix: want 1 shift, got %d (%+v)", len(r.ShiftMatrix), r.ShiftMatrix)
|
||
} else {
|
||
s := r.ShiftMatrix[0]
|
||
if s.From != distillation.CategoryAccepted ||
|
||
s.To != distillation.CategoryPartiallyAccepted ||
|
||
s.Count != 3 {
|
||
t.Errorf("shift: got %+v", s)
|
||
}
|
||
}
|
||
}
|
||
|
||
func TestComputeScorerDrift_MultipleShiftsSortedByCount(t *testing.T) {
|
||
inputs := []ScorerDriftInput{
|
||
// 3× accepted→partial
|
||
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
|
||
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
|
||
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
|
||
// 1× rejected→needs_human (no marker)
|
||
{
|
||
Record: distillation.EvidenceRecord{
|
||
RunID: "r1", TaskID: "t1",
|
||
Timestamp: "2026-01-01T00:00:00Z",
|
||
SchemaVersion: distillation.EvidenceSchemaVersion,
|
||
Provenance: distillation.Provenance{
|
||
SourceFile: "data/_kb/scrum_reviews.jsonl",
|
||
SigHash: "x", RecordedAt: "2026-01-01T00:00:01Z",
|
||
},
|
||
// no markers → needs_human_review
|
||
},
|
||
PersistedCategory: distillation.CategoryRejected,
|
||
},
|
||
}
|
||
r := ComputeScorerDrift(inputs, false)
|
||
if r.Drifted != 4 {
|
||
t.Errorf("drifted: want 4, got %d", r.Drifted)
|
||
}
|
||
if len(r.ShiftMatrix) != 2 {
|
||
t.Errorf("shift matrix: want 2 distinct shifts, got %d", len(r.ShiftMatrix))
|
||
}
|
||
// Sorted by count desc, so accepted→partial (3) before rejected→needs_human (1)
|
||
if r.ShiftMatrix[0].Count != 3 || r.ShiftMatrix[1].Count != 1 {
|
||
t.Errorf("shift order wrong: got %+v", r.ShiftMatrix)
|
||
}
|
||
}
|
||
|
||
func TestComputeScorerDrift_IncludeEntriesFalse(t *testing.T) {
|
||
inputs := []ScorerDriftInput{
|
||
mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
|
||
}
|
||
r := ComputeScorerDrift(inputs, false)
|
||
if r.Drifted != 1 {
|
||
t.Errorf("drifted: want 1, got %d", r.Drifted)
|
||
}
|
||
if len(r.Entries) != 0 {
|
||
t.Errorf("entries: want 0 when includeEntries=false, got %d", len(r.Entries))
|
||
}
|
||
}
|
||
|
||
func TestComputeScorerDrift_EmptyInput(t *testing.T) {
|
||
r := ComputeScorerDrift(nil, true)
|
||
if r.TotalChecked != 0 || r.Drifted != 0 || r.Matched != 0 {
|
||
t.Errorf("empty: want all-zero, got %+v", r)
|
||
}
|
||
if r.DriftRate != 0 {
|
||
t.Errorf("drift_rate on empty: want 0, got %v", r.DriftRate)
|
||
}
|
||
}
|
||
|
||
func TestComputeScorerDrift_ScorerVersionStamped(t *testing.T) {
|
||
r := ComputeScorerDrift(nil, false)
|
||
if r.ScorerVersion != distillation.ScorerVersion {
|
||
t.Errorf("scorer_version: want %q, got %q", distillation.ScorerVersion, r.ScorerVersion)
|
||
}
|
||
}
|
||
|
||
// ── Distribution drift (PSI) tests ────────────────────────────────
// TestDistributionDrift_IdenticalIsStable: same data on both sides
|
||
// should yield PSI ≈ 0 and tier=stable. Anchors the lower bound.
|
||
func TestDistributionDrift_IdenticalIsStable(t *testing.T) {
|
||
data := []float64{0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9}
|
||
r := ComputeDistributionDrift(DistributionDriftInput{
|
||
Baseline: data,
|
||
Current: data,
|
||
NumBuckets: 5,
|
||
})
|
||
if r.PSI > 0.001 {
|
||
t.Errorf("identical distributions: expected PSI ≈ 0, got %f", r.PSI)
|
||
}
|
||
if r.Tier != DriftTierStable {
|
||
t.Errorf("expected stable tier, got %q", r.Tier)
|
||
}
|
||
}
|
||
|
||
// TestDistributionDrift_HardShiftIsMajor: distribution moved
|
||
// completely to a different range — should yield major tier.
|
||
func TestDistributionDrift_HardShiftIsMajor(t *testing.T) {
|
||
baseline := []float64{0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4}
|
||
current := []float64{0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0}
|
||
r := ComputeDistributionDrift(DistributionDriftInput{
|
||
Baseline: baseline,
|
||
Current: current,
|
||
NumBuckets: 10,
|
||
})
|
||
if r.PSI < 0.25 {
|
||
t.Errorf("hard distribution shift: expected PSI ≥ 0.25, got %f", r.PSI)
|
||
}
|
||
if r.Tier != DriftTierMajor {
|
||
t.Errorf("expected major tier, got %q", r.Tier)
|
||
}
|
||
}
|
||
|
||
// TestDistributionDrift_DetectsModerateShift: distribution shifted
|
||
// noticeably but not catastrophically — PSI must be > 0 (some drift
|
||
// detected) and tier must NOT be stable. Whether the tier is minor
|
||
// vs major depends on bucketing granularity; we don't pin that here
|
||
// because PSI thresholds are sensitive to bucket count.
|
||
func TestDistributionDrift_DetectsModerateShift(t *testing.T) {
|
||
// Baseline: many around 0.5, some spread.
|
||
baseline := []float64{0.4, 0.45, 0.5, 0.5, 0.5, 0.5, 0.5, 0.55, 0.6, 0.6,
|
||
0.45, 0.5, 0.5, 0.55, 0.5, 0.5, 0.5, 0.55, 0.5, 0.6}
|
||
// Current: same range, slight rightward shift (still overlapping).
|
||
current := []float64{0.45, 0.5, 0.5, 0.55, 0.55, 0.55, 0.6, 0.6, 0.65, 0.7,
|
||
0.5, 0.55, 0.55, 0.55, 0.6, 0.6, 0.55, 0.6, 0.65, 0.65}
|
||
r := ComputeDistributionDrift(DistributionDriftInput{
|
||
Baseline: baseline,
|
||
Current: current,
|
||
NumBuckets: 10,
|
||
})
|
||
if r.PSI < 0.01 {
|
||
t.Errorf("moderate shift should produce PSI > 0.01, got %f", r.PSI)
|
||
}
|
||
if r.Tier == DriftTierStable {
|
||
t.Errorf("moderate shift should NOT be stable tier, got PSI=%f tier=%q", r.PSI, r.Tier)
|
||
}
|
||
}
|
||
|
||
// TestDistributionDrift_EmptyInputs: empty baseline OR current
|
||
// returns PSI=0, stable tier — caller must check N before trusting.
|
||
func TestDistributionDrift_EmptyInputs(t *testing.T) {
|
||
r := ComputeDistributionDrift(DistributionDriftInput{
|
||
Baseline: []float64{},
|
||
Current: []float64{1, 2, 3},
|
||
})
|
||
if r.PSI != 0 || r.Tier != DriftTierStable {
|
||
t.Errorf("empty baseline: expected PSI=0 stable, got psi=%f tier=%q", r.PSI, r.Tier)
|
||
}
|
||
r = ComputeDistributionDrift(DistributionDriftInput{
|
||
Baseline: []float64{1, 2, 3},
|
||
Current: []float64{},
|
||
})
|
||
if r.PSI != 0 || r.Tier != DriftTierStable {
|
||
t.Errorf("empty current: expected PSI=0 stable, got psi=%f tier=%q", r.PSI, r.Tier)
|
||
}
|
||
}
|
||
|
||
// TestDistributionDrift_AllIdenticalValues: degenerate case where
|
||
// everything's the same value (e.g., all zeros). Should not panic;
|
||
// returns stable.
|
||
func TestDistributionDrift_AllIdenticalValues(t *testing.T) {
|
||
r := ComputeDistributionDrift(DistributionDriftInput{
|
||
Baseline: []float64{0.5, 0.5, 0.5},
|
||
Current: []float64{0.5, 0.5, 0.5},
|
||
})
|
||
if r.Tier != DriftTierStable {
|
||
t.Errorf("expected stable on identical-singleton, got %q", r.Tier)
|
||
}
|
||
}
|
||
|
||
// TestDistributionDrift_BucketCounts: per-bucket counts must sum to
|
||
// the input N. If they don't, we're losing observations to bucket
|
||
// boundary issues.
|
||
func TestDistributionDrift_BucketCounts(t *testing.T) {
|
||
baseline := []float64{0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}
|
||
current := []float64{0.5, 0.5, 0.5, 0.5}
|
||
r := ComputeDistributionDrift(DistributionDriftInput{
|
||
Baseline: baseline,
|
||
Current: current,
|
||
NumBuckets: 5,
|
||
})
|
||
totalB := 0
|
||
totalC := 0
|
||
for _, b := range r.Buckets {
|
||
totalB += b.BaselineCount
|
||
totalC += b.CurrentCount
|
||
}
|
||
if totalB != len(baseline) {
|
||
t.Errorf("baseline bucket counts sum to %d, expected %d", totalB, len(baseline))
|
||
}
|
||
if totalC != len(current) {
|
||
t.Errorf("current bucket counts sum to %d, expected %d", totalC, len(current))
|
||
}
|
||
}
|
||
|
||
// TestDistributionDrift_NumBucketsClamping: 0 → default 10; > 100 → 100.
|
||
func TestDistributionDrift_NumBucketsClamping(t *testing.T) {
|
||
in := DistributionDriftInput{Baseline: []float64{1, 2}, Current: []float64{1, 2}}
|
||
in.NumBuckets = 0
|
||
r := ComputeDistributionDrift(in)
|
||
if r.NumBuckets != 10 {
|
||
t.Errorf("0 should default to 10 buckets, got %d", r.NumBuckets)
|
||
}
|
||
in.NumBuckets = 500
|
||
r = ComputeDistributionDrift(in)
|
||
if r.NumBuckets != 100 {
|
||
t.Errorf("500 should clamp to 100 buckets, got %d", r.NumBuckets)
|
||
}
|
||
}
|