root b216b7e5b6 fix the other 4: close all OPEN-list items in one wave
Substantial wave addressing all 4 prior OPEN items. Two closed in
full, two in part, with the remaining halves deliberately deferred.

OPEN #1 — Periodic fresh→main index merge (FULL):
- POST /v1/vectors/index/{src}/merge with {dest, clear_source}
- Idempotent on re-runs (existing-in-dest items skipped)
- internal/vectord/index.go: new Index.IDs() snapshot method +
  i.ids tracker field as canonical ID set, independent of meta
  map's nil-vs-{} sparseness (was a real bug — IDs() backed by meta
  alone missed items added with nil metadata; sketch below)
- 4 cmd-level integration tests (happy path drain+clear, dim
  mismatch, dest not found, self-merge rejection) + 1 unit test
- DecodeIndex backward-compat: old envelopes restore i.ids from
  meta keys (best effort; new items going forward use the tracker)
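
A minimal sketch of the shape of the fix, for reference (the locking
and anything beyond the ids/meta fields are assumptions, not the
real struct):

package vectord

import "sync"

// Sketch only: the real Index carries more state; ids is the
// tracker field added in this wave.
type Index struct {
	mu   sync.RWMutex
	ids  map[string]struct{}          // canonical ID set, updated on every add/remove
	meta map[string]map[string]string // sparse: items added with nil metadata have no entry
}

// IDs returns a snapshot of all item IDs, backed by the ids tracker
// rather than the meta map, so nil-metadata items are not missed.
func (i *Index) IDs() []string {
	i.mu.RLock()
	defer i.mu.RUnlock()
	out := make([]string, 0, len(i.ids))
	for id := range i.ids {
		out = append(out, id)
	}
	return out
}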

OPEN #2 — Distillation SFT export (SUBSTRATE):
- internal/distillation/sft_export.go ports the load-bearing half:
  IsSftNever predicate + ListScoredRunFiles (data/scored-runs/YYYY/
  MM/DD walk) + LoadScoredRunsFromFile + partial ExportSft.
- Synthesis (instruction/input/response generation) deferred to a
  separate wave — too big for this session, but the substrate
  makes the next wave a port-not-design exercise.
- TestSftNever_PinsExpectedSet locks the contamination firewall
  set: if a future commit adds/removes from SftNever, this test
  fails — forcing the change through review (pattern sketched
  below).
- 5 new tests; firewall fires end-to-end through the partial port.
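
The pinning pattern, sketched under assumptions (the map
representation and the entry below are illustrative placeholders,
not the real SftNever contents or names):

package distillation

import "testing"

// sftNeverExample is a hypothetical stand-in for the real SftNever
// firewall set; only the pin-test shape is being illustrated.
var sftNeverExample = map[string]bool{"example-contaminated-source": true}

// The real TestSftNever_PinsExpectedSet follows this shape: a
// hard-coded snapshot that must be edited (and reviewed) whenever
// the firewall set changes.
func TestSftNeverExample_PinsExpectedSet(t *testing.T) {
	want := map[string]bool{"example-contaminated-source": true}
	if len(sftNeverExample) != len(want) {
		t.Fatalf("firewall set size changed: got %d, want %d", len(sftNeverExample), len(want))
	}
	for name := range sftNeverExample {
		if !want[name] {
			t.Fatalf("unexpected firewall entry %q; route the change through review", name)
		}
	}
}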

OPEN #3 — Distribution drift via PSI (FULL):
- internal/drift/drift.go: ComputeDistributionDrift via Population
  Stability Index. Standard finance/risk metric, well-defined
  verdict tiers (stable < 0.10, minor 0.10–0.25, major ≥ 0.25).
- Equal-width bucketing over combined min/max so neither dist
  falls outside; epsilon-clamping for empty buckets so log doesn't
  blow up. Per-bucket breakdown for drilldown (PSI core sketched
  below).
- Pairs with the existing ComputeScorerDrift: scorer drift is
  categorical, distribution drift is continuous. Different shapes,
  same package.
- 7 new tests covering identical-is-stable, hard-shift-is-major,
  moderate-detected-not-stable, empty-inputs-safe, all-identical-
  safe, bucket-counts-conserved, num-buckets-clamping.
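
For reference, the PSI core as a minimal sketch (the helper name,
signature, and epsilon value are assumptions; the real code also
builds the equal-width buckets and the per-bucket breakdown):

package drift

import "math"

// psiFromCounts is an illustrative helper: given per-bucket counts
// for the baseline and current distributions, it computes the
// Population Stability Index. Empty buckets are epsilon-clamped so
// the log term stays finite.
func psiFromCounts(baselineCounts, currentCounts []int, baselineN, currentN int) float64 {
	const eps = 1e-6
	psi := 0.0
	for i := range baselineCounts {
		b := math.Max(float64(baselineCounts[i])/float64(baselineN), eps)
		c := math.Max(float64(currentCounts[i])/float64(currentN), eps)
		psi += (c - b) * math.Log(c/b)
	}
	return psi
}

This is the standard PSI form: the sum over buckets of
(current share - baseline share) * ln(current share / baseline
share), with the verdict tier read off the thresholds above.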

OPEN #4 — Ops nice-to-haves (PARTIAL — wall-clock done, others
deferred):
- (a) Real-time wall-clock for stress harness: per-phase elapsed
  time logged to stdout as it runs (`[stress] phase NAME starting
  (T+12.3s)` + `[stress] phase NAME done — 8.5s (T+20.8s)`).
  Output.PhaseTimings + Output.TotalElapsedMs in JSON. Sketch of
  the logging shape after this list.
- (b) chatd fixture-mode S3 mock + (c) liberal-paraphrase
  calibration: not actioned — no fired trigger, would be
  speculative. Documented as deferred-until-need rather than
  ignored. Per the project's discipline ("don't add features
  beyond what the task requires").
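
A rough sketch of the wall-clock logging shape for (a); the runPhase
helper and its signature are illustrative, only the log format comes
from the harness output described above:

package stress // hypothetical package name for the harness

import (
	"fmt"
	"time"
)

// runPhase is an illustrative helper: it prints wall-clock progress
// around a phase and returns the elapsed time so the caller can
// fill Output.PhaseTimings and Output.TotalElapsedMs.
func runPhase(name string, runStart time.Time, phase func()) time.Duration {
	fmt.Printf("[stress] phase %s starting (T+%.1fs)\n", name, time.Since(runStart).Seconds())
	phaseStart := time.Now()
	phase()
	elapsed := time.Since(phaseStart)
	fmt.Printf("[stress] phase %s done — %.1fs (T+%.1fs)\n", name, elapsed.Seconds(), time.Since(runStart).Seconds())
	return elapsed
}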

OPEN list now empty / steady-state. Future items will land as
production triggers fire.

Build + vet + tests green; 18 new tests across the 4 closures.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 23:42:11 -05:00


package drift

import (
	"testing"

	"git.agentview.dev/profit/golangLAKEHOUSE/internal/distillation"
)

func mkInput(sourceFile string, persisted distillation.ScoreCategory, succ []string) ScorerDriftInput {
	return ScorerDriftInput{
		Record: distillation.EvidenceRecord{
			RunID: "run-x",
			TaskID: "task-x",
			Timestamp: "2026-01-01T00:00:00Z",
			SchemaVersion: distillation.EvidenceSchemaVersion,
			Provenance: distillation.Provenance{
				SourceFile: sourceFile,
				SigHash: "abc",
				RecordedAt: "2026-01-01T00:00:01Z",
			},
			SuccessMarkers: succ,
		},
		PersistedCategory: persisted,
	}
}

func TestComputeScorerDrift_NoDrift(t *testing.T) {
	// All inputs have persisted=accepted matching what the current
	// scrum_review scorer produces on accepted_on_attempt_1.
	inputs := []ScorerDriftInput{
		mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}),
		mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}),
		mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}),
	}
	r := ComputeScorerDrift(inputs, true)
	if r.TotalChecked != 3 || r.Matched != 3 || r.Drifted != 0 {
		t.Errorf("no-drift case: total=%d matched=%d drifted=%d",
			r.TotalChecked, r.Matched, r.Drifted)
	}
	if r.DriftRate != 0 {
		t.Errorf("drift_rate: want 0, got %v", r.DriftRate)
	}
	if len(r.Entries) != 0 {
		t.Errorf("entries: want 0, got %d", len(r.Entries))
	}
}

func TestComputeScorerDrift_ShiftDetected(t *testing.T) {
	// Simulate a historical labeling where the persisted scorer
	// thought attempt-2 acceptances were "accepted" but the current
	// scorer (this code) categorizes them as "partially_accepted".
	// Drift should fire on those.
	inputs := []ScorerDriftInput{
		// Match: attempt 1 → accepted (still)
		mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}),
		mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_1"}),
		// Drift: persisted thought attempt-2 was accepted, today's scorer says partial
		mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
		mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_3"}),
		// Drift: persisted thought attempt-5 was accepted, today's scorer says partial (high-cost)
		mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_5"}),
	}
	r := ComputeScorerDrift(inputs, true)
	if r.TotalChecked != 5 {
		t.Errorf("total: want 5, got %d", r.TotalChecked)
	}
	if r.Matched != 2 {
		t.Errorf("matched: want 2, got %d", r.Matched)
	}
	if r.Drifted != 3 {
		t.Errorf("drifted: want 3, got %d", r.Drifted)
	}
	wantRate := 3.0 / 5.0
	if r.DriftRate < wantRate-1e-9 || r.DriftRate > wantRate+1e-9 {
		t.Errorf("drift_rate: want %v, got %v", wantRate, r.DriftRate)
	}
	if len(r.Entries) != 3 {
		t.Errorf("entries: want 3 mismatches, got %d", len(r.Entries))
	}
	// Shift matrix should show one shift: accepted → partially_accepted, count=3
	if len(r.ShiftMatrix) != 1 {
		t.Errorf("shift matrix: want 1 shift, got %d (%+v)", len(r.ShiftMatrix), r.ShiftMatrix)
	} else {
		s := r.ShiftMatrix[0]
		if s.From != distillation.CategoryAccepted ||
			s.To != distillation.CategoryPartiallyAccepted ||
			s.Count != 3 {
			t.Errorf("shift: got %+v", s)
		}
	}
}

func TestComputeScorerDrift_MultipleShiftsSortedByCount(t *testing.T) {
	inputs := []ScorerDriftInput{
		// 3× accepted→partial
		mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
		mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
		mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
		// 1× rejected→needs_human (no marker)
		{
			Record: distillation.EvidenceRecord{
				RunID: "r1", TaskID: "t1",
				Timestamp: "2026-01-01T00:00:00Z",
				SchemaVersion: distillation.EvidenceSchemaVersion,
				Provenance: distillation.Provenance{
					SourceFile: "data/_kb/scrum_reviews.jsonl",
					SigHash: "x", RecordedAt: "2026-01-01T00:00:01Z",
				},
				// no markers → needs_human_review
			},
			PersistedCategory: distillation.CategoryRejected,
		},
	}
	r := ComputeScorerDrift(inputs, false)
	if r.Drifted != 4 {
		t.Errorf("drifted: want 4, got %d", r.Drifted)
	}
	if len(r.ShiftMatrix) != 2 {
		t.Errorf("shift matrix: want 2 distinct shifts, got %d", len(r.ShiftMatrix))
	}
	// Sorted by count desc, so accepted→partial (3) before rejected→needs_human (1)
	if r.ShiftMatrix[0].Count != 3 || r.ShiftMatrix[1].Count != 1 {
		t.Errorf("shift order wrong: got %+v", r.ShiftMatrix)
	}
}

func TestComputeScorerDrift_IncludeEntriesFalse(t *testing.T) {
	inputs := []ScorerDriftInput{
		mkInput("data/_kb/scrum_reviews.jsonl", distillation.CategoryAccepted, []string{"accepted_on_attempt_2"}),
	}
	r := ComputeScorerDrift(inputs, false)
	if r.Drifted != 1 {
		t.Errorf("drifted: want 1, got %d", r.Drifted)
	}
	if len(r.Entries) != 0 {
		t.Errorf("entries: want 0 when includeEntries=false, got %d", len(r.Entries))
	}
}

func TestComputeScorerDrift_EmptyInput(t *testing.T) {
	r := ComputeScorerDrift(nil, true)
	if r.TotalChecked != 0 || r.Drifted != 0 || r.Matched != 0 {
		t.Errorf("empty: want all-zero, got %+v", r)
	}
	if r.DriftRate != 0 {
		t.Errorf("drift_rate on empty: want 0, got %v", r.DriftRate)
	}
}

func TestComputeScorerDrift_ScorerVersionStamped(t *testing.T) {
	r := ComputeScorerDrift(nil, false)
	if r.ScorerVersion != distillation.ScorerVersion {
		t.Errorf("scorer_version: want %q, got %q", distillation.ScorerVersion, r.ScorerVersion)
	}
}

// ── Distribution drift (PSI) tests ────────────────────────────────

// TestDistributionDrift_IdenticalIsStable: same data on both sides
// should yield PSI ≈ 0 and tier=stable. Anchors the lower bound.
func TestDistributionDrift_IdenticalIsStable(t *testing.T) {
	data := []float64{0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9}
	r := ComputeDistributionDrift(DistributionDriftInput{
		Baseline: data,
		Current: data,
		NumBuckets: 5,
	})
	if r.PSI > 0.001 {
		t.Errorf("identical distributions: expected PSI ≈ 0, got %f", r.PSI)
	}
	if r.Tier != DriftTierStable {
		t.Errorf("expected stable tier, got %q", r.Tier)
	}
}

// TestDistributionDrift_HardShiftIsMajor: distribution moved
// completely to a different range — should yield major tier.
func TestDistributionDrift_HardShiftIsMajor(t *testing.T) {
	baseline := []float64{0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4}
	current := []float64{0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0}
	r := ComputeDistributionDrift(DistributionDriftInput{
		Baseline: baseline,
		Current: current,
		NumBuckets: 10,
	})
	if r.PSI < 0.25 {
		t.Errorf("hard distribution shift: expected PSI ≥ 0.25, got %f", r.PSI)
	}
	if r.Tier != DriftTierMajor {
		t.Errorf("expected major tier, got %q", r.Tier)
	}
}

// TestDistributionDrift_DetectsModerateShift: distribution shifted
// noticeably but not catastrophically — PSI must be > 0 (some drift
// detected) and tier must NOT be stable. Whether the tier is minor
// vs major depends on bucketing granularity; we don't pin that here
// because PSI thresholds are sensitive to bucket count.
func TestDistributionDrift_DetectsModerateShift(t *testing.T) {
	// Baseline: many around 0.5, some spread.
	baseline := []float64{0.4, 0.45, 0.5, 0.5, 0.5, 0.5, 0.5, 0.55, 0.6, 0.6,
		0.45, 0.5, 0.5, 0.55, 0.5, 0.5, 0.5, 0.55, 0.5, 0.6}
	// Current: same range, slight rightward shift (still overlapping).
	current := []float64{0.45, 0.5, 0.5, 0.55, 0.55, 0.55, 0.6, 0.6, 0.65, 0.7,
		0.5, 0.55, 0.55, 0.55, 0.6, 0.6, 0.55, 0.6, 0.65, 0.65}
	r := ComputeDistributionDrift(DistributionDriftInput{
		Baseline: baseline,
		Current: current,
		NumBuckets: 10,
	})
	if r.PSI < 0.01 {
		t.Errorf("moderate shift should produce PSI > 0.01, got %f", r.PSI)
	}
	if r.Tier == DriftTierStable {
		t.Errorf("moderate shift should NOT be stable tier, got PSI=%f tier=%q", r.PSI, r.Tier)
	}
}

// TestDistributionDrift_EmptyInputs: empty baseline OR current
// returns PSI=0, stable tier — caller must check N before trusting.
func TestDistributionDrift_EmptyInputs(t *testing.T) {
	r := ComputeDistributionDrift(DistributionDriftInput{
		Baseline: []float64{},
		Current: []float64{1, 2, 3},
	})
	if r.PSI != 0 || r.Tier != DriftTierStable {
		t.Errorf("empty baseline: expected PSI=0 stable, got psi=%f tier=%q", r.PSI, r.Tier)
	}
	r = ComputeDistributionDrift(DistributionDriftInput{
		Baseline: []float64{1, 2, 3},
		Current: []float64{},
	})
	if r.PSI != 0 || r.Tier != DriftTierStable {
		t.Errorf("empty current: expected PSI=0 stable, got psi=%f tier=%q", r.PSI, r.Tier)
	}
}

// TestDistributionDrift_AllIdenticalValues: degenerate case where
// everything's the same value (e.g., all zeros). Should not panic;
// returns stable.
func TestDistributionDrift_AllIdenticalValues(t *testing.T) {
	r := ComputeDistributionDrift(DistributionDriftInput{
		Baseline: []float64{0.5, 0.5, 0.5},
		Current: []float64{0.5, 0.5, 0.5},
	})
	if r.Tier != DriftTierStable {
		t.Errorf("expected stable on identical-singleton, got %q", r.Tier)
	}
}

// TestDistributionDrift_BucketCounts: per-bucket counts must sum to
// the input N. If they don't, we're losing observations to bucket
// boundary issues.
func TestDistributionDrift_BucketCounts(t *testing.T) {
	baseline := []float64{0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}
	current := []float64{0.5, 0.5, 0.5, 0.5}
	r := ComputeDistributionDrift(DistributionDriftInput{
		Baseline: baseline,
		Current: current,
		NumBuckets: 5,
	})
	totalB := 0
	totalC := 0
	for _, b := range r.Buckets {
		totalB += b.BaselineCount
		totalC += b.CurrentCount
	}
	if totalB != len(baseline) {
		t.Errorf("baseline bucket counts sum to %d, expected %d", totalB, len(baseline))
	}
	if totalC != len(current) {
		t.Errorf("current bucket counts sum to %d, expected %d", totalC, len(current))
	}
}

// TestDistributionDrift_NumBucketsClamping: 0 → default 10; > 100 → 100.
func TestDistributionDrift_NumBucketsClamping(t *testing.T) {
	in := DistributionDriftInput{Baseline: []float64{1, 2}, Current: []float64{1, 2}}
	in.NumBuckets = 0
	r := ComputeDistributionDrift(in)
	if r.NumBuckets != 10 {
		t.Errorf("0 should default to 10 buckets, got %d", r.NumBuckets)
	}
	in.NumBuckets = 500
	r = ComputeDistributionDrift(in)
	if r.NumBuckets != 100 {
		t.Errorf("500 should clamp to 100 buckets, got %d", r.NumBuckets)
	}
}