root b216b7e5b6 fix the other 4: close all OPEN-list items in one wave
Substantial wave addressing all 4 prior OPEN items. Three closed in
full, one partially (the speculative half deliberately deferred).

OPEN #1 — Periodic fresh→main index merge (FULL):
- POST /v1/vectors/index/{src}/merge with {dest, clear_source}
- Idempotent on re-runs (existing-in-dest items skipped)
- internal/vectord/index.go: new Index.IDs() snapshot method +
  i.ids tracker field as canonical ID set, independent of meta
  map's nil-vs-{} sparseness (was a real bug — IDs() backed by meta
  alone missed items added with nil metadata)
- 4 cmd-level integration tests (happy path drain+clear, dim
  mismatch, dest not found, self-merge rejection) + 1 unit test
- DecodeIndex backward-compat: old envelopes restore i.ids from
  meta keys (best effort; new items going forward use the tracker)

OPEN #2 — Distillation SFT export (SUBSTRATE):
- internal/distillation/sft_export.go ports the load-bearing half:
  IsSftNever predicate + ListScoredRunFiles (data/scored-runs/YYYY/
  MM/DD walk) + LoadScoredRunsFromFile + partial ExportSft.
- Synthesis (instruction/input/response generation) deferred to a
  separate wave — too big for this session, but the substrate
  makes the next wave a port-not-design exercise.
- TestSftNever_PinsExpectedSet locks the contamination firewall
  set: if a future commit adds/removes from SftNever, this test
  fails — forcing the change through review.
- 5 new tests; firewall fires end-to-end through the partial port.

OPEN #3 — Distribution drift via PSI (FULL):
- internal/drift/drift.go: ComputeDistributionDrift via Population
  Stability Index. Standard finance/risk metric, well-defined
  verdict tiers (stable < 0.10, minor 0.10–0.25, major ≥ 0.25).
- Equal-width bucketing over combined min/max so neither dist
  falls outside; epsilon-clamping for empty buckets so log doesn't
  blow up. Per-bucket breakdown for drilldown.
- Pairs with the existing ComputeScorerDrift: scorer drift is
  categorical, distribution drift is continuous. Different shapes,
  same package.
- 7 new tests covering identical-is-stable, hard-shift-is-major,
  moderate-detected-not-stable, empty-inputs-safe, all-identical-
  safe, bucket-counts-conserved, num-buckets-clamping.

OPEN #4 — Ops nice-to-haves (PARTIAL — wall-clock done, others
deferred):
- (a) Real-time wall-clock for stress harness: per-phase elapsed
  time logged to stdout as it runs (`[stress] phase NAME starting
  (T+12.3s)` + `[stress] phase NAME done — 8.5s (T+20.8s)`).
  Output.PhaseTimings + Output.TotalElapsedMs in JSON.
- (b) chatd fixture-mode S3 mock + (c) liberal-paraphrase
  calibration: not actioned — no fired trigger, would be
  speculative. Documented as deferred-until-need rather than
  ignored, per the project's discipline ("don't add features
  beyond what the task requires").

OPEN list now empty / steady-state. Future items will land as
production triggers fire.

Build + vet + tests green; 18 new tests across the 4 closures.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 23:42:11 -05:00

175 lines
5.6 KiB
Go

package distillation
import (
"os"
"path/filepath"
"testing"
)
// TestIsSftNever_Firewall pins the behaviour of the IsSftNever
// predicate: it must report true for CategoryRejected and
// CategoryNeedsHumanReview, and false for every other category
// exercised here. Per project_distillation_substrate.md the firewall
// set is one of the substrate's load-bearing knobs, so changing it
// requires explicit sign-off.
func TestIsSftNever_Firewall(t *testing.T) {
	// Categories the firewall is required to block. Kept as a slice
	// (not only a set) so failures are reported in a stable order.
	blocked := []ScoreCategory{
		CategoryRejected,
		CategoryNeedsHumanReview,
	}
	inFirewall := make(map[ScoreCategory]bool, len(blocked))
	for _, cat := range blocked {
		inFirewall[cat] = true
		if !IsSftNever(cat) {
			t.Errorf("firewall must block %q", cat)
		}
	}

	// Hand-maintained list of the category constants this test knows
	// about. Each one must be blocked if and only if it appears in
	// the blocked list above.
	known := []ScoreCategory{
		CategoryAccepted,
		CategoryPartiallyAccepted,
		CategoryRejected,
		CategoryNeedsHumanReview,
	}
	for _, cat := range known {
		want := inFirewall[cat]
		if got := IsSftNever(cat); got != want {
			t.Errorf("IsSftNever(%q) = %v, want %v", cat, got, want)
		}
	}

	// A category the firewall has never heard of must pass through:
	// anyone extending the ScoreCategory set has to opt a new value
	// into the firewall explicitly if it should be gated.
	unknown := ScoreCategory("custom_future_category")
	if IsSftNever(unknown) {
		t.Errorf("unknown category must not be blocked by firewall")
	}
}
// TestSftNever_PinsExpectedSet locks the contents of the SftNever
// firewall list to exactly {CategoryRejected, CategoryNeedsHumanReview}.
// If a future commit adds, removes, or duplicates a category in
// SftNever, this test fails — forcing the change through review.
//
// A length check plus a membership check is not enough on its own:
// a list such as {Rejected, Rejected} has the right length and only
// expected members, yet silently drops NeedsHumanReview. The test
// therefore also rejects duplicates and verifies that every expected
// category is actually present.
func TestSftNever_PinsExpectedSet(t *testing.T) {
	// Single source of truth for the expected firewall set; the slice
	// gives deterministic ordering for the "missing" report below.
	expected := []ScoreCategory{
		CategoryRejected,
		CategoryNeedsHumanReview,
	}
	want := make(map[ScoreCategory]bool, len(expected))
	for _, c := range expected {
		want[c] = true
	}

	if len(SftNever) != len(want) {
		t.Fatalf("SftNever has %d entries, want %d (firewall set changed without review?)",
			len(SftNever), len(want))
	}

	seen := make(map[ScoreCategory]bool, len(SftNever))
	for _, c := range SftNever {
		if !want[c] {
			t.Errorf("SftNever contains %q, which is not in the expected firewall set", c)
			continue
		}
		if seen[c] {
			t.Errorf("SftNever lists %q more than once", c)
		}
		seen[c] = true
	}

	// Every expected category must have been observed at least once.
	for _, c := range expected {
		if !seen[c] {
			t.Errorf("SftNever is missing %q from the expected firewall set", c)
		}
	}
}
// TestListScoredRunFiles_Empty checks the "nothing has landed yet"
// case: ListScoredRunFiles is pointed at a fresh temporary directory
// that contains no scored-runs tree, and must return zero files and a
// nil error. The port notes describe this as matching the Rust
// behavior; the practical reason is that operators running ExportSft
// on a fresh box shouldn't see an error before any scored runs exist.
func TestListScoredRunFiles_Empty(t *testing.T) {
	root := t.TempDir()

	got, err := ListScoredRunFiles(root)
	if err != nil {
		t.Fatalf("ListScoredRunFiles: %v", err)
	}
	if n := len(got); n != 0 {
		t.Errorf("empty root: expected 0 files, got %d", n)
	}
}
// TestListScoredRunFiles_WalksYearMonthDay locks the directory walk
// pattern: <root>/data/scored-runs/YYYY/MM/DD/*.jsonl. It builds two
// day directories, drops one .jsonl and one .txt file into each, and
// asserts that exactly the two .jsonl files come back, in ascending
// order. This is a subset of the Rust-side coverage (per the port
// notes) but proves the walk visits the right nesting.
func TestListScoredRunFiles_WalksYearMonthDay(t *testing.T) {
	tmp := t.TempDir()

	// Create the expected nested structure.
	dirs := []string{
		filepath.Join(tmp, "data", "scored-runs", "2026", "04", "30"),
		filepath.Join(tmp, "data", "scored-runs", "2026", "05", "01"),
	}
	for _, d := range dirs {
		if err := os.MkdirAll(d, 0o755); err != nil {
			t.Fatalf("mkdir: %v", err)
		}
	}

	// Drop a JSONL in each directory plus a non-JSONL that the
	// listing is expected to skip.
	for _, d := range dirs {
		jsonlPath := filepath.Join(d, "run.jsonl")
		if err := os.WriteFile(jsonlPath, []byte("{}\n"), 0o644); err != nil {
			t.Fatalf("write %s: %v", jsonlPath, err)
		}
		other := filepath.Join(d, "skip.txt")
		if err := os.WriteFile(other, []byte("ignore me"), 0o644); err != nil {
			t.Fatalf("write %s: %v", other, err)
		}
	}

	files, err := ListScoredRunFiles(tmp)
	if err != nil {
		t.Fatalf("ListScoredRunFiles: %v", err)
	}
	if len(files) != 2 {
		t.Errorf("expected 2 .jsonl files, got %d (%v)", len(files), files)
	}

	// Sort order: 2026-04-30 before 2026-05-01. Critical for audit
	// baselines — the longitudinal signal depends on stable order.
	// Every adjacent pair is checked so that an unexpected extra
	// entry cannot hide an ordering problem.
	for i := 1; i < len(files); i++ {
		if files[i-1] >= files[i] {
			t.Errorf("files not sorted ascending: %q vs %q", files[i-1], files[i])
		}
	}

	// Non-JSONL must be skipped.
	for _, f := range files {
		if filepath.Ext(f) != ".jsonl" {
			t.Errorf("listing returned non-.jsonl: %q", f)
		}
	}
}
// TestExportSft_PartialPort_FirewallFires runs the partial-port
// ExportSft against a fixture of four ScoredRun records — accepted,
// rejected, partially_accepted and needs_human_review — and asserts
// the resulting counters: 4 read, 2 exported (firewall-passing) and
// 2 quarantined (firewall-blocked). Locks the contamination
// guarantee at the integration layer — even before the synthesis
// half ports, the firewall protection is end-to-end testable.
func TestExportSft_PartialPort_FirewallFires(t *testing.T) {
	tmp := t.TempDir()
	dir := filepath.Join(tmp, "data", "scored-runs", "2026", "04", "30")
	if err := os.MkdirAll(dir, 0o755); err != nil {
		t.Fatalf("mkdir: %v", err)
	}

	// Four scored runs, one per line: two pass the firewall
	// (accepted, partially_accepted) and two are blocked
	// (rejected, needs_human_review).
	jsonl := `{"category":"accepted","run_id":"r1","task_id":"t1"}
{"category":"rejected","run_id":"r2","task_id":"t2"}
{"category":"partially_accepted","run_id":"r3","task_id":"t3"}
{"category":"needs_human_review","run_id":"r4","task_id":"t4"}
`
	fixture := filepath.Join(dir, "run.jsonl")
	if err := os.WriteFile(fixture, []byte(jsonl), 0o644); err != nil {
		t.Fatalf("write: %v", err)
	}

	// NOTE(review): DryRun is set; presumably this keeps ExportSft
	// from writing output — confirm against ExportSftOptions.
	res, err := ExportSft(ExportSftOptions{
		Root:       tmp,
		RecordedAt: "2026-04-30T00:00:00Z",
		DryRun:     true,
	})
	if err != nil {
		t.Fatalf("ExportSft: %v", err)
	}

	if res.RecordsRead != 4 {
		t.Errorf("RecordsRead: got %d, want 4", res.RecordsRead)
	}
	if res.RecordsExported != 2 {
		t.Errorf("RecordsExported (firewall-passing): got %d, want 2", res.RecordsExported)
	}
	if res.RecordsQuarantined != 2 {
		t.Errorf("RecordsQuarantined (firewall-blocked): got %d, want 2", res.RecordsQuarantined)
	}
}