Substantial wave addressing all 4 prior OPEN items. Three closed in full, one partially (the speculative half deliberately deferred). OPEN #1 — Periodic fresh→main index merge (FULL): - POST /v1/vectors/index/{src}/merge with {dest, clear_source} - Idempotent on re-runs (existing-in-dest items skipped) - internal/vectord/index.go: new Index.IDs() snapshot method + i.ids tracker field as canonical ID set, independent of meta map's nil-vs-{} sparseness (was a real bug — IDs() backed by meta alone missed items added with nil metadata) - 4 cmd-level integration tests (happy path drain+clear, dim mismatch, dest not found, self-merge rejection) + 1 unit test - DecodeIndex backward-compat: old envelopes restore i.ids from meta keys (best effort; new items going forward use the tracker) OPEN #2 — Distillation SFT export (SUBSTRATE): - internal/distillation/sft_export.go ports the load-bearing half: IsSftNever predicate + ListScoredRunFiles (data/scored-runs/YYYY/MM/DD walk) + LoadScoredRunsFromFile + partial ExportSft. - Synthesis (instruction/input/response generation) deferred to a separate wave — too big for this session, but the substrate makes the next wave a port-not-design exercise. - TestSftNever_PinsExpectedSet locks the contamination firewall set: if a future commit adds/removes from SftNever, this test fails — forcing the change through review. - 5 new tests; firewall fires end-to-end through the partial port. OPEN #3 — Distribution drift via PSI (FULL): - internal/drift/drift.go: ComputeDistributionDrift via Population Stability Index. Standard finance/risk metric, well-defined verdict tiers (stable < 0.10, minor 0.10–0.25, major ≥ 0.25). - Equal-width bucketing over combined min/max so neither dist falls outside; epsilon-clamping for empty buckets so log doesn't blow up. Per-bucket breakdown for drilldown. - Pairs with the existing ComputeScorerDrift: scorer drift is categorical, distribution drift is continuous. Different shapes, same package.
- 7 new tests covering identical-is-stable, hard-shift-is-major, moderate-detected-not-stable, empty-inputs-safe, all-identical-safe, bucket-counts-conserved, num-buckets-clamping. OPEN #4 — Ops nice-to-haves (PARTIAL — wall-clock done, others deferred): - (a) Real-time wall-clock for stress harness: per-phase elapsed time logged to stdout as it runs (`[stress] phase NAME starting (T+12.3s)` + `[stress] phase NAME done — 8.5s (T+20.8s)`). Output.PhaseTimings + Output.TotalElapsedMs in JSON. - (b) chatd fixture-mode S3 mock + (c) liberal-paraphrase calibration: not actioned — no fired trigger, would be speculative. Documented as deferred-until-need rather than ignored, per the project's discipline ("don't add features beyond what the task requires"). OPEN list now empty / steady-state. Future items will land as production triggers fire. Build + vet + tests green; 18 new tests across the 4 closures. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
175 lines
5.6 KiB
Go
175 lines
5.6 KiB
Go
package distillation
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
)
|
|
|
|
// TestIsSftNever_Firewall locks the contamination firewall set:
|
|
// the predicate fires for "rejected" and "needs_human_review" and
|
|
// no others. Per project_distillation_substrate.md: this is one of
|
|
// the substrate's load-bearing knobs — touching the firewall set
|
|
// requires explicit sign-off.
|
|
func TestIsSftNever_Firewall(t *testing.T) {
|
|
mustBlock := []ScoreCategory{
|
|
CategoryRejected,
|
|
CategoryNeedsHumanReview,
|
|
}
|
|
for _, c := range mustBlock {
|
|
if !IsSftNever(c) {
|
|
t.Errorf("firewall must block %q", c)
|
|
}
|
|
}
|
|
// Anything else should NOT be blocked. Read every category
|
|
// constant in this package and assert non-blocked unless it's
|
|
// in mustBlock.
|
|
allKnown := []ScoreCategory{
|
|
CategoryAccepted,
|
|
CategoryPartiallyAccepted,
|
|
CategoryRejected,
|
|
CategoryNeedsHumanReview,
|
|
}
|
|
for _, c := range allKnown {
|
|
shouldBlock := false
|
|
for _, b := range mustBlock {
|
|
if c == b {
|
|
shouldBlock = true
|
|
break
|
|
}
|
|
}
|
|
if got := IsSftNever(c); got != shouldBlock {
|
|
t.Errorf("IsSftNever(%q) = %v, want %v", c, got, shouldBlock)
|
|
}
|
|
}
|
|
// Unknown category is NOT blocked — that's the safe default
|
|
// (operators bumping ScoreCategory enum should explicitly add
|
|
// to firewall if they want it gated).
|
|
if IsSftNever(ScoreCategory("custom_future_category")) {
|
|
t.Errorf("unknown category must not be blocked by firewall")
|
|
}
|
|
}
|
|
|
|
// TestSftNever_PinsExpectedSet locks the firewall slice contents.
|
|
// If a future commit adds or removes categories from SftNever, this
|
|
// test fails — forcing the change through review.
|
|
func TestSftNever_PinsExpectedSet(t *testing.T) {
|
|
want := map[ScoreCategory]bool{
|
|
CategoryRejected: true,
|
|
CategoryNeedsHumanReview: true,
|
|
}
|
|
if len(SftNever) != len(want) {
|
|
t.Fatalf("SftNever has %d entries, want %d (firewall set changed without review?)",
|
|
len(SftNever), len(want))
|
|
}
|
|
for _, c := range SftNever {
|
|
if !want[c] {
|
|
t.Errorf("SftNever contains %q, which is not in the expected firewall set", c)
|
|
}
|
|
}
|
|
}
|
|
|
|
// TestListScoredRunFiles_Empty: missing root → no files, no error.
|
|
// Matches Rust behavior; operators running ExportSft on a fresh box
|
|
// shouldn't see an error before any scored runs have landed.
|
|
func TestListScoredRunFiles_Empty(t *testing.T) {
|
|
tmp := t.TempDir()
|
|
files, err := ListScoredRunFiles(tmp)
|
|
if err != nil {
|
|
t.Fatalf("ListScoredRunFiles: %v", err)
|
|
}
|
|
if len(files) != 0 {
|
|
t.Errorf("empty root: expected 0 files, got %d", len(files))
|
|
}
|
|
}
|
|
|
|
// TestListScoredRunFiles_WalksYearMonthDay locks the directory walk
|
|
// pattern: data/scored-runs/YYYY/MM/DD/*.jsonl. Subset of full
|
|
// Rust-side test coverage but proves the walk visits the right
|
|
// nesting.
|
|
func TestListScoredRunFiles_WalksYearMonthDay(t *testing.T) {
|
|
tmp := t.TempDir()
|
|
// Create the expected nested structure.
|
|
dirs := []string{
|
|
filepath.Join(tmp, "data", "scored-runs", "2026", "04", "30"),
|
|
filepath.Join(tmp, "data", "scored-runs", "2026", "05", "01"),
|
|
}
|
|
for _, d := range dirs {
|
|
if err := os.MkdirAll(d, 0o755); err != nil {
|
|
t.Fatalf("mkdir: %v", err)
|
|
}
|
|
}
|
|
// Drop a JSONL in each + a non-JSONL we should skip.
|
|
for i, d := range dirs {
|
|
jsonlPath := filepath.Join(d, "run.jsonl")
|
|
if err := os.WriteFile(jsonlPath, []byte("{}\n"), 0o644); err != nil {
|
|
t.Fatalf("write %s: %v", jsonlPath, err)
|
|
}
|
|
// Non-JSONL — must be skipped.
|
|
other := filepath.Join(d, "skip.txt")
|
|
if err := os.WriteFile(other, []byte("ignore me"), 0o644); err != nil {
|
|
t.Fatalf("write %s: %v", other, err)
|
|
}
|
|
_ = i
|
|
}
|
|
files, err := ListScoredRunFiles(tmp)
|
|
if err != nil {
|
|
t.Fatalf("ListScoredRunFiles: %v", err)
|
|
}
|
|
if len(files) != 2 {
|
|
t.Errorf("expected 2 .jsonl files, got %d (%v)", len(files), files)
|
|
}
|
|
// Sort order: 2026-04-30 before 2026-05-01. Critical for audit
|
|
// baselines — the longitudinal signal depends on stable order.
|
|
if len(files) >= 2 {
|
|
if files[0] >= files[1] {
|
|
t.Errorf("files not sorted ascending: %q vs %q", files[0], files[1])
|
|
}
|
|
}
|
|
// Non-JSONL must be skipped.
|
|
for _, f := range files {
|
|
if filepath.Ext(f) != ".jsonl" {
|
|
t.Errorf("listing returned non-.jsonl: %q", f)
|
|
}
|
|
}
|
|
}
|
|
|
|
// TestExportSft_PartialPort_FirewallFires runs the partial-port
|
|
// ExportSft on a fixture with one valid + one rejected ScoredRun
|
|
// and asserts the firewall counts correctly. Locks the contamination
|
|
// guarantee at the integration layer — even before the synthesis
|
|
// half ports, the firewall protection is end-to-end testable.
|
|
func TestExportSft_PartialPort_FirewallFires(t *testing.T) {
|
|
tmp := t.TempDir()
|
|
dir := filepath.Join(tmp, "data", "scored-runs", "2026", "04", "30")
|
|
if err := os.MkdirAll(dir, 0o755); err != nil {
|
|
t.Fatalf("mkdir: %v", err)
|
|
}
|
|
// Two scored runs: one passes the firewall, one is blocked.
|
|
jsonl := `{"category":"accepted","run_id":"r1","task_id":"t1"}
|
|
{"category":"rejected","run_id":"r2","task_id":"t2"}
|
|
{"category":"partially_accepted","run_id":"r3","task_id":"t3"}
|
|
{"category":"needs_human_review","run_id":"r4","task_id":"t4"}
|
|
`
|
|
if err := os.WriteFile(filepath.Join(dir, "run.jsonl"), []byte(jsonl), 0o644); err != nil {
|
|
t.Fatalf("write: %v", err)
|
|
}
|
|
res, err := ExportSft(ExportSftOptions{
|
|
Root: tmp,
|
|
RecordedAt: "2026-04-30T00:00:00Z",
|
|
DryRun: true,
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("ExportSft: %v", err)
|
|
}
|
|
if res.RecordsRead != 4 {
|
|
t.Errorf("RecordsRead: got %d, want 4", res.RecordsRead)
|
|
}
|
|
if res.RecordsExported != 2 {
|
|
t.Errorf("RecordsExported (firewall-passing): got %d, want 2", res.RecordsExported)
|
|
}
|
|
if res.RecordsQuarantined != 2 {
|
|
t.Errorf("RecordsQuarantined (firewall-blocked): got %d, want 2", res.RecordsQuarantined)
|
|
}
|
|
}
|