golangLAKEHOUSE/internal/distillation/audit_baseline_test.go
root ca142b9271 distillation: audit-baselines lineage port — fully closes the OPEN #2 surface
The original OPEN #2 line called for "SFT export pipeline +
audit_baselines lineage." Commit 7bb432f shipped the SFT export.
This commit ports the audit_baselines half — the longitudinal
drift signal that distinguishes "metrics shifted because the world
changed" from "metrics shifted because we broke something."

Mirrors Rust scripts/distillation/audit_full.ts's substrate:

- LoadLastBaseline(path) reads the most recent entry from
  data/_kb/audit_baselines.jsonl. Returns (nil, nil) on missing
  file (first run), errors on truncated last line (partial-write
  detection — operators don't lose drift signal silently).
- AppendBaseline(path, baseline) appends one entry as a JSON line.
  Atomic at the line level via bufio + O_APPEND. Creates the
  parent directory if missing.
- BuildAuditDriftTable(prior, current, threshold) computes
  per-metric drift. flag values mirror Rust exactly: first_run,
  ok, warn. DefaultDriftWarnThreshold = 0.20 = Rust's 20%.
- FormatAuditDriftTable renders a fixed-width text grid for
  stdout dumps in audit-full runs.

Edge cases handled:
- Zero-baseline: prior=0 means no division — PctChange stays nil.
  current=0 → ok (no change). current>0 → warn (zero→nonzero is
  always notable, never silently fine).
- New metric in current: flagged first_run, not "0%-change".
  Operators see "this is a new signal we haven't tracked before."
- Sort: stable by metric name for deterministic JSON output and
  clean CI diffs.

Generic on metric name (vs Rust's pinned p2_evidence_rows etc.):
the Rust phase numbering doesn't translate to Go directly. The
AuditBaselineRustCompat constant pins the Rust names so operators
running both runtimes use the same labels, which makes drift
comparison meaningful across the two pipelines.

14 new tests covering: missing file, last-line-wins, blank-line
tolerance, malformed-line errors, append round-trip, append-to-
existing, schema validation, first-run, threshold boundary,
zero-baseline, new-metric-in-current, sort-by-metric stability,
formatter output rendering (flag rows and empty input).

OPEN #2's "audit_baselines lineage" half now closed. The
distillation package surface is at parity with the Rust pipeline:
scorer, scored runs, SFT export, audit baselines all available
on the Go side.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 00:11:47 -05:00

279 lines
9.5 KiB
Go

package distillation
import (
"os"
"path/filepath"
"strings"
"testing"
)
// TestLoadLastBaseline_MissingFile verifies that pointing
// LoadLastBaseline at a path that does not exist yields (nil, nil).
// A first-run pipeline has no baseline yet, and that is not an error.
func TestLoadLastBaseline_MissingFile(t *testing.T) {
	missing := filepath.Join(t.TempDir(), "nonexistent.jsonl")

	baseline, err := LoadLastBaseline(missing)
	if err != nil {
		t.Fatalf("expected nil error on missing file, got %v", err)
	}
	if baseline != nil {
		t.Errorf("expected nil baseline on missing file, got %+v", baseline)
	}
}
// TestLoadLastBaseline_LastLineWins locks the "last line is most
// recent" semantic: given a three-entry append-only log, the entry on
// the final line is the one LoadLastBaseline must return.
func TestLoadLastBaseline_LastLineWins(t *testing.T) {
	tmp := t.TempDir()
	path := filepath.Join(tmp, "baselines.jsonl")
	jsonl := `{"recorded_at":"2026-04-01T00:00:00Z","metrics":{"a":1}}
{"recorded_at":"2026-04-15T00:00:00Z","metrics":{"a":2}}
{"recorded_at":"2026-04-30T00:00:00Z","metrics":{"a":3}}
`
	if err := os.WriteFile(path, []byte(jsonl), 0o644); err != nil {
		t.Fatalf("write: %v", err)
	}
	got, err := LoadLastBaseline(path)
	if err != nil {
		t.Fatalf("LoadLastBaseline: %v", err)
	}
	// Fatalf, not Errorf: the assertions below dereference got, so a nil
	// baseline must stop the test here instead of panicking further down.
	if got == nil {
		t.Fatalf("expected last-line baseline (2026-04-30), got nil")
	}
	if got.RecordedAt != "2026-04-30T00:00:00Z" {
		t.Errorf("expected last-line baseline (2026-04-30), got %+v", got)
	}
	if got.Metrics["a"] != 3 {
		t.Errorf("expected metrics.a=3, got %d", got.Metrics["a"])
	}
}
// TestLoadLastBaseline_TolerateTrailingBlankLines: writers append
// "\n", so stray blank lines at the end of the file mustn't trigger
// malformed-JSON errors; the last non-blank entry is still returned.
func TestLoadLastBaseline_TolerateTrailingBlankLines(t *testing.T) {
	tmp := t.TempDir()
	path := filepath.Join(tmp, "baselines.jsonl")
	// The trailing blank lines are spelled with explicit escapes so the
	// fixture really contains them; blank lines inside a raw string
	// literal are easily lost to whitespace-stripping tools, which would
	// leave this test exercising nothing beyond a normal trailing newline.
	jsonl := `{"recorded_at":"2026-04-30T00:00:00Z","metrics":{"a":3}}` + "\n\n\n"
	if err := os.WriteFile(path, []byte(jsonl), 0o644); err != nil {
		t.Fatalf("write: %v", err)
	}
	got, err := LoadLastBaseline(path)
	if err != nil {
		t.Fatalf("LoadLastBaseline: %v", err)
	}
	if got == nil || got.Metrics["a"] != 3 {
		t.Errorf("expected baseline with metrics.a=3, got %+v", got)
	}
}
// TestLoadLastBaseline_MalformedLastLineErrors simulates a partial
// write (e.g. disk full) that leaves the final line truncated. The
// loader must report an error rather than quietly falling back to the
// older, stale entry, so operators don't lose drift signal.
func TestLoadLastBaseline_MalformedLastLineErrors(t *testing.T) {
	// One complete entry followed by an entry cut off mid-timestamp.
	const contents = `{"recorded_at":"2026-04-15T00:00:00Z","metrics":{"a":2}}
{"recorded_at":"2026-04-30T0`

	file := filepath.Join(t.TempDir(), "baselines.jsonl")
	if writeErr := os.WriteFile(file, []byte(contents), 0o644); writeErr != nil {
		t.Fatalf("write: %v", writeErr)
	}

	if _, err := LoadLastBaseline(file); err == nil {
		t.Errorf("expected decode error on truncated last line, got nil")
	}
}
// TestAppendBaseline_RoundTrip appends a single baseline and reads it
// back through LoadLastBaseline. This is the "first audit-full run on
// a fresh box" path.
func TestAppendBaseline_RoundTrip(t *testing.T) {
	// Nested target: the data/_kb directories are not created by the test.
	target := filepath.Join(t.TempDir(), "data", "_kb", "audit_baselines.jsonl")
	want := AuditBaseline{
		RecordedAt: "2026-05-01T12:00:00Z",
		GitCommit:  "deadbeef",
		Metrics: map[string]int64{
			"p3_accepted": 42,
			"p4_sft_rows": 17,
		},
	}

	if err := AppendBaseline(target, want); err != nil {
		t.Fatalf("AppendBaseline: %v", err)
	}

	loaded, err := LoadLastBaseline(target)
	if err != nil {
		t.Fatalf("LoadLastBaseline: %v", err)
	}
	if loaded == nil {
		t.Fatalf("expected non-nil baseline after append")
	}

	headerMatches := loaded.RecordedAt == want.RecordedAt && loaded.GitCommit == want.GitCommit
	if !headerMatches {
		t.Errorf("baseline header mismatch:\n got: %+v\n want: %+v", loaded, want)
	}

	metricsMatch := loaded.Metrics["p3_accepted"] == 42 && loaded.Metrics["p4_sft_rows"] == 17
	if !metricsMatch {
		t.Errorf("metrics roundtrip: got %+v", loaded.Metrics)
	}
}
// TestAppendBaseline_AppendsToExisting: an existing file gets exactly
// one extra line, and the prior contents are preserved verbatim.
func TestAppendBaseline_AppendsToExisting(t *testing.T) {
	tmp := t.TempDir()
	path := filepath.Join(tmp, "baselines.jsonl")
	const seed = `{"recorded_at":"2026-04-30T00:00:00Z","metrics":{"a":1}}`
	if err := os.WriteFile(path, []byte(seed+"\n"), 0o644); err != nil {
		t.Fatalf("seed: %v", err)
	}
	if err := AppendBaseline(path, AuditBaseline{
		RecordedAt: "2026-05-01T00:00:00Z",
		Metrics:    map[string]int64{"a": 2},
	}); err != nil {
		t.Fatalf("AppendBaseline: %v", err)
	}
	data, err := os.ReadFile(path)
	if err != nil {
		t.Fatalf("read: %v", err)
	}
	lines := strings.Split(strings.TrimRight(string(data), "\n"), "\n")
	// Fatalf: the per-line assertions below index into lines.
	if len(lines) != 2 {
		t.Fatalf("expected 2 lines after append, got %d", len(lines))
	}
	// A line count alone would not catch a rewrite that clobbered the
	// earlier entry, so compare the seed line byte-for-byte.
	if lines[0] != seed {
		t.Errorf("prior entry not preserved:\n got: %s\n want: %s", lines[0], seed)
	}
	if !strings.Contains(lines[1], "2026-05-01T00:00:00Z") {
		t.Errorf("appended line does not carry the new entry: %s", lines[1])
	}
}
// TestAppendBaseline_RejectsEmptyRecordedAt: schema invariant —
// every entry must carry a timestamp for ordering.
func TestAppendBaseline_RejectsEmptyRecordedAt(t *testing.T) {
tmp := t.TempDir()
path := filepath.Join(tmp, "baselines.jsonl")
err := AppendBaseline(path, AuditBaseline{Metrics: map[string]int64{"a": 1}})
if err == nil {
t.Errorf("expected error on empty RecordedAt")
}
}
// TestBuildAuditDriftTable_FirstRun: with no prior baseline, every
// metric in the table carries the first_run flag and a nil PctChange.
func TestBuildAuditDriftTable_FirstRun(t *testing.T) {
	current := map[string]int64{"a": 10, "b": 20}

	table := BuildAuditDriftTable(nil, current, 0)
	if n := len(table); n != 2 {
		t.Fatalf("expected 2 rows, got %d", n)
	}

	for i := range table {
		row := table[i]
		if row.Flag != AuditDriftFlagFirstRun {
			t.Errorf("metric %s: expected first_run flag, got %s", row.Metric, row.Flag)
		}
		if row.PctChange != nil {
			t.Errorf("metric %s: PctChange should be nil on first run", row.Metric)
		}
	}
}
// TestBuildAuditDriftTable_ThresholdBoundary pins the strict
// comparison: a change exactly equal to the threshold stays OK, and
// only a change beyond it is flagged as a warning.
func TestBuildAuditDriftTable_ThresholdBoundary(t *testing.T) {
	const threshold = 0.20
	baseline := &AuditBaseline{Metrics: map[string]int64{"a": 100, "b": 100}}
	observed := map[string]int64{
		"a": 120, // exactly +20%: sits on the threshold, must stay OK
		"b": 121, // +21%: past the threshold, must warn
	}

	index := make(map[string]AuditDriftRow)
	for _, row := range BuildAuditDriftTable(baseline, observed, threshold) {
		index[row.Metric] = row
	}

	if flag := index["a"].Flag; flag != AuditDriftFlagOK {
		t.Errorf("metric a (+20%% exactly): expected OK, got %s", flag)
	}
	if flag := index["b"].Flag; flag != AuditDriftFlagWarn {
		t.Errorf("metric b (+21%%): expected warn, got %s", flag)
	}
}
// TestBuildAuditDriftTable_ZeroBaseline covers a prior value of 0,
// where a percentage cannot be computed (division by zero). Expected:
// PctChange stays nil, 0→0 is OK, and 0→nonzero is escalated to warn
// because a metric appearing from nothing is always notable.
func TestBuildAuditDriftTable_ZeroBaseline(t *testing.T) {
	baseline := &AuditBaseline{Metrics: map[string]int64{"stayed_zero": 0, "went_nonzero": 0}}
	observed := map[string]int64{"stayed_zero": 0, "went_nonzero": 5}

	index := make(map[string]AuditDriftRow)
	for _, row := range BuildAuditDriftTable(baseline, observed, 0.20) {
		index[row.Metric] = row
	}
	stayed, went := index["stayed_zero"], index["went_nonzero"]

	if stayed.Flag != AuditDriftFlagOK {
		t.Errorf("0→0 should be OK, got %s", stayed.Flag)
	}
	if went.Flag != AuditDriftFlagWarn {
		t.Errorf("0→5 should be warn, got %s", went.Flag)
	}
	if !(stayed.PctChange == nil && went.PctChange == nil) {
		t.Errorf("zero-baseline rows must have nil PctChange (no division by zero)")
	}
}
// TestBuildAuditDriftTable_NewMetricInCurrent: a metric that exists in
// the current run but not in the prior baseline is reported as
// first_run rather than as a "0%-change" row; a metric present in both
// with the same value remains OK.
func TestBuildAuditDriftTable_NewMetricInCurrent(t *testing.T) {
	baseline := &AuditBaseline{Metrics: map[string]int64{"old_only": 5}}
	observed := map[string]int64{"old_only": 5, "brand_new": 10}

	index := make(map[string]AuditDriftRow)
	for _, row := range BuildAuditDriftTable(baseline, observed, 0) {
		index[row.Metric] = row
	}

	if flag := index["brand_new"].Flag; flag != AuditDriftFlagFirstRun {
		t.Errorf("new metric should be first_run, got %s", flag)
	}
	if flag := index["old_only"].Flag; flag != AuditDriftFlagOK {
		t.Errorf("unchanged metric should be OK, got %s", flag)
	}
}
// TestBuildAuditDriftTable_SortedByMetric: deterministic output
// requires rows ordered by metric name, so drift tables in CI runs
// diff cleanly. The input maps deliberately list the metrics out of
// alphabetical order.
func TestBuildAuditDriftTable_SortedByMetric(t *testing.T) {
	prior := &AuditBaseline{Metrics: map[string]int64{"zoo": 1, "alpha": 1, "midway": 1}}
	current := map[string]int64{"zoo": 1, "alpha": 1, "midway": 1}
	rows := BuildAuditDriftTable(prior, current, 0)
	want := []string{"alpha", "midway", "zoo"}
	// Check the row count before comparing positions: with too few rows
	// the loop below would pass without checking every name, and with too
	// many it would index past the end of want and panic.
	if len(rows) != len(want) {
		t.Fatalf("expected %d rows, got %d", len(want), len(rows))
	}
	for i, r := range rows {
		if r.Metric != want[i] {
			t.Errorf("rows[%d]: got %q, want %q", i, r.Metric, want[i])
		}
	}
}
// TestFormatAuditDriftTable_RendersFlags checks the shape of the
// rendered table without pinning every byte: the metric names, the
// flag labels and the formatted percentage must all appear in the
// output.
func TestFormatAuditDriftTable_RendersFlags(t *testing.T) {
	firstRun := AuditDriftRow{Metric: "p3_accepted", Current: 50, Flag: AuditDriftFlagFirstRun}
	warned := AuditDriftRow{
		Metric:    "p4_sft_rows",
		Baseline:  int64Ptr(10),
		Current:   13,
		PctChange: float64Ptr(0.30),
		Flag:      AuditDriftFlagWarn,
	}

	rendered := FormatAuditDriftTable([]AuditDriftRow{firstRun, warned})

	expected := [...]string{"p3_accepted", "p4_sft_rows", "first_run", "warn", "+30.0%"}
	for i := 0; i < len(expected); i++ {
		if fragment := expected[i]; !strings.Contains(rendered, fragment) {
			t.Errorf("expected %q in output:\n%s", fragment, rendered)
		}
	}
}
// TestFormatAuditDriftTable_EmptyHeader: formatting an empty (nil) row
// set must still produce a "no metrics" notice, so operators see
// something rather than blank output.
func TestFormatAuditDriftTable_EmptyHeader(t *testing.T) {
	rendered := FormatAuditDriftTable(nil)
	if found := strings.Contains(rendered, "no metrics"); !found {
		t.Errorf("expected 'no metrics' notice on empty input, got %q", rendered)
	}
}
// int64Ptr returns a pointer to a fresh copy of v. It lets test
// fixtures fill *int64 fields from a literal.
func int64Ptr(v int64) *int64 {
	p := new(int64)
	*p = v
	return p
}
// float64Ptr returns a pointer to a fresh copy of v. It lets test
// fixtures fill *float64 fields from a literal.
func float64Ptr(v float64) *float64 {
	p := new(float64)
	*p = v
	return p
}