root 55b8c76a8c distillation: audit-FULL pipeline port (phases 0/3/4) — cross-runtime metric parity verified
Ports the metric-collection passes from scripts/distillation/audit_full.ts.
The substrate that PRODUCES audit_baselines.jsonl entries — the
half OPEN #2 left as "deferred to next wave" after the read/write
substrate landed in ca142b9.

Phase coverage:
  Phase 0 (file presence)             ported
  Phase 1 (schema validators)         skipped (Go's `go test` covers it)
  Phase 2 (materializer dry-run)      deferred (Go materializer not yet ported)
  Phase 3 (scored-runs distribution)  ported
  Phase 4 (contamination firewall)    ported
  Phase 5 (receipts validation)       deferred (Go run-summary JSON not yet emitted)
  Phase 6 (replay sanity)             deferred (Go replay tool not ported)
  Phase 7 (run summary lineage)       deferred (same)

Cross-runtime parity verified end-to-end:
  Go-side audit-full against /home/profit/lakehouse produced
  metrics IDENTICAL to the last Rust-emitted audit_baselines.jsonl
  entry. All 8 ported metrics match byte-for-byte:
    p3_accepted=386, p3_partial=132, p3_rejected=57, p3_human=480,
    p4_sft_rows=353, p4_rag_rows=448, p4_pref_pairs=83, p4_total_quarantined=1325
  6/6 required checks pass on live data.

Components:
- internal/distillation/audit_full.go: PhaseCheck struct (mirrors
  Rust shape), PhaseCheckReport aggregation, RunAuditFull
  orchestrator, auditPhase0/3/4 implementations, FormatAuditFullReport
  Markdown writer.
- cmd/audit_full/main.go: CLI binary with -root, -out, -json,
  -append-baseline flags. Operators run "./bin/audit_full
  -append-baseline" to grow the longitudinal log alongside the
  Rust pipeline (entries are interchangeable — same envelope shape).
- 6 new tests: empty-root failure handling, full-fixture clean PASS
  (locks all 8 metrics + all 6 required checks), SFT firewall
  contamination detection, preference self-pair detection, sig_hash
  regex correctness (rejects wrong-length + uppercase), Markdown
  formatter smoke.

Live-data probe captured at reports/cutover/audit_full_go_vs_rust.md
(linked from reports/cutover/SUMMARY.md). Same shape as the
audit_baselines round-trip evidence — both Go-side ports of the
distillation surface are now validated against real Rust data, not
just fixtures.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 01:30:23 -05:00

219 lines
8.7 KiB
Go

package distillation
import (
"os"
"path/filepath"
"strings"
"testing"
)
// TestRunAuditFull_EmptyRoot: missing data directories yield
// failures on required checks but doesn't error out the run.
// Operator running on a fresh box sees the report with the
// expected "missing" actuals.
func TestRunAuditFull_EmptyRoot(t *testing.T) {
tmp := t.TempDir()
report := RunAuditFull(AuditFullOptions{Root: tmp})
if len(report.Checks) == 0 {
t.Fatalf("expected check rows even on empty root, got %d", len(report.Checks))
}
// Phase 3's "scored-runs on disk" must fail (required); the
// failure count rises by at least 1.
if report.Failed < 1 {
t.Errorf("expected ≥1 required failure on empty root, got %d", report.Failed)
}
}
// TestRunAuditFull_FullFixtureFlow seeds a complete data layout
// and verifies all phases produce the expected metrics + a clean
// PASS verdict. Locks the end-to-end orchestration.
func TestRunAuditFull_FullFixtureFlow(t *testing.T) {
tmp := t.TempDir()
// scored-runs: one accepted record (passes phase 3 required check)
scoredDir := filepath.Join(tmp, "data", "scored-runs", "2026", "05", "01")
if err := os.MkdirAll(scoredDir, 0o755); err != nil {
t.Fatalf("mkdir scored: %v", err)
}
scoredJSONL := `{"category":"accepted","evidence_run_id":"r1","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"a1b2c3d4e5f60718293a4b5c6d7e8f900112233445566778899aabbccddeeff0","recorded_at":"2026-05-01T00:00:00Z"}}
{"category":"partially_accepted","evidence_run_id":"r2","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"a1b2c3d4e5f60718293a4b5c6d7e8f900112233445566778899aabbccddeeff1","recorded_at":"2026-05-01T00:00:00Z"}}
{"category":"rejected","evidence_run_id":"r3","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"a1b2c3d4e5f60718293a4b5c6d7e8f900112233445566778899aabbccddeeff2","recorded_at":"2026-05-01T00:00:00Z"}}
`
if err := os.WriteFile(filepath.Join(scoredDir, "run.jsonl"), []byte(scoredJSONL), 0o644); err != nil {
t.Fatalf("write scored: %v", err)
}
// SFT export: only legal quality scores, valid sig_hash on every row.
sftDir := filepath.Join(tmp, "exports", "sft")
if err := os.MkdirAll(sftDir, 0o755); err != nil {
t.Fatalf("mkdir sft: %v", err)
}
sftJSONL := `{"quality_score":"accepted","provenance":{"sig_hash":"a1b2c3d4e5f60718293a4b5c6d7e8f900112233445566778899aabbccddeeff0"}}
{"quality_score":"partially_accepted","provenance":{"sig_hash":"a1b2c3d4e5f60718293a4b5c6d7e8f900112233445566778899aabbccddeeff1"}}
`
if err := os.WriteFile(filepath.Join(sftDir, "instruction_response.jsonl"), []byte(sftJSONL), 0o644); err != nil {
t.Fatalf("write sft: %v", err)
}
// RAG: no rejected leaks
ragDir := filepath.Join(tmp, "exports", "rag")
if err := os.MkdirAll(ragDir, 0o755); err != nil {
t.Fatalf("mkdir rag: %v", err)
}
ragJSONL := `{"success_score":"accepted","provenance":{"sig_hash":"a1b2c3d4e5f60718293a4b5c6d7e8f900112233445566778899aabbccddeeff0"}}
`
if err := os.WriteFile(filepath.Join(ragDir, "playbooks.jsonl"), []byte(ragJSONL), 0o644); err != nil {
t.Fatalf("write rag: %v", err)
}
// Preference: distinct chosen vs rejected, no self-pairs
prefDir := filepath.Join(tmp, "exports", "preference")
if err := os.MkdirAll(prefDir, 0o755); err != nil {
t.Fatalf("mkdir pref: %v", err)
}
prefJSONL := `{"chosen_run_id":"a","rejected_run_id":"b","chosen":"good","rejected":"bad","provenance":{"sig_hash":"a1b2c3d4e5f60718293a4b5c6d7e8f900112233445566778899aabbccddeeff0"}}
`
if err := os.WriteFile(filepath.Join(prefDir, "chosen_rejected.jsonl"), []byte(prefJSONL), 0o644); err != nil {
t.Fatalf("write pref: %v", err)
}
report := RunAuditFull(AuditFullOptions{Root: tmp})
if report.Failed != 0 {
t.Errorf("clean fixture should have 0 required failures, got %d", report.Failed)
for _, c := range report.Checks {
if c.Required && !c.Passed {
t.Logf(" failed: phase=%d name=%q actual=%q", c.Phase, c.Name, c.Actual)
}
}
}
// Metrics populated correctly
if report.Metrics["p3_accepted"] != 1 {
t.Errorf("p3_accepted: got %d, want 1", report.Metrics["p3_accepted"])
}
if report.Metrics["p3_partial"] != 1 {
t.Errorf("p3_partial: got %d, want 1", report.Metrics["p3_partial"])
}
if report.Metrics["p3_rejected"] != 1 {
t.Errorf("p3_rejected: got %d, want 1", report.Metrics["p3_rejected"])
}
if report.Metrics["p4_sft_rows"] != 2 {
t.Errorf("p4_sft_rows: got %d, want 2", report.Metrics["p4_sft_rows"])
}
if report.Metrics["p4_rag_rows"] != 1 {
t.Errorf("p4_rag_rows: got %d, want 1", report.Metrics["p4_rag_rows"])
}
if report.Metrics["p4_pref_pairs"] != 1 {
t.Errorf("p4_pref_pairs: got %d, want 1", report.Metrics["p4_pref_pairs"])
}
}
// TestPhase4_SftFirewallCatchesRejected: contamination must never
// leak into SFT export. Test seeds a row with a forbidden
// quality_score and asserts the firewall flags it.
func TestPhase4_SftFirewallCatchesRejected(t *testing.T) {
tmp := t.TempDir()
sftDir := filepath.Join(tmp, "exports", "sft")
if err := os.MkdirAll(sftDir, 0o755); err != nil {
t.Fatalf("mkdir: %v", err)
}
bad := `{"quality_score":"rejected","provenance":{"sig_hash":"a1b2c3d4e5f60718293a4b5c6d7e8f900112233445566778899aabbccddeeff0"}}
`
if err := os.WriteFile(filepath.Join(sftDir, "instruction_response.jsonl"), []byte(bad), 0o644); err != nil {
t.Fatalf("write: %v", err)
}
report := RunAuditFull(AuditFullOptions{Root: tmp})
found := false
for _, c := range report.Checks {
if c.Phase == 4 && strings.Contains(c.Name, "SFT contamination firewall") {
if c.Passed {
t.Errorf("firewall should fail on rejected SFT row, but check passed")
}
if c.Actual != "1" {
t.Errorf("firewall actual: got %q, want '1'", c.Actual)
}
found = true
}
}
if !found {
t.Errorf("firewall check not present in report")
}
}
// TestPhase4_PreferenceSelfPairCaught: same chosen + rejected run_id
// is structural noise and must be flagged.
func TestPhase4_PreferenceSelfPairCaught(t *testing.T) {
tmp := t.TempDir()
prefDir := filepath.Join(tmp, "exports", "preference")
if err := os.MkdirAll(prefDir, 0o755); err != nil {
t.Fatalf("mkdir: %v", err)
}
bad := `{"chosen_run_id":"X","rejected_run_id":"X","chosen":"a","rejected":"b","provenance":{"sig_hash":"a1b2c3d4e5f60718293a4b5c6d7e8f900112233445566778899aabbccddeeff0"}}
`
if err := os.WriteFile(filepath.Join(prefDir, "chosen_rejected.jsonl"), []byte(bad), 0o644); err != nil {
t.Fatalf("write: %v", err)
}
report := RunAuditFull(AuditFullOptions{Root: tmp})
found := false
for _, c := range report.Checks {
if c.Phase == 4 && strings.Contains(c.Name, "self-pairs") {
if c.Passed {
t.Errorf("self-pair check should fail, but passed")
}
found = true
}
}
if !found {
t.Errorf("self-pair check not present in report")
}
}
// TestPhase4_ProvenanceRequiresValidSha256: bad sig_hash must be
// flagged. Locks the regex shape — only 64-char lowercase hex.
func TestPhase4_ProvenanceRequiresValidSha256(t *testing.T) {
tmp := t.TempDir()
sftDir := filepath.Join(tmp, "exports", "sft")
if err := os.MkdirAll(sftDir, 0o755); err != nil {
t.Fatalf("mkdir: %v", err)
}
// Three rows: one valid, one wrong-length, one wrong-charset (uppercase).
bad := `{"quality_score":"accepted","provenance":{"sig_hash":"a1b2c3d4e5f60718293a4b5c6d7e8f900112233445566778899aabbccddeeff0"}}
{"quality_score":"accepted","provenance":{"sig_hash":"too_short"}}
{"quality_score":"accepted","provenance":{"sig_hash":"A1B2C3D4E5F60718293A4B5C6D7E8F900112233445566778899AABBCCDDEEFF0"}}
`
if err := os.WriteFile(filepath.Join(sftDir, "instruction_response.jsonl"), []byte(bad), 0o644); err != nil {
t.Fatalf("write: %v", err)
}
report := RunAuditFull(AuditFullOptions{Root: tmp})
for _, c := range report.Checks {
if c.Phase == 4 && strings.Contains(c.Name, "sig_hash") {
if c.Actual != "2 missing" {
t.Errorf("provenance check: got actual=%q, want '2 missing'", c.Actual)
}
if c.Passed {
t.Errorf("provenance check should fail with 2 bad sig_hashes")
}
}
}
}
// TestFormatAuditFullReport_RendersCheckTable: smoke-test the
// Markdown formatter — operators should see the right verdict +
// per-phase rows.
func TestFormatAuditFullReport_RendersCheckTable(t *testing.T) {
report := PhaseCheckReport{
GitHEAD: "deadbeef",
Checks: []PhaseCheck{
{Phase: 0, Name: "test check", Expected: "x", Actual: "x", Passed: true, Required: true},
{Phase: 4, Name: "fail check", Expected: "0", Actual: "5", Passed: false, Required: true},
},
Metrics: map[string]int64{"p3_accepted": 42, "p4_sft_rows": 17},
Failed: 1,
Skipped: 4,
}
out := FormatAuditFullReport(report)
for _, want := range []string{"FAIL", "deadbeef", "test check", "fail check", "p3_accepted", "42", "deferred"} {
if !strings.Contains(out, want) {
t.Errorf("expected %q in formatted report:\n%s", want, out)
}
}
}