// golangLAKEHOUSE/internal/catalogd/subject_test.go

package catalogd

import (
	"crypto/hmac"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"strings"
	"testing"
	"time"
)

// deterministicKey returns the 32-byte fixture key 0x00, 0x01, ..., 0x1f.
// It mirrors the key the Rust tests build with (0u8..32).collect(), so a
// chain written by Rust verifies under Go with the same key.
func deterministicKey() []byte {
	const keyLen = 32
	key := make([]byte, 0, keyLen)
	for b := byte(0); b < keyLen; b++ {
		key = append(key, b)
	}
	return key
}

// mkRow builds an unsigned SubjectAuditRow fixture for candidateID.
//
// fields becomes FieldsAccessed and prevHash becomes PrevChainHash. The
// accessor is fixed to kind "gateway_lookup", daemon "gateway", purpose
// "fill_validation" with an empty trace ID, and Result is always "success".
// RowHmac is left empty so the caller can compute and attach it.
//
// ts must parse as RFC 3339 (fractional seconds allowed). A malformed value
// panics rather than silently yielding a zero-time row; panicking instead of
// taking a *testing.T matches how buildEntry reports fixture errors.
func mkRow(candidateID string, fields []string, prevHash, ts string) SubjectAuditRow {
	parsed, err := time.Parse(time.RFC3339Nano, ts)
	if err != nil {
		panic("mkRow: invalid fixture timestamp " + ts + ": " + err.Error())
	}
	return SubjectAuditRow{
		Schema:      "subject_audit.v1",
		Ts:          parsed,
		CandidateID: candidateID,
		Accessor: AuditAccessor{
			Kind:    "gateway_lookup",
			Daemon:  "gateway",
			Purpose: "fill_validation",
			TraceID: "",
		},
		FieldsAccessed: fields,
		Result:         "success",
		PrevChainHash:  prevHash,
		RowHmac:        "", // filled in by the caller once the HMAC is known
	}
}

// TestCanonicalJSON_KeysSortedAlphabetically asserts the same property
// the Rust unit test asserts (subject_audit::tests::canonical_json_sorts_keys_alphabetically):
// object keys are emitted in alphabetical order at the top level and inside
// nested objects.
func TestCanonicalJSON_KeysSortedAlphabetically(t *testing.T) {
	v := map[string]any{
		"z": 1,
		"a": 2,
		"m": map[string]any{"y": 1, "b": 2},
	}
	var buf strings.Builder
	if err := writeCanonical(&buf, v); err != nil {
		t.Fatalf("canonical: %v", err)
	}
	s := buf.String()

	// pos returns the offset of the quoted key in the output and fails the
	// test when the key is absent. strings.Index reports -1 for a missing
	// key, and -1 compares less than any real offset, so without this guard
	// a dropped key could satisfy the ordering checks below vacuously.
	pos := func(key string) int {
		t.Helper()
		i := strings.Index(s, `"`+key+`"`)
		if i < 0 {
			t.Fatalf("key %q missing from canonical output: %s", key, s)
		}
		return i
	}

	a, m, z := pos("a"), pos("m"), pos("z")
	if !(a < m && m < z) {
		t.Fatalf("top-level keys out of order: %s", s)
	}
	b, y := pos("b"), pos("y")
	if !(b < y) {
		t.Fatalf("nested keys out of order: %s", s)
	}
}

// TestCanonicalJSON_ArraysPreserveOrder checks that canonicalization leaves
// array elements in their input order (only object keys get sorted). It is
// the Go counterpart of Rust's
// subject_audit::tests::canonical_json_arrays_preserve_order.
func TestCanonicalJSON_ArraysPreserveOrder(t *testing.T) {
	var out strings.Builder
	input := map[string]any{"k": []any{"c", "a", "b"}}
	if err := writeCanonical(&out, input); err != nil {
		t.Fatalf("canonical: %v", err)
	}

	// The elements must appear exactly as supplied: c, a, b.
	const want = `"c","a","b"`
	if got := out.String(); !strings.Contains(got, want) {
		t.Fatalf("array order altered: %s", got)
	}
}

// buildEntry signs row and wraps it in an AuditLogEntry.
//
// The HMAC is computed with key over prev plus the canonical bytes derived
// from the struct; the signed row is then JSON-marshalled and stored as the
// entry's Raw bytes. Any canonicalization or marshalling error panics.
//
// Test-only: production reads raw bytes straight from disk, so the
// time-precision drift of a struct re-marshal doesn't apply there.
func buildEntry(row SubjectAuditRow, key []byte, prev string) AuditLogEntry {
	canonical, canonErr := canonicalRowBytesFromStruct(&row)
	if canonErr != nil {
		panic(canonErr)
	}

	// row is a by-value copy, so signing it here does not touch the caller's row.
	row.RowHmac = computeRowHMAC(key, prev, canonical)

	encoded, encErr := json.Marshal(row)
	if encErr != nil {
		panic(encErr)
	}
	return AuditLogEntry{Row: row, Raw: encoded}
}

// TestVerifyChain_ReplaysAndReachesTip builds a three-row chain (each row's
// prev hash is the previous row's HMAC, starting from GenesisHash) and checks
// that VerifyChain reports all three rows and the last row's HMAC as the tip.
// This is the local half of the parity contract — the cross-runtime half
// (Rust writes, Go verifies) is covered by
// scripts/cutover/parity/subject_audit_parity.sh.
func TestVerifyChain_ReplaysAndReachesTip(t *testing.T) {
	key := deterministicKey()
	steps := []struct{ field, ts string }{
		{"name", "2026-05-03T12:00:00Z"},
		{"phone", "2026-05-03T12:00:01Z"},
		{"email", "2026-05-03T12:00:02Z"},
	}

	// Thread each entry's HMAC into the next row as its previous hash.
	chain := make([]AuditLogEntry, 0, len(steps))
	prev := GenesisHash
	for _, st := range steps {
		row := mkRow("CAND-PARITY", []string{st.field}, prev, st.ts)
		entry := buildEntry(row, key, prev)
		chain = append(chain, entry)
		prev = entry.Row.RowHmac
	}

	count, tip, err := VerifyChain(chain, key)
	if err != nil {
		t.Fatalf("verify failed: %v", err)
	}
	if count != 3 {
		t.Fatalf("expected 3 rows verified, got %d", count)
	}
	// After the loop, prev holds the final entry's HMAC, i.e. the expected tip.
	if tip != prev {
		t.Fatalf("chain tip wrong: tip=%s expected=%s", tip, prev)
	}
}

// TestVerifyChain_EmptyLogIsTriviallyValid checks the empty-log special case,
// mirroring Rust: verifying no entries yields a count of 0, GenesisHash as
// the tip, and no error.
func TestVerifyChain_EmptyLogIsTriviallyValid(t *testing.T) {
	count, tip, err := VerifyChain(nil, deterministicKey())

	// Cases are ordered so the first failing condition is the one reported.
	switch {
	case err != nil:
		t.Fatalf("empty log returned error: %v", err)
	case count != 0:
		t.Fatalf("expected 0 rows on empty log, got %d", count)
	case tip != GenesisHash:
		t.Fatalf("expected GENESIS tip on empty log, got %q", tip)
	}
}

// TestVerifyChain_TamperDetected rewrites the `result` value inside an
// entry's raw bytes after signing and expects VerifyChain to fail with an
// error mentioning "hmac mismatch".
func TestVerifyChain_TamperDetected(t *testing.T) {
	key := deterministicKey()
	row := mkRow("CAND-T", []string{"name"}, GenesisHash, "2026-05-03T12:00:00Z")
	entry := buildEntry(row, key, GenesisHash)

	// Tamper with the raw bytes ONLY: "success" becomes "denied". The
	// struct's row_hmac (the "stored" comparator) is left untouched.
	tampered := strings.Replace(string(entry.Raw), `"success"`, `"denied"`, 1)
	entry.Raw = []byte(tampered)

	if _, _, err := VerifyChain([]AuditLogEntry{entry}, key); err == nil {
		t.Fatal("expected hmac mismatch after tamper, got nil")
	} else if !strings.Contains(err.Error(), "hmac mismatch") {
		t.Fatalf("expected hmac mismatch, got: %v", err)
	}
}

// TestVerifyChain_BadKeyRejectsValidRows signs a row with the fixture key
// and then verifies it with a different key (32 bytes of 0xff); VerifyChain
// must return an error.
func TestVerifyChain_BadKeyRejectsValidRows(t *testing.T) {
	rightKey := deterministicKey()
	row := mkRow("CAND-BK", []string{"name"}, GenesisHash, "2026-05-03T12:00:00Z")
	signed := buildEntry(row, rightKey, GenesisHash)

	// Same length as the real key, but every byte differs from the fixture.
	wrongKey := make([]byte, 32)
	for i := 0; i < len(wrongKey); i++ {
		wrongKey[i] = 0xff
	}

	if _, _, err := VerifyChain([]AuditLogEntry{signed}, wrongKey); err == nil {
		t.Fatal("expected hmac mismatch with wrong key")
	}
}

// TestComputeRowHMAC_StableAcrossRuns checks determinism: canonicalizing the
// same row twice yields identical bytes, and hashing those bytes with the
// same key and previous hash yields the same digest. It also checks the
// digest's shape: 64 characters that decode as hex.
func TestComputeRowHMAC_StableAcrossRuns(t *testing.T) {
	key := deterministicKey()
	r := mkRow("CAND-S", []string{"a", "b"}, GenesisHash, "2026-05-03T12:00:00Z")

	// Canonicalization errors are checked explicitly: if both calls failed
	// and returned empty output, the equality checks below would otherwise
	// compare two empty results and hide the failure.
	c1, err := canonicalRowBytesFromStruct(&r)
	if err != nil {
		t.Fatalf("canonical (first pass): %v", err)
	}
	c2, err := canonicalRowBytesFromStruct(&r)
	if err != nil {
		t.Fatalf("canonical (second pass): %v", err)
	}
	if string(c1) != string(c2) {
		t.Fatalf("canonical bytes unstable across runs:\n  c1=%s\n  c2=%s", c1, c2)
	}

	h1 := computeRowHMAC(key, GenesisHash, c1)
	h2 := computeRowHMAC(key, GenesisHash, c2)
	if h1 != h2 {
		t.Fatalf("hmac unstable across runs: %s vs %s", h1, h2)
	}
	if len(h1) != 64 {
		t.Fatalf("hmac wrong length %d", len(h1))
	}
	// Sanity: hex-decodable.
	if _, err := hex.DecodeString(h1); err != nil {
		t.Fatalf("hmac not hex: %v", err)
	}
}

// TestKnownAnswerVector matches a Go-computed reference. The same
// inputs must produce this exact byte string under Rust as well —
// scripts/cutover/parity/subject_audit_parity.sh runs the Rust helper
// against this exact fixture and asserts byte-identical output.
//
// If you change the fixture, rebuild Rust's parity_subject_audit + Go's
// helper and update both sides together.
func TestKnownAnswerVector(t *testing.T) {
	key := deterministicKey()
	r := SubjectAuditRow{
		Schema:      "subject_audit.v1",
		Ts:          time.Date(2026, 5, 3, 12, 0, 0, 0, time.UTC),
		CandidateID: "WORKER-FIXED",
		Accessor: AuditAccessor{
			Kind:    "gateway_lookup",
			Daemon:  "gateway",
			Purpose: "parity_test",
			TraceID: "trace-fixed",
		},
		FieldsAccessed: []string{"name"},
		Result:         "success",
		PrevChainHash:  GenesisHash,
		RowHmac:        "",
	}
	canon, err := canonicalRowBytesFromStruct(&r)
	if err != nil {
		t.Fatalf("canonical: %v", err)
	}
	t.Logf("canonical bytes: %s", canon)
	hmacHex := computeRowHMAC(key, GenesisHash, canon)
	t.Logf("hmac: %s", hmacHex)
	// Sanity: round-trip through encoding/json + canonicalization is stable.
	again, err := canonicalRowBytesFromStruct(&r)
	if err != nil {
		t.Fatalf("canonical 2: %v", err)
	}
	if string(canon) != string(again) {
		t.Fatalf("canonical drift: %s vs %s", canon, again)
	}
	// Sanity: real HMAC against the canonical bytes.
	mac := hmac.New(sha256.New, key)
	mac.Write([]byte(GenesisHash))
	mac.Write(canon)
	expected := hex.EncodeToString(mac.Sum(nil))
	if hmacHex != expected {
		t.Fatalf("computeRowHMAC drift: %s vs %s", hmacHex, expected)
	}
}

// TestVerifyChain_HtmlChars_NotEscaped is the regression test for the
// 2026-05-03 opus scrum WARN: Go's json.Marshal escapes `<`, `>`, `&`
// to `\u003c`, `\u003e`, `\u0026` by default; Rust's serde_json keeps
// them literal. Audit rows with these chars in any string field would
// silently break the chain across runtimes. Fix is in writeCanonical's
// marshalNoEscapeHTML helper. This test asserts canonical bytes contain
// the literal `<`, `>`, `&` (proving the fix is in place).
func TestVerifyChain_HtmlChars_NotEscaped(t *testing.T) {
	r := mkRow("CAND-HTML", []string{"name"}, GenesisHash, "2026-05-03T12:00:00Z")
	r.Accessor.Purpose = "error & retry" // & must NOT become \u0026
	r.Accessor.TraceID = "<HTTP-Req-Id>" // < and > must NOT become \u003c / \u003e
	canon, err := canonicalRowBytesFromStruct(&r)
	if err != nil {
		t.Fatalf("canonical: %v", err)
	}
	s := string(canon)

	// FAIL if the bytes contain Go's HTML-safe escape sequences for < / > / &
	// (six raw chars each: backslash, u, 0, 0, hex, hex). Those wouldn't
	// match Rust's literal-char output and would silently break the
	// cross-runtime HMAC chain. The patterns are raw-string literals, so
	// each is six literal bytes, NOT a Go-source unicode escape.
	for _, esc := range []string{`\u003c`, `\u003e`, `\u0026`} {
		if strings.Contains(s, esc) {
			t.Fatalf("canonical bytes contain Go HTML-escape sequences (would diverge from Rust):\n%s", s)
		}
	}

	// PASS only if the literal chars survived canonicalization.
	if !strings.Contains(s, `"<HTTP-Req-Id>"`) || !strings.Contains(s, `"error & retry"`) {
		t.Fatalf("canonical bytes missing literal <>&:\n%s", s)
	}
}

// TestVerifyChain_RawBytesPreserveTimePrecision is the regression test for
// the 2026-05-03 WORKER-5 finding. When a row's nanoseconds end in 0,
// time.RFC3339Nano drops the trailing zero on re-marshal, so the canonical
// bytes differ from Rust's chrono AutoSi output (which always emits 9
// digits). VerifyChain therefore MUST canonicalize from the raw line bytes.
//
// The test signs a hand-crafted raw line whose ts has a trailing-zero
// nanosecond value and asserts that verification succeeds when the chain
// hash was computed against THOSE EXACT bytes.
func TestVerifyChain_RawBytesPreserveTimePrecision(t *testing.T) {
	// Raw line exactly as Rust would write it, minus row_hmac;
	// nanoseconds=461439210 (trailing zero present).
	const unsigned = `{"schema":"subject_audit.v1","ts":"2026-05-03T09:12:47.461439210Z","candidate_id":"WORKER-5","accessor":{"kind":"validator_lookup","daemon":"gateway","purpose":"validator_worker_lookup","trace_id":""},"fields_accessed":["exists"],"result":"not_found","prev_chain_hash":"GENESIS"}`
	key := deterministicKey()

	canon, err := canonicalRowBytesFromRaw([]byte(unsigned))
	if err != nil {
		t.Fatalf("canonicalize raw: %v", err)
	}
	wantTip := computeRowHMAC(key, GenesisHash, canon)

	// Append row_hmac as the last member, matching what the Rust writer
	// produces (declaration order + appended hmac).
	signed := strings.TrimSuffix(unsigned, "}")
	signed += `,"row_hmac":"` + wantTip + `"}`

	entry := AuditLogEntry{Raw: []byte(signed)}
	if err := json.Unmarshal([]byte(signed), &entry.Row); err != nil {
		t.Fatalf("unmarshal: %v", err)
	}

	count, tip, err := VerifyChain([]AuditLogEntry{entry}, key)
	switch {
	case err != nil:
		t.Fatalf("verify failed (regression: time-precision drift): %v", err)
	case count != 1:
		t.Fatalf("expected 1 row verified, got %d", count)
	case tip != wantTip:
		t.Fatalf("tip mismatch: %s vs %s", tip, wantTip)
	}
}

// TestSubjectManifest_RoundTripJSON decodes a fixture shaped like what
// crates/catalogd/src/registry.rs::put_subject writes to
// data/_catalog/subjects/<id>.json and spot-checks the decoded fields.
// Only the decode direction is exercised here. A failure means the Go
// reader is out of sync with the Rust writer (a Step 8 contract violation).
func TestSubjectManifest_RoundTripJSON(t *testing.T) {
	const fixture = `{
  "schema": "subject_manifest.v1",
  "candidate_id": "WORKER-1",
  "created_at": "2026-05-03T08:22:24.571647177Z",
  "updated_at": "2026-05-03T08:22:24.571647177Z",
  "status": "active",
  "vertical": "unknown",
  "consent": {
    "general_pii": {
      "status": "pending_backfill_review",
      "version": ""
    },
    "biometric": {
      "status": "never_collected"
    }
  },
  "retention": {
    "general_pii_until": "2030-05-02T08:22:24.571647177Z",
    "policy": "4_year_default"
  },
  "datasets": [
    {"name": "workers_500k", "key_column": "worker_id", "key_value": "1"}
  ],
  "safe_views": ["workers_safe"],
  "audit_log_path": "_catalog/subjects/WORKER-1.audit.jsonl",
  "audit_log_chain_root": ""
}`

	var manifest SubjectManifest
	if err := json.Unmarshal([]byte(fixture), &manifest); err != nil {
		t.Fatalf("unmarshal: %v", err)
	}

	// Cases are evaluated top to bottom, so the first mismatching field is
	// the one reported.
	switch {
	case manifest.CandidateID != "WORKER-1":
		t.Fatalf("candidate_id wrong: %s", manifest.CandidateID)
	case manifest.Status != "active":
		t.Fatalf("status wrong: %s", manifest.Status)
	case manifest.Consent.GeneralPii.Status != "pending_backfill_review":
		t.Fatalf("general_pii.status wrong: %s", manifest.Consent.GeneralPii.Status)
	case manifest.Consent.Biometric.Status != "never_collected":
		t.Fatalf("biometric.status wrong: %s", manifest.Consent.Biometric.Status)
	case manifest.Retention.Policy != "4_year_default":
		t.Fatalf("retention.policy wrong: %s", manifest.Retention.Policy)
	case len(manifest.Datasets) != 1 || manifest.Datasets[0].Name != "workers_500k":
		t.Fatalf("datasets wrong: %+v", manifest.Datasets)
	}
}