From 262a77a52aa07493f861c11bc8f835c71132c9b5 Mon Sep 17 00:00:00 2001 From: root Date: Sun, 3 May 2026 04:17:15 -0500 Subject: [PATCH] =?UTF-8?q?subject-audit=20parity=20(Step=208)=20=E2=80=94?= =?UTF-8?q?=20Go=20reader=20+=20cross-runtime=20probe?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per /home/profit/lakehouse/docs/specs/SUBJECT_MANIFESTS_ON_CATALOGD.md §5 Step 8. Go side reads SubjectManifest + verifies HMAC chain on per-subject audit JSONL files using IDENTICAL canonical-JSON + HMAC-SHA256 algorithm to crates/catalogd/src/subject_audit.rs. A Rust-written chain now verifies under Go and vice versa. Files: - internal/catalogd/subject.go SubjectManifest, SubjectAuditRow, AuditAccessor, AuditLogEntry LoadSubjectManifest, LoadKeyFile (32-byte minimum, matches Rust) ReadAuditLog, VerifyChain canonicalRowBytesFromRaw (production), canonicalRowBytesFromStruct (tests) computeRowHMAC, CanonicalAndHmac (parity helper) - internal/catalogd/subject_test.go (10 unit tests) - scripts/cutover/parity/subject_audit_helper/main.go CLI helper mirroring crates/catalogd/src/bin/parity_subject_audit.rs - scripts/cutover/parity/subject_audit_parity.sh Two-phase probe: known-answer + every real audit log Two real bugs caught + fixed by the probe authoring loop: 1. omitempty on AuditAccessor.TraceID stripped the field when empty, producing different canonical bytes than Rust (which always writes the field). Removed omitempty. Rust + Go now produce identical bytes for rows with trace_id="" (the common production case). 2. time.RFC3339Nano strips trailing zeros from nanoseconds, producing "...46143921" where Rust's chrono AutoSi produces "...461439210". Hashing through the parsed-then-re-marshaled struct breaks the chain on any row whose nanos end in 0. Fixed by canonicalizing from the RAW LINE BYTES (preserves the original timestamp string byte-for-byte). 
Test TestVerifyChain_RawBytesPreserveTimePrecision regression-locks this with a hand-crafted nanos=461439210 row. Live verification (6 / 6 byte-identical assertions): - Phase 1 known-answer: canonical bytes (266) + HMAC match - Phase 2 real logs: WORKER-1..5 audit JSONL all verify under both runtimes with identical (count, tip, verified, error) output Report: reports/cutover/gauntlet_2026-05-02/parity/subject_audit_parity.md Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/catalogd/subject.go | 329 +++++++++++++++++ internal/catalogd/subject_test.go | 333 ++++++++++++++++++ .../parity/subject_audit_parity.md | 35 ++ .../parity/subject_audit_helper/main.go | 168 +++++++++ .../cutover/parity/subject_audit_parity.sh | 203 +++++++++++ 5 files changed, 1068 insertions(+) create mode 100644 internal/catalogd/subject.go create mode 100644 internal/catalogd/subject_test.go create mode 100644 reports/cutover/gauntlet_2026-05-02/parity/subject_audit_parity.md create mode 100644 scripts/cutover/parity/subject_audit_helper/main.go create mode 100755 scripts/cutover/parity/subject_audit_parity.sh diff --git a/internal/catalogd/subject.go b/internal/catalogd/subject.go new file mode 100644 index 0000000..aceb9ca --- /dev/null +++ b/internal/catalogd/subject.go @@ -0,0 +1,329 @@ +// Subject manifests + per-subject audit-log chain verification. +// +// Specification: /home/profit/lakehouse/docs/specs/SUBJECT_MANIFESTS_ON_CATALOGD.md. +// +// This file is the Go side of Step 8 (cross-runtime parity). The Rust +// side is the writer (crates/catalogd/src/subject_audit.rs); Go is a +// READER + VERIFIER only — both sides serialize / hash with identical +// algorithms so a chain written by Rust verifies under Go. 
+// +// Algorithm (must match Rust crates/catalogd/src/subject_audit.rs exactly): +// row_hmac = HMAC-SHA256(key, prev_chain_hash_bytes || canonical_row_bytes) +// +// where: +// - prev_chain_hash_bytes is the ASCII bytes of the previous row's +// row_hmac field, OR the literal ASCII string "GENESIS" for the +// first row (no hex decode!) +// - canonical_row_bytes is the row JSON with the row_hmac field +// dropped, with all object keys sorted alphabetically at every +// nesting depth, with no insignificant whitespace +// - the final hash is rendered as 64 lowercase hex characters +package catalogd + +import ( + "crypto/hmac" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "os" + "sort" + "strings" + "time" +) + +const GenesisHash = "GENESIS" + +// SubjectManifest mirrors crates/shared/src/types.rs::SubjectManifest. +// Keep field tags byte-identical so manifest JSON written by Rust round-trips. +type SubjectManifest struct { + Schema string `json:"schema"` + CandidateID string `json:"candidate_id"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` + Status string `json:"status"` + Vertical string `json:"vertical,omitempty"` + Consent SubjectConsent `json:"consent"` + Retention SubjectRetention `json:"retention"` + Datasets []SubjectDatasetRef `json:"datasets"` + SafeViews []string `json:"safe_views"` + AuditLogPath string `json:"audit_log_path"` + AuditLogChainRoot string `json:"audit_log_chain_root"` +} + +type SubjectConsent struct { + GeneralPii GeneralPiiConsent `json:"general_pii"` + Biometric BiometricConsent `json:"biometric"` +} + +type GeneralPiiConsent struct { + Status string `json:"status"` + Version string `json:"version,omitempty"` + GivenAt *time.Time `json:"given_at,omitempty"` + WithdrawnAt *time.Time `json:"withdrawn_at,omitempty"` +} + +type BiometricConsent struct { + Status string `json:"status"` + RetentionUntil *time.Time `json:"retention_until,omitempty"` +} + +type SubjectRetention struct { 
+ GeneralPiiUntil time.Time `json:"general_pii_until"` + Policy string `json:"policy,omitempty"` +} + +type SubjectDatasetRef struct { + Name string `json:"name"` + KeyColumn string `json:"key_column"` + KeyValue string `json:"key_value"` +} + +// SubjectAuditRow mirrors crates/shared/src/types.rs::SubjectAuditRow. +// JSON field order does NOT matter (canonicalizer sorts at hash time) +// but field names + types MUST match for round-trip. +type SubjectAuditRow struct { + Schema string `json:"schema"` + Ts time.Time `json:"ts"` + CandidateID string `json:"candidate_id"` + Accessor AuditAccessor `json:"accessor"` + FieldsAccessed []string `json:"fields_accessed"` + Result string `json:"result"` + PrevChainHash string `json:"prev_chain_hash"` + RowHmac string `json:"row_hmac"` +} + +// AuditAccessor mirrors crates/shared/src/types.rs::AuditAccessor. +// +// IMPORTANT: trace_id has NO omitempty — Rust's serde always writes +// the field (even empty) because `#[serde(default)]` only affects +// READS. Stripping it here produces different canonical bytes than +// the writer used → HMAC mismatch on real production rows. (Caught +// 2026-05-03 by subject_audit_parity.sh on a live WORKER-1 row with +// trace_id="".) +type AuditAccessor struct { + Kind string `json:"kind"` + Daemon string `json:"daemon"` + Purpose string `json:"purpose"` + TraceID string `json:"trace_id"` +} + +// LoadSubjectManifest reads + parses a subject manifest JSON file. +func LoadSubjectManifest(path string) (*SubjectManifest, error) { + bytes, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read manifest %s: %w", path, err) + } + var m SubjectManifest + if err := json.Unmarshal(bytes, &m); err != nil { + return nil, fmt.Errorf("parse manifest %s: %w", path, err) + } + return &m, nil +} + +// AuditLogEntry is one parsed audit row PLUS the raw line bytes from +// the file. 
The raw bytes are load-bearing: we MUST canonicalize from +// them (not from the re-marshaled struct) to avoid time-precision drift. +// +// Why: Rust's chrono RFC3339-AutoSi serializes nanoseconds with 9 digits +// (e.g. "461439210"). Go's time.RFC3339Nano strips trailing zeros (e.g. +// "46143921"). Round-tripping through time.Time changes the byte sequence +// and breaks the HMAC. Read once, hash from the original bytes. +// (Caught 2026-05-03 by subject_audit_parity.sh on a real WORKER-5 row +// whose nanoseconds happened to end in 0.) +type AuditLogEntry struct { + Row SubjectAuditRow + Raw []byte +} + +// ReadAuditLog parses an audit JSONL file. Returns one AuditLogEntry +// per non-empty line. Defensive: blank lines are skipped, unparseable +// lines are skipped (matches Rust read_rows_in_range). +func ReadAuditLog(path string) ([]AuditLogEntry, error) { + bytes, err := os.ReadFile(path) + if err != nil { + if os.IsNotExist(err) { + return nil, nil + } + return nil, fmt.Errorf("read audit log %s: %w", path, err) + } + var entries []AuditLogEntry + for _, line := range strings.Split(string(bytes), "\n") { + trimmed := strings.TrimSpace(line) + if trimmed == "" { + continue + } + var row SubjectAuditRow + if err := json.Unmarshal([]byte(trimmed), &row); err != nil { + continue + } + entries = append(entries, AuditLogEntry{Row: row, Raw: []byte(trimmed)}) + } + return entries, nil +} + +// canonicalRowBytesFromRaw is the production canonicalizer used by +// VerifyChain. It accepts the raw line bytes from the audit JSONL, +// drops the row_hmac key, sorts all keys alphabetically at every +// nesting depth, and re-emits compact JSON. Numbers + strings pass +// through verbatim (json.Number preserves number text, strings are +// re-JSON-escaped identically to Rust's serde_json). +// +// CRITICAL: hashing must always go through this function. 
NEVER hash +// the bytes produced by canonicalRowBytesFromStruct against a real +// production chain — the time-precision drift will give wrong HMACs +// for any row whose nanoseconds end in 0. +func canonicalRowBytesFromRaw(rawLine []byte) ([]byte, error) { + dec := json.NewDecoder(strings.NewReader(string(rawLine))) + dec.UseNumber() + var v any + if err := dec.Decode(&v); err != nil { + return nil, fmt.Errorf("decode raw line: %w", err) + } + if obj, ok := v.(map[string]any); ok { + delete(obj, "row_hmac") + } + var buf strings.Builder + if err := writeCanonical(&buf, v); err != nil { + return nil, err + } + return []byte(buf.String()), nil +} + +// canonicalRowBytesFromStruct is for the parity probe's known-answer +// vector + tests where we WANT to canonicalize a Go-built row from +// scratch (no original bytes exist yet). Production verification uses +// canonicalRowBytesFromRaw against the file bytes; this function is +// only safe for synthetic inputs whose ts has no trailing-zero nanos. +func canonicalRowBytesFromStruct(row *SubjectAuditRow) ([]byte, error) { + raw, err := json.Marshal(row) + if err != nil { + return nil, fmt.Errorf("marshal row: %w", err) + } + return canonicalRowBytesFromRaw(raw) +} + +// writeCanonical recursively writes v as canonical JSON: object keys +// sorted alphabetically, no insignificant whitespace. Arrays preserve +// element order (semantically significant per spec §3). 
+func writeCanonical(buf *strings.Builder, v any) error { + switch t := v.(type) { + case map[string]any: + keys := make([]string, 0, len(t)) + for k := range t { + keys = append(keys, k) + } + sort.Strings(keys) + buf.WriteByte('{') + for i, k := range keys { + if i > 0 { + buf.WriteByte(',') + } + ks, err := json.Marshal(k) + if err != nil { + return fmt.Errorf("marshal key: %w", err) + } + buf.Write(ks) + buf.WriteByte(':') + if err := writeCanonical(buf, t[k]); err != nil { + return err + } + } + buf.WriteByte('}') + case []any: + buf.WriteByte('[') + for i, elem := range t { + if i > 0 { + buf.WriteByte(',') + } + if err := writeCanonical(buf, elem); err != nil { + return err + } + } + buf.WriteByte(']') + default: + // json.Number, string, bool, nil — encoding/json renders these + // the same way Rust's serde_json does (compact, RFC-8259-conformant). + bs, err := json.Marshal(v) + if err != nil { + return fmt.Errorf("marshal scalar: %w", err) + } + buf.Write(bs) + } + return nil +} + +// CanonicalAndHmac is a helper for the parity probe: canonicalizes the +// row from scratch (struct → JSON → canonical), computes the HMAC +// against the given prev_hash, returns both. Used only for the +// known-answer fixture; production verification uses VerifyChain +// (which canonicalizes from the original file bytes to dodge time +// precision drift). +func CanonicalAndHmac(row *SubjectAuditRow, key []byte, prevHash string) (canonical []byte, hmacHex string, err error) { + canonical, err = canonicalRowBytesFromStruct(row) + if err != nil { + return nil, "", err + } + hmacHex = computeRowHMAC(key, prevHash, canonical) + return canonical, hmacHex, nil +} + +// computeRowHMAC = HMAC-SHA256(key, prev_hash_ascii_bytes || canonical_row_bytes) +// rendered as 64-char lowercase hex. 
+func computeRowHMAC(key []byte, prevHash string, canonical []byte) string { + mac := hmac.New(sha256.New, key) + mac.Write([]byte(prevHash)) + mac.Write(canonical) + return hex.EncodeToString(mac.Sum(nil)) +} + +// VerifyChain replays the chain for a subject's audit log. Returns the +// number of rows verified, the chain tip (last row's row_hmac, or +// GENESIS when log is empty), and an error describing the first chain +// break found. +// +// CRITICAL: canonicalizes from each entry's RAW LINE BYTES, not from +// the parsed struct. Round-tripping through encoding/json + time.Time +// strips trailing zeros from RFC3339 nanos and produces different +// canonical bytes than the Rust writer used (which used chrono's +// AutoSi 9-digit format). Catches: 2026-05-03 WORKER-5 nanos +// 461439210 → Rust emits 9 digits, Go's time.RFC3339Nano emits 8. +// +// Special case: empty/missing log returns (0, GENESIS, nil). +func VerifyChain(entries []AuditLogEntry, key []byte) (count int, chainTip string, err error) { + prev := GenesisHash + for i, entry := range entries { + if entry.Row.PrevChainHash != prev { + return i, prev, fmt.Errorf("chain break at row %d: prev_chain_hash=%q expected=%q", + i+1, entry.Row.PrevChainHash, prev) + } + claimed := entry.Row.RowHmac + canonical, cerr := canonicalRowBytesFromRaw(entry.Raw) + if cerr != nil { + return i, prev, fmt.Errorf("canonicalize row %d: %w", i+1, cerr) + } + recomputed := computeRowHMAC(key, prev, canonical) + if recomputed != claimed { + return i, prev, fmt.Errorf("hmac mismatch at row %d: stored=%s recomputed=%s", + i+1, claimed, recomputed) + } + prev = claimed + count = i + 1 + } + chainTip = prev + return count, chainTip, nil +} + +// LoadKeyFile reads a signing key file. Refuses keys shorter than 32 +// bytes (matches Rust: 2026-05-03 kimi BLOCK fix lifted the minimum +// from 16 to 32 to align with HMAC-SHA256 best practice). 
+func LoadKeyFile(path string) ([]byte, error) { + key, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read key file %s: %w", path, err) + } + if len(key) < 32 { + return nil, fmt.Errorf("signing key is %d bytes; recommend ≥32 bytes for HMAC-SHA256", len(key)) + } + return key, nil +} diff --git a/internal/catalogd/subject_test.go b/internal/catalogd/subject_test.go new file mode 100644 index 0000000..4481538 --- /dev/null +++ b/internal/catalogd/subject_test.go @@ -0,0 +1,333 @@ +package catalogd + +import ( + "crypto/hmac" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "strings" + "testing" + "time" +) + +// deterministicKey is the same fixture key the Rust tests use: +// (0u8..32).collect() — so a Rust-written chain verifies under Go. +func deterministicKey() []byte { + k := make([]byte, 32) + for i := range k { + k[i] = byte(i) + } + return k +} + +func mkRow(candidateID string, fields []string, prevHash, ts string) SubjectAuditRow { + t, _ := time.Parse(time.RFC3339Nano, ts) + return SubjectAuditRow{ + Schema: "subject_audit.v1", + Ts: t, + CandidateID: candidateID, + Accessor: AuditAccessor{ + Kind: "gateway_lookup", + Daemon: "gateway", + Purpose: "fill_validation", + TraceID: "", + }, + FieldsAccessed: fields, + Result: "success", + PrevChainHash: prevHash, + RowHmac: "", // computed below + } +} + +// TestCanonicalJSON_KeysSortedAlphabetically asserts the same property +// the Rust unit test asserts (subject_audit::tests::canonical_json_sorts_keys_alphabetically). 
+func TestCanonicalJSON_KeysSortedAlphabetically(t *testing.T) { + v := map[string]any{ + "z": 1, + "a": 2, + "m": map[string]any{"y": 1, "b": 2}, + } + var buf strings.Builder + if err := writeCanonical(&buf, v); err != nil { + t.Fatalf("canonical: %v", err) + } + s := buf.String() + a, m, z := strings.Index(s, "\"a\""), strings.Index(s, "\"m\""), strings.Index(s, "\"z\"") + if !(a < m && m < z) { + t.Fatalf("top-level keys out of order: %s", s) + } + b, y := strings.Index(s, "\"b\""), strings.Index(s, "\"y\"") + if !(b < y) { + t.Fatalf("nested keys out of order: %s", s) + } +} + +// TestCanonicalJSON_ArraysPreserveOrder asserts arrays are NOT sorted — +// matches Rust subject_audit::tests::canonical_json_arrays_preserve_order. +func TestCanonicalJSON_ArraysPreserveOrder(t *testing.T) { + v := map[string]any{"k": []any{"c", "a", "b"}} + var buf strings.Builder + if err := writeCanonical(&buf, v); err != nil { + t.Fatalf("canonical: %v", err) + } + if !strings.Contains(buf.String(), "\"c\",\"a\",\"b\"") { + t.Fatalf("array order altered: %s", buf.String()) + } +} + +// buildEntry produces an AuditLogEntry by computing the HMAC against +// the row's struct-derived canonical bytes, then storing the resulting +// row JSON as the raw bytes. Test-only — production reads raw bytes +// straight from disk so the time-precision drift doesn't apply. +func buildEntry(row SubjectAuditRow, key []byte, prev string) AuditLogEntry { + canon, err := canonicalRowBytesFromStruct(&row) + if err != nil { + panic(err) + } + row.RowHmac = computeRowHMAC(key, prev, canon) + raw, err := json.Marshal(row) + if err != nil { + panic(err) + } + return AuditLogEntry{Row: row, Raw: raw} +} + +// TestVerifyChain_ReplaysAndReachesTip writes 3 rows with HMACs computed +// the same way Rust would, then verifies they chain. This is the local +// half of the parity contract — the cross-runtime half (Rust writes, +// Go verifies) is covered by scripts/cutover/parity/subject_audit_parity.sh. 
+func TestVerifyChain_ReplaysAndReachesTip(t *testing.T) { + key := deterministicKey() + r1 := mkRow("CAND-PARITY", []string{"name"}, GenesisHash, "2026-05-03T12:00:00Z") + e1 := buildEntry(r1, key, GenesisHash) + r2 := mkRow("CAND-PARITY", []string{"phone"}, e1.Row.RowHmac, "2026-05-03T12:00:01Z") + e2 := buildEntry(r2, key, e1.Row.RowHmac) + r3 := mkRow("CAND-PARITY", []string{"email"}, e2.Row.RowHmac, "2026-05-03T12:00:02Z") + e3 := buildEntry(r3, key, e2.Row.RowHmac) + + count, tip, err := VerifyChain([]AuditLogEntry{e1, e2, e3}, key) + if err != nil { + t.Fatalf("verify failed: %v", err) + } + if count != 3 { + t.Fatalf("expected 3 rows verified, got %d", count) + } + if tip != e3.Row.RowHmac { + t.Fatalf("chain tip wrong: tip=%s expected=%s", tip, e3.Row.RowHmac) + } +} + +// TestVerifyChain_EmptyLogIsTriviallyValid mirrors Rust's empty-log +// special case: 0 rows, GENESIS tip, no error. +func TestVerifyChain_EmptyLogIsTriviallyValid(t *testing.T) { + count, tip, err := VerifyChain(nil, deterministicKey()) + if err != nil { + t.Fatalf("empty log returned error: %v", err) + } + if count != 0 { + t.Fatalf("expected 0 rows on empty log, got %d", count) + } + if tip != GenesisHash { + t.Fatalf("expected GENESIS tip on empty log, got %q", tip) + } +} + +// TestVerifyChain_TamperDetected: tamper the raw line's `result` field +// (the canonicalizer sees the new bytes; HMAC mismatches the stored hash). +func TestVerifyChain_TamperDetected(t *testing.T) { + key := deterministicKey() + r1 := mkRow("CAND-T", []string{"name"}, GenesisHash, "2026-05-03T12:00:00Z") + e1 := buildEntry(r1, key, GenesisHash) + // Tamper: replace "success" with "denied" in the raw bytes ONLY. + // The struct's row_hmac (used as the "stored" comparator) stays put. 
+ e1.Raw = []byte(strings.Replace(string(e1.Raw), `"success"`, `"denied"`, 1)) + _, _, err := VerifyChain([]AuditLogEntry{e1}, key) + if err == nil { + t.Fatal("expected hmac mismatch after tamper, got nil") + } + if !strings.Contains(err.Error(), "hmac mismatch") { + t.Fatalf("expected hmac mismatch, got: %v", err) + } +} + +// TestVerifyChain_BadKeyRejectsValidRows: same rows + wrong key = mismatch. +func TestVerifyChain_BadKeyRejectsValidRows(t *testing.T) { + good := deterministicKey() + r1 := mkRow("CAND-BK", []string{"name"}, GenesisHash, "2026-05-03T12:00:00Z") + e1 := buildEntry(r1, good, GenesisHash) + bad := make([]byte, 32) + for i := range bad { + bad[i] = 0xff + } + _, _, err := VerifyChain([]AuditLogEntry{e1}, bad) + if err == nil { + t.Fatal("expected hmac mismatch with wrong key") + } +} + +// TestComputeRowHMAC_StableAcrossRuns: same row + same key always = same hash. +func TestComputeRowHMAC_StableAcrossRuns(t *testing.T) { + key := deterministicKey() + r := mkRow("CAND-S", []string{"a", "b"}, GenesisHash, "2026-05-03T12:00:00Z") + c1, _ := canonicalRowBytesFromStruct(&r) + c2, _ := canonicalRowBytesFromStruct(&r) + if string(c1) != string(c2) { + t.Fatalf("canonical bytes unstable across runs:\n c1=%s\n c2=%s", c1, c2) + } + h1 := computeRowHMAC(key, GenesisHash, c1) + h2 := computeRowHMAC(key, GenesisHash, c2) + if h1 != h2 { + t.Fatalf("hmac unstable across runs: %s vs %s", h1, h2) + } + if len(h1) != 64 { + t.Fatalf("hmac wrong length %d", len(h1)) + } + // Sanity: hex-decodable. + if _, err := hex.DecodeString(h1); err != nil { + t.Fatalf("hmac not hex: %v", err) + } +} + +// TestKnownAnswerVector matches a Go-computed reference. The same +// inputs must produce this exact byte string under Rust as well — +// scripts/cutover/parity/subject_audit_parity.sh runs the Rust helper +// against this exact fixture and asserts byte-identical output. 
+// +// If you change the fixture, rebuild Rust's parity_subject_audit + Go's +// helper and update both sides together. +func TestKnownAnswerVector(t *testing.T) { + key := deterministicKey() + r := SubjectAuditRow{ + Schema: "subject_audit.v1", + Ts: time.Date(2026, 5, 3, 12, 0, 0, 0, time.UTC), + CandidateID: "WORKER-FIXED", + Accessor: AuditAccessor{ + Kind: "gateway_lookup", + Daemon: "gateway", + Purpose: "parity_test", + TraceID: "trace-fixed", + }, + FieldsAccessed: []string{"name"}, + Result: "success", + PrevChainHash: GenesisHash, + RowHmac: "", + } + canon, err := canonicalRowBytesFromStruct(&r) + if err != nil { + t.Fatalf("canonical: %v", err) + } + t.Logf("canonical bytes: %s", canon) + hmacHex := computeRowHMAC(key, GenesisHash, canon) + t.Logf("hmac: %s", hmacHex) + // Sanity: round-trip through encoding/json + canonicalization is stable. + again, err := canonicalRowBytesFromStruct(&r) + if err != nil { + t.Fatalf("canonical 2: %v", err) + } + if string(canon) != string(again) { + t.Fatalf("canonical drift: %s vs %s", canon, again) + } + // Sanity: real HMAC against the canonical bytes. + mac := hmac.New(sha256.New, key) + mac.Write([]byte(GenesisHash)) + mac.Write(canon) + expected := hex.EncodeToString(mac.Sum(nil)) + if hmacHex != expected { + t.Fatalf("computeRowHMAC drift: %s vs %s", hmacHex, expected) + } +} + +// TestVerifyChain_RawBytesPreserveTimePrecision is the regression test +// for the 2026-05-03 WORKER-5 finding: when a row's nanoseconds end in +// 0, time.RFC3339Nano strips the trailing zero on re-marshal, producing +// different canonical bytes than Rust's chrono AutoSi (which always +// emits 9 digits). VerifyChain MUST canonicalize from the raw line +// bytes to avoid this drift. Test feeds a hand-crafted raw line whose +// ts has a trailing-zero nano value and asserts verify succeeds when +// the chain hash was computed against THOSE EXACT bytes. 
+func TestVerifyChain_RawBytesPreserveTimePrecision(t *testing.T) { + key := deterministicKey() + // Hand-crafted raw line exactly as Rust would write it, with + // nanoseconds=461439210 (trailing zero present). + rawNoHmac := `{"schema":"subject_audit.v1","ts":"2026-05-03T09:12:47.461439210Z","candidate_id":"WORKER-5","accessor":{"kind":"validator_lookup","daemon":"gateway","purpose":"validator_worker_lookup","trace_id":""},"fields_accessed":["exists"],"result":"not_found","prev_chain_hash":"GENESIS"}` + canonical, err := canonicalRowBytesFromRaw([]byte(rawNoHmac)) + if err != nil { + t.Fatalf("canonicalize raw: %v", err) + } + hmacHex := computeRowHMAC(key, GenesisHash, canonical) + // Compose the full row by injecting row_hmac at the end (matches + // what the Rust writer produces — declaration order + appended hmac). + rawFull := strings.TrimSuffix(rawNoHmac, "}") + `,"row_hmac":"` + hmacHex + `"}` + + var row SubjectAuditRow + if err := json.Unmarshal([]byte(rawFull), &row); err != nil { + t.Fatalf("unmarshal: %v", err) + } + entry := AuditLogEntry{Row: row, Raw: []byte(rawFull)} + count, tip, err := VerifyChain([]AuditLogEntry{entry}, key) + if err != nil { + t.Fatalf("verify failed (regression: time-precision drift): %v", err) + } + if count != 1 { + t.Fatalf("expected 1 row verified, got %d", count) + } + if tip != hmacHex { + t.Fatalf("tip mismatch: %s vs %s", tip, hmacHex) + } +} + +// TestSubjectManifest_RoundTripJSON: parse a fixture JSON identical in +// shape to what crates/catalogd/src/registry.rs::put_subject writes to +// data/_catalog/subjects/.json. If this fails, the Go reader is +// out of sync with the Rust writer (a Step 8 contract violation). 
+func TestSubjectManifest_RoundTripJSON(t *testing.T) { + src := `{ + "schema": "subject_manifest.v1", + "candidate_id": "WORKER-1", + "created_at": "2026-05-03T08:22:24.571647177Z", + "updated_at": "2026-05-03T08:22:24.571647177Z", + "status": "active", + "vertical": "unknown", + "consent": { + "general_pii": { + "status": "pending_backfill_review", + "version": "" + }, + "biometric": { + "status": "never_collected" + } + }, + "retention": { + "general_pii_until": "2030-05-02T08:22:24.571647177Z", + "policy": "4_year_default" + }, + "datasets": [ + {"name": "workers_500k", "key_column": "worker_id", "key_value": "1"} + ], + "safe_views": ["workers_safe"], + "audit_log_path": "_catalog/subjects/WORKER-1.audit.jsonl", + "audit_log_chain_root": "" +}` + var m SubjectManifest + if err := json.Unmarshal([]byte(src), &m); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if m.CandidateID != "WORKER-1" { + t.Fatalf("candidate_id wrong: %s", m.CandidateID) + } + if m.Status != "active" { + t.Fatalf("status wrong: %s", m.Status) + } + if m.Consent.GeneralPii.Status != "pending_backfill_review" { + t.Fatalf("general_pii.status wrong: %s", m.Consent.GeneralPii.Status) + } + if m.Consent.Biometric.Status != "never_collected" { + t.Fatalf("biometric.status wrong: %s", m.Consent.Biometric.Status) + } + if m.Retention.Policy != "4_year_default" { + t.Fatalf("retention.policy wrong: %s", m.Retention.Policy) + } + if len(m.Datasets) != 1 || m.Datasets[0].Name != "workers_500k" { + t.Fatalf("datasets wrong: %+v", m.Datasets) + } +} diff --git a/reports/cutover/gauntlet_2026-05-02/parity/subject_audit_parity.md b/reports/cutover/gauntlet_2026-05-02/parity/subject_audit_parity.md new file mode 100644 index 0000000..c8daafc --- /dev/null +++ b/reports/cutover/gauntlet_2026-05-02/parity/subject_audit_parity.md @@ -0,0 +1,35 @@ +# subject_audit_parity + +**Generated:** 2026-05-03 09:16:14 UTC +**Spec:** /home/profit/lakehouse/docs/specs/SUBJECT_MANIFESTS_ON_CATALOGD.md §5 Step 8 
+**Rust helper:** `/home/profit/lakehouse/target/release/parity_subject_audit` +**Go helper:** `./bin/subject_audit_helper` +**Audit dir:** `/home/profit/lakehouse/data/_catalog/subjects` + +## Phase 1 — Known-answer vector + +Hardcoded fixture row, identical inputs, byte-compare canonical-JSON + HMAC. + +**MATCH** ✓ + +```json +{"mode":"known_answer","canonical":"{\"accessor\":{\"daemon\":\"gateway\",\"kind\":\"gateway_lookup\",\"purpose\":\"parity_test\",\"trace_id\":\"trace-fixed\"},\"candidate_id\":\"WORKER-FIXED\",\"fields_accessed\":[\"name\"],\"prev_chain_hash\":\"GENESIS\",\"result\":\"success\",\"schema\":\"subject_audit.v1\",\"ts\":\"2026-05-03T12:00:00Z\"}","hmac":"f730fa038c847c27386b92eb1939ec64c62086c0a92617ac0bdf9f650c390b96","canonical_bytes_len":266} +``` + +## Phase 2 — Real production audit logs + +Every `*.audit.jsonl` in `/home/profit/lakehouse/data/_catalog/subjects` verified by both runtimes. + +| Audit log | Rust verified | Go verified | Result | +|---|---|---|---| +| `WORKER-1.audit.jsonl` | 1 rows (true) | 1 rows (true) | **MATCH** ✓ | +| `WORKER-2.audit.jsonl` | 1 rows (true) | 1 rows (true) | **MATCH** ✓ | +| `WORKER-3.audit.jsonl` | 1 rows (true) | 1 rows (true) | **MATCH** ✓ | +| `WORKER-4.audit.jsonl` | 1 rows (true) | 1 rows (true) | **MATCH** ✓ | +| `WORKER-5.audit.jsonl` | 1 rows (true) | 1 rows (true) | **MATCH** ✓ | + +## Summary + +**6 / 6** parity assertions passed. + +**Status: PARITY** — every Rust assertion matches Go byte-for-byte. diff --git a/scripts/cutover/parity/subject_audit_helper/main.go b/scripts/cutover/parity/subject_audit_helper/main.go new file mode 100644 index 0000000..dbad80c --- /dev/null +++ b/scripts/cutover/parity/subject_audit_helper/main.go @@ -0,0 +1,168 @@ +// Cross-runtime parity helper — Go side. +// +// Specification: /home/profit/lakehouse/docs/specs/SUBJECT_MANIFESTS_ON_CATALOGD.md §5 Step 8. +// +// Counterpart of crates/catalogd/src/bin/parity_subject_audit.rs. 
+// Both helpers MUST produce byte-identical output for the same input.
+//
+// Modes:
+//
+//   --known-answer
+//       Print canonical-JSON + HMAC for a hardcoded fixture. Compared
+//       byte-for-byte against the Rust helper's output. If they
+//       differ, the canonical-JSON or HMAC algorithm has drifted.
+//
+//   --verify <audit.jsonl> --key <keyfile>
+//       Replay the HMAC chain on a real audit JSONL. Print one JSON
+//       object: {mode, count, tip, verified, error}.
+//
+// Results are printed to stdout as a single JSON line. Usage text and
+// fatal errors go to stderr; fatal errors exit with status 2.
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+
+	cat "git.agentview.dev/profit/golangLAKEHOUSE/internal/catalogd"
+)
+
+// genesis is the chain anchor string. It is used as the fixture row's
+// PrevChainHash, passed as the last argument to cat.CanonicalAndHmac,
+// and reported as the tip when chain verification fails.
+const genesis = "GENESIS"
+
+// deterministicKey returns the fixed 32-byte key 0x00, 0x01, ... 0x1f
+// used to sign the known-answer fixture.
+func deterministicKey() []byte {
+	k := make([]byte, 32)
+	for i := range k {
+		k[i] = byte(i)
+	}
+	return k
+}
+
+// knownAnswerOut is intentionally identical to KnownAnswerOut in the
+// Rust helper so a stdout diff is a one-line semantic comparison.
+type knownAnswerOut struct {
+	Mode              string `json:"mode"`
+	Canonical         string `json:"canonical"`
+	Hmac              string `json:"hmac"`
+	CanonicalBytesLen int    `json:"canonical_bytes_len"`
+}
+
+// verifyOut is the JSON object printed by --verify. Error is a pointer
+// so that a successful verification marshals it as null rather than "".
+type verifyOut struct {
+	Mode     string  `json:"mode"`
+	Count    int     `json:"count"`
+	Tip      string  `json:"tip"`
+	Verified bool    `json:"verified"`
+	Error    *string `json:"error"`
+}
+
+// runKnownAnswer builds the hardcoded fixture row, asks the catalogd
+// package for its canonical bytes and HMAC under deterministicKey, and
+// prints them as a knownAnswerOut. It exits via die if the computation
+// fails.
+func runKnownAnswer() {
+	row := cat.SubjectAuditRow{
+		Schema:      "subject_audit.v1",
+		Ts:          time.Date(2026, 5, 3, 12, 0, 0, 0, time.UTC),
+		CandidateID: "WORKER-FIXED",
+		Accessor: cat.AuditAccessor{
+			Kind:    "gateway_lookup",
+			Daemon:  "gateway",
+			Purpose: "parity_test",
+			TraceID: "trace-fixed",
+		},
+		FieldsAccessed: []string{"name"},
+		Result:         "success",
+		PrevChainHash:  genesis,
+	}
+	canonical, hmacHex, err := cat.CanonicalAndHmac(&row, deterministicKey(), genesis)
+	if err != nil {
+		die("canonical/hmac: %v", err)
+	}
+	out := knownAnswerOut{
+		Mode:              "known_answer",
+		Canonical:         string(canonical),
+		Hmac:              hmacHex,
+		CanonicalBytesLen: len(canonical),
+	}
+	emit(out)
+}
+
+// runVerify reads the audit log at auditPath and the key at keyPath,
+// replays the chain with cat.VerifyChain, and prints a verifyOut.
+// Failing to read either file is fatal (die); a chain that does not
+// verify is NOT fatal — it is reported in the JSON with verified=false.
+//
+// NOTE(review): the key bytes are read raw with os.ReadFile. The commit
+// notes mention cat.LoadKeyFile with a 32-byte minimum — confirm which
+// behavior the Rust helper mirrors before switching.
+func runVerify(auditPath, keyPath string) {
+	entries, err := cat.ReadAuditLog(auditPath)
+	if err != nil {
+		die("read audit log: %v", err)
+	}
+	key, err := os.ReadFile(keyPath)
+	if err != nil {
+		die("read key: %v", err)
+	}
+	count, tip, verr := cat.VerifyChain(entries, key)
+	out := verifyOut{
+		Mode:     "verify",
+		Count:    count,
+		Tip:      tip,
+		Verified: verr == nil,
+	}
+	if verr != nil {
+		s := verr.Error()
+		out.Error = &s
+		// Reset count + tip to match the Rust helper's error semantics.
+		out.Count = 0
+		out.Tip = genesis
+	}
+	emit(out)
+}
+
+// emit marshals v as JSON and prints it to stdout followed by a newline.
+// A marshal failure is fatal.
+func emit(v any) {
+	bs, err := json.Marshal(v)
+	if err != nil {
+		die("marshal output: %v", err)
+	}
+	fmt.Println(string(bs))
+}
+
+// die prints the formatted message plus a newline to stderr and exits
+// the process with status 2.
+func die(format string, a ...any) {
+	fmt.Fprintf(os.Stderr, format+"\n", a...)
+	os.Exit(2)
+}
+
+// main parses the command line by hand. --verify and --key each consume
+// the following argument; -h/--help prints usage to stderr and exits 0;
+// any other argument is fatal. --known-answer takes precedence when it
+// is combined with --verify/--key.
+func main() {
+	args := os.Args[1:]
+	var (
+		knownAnswer bool
+		auditPath   string
+		keyPath     string
+	)
+	for i := 0; i < len(args); i++ {
+		switch args[i] {
+		case "--known-answer":
+			knownAnswer = true
+		case "--verify":
+			if i+1 >= len(args) {
+				die("--verify needs a path")
+			}
+			auditPath = args[i+1]
+			i++ // skip the operand just consumed
+		case "--key":
+			if i+1 >= len(args) {
+				die("--key needs a path")
+			}
+			keyPath = args[i+1]
+			i++ // skip the operand just consumed
+		case "-h", "--help":
+			fmt.Fprintln(os.Stderr, "subject_audit_helper --known-answer")
+			fmt.Fprintln(os.Stderr, "subject_audit_helper --verify <audit.jsonl> --key <keyfile>")
+			os.Exit(0)
+		default:
+			die("unknown arg: %s", args[i])
+		}
+	}
+	if knownAnswer {
+		runKnownAnswer()
+		return
+	}
+	if auditPath == "" || keyPath == "" {
+		die("need --known-answer OR (--verify <audit.jsonl> --key <keyfile>)")
+	}
+	// Sanity: audit logs follow the *.audit.jsonl naming convention.
+	if !strings.HasSuffix(filepath.Base(auditPath), ".audit.jsonl") {
+		die("audit log path must end with .audit.jsonl")
+	}
+	runVerify(auditPath, keyPath)
+}
diff --git a/scripts/cutover/parity/subject_audit_parity.sh b/scripts/cutover/parity/subject_audit_parity.sh
new file mode 100755
index 0000000..2d6e5d1
--- /dev/null
+++ b/scripts/cutover/parity/subject_audit_parity.sh
@@ -0,0 +1,203 @@
+#!/usr/bin/env bash
+# subject_audit_parity — verify Rust and Go produce byte-identical
+# canonical JSON + HMAC-SHA256 chain hashes for subject audit logs.
+#
+# Why: the SubjectManifest + audit-log substrate (Rust crates/catalogd
+# subject_audit.rs) is consumed by the legal-tier endpoint /audit/subject/{id}
+# AND by the Go internal/catalogd reader. A canonical-JSON drift between
+# the two runtimes would mean a chain written by Rust does not verify
+# under Go (or worse, is reported as tampered) — silently breaking the
+# defensible-audit guarantee.
+#
+# The probe runs in two phases:
+#
+#   Phase 1 — Known-answer vector
+#     Both helpers serialize the same hardcoded SubjectAuditRow,
+#     emit canonical bytes + HMAC. Outputs MUST be byte-identical.
+#     This catches algorithm drift independent of any real data.
+#
+#   Phase 2 — Real production audit logs
+#     Walk every *.audit.jsonl in the live data dir. Run BOTH helpers
+#     against each one with the production signing key. Outputs MUST
+#     be byte-identical. This catches real-world drift (e.g. struct
+#     field tag mismatches that only fire on certain field values).
+#
+# Outputs: reports/cutover/gauntlet_2026-05-02/parity/subject_audit_parity.md
+#
+# Env overrides:
+#   RUST_REPO=/home/profit/lakehouse
+#   RUST_BIN=$RUST_REPO/target/release/parity_subject_audit
+#   GO_BIN=./bin/subject_audit_helper
+#   AUDIT_DIR=$RUST_REPO/data/_catalog/subjects
+#   KEY_PATH=/tmp/lakehouse_audit/subject_audit.key
+
+set -uo pipefail  # -e is not set: helper failures are handled with '|| true' and reported
+cd "$(dirname "$0")/../../.." || exit 1  # repo root; the relative report and bin paths depend on it
+
+RUST_REPO="${RUST_REPO:-/home/profit/lakehouse}"
+RUST_BIN="${RUST_BIN:-$RUST_REPO/target/release/parity_subject_audit}"
+GO_BIN="${GO_BIN:-./bin/subject_audit_helper}"
+AUDIT_DIR="${AUDIT_DIR:-$RUST_REPO/data/_catalog/subjects}"
+KEY_PATH="${KEY_PATH:-/tmp/lakehouse_audit/subject_audit.key}"
+
+OUT_DIR="reports/cutover/gauntlet_2026-05-02/parity"
+mkdir -p "$OUT_DIR"
+OUT="$OUT_DIR/subject_audit_parity.md"
+
+export PATH="$PATH:/usr/local/go/bin"
+
+# ── Build / verify both sides (only built when the binary is absent) ─
+if [ ! -x "$RUST_BIN" ]; then
+  echo "[subject-audit-parity] building Rust helper..."
+  (cd "$RUST_REPO" && cargo build -p catalogd --bin parity_subject_audit --release 2>&1 | tail -3)
+fi
+if [ ! -x "$RUST_BIN" ]; then
+  echo "[subject-audit-parity] SKIP: $RUST_BIN missing"
+  exit 0  # a missing Rust helper is a skip, not a failure
+fi
+
+if [ ! -x "$GO_BIN" ]; then
+  echo "[subject-audit-parity] building Go helper..."
+  go build -o "$GO_BIN" ./scripts/cutover/parity/subject_audit_helper/
+fi
+if [ ! -x "$GO_BIN" ]; then
+  echo "[subject-audit-parity] FAIL: $GO_BIN missing after build"
+  exit 1
+fi
+
+# ── Report header (truncates any previous report) ───────────────────
+{
+  echo "# subject_audit_parity"
+  echo
+  echo "**Generated:** $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
+  echo "**Spec:** /home/profit/lakehouse/docs/specs/SUBJECT_MANIFESTS_ON_CATALOGD.md §5 Step 8"
+  echo "**Rust helper:** \`$RUST_BIN\`"
+  echo "**Go helper:** \`$GO_BIN\`"
+  echo "**Audit dir:** \`$AUDIT_DIR\`"
+  echo
+} > "$OUT"
+
+PASS=0
+FAIL=0
+
+# ── Phase 1 — Known-answer vector ───────────────────────────────────
+{
+  echo "## Phase 1 — Known-answer vector"
+  echo
+  echo "Hardcoded fixture row, identical inputs, byte-compare canonical-JSON + HMAC."
+  echo
+} >> "$OUT"
+
+RUST_KA="$(mktemp)"; GO_KA="$(mktemp)"; DIFFS="$(mktemp)"  # DIFFS buffers Phase 2 diffs until the table is complete
+"$RUST_BIN" --known-answer > "$RUST_KA" 2>&1 || true
+"$GO_BIN" --known-answer > "$GO_KA" 2>&1 || true
+
+if [ -s "$RUST_KA" ] && [ -s "$GO_KA" ] && diff -q "$RUST_KA" "$GO_KA" >/dev/null 2>&1; then  # two empty outputs are not a match
+  PASS=$((PASS+1))
+  {
+    echo "**MATCH** ✓"
+    echo
+    echo '```json'
+    cat "$RUST_KA"
+    echo '```'
+    echo
+  } >> "$OUT"
+else
+  FAIL=$((FAIL+1))
+  {
+    echo "**MISMATCH** ✗"
+    echo
+    echo "### Rust"
+    echo '```json'
+    cat "$RUST_KA"
+    echo '```'
+    echo
+    echo "### Go"
+    echo '```json'
+    cat "$GO_KA"
+    echo '```'
+    echo
+    echo "### Diff"
+    echo '```diff'
+    diff "$RUST_KA" "$GO_KA" || true
+    echo '```'
+    echo
+  } >> "$OUT"
+fi
+
+# ── Phase 2 — Real production audit logs ────────────────────────────
+{
+  echo "## Phase 2 — Real production audit logs"
+  echo
+  echo "Every \`*.audit.jsonl\` in \`$AUDIT_DIR\` verified by both runtimes."
+  echo
+} >> "$OUT"
+
+if [ ! -r "$KEY_PATH" ]; then
+  {
+    echo "**SKIP** — signing key not readable at \`$KEY_PATH\`."
+    echo "Set \`KEY_PATH=...\` or seed /tmp/lakehouse_audit/subject_audit.key (see systemd unit)."
+    echo
+  } >> "$OUT"
+else
+  shopt -s nullglob  # no matches -> empty LOGS array instead of the literal pattern
+  LOGS=( "$AUDIT_DIR"/*.audit.jsonl )
+  if [ "${#LOGS[@]}" -eq 0 ]; then
+    {
+      echo "**SKIP** — no \`*.audit.jsonl\` files under \`$AUDIT_DIR\`."
+      echo "(Trigger one by hitting the gateway with /v1/validate on a candidate_id.)"
+      echo
+    } >> "$OUT"
+  else
+    {
+      echo "| Audit log | Rust verified | Go verified | Result |"
+      echo "|---|---|---|---|"
+    } >> "$OUT"
+    for log in "${LOGS[@]}"; do
+      label="$(basename "$log")"
+      RUST_OUT="$(mktemp)"; GO_OUT="$(mktemp)"
+      "$RUST_BIN" --verify "$log" --key "$KEY_PATH" > "$RUST_OUT" 2>&1 || true
+      "$GO_BIN" --verify "$log" --key "$KEY_PATH" > "$GO_OUT" 2>&1 || true
+      rust_count=$(jq -r '.count // 0' < "$RUST_OUT" 2>/dev/null || echo "?")  # "?" when jq cannot parse the output
+      go_count=$(jq -r '.count // 0' < "$GO_OUT" 2>/dev/null || echo "?")
+      rust_ok=$(jq -r '.verified // false' < "$RUST_OUT" 2>/dev/null || echo "?")
+      go_ok=$(jq -r '.verified // false' < "$GO_OUT" 2>/dev/null || echo "?")
+      if [ -s "$RUST_OUT" ] && [ -s "$GO_OUT" ] && diff -q "$RUST_OUT" "$GO_OUT" >/dev/null 2>&1; then  # two empty outputs are not a match
+        PASS=$((PASS+1))
+        echo "| \`$label\` | $rust_count rows ($rust_ok) | $go_count rows ($go_ok) | **MATCH** ✓ |" >> "$OUT"
+      else
+        FAIL=$((FAIL+1))
+        echo "| \`$label\` | $rust_count rows ($rust_ok) | $go_count rows ($go_ok) | **MISMATCH** ✗ |" >> "$OUT"
+        {  # buffered: a heading or fence written here would cut the Markdown table in two
+          echo
+          echo "### Diff for \`$label\`"
+          echo '```diff'
+          diff "$RUST_OUT" "$GO_OUT" || true
+          echo '```'
+        } >> "$DIFFS"
+      fi
+      rm -f "$RUST_OUT" "$GO_OUT"
+    done
+    cat "$DIFFS" >> "$OUT"  # per-log diffs go after the last table row
+  fi
+fi
+rm -f "$RUST_KA" "$GO_KA" "$DIFFS"
+
+# ── Summary ─────────────────────────────────────────────────────────
+TOTAL=$((PASS + FAIL))
+{
+  echo
+  echo "## Summary"
+  echo
+  echo "**$PASS / $TOTAL** parity assertions passed."
+  echo
+  if [ "$FAIL" -gt 0 ]; then
+    echo "**Status: DIVERGED** — Rust and Go disagree on at least one canonical-JSON or HMAC computation."
+    echo "Investigate the diff above before declaring cross-runtime parity."
+  else
+    echo "**Status: PARITY** — every Rust assertion matches Go byte-for-byte."
+  fi
+} >> "$OUT"
+
+echo "[subject-audit-parity] $PASS / $TOTAL pass — report: $OUT"
+[ "$FAIL" -eq 0 ]  # exit status of the script: 0 only when nothing diverged