Per /home/profit/lakehouse/docs/specs/SUBJECT_MANIFESTS_ON_CATALOGD.md §5 Step 8.
The Go side reads SubjectManifest + verifies the HMAC chain on per-subject
audit JSONL files using the SAME canonical-JSON + HMAC-SHA256 algorithm
as crates/catalogd/src/subject_audit.rs. A Rust-written chain now
verifies under Go and vice versa.
Files:
- internal/catalogd/subject.go
SubjectManifest, SubjectAuditRow, AuditAccessor, AuditLogEntry
LoadSubjectManifest, LoadKeyFile (32-byte minimum, matches Rust)
ReadAuditLog, VerifyChain
canonicalRowBytesFromRaw (production), canonicalRowBytesFromStruct (tests)
computeRowHMAC, CanonicalAndHmac (parity helper)
- internal/catalogd/subject_test.go (10 unit tests)
- scripts/cutover/parity/subject_audit_helper/main.go
CLI helper mirroring crates/catalogd/src/bin/parity_subject_audit.rs
- scripts/cutover/parity/subject_audit_parity.sh
Two-phase probe: known-answer + every real audit log
Two real bugs caught + fixed by the probe authoring loop:
1. omitempty on AuditAccessor.TraceID stripped the field when empty,
producing different canonical bytes than Rust (which always writes
the field). Removed omitempty. Rust + Go now produce identical
bytes for rows with trace_id="" (the common production case).
2. time.RFC3339Nano strips trailing zeros from nanoseconds, producing
"...46143921" where Rust's chrono AutoSi produces "...461439210".
Hashing through the parsed-then-re-marshaled struct breaks the
chain on any row whose nanos end in 0. Fixed by canonicalizing
from the RAW LINE BYTES (preserves the original timestamp string
byte-for-byte). Test TestVerifyChain_RawBytesPreserveTimePrecision
regression-locks this with a hand-crafted nanos=461439210 row.
Live verification (6 / 6 byte-identical assertions):
- Phase 1 known-answer: canonical bytes (266) + HMAC match
- Phase 2 real logs: WORKER-1..5 audit JSONL all verify under both
runtimes with identical (count, tip, verified, error) output
Report: reports/cutover/gauntlet_2026-05-02/parity/subject_audit_parity.md
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
330 lines
12 KiB
Go
330 lines
12 KiB
Go
// Subject manifests + per-subject audit-log chain verification.
//
// Specification: /home/profit/lakehouse/docs/specs/SUBJECT_MANIFESTS_ON_CATALOGD.md.
//
// This file is the Go side of Step 8 (cross-runtime parity). The Rust
// side is the writer (crates/catalogd/src/subject_audit.rs); Go is a
// READER + VERIFIER only — both sides serialize / hash with identical
// algorithms so a chain written by Rust verifies under Go.
//
// Algorithm (must match Rust crates/catalogd/src/subject_audit.rs exactly):
//   row_hmac = HMAC-SHA256(key, prev_chain_hash_bytes || canonical_row_bytes)
//
// where:
//   - prev_chain_hash_bytes is the ASCII bytes of the previous row's
//     row_hmac field, OR the literal ASCII string "GENESIS" for the
//     first row (no hex decode!)
//   - canonical_row_bytes is the row JSON with the row_hmac field
//     dropped, with all object keys sorted alphabetically at every
//     nesting depth, with no insignificant whitespace
//   - the final hash is rendered as 64 lowercase hex characters
package catalogd
|
|
|
|
import (
|
|
"crypto/hmac"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// GenesisHash is the prev_chain_hash value expected on the first row of a
// chain. It is fed to the HMAC as plain ASCII bytes, never hex-decoded.
const GenesisHash = "GENESIS"
|
|
|
|
// SubjectManifest mirrors crates/shared/src/types.rs::SubjectManifest.
|
|
// Keep field tags byte-identical so manifest JSON written by Rust round-trips.
|
|
type SubjectManifest struct {
|
|
Schema string `json:"schema"`
|
|
CandidateID string `json:"candidate_id"`
|
|
CreatedAt time.Time `json:"created_at"`
|
|
UpdatedAt time.Time `json:"updated_at"`
|
|
Status string `json:"status"`
|
|
Vertical string `json:"vertical,omitempty"`
|
|
Consent SubjectConsent `json:"consent"`
|
|
Retention SubjectRetention `json:"retention"`
|
|
Datasets []SubjectDatasetRef `json:"datasets"`
|
|
SafeViews []string `json:"safe_views"`
|
|
AuditLogPath string `json:"audit_log_path"`
|
|
AuditLogChainRoot string `json:"audit_log_chain_root"`
|
|
}
|
|
|
|
type SubjectConsent struct {
|
|
GeneralPii GeneralPiiConsent `json:"general_pii"`
|
|
Biometric BiometricConsent `json:"biometric"`
|
|
}
|
|
|
|
// GeneralPiiConsent is the "general_pii" object inside a manifest's consent
// block. Only "status" is always serialized; the version and both
// timestamps are omitted from the JSON when empty / nil.
type GeneralPiiConsent struct {
	Status string `json:"status"`

	// Optional fields: absent in the JSON when unset.
	Version     string     `json:"version,omitempty"`
	GivenAt     *time.Time `json:"given_at,omitempty"`
	WithdrawnAt *time.Time `json:"withdrawn_at,omitempty"`
}
|
|
|
|
// BiometricConsent is the "biometric" object inside a manifest's consent
// block. "retention_until" is omitted from the JSON when the pointer is nil.
type BiometricConsent struct {
	Status         string     `json:"status"`
	RetentionUntil *time.Time `json:"retention_until,omitempty"`
}
|
|
|
|
// SubjectRetention is the "retention" object of a subject manifest.
// "general_pii_until" is always serialized; "policy" is omitted when empty.
type SubjectRetention struct {
	GeneralPiiUntil time.Time `json:"general_pii_until"`
	Policy          string    `json:"policy,omitempty"`
}
|
|
|
|
// SubjectDatasetRef is one element of a manifest's "datasets" array. All
// three fields are always serialized, under the keys "name", "key_column"
// and "key_value".
type SubjectDatasetRef struct {
	Name      string `json:"name"`
	KeyColumn string `json:"key_column"`
	KeyValue  string `json:"key_value"`
}
|
|
|
|
// SubjectAuditRow mirrors crates/shared/src/types.rs::SubjectAuditRow.
|
|
// JSON field order does NOT matter (canonicalizer sorts at hash time)
|
|
// but field names + types MUST match for round-trip.
|
|
type SubjectAuditRow struct {
|
|
Schema string `json:"schema"`
|
|
Ts time.Time `json:"ts"`
|
|
CandidateID string `json:"candidate_id"`
|
|
Accessor AuditAccessor `json:"accessor"`
|
|
FieldsAccessed []string `json:"fields_accessed"`
|
|
Result string `json:"result"`
|
|
PrevChainHash string `json:"prev_chain_hash"`
|
|
RowHmac string `json:"row_hmac"`
|
|
}
|
|
|
|
// AuditAccessor is the "accessor" object of an audit row; it mirrors
// crates/shared/src/types.rs::AuditAccessor.
//
// IMPORTANT: none of the tags — trace_id in particular — may carry
// omitempty. Rust's serde always writes trace_id, even when it is the
// empty string (`#[serde(default)]` only affects READS). Dropping the
// field on the Go side yields canonical bytes that differ from what the
// writer hashed, i.e. an HMAC mismatch on real production rows. (Caught
// 2026-05-03 by subject_audit_parity.sh on a live WORKER-1 row with
// trace_id="".)
type AuditAccessor struct {
	Kind    string `json:"kind"`
	Daemon  string `json:"daemon"`
	Purpose string `json:"purpose"`
	TraceID string `json:"trace_id"` // always emitted, even when ""
}
|
|
|
|
// LoadSubjectManifest reads + parses a subject manifest JSON file.
|
|
func LoadSubjectManifest(path string) (*SubjectManifest, error) {
|
|
bytes, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read manifest %s: %w", path, err)
|
|
}
|
|
var m SubjectManifest
|
|
if err := json.Unmarshal(bytes, &m); err != nil {
|
|
return nil, fmt.Errorf("parse manifest %s: %w", path, err)
|
|
}
|
|
return &m, nil
|
|
}
|
|
|
|
// AuditLogEntry is one parsed audit row PLUS the raw line bytes from
|
|
// the file. The raw bytes are load-bearing: we MUST canonicalize from
|
|
// them (not from the re-marshaled struct) to avoid time-precision drift.
|
|
//
|
|
// Why: Rust's chrono RFC3339-AutoSi serializes nanoseconds with 9 digits
|
|
// (e.g. "461439210"). Go's time.RFC3339Nano strips trailing zeros (e.g.
|
|
// "46143921"). Round-tripping through time.Time changes the byte sequence
|
|
// and breaks the HMAC. Read once, hash from the original bytes.
|
|
// (Caught 2026-05-03 by subject_audit_parity.sh on a real WORKER-5 row
|
|
// whose nanoseconds happened to end in 0.)
|
|
type AuditLogEntry struct {
|
|
Row SubjectAuditRow
|
|
Raw []byte
|
|
}
|
|
|
|
// ReadAuditLog parses an audit JSONL file. Returns one AuditLogEntry
|
|
// per non-empty line. Defensive: blank lines are skipped, unparseable
|
|
// lines are skipped (matches Rust read_rows_in_range).
|
|
func ReadAuditLog(path string) ([]AuditLogEntry, error) {
|
|
bytes, err := os.ReadFile(path)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return nil, nil
|
|
}
|
|
return nil, fmt.Errorf("read audit log %s: %w", path, err)
|
|
}
|
|
var entries []AuditLogEntry
|
|
for _, line := range strings.Split(string(bytes), "\n") {
|
|
trimmed := strings.TrimSpace(line)
|
|
if trimmed == "" {
|
|
continue
|
|
}
|
|
var row SubjectAuditRow
|
|
if err := json.Unmarshal([]byte(trimmed), &row); err != nil {
|
|
continue
|
|
}
|
|
entries = append(entries, AuditLogEntry{Row: row, Raw: []byte(trimmed)})
|
|
}
|
|
return entries, nil
|
|
}
|
|
|
|
// canonicalRowBytesFromRaw is the production canonicalizer used by
|
|
// VerifyChain. It accepts the raw line bytes from the audit JSONL,
|
|
// drops the row_hmac key, sorts all keys alphabetically at every
|
|
// nesting depth, and re-emits compact JSON. Numbers + strings pass
|
|
// through verbatim (json.Number preserves number text, strings are
|
|
// re-JSON-escaped identically to Rust's serde_json).
|
|
//
|
|
// CRITICAL: hashing must always go through this function. NEVER hash
|
|
// the bytes produced by canonicalRowBytesFromStruct against a real
|
|
// production chain — the time-precision drift will give wrong HMACs
|
|
// for any row whose nanoseconds end in 0.
|
|
func canonicalRowBytesFromRaw(rawLine []byte) ([]byte, error) {
|
|
dec := json.NewDecoder(strings.NewReader(string(rawLine)))
|
|
dec.UseNumber()
|
|
var v any
|
|
if err := dec.Decode(&v); err != nil {
|
|
return nil, fmt.Errorf("decode raw line: %w", err)
|
|
}
|
|
if obj, ok := v.(map[string]any); ok {
|
|
delete(obj, "row_hmac")
|
|
}
|
|
var buf strings.Builder
|
|
if err := writeCanonical(&buf, v); err != nil {
|
|
return nil, err
|
|
}
|
|
return []byte(buf.String()), nil
|
|
}
|
|
|
|
// canonicalRowBytesFromStruct is for the parity probe's known-answer
|
|
// vector + tests where we WANT to canonicalize a Go-built row from
|
|
// scratch (no original bytes exist yet). Production verification uses
|
|
// canonicalRowBytesFromRaw against the file bytes; this function is
|
|
// only safe for synthetic inputs whose ts has no trailing-zero nanos.
|
|
func canonicalRowBytesFromStruct(row *SubjectAuditRow) ([]byte, error) {
|
|
raw, err := json.Marshal(row)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("marshal row: %w", err)
|
|
}
|
|
return canonicalRowBytesFromRaw(raw)
|
|
}
|
|
|
|
// writeCanonical recursively writes v to buf as canonical JSON: object
// keys sorted in byte order, no insignificant whitespace. Arrays preserve
// element order (semantically significant per spec §3).
//
// Supported inputs are the shapes produced by decoding JSON into `any`
// with UseNumber: map[string]any, []any, string, json.Number, bool, nil.
//
// Object keys and string values are written by writeCanonicalString rather
// than json.Marshal. encoding/json rewrites '<', '>' and '&' as \u003c,
// \u003e and \u0026 and always escapes U+2028 / U+2029, whereas serde_json's
// default formatter emits all of those verbatim. Hashing json.Marshal
// output would therefore produce a false HMAC mismatch on any row whose
// text contains one of those characters (e.g. a purpose of "R&D").
//
// Returns an error only if a non-string scalar cannot be marshaled.
func writeCanonical(buf *strings.Builder, v any) error {
	switch t := v.(type) {
	case map[string]any:
		keys := make([]string, 0, len(t))
		for k := range t {
			keys = append(keys, k)
		}
		sort.Strings(keys)
		buf.WriteByte('{')
		for i, k := range keys {
			if i > 0 {
				buf.WriteByte(',')
			}
			writeCanonicalString(buf, k)
			buf.WriteByte(':')
			if err := writeCanonical(buf, t[k]); err != nil {
				return err
			}
		}
		buf.WriteByte('}')
	case []any:
		buf.WriteByte('[')
		for i, elem := range t {
			if i > 0 {
				buf.WriteByte(',')
			}
			if err := writeCanonical(buf, elem); err != nil {
				return err
			}
		}
		buf.WriteByte(']')
	case string:
		writeCanonicalString(buf, t)
	default:
		// json.Number, bool, nil: encoding/json emits the number text,
		// true/false and null with no escaping concerns.
		bs, err := json.Marshal(v)
		if err != nil {
			return fmt.Errorf("marshal scalar: %w", err)
		}
		buf.Write(bs)
	}
	return nil
}

// writeCanonicalString writes s as a JSON string literal using the escape
// set of serde_json's default formatter: '"' and '\\' are backslash-escaped,
// \b \f \n \r \t use their short forms, any other byte below 0x20 becomes
// \u00XX with lowercase hex, and every other byte (including '<', '>',
// '&', DEL and all multi-byte UTF-8 sequences) is copied through unchanged.
func writeCanonicalString(buf *strings.Builder, s string) {
	const hexDigits = "0123456789abcdef"
	buf.WriteByte('"')
	start := 0 // beginning of the pending run of bytes that need no escaping
	for i := 0; i < len(s); i++ {
		c := s[i]
		var short byte
		switch c {
		case '"', '\\':
			short = c
		case '\b':
			short = 'b'
		case '\f':
			short = 'f'
		case '\n':
			short = 'n'
		case '\r':
			short = 'r'
		case '\t':
			short = 't'
		default:
			if c >= 0x20 {
				continue // plain byte; stays in the pending run
			}
		}
		buf.WriteString(s[start:i])
		if short != 0 {
			buf.WriteByte('\\')
			buf.WriteByte(short)
		} else {
			buf.WriteString(`\u00`)
			buf.WriteByte(hexDigits[c>>4])
			buf.WriteByte(hexDigits[c&0x0f])
		}
		start = i + 1
	}
	buf.WriteString(s[start:])
	buf.WriteByte('"')
}
|
|
|
|
// CanonicalAndHmac is a helper for the parity probe: canonicalizes the
|
|
// row from scratch (struct → JSON → canonical), computes the HMAC
|
|
// against the given prev_hash, returns both. Used only for the
|
|
// known-answer fixture; production verification uses VerifyChain
|
|
// (which canonicalizes from the original file bytes to dodge time
|
|
// precision drift).
|
|
func CanonicalAndHmac(row *SubjectAuditRow, key []byte, prevHash string) (canonical []byte, hmacHex string, err error) {
|
|
canonical, err = canonicalRowBytesFromStruct(row)
|
|
if err != nil {
|
|
return nil, "", err
|
|
}
|
|
hmacHex = computeRowHMAC(key, prevHash, canonical)
|
|
return canonical, hmacHex, nil
|
|
}
|
|
|
|
// computeRowHMAC returns HMAC-SHA256(key, prevHash || canonical) as 64
// lowercase hex characters. prevHash contributes its bytes as-is (it is
// not hex-decoded) and is immediately followed by the canonical row bytes.
func computeRowHMAC(key []byte, prevHash string, canonical []byte) string {
	signer := hmac.New(sha256.New, key)
	// One write of the concatenated message; the conversion yields a fresh
	// slice, so appending cannot alias the caller's data.
	signer.Write(append([]byte(prevHash), canonical...))

	digest := signer.Sum(nil)
	encoded := make([]byte, hex.EncodedLen(len(digest)))
	hex.Encode(encoded, digest)
	return string(encoded)
}
|
|
|
|
// VerifyChain replays the chain for a subject's audit log. Returns the
|
|
// number of rows verified, the chain tip (last row's row_hmac, or
|
|
// GENESIS when log is empty), and an error describing the first chain
|
|
// break found.
|
|
//
|
|
// CRITICAL: canonicalizes from each entry's RAW LINE BYTES, not from
|
|
// the parsed struct. Round-tripping through encoding/json + time.Time
|
|
// strips trailing zeros from RFC3339 nanos and produces different
|
|
// canonical bytes than the Rust writer used (which used chrono's
|
|
// AutoSi 9-digit format). Catches: 2026-05-03 WORKER-5 nanos
|
|
// 461439210 → Rust emits 9 digits, Go's time.RFC3339Nano emits 8.
|
|
//
|
|
// Special case: empty/missing log returns (0, GENESIS, nil).
|
|
func VerifyChain(entries []AuditLogEntry, key []byte) (count int, chainTip string, err error) {
|
|
prev := GenesisHash
|
|
for i, entry := range entries {
|
|
if entry.Row.PrevChainHash != prev {
|
|
return i, prev, fmt.Errorf("chain break at row %d: prev_chain_hash=%q expected=%q",
|
|
i+1, entry.Row.PrevChainHash, prev)
|
|
}
|
|
claimed := entry.Row.RowHmac
|
|
canonical, cerr := canonicalRowBytesFromRaw(entry.Raw)
|
|
if cerr != nil {
|
|
return i, prev, fmt.Errorf("canonicalize row %d: %w", i+1, cerr)
|
|
}
|
|
recomputed := computeRowHMAC(key, prev, canonical)
|
|
if recomputed != claimed {
|
|
return i, prev, fmt.Errorf("hmac mismatch at row %d: stored=%s recomputed=%s",
|
|
i+1, claimed, recomputed)
|
|
}
|
|
prev = claimed
|
|
count = i + 1
|
|
}
|
|
chainTip = prev
|
|
return count, chainTip, nil
|
|
}
|
|
|
|
// LoadKeyFile returns the full contents of the signing key file at path.
//
// Keys shorter than 32 bytes are refused with an error (matches Rust: the
// 2026-05-03 kimi BLOCK fix lifted the minimum from 16 to 32 to align with
// HMAC-SHA256 best practice). The bytes are returned exactly as read; no
// trimming or decoding is applied.
func LoadKeyFile(path string) ([]byte, error) {
	const minKeyLen = 32

	material, readErr := os.ReadFile(path)
	if readErr != nil {
		return nil, fmt.Errorf("read key file %s: %w", path, readErr)
	}
	if n := len(material); n < minKeyLen {
		return nil, fmt.Errorf("signing key is %d bytes; recommend ≥32 bytes for HMAC-SHA256", n)
	}
	return material, nil
}
|