// Subject manifests + per-subject audit-log chain verification.
//
// Specification: /home/profit/lakehouse/docs/specs/SUBJECT_MANIFESTS_ON_CATALOGD.md.
//
// This file is the Go side of Step 8 (cross-runtime parity). The Rust
// side is the writer (crates/catalogd/src/subject_audit.rs); Go is a
// READER + VERIFIER only — both sides serialize / hash with identical
// algorithms so a chain written by Rust verifies under Go.
//
// Algorithm (must match Rust crates/catalogd/src/subject_audit.rs exactly):
//
//	row_hmac = HMAC-SHA256(key, prev_chain_hash_bytes || canonical_row_bytes)
//
// where:
//   - prev_chain_hash_bytes is the ASCII bytes of the previous row's
//     row_hmac field, OR the literal ASCII string "GENESIS" for the
//     first row (no hex decode!)
//   - canonical_row_bytes is the row JSON with the row_hmac field
//     dropped, with all object keys sorted alphabetically at every
//     nesting depth, with no insignificant whitespace
//   - the final hash is rendered as 64 lowercase hex characters
package catalogd

import (
	"bytes"
	"crypto/hmac"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"os"
	"sort"
	"strings"
	"time"
)

// GenesisHash is the literal ASCII string that stands in for the
// previous row's hash on the first row of a chain. VerifyChain expects
// the first row's prev_chain_hash to equal it and returns it as the
// chain tip of an empty log.
const GenesisHash = "GENESIS"

// SubjectManifest mirrors crates/shared/src/types.rs::SubjectManifest.
// Keep field tags byte-identical so manifest JSON written by Rust round-trips.
type SubjectManifest struct {
	Schema            string              `json:"schema"`
	CandidateID       string              `json:"candidate_id"`
	CreatedAt         time.Time           `json:"created_at"`
	UpdatedAt         time.Time           `json:"updated_at"`
	Status            string              `json:"status"`
	Vertical          string              `json:"vertical,omitempty"`
	Consent           SubjectConsent      `json:"consent"`
	Retention         SubjectRetention    `json:"retention"`
	Datasets          []SubjectDatasetRef `json:"datasets"`
	SafeViews         []string            `json:"safe_views"`
	AuditLogPath      string              `json:"audit_log_path"`
	AuditLogChainRoot string              `json:"audit_log_chain_root"`
}

// SubjectConsent is the manifest's "consent" object. It carries two
// independent consent records: one for general PII and one for
// biometric data.
type SubjectConsent struct {
	GeneralPii GeneralPiiConsent `json:"general_pii"`
	Biometric  BiometricConsent  `json:"biometric"`
}

// GeneralPiiConsent is the "general_pii" consent record. Status is
// always serialized; Version, GivenAt and WithdrawnAt are optional and
// are omitted from the JSON when empty / nil.
type GeneralPiiConsent struct {
	Status      string     `json:"status"`
	Version     string     `json:"version,omitempty"`
	GivenAt     *time.Time `json:"given_at,omitempty"`
	WithdrawnAt *time.Time `json:"withdrawn_at,omitempty"`
}

// BiometricConsent is the "biometric" consent record. RetentionUntil
// is optional and omitted from the JSON when nil.
type BiometricConsent struct {
	Status         string     `json:"status"`
	RetentionUntil *time.Time `json:"retention_until,omitempty"`
}

// SubjectRetention is the manifest's "retention" object.
// GeneralPiiUntil is always serialized; Policy is omitted when empty.
type SubjectRetention struct {
	GeneralPiiUntil time.Time `json:"general_pii_until"`
	Policy          string    `json:"policy,omitempty"`
}

// SubjectDatasetRef is one element of the manifest's "datasets" array:
// a dataset name plus a key column / key value pair.
// NOTE(review): presumably the pair locates the subject's rows inside
// the named dataset — confirm against the specification.
type SubjectDatasetRef struct {
	Name      string `json:"name"`
	KeyColumn string `json:"key_column"`
	KeyValue  string `json:"key_value"`
}

// SubjectAuditRow mirrors crates/shared/src/types.rs::SubjectAuditRow.
// JSON field order does NOT matter (canonicalizer sorts at hash time)
// but field names + types MUST match for round-trip.
type SubjectAuditRow struct {
	Schema         string        `json:"schema"`
	Ts             time.Time     `json:"ts"`
	CandidateID    string        `json:"candidate_id"`
	Accessor       AuditAccessor `json:"accessor"`
	FieldsAccessed []string      `json:"fields_accessed"`
	Result         string        `json:"result"`
	PrevChainHash  string        `json:"prev_chain_hash"`
	RowHmac        string        `json:"row_hmac"`
}

// AuditAccessor mirrors crates/shared/src/types.rs::AuditAccessor.
//
// IMPORTANT: trace_id has NO omitempty — Rust's serde always writes
// the field (even empty) because `#[serde(default)]` only affects
// READS.
Stripping it here produces different canonical bytes than // the writer used → HMAC mismatch on real production rows. (Caught // 2026-05-03 by subject_audit_parity.sh on a live WORKER-1 row with // trace_id="".) type AuditAccessor struct { Kind string `json:"kind"` Daemon string `json:"daemon"` Purpose string `json:"purpose"` TraceID string `json:"trace_id"` } // LoadSubjectManifest reads + parses a subject manifest JSON file. func LoadSubjectManifest(path string) (*SubjectManifest, error) { bytes, err := os.ReadFile(path) if err != nil { return nil, fmt.Errorf("read manifest %s: %w", path, err) } var m SubjectManifest if err := json.Unmarshal(bytes, &m); err != nil { return nil, fmt.Errorf("parse manifest %s: %w", path, err) } return &m, nil } // AuditLogEntry is one parsed audit row PLUS the raw line bytes from // the file. The raw bytes are load-bearing: we MUST canonicalize from // them (not from the re-marshaled struct) to avoid time-precision drift. // // Why: Rust's chrono RFC3339-AutoSi serializes nanoseconds with 9 digits // (e.g. "461439210"). Go's time.RFC3339Nano strips trailing zeros (e.g. // "46143921"). Round-tripping through time.Time changes the byte sequence // and breaks the HMAC. Read once, hash from the original bytes. // (Caught 2026-05-03 by subject_audit_parity.sh on a real WORKER-5 row // whose nanoseconds happened to end in 0.) type AuditLogEntry struct { Row SubjectAuditRow Raw []byte } // ReadAuditLog parses an audit JSONL file. Returns one AuditLogEntry // per non-empty line. Defensive: blank lines are skipped, unparseable // lines are skipped (matches Rust read_rows_in_range). 
func ReadAuditLog(path string) ([]AuditLogEntry, error) { bytes, err := os.ReadFile(path) if err != nil { if os.IsNotExist(err) { return nil, nil } return nil, fmt.Errorf("read audit log %s: %w", path, err) } var entries []AuditLogEntry for _, line := range strings.Split(string(bytes), "\n") { trimmed := strings.TrimSpace(line) if trimmed == "" { continue } var row SubjectAuditRow if err := json.Unmarshal([]byte(trimmed), &row); err != nil { continue } entries = append(entries, AuditLogEntry{Row: row, Raw: []byte(trimmed)}) } return entries, nil } // canonicalRowBytesFromRaw is the production canonicalizer used by // VerifyChain. It accepts the raw line bytes from the audit JSONL, // drops the row_hmac key, sorts all keys alphabetically at every // nesting depth, and re-emits compact JSON. Numbers + strings pass // through verbatim (json.Number preserves number text, strings are // re-JSON-escaped identically to Rust's serde_json). // // CRITICAL: hashing must always go through this function. NEVER hash // the bytes produced by canonicalRowBytesFromStruct against a real // production chain — the time-precision drift will give wrong HMACs // for any row whose nanoseconds end in 0. func canonicalRowBytesFromRaw(rawLine []byte) ([]byte, error) { dec := json.NewDecoder(strings.NewReader(string(rawLine))) dec.UseNumber() var v any if err := dec.Decode(&v); err != nil { return nil, fmt.Errorf("decode raw line: %w", err) } if obj, ok := v.(map[string]any); ok { delete(obj, "row_hmac") } var buf strings.Builder if err := writeCanonical(&buf, v); err != nil { return nil, err } return []byte(buf.String()), nil } // canonicalRowBytesFromStruct is for the parity probe's known-answer // vector + tests where we WANT to canonicalize a Go-built row from // scratch (no original bytes exist yet). Production verification uses // canonicalRowBytesFromRaw against the file bytes; this function is // only safe for synthetic inputs whose ts has no trailing-zero nanos. 
func canonicalRowBytesFromStruct(row *SubjectAuditRow) ([]byte, error) { raw, err := json.Marshal(row) if err != nil { return nil, fmt.Errorf("marshal row: %w", err) } return canonicalRowBytesFromRaw(raw) } // marshalNoEscapeHTML wraps json.Encoder with HTML escaping disabled. // // Why: Go's json.Marshal escapes `<`, `>`, `&` to `<`, `>`, // `&` by default. Rust's serde_json::to_vec keeps them literal. // Any string field containing one of those characters would produce // different canonical bytes across runtimes → broken HMAC chain. // (Caught 2026-05-03 by opus scrum WARN on parity_subject_audit.rs: // canonical_json — initially undetected because no production audit // field contained `<>&`, but realistic for purpose strings like // "error & retry" or trace_id "".) // // Also strips the trailing newline json.Encoder appends — that newline // is meaningful to JSONL consumers but is junk for hash input. func marshalNoEscapeHTML(v any) ([]byte, error) { var buf bytes.Buffer enc := json.NewEncoder(&buf) enc.SetEscapeHTML(false) if err := enc.Encode(v); err != nil { return nil, err } out := buf.Bytes() return bytes.TrimRight(out, "\n"), nil } // writeCanonical recursively writes v as canonical JSON: object keys // sorted alphabetically, no insignificant whitespace. Arrays preserve // element order (semantically significant per spec §3). // // All scalar emission goes through marshalNoEscapeHTML so the byte // sequence matches Rust's serde_json output character-for-character. 
func writeCanonical(buf *strings.Builder, v any) error { switch t := v.(type) { case map[string]any: keys := make([]string, 0, len(t)) for k := range t { keys = append(keys, k) } sort.Strings(keys) buf.WriteByte('{') for i, k := range keys { if i > 0 { buf.WriteByte(',') } ks, err := marshalNoEscapeHTML(k) if err != nil { return fmt.Errorf("marshal key: %w", err) } buf.Write(ks) buf.WriteByte(':') if err := writeCanonical(buf, t[k]); err != nil { return err } } buf.WriteByte('}') case []any: buf.WriteByte('[') for i, elem := range t { if i > 0 { buf.WriteByte(',') } if err := writeCanonical(buf, elem); err != nil { return err } } buf.WriteByte(']') default: bs, err := marshalNoEscapeHTML(v) if err != nil { return fmt.Errorf("marshal scalar: %w", err) } buf.Write(bs) } return nil } // CanonicalAndHmac is a helper for the parity probe: canonicalizes the // row from scratch (struct → JSON → canonical), computes the HMAC // against the given prev_hash, returns both. Used only for the // known-answer fixture; production verification uses VerifyChain // (which canonicalizes from the original file bytes to dodge time // precision drift). func CanonicalAndHmac(row *SubjectAuditRow, key []byte, prevHash string) (canonical []byte, hmacHex string, err error) { canonical, err = canonicalRowBytesFromStruct(row) if err != nil { return nil, "", err } hmacHex = computeRowHMAC(key, prevHash, canonical) return canonical, hmacHex, nil } // computeRowHMAC = HMAC-SHA256(key, prev_hash_ascii_bytes || canonical_row_bytes) // rendered as 64-char lowercase hex. func computeRowHMAC(key []byte, prevHash string, canonical []byte) string { mac := hmac.New(sha256.New, key) mac.Write([]byte(prevHash)) mac.Write(canonical) return hex.EncodeToString(mac.Sum(nil)) } // VerifyChain replays the chain for a subject's audit log. Returns the // number of rows verified, the chain tip (last row's row_hmac, or // GENESIS when log is empty), and an error describing the first chain // break found. 
// // CRITICAL: canonicalizes from each entry's RAW LINE BYTES, not from // the parsed struct. Round-tripping through encoding/json + time.Time // strips trailing zeros from RFC3339 nanos and produces different // canonical bytes than the Rust writer used (which used chrono's // AutoSi 9-digit format). Catches: 2026-05-03 WORKER-5 nanos // 461439210 → Rust emits 9 digits, Go's time.RFC3339Nano emits 8. // // Special case: empty/missing log returns (0, GENESIS, nil). func VerifyChain(entries []AuditLogEntry, key []byte) (count int, chainTip string, err error) { prev := GenesisHash for i, entry := range entries { if entry.Row.PrevChainHash != prev { return i, prev, fmt.Errorf("chain break at row %d: prev_chain_hash=%q expected=%q", i+1, entry.Row.PrevChainHash, prev) } claimed := entry.Row.RowHmac canonical, cerr := canonicalRowBytesFromRaw(entry.Raw) if cerr != nil { return i, prev, fmt.Errorf("canonicalize row %d: %w", i+1, cerr) } recomputed := computeRowHMAC(key, prev, canonical) if recomputed != claimed { return i, prev, fmt.Errorf("hmac mismatch at row %d: stored=%s recomputed=%s", i+1, claimed, recomputed) } prev = claimed count = i + 1 } chainTip = prev return count, chainTip, nil } // LoadKeyFile reads a signing key file. Refuses keys shorter than 32 // bytes (matches Rust: 2026-05-03 kimi BLOCK fix lifted the minimum // from 16 to 32 to align with HMAC-SHA256 best practice). func LoadKeyFile(path string) ([]byte, error) { key, err := os.ReadFile(path) if err != nil { return nil, fmt.Errorf("read key file %s: %w", path, err) } if len(key) < 32 { return nil, fmt.Errorf("signing key is %d bytes; recommend ≥32 bytes for HMAC-SHA256", len(key)) } return key, nil }