root 857ca4c971 catalogd: HTML-safe escape fix + decisions tracker entry
Per 2026-05-03 step_7_8_retention_and_parity scrum (opus WARN on
parity_subject_audit.rs:canonical_json):

Go's json.Marshal HTML-escapes < > & to \u003c \u003e \u0026 by
default. Rust's serde_json::to_vec keeps them literal. Any audit
row with these chars in any string field would silently produce
different canonical bytes across runtimes → broken HMAC chain.
Latent because no production audit field has carried <>& yet, but
realistic for purpose strings ("error & retry") or trace_id values
("<HTTP-Request-Id>").

Fix: marshalNoEscapeHTML helper wraps json.Encoder.SetEscapeHTML(false)
+ trims trailing newline. Routed through writeCanonical for both
keys and scalar values.

Regression test: TestVerifyChain_HtmlChars_NotEscaped (purpose has &,
trace_id has <>) asserts the canonical bytes contain literal chars,
not escape sequences.

11 unit tests pass including the new one; parity probe still 6/6
byte-identical against live production audit logs.

Decisions tracker: added 2026-05-03 entry for SUBJECT_MANIFESTS_ON_CATALOGD
Steps 1-8 closure + 6th cross-runtime parity probe (was 5).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 04:29:53 -05:00

356 lines
13 KiB
Go

// Subject manifests + per-subject audit-log chain verification.
//
// Specification: /home/profit/lakehouse/docs/specs/SUBJECT_MANIFESTS_ON_CATALOGD.md.
//
// This file is the Go side of Step 8 (cross-runtime parity). The Rust
// side is the writer (crates/catalogd/src/subject_audit.rs); Go is a
// READER + VERIFIER only — both sides serialize / hash with identical
// algorithms so a chain written by Rust verifies under Go.
//
// Algorithm (must match Rust crates/catalogd/src/subject_audit.rs exactly):
// row_hmac = HMAC-SHA256(key, prev_chain_hash_bytes || canonical_row_bytes)
//
// where:
// - prev_chain_hash_bytes is the ASCII bytes of the previous row's
// row_hmac field, OR the literal ASCII string "GENESIS" for the
// first row (no hex decode!)
// - canonical_row_bytes is the row JSON with the row_hmac field
// dropped, with all object keys sorted alphabetically at every
// nesting depth, with no insignificant whitespace
// - the final hash is rendered as 64 lowercase hex characters
package catalogd
import (
"bytes"
"crypto/hmac"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"os"
"sort"
"strings"
"time"
)
// GenesisHash is the literal ASCII string fed to the HMAC in place of a
// previous row_hmac when hashing the first row of a chain. VerifyChain
// also reports it as the chain tip when there are no rows to verify.
const GenesisHash = "GENESIS"
// SubjectManifest mirrors crates/shared/src/types.rs::SubjectManifest.
// Keep field tags byte-identical so manifest JSON written by Rust round-trips.
//
// LoadSubjectManifest decodes a manifest file into this type. When the
// struct is marshaled, every field is always emitted except Vertical,
// which is dropped when empty (omitempty). Nil Datasets / SafeViews
// slices marshal as JSON null rather than [].
type SubjectManifest struct {
	Schema string `json:"schema"`
	CandidateID string `json:"candidate_id"`
	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
	Status string `json:"status"`
	// Optional: omitted from the JSON when empty.
	Vertical string `json:"vertical,omitempty"`
	Consent SubjectConsent `json:"consent"`
	Retention SubjectRetention `json:"retention"`
	Datasets []SubjectDatasetRef `json:"datasets"`
	SafeViews []string `json:"safe_views"`
	// NOTE(review): presumably the location of the JSONL file that
	// ReadAuditLog parses — confirm against the spec.
	AuditLogPath string `json:"audit_log_path"`
	// NOTE(review): presumably a chain hash for the audit log; which row
	// it refers to is not visible in this file — confirm against the spec.
	AuditLogChainRoot string `json:"audit_log_chain_root"`
}
// SubjectConsent is the "consent" object of a SubjectManifest. It holds
// one record for general PII and one for biometric data; both are always
// emitted when marshaled.
type SubjectConsent struct {
	GeneralPii GeneralPiiConsent `json:"general_pii"`
	Biometric BiometricConsent `json:"biometric"`
}
// GeneralPiiConsent is the "general_pii" object inside SubjectConsent.
// Only Status is always emitted; the remaining fields are optional and
// are omitted from the JSON when empty (Version) or nil (timestamps).
type GeneralPiiConsent struct {
	Status string `json:"status"`
	Version string `json:"version,omitempty"`
	// Pointer so that an absent timestamp stays absent (nil) instead of
	// becoming the zero time.
	GivenAt *time.Time `json:"given_at,omitempty"`
	// Pointer for the same reason as GivenAt.
	WithdrawnAt *time.Time `json:"withdrawn_at,omitempty"`
}
// BiometricConsent is the "biometric" object inside SubjectConsent.
// Status is always emitted; RetentionUntil is optional.
type BiometricConsent struct {
	Status string `json:"status"`
	// Nil when absent; omitted from the JSON in that case.
	RetentionUntil *time.Time `json:"retention_until,omitempty"`
}
// SubjectRetention is the "retention" object of a SubjectManifest.
// GeneralPiiUntil is a plain time.Time and is therefore always emitted;
// Policy is omitted from the JSON when empty.
type SubjectRetention struct {
	GeneralPiiUntil time.Time `json:"general_pii_until"`
	Policy string `json:"policy,omitempty"`
}
// SubjectDatasetRef is one element of SubjectManifest.Datasets. All three
// fields are always emitted when marshaled.
//
// NOTE(review): presumably it identifies the subject's rows in dataset
// Name as those where KeyColumn equals KeyValue — confirm against the spec.
type SubjectDatasetRef struct {
	Name string `json:"name"`
	KeyColumn string `json:"key_column"`
	KeyValue string `json:"key_value"`
}
// SubjectAuditRow mirrors crates/shared/src/types.rs::SubjectAuditRow.
// JSON field order does NOT matter (canonicalizer sorts at hash time)
// but field names + types MUST match for round-trip.
//
// This is the decoded view of one audit JSONL line. VerifyChain reads
// only PrevChainHash and RowHmac from it; the bytes that get hashed come
// from the raw line, not from re-marshaling this struct.
type SubjectAuditRow struct {
	Schema string `json:"schema"`
	Ts time.Time `json:"ts"`
	CandidateID string `json:"candidate_id"`
	Accessor AuditAccessor `json:"accessor"`
	FieldsAccessed []string `json:"fields_accessed"`
	Result string `json:"result"`
	// Must equal the previous row's RowHmac, or GenesisHash for the first
	// row; VerifyChain reports a chain break otherwise.
	PrevChainHash string `json:"prev_chain_hash"`
	// The stored HMAC for this row. The "row_hmac" key is removed before
	// the row is canonicalized and hashed.
	RowHmac string `json:"row_hmac"`
}
// AuditAccessor mirrors crates/shared/src/types.rs::AuditAccessor.
//
// IMPORTANT: trace_id has NO omitempty — Rust's serde always writes
// the field (even empty) because `#[serde(default)]` only affects
// READS. Stripping it here produces different canonical bytes than
// the writer used → HMAC mismatch on real production rows. (Caught
// 2026-05-03 by subject_audit_parity.sh on a live WORKER-1 row with
// trace_id="".)
type AuditAccessor struct {
	Kind string `json:"kind"`
	Daemon string `json:"daemon"`
	Purpose string `json:"purpose"`
	// Deliberately always emitted, even when empty — do not add omitempty
	// (see the type comment).
	TraceID string `json:"trace_id"`
}
// LoadSubjectManifest loads the subject manifest stored at path.
//
// The file is read in full and decoded as JSON into a SubjectManifest.
// A read failure or a parse failure is returned wrapped together with the
// offending path; in both cases the returned manifest is nil.
func LoadSubjectManifest(path string) (*SubjectManifest, error) {
	data, readErr := os.ReadFile(path)
	if readErr != nil {
		return nil, fmt.Errorf("read manifest %s: %w", path, readErr)
	}

	manifest := new(SubjectManifest)
	if parseErr := json.Unmarshal(data, manifest); parseErr != nil {
		return nil, fmt.Errorf("parse manifest %s: %w", path, parseErr)
	}
	return manifest, nil
}
// AuditLogEntry is one parsed audit row PLUS the raw line bytes from
// the file. The raw bytes are load-bearing: we MUST canonicalize from
// them (not from the re-marshaled struct) to avoid time-precision drift.
//
// Why: Rust's chrono RFC3339-AutoSi serializes nanoseconds with 9 digits
// (e.g. "461439210"). Go's time.RFC3339Nano strips trailing zeros (e.g.
// "46143921"). Round-tripping through time.Time changes the byte sequence
// and breaks the HMAC. Read once, hash from the original bytes.
// (Caught 2026-05-03 by subject_audit_parity.sh on a real WORKER-5 row
// whose nanoseconds happened to end in 0.)
type AuditLogEntry struct {
	// Decoded form of the line; VerifyChain reads PrevChainHash and
	// RowHmac from here.
	Row SubjectAuditRow
	// The line exactly as stored in the file, minus surrounding
	// whitespace (ReadAuditLog trims it). This is the hash input source.
	Raw []byte
}
// ReadAuditLog loads an audit JSONL file and returns one AuditLogEntry for
// every line that decodes as a SubjectAuditRow.
//
// Behavior notes:
//   - A missing file is not an error: the result is (nil, nil).
//   - Any other read failure is returned wrapped with the path.
//   - Lines are split on "\n" and whitespace-trimmed; lines that are blank
//     after trimming are skipped.
//   - Lines that fail to decode are skipped silently (matches Rust
//     read_rows_in_range).
//
// Each entry keeps the trimmed line bytes in Raw alongside the decoded Row.
func ReadAuditLog(path string) ([]AuditLogEntry, error) {
	data, err := os.ReadFile(path)
	switch {
	case err == nil:
		// fall through to parsing
	case os.IsNotExist(err):
		return nil, nil
	default:
		return nil, fmt.Errorf("read audit log %s: %w", path, err)
	}

	var entries []AuditLogEntry
	rest := string(data)
	for len(rest) > 0 {
		var line string
		line, rest, _ = strings.Cut(rest, "\n")

		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		raw := []byte(line)
		var row SubjectAuditRow
		if json.Unmarshal(raw, &row) != nil {
			// Defensive: an unparseable line is dropped, not fatal.
			continue
		}
		entries = append(entries, AuditLogEntry{Row: row, Raw: raw})
	}
	return entries, nil
}
// canonicalRowBytesFromRaw is the production canonicalizer used by
// VerifyChain. It accepts the raw line bytes from the audit JSONL,
// drops the top-level row_hmac key, sorts all keys alphabetically at
// every nesting depth, and re-emits compact JSON via writeCanonical.
// Number text is preserved verbatim (the decoder runs with UseNumber);
// strings are decoded and re-escaped by the scalar encoder that
// writeCanonical uses.
//
// rawLine must contain exactly one JSON value, optionally surrounded by
// whitespace. Anything else after the value is rejected with an error:
// json.Decoder.Decode stops after the first value, so without this check
// a line with trailing junk would canonicalize (and therefore verify)
// exactly like the clean line.
//
// CRITICAL: hashing must always go through this function. NEVER hash
// the bytes produced by canonicalRowBytesFromStruct against a real
// production chain — the time-precision drift will give wrong HMACs
// for any row whose nanoseconds end in 0.
func canonicalRowBytesFromRaw(rawLine []byte) ([]byte, error) {
	// bytes.NewReader reads the slice in place; no string copy needed.
	dec := json.NewDecoder(bytes.NewReader(rawLine))
	dec.UseNumber()

	var v any
	if err := dec.Decode(&v); err != nil {
		return nil, fmt.Errorf("decode raw line: %w", err)
	}

	// InputOffset is the position just past the decoded value. Only
	// whitespace may follow it.
	if off := dec.InputOffset(); len(bytes.TrimSpace(rawLine[off:])) != 0 {
		return nil, fmt.Errorf("decode raw line: unexpected data after JSON value at offset %d", off)
	}

	// Only a top-level object carries row_hmac; other value kinds pass
	// through unchanged.
	if obj, ok := v.(map[string]any); ok {
		delete(obj, "row_hmac")
	}

	var buf strings.Builder
	if err := writeCanonical(&buf, v); err != nil {
		return nil, err
	}
	return []byte(buf.String()), nil
}
// canonicalRowBytesFromStruct canonicalizes a row that was built in Go
// and therefore has no original file bytes. It marshals the struct with
// encoding/json and hands the result to canonicalRowBytesFromRaw.
//
// Intended for the parity probe's known-answer vector and for tests.
// Production verification must canonicalize from the file bytes instead:
// marshaling goes through time.Time, so this function is only safe for
// synthetic inputs whose ts has no trailing-zero nanos.
func canonicalRowBytesFromStruct(row *SubjectAuditRow) ([]byte, error) {
	encoded, err := json.Marshal(row)
	if err == nil {
		return canonicalRowBytesFromRaw(encoded)
	}
	return nil, fmt.Errorf("marshal row: %w", err)
}
// marshalNoEscapeHTML encodes v as JSON using the same string-escaping
// rules as Rust's serde_json, so the bytes can be fed to the HMAC.
//
// Why HTML escaping is disabled: Go's json.Marshal escapes `<`, `>`, `&`
// to `\u003c`, `\u003e`, `\u0026` by default. Rust's serde_json::to_vec
// keeps them literal. Any string field containing one of those characters
// would produce different canonical bytes across runtimes → broken HMAC
// chain. (Caught 2026-05-03 by opus scrum WARN on parity_subject_audit.rs:
// canonical_json — initially undetected because no production audit
// field contained `<>&`, but realistic for purpose strings like
// "error & retry" or trace_id "<HTTP-Request-Id>".)
//
// Why a second pass is needed: SetEscapeHTML(false) does not cover every
// difference. encoding/json escapes U+2028 and U+2029 unconditionally,
// and older Go releases write backspace / form feed as `\u0008` /
// `\u000c`. serde_json escapes only `"`, `\` and control characters below
// U+0020, using the short forms `\b` and `\f`. normalizeSerdeEscapes
// rewrites exactly those four sequences.
//
// The trailing newline json.Encoder appends is stripped — it is
// meaningful to JSONL consumers but is junk for hash input.
//
// Returns the encoder's error unchanged if v cannot be encoded.
func marshalNoEscapeHTML(v any) ([]byte, error) {
	var buf bytes.Buffer
	enc := json.NewEncoder(&buf)
	enc.SetEscapeHTML(false)
	if err := enc.Encode(v); err != nil {
		return nil, err
	}
	out := bytes.TrimSuffix(buf.Bytes(), []byte("\n"))
	return normalizeSerdeEscapes(out), nil
}

// normalizeSerdeEscapes rewrites the escape sequences that encoding/json
// emits but serde_json does not: `\u2028` and `\u2029` become the literal
// UTF-8 characters, `\u0008` becomes `\b`, `\u000c` becomes `\f`. All other
// bytes are copied through. in must be valid JSON produced by
// encoding/json, where a backslash can only occur inside a string.
func normalizeSerdeEscapes(in []byte) []byte {
	if bytes.IndexByte(in, '\\') < 0 {
		return in // nothing is escaped; the common case
	}
	out := make([]byte, 0, len(in))
	for i := 0; i < len(in); {
		if in[i] != '\\' || i+1 >= len(in) {
			out = append(out, in[i])
			i++
			continue
		}
		if in[i+1] != 'u' || i+6 > len(in) {
			// Two-byte escape (`\\`, `\"`, `\n`, ...). Consuming it as a
			// unit keeps an escaped backslash followed by the text
			// "u2028" from being mistaken for a \u escape.
			out = append(out, in[i], in[i+1])
			i += 2
			continue
		}
		switch string(in[i+2 : i+6]) {
		case "2028":
			out = append(out, "\u2028"...)
		case "2029":
			out = append(out, "\u2029"...)
		case "0008":
			out = append(out, '\\', 'b')
		case "000c":
			out = append(out, '\\', 'f')
		default:
			out = append(out, in[i:i+6]...)
		}
		i += 6
	}
	return out
}
// writeCanonical appends the canonical JSON form of v to buf.
//
// Canonical means: object keys are emitted in sorted (sort.Strings)
// order at every nesting depth and no insignificant whitespace is
// written. Array elements keep their original order (semantically
// significant per spec §3).
//
// v is expected to be a decoded JSON value: map[string]any for objects,
// []any for arrays, anything else is treated as a scalar. Keys and
// scalars are both serialized by marshalNoEscapeHTML so the byte sequence
// matches what the Rust writer hashed. An error from that encoder is
// returned wrapped as "marshal key" or "marshal scalar".
func writeCanonical(buf *strings.Builder, v any) error {
	if obj, isObject := v.(map[string]any); isObject {
		names := make([]string, 0, len(obj))
		for name := range obj {
			names = append(names, name)
		}
		sort.Strings(names)

		buf.WriteByte('{')
		for idx, name := range names {
			if idx != 0 {
				buf.WriteByte(',')
			}
			encodedKey, err := marshalNoEscapeHTML(name)
			if err != nil {
				return fmt.Errorf("marshal key: %w", err)
			}
			buf.Write(encodedKey)
			buf.WriteByte(':')
			if err := writeCanonical(buf, obj[name]); err != nil {
				return err
			}
		}
		buf.WriteByte('}')
		return nil
	}

	if arr, isArray := v.([]any); isArray {
		buf.WriteByte('[')
		for idx := range arr {
			if idx != 0 {
				buf.WriteByte(',')
			}
			if err := writeCanonical(buf, arr[idx]); err != nil {
				return err
			}
		}
		buf.WriteByte(']')
		return nil
	}

	// Everything else (string, json.Number, bool, nil, ...) is a scalar.
	encoded, err := marshalNoEscapeHTML(v)
	if err != nil {
		return fmt.Errorf("marshal scalar: %w", err)
	}
	buf.Write(encoded)
	return nil
}
// CanonicalAndHmac canonicalizes a Go-built row and computes its chain
// HMAC in one call.
//
// The row goes struct → JSON → canonical bytes (canonicalRowBytesFromStruct),
// and the HMAC is computed over prevHash followed by those bytes. Both the
// canonical bytes and the hex HMAC are returned. On a canonicalization
// error the results are (nil, "", err).
//
// This is a helper for the parity probe's known-answer fixture only.
// Production verification uses VerifyChain, which canonicalizes from the
// original file bytes to dodge time precision drift.
func CanonicalAndHmac(row *SubjectAuditRow, key []byte, prevHash string) (canonical []byte, hmacHex string, err error) {
	rowBytes, cerr := canonicalRowBytesFromStruct(row)
	if cerr != nil {
		return nil, "", cerr
	}
	return rowBytes, computeRowHMAC(key, prevHash, rowBytes), nil
}
// computeRowHMAC returns HMAC-SHA256(key, prevHash || canonical) as 64
// lowercase hex characters.
//
// prevHash contributes its bytes as-is (it is not hex-decoded), followed
// immediately by the canonical row bytes with no separator.
func computeRowHMAC(key []byte, prevHash string, canonical []byte) string {
	// Assemble the message up front and feed it to the MAC in one write.
	msg := make([]byte, 0, len(prevHash)+len(canonical))
	msg = append(msg, prevHash...)
	msg = append(msg, canonical...)

	h := hmac.New(sha256.New, key)
	h.Write(msg)
	digest := h.Sum(nil)

	hexOut := make([]byte, hex.EncodedLen(len(digest)))
	hex.Encode(hexOut, digest)
	return string(hexOut)
}
// VerifyChain replays the chain for a subject's audit log.
//
// For each entry, in order, it checks that prev_chain_hash equals the
// previous row's row_hmac (GenesisHash for the first row), recomputes the
// HMAC over the canonical form of the entry's raw bytes, and compares it
// with the stored row_hmac.
//
// Returns:
//   - count: number of rows verified before stopping.
//   - chainTip: row_hmac of the last verified row, or GenesisHash if none
//     was verified.
//   - err: nil if every row verified; otherwise a description of the first
//     chain break, canonicalization failure, or HMAC mismatch. Row numbers
//     in the message are 1-based.
//
// Special case: empty/missing log returns (0, GENESIS, nil).
//
// CRITICAL: canonicalizes from each entry's RAW LINE BYTES, not from
// the parsed struct. Round-tripping through encoding/json + time.Time
// strips trailing zeros from RFC3339 nanos and produces different
// canonical bytes than the Rust writer used (which used chrono's
// AutoSi 9-digit format). Catches: 2026-05-03 WORKER-5 nanos
// 461439210 → Rust emits 9 digits, Go's time.RFC3339Nano emits 8.
//
// NOTE(review): the mismatch error text includes the recomputed HMAC,
// which is the valid MAC for the row as presented. That is useful for
// parity debugging, but it must not be shown to a party that can supply
// rows, or they can read off a valid row_hmac for tampered content. The
// constant-time comparison below only helps if that holds.
func VerifyChain(entries []AuditLogEntry, key []byte) (count int, chainTip string, err error) {
	prev := GenesisHash
	for i := range entries {
		entry := &entries[i]
		rowNum := i + 1

		if entry.Row.PrevChainHash != prev {
			return i, prev, fmt.Errorf("chain break at row %d: prev_chain_hash=%q expected=%q",
				rowNum, entry.Row.PrevChainHash, prev)
		}

		claimed := entry.Row.RowHmac
		canonical, cerr := canonicalRowBytesFromRaw(entry.Raw)
		if cerr != nil {
			return i, prev, fmt.Errorf("canonicalize row %d: %w", rowNum, cerr)
		}

		recomputed := computeRowHMAC(key, prev, canonical)
		// hmac.Equal compares without leaking how many leading bytes
		// matched; plain != returns at the first differing byte.
		if !hmac.Equal([]byte(recomputed), []byte(claimed)) {
			return i, prev, fmt.Errorf("hmac mismatch at row %d: stored=%s recomputed=%s",
				rowNum, claimed, recomputed)
		}

		prev = claimed
		count = rowNum
	}
	chainTip = prev
	return count, chainTip, nil
}
// LoadKeyFile returns the contents of the signing key file at path.
//
// The file bytes are used verbatim as the key; nothing is trimmed or
// decoded. Keys shorter than 32 bytes are refused with an error (matches
// Rust: 2026-05-03 kimi BLOCK fix lifted the minimum from 16 to 32 to
// align with HMAC-SHA256 best practice). A read failure is returned
// wrapped with the path. On any error the returned key is nil.
func LoadKeyFile(path string) ([]byte, error) {
	const minKeyBytes = 32

	material, err := os.ReadFile(path)
	switch {
	case err != nil:
		return nil, fmt.Errorf("read key file %s: %w", path, err)
	case len(material) < minKeyBytes:
		return nil, fmt.Errorf("signing key is %d bytes; recommend ≥32 bytes for HMAC-SHA256", len(material))
	}
	return material, nil
}