golangLAKEHOUSE/internal/catalogd/manifest_test.go
root 66a704ca3e G0 D3: catalogd Parquet manifests + ADR-020 idempotent register · 6 scrum fixes
Phase G0 Day 3 ships catalogd: Arrow Parquet manifest codec, in-memory
registry with the ADR-020 idempotency contract (same name+fingerprint
reuses dataset_id; different fingerprint → 409 Conflict), HTTP client
to storaged for persistence, and rehydration on startup. Acceptance
smoke 6/6 PASSES end-to-end including rehydrate-across-restart — the
load-bearing test that the catalog/storaged service split actually
preserves state.

dataset_id derivation diverges from Rust: UUIDv5(namespace, name)
instead of a v4 surrogate. The same name on any box generates the same
dataset_id, so a rehydrate after disk loss converges to the same identity
rather than silently issuing a new one. The namespace is pinned at
a8f3c1d2-4e5b-5a6c-9d8e-7f0a1b2c3d4e — every dataset_id ever issued
depends on these bytes.

Cross-lineage scrum on shipped code:
  - Opus 4.7 (opencode):                       1 BLOCK + 5 WARN + 3 INFO
  - Kimi K2-0905 (openrouter, validated D2):   2 BLOCK + 2 WARN + 1 INFO
  - Qwen3-coder (openrouter):                  2 BLOCK + 2 WARN + 2 INFO

Fixed:
  C1 list-offsets BLOCK (3-way convergent) → ValueOffsets(0) + bounds
  C2 Rehydrate mutex held across I/O → swap-under-brief-lock pattern
  S1 split-brain on persist failure → candidate-then-swap
  S2 brittle string-match for 400 vs 500 → ErrEmptyName/ErrEmptyFingerprint sentinels
  S3 Get/List shallow-copy aliasing → cloneManifest deep copy
  S4 keep-alive socket leak on error paths → drainAndClose helper

Dismissed (false positives, all single-reviewer):
  Kimi BLOCK "Decode crashes on empty Parquet" — already handled
  Kimi INFO "safeKey double-escapes" — wrong, splitting before escape is required
  Qwen INFO "rb.NewRecord() error unchecked" — API returns no error

Deferred to G1+: name validation regex, per-call deadlines, Snappy
compression, list pagination continuation tokens (storaged caps at
10k with sentinel for now).

Build clean, vet clean, all tests pass, smoke 6/6 PASS after every
fix round. arrow-go/v18 + google/uuid added; Go 1.24 → 1.25 forced
by arrow-go's minimum.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 23:36:57 -05:00

104 lines
2.7 KiB
Go

package catalogd
import (
"testing"
"time"
)
// TestEncodeDecode_RoundTrip encodes a fully populated Manifest, decodes the
// resulting bytes, and verifies that every field compares equal to the input.
func TestEncodeDecode_RoundTrip(t *testing.T) {
	rc := int64(500_000)
	// Non-zero nanosecond component so sub-second precision loss would be caught.
	now := time.Unix(1777435000, 123456789)
	want := &Manifest{
		DatasetID:         DatasetIDForName("workers_500k"),
		Name:              "workers_500k",
		SchemaFingerprint: "sha256:abcdef",
		Objects: []Object{
			{Key: "datasets/workers_500k/part-001.parquet", Size: 75 * 1024 * 1024},
			{Key: "datasets/workers_500k/part-002.parquet", Size: 12 * 1024 * 1024},
		},
		CreatedAt: now,
		UpdatedAt: now.Add(time.Minute),
		RowCount:  &rc,
	}

	b, err := Encode(want)
	if err != nil {
		t.Fatalf("Encode: %v", err)
	}
	if len(b) == 0 {
		t.Fatal("Encode returned 0 bytes")
	}

	got, err := Decode(b)
	if err != nil {
		t.Fatalf("Decode: %v", err)
	}

	if got.DatasetID != want.DatasetID {
		t.Errorf("DatasetID: got %q, want %q", got.DatasetID, want.DatasetID)
	}
	if got.Name != want.Name {
		t.Errorf("Name: got %q, want %q", got.Name, want.Name)
	}
	if got.SchemaFingerprint != want.SchemaFingerprint {
		t.Errorf("SchemaFingerprint: got %q, want %q", got.SchemaFingerprint, want.SchemaFingerprint)
	}

	// Fatal on a length mismatch: the element-wise loop below indexes
	// want.Objects with indices taken from got.Objects.
	if len(got.Objects) != len(want.Objects) {
		t.Fatalf("Objects: got %d, want %d", len(got.Objects), len(want.Objects))
	}
	for i, o := range got.Objects {
		if o.Key != want.Objects[i].Key || o.Size != want.Objects[i].Size {
			t.Errorf("Objects[%d]: got %+v, want %+v", i, o, want.Objects[i])
		}
	}

	// Times round-trip via UnixNano so equality is on the nanosecond.
	// Equal (not ==) is used so only the instant is compared.
	if !got.CreatedAt.Equal(want.CreatedAt) {
		t.Errorf("CreatedAt: got %v, want %v", got.CreatedAt, want.CreatedAt)
	}
	if !got.UpdatedAt.Equal(want.UpdatedAt) {
		t.Errorf("UpdatedAt: got %v, want %v", got.UpdatedAt, want.UpdatedAt)
	}

	// Report the pointed-to values; formatting the pointers themselves with
	// %v would print addresses, which are useless in a failure message.
	switch {
	case got.RowCount == nil:
		t.Errorf("RowCount: got nil, want %d", *want.RowCount)
	case *got.RowCount != *want.RowCount:
		t.Errorf("RowCount: got %d, want %d", *got.RowCount, *want.RowCount)
	}
}
// TestEncodeDecode_NoRowCount round-trips a Manifest whose RowCount is nil and
// whose Objects slice is empty, and verifies that the decoded manifest still
// has a nil RowCount and zero objects.
func TestEncodeDecode_NoRowCount(t *testing.T) {
	want := &Manifest{
		DatasetID:         DatasetIDForName("unknown_size"),
		Name:              "unknown_size",
		SchemaFingerprint: "sha256:zero",
		Objects:           []Object{},
		CreatedAt:         time.Unix(0, 0),
		UpdatedAt:         time.Unix(0, 0),
		RowCount:          nil,
	}

	// Label each failure with the step that produced it, matching the
	// messages used by TestEncodeDecode_RoundTrip.
	b, err := Encode(want)
	if err != nil {
		t.Fatalf("Encode: %v", err)
	}
	got, err := Decode(b)
	if err != nil {
		t.Fatalf("Decode: %v", err)
	}

	if got.RowCount != nil {
		// Dereference so the message shows the unexpected count rather than
		// a pointer address.
		t.Errorf("RowCount: got %d, want nil", *got.RowCount)
	}
	if len(got.Objects) != 0 {
		t.Errorf("Objects: got %d, want 0", len(got.Objects))
	}
}
// TestDatasetIDForName_Deterministic checks two properties of
// DatasetIDForName: calling it twice with the same name yields equal IDs, and
// two distinct names yield different IDs.
func TestDatasetIDForName_Deterministic(t *testing.T) {
	const name = "workers_500k"

	// Same input twice must produce the same ID.
	first, second := DatasetIDForName(name), DatasetIDForName(name)
	if first != second {
		t.Errorf("DatasetIDForName not deterministic: %q vs %q", first, second)
	}

	// A different input must not map onto the same ID.
	if other := DatasetIDForName("different"); other == first {
		t.Errorf("DatasetIDForName collided across distinct names")
	}
}