golangLAKEHOUSE/scripts/d3_smoke.sh
root 66a704ca3e G0 D3: catalogd Parquet manifests + ADR-020 idempotent register · 6 scrum fixes
Phase G0 Day 3 ships catalogd: Arrow Parquet manifest codec, in-memory
registry with the ADR-020 idempotency contract (same name+fingerprint
reuses dataset_id; different fingerprint → 409 Conflict), HTTP client
to storaged for persistence, and rehydration on startup. Acceptance
smoke 6/6 PASSES end-to-end including rehydrate-across-restart — the
load-bearing test that the catalog/storaged service split actually
preserves state.

dataset_id derivation diverges from Rust: UUIDv5(namespace, name)
instead of v4 surrogate. Same name on any box generates the same
dataset_id; rehydrate after disk loss converges to the same identity
rather than silently re-issuing. Namespace pinned at
a8f3c1d2-4e5b-5a6c-9d8e-7f0a1b2c3d4e — every dataset_id ever issued
depends on these bytes.

Cross-lineage scrum on shipped code:
  - Opus 4.7 (opencode):                       1 BLOCK + 5 WARN + 3 INFO
  - Kimi K2-0905 (openrouter, validated D2):   2 BLOCK + 2 WARN + 1 INFO
  - Qwen3-coder (openrouter):                  2 BLOCK + 2 WARN + 2 INFO

Fixed:
  C1 list-offsets BLOCK (3-way convergent) → ValueOffsets(0) + bounds
  C2 Rehydrate mutex held across I/O → swap-under-brief-lock pattern
  S1 split-brain on persist failure → candidate-then-swap
  S2 brittle string-match for 400 vs 500 → ErrEmptyName/ErrEmptyFingerprint sentinels
  S3 Get/List shallow-copy aliasing → cloneManifest deep copy
  S4 keep-alive socket leak on error paths → drainAndClose helper

Dismissed (false positives, all single-reviewer):
  Kimi BLOCK "Decode crashes on empty Parquet" — already handled
  Kimi INFO "safeKey double-escapes" — wrong, splitting before escape is required
  Qwen INFO "rb.NewRecord() error unchecked" — API returns no error

Deferred to G1+: name validation regex, per-call deadlines, Snappy
compression, list pagination continuation tokens (storaged caps at
10k with sentinel for now).

Build clean, vet clean, all tests pass, smoke 6/6 PASS after every
fix round. arrow-go/v18 + google/uuid added; Go 1.24 → 1.25 forced
by arrow-go's minimum.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 23:36:57 -05:00

157 lines
5.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# D3 smoke — proves the Day 3 acceptance gate end-to-end.
#
# Validates:
# - Register a fresh dataset → 200 with existing=false, dataset_id from UUIDv5(name)
# - GET /catalog/manifest/<name> → manifest matches what we registered
# - GET /catalog/list → manifest listed
# - Restart catalogd → GET /catalog/manifest/<name> returns the same dataset_id (Parquet-backed rehydrate)
# - Re-register same name + same fingerprint → 200, existing=true, same dataset_id
# - Re-register same name + different fingerprint → 409 Conflict
#
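# Builds and launches storaged (D2, :3211) and catalogd (:3212) itself; any
# already-running bin/storaged or bin/catalogd process is killed first.
# Needs go, curl and jq on PATH.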
#
# Usage: ./scripts/d3_smoke.sh
# Strict mode: abort on unhandled errors, unset variables and failed pipelines.
set -euo pipefail

# Work from the parent of this script's directory so the relative ./cmd and
# ./bin paths below resolve regardless of where the script is invoked from.
cd "$(dirname "$0")/.."
export PATH="$PATH:/usr/local/go/bin"

echo "[d3-smoke] building storaged + catalogd..."
go build -o bin/ ./cmd/storaged ./cmd/catalogd

# Stop leftovers from an earlier run. pkill exits non-zero when nothing
# matches, which is the normal case here, hence the `|| true`.
for svc in storaged catalogd; do
  pkill -f "bin/$svc" 2>/dev/null || true
done
sleep 0.2

# PIDs stay empty until the matching service has actually been started, so
# the cleanup handler can tell what there is to kill.
STORAGED_PID=""
CATALOGD_PID=""
# Scratch directory for response bodies; removed by the cleanup handler.
TMP="$(mktemp -d)"
#######################################
# Tear down whatever this smoke run started.
# Globals:   CATALOGD_PID, STORAGED_PID, TMP (all read)
# Outputs:   one progress line on stdout
# Returns:   exit status of the final rm
#######################################
cleanup() {
  echo "[d3-smoke] cleanup"
  local pid
  # catalogd first, then storaged. An empty PID means that service was never
  # started; a failing kill (process already gone) is deliberately ignored.
  for pid in "$CATALOGD_PID" "$STORAGED_PID"; do
    [ -z "$pid" ] || kill "$pid" 2>/dev/null || true
  done
  rm -rf "$TMP"
}
# Run cleanup exactly once, on every exit path. A handler bound directly to
# INT/TERM would return control to the script after the services were torn
# down (and cleanup would then run again on EXIT), so the signal traps only
# turn the signal into an exit with the conventional 128+signo status and
# leave the actual work to the EXIT trap.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# --- launch storaged ---
echo "[d3-smoke] launching storaged..."
./bin/storaged > /tmp/storaged.log 2>&1 &
STORAGED_PID=$!
# Poll /health until it answers or roughly five seconds have passed.
deadline=$(($(date +%s) + 5))
while [ "$(date +%s)" -lt "$deadline" ]; do
  if curl -sS --max-time 1 http://127.0.0.1:3211/health >/dev/null 2>&1; then break; fi
  sleep 0.05
done
# One final probe decides the outcome, whether the loop broke or timed out.
if ! curl -sS --max-time 1 http://127.0.0.1:3211/health >/dev/null 2>&1; then
  echo " [d3-smoke] storaged failed to bind"; tail -10 /tmp/storaged.log; exit 1
fi

# --- clean any prior catalog manifests for a fresh smoke ---
# Best effort: a failed listing is reported but does not stop the run.
if ! manifest_keys="$(curl -sS "http://127.0.0.1:3211/storage/list?prefix=_catalog/manifests/" \
  | jq -r '.objects[]?.Key // empty' 2>/dev/null)"; then
  echo "[d3-smoke] warning: could not list prior catalog manifests; continuing" >&2
fi
# Read one key per line so keys are never word-split or glob-expanded.
while IFS= read -r k; do
  [ -n "$k" ] || continue
  curl -sS -o /dev/null -X DELETE "http://127.0.0.1:3211/storage/delete/$k" || true
done <<<"$manifest_keys"
# --- launch catalogd ---
#######################################
# Start ./bin/catalogd in the background and wait for its /health endpoint.
# Used for the first start and again for the restart/rehydrate check.
# Globals:   CATALOGD_PID (written)
# Outputs:   catalogd stdout/stderr go to /tmp/catalogd.log; on failure a
#            message and the last 10 log lines are printed to stdout
# Returns:   0 once /health answers; 1 if the process is gone or /health
#            has not answered within roughly five seconds
#######################################
launch_catalogd() {
  # Declared separately from the assignment so `local` cannot mask a failure
  # of the command substitution.
  local deadline
  ./bin/catalogd > /tmp/catalogd.log 2>&1 &
  CATALOGD_PID=$!
  deadline=$(($(date +%s) + 5))
  while [ "$(date +%s)" -lt "$deadline" ]; do
    if curl -sS --max-time 1 http://127.0.0.1:3212/health >/dev/null 2>&1; then return 0; fi
    # Fail fast: no point polling for the full timeout once the process is gone.
    if ! kill -0 "$CATALOGD_PID" 2>/dev/null; then break; fi
    sleep 0.05
  done
  echo " [d3-smoke] catalogd failed to bind"; tail -10 /tmp/catalogd.log; return 1
}
echo "[d3-smoke] launching catalogd (first start, empty catalog)..."
launch_catalogd

FAILED=0
NAME="d3_smoke_dataset"
FP1="sha256:fingerprint-A"
FP2="sha256:fingerprint-B"
CATALOGD_URL="http://127.0.0.1:3212"

# The helpers below never return non-zero. Under `set -euo pipefail` a failing
# curl or jq inside a plain `VAR="$(...)"` assignment would abort the script
# in the middle of a check, before the ✗ diagnostic and the final gate line
# are printed. Here a transport or parse error yields an empty value instead,
# which the check then reports as an ordinary failure.

# pass MSG — report a passing check.
pass() { echo " ✓ $1"; }

# fail MSG — report a failing check and mark the whole run as failed.
fail() { echo " ✗ $1"; FAILED=1; }

# json_field FILTER JSON — print `jq -r FILTER` applied to JSON; prints
# nothing when JSON cannot be parsed (jq's own error still goes to stderr).
json_field() { jq -r "$1" <<<"$2" || true; }

# is_id VALUE — true when VALUE is a usable id, i.e. neither empty nor the
# literal "null" that jq prints for a missing field. Guards the equality
# checks against passing because both sides are missing.
is_id() { [ -n "$1" ] && [ "$1" != "null" ]; }

# catalog_get PATH — GET PATH from catalogd and print the response body.
catalog_get() { curl -sS "$CATALOGD_URL$1" || true; }

# catalog_register BODY — POST the JSON BODY to /catalog/register and print
# the response body.
catalog_register() {
  curl -sS -X POST "$CATALOGD_URL/catalog/register" \
    -H 'Content-Type: application/json' \
    -d "$1" || true
}

# --- 1. fresh register ---
echo "[d3-smoke] POST /catalog/register (fresh):"
RESP="$(catalog_register "{\"name\":\"$NAME\",\"schema_fingerprint\":\"$FP1\",\"objects\":[{\"key\":\"datasets/$NAME/p1.parquet\",\"size\":1024}],\"row_count\":42}")"
EXISTING="$(json_field '.existing' "$RESP")"
DATASET_ID="$(json_field '.manifest.dataset_id' "$RESP")"
if [ "$EXISTING" = "false" ] && is_id "$DATASET_ID"; then
  pass "fresh register → existing=false, dataset_id=$DATASET_ID"
else
  fail "fresh register → $RESP"
fi

# --- 2. manifest lookup by name ---
echo "[d3-smoke] GET /catalog/manifest/$NAME:"
GOT="$(json_field '.dataset_id' "$(catalog_get "/catalog/manifest/$NAME")")"
if is_id "$GOT" && [ "$GOT" = "$DATASET_ID" ]; then
  pass "manifest dataset_id matches"
else
  fail "manifest dataset_id: got $GOT, want $DATASET_ID"
fi

# --- 3. list ---
echo "[d3-smoke] GET /catalog/list (1 entry):"
COUNT="$(json_field '.count' "$(catalog_get /catalog/list)")"
if [ "$COUNT" = "1" ]; then
  pass "list count=1"
else
  fail "list count=$COUNT (want 1)"
fi

# --- 4. restart and rehydrate ---
echo "[d3-smoke] restart catalogd → rehydrate from Parquet:"
# Reap the old process before starting the new one.
kill "$CATALOGD_PID" 2>/dev/null || true; wait "$CATALOGD_PID" 2>/dev/null || true
launch_catalogd
REHYDRATED_ID="$(json_field '.dataset_id' "$(catalog_get "/catalog/manifest/$NAME")")"
if is_id "$REHYDRATED_ID" && [ "$REHYDRATED_ID" = "$DATASET_ID" ]; then
  pass "rehydrated dataset_id matches across restart"
else
  fail "rehydrated dataset_id: got $REHYDRATED_ID, want $DATASET_ID"
fi

# --- 5. idempotent re-register (same fingerprint, two objects this time) ---
echo "[d3-smoke] re-register (same name + same fingerprint) → existing=true:"
RESP2="$(catalog_register "{\"name\":\"$NAME\",\"schema_fingerprint\":\"$FP1\",\"objects\":[{\"key\":\"datasets/$NAME/p1.parquet\",\"size\":1024},{\"key\":\"datasets/$NAME/p2.parquet\",\"size\":2048}],\"row_count\":84}")"
EXISTING2="$(json_field '.existing' "$RESP2")"
DATASET_ID2="$(json_field '.manifest.dataset_id' "$RESP2")"
OBJ_COUNT="$(json_field '.manifest.objects | length' "$RESP2")"
if [ "$EXISTING2" = "true" ] && is_id "$DATASET_ID2" \
  && [ "$DATASET_ID2" = "$DATASET_ID" ] && [ "$OBJ_COUNT" = "2" ]; then
  pass "existing=true, same dataset_id, objects replaced (count=2)"
else
  fail "idempotent re-register: existing=$EXISTING2 id=$DATASET_ID2 objs=$OBJ_COUNT → $RESP2"
fi

# --- 6. conflicting fingerprint ---
echo "[d3-smoke] re-register (different fingerprint) → 409:"
# Only the status code is captured; the body goes to a scratch file so it can
# be shown if the status is not the expected one.
HTTP="$(curl -sS -o "$TMP/conflict.out" -w '%{http_code}' -X POST "$CATALOGD_URL/catalog/register" \
  -H 'Content-Type: application/json' \
  -d "{\"name\":\"$NAME\",\"schema_fingerprint\":\"$FP2\",\"objects\":[]}" || true)"
if [ "$HTTP" = "409" ]; then
  pass "different fingerprint → 409 Conflict"
else
  fail "different fingerprint → $HTTP (want 409)"
  # The body file may be missing or empty when curl never got a response.
  if [ -s "$TMP/conflict.out" ]; then cat "$TMP/conflict.out"; fi
fi

# Cleanup smoke manifests (best effort).
curl -sS -o /dev/null -X DELETE "http://127.0.0.1:3211/storage/delete/_catalog/manifests/$NAME.parquet" || true

if [ "$FAILED" -eq 0 ]; then
  echo "[d3-smoke] D3 acceptance gate: PASSED"
  exit 0
else
  echo "[d3-smoke] D3 acceptance gate: FAILED"
  exit 1
fi