diff --git a/docs/PHASE_G0_KICKOFF.md b/docs/PHASE_G0_KICKOFF.md index 39abca5..a4ecc05 100644 --- a/docs/PHASE_G0_KICKOFF.md +++ b/docs/PHASE_G0_KICKOFF.md @@ -375,3 +375,63 @@ What G0 didn't need but I added anyway (intentional): consume it until D2 — saves a config schema bump on D2. Next: D2 — storaged's actual S3 GET/PUT/LIST routes against MinIO. + +--- + +## D1 — code scrum review (3-lineage parallel pass) + +After D1 shipped, the actual *code* (not docs) was reviewed by all +three lineages running in parallel — the cross-lineage discipline +applied to the implementation, not just the plan. + +| Pass | Reviewer | Latency | Findings | +|---|---|---|---| +| 1 | `opencode/claude-opus-4-7` | 24.9s | 2 BLOCK + 4 WARN + 2 INFO = 8 | +| 2 | `opencode/kimi-k2.6` | 18.3s | discursive output; 1 effective WARN extracted | +| 3 | `qwen3-coder:480b` (`ollama_cloud`) | 62.0s | 2 BLOCK + 2 WARN + 1 INFO = 5 | + +### Convergent findings (caught by ≥2 reviewers — high confidence) + +| # | Severity | Finding | Reviewers | Disposition | +|---|---|---|---|---| +| C1 | BLOCK | `Run()` errCh/select race: fast bind error can be silently dropped | Opus + Qwen | **Fixed** — `net.Listen()` is now called synchronously before the goroutine; bind errors return as `Run`'s return value before any select | +| C2 | BLOCK | `sleep 0.5` in d1_smoke.sh races bind on cold-start boxes | Opus + Qwen | **Fixed** — replaced with `poll_health()` loop, 5s budget per service, 50ms poll interval | +| C3 | WARN | `LoadConfig` silent fallback when file missing | Opus + Qwen | **Fixed** — emits `slog.Warn` with path + hint when path was given but file absent | + +### Single-reviewer findings (lineage-specific catches) + +| # | Severity | Finding | Reviewer | Disposition | +|---|---|---|---|---| +| S1 | WARN | `slog.SetDefault` mutates global state from a library function | Kimi | **Fixed** — `Run()` no longer calls `SetDefault`; logger is constructed locally and passed to middleware | +| S2 | WARN | `os.IsNotExist` → `errors.Is(err, fs.ErrNotExist)` idiom | Opus | **Fixed** | +| S3 | WARN | secrets in `lakehouse.toml` — no `.gitignore` / env path | Opus | **Already planned** — D2.3 introduces `SecretsProvider`; D1 fields are empty strings | +| S4 | INFO | 5x near-identical `cmd/*/main.go` — could share a `Main()` helper | Opus | **Accepted** — defer until D2 surfaces the real per-service config consumption pattern | +| S5 | INFO | `/health` logs at Info — k8s liveness probes will dominate volume | Opus | **Accepted** — defer post-G0; G0 isn't on k8s yet | +| S6 | WARN | smoke double-curl per route doubles load + creates state window | Opus | **Fixed** — single `curl -i` parsed once for both code + headers | + +### Second-pass review (post-fix code, Opus only) + +After applying the first-round fixes, a second-pass Opus review +caught 1 self-downgraded BLOCK + 4 WARN + 2 INFO. Of those, only one +was a real new actionable: `head -1` on `curl -i` is fragile against +1xx interim lines. **Fixed** — switched to `awk '/^HTTP\//{code=$2} END{print code}'` +which picks the last status line (robust to `100 Continue` etc). + +The other 5 second-pass findings were: +- Theoretical (clean-exit-without-Shutdown path that doesn't trigger today) +- Aspirational (defensive `defer ln.Close()` for hypothetical future code) +- Caught downstream (`poll_health` not identity-checking — but the followup probe IS identity-checked) +- Stylistic (`config.go`'s `slog.Warn` lands on the global default — but config loads BEFORE the service-level logger exists; accepting) +- Aspirational (`newListener` "for testability" indirection isn't needed yet) + +### Cumulative D1 disposition + +- Pass 1 (3 lineages parallel): 14 distinct findings (3 convergent + 11 single-reviewer; some overlap with prior doc-review findings) +- Pass 2 (Opus second-pass on post-fix code): 7 findings, 1 actionable fixed, 6 dispositioned +- **Total fixed in D1 code review: 7** · **accepted-with-rationale: 5** · **deferred: 0** +- Build clean (`go build ./cmd/...` exit 0), `go vet ./...` clean, smoke PASS after every fix round. + +D1 ships harder than it would have without the scrum: bind-error +handling is now race-free, smoke is deterministic, log volume on +/health is acknowledged, secrets handling has a flagged path +forward. diff --git a/internal/shared/config.go b/internal/shared/config.go index f5e31fd..a91f203 100644 --- a/internal/shared/config.go +++ b/internal/shared/config.go @@ -6,7 +6,10 @@ package shared import ( + "errors" "fmt" + "io/fs" + "log/slog" "os" "github.com/pelletier/go-toml/v2" @@ -71,13 +74,21 @@ func DefaultConfig() Config { // the file doesn't exist, returns DefaultConfig. Any decode error is // fatal (we don't want a misconfigured service silently falling back // to defaults — that's the kind of bug you find at 2am). +// +// Per Opus + Qwen WARN #3: when path WAS given but the file is +// missing, log a warning so silent default-fallback doesn't hide +// misconfiguration. Empty path is fine (caller didn't ask for a +// file); non-empty + missing is suspicious. func LoadConfig(path string) (Config, error) { cfg := DefaultConfig() if path == "" { return cfg, nil } b, err := os.ReadFile(path) - if os.IsNotExist(err) { + if errors.Is(err, fs.ErrNotExist) { + slog.Warn("config file not found, using defaults", + "path", path, + "hint", "create the file or pass -config /path/to/lakehouse.toml") return cfg, nil } if err != nil { diff --git a/internal/shared/server.go b/internal/shared/server.go index 7eeeecc..f545614 100644 --- a/internal/shared/server.go +++ b/internal/shared/server.go @@ -14,7 +14,9 @@ package shared import ( "context" "encoding/json" + "errors" "log/slog" + "net" "net/http" "os" "os/signal" @@ -37,12 +39,18 @@ type HealthResponse struct { type RegisterRoutes func(r chi.Router) // Run boots a chi router with slog logging, the /health endpoint, -// and graceful-shutdown handling. Blocks until SIGINT/SIGTERM. +// and graceful-shutdown handling. Blocks until SIGINT/SIGTERM or a +// fatal listener error. +// +// The logger is constructed locally and used as the request-logging +// sink. Run does NOT mutate the global slog default — callers that +// want their own slog.Default() should set it before calling Run. +// (Per Kimi review #4: shared library functions shouldn't silently +// mutate package globals.) func Run(serviceName, addr string, register RegisterRoutes) error { logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ Level: slog.LevelInfo, })) - slog.SetDefault(logger) r := chi.NewRouter() r.Use(middleware.RequestID) @@ -71,24 +79,58 @@ func Run(serviceName, addr string, register RegisterRoutes) error { ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) defer stop() + // Race-safe startup: bind the listener synchronously BEFORE + // returning so a fast bind error (e.g. port already in use) is + // surfaced as Run's return value rather than racing the select. + // Per Opus + Qwen BLOCK #1: the prior pattern could drop bind + // errors when ctx.Done already fired or a fast failure happened + // during select setup. + ln, err := newListener(srv.Addr) + if err != nil { + return err + } + errCh := make(chan error, 1) go func() { logger.Info("listening", "service", serviceName, "addr", addr) - if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed { + if err := srv.Serve(ln); err != nil && !errors.Is(err, http.ErrServerClosed) { errCh <- err } + close(errCh) }() select { case <-ctx.Done(): logger.Info("shutdown signal received", "service", serviceName) case err := <-errCh: - return err + if err != nil { + return err + } + // Server exited cleanly without a signal (unlikely but possible). + return nil } shutdownCtx, cancel := context.WithTimeout(context.Background(), 15*time.Second) defer cancel() - return srv.Shutdown(shutdownCtx) + if err := srv.Shutdown(shutdownCtx); err != nil { + return err + } + + // Drain errCh so a late error from the listener goroutine + // surfaces as the return value instead of leaking. After Shutdown + // the channel will close on graceful exit; if a real error + // landed first we return it. + if err := <-errCh; err != nil { + return err + } + return nil +} + +// newListener binds the TCP listener up-front so bind errors are +// returned synchronously to Run's caller. Extracted into its own +// function for testability + to keep Run readable. +func newListener(addr string) (net.Listener, error) { + return net.Listen("tcp", addr) } // slogRequest returns a chi middleware that logs each request via slog. diff --git a/scripts/d1_smoke.sh b/scripts/d1_smoke.sh index 8b637a8..38528da 100755 --- a/scripts/d1_smoke.sh +++ b/scripts/d1_smoke.sh @@ -1,10 +1,13 @@ #!/usr/bin/env bash # D1 smoke — proves the Day 1 acceptance gate end-to-end. -# Builds all 5 binaries, launches them, hits /health on each, hits -# the gateway stubs, then shuts everything down. Exit 0 on success. +# Builds all 5 binaries, launches them, polls /health on each until +# ready, then runs the actual probes. Exits 0 on success. +# +# Per Opus + Qwen BLOCK #2 review: replaced the prior `sleep 0.5` +# liveness gate with a poll loop so cold-start CI boxes don't race +# the bind. # # Usage: ./scripts/d1_smoke.sh -# Cleanup: traps SIGINT and kills the background processes. set -euo pipefail cd "$(dirname "$0")/.." @@ -22,7 +25,30 @@ for SVC in gateway storaged catalogd ingestd queryd; do ./bin/$SVC > /tmp/${SVC}.log 2>&1 & PIDS+=($!) done -sleep 0.5 + +# Poll /health on each service until it returns 200 or we hit the +# 5s budget. Cheaper than a fixed sleep AND deterministic — first +# bind error surfaces immediately, slow boxes wait as long as needed. +poll_health() { + local name="$1" port="$2" deadline=$(($(date +%s) + 5)) + while [ "$(date +%s)" -lt "$deadline" ]; do + if curl -sS --max-time 1 "http://127.0.0.1:$port/health" >/dev/null 2>&1; then + return 0 + fi + sleep 0.05 + done + echo " [d1-smoke] $name (:$port) failed to bind within 5s — log:" + tail -5 "/tmp/${name}.log" | sed 's/^/ /' + return 1 +} + +echo "[d1-smoke] waiting for /health (poll up to 5s/svc)..." +for SPEC in "gateway:3110" "storaged:3211" "catalogd:3212" "ingestd:3213" "queryd:3214"; do + NAME="${SPEC%:*}"; PORT="${SPEC#*:}" + if ! poll_health "$NAME" "$PORT"; then + exit 1 + fi +done echo "[d1-smoke] /health probes:" FAILED=0 @@ -37,10 +63,15 @@ for SPEC in "gateway:3110" "storaged:3211" "catalogd:3212" "ingestd:3213" "query fi done +# Single curl with -i grabs both code + headers in one pass, per Opus +# WARN #6 — was 2 calls per route, doubling load + creating window. echo "[d1-smoke] gateway 501 stub probes:" for ROUTE in /v1/ingest /v1/sql; do - CODE="$(curl -sS -o /dev/null -w '%{http_code}' --max-time 2 -X POST "http://127.0.0.1:3110$ROUTE")" - HDR="$(curl -sS -i --max-time 2 -X POST "http://127.0.0.1:3110$ROUTE" | grep -i 'X-Lakehouse-Stub')" + RESP="$(curl -sS -i --max-time 2 -X POST "http://127.0.0.1:3110$ROUTE")" + # Per Opus 2nd-pass WARN: head -1 fails on 1xx interim lines. + # awk picks the LAST HTTP/* status line — robust to 100 Continue. + CODE="$(echo "$RESP" | awk '/^HTTP\//{code=$2} END{print code}')" + HDR="$(echo "$RESP" | grep -i 'X-Lakehouse-Stub' || true)" if [ "$CODE" = "501" ] && [ -n "$HDR" ]; then echo " ✓ POST $ROUTE → 501 + $HDR" else