root 814197cfd3 ADR-006: auth posture for non-loopback deploy + token rotation impl
ADR-003 locked the auth substrate; ADR-006 ratifies the operator
playbook + adds two implementation pieces needed for Sprint 4
deployment: env-resolved tokens and dual-token rotation.

Six decisions locked in docs/DECISIONS.md:
- 6.1: Non-loopback bind requires auth.token (mechanical gate at
       shared.Run, already implemented; this ratifies it).
- 6.2: Token from env, not TOML. /etc/lakehouse/auth.env (mode 0600)
       loaded by systemd EnvironmentFile=. New TokenEnv field on
       AuthConfig defaults to "AUTH_TOKEN".
- 6.3: AllowedIPs for inter-service same-trust-domain; Token for
       cross-trust-boundary (gateway ↔ external).
- 6.4: /health stays unauthenticated; everything else under
       shared.Run is gated. Already implemented; ratified here.
- 6.5: Token rotation is dual-token. New SecondaryTokens []string
       on AuthConfig — both primary and any secondary pass auth
       during the rotation window. Implemented in this commit.
- 6.6: TLS terminates at the network edge (nginx/Caddy), not
       in-process. Daemons stay HTTP-only; internal traffic stays
       on private subnets per Decision 6.3.

Implementation:
- internal/shared/config.go: AuthConfig gains TokenEnv +
  SecondaryTokens fields. New resolveAuthFromEnv() called by
  LoadConfig fills Token from os.Getenv(TokenEnv) when Token is
  empty. TokenEnv defaults to "AUTH_TOKEN" so the happy path needs
  no TOML config.
- internal/shared/auth.go: RequireAuth pre-encodes Bearer headers
  for primary + every secondary token; per-request constant-time
  compare walks the slice. Fast path is 1 compare (primary).

Tests:
- TestLoadConfig_AuthTokenFromEnv (3 sub-tests): default env name,
  custom token_env, explicit Token wins over env.
- TestRequireAuth_SecondaryTokenAccepted: both primary + secondary
  tokens pass during rotation window.
- TestRequireAuth_SecondaryTokensOnly: only-secondary path works
  for the case where primary was just promoted-to-empty mid-rotation.

go test ./internal/shared all green; existing auth_test.go
unchanged (constant-time compare path preserved).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 17:51:14 -05:00

145 lines
4.3 KiB
Go

// auth.go — inter-service auth middleware per ADR-003.
//
// Two layers, each independently configurable:
// - Bearer token (constant-time compare via crypto/subtle)
// - IP allowlist (CIDR set; bare IPs treated as /32)
//
// /health is exempt from both layers (load balancers + monitors need
// it open; the route doesn't expose anything sensitive).
//
// When Token, SecondaryTokens, and AllowedIPs are all empty,
// RequireAuth returns a pass-through that does no work — preserves
// G0 dev-mode behavior where every binary binds 127.0.0.1 and the
// network is the auth layer.
//
// The non-loopback-bind + empty-token coupling is enforced at
// startup in shared.Run, not in the middleware — the middleware
// only sees per-request auth, not the bind config. Together they
// make the audit's worst case (R-001 + R-007: queryd /sql RCE-eq
// off-loopback with no auth) mechanically impossible.
package shared
import (
"crypto/subtle"
"log/slog"
"net"
"net/http"
"strings"
)
// RequireAuth returns a chi-compatible middleware that enforces
// the configured AuthConfig.
//
// Layers, applied in order to every request except /health:
//  1. IP allowlist — active whenever cfg.AllowedIPs is non-empty.
//     A rejected peer gets 403 "forbidden".
//  2. Bearer token — active whenever cfg.Token or cfg.SecondaryTokens
//     is set. The Authorization header must equal "Bearer <token>"
//     for the primary or any non-empty secondary token; otherwise
//     the request gets 401 "unauthorized".
//
// When Token, SecondaryTokens, and AllowedIPs are all empty the
// returned middleware is a pass-through (G0 dev mode).
//
// Invalid allowed_ips entries are logged and dropped rather than
// aborting startup. The allowlist nevertheless stays enforced when
// it was configured: if every entry turns out to be invalid, all
// non-/health requests are rejected instead of silently falling
// through to "no IP restriction".
func RequireAuth(cfg AuthConfig) func(http.Handler) http.Handler {
	tokenSet := cfg.Token != "" || len(cfg.SecondaryTokens) > 0
	// Gate on what the operator configured, not on what survived
	// parsing — otherwise a config consisting only of typos would
	// disable the allowlist entirely (fail-open).
	ipSet := len(cfg.AllowedIPs) > 0
	if !tokenSet && !ipSet {
		// G0 dev mode — no auth wired.
		return passthrough
	}

	// Pre-parse CIDRs once. Invalid entries log a warning and are
	// dropped — fail-loud-but-not-fatal so a typo in one CIDR
	// doesn't kill the binary; operator sees the warning at startup.
	allowedNets := make([]*net.IPNet, 0, len(cfg.AllowedIPs))
	for _, raw := range cfg.AllowedIPs {
		// Tolerate stray whitespace around an otherwise valid entry.
		cidr := strings.TrimSpace(raw)
		if !strings.Contains(cidr, "/") {
			// Bare IP — single-host CIDR.
			if strings.Contains(cidr, ":") {
				cidr += "/128"
			} else {
				cidr += "/32"
			}
		}
		_, n, err := net.ParseCIDR(cidr)
		if err != nil {
			slog.Warn("auth: invalid CIDR in allowed_ips, skipping",
				"raw", raw, "err", err)
			continue
		}
		allowedNets = append(allowedNets, n)
	}
	if ipSet && len(allowedNets) == 0 {
		slog.Error("auth: allowed_ips is set but contains no valid entry; all non-/health requests will be rejected")
	}

	// Pre-encode wire-format Bearer headers for primary + every
	// secondary token. Per-request comparison walks the slice with
	// constant-time compare on each — fast path is the primary
	// (first), so the typical case is one compare.
	expectedHeaders := make([][]byte, 0, 1+len(cfg.SecondaryTokens))
	if cfg.Token != "" {
		expectedHeaders = append(expectedHeaders, []byte("Bearer "+cfg.Token))
	}
	for _, sec := range cfg.SecondaryTokens {
		if sec == "" {
			// An empty secondary must never become an accepted
			// "Bearer " header.
			continue
		}
		expectedHeaders = append(expectedHeaders, []byte("Bearer "+sec))
	}

	return func(next http.Handler) http.Handler {
		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			// /health bypasses both layers. Operators rely on it
			// being public for liveness probes.
			if r.URL.Path == "/health" {
				next.ServeHTTP(w, r)
				return
			}
			// With an empty allowedNets (every entry invalid) the
			// lookup matches nothing, so the request is denied.
			if ipSet && !ipAllowed(r, allowedNets) {
				http.Error(w, "forbidden", http.StatusForbidden)
				return
			}
			if tokenSet {
				got := []byte(r.Header.Get("Authorization"))
				matched := false
				for _, want := range expectedHeaders {
					if subtle.ConstantTimeCompare(got, want) == 1 {
						matched = true
						break
					}
				}
				if !matched {
					http.Error(w, "unauthorized", http.StatusUnauthorized)
					return
				}
			}
			next.ServeHTTP(w, r)
		})
	}
}
// passthrough is the identity middleware: it hands back the handler
// it was given without wrapping it. RequireAuth returns it when no
// auth is configured (G0 dev mode).
func passthrough(next http.Handler) http.Handler {
	return next
}
// ipAllowed reports whether the request's source IP falls inside at
// least one of nets. A source address that remoteIP cannot parse is
// treated as not allowed, so an unreadable peer address can never
// slip past the allowlist. An empty nets matches nothing.
func ipAllowed(r *http.Request, nets []*net.IPNet) bool {
	if src := remoteIP(r); src != nil {
		for i := range nets {
			if nets[i].Contains(src) {
				return true
			}
		}
	}
	return false
}
// remoteIP extracts the request's source IP from r.RemoteAddr and
// returns nil when no IP can be parsed from it.
//
// Accepted shapes:
//   - "host:port" and "[v6]:port" (what net/http normally sets)
//   - a bare IP with no port, optionally bracketed ("[::1]")
//   - any of the above carrying an IPv6 zone ("fe80::1%eth0"); the
//     zone is dropped because net.ParseIP rejects zoned literals and
//     the allowlist networks carry no zone to compare against.
//
// Today only RemoteAddr is consulted. Future: when a trusted proxy
// fronts the gateway and adds X-Forwarded-For, we'd add a config
// knob to honor the first hop. G0 deploys are direct-to-binary so
// RemoteAddr suffices.
func remoteIP(r *http.Request) net.IP {
	host, _, err := net.SplitHostPort(r.RemoteAddr)
	if err != nil {
		// No port component (e.g. a test or upstream middleware set
		// a bare IP); fall back to the raw value, minus any IPv6
		// brackets that SplitHostPort would otherwise have removed.
		host = strings.TrimSuffix(strings.TrimPrefix(r.RemoteAddr, "["), "]")
	}
	// Link-local IPv6 peers arrive as "fe80::1%eth0"; strip the zone
	// so the address parses instead of being rejected outright.
	if zone := strings.IndexByte(host, '%'); zone >= 0 {
		host = host[:zone]
	}
	return net.ParseIP(host)
}