Implements the auth posture from ADR-003 (commit 0d18ffa). Two
independent layers — Bearer token (constant-time compare via
crypto/subtle) and IP allowlist (CIDR set) — composed in shared.Run so
every binary inherits the same gate without per-binary wiring.

Together with the bind-gate from commit 6af0520, this mechanically
closes audit risks R-001 + R-007:
  - non-loopback bind without auth.token = startup refuse
  - non-loopback bind WITH auth.token + override env = allowed
  - loopback bind = all gates open (G0 dev unchanged)

internal/shared/auth.go (NEW)
  RequireAuth(cfg AuthConfig) returns chi-compatible middleware.
  Empty Token + empty AllowedIPs → pass-through (G0 dev mode).
  Token-only → 401 Bearer mismatch. AllowedIPs-only → 403 source IP
  not in CIDR set. Both → both gates apply.

  /health bypasses both layers (load-balancer / liveness probes
  shouldn't carry tokens).

  CIDR parsing pre-runs at boot; bare IP (no /N) treated as /32 (or
  /128 for IPv6). Invalid entries log warn and drop,
  fail-loud-but-not-fatal so a typo doesn't kill the binary.

  Token comparison: subtle.ConstantTimeCompare on the full
  "Bearer <token>" wire-format string. Length-mismatch returns 0 (per
  stdlib spec), so wrong-length tokens reject without timing leak.
  Pre-encoded comparison slice stored in the middleware closure — one
  allocation per request.

  Source-IP extraction prefers net.SplitHostPort, falling back to
  RemoteAddr as-is for httptest compatibility. X-Forwarded-For support
  is a follow-up when a trusted proxy fronts the gateway (config knob
  TBD per ADR-003 §"Future").

internal/shared/server.go
  Run signature: gained AuthConfig parameter (4th arg). /health stays
  mounted on the outer router (public). Registered routes go inside
  chi.Group with RequireAuth applied — empty config = transparent
  group.

  Added requireAuthOnNonLoopback startup check: non-loopback bind with
  empty Token = refuse to start (cites R-001 + R-007 by name).

internal/shared/config.go
  AuthConfig type added with TOML tags. Fields: Token, AllowedIPs.
  Composed into Config under [auth].

cmd/<svc>/main.go × 7 (catalogd, embedd, gateway, ingestd, queryd,
storaged, vectord; mcpd is unaffected — stdio doesn't bind a port)
  Each call site adds cfg.Auth as the 4th arg to shared.Run. No other
  changes — middleware applies via shared.Run uniformly.

internal/shared/auth_test.go (12 test funcs)
  Empty config pass-through, missing-token 401, wrong-token 401,
  correct-token 200, raw-token-without-Bearer-prefix 401, /health
  always public, IP allowlist allow + reject, bare IP /32, both layers
  when both configured, invalid CIDR drop-with-warn, RemoteAddr shape
  extraction. The constant-time comparison is verified by inspection
  (comments in auth.go) plus the existence of the passthrough test
  (length-mismatch case).

Verified:
  go test -count=1 ./internal/shared/ — all green (was 21, now 33 funcs)
  just verify — vet + test + 9 smokes 33s
  just proof contract — 53/0/1 unchanged

Smokes + proof harness keep working without any token configuration:
default Auth is empty struct → middleware is no-op → existing tests
pass unchanged. To exercise the gate, operators set [auth].token in
lakehouse.toml (or, per the "future" note in the ADR, via env var).

Closes audit findings:
  R-001 HIGH — fully mechanically closed (was: partial via bind gate)
  R-007 MED — fully mechanically closed (was: design-only ADR-003)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
228 lines
8.3 KiB
Go
228 lines
8.3 KiB
Go
// Package shared also provides the TOML config loader. Per the ADR
// equivalent to Rust ADR-006 (TOML config over env vars), every
// service reads `lakehouse.toml` with sane defaults and env
// overrides. Config is hot-reload-unaware in G0; reload-on-SIGHUP
// is a G1+ concern.
|
||
package shared
|
||
|
||
import (
|
||
"errors"
|
||
"fmt"
|
||
"io/fs"
|
||
"log/slog"
|
||
"os"
|
||
|
||
"github.com/pelletier/go-toml/v2"
|
||
)
|
||
|
||
// Config is the unified Lakehouse config. Each service reads only
|
||
// the section it cares about, but they all share the same file so
|
||
// operators have one place to look.
|
||
type Config struct {
|
||
Gateway GatewayConfig `toml:"gateway"`
|
||
Storaged ServiceConfig `toml:"storaged"`
|
||
Catalogd CatalogConfig `toml:"catalogd"`
|
||
Ingestd IngestConfig `toml:"ingestd"`
|
||
Queryd QuerydConfig `toml:"queryd"`
|
||
Vectord VectordConfig `toml:"vectord"`
|
||
Embedd EmbeddConfig `toml:"embedd"`
|
||
S3 S3Config `toml:"s3"`
|
||
Log LogConfig `toml:"log"`
|
||
Auth AuthConfig `toml:"auth"`
|
||
}
|
||
|
||
// IngestConfig adds ingestd-specific knobs. ingestd needs to PUT
// parquet to storaged AND register manifests with catalogd, so it
// holds two upstream URLs (StoragedURL, CatalogdURL) in addition to
// its own Bind address.
//
// MaxIngestBytes caps the multipart body size. CSVs are typically
// 4-6× larger than the resulting Snappy-compressed Parquet, so a 256
// MiB CSV yields roughly 50 MiB of Parquet — well under storaged's
// 256 MiB PUT cap. Real-scale validation (2026-04-29) showed 500K
// workers × 18 cols = 344 MiB CSV → 71 MiB Parquet; bumping this knob
// to 512 MiB is the documented path for that workload.
type IngestConfig struct {
	Bind           string `toml:"bind"`
	StoragedURL    string `toml:"storaged_url"`
	CatalogdURL    string `toml:"catalogd_url"`
	MaxIngestBytes int64  `toml:"max_ingest_bytes"`
}
|
||
|
||
// GatewayConfig adds the upstream URLs the reverse proxy fronts, on
// top of the gateway's own Bind address. Each route family
// (/v1/storage, /v1/catalog, /v1/ingest, /v1/sql, /v1/vectors,
// /v1/embed) has its own upstream so services can be scaled
// independently or moved to different boxes without touching gateway
// code.
type GatewayConfig struct {
	Bind        string `toml:"bind"`
	StoragedURL string `toml:"storaged_url"`
	CatalogdURL string `toml:"catalogd_url"`
	IngestdURL  string `toml:"ingestd_url"`
	QuerydURL   string `toml:"queryd_url"`
	VectordURL  string `toml:"vectord_url"`
	EmbeddURL   string `toml:"embedd_url"`
}
|
||
|
||
// EmbeddConfig drives the embed service. ProviderURL points at the
// embedding backend (Ollama in G2, possibly OpenAI/Voyage in G3+).
// DefaultModel is what gets used when callers don't specify a model
// in their request body. CacheSize is the LRU cache cap on
// (model, sha256(text)) → vector lookups; 0 disables caching. The
// default of 10000 entries is roughly 30 MiB at d=768.
type EmbeddConfig struct {
	Bind         string `toml:"bind"`
	ProviderURL  string `toml:"provider_url"`
	DefaultModel string `toml:"default_model"`
	CacheSize    int    `toml:"cache_size"`
}
|
||
|
||
// VectordConfig adds vectord-specific knobs. StoragedURL is optional:
// an empty string disables persistence, which is useful for ephemeral
// dev or test runs. When it is set, indexes Save after every state
// change and Load on startup.
type VectordConfig struct {
	Bind        string `toml:"bind"`
	StoragedURL string `toml:"storaged_url"`
}
|
||
|
||
// QuerydConfig adds queryd-specific knobs. queryd talks DuckDB
// directly to MinIO via DuckDB's httpfs extension (so no storaged URL
// is needed) and reads the catalog over HTTP for view registration,
// hence CatalogdURL. SecretsPath defaults to
// /etc/lakehouse/secrets-go.toml — the same file storaged uses, since
// both services need the S3 credentials.
type QuerydConfig struct {
	Bind         string `toml:"bind"`
	CatalogdURL  string `toml:"catalogd_url"`
	SecretsPath  string `toml:"secrets_path"`
	RefreshEvery string `toml:"refresh_every"` // duration string, e.g. "30s"
}
|
||
|
||
// CatalogConfig adds catalogd-specific knobs on top of the standard
// Bind. StoragedURL points at the storaged service used for manifest
// persistence; G0 defaults to the localhost bind.
type CatalogConfig struct {
	Bind        string `toml:"bind"`
	StoragedURL string `toml:"storaged_url"`
}
|
||
|
||
// ServiceConfig is the per-binary bind config for services that need
// nothing beyond a listen address. An empty Bind means "use the
// service's hardcoded G0 default" — see DefaultConfig.
type ServiceConfig struct {
	Bind string `toml:"bind"`
}
|
||
|
||
// S3Config holds S3-compatible storage settings. A blank Endpoint
// means the AWS default; an empty Bucket means "lakehouse-primary".
// UsePathStyle toggles path-style addressing (DefaultConfig enables
// it alongside a localhost endpoint).
type S3Config struct {
	Endpoint        string `toml:"endpoint"`
	Region          string `toml:"region"`
	Bucket          string `toml:"bucket"`
	AccessKeyID     string `toml:"access_key_id"`
	SecretAccessKey string `toml:"secret_access_key"`
	UsePathStyle    bool   `toml:"use_path_style"`
}
|
||
|
||
// LogConfig carries logging settings. For now that is only the slog
// level (DefaultConfig uses "info"); structured fields land in G1+.
type LogConfig struct {
	Level string `toml:"level"`
}
|
||
|
||
// AuthConfig is the inter-service auth posture from ADR-003.
// Token is a Bearer token; empty means "no auth" (G0 dev mode).
// AllowedIPs is a list of CIDRs (or bare IPs treated as /32);
// empty means "any source IP."
//
// Both layers operate independently when set:
//   - Token + AllowedIPs both empty → middleware is a no-op
//   - Token only → 401 unless Bearer matches
//   - AllowedIPs only → 403 unless r.RemoteAddr in CIDR
//   - Both → both gates apply
//
// The startup gate in shared.Run refuses to start with a non-loopback
// bind AND an empty Token — that's the audit's R-001 + R-007 worst
// case (no auth, world-reachable). LH_<SVC>_ALLOW_NONLOOPBACK=1 still
// bypasses the bind gate for explicit dev cases; the auth gate is
// independent of that bypass and is the real production guard.
type AuthConfig struct {
	Token      string   `toml:"token"`
	AllowedIPs []string `toml:"allowed_ips"`
}
|
||
|
||
// DefaultConfig returns the G0 dev defaults. Ports are shifted to
|
||
// 3110+ to coexist with the live Rust lakehouse on 3100/3201-3204
|
||
// during the migration. G5 cutover flips gateway back to 3100.
|
||
func DefaultConfig() Config {
|
||
return Config{
|
||
Gateway: GatewayConfig{
|
||
Bind: "127.0.0.1:3110",
|
||
StoragedURL: "http://127.0.0.1:3211",
|
||
CatalogdURL: "http://127.0.0.1:3212",
|
||
IngestdURL: "http://127.0.0.1:3213",
|
||
QuerydURL: "http://127.0.0.1:3214",
|
||
VectordURL: "http://127.0.0.1:3215",
|
||
EmbeddURL: "http://127.0.0.1:3216",
|
||
},
|
||
Storaged: ServiceConfig{Bind: "127.0.0.1:3211"},
|
||
Catalogd: CatalogConfig{Bind: "127.0.0.1:3212", StoragedURL: "http://127.0.0.1:3211"},
|
||
Ingestd: IngestConfig{
|
||
Bind: "127.0.0.1:3213",
|
||
StoragedURL: "http://127.0.0.1:3211",
|
||
CatalogdURL: "http://127.0.0.1:3212",
|
||
MaxIngestBytes: 256 << 20, // 256 MiB; bump per deployment via lakehouse.toml
|
||
},
|
||
Vectord: VectordConfig{
|
||
Bind: "127.0.0.1:3215",
|
||
StoragedURL: "http://127.0.0.1:3211",
|
||
},
|
||
Embedd: EmbeddConfig{
|
||
Bind: "127.0.0.1:3216",
|
||
ProviderURL: "http://localhost:11434", // local Ollama
|
||
DefaultModel: "nomic-embed-text",
|
||
CacheSize: 10_000, // ~30 MiB at d=768; set to 0 to disable
|
||
},
|
||
Queryd: QuerydConfig{
|
||
Bind: "127.0.0.1:3214",
|
||
CatalogdURL: "http://127.0.0.1:3212",
|
||
SecretsPath: "/etc/lakehouse/secrets-go.toml",
|
||
RefreshEvery: "30s",
|
||
},
|
||
S3: S3Config{
|
||
Endpoint: "http://localhost:9000",
|
||
Region: "us-east-1",
|
||
Bucket: "lakehouse-primary",
|
||
UsePathStyle: true,
|
||
},
|
||
Log: LogConfig{Level: "info"},
|
||
}
|
||
}
|
||
|
||
// LoadConfig reads `lakehouse.toml` from path; if path is empty or
|
||
// the file doesn't exist, returns DefaultConfig. Any decode error is
|
||
// fatal (we don't want a misconfigured service silently falling back
|
||
// to defaults — that's the kind of bug you find at 2am).
|
||
//
|
||
// Per Opus + Qwen WARN #3: when path WAS given but the file is
|
||
// missing, log a warning so silent default-fallback doesn't hide
|
||
// misconfiguration. Empty path is fine (caller didn't ask for a
|
||
// file); non-empty + missing is suspicious.
|
||
func LoadConfig(path string) (Config, error) {
|
||
cfg := DefaultConfig()
|
||
if path == "" {
|
||
return cfg, nil
|
||
}
|
||
b, err := os.ReadFile(path)
|
||
if errors.Is(err, fs.ErrNotExist) {
|
||
slog.Warn("config file not found, using defaults",
|
||
"path", path,
|
||
"hint", "create the file or pass -config /path/to/lakehouse.toml")
|
||
return cfg, nil
|
||
}
|
||
if err != nil {
|
||
return cfg, fmt.Errorf("read config: %w", err)
|
||
}
|
||
if err := toml.Unmarshal(b, &cfg); err != nil {
|
||
return cfg, fmt.Errorf("parse config: %w", err)
|
||
}
|
||
return cfg, nil
|
||
}
|