root 814197cfd3 ADR-006: auth posture for non-loopback deploy + token rotation impl
ADR-003 locked the auth substrate; ADR-006 ratifies the operator
playbook + adds two implementation pieces needed for Sprint 4
deployment: env-resolved tokens and dual-token rotation.

Six decisions locked in docs/DECISIONS.md:
- 6.1: Non-loopback bind requires auth.token (mechanical gate at
       shared.Run, already implemented; this ratifies it).
- 6.2: Token from env, not TOML. /etc/lakehouse/auth.env (mode 0600)
       loaded by systemd EnvironmentFile=. New TokenEnv field on
       AuthConfig defaults to "AUTH_TOKEN".
- 6.3: AllowedIPs for inter-service same-trust-domain; Token for
       cross-trust-boundary (gateway ↔ external).
- 6.4: /health stays unauthenticated; everything else under
       shared.Run is gated. Already implemented; ratified here.
- 6.5: Token rotation is dual-token. New SecondaryTokens []string
       on AuthConfig — both primary and any secondary pass auth
       during the rotation window. Implemented in this commit.
- 6.6: TLS terminates at the network edge (nginx/Caddy), not
       in-process. Daemons stay HTTP-only; internal traffic stays
       on private subnets per Decision 6.3.

Implementation:
- internal/shared/config.go: AuthConfig gains TokenEnv +
  SecondaryTokens fields. New resolveAuthFromEnv() called by
  LoadConfig fills Token from os.Getenv(TokenEnv) when Token is
  empty. TokenEnv defaults to "AUTH_TOKEN" so the happy path needs
  no TOML config.
- internal/shared/auth.go: RequireAuth pre-encodes Bearer headers
  for primary + every secondary token; per-request constant-time
  compare walks the slice. Fast path is 1 compare (primary).

Tests:
- TestLoadConfig_AuthTokenFromEnv (3 sub-tests): default env name,
  custom token_env, explicit Token wins over env.
- TestRequireAuth_SecondaryTokenAccepted: both primary + secondary
  tokens pass during rotation window.
- TestRequireAuth_SecondaryTokensOnly: only-secondary path works
  for the case where primary was just promoted-to-empty mid-rotation.

go test ./internal/shared all green; existing auth_test.go
unchanged (constant-time compare path preserved).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 17:51:14 -05:00

470 lines
18 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Package shared also provides the TOML config loader. Per the ADR
// equivalent of Rust ADR-006 (TOML config over env vars), every
// service reads `lakehouse.toml` with sane defaults and env
// overrides. Config is hot-reload-unaware in G0; reload-on-SIGHUP
// is a G1+ concern.
package shared
import (
"errors"
"fmt"
"io/fs"
"log/slog"
"os"
"github.com/pelletier/go-toml/v2"
)
// Config is the unified Lakehouse config. Each service reads only
// the section it cares about, but they all share the same file so
// operators have one place to look.
//
// Every field decodes from the TOML table named in its tag (the
// Gateway field is the `gateway` table, and so on). DefaultConfig
// supplies G0 dev values for every section except Auth, which it
// leaves at its zero value.
type Config struct {
// Per-service sections. Storaged uses the generic ServiceConfig
// because it has no knobs beyond its bind address.
Gateway GatewayConfig `toml:"gateway"`
Storaged ServiceConfig `toml:"storaged"`
Catalogd CatalogConfig `toml:"catalogd"`
Ingestd IngestConfig `toml:"ingestd"`
Queryd QuerydConfig `toml:"queryd"`
Vectord VectordConfig `toml:"vectord"`
Embedd EmbeddConfig `toml:"embedd"`
Pathwayd PathwaydConfig `toml:"pathwayd"`
Matrixd MatrixdConfig `toml:"matrixd"`
Observerd ObserverdConfig `toml:"observerd"`
Chatd ChatdConfig `toml:"chatd"`
// Cross-cutting sections that are not tied to a single service.
S3 S3Config `toml:"s3"`
Models ModelsConfig `toml:"models"`
Log LogConfig `toml:"log"`
Auth AuthConfig `toml:"auth"`
}
// IngestConfig adds ingestd-specific knobs. ingestd needs to PUT
// parquet to storaged AND register manifests with catalogd, so it
// holds two upstream URLs in addition to its own bind.
//
// MaxIngestBytes caps the multipart body size. CSVs are typically
// 4-6× larger than the resulting Snappy-compressed Parquet, so 256
// MiB CSV → ~50 MiB Parquet — well under storaged's 256 MiB PUT
// cap. Real-scale validation (2026-04-29) showed 500K workers ×
// 18 cols = 344 MiB CSV → 71 MiB Parquet; bumping this knob to
// 512 MiB is the documented path for that workload.
type IngestConfig struct {
// Bind is ingestd's own host:port; DefaultConfig uses 127.0.0.1:3213.
Bind string `toml:"bind"`
// StoragedURL is the storaged upstream that receives the parquet PUTs.
StoragedURL string `toml:"storaged_url"`
// CatalogdURL is the catalogd upstream where manifests are registered.
CatalogdURL string `toml:"catalogd_url"`
// MaxIngestBytes is the multipart body cap in bytes; DefaultConfig
// sets 256 << 20 (256 MiB).
MaxIngestBytes int64 `toml:"max_ingest_bytes"`
}
// GatewayConfig adds the upstream URLs the reverse proxy fronts.
// Each route family (/v1/storage, /v1/catalog, /v1/ingest, /v1/sql,
// /v1/vectors, /v1/embed, /v1/pathway, /v1/matrix, /v1/observer)
// has its own upstream so we can scale services independently or
// move them to different boxes without touching gateway code.
//
// NOTE(review): the route list above has no entry for ChatdURL.
// ChatdConfig documents /v1/chat, so that is presumably the prefix the
// gateway forwards to chatd — confirm against the gateway routing code.
type GatewayConfig struct {
// Bind is the gateway's own host:port; DefaultConfig uses
// 127.0.0.1:3110.
Bind string `toml:"bind"`
// One upstream base URL per backing service. DefaultConfig points
// each at the matching service's loopback bind (ports 3211-3220).
StoragedURL string `toml:"storaged_url"`
CatalogdURL string `toml:"catalogd_url"`
IngestdURL string `toml:"ingestd_url"`
QuerydURL string `toml:"queryd_url"`
VectordURL string `toml:"vectord_url"`
EmbeddURL string `toml:"embedd_url"`
PathwaydURL string `toml:"pathwayd_url"`
MatrixdURL string `toml:"matrixd_url"`
ObserverdURL string `toml:"observerd_url"`
ChatdURL string `toml:"chatd_url"`
}
// EmbeddConfig drives the embed service. ProviderURL points at the
// embedding backend (Ollama in G2, possibly OpenAI/Voyage in G3+).
// DefaultModel is what gets used when callers don't specify a
// model in their request body. CacheSize is the LRU cache cap on
// (model, sha256(text)) → vector lookups; 0 disables caching.
// Default 10000 entries ≈ 30 MiB at d=768.
type EmbeddConfig struct {
// Bind is embedd's own host:port; DefaultConfig uses 127.0.0.1:3216.
Bind string `toml:"bind"`
// ProviderURL is the embedding backend; DefaultConfig uses the local
// Ollama at http://localhost:11434.
ProviderURL string `toml:"provider_url"`
// DefaultModel is the fallback model name; DefaultConfig uses
// "nomic-embed-text".
DefaultModel string `toml:"default_model"`
// CacheSize is the LRU entry cap (a count of entries, not bytes);
// 0 turns the cache off.
CacheSize int `toml:"cache_size"`
}
// VectordConfig adds vectord-specific knobs. StoragedURL is
// optional — empty string disables persistence, useful for ephemeral
// dev or test runs. When set, indexes Save after every state change
// and Load on startup.
type VectordConfig struct {
// Bind is vectord's own host:port; DefaultConfig uses 127.0.0.1:3215.
Bind string `toml:"bind"`
// StoragedURL selects the persistence backend. DefaultConfig sets it
// to the local storaged (http://127.0.0.1:3211), so persistence is on
// unless the TOML explicitly sets this to "".
StoragedURL string `toml:"storaged_url"`
}
// PathwaydConfig drives the pathway-memory service (cmd/pathwayd).
// PersistPath: file path to the JSONL log; empty = in-memory only
// (test/dev). Production sets a stable path under /var/lib/lakehouse
// or similar so traces survive restart.
type PathwaydConfig struct {
// Bind is pathwayd's own host:port; DefaultConfig uses 127.0.0.1:3217.
Bind string `toml:"bind"`
// PersistPath is the JSONL log location. DefaultConfig leaves it
// empty, i.e. in-memory only.
PersistPath string `toml:"persist_path"`
}
// MatrixdConfig drives the matrix-indexer service (cmd/matrixd).
// Per docs/SPEC.md §3.4: multi-corpus retrieve+merge over vectord
// with embed-via-embedd for query text. Both upstream URLs are
// required — matrixd has no in-process fallback.
type MatrixdConfig struct {
// Bind is matrixd's own host:port; DefaultConfig uses 127.0.0.1:3218.
Bind string `toml:"bind"`
// EmbeddURL is the embedd upstream used to embed query text;
// DefaultConfig uses http://127.0.0.1:3216.
EmbeddURL string `toml:"embedd_url"`
// VectordURL is the vectord upstream the retrieve+merge runs over;
// DefaultConfig uses http://127.0.0.1:3215.
VectordURL string `toml:"vectord_url"`
}
// ChatdConfig drives the chat dispatcher service (cmd/chatd) — Phase 4.
// Routes /v1/chat to the right provider based on model-name prefix
// or :cloud suffix. Per-provider API keys come from env vars (or
// /etc/lakehouse/<provider>.env files); empty keys leave the provider
// unregistered so requests for that provider 404 cleanly.
//
// OllamaURL is the local Ollama upstream (no auth). Empty disables
// the local Ollama provider — useful in deployments that don't run
// Ollama on the box (cloud-only operation).
//
// Only the NAMES of the key env vars and the PATHS of the key files
// are stored here; the API keys themselves are never fields of this
// struct.
type ChatdConfig struct {
// Bind is chatd's own host:port; DefaultConfig uses 127.0.0.1:3220.
Bind string `toml:"bind"`
// OllamaURL is the local Ollama base URL; DefaultConfig uses
// http://localhost:11434. Empty disables the local provider.
OllamaURL string `toml:"ollama_url"`
// API-key env var names. Default to the conventional names.
// Operators can rename these for environments using different env
// var conventions, but the defaults match /etc/lakehouse/*.env
// files that systemd already loads.
// DefaultConfig values: OLLAMA_CLOUD_KEY, OPENROUTER_API_KEY,
// OPENCODE_API_KEY, KIMI_API_KEY.
OllamaCloudKeyEnv string `toml:"ollama_cloud_key_env"`
OpenRouterKeyEnv string `toml:"openrouter_key_env"`
OpenCodeKeyEnv string `toml:"opencode_key_env"`
KimiKeyEnv string `toml:"kimi_key_env"`
// Optional .env file paths — a fallback when the env var isn't set
// in the process environment. Same keys, "KEY=value" lines.
// DefaultConfig points these at /etc/lakehouse/<provider>.env.
OllamaCloudKeyFile string `toml:"ollama_cloud_key_file"`
OpenRouterKeyFile string `toml:"openrouter_key_file"`
OpenCodeKeyFile string `toml:"opencode_key_file"`
KimiKeyFile string `toml:"kimi_key_file"`
// Per-call timeout in seconds. 0 = 180s default.
// DefaultConfig already sets 180, so 0 only occurs when the TOML
// sets it explicitly.
TimeoutSecs int `toml:"timeout_secs"`
}
// ObserverdConfig drives the observer service (cmd/observerd).
// PersistPath: file path to the JSONL ops log; empty = in-memory
// only (test/dev). Production sets a stable path under
// /var/lib/lakehouse/observer/ops.jsonl so ops survive restart.
// Mirrors the PathwaydConfig pattern.
type ObserverdConfig struct {
// Bind is observerd's own host:port; DefaultConfig uses
// 127.0.0.1:3219.
Bind string `toml:"bind"`
// PersistPath is the JSONL ops-log location. DefaultConfig leaves it
// empty, i.e. in-memory only.
PersistPath string `toml:"persist_path"`
}
// QuerydConfig adds queryd-specific knobs. queryd talks DuckDB
// directly to MinIO via DuckDB's httpfs extension (so no storaged
// URL needed), and reads the catalog over HTTP for view registration.
// SecretsPath defaults to /etc/lakehouse/secrets-go.toml — the same
// file storaged uses, since both services need the S3 credentials.
type QuerydConfig struct {
// Bind is queryd's own host:port; DefaultConfig uses 127.0.0.1:3214.
Bind string `toml:"bind"`
// CatalogdURL is the catalogd upstream read for view registration;
// DefaultConfig uses http://127.0.0.1:3212.
CatalogdURL string `toml:"catalogd_url"`
// SecretsPath is the file holding the S3 credentials.
SecretsPath string `toml:"secrets_path"`
// RefreshEvery is kept as a string here and must be parsed by the
// consumer; DefaultConfig uses "30s".
// NOTE(review): presumably the interval at which views are
// re-registered from the catalog — confirm in cmd/queryd.
RefreshEvery string `toml:"refresh_every"` // duration string, e.g. "30s"
}
// CatalogConfig adds catalogd-specific knobs on top of the standard
// bind. StoragedURL points at the storaged service for manifest
// persistence; G0 defaults to the localhost bind.
type CatalogConfig struct {
// Bind is catalogd's own host:port; DefaultConfig uses
// 127.0.0.1:3212.
Bind string `toml:"bind"`
// StoragedURL is the storaged upstream used for manifest
// persistence; DefaultConfig uses http://127.0.0.1:3211.
StoragedURL string `toml:"storaged_url"`
}
// ServiceConfig is the per-binary bind config. Default Bind ""
// means "use the service's hardcoded G0 default" — see DefaultConfig.
//
// Within Config it is used for services that need nothing beyond a
// listen address (currently only Storaged).
type ServiceConfig struct {
// Bind is the service's host:port listen address.
Bind string `toml:"bind"`
}
// S3Config holds S3-compatible storage settings. Endpoint blank →
// AWS default. Bucket "" → "lakehouse-primary".
//
// Those blank-value fallbacks are applied by whoever consumes this
// struct, not here. DefaultConfig itself fills in a local
// S3-compatible endpoint (http://localhost:9000), region us-east-1,
// bucket "lakehouse-primary" and path-style addressing, so a blank
// Endpoint only occurs when the TOML sets it to "" explicitly.
type S3Config struct {
Endpoint string `toml:"endpoint"`
Region string `toml:"region"`
Bucket string `toml:"bucket"`
// Credentials. DefaultConfig leaves both empty.
// NOTE(review): QuerydConfig documents a separate secrets file for
// the S3 credentials, so these are presumably not meant to be set in
// the committed lakehouse.toml — confirm.
AccessKeyID string `toml:"access_key_id"`
SecretAccessKey string `toml:"secret_access_key"`
// UsePathStyle is true in DefaultConfig.
UsePathStyle bool `toml:"use_path_style"`
}
// LogConfig — slog level for now; structured fields land G1+.
type LogConfig struct {
// Level is the log level name as a string; DefaultConfig uses
// "info". Parsing it into a slog level is left to the consumer.
Level string `toml:"level"`
}
// ModelsConfig assigns a concrete model ID to every tier of the
// small-model pipeline (per project_small_model_pipeline_vision.md and
// the Rust `config/providers.toml` convention). Callers ask for a tier
// by name rather than hardcoding a model ID, so retargeting a tier is
// an edit to this config instead of a hunt through code.
//
// Tier philosophy:
//   - local_*    : on-box Ollama. Cheap, fast, JSON-clean. The inner-loop
//     hot path, called repeatedly per query.
//   - cloud_*    : Ollama Cloud (Pro plan). Larger context, consulted
//     when local is uncertain. Auth via OLLAMA_CLOUD_KEY.
//   - frontier_* : OpenRouter / OpenCode. Rate-limited and billed per
//     call; reserved for blockers and full-scope reviews, not steady
//     state.
//
// WeakModels is the codified "local-hot-path eligible" list consulted
// by the matrix downgrade gate; it replaces the switch that used to be
// hardcoded in internal/matrix/downgrade.go. A listed model bypasses
// the corpus-downgrade rule because it is already weak and has nothing
// further to downgrade to. Strong (paid / cloud / frontier) models
// trigger the gate.
type ModelsConfig struct {
	// Local tier.
	LocalFast   string `toml:"local_fast"`
	LocalEmbed  string `toml:"local_embed"`
	LocalJudge  string `toml:"local_judge"`
	LocalReview string `toml:"local_review"`

	// Cloud tier.
	CloudJudge  string `toml:"cloud_judge"`
	CloudReview string `toml:"cloud_review"`
	CloudStrong string `toml:"cloud_strong"`

	// Frontier tier.
	FrontierReview string `toml:"frontier_review"`
	FrontierArch   string `toml:"frontier_arch"`
	FrontierStrong string `toml:"frontier_strong"`
	FrontierFree   string `toml:"frontier_free"`

	// Models exempt from the strong-model downgrade gate.
	WeakModels []string `toml:"weak_models"`
}

// Resolve looks up the model ID configured for a tier name such as
// "local_judge". The accepted names are exactly the TOML keys of the
// tier fields. An unrecognised tier yields "" instead of a panic: the
// config is additive, so a missing tier should not crash the binary at
// startup. Callers are expected to fall back to a hardcoded default
// and log a warning when they get "".
func (m ModelsConfig) Resolve(tier string) string {
	var model string // remains "" when the tier is not recognised

	switch tier {
	case "local_fast":
		model = m.LocalFast
	case "local_embed":
		model = m.LocalEmbed
	case "local_judge":
		model = m.LocalJudge
	case "local_review":
		model = m.LocalReview
	case "cloud_judge":
		model = m.CloudJudge
	case "cloud_review":
		model = m.CloudReview
	case "cloud_strong":
		model = m.CloudStrong
	case "frontier_review":
		model = m.FrontierReview
	case "frontier_arch":
		model = m.FrontierArch
	case "frontier_strong":
		model = m.FrontierStrong
	case "frontier_free":
		model = m.FrontierFree
	}

	return model
}

// IsWeak reports whether `model` appears in WeakModels. The comparison
// is an exact, case-sensitive string match. matrix.downgrade uses the
// answer to decide whether to bypass the strong-model downgrade gate
// (weak models stay on the full lakehouse path).
func (m ModelsConfig) IsWeak(model string) bool {
	for i := range m.WeakModels {
		if m.WeakModels[i] == model {
			return true
		}
	}
	return false
}
// AuthConfig is the inter-service auth posture from ADR-003.
// Token is a Bearer token; empty means "no auth" (G0 dev mode).
// AllowedIPs is a list of CIDRs (or bare IPs treated as /32);
// empty means "any source IP."
//
// Both layers operate independently when set:
// - Token + AllowedIPs both empty → middleware is a no-op
// - Token only → 401 unless Bearer matches
// - AllowedIPs only → 403 unless r.RemoteAddr in CIDR
// - Both → both gates apply
//
// The startup gate in shared.Run refuses to start with non-loopback
// bind AND empty Token — that's the audit's R-001 + R-007 worst
// case (no auth, world-reachable). LH_<SVC>_ALLOW_NONLOOPBACK=1 still
// bypasses the bind gate for explicit dev cases; the auth gate is
// independent of that bypass and is the real production guard.
//
// TokenEnv and SecondaryTokens are the ADR-006 additions (Decisions
// 6.2 and 6.5). This struct only carries the values; the enforcement
// described above lives in the auth middleware and shared.Run, which
// are outside this file.
// NOTE(review): the table above does not say how a config with only
// SecondaryTokens set (Token empty) is treated — confirm in
// RequireAuth.
type AuthConfig struct {
// Token is the primary Bearer token. It may be written in the TOML
// or, when left empty there, filled from the environment (see
// TokenEnv).
Token string `toml:"token"`
// AllowedIPs is the source-IP allowlist: CIDRs or bare IPs.
AllowedIPs []string `toml:"allowed_ips"`
// TokenEnv names an environment variable; LoadConfig populates
// Token from os.Getenv(TokenEnv) when Token is empty. Per ADR-006
// 6.2: production deploys put the secret in /etc/lakehouse/auth.env
// (mode 0600) loaded by systemd EnvironmentFile=, NOT in the
// committed TOML. TokenEnv defaults to "AUTH_TOKEN".
// The default is applied at resolve time; an empty TokenEnv is not
// rewritten to "AUTH_TOKEN" in the struct.
TokenEnv string `toml:"token_env"`
// SecondaryTokens lets operators stage a rotation: both primary
// and any secondary token pass auth during the rotation window.
// After every caller updates, operators promote secondary →
// primary and clear secondary. Per ADR-006 Decision 6.5.
SecondaryTokens []string `toml:"secondary_tokens"`
}
// DefaultConfig builds the G0 development defaults. All services
// listen on loopback, and ports are shifted to 3110+ so this stack can
// coexist with the live Rust lakehouse on 3100/3201-3204 during the
// migration; the G5 cutover flips the gateway back to 3100.
//
// Sections that are never assigned below (Auth, and the PersistPath of
// Pathwayd/Observerd) keep their zero values: no auth and in-memory
// persistence respectively.
func DefaultConfig() Config {
	const (
		// Loopback listen address of each service. Every upstream URL
		// below is derived from these, so a bind and the URLs that
		// point at it cannot drift apart.
		gatewayAddr   = "127.0.0.1:3110"
		storagedAddr  = "127.0.0.1:3211"
		catalogdAddr  = "127.0.0.1:3212"
		ingestdAddr   = "127.0.0.1:3213"
		querydAddr    = "127.0.0.1:3214"
		vectordAddr   = "127.0.0.1:3215"
		embeddAddr    = "127.0.0.1:3216"
		pathwaydAddr  = "127.0.0.1:3217"
		matrixdAddr   = "127.0.0.1:3218"
		observerdAddr = "127.0.0.1:3219"
		chatdAddr     = "127.0.0.1:3220"

		httpScheme = "http://"
		ollamaURL  = "http://localhost:11434" // local Ollama
		etcDir     = "/etc/lakehouse/"

		// qwen3.5:latest replaces qwen2.5 as the local default per the
		// 2026-04-29 architectural review (stronger local model, same
		// JSON-clean property).
		localModel = "qwen3.5:latest"
	)

	var cfg Config

	cfg.Gateway = GatewayConfig{
		Bind:         gatewayAddr,
		StoragedURL:  httpScheme + storagedAddr,
		CatalogdURL:  httpScheme + catalogdAddr,
		IngestdURL:   httpScheme + ingestdAddr,
		QuerydURL:    httpScheme + querydAddr,
		VectordURL:   httpScheme + vectordAddr,
		EmbeddURL:    httpScheme + embeddAddr,
		PathwaydURL:  httpScheme + pathwaydAddr,
		MatrixdURL:   httpScheme + matrixdAddr,
		ObserverdURL: httpScheme + observerdAddr,
		ChatdURL:     httpScheme + chatdAddr,
	}

	cfg.Storaged = ServiceConfig{Bind: storagedAddr}

	cfg.Catalogd = CatalogConfig{
		Bind:        catalogdAddr,
		StoragedURL: httpScheme + storagedAddr,
	}

	cfg.Ingestd = IngestConfig{
		Bind:        ingestdAddr,
		StoragedURL: httpScheme + storagedAddr,
		CatalogdURL: httpScheme + catalogdAddr,
		// 256 MiB; bump per deployment via lakehouse.toml.
		MaxIngestBytes: 256 << 20,
	}

	cfg.Queryd = QuerydConfig{
		Bind:         querydAddr,
		CatalogdURL:  httpScheme + catalogdAddr,
		SecretsPath:  etcDir + "secrets-go.toml",
		RefreshEvery: "30s",
	}

	cfg.Vectord = VectordConfig{
		Bind:        vectordAddr,
		StoragedURL: httpScheme + storagedAddr,
	}

	cfg.Embedd = EmbeddConfig{
		Bind:         embeddAddr,
		ProviderURL:  ollamaURL,
		DefaultModel: "nomic-embed-text",
		// ~30 MiB at d=768; set to 0 to disable.
		CacheSize: 10_000,
	}

	// PersistPath stays empty = in-memory only. Production sets e.g.
	// /var/lib/lakehouse/pathway/state.jsonl.
	cfg.Pathwayd = PathwaydConfig{Bind: pathwaydAddr}

	cfg.Matrixd = MatrixdConfig{
		Bind:       matrixdAddr,
		EmbeddURL:  httpScheme + embeddAddr,
		VectordURL: httpScheme + vectordAddr,
	}

	// PersistPath stays empty = in-memory only.
	cfg.Observerd = ObserverdConfig{Bind: observerdAddr}

	cfg.Chatd = ChatdConfig{
		Bind:               chatdAddr,
		OllamaURL:          ollamaURL,
		OllamaCloudKeyEnv:  "OLLAMA_CLOUD_KEY",
		OpenRouterKeyEnv:   "OPENROUTER_API_KEY",
		OpenCodeKeyEnv:     "OPENCODE_API_KEY",
		KimiKeyEnv:         "KIMI_API_KEY",
		OllamaCloudKeyFile: etcDir + "ollama_cloud.env",
		OpenRouterKeyFile:  etcDir + "openrouter.env",
		OpenCodeKeyFile:    etcDir + "opencode.env",
		KimiKeyFile:        etcDir + "kimi.env",
		TimeoutSecs:        180,
	}

	cfg.S3 = S3Config{
		Endpoint:     "http://localhost:9000",
		Region:       "us-east-1",
		Bucket:       "lakehouse-primary",
		UsePathStyle: true,
	}

	cfg.Models = ModelsConfig{
		// Tier 1 — local hot path. JSON-clean, fast, deterministic.
		LocalFast:   localModel,
		LocalEmbed:  "nomic-embed-text",
		LocalJudge:  localModel,
		LocalReview: localModel,

		// Tier 2 — Ollama Cloud (Pro plan), 2026-04-28 upgrade.
		// kimi-k2:1t is upstream-broken; deepseek / kimi-k2.6 /
		// qwen3-coder are the working primaries.
		CloudJudge:  "kimi-k2.6:cloud",
		CloudReview: "qwen3-coder:480b",
		CloudStrong: "deepseek-v3.2",

		// Tier 3 — frontier. OpenRouter credits + OpenCode key.
		// Use sparingly: rate-limited, billed per call.
		FrontierReview: "openrouter/anthropic/claude-opus-4-7",
		FrontierArch:   "openrouter/moonshotai/kimi-k2-0905",
		FrontierStrong: "openrouter/openai/gpt-5",
		FrontierFree:   "opencode/claude-opus-4-7",

		// Local-hot-path eligible: matrix.downgrade reads this list
		// to decide whether to bypass the strong-model gate.
		WeakModels: []string{localModel, "qwen3:latest"},
	}

	cfg.Log = LogConfig{Level: "info"}

	return cfg
}
// LoadConfig reads `lakehouse.toml` from path and layers it over
// DefaultConfig. If path is empty or the file doesn't exist, the
// defaults are used as-is. Any read or decode error is fatal (we
// don't want a misconfigured service silently falling back to
// defaults — that's the kind of bug you find at 2am).
//
// Per Opus + Qwen WARN #3: when path WAS given but the file is
// missing, log a warning so silent default-fallback doesn't hide
// misconfiguration. Empty path is fine (caller didn't ask for a
// file); non-empty + missing is suspicious.
//
// The auth token is resolved from the environment on EVERY successful
// return, including the empty-path and missing-file fallbacks. Per
// ADR-006 Decision 6.2 the secret arrives via the environment rather
// than the TOML, so it must not depend on a config file being present.
// Skipping it on the fallback paths would drop an operator-supplied
// token and leave the service unauthenticated.
//
// Returns the resolved Config and a nil error on success. On failure
// the error wraps the underlying cause ("read config: …" or
// "parse config: …") and the returned Config must not be relied on.
func LoadConfig(path string) (Config, error) {
	cfg := DefaultConfig()

	if path != "" {
		b, err := os.ReadFile(path)
		switch {
		case errors.Is(err, fs.ErrNotExist):
			// Suspicious but not fatal: continue with the defaults.
			slog.Warn("config file not found, using defaults",
				"path", path,
				"hint", "create the file or pass -config /path/to/lakehouse.toml")
		case err != nil:
			return cfg, fmt.Errorf("read config: %w", err)
		default:
			if err := toml.Unmarshal(b, &cfg); err != nil {
				return cfg, fmt.Errorf("parse config: %w", err)
			}
		}
	}

	// Runs after the TOML is applied so an explicit `token` or a custom
	// `token_env` from the file is honoured.
	resolveAuthFromEnv(&cfg.Auth)
	return cfg, nil
}
// resolveAuthFromEnv fills auth.Token from the environment when the
// config did not provide one. Per ADR-006 Decision 6.2, production
// deploys keep the secret in /etc/lakehouse/auth.env (mode 0600),
// loaded by systemd EnvironmentFile=, never in the committed TOML.
//
// The variable consulted is auth.TokenEnv, or "AUTH_TOKEN" when
// TokenEnv is empty, so setting AUTH_TOKEN alone is enough. A Token
// that is already set always wins over the environment. TokenEnv
// itself is never modified.
func resolveAuthFromEnv(auth *AuthConfig) {
	if auth.Token != "" {
		return // an explicit token takes precedence over the environment
	}

	name := auth.TokenEnv
	if name == "" {
		name = "AUTH_TOKEN"
	}

	// Token is known to be empty here, so an unset or empty variable
	// simply leaves it empty.
	auth.Token = os.Getenv(name)
}