root 814197cfd3 ADR-006: auth posture for non-loopback deploy + token rotation impl
ADR-003 locked the auth substrate; ADR-006 ratifies the operator
playbook + adds two implementation pieces needed for Sprint 4
deployment: env-resolved tokens and dual-token rotation.

Six decisions locked in docs/DECISIONS.md:
- 6.1: Non-loopback bind requires auth.token (mechanical gate at
       shared.Run, already implemented; this ratifies it).
- 6.2: Token from env, not TOML. /etc/lakehouse/auth.env (mode 0600)
       loaded by systemd EnvironmentFile=. New TokenEnv field on
       AuthConfig defaults to "AUTH_TOKEN".
- 6.3: AllowedIPs for inter-service same-trust-domain; Token for
       cross-trust-boundary (gateway ↔ external).
- 6.4: /health stays unauthenticated; everything else under
       shared.Run is gated. Already implemented; ratified here.
- 6.5: Token rotation is dual-token. New SecondaryTokens []string
       on AuthConfig — both primary and any secondary pass auth
       during the rotation window. Implemented in this commit.
- 6.6: TLS terminates at the network edge (nginx/Caddy), not
       in-process. Daemons stay HTTP-only; internal traffic stays
       on private subnets per Decision 6.3.

Implementation:
- internal/shared/config.go: AuthConfig gains TokenEnv +
  SecondaryTokens fields. New resolveAuthFromEnv() called by
  LoadConfig fills Token from os.Getenv(TokenEnv) when Token is
  empty. TokenEnv defaults to "AUTH_TOKEN" so the happy path needs
  no TOML config.
- internal/shared/auth.go: RequireAuth pre-encodes Bearer headers
  for primary + every secondary token; per-request constant-time
  compare walks the slice. Fast path is 1 compare (primary).

Tests:
- TestLoadConfig_AuthTokenFromEnv (3 sub-tests): default env name,
  custom token_env, explicit Token wins over env.
- TestRequireAuth_SecondaryTokenAccepted: both primary + secondary
  tokens pass during rotation window.
- TestRequireAuth_SecondaryTokensOnly: only-secondary path works
  for the case where primary was just promoted-to-empty mid-rotation.

go test ./internal/shared all green; existing auth_test.go
unchanged (constant-time compare path preserved).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 17:51:14 -05:00

294 lines
9.6 KiB
Go

package shared
import (
"os"
"path/filepath"
"strings"
"testing"
)
// Closes the config.go side of R-002 — TOML loader, default values,
// missing-file warn semantics. The audit flagged "internal/shared
// has zero tests" without distinguishing server.go from config.go;
// this file covers the latter.

// TestDefaultConfig_G0Ports pins the values DefaultConfig returns for
// every daemon bind address plus the ingestd, embedd, and queryd
// defaults, so changing any of them requires a matching test edit.
func TestDefaultConfig_G0Ports(t *testing.T) {
	defaults := DefaultConfig()

	// The ports sit at 3110+ so this stack can run next to the live
	// Rust lakehouse (3100/3201-3204) while the migration is in
	// progress. Pinning them here keeps a refactor from flipping a
	// port unnoticed.
	wantBinds := []struct {
		label string
		got   string
		want  string
	}{
		{"gateway bind", defaults.Gateway.Bind, "127.0.0.1:3110"},
		{"storaged bind", defaults.Storaged.Bind, "127.0.0.1:3211"},
		{"catalogd bind", defaults.Catalogd.Bind, "127.0.0.1:3212"},
		{"ingestd bind", defaults.Ingestd.Bind, "127.0.0.1:3213"},
		{"queryd bind", defaults.Queryd.Bind, "127.0.0.1:3214"},
		{"vectord bind", defaults.Vectord.Bind, "127.0.0.1:3215"},
		{"embedd bind", defaults.Embedd.Bind, "127.0.0.1:3216"},
	}
	for i := range wantBinds {
		row := wantBinds[i]
		if row.got == row.want {
			continue
		}
		t.Errorf("%s = %q, want %q", row.label, row.got, row.want)
	}

	// Documented G0 ingest cap is 256 MiB. The real-scale 500K test
	// raised it to 512, but the default itself stays at 256.
	const wantIngestCap = 256 << 20
	if got := defaults.Ingestd.MaxIngestBytes; got != wantIngestCap {
		t.Errorf("ingestd MaxIngestBytes = %d, want %d", got, wantIngestCap)
	}

	// The embedd default model is the G2 nomic-embed-text default.
	if got := defaults.Embedd.DefaultModel; got != "nomic-embed-text" {
		t.Errorf("embedd DefaultModel = %q, want nomic-embed-text", got)
	}

	// Production refresh interval for queryd, not the 500ms override
	// used by the proof harness.
	if got := defaults.Queryd.RefreshEvery; got != "30s" {
		t.Errorf("queryd RefreshEvery = %q, want 30s", got)
	}
}
// TestLoadConfig_EmptyPath_ReturnsDefaults checks that LoadConfig("")
// succeeds and hands back the default gateway bind address.
func TestLoadConfig_EmptyPath_ReturnsDefaults(t *testing.T) {
	const wantBind = "127.0.0.1:3110"

	loaded, loadErr := LoadConfig("")
	if loadErr != nil {
		t.Fatalf("empty path should not error, got %v", loadErr)
	}
	if got := loaded.Gateway.Bind; got != wantBind {
		t.Errorf("expected default gateway bind, got %q", got)
	}
}
// TestLoadConfig_MissingFile_FallsBackToDefaults checks that pointing
// LoadConfig at a path that does not exist yields no error and the
// default storaged bind address.
func TestLoadConfig_MissingFile_FallsBackToDefaults(t *testing.T) {
	// Per the comment in config.go: "non-empty + missing is suspicious"
	// — but the contract is to log a warn and return defaults, not
	// fail. We verify the contract; capturing the warn line is a
	// stretch for a unit test (slog default sink is os.Stderr).
	//
	// The path lives inside a fresh t.TempDir() so it is guaranteed
	// to be absent regardless of what exists on the host filesystem.
	missing := filepath.Join(t.TempDir(), "lakehouse.toml")

	cfg, err := LoadConfig(missing)
	if err != nil {
		t.Fatalf("missing file should not error, got %v", err)
	}
	if cfg.Storaged.Bind != "127.0.0.1:3211" {
		t.Errorf("expected default storaged bind on missing file, got %q", cfg.Storaged.Bind)
	}
}
// TestLoadConfig_ValidTOML_RoundTrip writes a partial config that
// overrides only [gateway] and [s3], loads it, and checks that the
// overrides land while an untouched section keeps its default.
func TestLoadConfig_ValidTOML_RoundTrip(t *testing.T) {
	const partial = `[gateway]
bind = "0.0.0.0:8080"
[s3]
endpoint = "http://other-minio:9000"
bucket = "custom-bucket"
`
	tomlPath := filepath.Join(t.TempDir(), "lakehouse.toml")
	if writeErr := os.WriteFile(tomlPath, []byte(partial), 0o644); writeErr != nil {
		t.Fatalf("write config: %v", writeErr)
	}

	loaded, loadErr := LoadConfig(tomlPath)
	if loadErr != nil {
		t.Fatalf("LoadConfig: %v", loadErr)
	}

	// Overridden values.
	if got := loaded.Gateway.Bind; got != "0.0.0.0:8080" {
		t.Errorf("gateway.bind = %q, want 0.0.0.0:8080", got)
	}
	if got := loaded.S3.Bucket; got != "custom-bucket" {
		t.Errorf("s3.bucket = %q, want custom-bucket", got)
	}

	// A section the file never mentions must still hold its default:
	// the TOML decoder leaves fields it did not see untouched.
	if got := loaded.Storaged.Bind; got != "127.0.0.1:3211" {
		t.Errorf("storaged.bind drifted to %q, want default 127.0.0.1:3211", got)
	}
}
// TestDefaultConfig_ModelsTier pins the model name that each tier
// resolves to under DefaultConfig, and the empty-string result for a
// tier name that is not known.
func TestDefaultConfig_ModelsTier(t *testing.T) {
	defaults := DefaultConfig()

	// Each entry is {tier, expected model}. Dropping a tier or
	// renaming a default has to come with an edit here.
	tierDefaults := [][2]string{
		{"local_fast", "qwen3.5:latest"},
		{"local_embed", "nomic-embed-text"},
		{"local_judge", "qwen3.5:latest"},
		{"cloud_judge", "kimi-k2.6:cloud"},
		{"cloud_review", "qwen3-coder:480b"},
		{"frontier_review", "openrouter/anthropic/claude-opus-4-7"},
		{"frontier_free", "opencode/claude-opus-4-7"},
	}
	for _, entry := range tierDefaults {
		tier, want := entry[0], entry[1]
		got := defaults.Models.Resolve(tier)
		if got == want {
			continue
		}
		t.Errorf("Models.Resolve(%q) = %q, want %q", tier, got, want)
	}

	// An unknown tier resolves to ""; falling back is the caller's job.
	if got := defaults.Models.Resolve("nonexistent"); got != "" {
		t.Errorf("Models.Resolve(nonexistent) = %q, want empty string", got)
	}
}
// TestModelsConfig_IsWeak checks IsWeak against DefaultConfig: two
// models that must be reported weak and four that must not be.
func TestModelsConfig_IsWeak(t *testing.T) {
	defaults := DefaultConfig()

	// The default WeakModels set is the matrix.downgrade bypass list.
	if weak := defaults.Models.IsWeak("qwen3.5:latest"); !weak {
		t.Error("qwen3.5:latest should be weak (in default WeakModels)")
	}
	if weak := defaults.Models.IsWeak("qwen3:latest"); !weak {
		t.Error("qwen3:latest should be weak")
	}

	// Strong, cloud, and frontier models must not be flagged.
	strong := []string{
		"opencode/claude-opus-4-7",
		"openrouter/openai/gpt-5",
		"qwen3-coder:480b",
		"deepseek-v3.2",
	}
	for i := range strong {
		if !defaults.Models.IsWeak(strong[i]) {
			continue
		}
		t.Errorf("%s should NOT be weak", strong[i])
	}
}
// TestLoadConfig_ModelsTOMLRoundTrip overrides one tier and the
// weak_models list in a [models] table, then checks the overrides
// load and an unspecified tier keeps its default.
func TestLoadConfig_ModelsTOMLRoundTrip(t *testing.T) {
	const override = `[models]
local_judge = "custom-judge:latest"
weak_models = ["custom-judge:latest", "qwen3:latest"]
`
	tomlPath := filepath.Join(t.TempDir(), "lakehouse.toml")
	if writeErr := os.WriteFile(tomlPath, []byte(override), 0o644); writeErr != nil {
		t.Fatalf("write config: %v", writeErr)
	}

	loaded, loadErr := LoadConfig(tomlPath)
	if loadErr != nil {
		t.Fatalf("LoadConfig: %v", loadErr)
	}

	if got := loaded.Models.LocalJudge; got != "custom-judge:latest" {
		t.Errorf("local_judge = %q, want custom-judge:latest", got)
	}
	if weak := loaded.Models.IsWeak("custom-judge:latest"); !weak {
		t.Error("custom-judge:latest should be weak after override")
	}

	// Tiers the file does not name keep their defaults because the
	// TOML decoder leaves unseen fields alone. Slices are different:
	// they are replaced wholesale, so the weak_models override above
	// deliberately trades away the full default list.
	if got := loaded.Models.LocalFast; got != "qwen3.5:latest" {
		t.Errorf("local_fast drifted to %q, want default qwen3.5:latest", got)
	}
}
// TestLoadConfig_AuthTokenFromEnv locks ADR-006 Decision 6.2. In
// production the secret lives in /etc/lakehouse/auth.env (mode 0600)
// and reaches the process via systemd EnvironmentFile=; it is NEVER
// written into the committed TOML. The TOML only names the variable
// through token_env, and the loader fills Token from os.Getenv.
// Because TokenEnv defaults to "AUTH_TOKEN", the happy path needs no
// TOML auth config at all.
func TestLoadConfig_AuthTokenFromEnv(t *testing.T) {
	// writeTOML stores body as lakehouse.toml in a fresh temp dir and
	// returns the file's path.
	writeTOML := func(t *testing.T, body string) string {
		t.Helper()
		tomlPath := filepath.Join(t.TempDir(), "lakehouse.toml")
		if writeErr := os.WriteFile(tomlPath, []byte(body), 0o644); writeErr != nil {
			t.Fatal(writeErr)
		}
		return tomlPath
	}

	t.Run("default env name AUTH_TOKEN", func(t *testing.T) {
		t.Setenv("AUTH_TOKEN", "from-default-env")
		tomlPath := writeTOML(t, "[auth]\nallowed_ips = []\n")

		loaded, loadErr := LoadConfig(tomlPath)
		if loadErr != nil {
			t.Fatalf("LoadConfig: %v", loadErr)
		}
		if got := loaded.Auth.Token; got != "from-default-env" {
			t.Errorf("Token = %q, want from AUTH_TOKEN env", got)
		}
	})

	t.Run("custom env name from token_env", func(t *testing.T) {
		t.Setenv("CUSTOM_AUTH_TOKEN", "from-custom-env")
		tomlPath := writeTOML(t, "[auth]\ntoken_env = \"CUSTOM_AUTH_TOKEN\"\n")

		loaded, loadErr := LoadConfig(tomlPath)
		if loadErr != nil {
			t.Fatalf("LoadConfig: %v", loadErr)
		}
		if got := loaded.Auth.Token; got != "from-custom-env" {
			t.Errorf("Token = %q, want from CUSTOM_AUTH_TOKEN env", got)
		}
	})

	t.Run("explicit token wins over env", func(t *testing.T) {
		t.Setenv("AUTH_TOKEN", "from-env")
		tomlPath := writeTOML(t, "[auth]\ntoken = \"from-toml\"\n")

		loaded, loadErr := LoadConfig(tomlPath)
		if loadErr != nil {
			t.Fatalf("LoadConfig: %v", loadErr)
		}
		// The loader consults the environment only when Token is
		// empty, so a Token written in the TOML takes precedence.
		// This lets local dev override prod env vars.
		if got := loaded.Auth.Token; got != "from-toml" {
			t.Errorf("Token = %q, want explicit TOML value", got)
		}
	})
}
// TestLoadConfig_InvalidTOML_ReturnsError feeds LoadConfig a file
// with malformed TOML and expects an error whose text contains the
// "parse config" wrapper.
func TestLoadConfig_InvalidTOML_ReturnsError(t *testing.T) {
	badPath := filepath.Join(t.TempDir(), "bad.toml")
	malformed := []byte("this is = not [toml")
	if writeErr := os.WriteFile(badPath, malformed, 0o644); writeErr != nil {
		t.Fatalf("write bad config: %v", writeErr)
	}

	_, loadErr := LoadConfig(badPath)
	if loadErr == nil {
		t.Fatal("expected parse error on malformed TOML, got nil")
	}
	if wrapped := strings.Contains(loadErr.Error(), "parse config"); !wrapped {
		t.Errorf("error = %v, want 'parse config' wrapper", loadErr)
	}
}
// TestLoadConfig_FileButUnreadable checks that LoadConfig returns an
// error containing the "read config" wrapper when the config file
// exists but cannot be read.
//
// The case is only meaningful where a mode-0000 file really is
// unreadable, so the test skips itself in two situations: when
// running as root, and when a direct open of the file still succeeds.
func TestLoadConfig_FileButUnreadable(t *testing.T) {
	// Root can read 0000-permission files, so there is nothing to
	// exercise. This case matters in CI/local-dev where the test user
	// is not root; treat it as informational elsewhere.
	if os.Geteuid() == 0 {
		t.Skip("root can read 0000 files; skipping unreadable-file case")
	}
	dir := t.TempDir()
	cfgPath := filepath.Join(dir, "locked.toml")
	if err := os.WriteFile(cfgPath, []byte("[gateway]\nbind=\":1\""), 0o000); err != nil {
		t.Fatalf("write: %v", err)
	}
	// The euid check alone does not cover every environment where
	// mode 0000 is not enforced (os.Geteuid returns -1 on Windows,
	// for example). Probe the file directly: if it opens, the
	// precondition for this test does not hold, so skip rather than
	// report a spurious failure.
	if f, err := os.Open(cfgPath); err == nil {
		_ = f.Close() // read-only probe handle; nothing to flush
		t.Skip("0000 file is still readable here; skipping unreadable-file case")
	}
	_, err := LoadConfig(cfgPath)
	if err == nil {
		t.Fatal("expected read error on unreadable file, got nil")
	}
	if !strings.Contains(err.Error(), "read config") {
		t.Errorf("error = %v, want 'read config' wrapper", err)
	}
}