// Package shared also provides the TOML config loader. Per ADR // equivalent of Rust ADR-006 (TOML config over env vars), every // service reads `lakehouse.toml` with sane defaults and env // overrides. Config is hot-reload-unaware in G0; reload-on-SIGHUP // is a G1+ concern. package shared import ( "errors" "fmt" "io/fs" "log/slog" "os" "github.com/pelletier/go-toml/v2" ) // Config is the unified Lakehouse config. Each service reads only // the section it cares about, but they all share the same file so // operators have one place to look. type Config struct { Gateway GatewayConfig `toml:"gateway"` Storaged ServiceConfig `toml:"storaged"` Catalogd CatalogConfig `toml:"catalogd"` Ingestd IngestConfig `toml:"ingestd"` Queryd QuerydConfig `toml:"queryd"` Vectord VectordConfig `toml:"vectord"` Embedd EmbeddConfig `toml:"embedd"` Pathwayd PathwaydConfig `toml:"pathwayd"` Matrixd MatrixdConfig `toml:"matrixd"` Observerd ObserverdConfig `toml:"observerd"` Chatd ChatdConfig `toml:"chatd"` Validatord ValidatordConfig `toml:"validatord"` S3 S3Config `toml:"s3"` Models ModelsConfig `toml:"models"` Log LogConfig `toml:"log"` Auth AuthConfig `toml:"auth"` } // IngestConfig adds ingestd-specific knobs. ingestd needs to PUT // parquet to storaged AND register manifests with catalogd, so it // holds two upstream URLs in addition to its own bind. // // MaxIngestBytes caps the multipart body size. CSVs are typically // 4-6× larger than the resulting Snappy-compressed Parquet, so 256 // MiB CSV → ~50 MiB Parquet — well under storaged's 256 MiB PUT // cap. Real-scale validation (2026-04-29) showed 500K workers × // 18 cols = 344 MiB CSV → 71 MiB Parquet; bumping this knob to // 512 MiB is the documented path for that workload. type IngestConfig struct { Bind string `toml:"bind"` StoragedURL string `toml:"storaged_url"` CatalogdURL string `toml:"catalogd_url"` MaxIngestBytes int64 `toml:"max_ingest_bytes"` } // GatewayConfig adds the upstream URLs the reverse proxy fronts. 
// Each route family (/v1/storage, /v1/catalog, /v1/ingest, /v1/sql,
// /v1/vectors, /v1/embed, /v1/pathway, /v1/matrix, /v1/observer)
// has its own upstream so we can scale services independently or
// move them to different boxes without touching gateway code.
type GatewayConfig struct {
	// Bind is the gateway's own bind address.
	Bind string `toml:"bind"`

	// One upstream URL per proxied service.
	// NOTE(review): ChatdURL and ValidatordURL have no matching entry in
	// the route-family list in the type comment — confirm their prefixes.
	StoragedURL   string `toml:"storaged_url"`
	CatalogdURL   string `toml:"catalogd_url"`
	IngestdURL    string `toml:"ingestd_url"`
	QuerydURL     string `toml:"queryd_url"`
	VectordURL    string `toml:"vectord_url"`
	EmbeddURL     string `toml:"embedd_url"`
	PathwaydURL   string `toml:"pathwayd_url"`
	MatrixdURL    string `toml:"matrixd_url"`
	ObserverdURL  string `toml:"observerd_url"`
	ChatdURL      string `toml:"chatd_url"`
	ValidatordURL string `toml:"validatord_url"`
}

// EmbeddConfig configures the embed service.
//
//   - ProviderURL is the embedding backend (Ollama in G2, possibly
//     OpenAI/Voyage in G3+).
//   - DefaultModel is the model used when a caller's request body does
//     not specify one.
//   - CacheSize caps the LRU cache of (model, sha256(text)) → vector
//     lookups; 0 disables caching. The default of 10000 entries is
//     ≈ 30 MiB at d=768.
type EmbeddConfig struct {
	Bind         string `toml:"bind"`
	ProviderURL  string `toml:"provider_url"`
	DefaultModel string `toml:"default_model"`
	CacheSize    int    `toml:"cache_size"`
}

// VectordConfig carries the vectord-specific knobs. StoragedURL is
// optional: an empty string disables persistence, which is useful for
// ephemeral dev or test runs. When it is set, indexes Save after every
// state change and Load on startup.
type VectordConfig struct {
	Bind        string `toml:"bind"`
	StoragedURL string `toml:"storaged_url"`
}

// PathwaydConfig drives the pathway-memory service (cmd/pathwayd).
// PersistPath: file path to the JSONL log; empty = in-memory only
// (test/dev). Production sets a stable path under /var/lib/lakehouse
// or similar so traces survive restart.
type PathwaydConfig struct {
	Bind        string `toml:"bind"`
	PersistPath string `toml:"persist_path"` // JSONL log path; "" = in-memory only
}

// MatrixdConfig configures the matrix-indexer service (cmd/matrixd).
// Per docs/SPEC.md §3.4 it performs multi-corpus retrieve+merge over
// vectord, embedding query text via embedd. matrixd has no in-process
// fallback, so both upstream URLs are required.
type MatrixdConfig struct {
	Bind       string `toml:"bind"`
	EmbeddURL  string `toml:"embedd_url"`
	VectordURL string `toml:"vectord_url"`
}

// ChatdConfig configures the chat dispatcher service (cmd/chatd) —
// Phase 4. It routes /v1/chat to the right provider based on the
// model-name prefix or a :cloud suffix. Per-provider API keys come
// from env vars (or /etc/lakehouse/.env files); a provider whose key
// is empty stays unregistered, so requests for it 404 cleanly.
//
// OllamaURL is the local Ollama upstream (no auth). Leaving it empty
// disables the local Ollama provider, which suits deployments that do
// not run Ollama on the box (cloud-only operation).
type ChatdConfig struct {
	Bind      string `toml:"bind"`
	OllamaURL string `toml:"ollama_url"`

	// Names of the env vars holding each provider's API key. They
	// default to the conventional names, which match the
	// /etc/lakehouse/*.env files that systemd already loads; operators
	// can rename them for environments with other env var conventions.
	OllamaCloudKeyEnv string `toml:"ollama_cloud_key_env"`
	OpenRouterKeyEnv  string `toml:"openrouter_key_env"`
	OpenCodeKeyEnv    string `toml:"opencode_key_env"`
	KimiKeyEnv        string `toml:"kimi_key_env"`

	// Optional .env file paths, used as a fallback when the env var is
	// not set in the process environment. Same keys, "KEY=value" lines.
	OllamaCloudKeyFile string `toml:"ollama_cloud_key_file"`
	OpenRouterKeyFile  string `toml:"openrouter_key_file"`
	OpenCodeKeyFile    string `toml:"opencode_key_file"`
	KimiKeyFile        string `toml:"kimi_key_file"`

	// Per-call timeout in seconds. 0 = 180s default.
	TimeoutSecs int `toml:"timeout_secs"`
}

// ValidatordConfig drives the validator service (cmd/validatord).
// Hosts /validate (FillValidator + EmailValidator + PlaybookValidator)
// and /iterate (generate→validate→correct loop). Routes to chatd via
// ChatdURL for the iteration loop's LLM hops.
//
// RosterPath points at a JSONL roster (one WorkerRecord per line) that
// FillValidator and EmailValidator use for worker-existence checks.
// Empty disables the roster — worker-existence checks all fail
// Consistency, which is the correct behavior when the roster isn't
// configured. Production sets a stable path under /var/lib/lakehouse/.
type ValidatordConfig struct {
	Bind       string `toml:"bind"`
	ChatdURL   string `toml:"chatd_url"`
	RosterPath string `toml:"roster_path"`

	// Per-call cap on the iteration loop. 0 = 3 (Phase 43 default).
	DefaultMaxIterations int `toml:"default_max_iterations"`

	// Per-call cap on chat hop max_tokens. 0 = 4096.
	DefaultMaxTokens int `toml:"default_max_tokens"`

	// Per-call timeout for the chat hop in seconds. 0 = 240s.
	ChatTimeoutSecs int `toml:"chat_timeout_secs"`

	// SessionLogPath is where SessionRecord JSONL rows are appended for
	// offline analysis (DuckDB queries, scrum review tooling). Empty =
	// disabled. Production sets a stable path under
	// /var/lib/lakehouse/validator/sessions.jsonl. Append-only,
	// best-effort; see internal/validator/session_log.go.
	SessionLogPath string `toml:"session_log_path"`
}

// ObserverdConfig configures the observer service (cmd/observerd).
// PersistPath is the file path of the JSONL ops log; leaving it empty
// keeps ops in memory only (test/dev). Production sets a stable path
// under /var/lib/lakehouse/observer/ops.jsonl so ops survive restart.
// The shape mirrors PathwaydConfig.
type ObserverdConfig struct {
	Bind        string `toml:"bind"`
	PersistPath string `toml:"persist_path"`
}

// QuerydConfig adds queryd-specific knobs. queryd talks DuckDB
// directly to MinIO via DuckDB's httpfs extension (so no storaged
// URL needed), and reads the catalog over HTTP for view registration.
// SecretsPath defaults to /etc/lakehouse/secrets-go.toml — the same
// file storaged uses, since both services need the S3 credentials.
type QuerydConfig struct {
	Bind         string `toml:"bind"`
	CatalogdURL  string `toml:"catalogd_url"`
	SecretsPath  string `toml:"secrets_path"`
	RefreshEvery string `toml:"refresh_every"` // duration string, e.g. "30s"
}

// CatalogConfig extends the standard bind with catalogd-specific
// knobs. StoragedURL is the storaged service used for manifest
// persistence; in G0 it defaults to the localhost bind.
type CatalogConfig struct {
	Bind        string `toml:"bind"`
	StoragedURL string `toml:"storaged_url"`
}

// ServiceConfig is the bind-only config of a single binary. A Bind of
// "" means "use the service's hardcoded G0 default" — see
// DefaultConfig.
type ServiceConfig struct {
	Bind string `toml:"bind"`
}

// S3Config holds the settings for S3-compatible storage. A blank
// Endpoint means the AWS default; a Bucket of "" means
// "lakehouse-primary".
type S3Config struct {
	Endpoint        string `toml:"endpoint"`
	Region          string `toml:"region"`
	Bucket          string `toml:"bucket"`
	AccessKeyID     string `toml:"access_key_id"`
	SecretAccessKey string `toml:"secret_access_key"`
	UsePathStyle    bool   `toml:"use_path_style"`
}

// LogConfig carries only the slog level for now; structured fields
// land in G1+.
type LogConfig struct {
	Level string `toml:"level"`
}

// ModelsConfig names the models used by each tier of the small-model
// pipeline (per project_small_model_pipeline_vision.md and the Rust
// `config/providers.toml` convention). Callers reference tier names,
// not literal model IDs — bumping a tier means editing this file, not
// hunting through code.
//
// Tier philosophy:
//   - local_*    : on-box Ollama. Cheap, fast, JSON-clean. Inner-loop
//     hot path. Repeated calls per query.
//   - cloud_*    : Ollama Cloud (Pro plan). Larger context, called when
//     local is uncertain. Auth via OLLAMA_CLOUD_KEY.
//   - frontier_* : OpenRouter / OpenCode. Rate-limited, billed per call.
//     Reserved for blockers and full-scope reviews, not
//     steady state.
//
// WeakModels is the codified "local-hot-path eligible" list that the
// matrix downgrade gate consults — replacing the previous hardcoded
// switch in internal/matrix/downgrade.go. A model in this list bypasses
// the corpus-downgrade rule: it's already weak, no need to downgrade
// further. Strong (paid / cloud / frontier) models trigger the gate.
type ModelsConfig struct {
	// local_* tier.
	LocalFast   string `toml:"local_fast"`
	LocalEmbed  string `toml:"local_embed"`
	LocalJudge  string `toml:"local_judge"`
	LocalReview string `toml:"local_review"`

	// cloud_* tier.
	CloudJudge  string `toml:"cloud_judge"`
	CloudReview string `toml:"cloud_review"`
	CloudStrong string `toml:"cloud_strong"`

	// frontier_* tier.
	FrontierReview string `toml:"frontier_review"`
	FrontierArch   string `toml:"frontier_arch"`
	FrontierStrong string `toml:"frontier_strong"`
	FrontierFree   string `toml:"frontier_free"`

	// Models that bypass the matrix downgrade gate; see IsWeak.
	WeakModels []string `toml:"weak_models"`
}

// Resolve translates a tier name (e.g. "local_judge") into the model
// ID configured for it. An unknown tier yields "". Callers are expected
// to fall back to a hardcoded default and log a warning on an empty
// result; panicking instead would crash the binary at startup over a
// missing tier, which is too aggressive for an additive config.
func (m ModelsConfig) Resolve(tier string) string {
	var id string // stays "" when the tier name is not recognized
	switch tier {
	case "local_fast":
		id = m.LocalFast
	case "local_embed":
		id = m.LocalEmbed
	case "local_judge":
		id = m.LocalJudge
	case "local_review":
		id = m.LocalReview
	case "cloud_judge":
		id = m.CloudJudge
	case "cloud_review":
		id = m.CloudReview
	case "cloud_strong":
		id = m.CloudStrong
	case "frontier_review":
		id = m.FrontierReview
	case "frontier_arch":
		id = m.FrontierArch
	case "frontier_strong":
		id = m.FrontierStrong
	case "frontier_free":
		id = m.FrontierFree
	}
	return id
}

// IsWeak reports whether model appears, by exact string match, in the
// configured WeakModels list. matrix.downgrade uses it to decide
// whether to bypass the strong-model downgrade gate (weak models stay
// on the full lakehouse path).
func (m ModelsConfig) IsWeak(model string) bool {
	for i := range m.WeakModels {
		if m.WeakModels[i] == model {
			return true
		}
	}
	return false
}

// AuthConfig is the inter-service auth posture from ADR-003.
// Token is a Bearer token; an empty Token means "no auth" (G0 dev
// mode). AllowedIPs is a list of CIDRs (bare IPs are treated as /32);
// an empty list means "any source IP."
//
// Both layers operate independently when set:
//   - Token + AllowedIPs both empty → middleware is a no-op
//   - Token only → 401 unless Bearer matches
//   - AllowedIPs only → 403 unless r.RemoteAddr in CIDR
//   - Both → both gates apply
//
// The startup gate in shared.Run refuses to start with a non-loopback
// bind AND an empty Token — that's the audit's R-001 + R-007 worst
// case (no auth, world-reachable). LH__ALLOW_NONLOOPBACK=1 still
// bypasses the bind gate for explicit dev cases; the auth gate is
// independent of that bypass and is the real production guard.
type AuthConfig struct {
	Token      string   `toml:"token"`
	AllowedIPs []string `toml:"allowed_ips"`

	// TokenEnv names an environment variable; LoadConfig populates
	// Token from os.Getenv(TokenEnv) when Token is empty. Per ADR-006
	// 6.2: production deploys put the secret in /etc/lakehouse/auth.env
	// (mode 0600) loaded by systemd EnvironmentFile=, NOT in the
	// committed TOML. TokenEnv defaults to "AUTH_TOKEN".
	TokenEnv string `toml:"token_env"`

	// SecondaryTokens lets operators stage a rotation: the primary and
	// every secondary token pass auth during the rotation window. Once
	// every caller has updated, operators promote secondary → primary
	// and clear secondary. Per ADR-006 Decision 6.5.
	SecondaryTokens []string `toml:"secondary_tokens"`
}

// DefaultConfig returns the G0 dev defaults. Ports are shifted to
// 3110+ to coexist with the live Rust lakehouse on 3100/3201-3204
// during the migration. G5 cutover flips gateway back to 3100.
func DefaultConfig() Config { return Config{ Gateway: GatewayConfig{ Bind: "127.0.0.1:3110", StoragedURL: "http://127.0.0.1:3211", CatalogdURL: "http://127.0.0.1:3212", IngestdURL: "http://127.0.0.1:3213", QuerydURL: "http://127.0.0.1:3214", VectordURL: "http://127.0.0.1:3215", EmbeddURL: "http://127.0.0.1:3216", PathwaydURL: "http://127.0.0.1:3217", MatrixdURL: "http://127.0.0.1:3218", ObserverdURL: "http://127.0.0.1:3219", ChatdURL: "http://127.0.0.1:3220", ValidatordURL: "http://127.0.0.1:3221", }, Storaged: ServiceConfig{Bind: "127.0.0.1:3211"}, Catalogd: CatalogConfig{Bind: "127.0.0.1:3212", StoragedURL: "http://127.0.0.1:3211"}, Ingestd: IngestConfig{ Bind: "127.0.0.1:3213", StoragedURL: "http://127.0.0.1:3211", CatalogdURL: "http://127.0.0.1:3212", MaxIngestBytes: 256 << 20, // 256 MiB; bump per deployment via lakehouse.toml }, Vectord: VectordConfig{ Bind: "127.0.0.1:3215", StoragedURL: "http://127.0.0.1:3211", }, Embedd: EmbeddConfig{ Bind: "127.0.0.1:3216", ProviderURL: "http://localhost:11434", // local Ollama DefaultModel: "nomic-embed-text", CacheSize: 10_000, // ~30 MiB at d=768; set to 0 to disable }, Pathwayd: PathwaydConfig{ Bind: "127.0.0.1:3217", // PersistPath empty by default = in-memory only. Production // sets to e.g. /var/lib/lakehouse/pathway/state.jsonl. }, Matrixd: MatrixdConfig{ Bind: "127.0.0.1:3218", EmbeddURL: "http://127.0.0.1:3216", VectordURL: "http://127.0.0.1:3215", }, Observerd: ObserverdConfig{ Bind: "127.0.0.1:3219", // PersistPath empty by default = in-memory only. }, Validatord: ValidatordConfig{ Bind: "127.0.0.1:3221", ChatdURL: "http://127.0.0.1:3220", RosterPath: "", // empty = no roster; worker-existence checks fail Consistency DefaultMaxIterations: 3, DefaultMaxTokens: 4096, ChatTimeoutSecs: 240, SessionLogPath: "", // empty = no session JSONL. Operators set under /var/lib/lakehouse/validator/sessions.jsonl. 
}, Chatd: ChatdConfig{ Bind: "127.0.0.1:3220", OllamaURL: "http://localhost:11434", OllamaCloudKeyEnv: "OLLAMA_CLOUD_KEY", OpenRouterKeyEnv: "OPENROUTER_API_KEY", OpenCodeKeyEnv: "OPENCODE_API_KEY", KimiKeyEnv: "KIMI_API_KEY", OllamaCloudKeyFile: "/etc/lakehouse/ollama_cloud.env", OpenRouterKeyFile: "/etc/lakehouse/openrouter.env", OpenCodeKeyFile: "/etc/lakehouse/opencode.env", KimiKeyFile: "/etc/lakehouse/kimi.env", TimeoutSecs: 180, }, Queryd: QuerydConfig{ Bind: "127.0.0.1:3214", CatalogdURL: "http://127.0.0.1:3212", SecretsPath: "/etc/lakehouse/secrets-go.toml", RefreshEvery: "30s", }, S3: S3Config{ Endpoint: "http://localhost:9000", Region: "us-east-1", Bucket: "lakehouse-primary", UsePathStyle: true, }, Models: ModelsConfig{ // Tier 1 — local hot path. JSON-clean, fast, deterministic. // qwen3.5:latest replaces qwen2.5 as the local default per // 2026-04-29 architectural review (stronger local model, // same JSON-clean property). LocalFast: "qwen3.5:latest", LocalEmbed: "nomic-embed-text", LocalJudge: "qwen3.5:latest", LocalReview: "qwen3.5:latest", // Tier 2 — Ollama Cloud (Pro plan). 2026-04-28 upgrade. // kimi-k2:1t is upstream-broken; deepseek/kimi-k2.6/qwen3-coder // are the working primaries. CloudJudge: "kimi-k2.6:cloud", CloudReview: "qwen3-coder:480b", CloudStrong: "deepseek-v3.2", // Tier 3 — frontier. OpenRouter credits + OpenCode key. // Use sparingly: rate-limited, billed per call. FrontierReview: "openrouter/anthropic/claude-opus-4-7", FrontierArch: "openrouter/moonshotai/kimi-k2-0905", FrontierStrong: "openrouter/openai/gpt-5", FrontierFree: "opencode/claude-opus-4-7", // Local-hot-path eligible. matrix.downgrade reads this // list to decide whether to bypass the strong-model gate. WeakModels: []string{"qwen3.5:latest", "qwen3:latest"}, }, Log: LogConfig{Level: "info"}, } } // LoadConfig reads `lakehouse.toml` from path; if path is empty or // the file doesn't exist, returns DefaultConfig. 
Any decode error is // fatal (we don't want a misconfigured service silently falling back // to defaults — that's the kind of bug you find at 2am). // // Per Opus + Qwen WARN #3: when path WAS given but the file is // missing, log a warning so silent default-fallback doesn't hide // misconfiguration. Empty path is fine (caller didn't ask for a // file); non-empty + missing is suspicious. func LoadConfig(path string) (Config, error) { cfg := DefaultConfig() if path == "" { return cfg, nil } b, err := os.ReadFile(path) if errors.Is(err, fs.ErrNotExist) { slog.Warn("config file not found, using defaults", "path", path, "hint", "create the file or pass -config /path/to/lakehouse.toml") return cfg, nil } if err != nil { return cfg, fmt.Errorf("read config: %w", err) } if err := toml.Unmarshal(b, &cfg); err != nil { return cfg, fmt.Errorf("parse config: %w", err) } resolveAuthFromEnv(&cfg.Auth) return cfg, nil } // resolveAuthFromEnv populates cfg.Auth.Token from os.Getenv(TokenEnv) // when Token is empty. Per ADR-006 Decision 6.2: production deploys // keep the secret in /etc/lakehouse/auth.env (mode 0600), loaded by // systemd EnvironmentFile=, never in the committed TOML. // // TokenEnv defaults to "AUTH_TOKEN" so operators don't have to // configure both — setting AUTH_TOKEN env is enough. func resolveAuthFromEnv(auth *AuthConfig) { envName := auth.TokenEnv if envName == "" { envName = "AUTH_TOKEN" } if auth.Token == "" { if v := os.Getenv(envName); v != "" { auth.Token = v } } }