// Package shared also provides the TOML config loader. Per ADR // equivalent of Rust ADR-006 (TOML config over env vars), every // service reads `lakehouse.toml` with sane defaults and env // overrides. Config is hot-reload-unaware in G0; reload-on-SIGHUP // is a G1+ concern. package shared import ( "errors" "fmt" "io/fs" "log/slog" "os" "github.com/pelletier/go-toml/v2" ) // Config is the unified Lakehouse config. Each service reads only // the section it cares about, but they all share the same file so // operators have one place to look. type Config struct { Gateway GatewayConfig `toml:"gateway"` Storaged ServiceConfig `toml:"storaged"` Catalogd CatalogConfig `toml:"catalogd"` Ingestd IngestConfig `toml:"ingestd"` Queryd QuerydConfig `toml:"queryd"` Vectord VectordConfig `toml:"vectord"` Embedd EmbeddConfig `toml:"embedd"` S3 S3Config `toml:"s3"` Log LogConfig `toml:"log"` } // IngestConfig adds ingestd-specific knobs. ingestd needs to PUT // parquet to storaged AND register manifests with catalogd, so it // holds two upstream URLs in addition to its own bind. // // MaxIngestBytes caps the multipart body size. CSVs are typically // 4-6× larger than the resulting Snappy-compressed Parquet, so 256 // MiB CSV → ~50 MiB Parquet — well under storaged's 256 MiB PUT // cap. Real-scale validation (2026-04-29) showed 500K workers × // 18 cols = 344 MiB CSV → 71 MiB Parquet; bumping this knob to // 512 MiB is the documented path for that workload. type IngestConfig struct { Bind string `toml:"bind"` StoragedURL string `toml:"storaged_url"` CatalogdURL string `toml:"catalogd_url"` MaxIngestBytes int64 `toml:"max_ingest_bytes"` } // GatewayConfig adds the upstream URLs the reverse proxy fronts. // Each route family (/v1/storage, /v1/catalog, /v1/ingest, /v1/sql, // /v1/vectors, /v1/embed) has its own upstream so we can scale // services independently or move them to different boxes without // touching gateway code. type GatewayConfig struct { Bind string `toml:"bind"` StoragedURL string `toml:"storaged_url"` CatalogdURL string `toml:"catalogd_url"` IngestdURL string `toml:"ingestd_url"` QuerydURL string `toml:"queryd_url"` VectordURL string `toml:"vectord_url"` EmbeddURL string `toml:"embedd_url"` } // EmbeddConfig drives the embed service. ProviderURL points at the // embedding backend (Ollama in G2, possibly OpenAI/Voyage in G3+). // DefaultModel is what gets used when callers don't specify a // model in their request body. type EmbeddConfig struct { Bind string `toml:"bind"` ProviderURL string `toml:"provider_url"` DefaultModel string `toml:"default_model"` } // VectordConfig adds vectord-specific knobs. StoragedURL is // optional — empty string disables persistence, useful for ephemeral // dev or test runs. When set, indexes Save after every state change // and Load on startup. type VectordConfig struct { Bind string `toml:"bind"` StoragedURL string `toml:"storaged_url"` } // QuerydConfig adds queryd-specific knobs. queryd talks DuckDB // directly to MinIO via DuckDB's httpfs extension (so no storaged // URL needed), and reads the catalog over HTTP for view registration. // SecretsPath defaults to /etc/lakehouse/secrets-go.toml — the same // file storaged uses, since both services need the S3 credentials. type QuerydConfig struct { Bind string `toml:"bind"` CatalogdURL string `toml:"catalogd_url"` SecretsPath string `toml:"secrets_path"` RefreshEvery string `toml:"refresh_every"` // duration string, e.g. "30s" } // CatalogConfig adds catalogd-specific knobs on top of the standard // bind. StoragedURL points at the storaged service for manifest // persistence; G0 defaults to the localhost bind. type CatalogConfig struct { Bind string `toml:"bind"` StoragedURL string `toml:"storaged_url"` } // ServiceConfig is the per-binary bind config. Default Bind "" // means "use the service's hardcoded G0 default" — see DefaultConfig. type ServiceConfig struct { Bind string `toml:"bind"` } // S3Config holds S3-compatible storage settings. Endpoint blank → // AWS default. Bucket "" → "lakehouse-primary". type S3Config struct { Endpoint string `toml:"endpoint"` Region string `toml:"region"` Bucket string `toml:"bucket"` AccessKeyID string `toml:"access_key_id"` SecretAccessKey string `toml:"secret_access_key"` UsePathStyle bool `toml:"use_path_style"` } // LogConfig — slog level for now; structured fields land G1+. type LogConfig struct { Level string `toml:"level"` } // DefaultConfig returns the G0 dev defaults. Ports are shifted to // 3110+ to coexist with the live Rust lakehouse on 3100/3201-3204 // during the migration. G5 cutover flips gateway back to 3100. func DefaultConfig() Config { return Config{ Gateway: GatewayConfig{ Bind: "127.0.0.1:3110", StoragedURL: "http://127.0.0.1:3211", CatalogdURL: "http://127.0.0.1:3212", IngestdURL: "http://127.0.0.1:3213", QuerydURL: "http://127.0.0.1:3214", VectordURL: "http://127.0.0.1:3215", EmbeddURL: "http://127.0.0.1:3216", }, Storaged: ServiceConfig{Bind: "127.0.0.1:3211"}, Catalogd: CatalogConfig{Bind: "127.0.0.1:3212", StoragedURL: "http://127.0.0.1:3211"}, Ingestd: IngestConfig{ Bind: "127.0.0.1:3213", StoragedURL: "http://127.0.0.1:3211", CatalogdURL: "http://127.0.0.1:3212", MaxIngestBytes: 256 << 20, // 256 MiB; bump per deployment via lakehouse.toml }, Vectord: VectordConfig{ Bind: "127.0.0.1:3215", StoragedURL: "http://127.0.0.1:3211", }, Embedd: EmbeddConfig{ Bind: "127.0.0.1:3216", ProviderURL: "http://localhost:11434", // local Ollama DefaultModel: "nomic-embed-text", }, Queryd: QuerydConfig{ Bind: "127.0.0.1:3214", CatalogdURL: "http://127.0.0.1:3212", SecretsPath: "/etc/lakehouse/secrets-go.toml", RefreshEvery: "30s", }, S3: S3Config{ Endpoint: "http://localhost:9000", Region: "us-east-1", Bucket: "lakehouse-primary", UsePathStyle: true, }, Log: LogConfig{Level: "info"}, } } // LoadConfig reads `lakehouse.toml` from path; if path is empty or // the file doesn't exist, returns DefaultConfig. Any decode error is // fatal (we don't want a misconfigured service silently falling back // to defaults — that's the kind of bug you find at 2am). // // Per Opus + Qwen WARN #3: when path WAS given but the file is // missing, log a warning so silent default-fallback doesn't hide // misconfiguration. Empty path is fine (caller didn't ask for a // file); non-empty + missing is suspicious. func LoadConfig(path string) (Config, error) { cfg := DefaultConfig() if path == "" { return cfg, nil } b, err := os.ReadFile(path) if errors.Is(err, fs.ErrNotExist) { slog.Warn("config file not found, using defaults", "path", path, "hint", "create the file or pass -config /path/to/lakehouse.toml") return cfg, nil } if err != nil { return cfg, fmt.Errorf("read config: %w", err) } if err := toml.Unmarshal(b, &cfg); err != nil { return cfg, fmt.Errorf("parse config: %w", err) } return cfg, nil }