Lands the matrix indexer's first piece per docs/SPEC.md §3.4:
multi-corpus retrieve+merge with corpus attribution per result.
Future components (relevance filter, downgrade gate, learning-loop
integration) layer on top of this surface.
Architecture:
- internal/matrix/retrieve.go — Retriever takes (query, corpora,
k, per_corpus_k), parallel-fans across vectord indexes, merges
by distance ascending, preserves corpus origin per hit
- cmd/matrixd — HTTP service on :3218, fronts /v1/matrix/*
- gateway proxy + [matrixd] config + lakehouse.toml entry
- Accepts either query_text (matrixd calls embedd) or query_vector
(caller pre-embedded) — query_vector takes precedence if both are set
Error policy: fail-loud on any corpus error. Silent partial returns
would lie about coverage, defeating the matrix's whole purpose.
vectord errors bubble up as 502 (upstream); validation errors return 400.
Smoke (scripts/matrix_smoke.sh, 6 assertions PASS first try):
- /matrix/corpora lists indexes
- Multi-corpus search returns hits from BOTH corpora
- Top hit is the globally-closest across all corpora
(b-near beats a-near at distance 0.05 vs 0.1 — proves merge)
- Metadata round-trips through the merge
- Distances ascending in result list
- Negative paths: empty corpora → 400, missing corpus → 502,
no query → 400
12-smoke regression sweep all green (D1-D6, G1, G1P, G2,
storaged_cap, pathway, matrix).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
76 lines
2.5 KiB
TOML
76 lines
2.5 KiB
TOML
# Lakehouse-Go config — G0 dev defaults. Overrides via env are a
|
||
# G1+ concern; for G0 edit this file and restart the affected service.
|
||
|
||
# G0 dev ports — shifted to 3110+ so the Go services run alongside
|
||
# the live Rust lakehouse on 3100/3201-3204 without colliding. G5
|
||
# (demo cutover) flips gateway back to 3100 when Rust retires.
|
||
# Gateway listen address plus one base URL per backend service. Every
# *_url below matches the `bind` address of the same-named table in
# this file, so changing a service's bind means updating it here too.
[gateway]
|
||
bind = "127.0.0.1:3110"
|
||
storaged_url = "http://127.0.0.1:3211"
|
||
catalogd_url = "http://127.0.0.1:3212"
|
||
ingestd_url = "http://127.0.0.1:3213"
|
||
queryd_url = "http://127.0.0.1:3214"
|
||
vectord_url = "http://127.0.0.1:3215"
|
||
embedd_url = "http://127.0.0.1:3216"
|
||
pathwayd_url = "http://127.0.0.1:3217"
|
||
matrixd_url = "http://127.0.0.1:3218"
|
||
|
||
# storaged listen address. The storaged_url keys in [gateway],
# [catalogd], [ingestd] and [vectord] all point at this address.
[storaged]
|
||
bind = "127.0.0.1:3211"
|
||
|
||
# catalogd listen address and the storaged URL it is configured with.
# The catalogd_url keys in [gateway], [ingestd] and [queryd] point at
# this bind address.
[catalogd]
|
||
bind = "127.0.0.1:3212"
|
||
# Same address as [storaged] bind.
storaged_url = "http://127.0.0.1:3211"
|
||
|
||
# ingestd listen address, the storaged/catalogd URLs it is configured
# with, and the upload size cap.
[ingestd]
|
||
bind = "127.0.0.1:3213"
|
||
storaged_url = "http://127.0.0.1:3211"
|
||
catalogd_url = "http://127.0.0.1:3212"
|
||
# CSV uploads are ~4-6× the resulting Parquet. 256 MiB cap keeps the in-memory
|
||
# parse + Arrow + Parquet output footprint bounded. Bump for known large
|
||
# datasets (e.g. workers_500k → 344 MiB CSV needs 512 MiB).
|
||
# Value is in bytes: 268435456 = 256 × 1024 × 1024 (256 MiB);
# 512 MiB would be 536870912.
max_ingest_bytes = 268435456
|
||
|
||
# vectord listen address. The vectord_url keys in [gateway] and
# [matrixd] point at this address.
[vectord]
|
||
bind = "127.0.0.1:3215"
|
||
# Optional — set to empty string to disable persistence (dev/test).
|
||
# When set, this is the same address as [storaged] bind.
storaged_url = "http://127.0.0.1:3211"
|
||
|
||
# embedd listen address and embedding-provider settings. The embedd_url
# keys in [gateway] and [matrixd] point at this bind address.
[embedd]
|
||
bind = "127.0.0.1:3216"
|
||
# G2: Ollama local. G3+ may swap in OpenAI/Voyage by changing
|
||
# this URL + the wire format inside the provider.
|
||
provider_url = "http://localhost:11434"
|
||
# NOTE(review): presumably the model used when a request does not name
# one — confirm against embedd's request handling.
default_model = "nomic-embed-text"
|
||
|
||
# queryd listen address, its catalogd URL, and secrets settings. The
# queryd_url key in [gateway] points at this bind address.
[queryd]
|
||
bind = "127.0.0.1:3214"
|
||
# Same address as [catalogd] bind.
catalogd_url = "http://127.0.0.1:3212"
|
||
# Same secrets file that the [s3] credential comments refer to.
secrets_path = "/etc/lakehouse/secrets-go.toml"
|
||
# Duration string. NOTE(review): what is refreshed on this interval is
# not stated in this file — confirm against queryd before changing.
refresh_every = "30s"
|
||
|
||
# pathwayd listen address and trace persistence. The pathwayd_url key
# in [gateway] points at this bind address.
[pathwayd]
|
||
bind = "127.0.0.1:3217"
|
||
# Empty = in-memory only (dev/test). Production sets a path under
|
||
# /var/lib/lakehouse/pathway/state.jsonl so traces survive restart.
|
||
persist_path = ""
|
||
|
||
# matrixd listen address and its two upstreams. The matrixd_url key in
# [gateway] points at this bind address.
[matrixd]
|
||
bind = "127.0.0.1:3218"
|
||
# matrixd calls embedd (query-text → vector) and vectord (per-corpus
|
||
# search) directly. Localhost defaults; in distributed deployments
|
||
# these point at the gateway's upstream addresses.
|
||
# Same address as [embedd] bind.
embedd_url = "http://127.0.0.1:3216"
|
||
# Same address as [vectord] bind.
vectord_url = "http://127.0.0.1:3215"
|
||
|
||
# Object-store connection settings. Both credential keys are left empty
# in this file on purpose; see the inline notes for where they come from.
[s3]
|
||
endpoint = "http://localhost:9000"
|
||
region = "us-east-1"
|
||
bucket = "lakehouse-go-primary" # G0 dedicated bucket so Rust + Go coexist
|
||
access_key_id = "" # populated by SecretsProvider from /etc/lakehouse/secrets-go.toml
|
||
secret_access_key = "" # also populated by SecretsProvider from /etc/lakehouse/secrets-go.toml
|
||
# NOTE(review): presumably selects path-style addressing (bucket in the
# URL path rather than the hostname) for the local endpoint — confirm
# against the S3 client setup.
use_path_style = true
|
||
|
||
# Log verbosity. NOTE(review): the accepted level names and which
# services read this table are not stated in this file — confirm
# against the logging setup.
[log]
|
||
level = "info"
|