Phase 0: bootstrap Rust workspace

- Cargo workspace with 6 crates: shared, storaged, catalogd, queryd, aibridge, gateway
- shared: types (DatasetId, ObjectRef, SchemaFingerprint, DatasetManifest) + error enum
- gateway: Axum HTTP entrypoint with nested service routers + tracing
- All services expose /health stubs
- justfile with build/test/run recipes
- PRD, phase tracker, and ADR docs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-03-27 04:59:05 -05:00
commit a52ca841c6
25 changed files with 1823 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
/target
*.swp
*.swo
.env

1286
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

22
Cargo.toml Normal file
View File

@ -0,0 +1,22 @@
# Lakehouse workspace root — six service crates under crates/ (see docs/PRD.md).
[workspace]
resolver = "2"
members = [
    "crates/shared",
    "crates/storaged",
    "crates/catalogd",
    "crates/queryd",
    "crates/aibridge",
    "crates/gateway",
]

# Single source of truth for dependency versions.
# Member crates inherit these with `{ workspace = true }`.
[workspace.dependencies]
tokio = { version = "1", features = ["full"] }
axum = "0.8"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
thiserror = "2"
uuid = { version = "1", features = ["v4", "serde"] }
chrono = { version = "0.4", features = ["serde"] }
tower-http = { version = "0.6", features = ["cors", "trace"] }

View File

@ -0,0 +1,12 @@
# aibridge — Rust boundary to the Python AI sidecar (see docs/PRD.md).
[package]
name = "aibridge"
version = "0.1.0"
edition = "2024"

[dependencies]
shared = { path = "../shared" }
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }

View File

@ -0,0 +1 @@
pub mod service;

View File

@ -0,0 +1,9 @@
use axum::{Router, routing::get};

/// Build the aibridge HTTP router.
///
/// The gateway mounts this under its own path prefix, so routes here
/// are declared relative to the service root.
pub fn router() -> Router {
    let health_route = get(health_check);
    Router::new().route("/health", health_route)
}

/// Plain-text liveness probe for the aibridge service.
async fn health_check() -> &'static str {
    "aibridge ok"
}

View File

@ -0,0 +1,12 @@
# catalogd — metadata control plane: dataset registry and manifests (see docs/PRD.md).
[package]
name = "catalogd"
version = "0.1.0"
edition = "2024"

[dependencies]
shared = { path = "../shared" }
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }

View File

@ -0,0 +1 @@
pub mod service;

View File

@ -0,0 +1,9 @@
use axum::{Router, routing::get};

/// Build the catalogd HTTP router.
///
/// Mounted by the gateway under its own prefix; routes are relative
/// to the service root.
pub fn router() -> Router {
    let health_route = get(health_check);
    Router::new().route("/health", health_route)
}

/// Plain-text liveness probe for the catalogd service.
async fn health_check() -> &'static str {
    "catalogd ok"
}

18
crates/gateway/Cargo.toml Normal file
View File

@ -0,0 +1,18 @@
# gateway — HTTP ingress; the only crate that depends on every service (see docs/PRD.md).
[package]
name = "gateway"
version = "0.1.0"
edition = "2024"

[dependencies]
shared = { path = "../shared" }
# Service crates are nested as routers in main.rs.
storaged = { path = "../storaged" }
catalogd = { path = "../catalogd" }
queryd = { path = "../queryd" }
aibridge = { path = "../aibridge" }
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
tower-http = { workspace = true }

View File

@ -0,0 +1,29 @@
use axum::{Router, routing::get};
use tower_http::trace::TraceLayer;
use tracing_subscriber::{EnvFilter, fmt, layer::SubscriberExt, util::SubscriberInitExt};

/// Gateway entrypoint: sets up structured tracing, nests each service
/// router under its own path prefix, and serves HTTP.
#[tokio::main]
async fn main() {
    // Log filter from RUST_LOG when set; defaults to `info`.
    tracing_subscriber::registry()
        .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")))
        .with(fmt::layer())
        .init();

    let app = Router::new()
        .route("/health", get(health))
        .nest("/storage", storaged::service::router())
        .nest("/catalog", catalogd::service::router())
        .nest("/query", queryd::service::router())
        .nest("/ai", aibridge::service::router())
        .layer(TraceLayer::new_for_http());

    // Bind address is overridable via GATEWAY_ADDR; the default matches
    // the previously hardcoded value, so existing setups are unaffected.
    let addr = std::env::var("GATEWAY_ADDR").unwrap_or_else(|_| "0.0.0.0:3100".to_string());
    tracing::info!("gateway listening on {addr}");
    let listener = tokio::net::TcpListener::bind(addr.as_str())
        .await
        // Bind failure (port in use, bad addr) is fatal; name the address
        // and cause instead of a bare unwrap panic.
        .unwrap_or_else(|e| panic!("gateway failed to bind {addr}: {e}"));
    axum::serve(listener, app)
        .await
        .expect("gateway server terminated with an error");
}

/// Liveness probe for the gateway itself (each nested service exposes its own).
async fn health() -> &'static str {
    "lakehouse ok"
}

12
crates/queryd/Cargo.toml Normal file
View File

@ -0,0 +1,12 @@
# queryd — SQL execution over registered Parquet datasets (see docs/PRD.md).
[package]
name = "queryd"
version = "0.1.0"
edition = "2024"

[dependencies]
shared = { path = "../shared" }
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }

1
crates/queryd/src/lib.rs Normal file
View File

@ -0,0 +1 @@
pub mod service;

View File

@ -0,0 +1,9 @@
use axum::{Router, routing::get};

/// Build the queryd HTTP router.
///
/// Mounted by the gateway under its own prefix; routes are relative
/// to the service root.
pub fn router() -> Router {
    let health_route = get(health_check);
    Router::new().route("/health", health_route)
}

/// Plain-text liveness probe for the queryd service.
async fn health_check() -> &'static str {
    "queryd ok"
}

11
crates/shared/Cargo.toml Normal file
View File

@ -0,0 +1,11 @@
# shared — common types and errors; every service crate depends on this (see docs/PRD.md).
[package]
name = "shared"
version = "0.1.0"
edition = "2024"

[dependencies]
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
uuid = { workspace = true }
chrono = { workspace = true }

View File

@ -0,0 +1,35 @@
use thiserror::Error;

/// Workspace-wide error type shared by every service crate.
///
/// Display strings come from the `#[error(...)]` attributes via thiserror.
/// HTTP mapping lives in `status_code` on the inherent impl below.
#[derive(Debug, Error)]
pub enum LakehouseError {
    /// Catalog lookup failed for the named dataset.
    #[error("dataset not found: {0}")]
    DatasetNotFound(String),
    /// Object storage lookup failed for the given bucket/key pair.
    #[error("object not found: {bucket}/{key}")]
    ObjectNotFound { bucket: String, key: String },
    /// Failure inside the storaged service (object I/O).
    #[error("storage error: {0}")]
    Storage(String),
    /// Failure inside the catalogd service (metadata).
    #[error("catalog error: {0}")]
    Catalog(String),
    /// Failure inside the queryd service (SQL execution).
    #[error("query error: {0}")]
    Query(String),
    /// Failure inside the aibridge service (AI sidecar boundary).
    #[error("ai bridge error: {0}")]
    AiBridge(String),
    /// Serialization/deserialization failure (treated as a client error).
    #[error("serialization error: {0}")]
    Serialization(String),
}
impl LakehouseError {
    /// HTTP status code this error maps to at a service boundary.
    ///
    /// Not-found variants map to 404, malformed payloads to 400, and
    /// every internal service failure to 500.
    pub fn status_code(&self) -> u16 {
        match self {
            Self::DatasetNotFound(_) => 404,
            Self::ObjectNotFound { .. } => 404,
            Self::Serialization(_) => 400,
            Self::Storage(_) | Self::Catalog(_) | Self::Query(_) | Self::AiBridge(_) => 500,
        }
    }
}

2
crates/shared/src/lib.rs Normal file
View File

@ -0,0 +1,2 @@
pub mod types;
pub mod errors;

View File

@ -0,0 +1,42 @@
use serde::{Deserialize, Serialize};
use uuid::Uuid;

/// Unique identifier for a dataset in the catalog.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct DatasetId(pub Uuid);

impl DatasetId {
    /// Generate a fresh random (v4) identifier.
    pub fn new() -> Self {
        Self(Uuid::new_v4())
    }
}

// A `new()` with no arguments and no `Default` trips clippy's
// `new_without_default` lint, and `just clippy` runs with `-D warnings`.
impl Default for DatasetId {
    fn default() -> Self {
        Self::new()
    }
}

impl std::fmt::Display for DatasetId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.0)
    }
}

/// Pointer to an object in storage. This is what the catalog stores — never raw data.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ObjectRef {
    pub bucket: String,
    pub key: String,
    pub size_bytes: u64,
    pub created_at: chrono::DateTime<chrono::Utc>,
}

/// Schema fingerprint — deterministic hash of an Arrow schema.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SchemaFingerprint(pub String);

/// Dataset manifest — the catalog's view of a dataset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetManifest {
    pub id: DatasetId,
    pub name: String,
    pub schema_fingerprint: SchemaFingerprint,
    pub objects: Vec<ObjectRef>,
    pub created_at: chrono::DateTime<chrono::Utc>,
    pub updated_at: chrono::DateTime<chrono::Utc>,
}

View File

@ -0,0 +1,12 @@
# storaged — object I/O over the `object_store` interface (see docs/PRD.md).
[package]
name = "storaged"
version = "0.1.0"
edition = "2024"

[dependencies]
shared = { path = "../shared" }
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }

View File

@ -0,0 +1 @@
pub mod service;

View File

@ -0,0 +1,9 @@
use axum::{Router, routing::get};

/// Build the storaged HTTP router.
///
/// Mounted by the gateway under its own prefix; routes are relative
/// to the service root.
pub fn router() -> Router {
    let health_route = get(health_check);
    Router::new().route("/health", health_route)
}

/// Plain-text liveness probe for the storaged service.
async fn health_check() -> &'static str {
    "storaged ok"
}

26
docs/DECISIONS.md Normal file
View File

@ -0,0 +1,26 @@
# Architecture Decision Records
## ADR-001: Object storage as source of truth
**Date:** 2026-03-27
**Decision:** All data lives in S3-compatible object storage. No traditional database.
**Rationale:** Eliminates DB operational overhead, enables infinite scale at storage tier, forces clean separation of data and metadata.
## ADR-002: Catalog metadata persistence
**Date:** 2026-03-27
**Decision:** catalogd persists manifests as Parquet files in object storage. In-memory index rebuilt on startup.
**Rationale:** No external DB dependency. Storage is already the source of truth. Write-ahead pattern ensures consistency.
## ADR-003: Real models only (no mocks)
**Date:** 2026-03-27
**Decision:** AI sidecar hits Ollama with real models from Phase 3 onward. No stub/mock endpoints.
**Rationale:** Local Ollama instance available with nomic-embed-text, qwen2.5, mistral, gemma2, llama3.2. Mocks hide integration bugs.
## ADR-004: Python sidecar as Ollama adapter
**Date:** 2026-03-27
**Decision:** Python FastAPI sidecar is a thin HTTP adapter over Ollama's API. No model loading in Python.
**Rationale:** Ollama handles model lifecycle, GPU scheduling, caching. Sidecar stays stateless and lightweight — no torch/transformers deps.
## ADR-005: HTTP-first, gRPC later
**Date:** 2026-03-27
**Decision:** All inter-service communication uses HTTP through Phase 4. gRPC migration in Phase 5.
**Rationale:** HTTP is simpler to debug, test, and iterate on. gRPC adds protobuf compilation and streaming complexity before APIs stabilize.

57
docs/PHASES.md Normal file
View File

@ -0,0 +1,57 @@
# Phase Tracker
## Phase 0: Bootstrap
- [ ] 0.1 — Cargo workspace with all crate stubs compiling
- [ ] 0.2 — `shared` crate: error types, ObjectRef, DatasetId
- [ ] 0.3 — `gateway` with Axum: GET /health → 200
- [ ] 0.4 — tracing + tracing-subscriber wired in gateway
- [ ] 0.5 — justfile with build, test, run recipes
- [ ] 0.6 — docs committed to git
**Gate:** All crates compile. Gateway runs. Logs emit. Docs committed.
## Phase 1: Storage + Catalog
- [ ] 1.1 — storaged: object_store backend init (LocalFileSystem → S3)
- [ ] 1.2 — storaged: Axum endpoints (PUT/GET/DELETE /objects/{key})
- [ ] 1.3 — shared/arrow.rs: RecordBatch ↔ Parquet helpers
- [ ] 1.4 — catalogd/registry.rs: in-memory index + manifest persistence
- [ ] 1.5 — catalogd/schema.rs: schema fingerprinting
- [ ] 1.6 — catalogd service: POST/GET /datasets endpoints
- [ ] 1.7 — gateway routes to storaged + catalogd
**Gate:** Upload Parquet → register → metadata → read back. All via gateway.
## Phase 2: Query Engine
- [ ] 2.1 — queryd: SessionContext + object_store config
- [ ] 2.2 — queryd: ListingTable from catalog ObjectRefs
- [ ] 2.3 — queryd service: POST /query → Arrow IPC or JSON
- [ ] 2.4 — queryd → catalogd wiring
- [ ] 2.5 — gateway routes /query
**Gate:** SQL over Parquet returns correct results via catalog resolution.
## Phase 3: AI Integration
- [ ] 3.1 — Python sidecar: FastAPI + Ollama (embed/generate/rerank)
- [ ] 3.2 — Dockerfile for sidecar
- [ ] 3.3 — aibridge/client.rs: HTTP client to sidecar
- [ ] 3.4 — aibridge service: Axum proxy endpoints
- [ ] 3.5 — Model config via env vars
**Gate:** Rust → Python → Ollama → real embeddings return.
## Phase 4: Frontend
- [ ] 4.1 — Dioxus scaffold, WASM build
- [ ] 4.2 — Dataset browser
- [ ] 4.3 — Query editor + results table
- [ ] 4.4 — Error display + loading states
**Gate:** Browse datasets and query from browser.
## Phase 5: Hardening
- [ ] 5.1 — Proto definitions
- [ ] 5.2 — Internal gRPC migration
- [ ] 5.3 — OpenTelemetry tracing
- [ ] 5.4 — Auth middleware
- [ ] 5.5 — Config-driven startup
**Gate:** gRPC internals, traces, auth, restartable from repo + config.

175
docs/PRD.md Normal file
View File

@ -0,0 +1,175 @@
# PRD: Lakehouse — Rust-First Object Storage System
**Status:** Active
**Created:** 2026-03-27
**Owner:** J
---
## Problem
Traditional data platforms couple storage, compute, and metadata into monolithic databases. This creates vendor lock-in, scaling bottlenecks, and opaque data access. AI workloads bolt onto these systems awkwardly, sharing resources with transactional queries.
We need a system where:
- Object storage is the source of truth (not a database)
- Metadata, access, and execution are controlled by Rust services
- Queries run directly over object storage via Arrow/Parquet
- AI inference is isolated and swappable
- The entire system is rebuildable from repository + docs alone
---
## Solution
A modular Rust service mesh over S3-compatible object storage.
### Locked Stack
| Layer | Technology | Locked |
|---|---|---|
| Frontend | Dioxus | Yes |
| API | Axum + Tokio | Yes |
| Object Storage Interface | Apache Arrow `object_store` | Yes |
| Storage Backend | RustFS (fallback: SeaweedFS) | Yes |
| Query Engine | DataFusion | Yes |
| Data Format | Parquet + Arrow | Yes |
| RPC (internal) | tonic (gRPC) | Yes |
| AI Runtime | Ollama (local models) | Yes |
| AI Boundary | Python FastAPI sidecar → Ollama HTTP API | Yes |
No new frameworks. No exceptions.
---
## Architecture
### Services
| Service | Responsibility |
|---|---|
| **gateway** | HTTP ingress, routing, auth envelope, middleware |
| **catalogd** | Metadata control plane — dataset registry, schema versions, manifest index |
| **storaged** | Object I/O — read/write/list/delete via `object_store` crate |
| **queryd** | SQL execution — DataFusion over registered Parquet datasets |
| **aibridge** | Rust↔Python boundary — HTTP client to FastAPI sidecar |
| **ui** | Dioxus frontend — dataset browser, query editor, results viewer |
| **shared** | Types, errors, Arrow helpers, protobuf definitions |
### AI Sidecar
Python FastAPI process that adapts Ollama's HTTP API into Arrow-compatible formats:
- `POST /embed` → `nomic-embed-text` via Ollama
- `POST /generate` → configurable model (qwen2.5, mistral, gemma2, llama3.2)
- `POST /rerank` → cross-encoder reranking via generate endpoint
No mocks. No stubs. Real models from day one. Ollama manages model lifecycle, GPU scheduling, caching. Sidecar is stateless passthrough.
### Data Flow
```
Client → gateway → catalogd (metadata lookup)
→ storaged (object read/write)
→ queryd (SQL execution over Parquet)
→ aibridge → sidecar → Ollama (inference)
```
### Invariants
1. Object storage = source of truth for all data
2. catalogd = sole metadata authority (datasets, schemas, manifests)
3. No raw data stored in catalog — only pointers (bucket, key, schema fingerprint)
4. storaged never interprets data — dumb pipe with presigned URLs
5. queryd registers tables via catalog pointers, not by scanning storage
6. aibridge is stateless — Python sidecar is replaceable without touching Rust
7. All services are modular and independently replaceable
### Dependency Graph
```
shared ← storaged ← catalogd ← queryd
shared ← aibridge
gateway → {storaged, catalogd, queryd, aibridge}
ui → gateway (HTTP only, no crate dependency)
```
---
## Phases
### Phase 0: Bootstrap
Workspace compiles, gateway serves health check, structured logging works.
**Gate:** `cargo build` clean, `GET /health` returns 200, logs on stdout, docs committed.
### Phase 1: Storage + Catalog
Write Parquet to object storage, register in catalog, read back.
**Gate:** Upload Parquet → register dataset → retrieve metadata → read back. All via gateway HTTP.
### Phase 2: Query Engine
SQL queries over registered Parquet datasets via DataFusion.
**Gate:** `SELECT * FROM dataset LIMIT 10` returns correct results. Resolution goes through catalog.
### Phase 3: AI Integration
Python sidecar with real Ollama models. Embeddings, generation, reranking.
**Gate:** Rust sends text → Python → Ollama → real embeddings return as Arrow-compatible floats.
### Phase 4: Frontend
Dioxus UI: dataset browser, query editor, results table.
**Gate:** User can browse datasets and run queries from browser.
### Phase 5: Hardening
gRPC internals, OpenTelemetry, auth, config-driven startup.
**Gate:** Services communicate via gRPC. Traces propagate. Auth enforced. System restartable from repo + config.
---
## Available Local Models
| Model | Use |
|---|---|
| `nomic-embed-text` | Embeddings (768d) |
| `qwen2.5` | Code generation, structured output |
| `mistral` | General generation |
| `gemma2` | General generation |
| `llama3.2` | General generation |
Model selection via environment variables. No hardcoded model names in Rust code.
---
## Non-Goals
- Multi-tenancy
- Streaming ingestion / CDC
- Custom file formats
- Query caching / materialized views
- Wrapping `object_store` with another abstraction
- Cloud deployment (local-first)
---
## Risks
| Risk | Severity | Mitigation |
|---|---|---|
| RustFS immaturity | High | Start with LocalFileSystem, test against MinIO, RustFS last. SeaweedFS fallback. |
| DataFusion table registration overhead | Medium | Lazy registration + LRU cache of SessionContext instances. |
| Catalog consistency without DB | Medium | Write-ahead: persist manifest before in-memory update. Rebuild from storage on restart. |
| Dioxus WASM gaps | Medium | Phase 4 is last. Fallback to plain HTML if blocked. |
| Schema evolution | Medium | Schema fingerprinting in Phase 1. Validate before query. |
---
## Operating Rules
1. PRD > architecture > phases > status > git
2. Git is memory, not chat
3. No undocumented changes
4. No silent architecture drift
5. Always work in smallest valid step
6. Always verify before moving on

28
justfile Normal file
View File

@ -0,0 +1,28 @@
# Lakehouse task runner — recipes for the whole Cargo workspace.

# List available recipes (default when running bare `just`)
default:
    @just --list

# Build all crates
build:
    cargo build --workspace

# Run all tests
test:
    cargo test --workspace

# Run gateway
run:
    cargo run --bin gateway

# Check without building
check:
    cargo check --workspace

# Format all code
fmt:
    cargo fmt --all

# Lint (warnings are errors)
clippy:
    cargo clippy --workspace -- -D warnings