Phase 0: bootstrap Rust workspace

- Cargo workspace with 6 crates: shared, storaged, catalogd, queryd, aibridge, gateway
- shared: types (DatasetId, ObjectRef, SchemaFingerprint, DatasetManifest) + error enum
- gateway: Axum HTTP entrypoint with nested service routers + tracing
- All services expose /health stubs
- justfile with build/test/run recipes
- PRD, phase tracker, and ADR docs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-03-27 04:59:05 -05:00
commit a52ca841c6
25 changed files with 1823 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
/target
*.swp
*.swo
.env

1286
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

22
Cargo.toml Normal file
View File

@ -0,0 +1,22 @@
# Lakehouse workspace root — six service crates under crates/ (see docs/PRD.md).
[workspace]
resolver = "2"
members = [
    "crates/shared",
    "crates/storaged",
    "crates/catalogd",
    "crates/queryd",
    "crates/aibridge",
    "crates/gateway",
]

# Single source of truth for dependency versions.
# Member crates inherit these with `{ workspace = true }`.
[workspace.dependencies]
tokio = { version = "1", features = ["full"] }
axum = "0.8"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
thiserror = "2"
uuid = { version = "1", features = ["v4", "serde"] }
chrono = { version = "0.4", features = ["serde"] }
tower-http = { version = "0.6", features = ["cors", "trace"] }

View File

@ -0,0 +1,12 @@
# aibridge — Rust boundary to the Python AI sidecar (see docs/PRD.md).
[package]
name = "aibridge"
version = "0.1.0"
edition = "2024"

[dependencies]
shared = { path = "../shared" }
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }

View File

@ -0,0 +1 @@
pub mod service;

View File

@ -0,0 +1,9 @@
use axum::{Router, routing::get};

/// Build the aibridge HTTP router.
///
/// The gateway mounts this under its own path prefix, so routes here
/// are declared relative to the service root.
pub fn router() -> Router {
    let health_route = get(health_check);
    Router::new().route("/health", health_route)
}

/// Plain-text liveness probe for the aibridge service.
async fn health_check() -> &'static str {
    "aibridge ok"
}

View File

@ -0,0 +1,12 @@
# catalogd — metadata control plane: dataset registry and manifests (see docs/PRD.md).
[package]
name = "catalogd"
version = "0.1.0"
edition = "2024"

[dependencies]
shared = { path = "../shared" }
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }

View File

@ -0,0 +1 @@
pub mod service;

View File

@ -0,0 +1,9 @@
use axum::{Router, routing::get};

/// Build the catalogd HTTP router.
///
/// Mounted by the gateway under its own prefix; routes are relative
/// to the service root.
pub fn router() -> Router {
    let health_route = get(health_check);
    Router::new().route("/health", health_route)
}

/// Plain-text liveness probe for the catalogd service.
async fn health_check() -> &'static str {
    "catalogd ok"
}

18
crates/gateway/Cargo.toml Normal file
View File

@ -0,0 +1,18 @@
# gateway — HTTP ingress; the only crate that depends on every service (see docs/PRD.md).
[package]
name = "gateway"
version = "0.1.0"
edition = "2024"

[dependencies]
shared = { path = "../shared" }
# Service crates are nested as routers in main.rs.
storaged = { path = "../storaged" }
catalogd = { path = "../catalogd" }
queryd = { path = "../queryd" }
aibridge = { path = "../aibridge" }
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
tower-http = { workspace = true }

View File

@ -0,0 +1,29 @@
use axum::{Router, routing::get};
use tower_http::trace::TraceLayer;
use tracing_subscriber::{EnvFilter, fmt, layer::SubscriberExt, util::SubscriberInitExt};

/// Gateway entrypoint: sets up structured tracing, nests each service
/// router under its own path prefix, and serves HTTP.
#[tokio::main]
async fn main() {
    // Log filter from RUST_LOG when set; defaults to `info`.
    tracing_subscriber::registry()
        .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")))
        .with(fmt::layer())
        .init();

    let app = Router::new()
        .route("/health", get(health))
        .nest("/storage", storaged::service::router())
        .nest("/catalog", catalogd::service::router())
        .nest("/query", queryd::service::router())
        .nest("/ai", aibridge::service::router())
        .layer(TraceLayer::new_for_http());

    // Bind address is overridable via GATEWAY_ADDR; the default matches
    // the previously hardcoded value, so existing setups are unaffected.
    let addr = std::env::var("GATEWAY_ADDR").unwrap_or_else(|_| "0.0.0.0:3100".to_string());
    tracing::info!("gateway listening on {addr}");
    let listener = tokio::net::TcpListener::bind(addr.as_str())
        .await
        // Bind failure (port in use, bad addr) is fatal; name the address
        // and cause instead of a bare unwrap panic.
        .unwrap_or_else(|e| panic!("gateway failed to bind {addr}: {e}"));
    axum::serve(listener, app)
        .await
        .expect("gateway server terminated with an error");
}

/// Liveness probe for the gateway itself (each nested service exposes its own).
async fn health() -> &'static str {
    "lakehouse ok"
}

12
crates/queryd/Cargo.toml Normal file
View File

@ -0,0 +1,12 @@
# queryd — SQL execution over registered Parquet datasets (see docs/PRD.md).
[package]
name = "queryd"
version = "0.1.0"
edition = "2024"

[dependencies]
shared = { path = "../shared" }
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }

1
crates/queryd/src/lib.rs Normal file
View File

@ -0,0 +1 @@
pub mod service;

View File

@ -0,0 +1,9 @@
use axum::{Router, routing::get};

/// Build the queryd HTTP router.
///
/// Mounted by the gateway under its own prefix; routes are relative
/// to the service root.
pub fn router() -> Router {
    let health_route = get(health_check);
    Router::new().route("/health", health_route)
}

/// Plain-text liveness probe for the queryd service.
async fn health_check() -> &'static str {
    "queryd ok"
}

11
crates/shared/Cargo.toml Normal file
View File

@ -0,0 +1,11 @@
# shared — common types and errors; every service crate depends on this (see docs/PRD.md).
[package]
name = "shared"
version = "0.1.0"
edition = "2024"

[dependencies]
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
uuid = { workspace = true }
chrono = { workspace = true }

View File

@ -0,0 +1,35 @@
use thiserror::Error;

/// Workspace-wide error type shared by every service crate.
///
/// Display strings come from the `#[error(...)]` attributes via thiserror.
/// HTTP mapping lives in `status_code` on the inherent impl below.
#[derive(Debug, Error)]
pub enum LakehouseError {
    /// Catalog lookup failed for the named dataset.
    #[error("dataset not found: {0}")]
    DatasetNotFound(String),
    /// Object storage lookup failed for the given bucket/key pair.
    #[error("object not found: {bucket}/{key}")]
    ObjectNotFound { bucket: String, key: String },
    /// Failure inside the storaged service (object I/O).
    #[error("storage error: {0}")]
    Storage(String),
    /// Failure inside the catalogd service (metadata).
    #[error("catalog error: {0}")]
    Catalog(String),
    /// Failure inside the queryd service (SQL execution).
    #[error("query error: {0}")]
    Query(String),
    /// Failure inside the aibridge service (AI sidecar boundary).
    #[error("ai bridge error: {0}")]
    AiBridge(String),
    /// Serialization/deserialization failure (treated as a client error).
    #[error("serialization error: {0}")]
    Serialization(String),
}
impl LakehouseError {
    /// HTTP status code this error maps to at a service boundary.
    ///
    /// Not-found variants map to 404, malformed payloads to 400, and
    /// every internal service failure to 500.
    pub fn status_code(&self) -> u16 {
        match self {
            Self::DatasetNotFound(_) => 404,
            Self::ObjectNotFound { .. } => 404,
            Self::Serialization(_) => 400,
            Self::Storage(_) | Self::Catalog(_) | Self::Query(_) | Self::AiBridge(_) => 500,
        }
    }
}

2
crates/shared/src/lib.rs Normal file
View File

@ -0,0 +1,2 @@
pub mod types;
pub mod errors;

View File

@ -0,0 +1,42 @@
use serde::{Deserialize, Serialize};
use uuid::Uuid;

/// Unique identifier for a dataset in the catalog.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct DatasetId(pub Uuid);

impl DatasetId {
    /// Generate a fresh random (v4) identifier.
    pub fn new() -> Self {
        Self(Uuid::new_v4())
    }
}

// A `new()` with no arguments and no `Default` trips clippy's
// `new_without_default` lint, and `just clippy` runs with `-D warnings`.
impl Default for DatasetId {
    fn default() -> Self {
        Self::new()
    }
}

impl std::fmt::Display for DatasetId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.0)
    }
}

/// Pointer to an object in storage. This is what the catalog stores — never raw data.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ObjectRef {
    pub bucket: String,
    pub key: String,
    pub size_bytes: u64,
    pub created_at: chrono::DateTime<chrono::Utc>,
}

/// Schema fingerprint — deterministic hash of an Arrow schema.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SchemaFingerprint(pub String);

/// Dataset manifest — the catalog's view of a dataset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetManifest {
    pub id: DatasetId,
    pub name: String,
    pub schema_fingerprint: SchemaFingerprint,
    pub objects: Vec<ObjectRef>,
    pub created_at: chrono::DateTime<chrono::Utc>,
    pub updated_at: chrono::DateTime<chrono::Utc>,
}

View File

@ -0,0 +1,12 @@
# storaged — object I/O over the `object_store` interface (see docs/PRD.md).
[package]
name = "storaged"
version = "0.1.0"
edition = "2024"

[dependencies]
shared = { path = "../shared" }
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }

View File

@ -0,0 +1 @@
pub mod service;

View File

@ -0,0 +1,9 @@
use axum::{Router, routing::get};

/// Build the storaged HTTP router.
///
/// Mounted by the gateway under its own prefix; routes are relative
/// to the service root.
pub fn router() -> Router {
    let health_route = get(health_check);
    Router::new().route("/health", health_route)
}

/// Plain-text liveness probe for the storaged service.
async fn health_check() -> &'static str {
    "storaged ok"
}

26
docs/DECISIONS.md Normal file
View File

@ -0,0 +1,26 @@
# Architecture Decision Records
## ADR-001: Object storage as source of truth
**Date:** 2026-03-27
**Decision:** All data lives in S3-compatible object storage. No traditional database.
**Rationale:** Eliminates DB operational overhead, enables infinite scale at storage tier, forces clean separation of data and metadata.
## ADR-002: Catalog metadata persistence
**Date:** 2026-03-27
**Decision:** catalogd persists manifests as Parquet files in object storage. In-memory index rebuilt on startup.
**Rationale:** No external DB dependency. Storage is already the source of truth. Write-ahead pattern ensures consistency.
## ADR-003: Real models only (no mocks)
**Date:** 2026-03-27
**Decision:** AI sidecar hits Ollama with real models from Phase 3 onward. No stub/mock endpoints.
**Rationale:** Local Ollama instance available with nomic-embed-text, qwen2.5, mistral, gemma2, llama3.2. Mocks hide integration bugs.
## ADR-004: Python sidecar as Ollama adapter
**Date:** 2026-03-27
**Decision:** Python FastAPI sidecar is a thin HTTP adapter over Ollama's API. No model loading in Python.
**Rationale:** Ollama handles model lifecycle, GPU scheduling, caching. Sidecar stays stateless and lightweight — no torch/transformers deps.
## ADR-005: HTTP-first, gRPC later
**Date:** 2026-03-27
**Decision:** All inter-service communication uses HTTP through Phase 4. gRPC migration in Phase 5.
**Rationale:** HTTP is simpler to debug, test, and iterate on. gRPC adds protobuf compilation and streaming complexity before APIs stabilize.

57
docs/PHASES.md Normal file
View File

@ -0,0 +1,57 @@
# Phase Tracker
## Phase 0: Bootstrap
- [ ] 0.1 — Cargo workspace with all crate stubs compiling
- [ ] 0.2 — `shared` crate: error types, ObjectRef, DatasetId
- [ ] 0.3 — `gateway` with Axum: GET /health → 200
- [ ] 0.4 — tracing + tracing-subscriber wired in gateway
- [ ] 0.5 — justfile with build, test, run recipes
- [ ] 0.6 — docs committed to git
**Gate:** All crates compile. Gateway runs. Logs emit. Docs committed.
## Phase 1: Storage + Catalog
- [ ] 1.1 — storaged: object_store backend init (LocalFileSystem → S3)
- [ ] 1.2 — storaged: Axum endpoints (PUT/GET/DELETE /objects/{key})
- [ ] 1.3 — shared/arrow.rs: RecordBatch ↔ Parquet helpers
- [ ] 1.4 — catalogd/registry.rs: in-memory index + manifest persistence
- [ ] 1.5 — catalogd/schema.rs: schema fingerprinting
- [ ] 1.6 — catalogd service: POST/GET /datasets endpoints
- [ ] 1.7 — gateway routes to storaged + catalogd
**Gate:** Upload Parquet → register → metadata → read back. All via gateway.
## Phase 2: Query Engine
- [ ] 2.1 — queryd: SessionContext + object_store config
- [ ] 2.2 — queryd: ListingTable from catalog ObjectRefs
- [ ] 2.3 — queryd service: POST /query → Arrow IPC or JSON
- [ ] 2.4 — queryd → catalogd wiring
- [ ] 2.5 — gateway routes /query
**Gate:** SQL over Parquet returns correct results via catalog resolution.
## Phase 3: AI Integration
- [ ] 3.1 — Python sidecar: FastAPI + Ollama (embed/generate/rerank)
- [ ] 3.2 — Dockerfile for sidecar
- [ ] 3.3 — aibridge/client.rs: HTTP client to sidecar
- [ ] 3.4 — aibridge service: Axum proxy endpoints
- [ ] 3.5 — Model config via env vars
**Gate:** Rust → Python → Ollama → real embeddings return.
## Phase 4: Frontend
- [ ] 4.1 — Dioxus scaffold, WASM build
- [ ] 4.2 — Dataset browser
- [ ] 4.3 — Query editor + results table
- [ ] 4.4 — Error display + loading states
**Gate:** Browse datasets and query from browser.
## Phase 5: Hardening
- [ ] 5.1 — Proto definitions
- [ ] 5.2 — Internal gRPC migration
- [ ] 5.3 — OpenTelemetry tracing
- [ ] 5.4 — Auth middleware
- [ ] 5.5 — Config-driven startup
**Gate:** gRPC internals, traces, auth, restartable from repo + config.

175
docs/PRD.md Normal file
View File

@ -0,0 +1,175 @@
# PRD: Lakehouse — Rust-First Object Storage System
**Status:** Active
**Created:** 2026-03-27
**Owner:** J
---
## Problem
Traditional data platforms couple storage, compute, and metadata into monolithic databases. This creates vendor lock-in, scaling bottlenecks, and opaque data access. AI workloads bolt onto these systems awkwardly, sharing resources with transactional queries.
We need a system where:
- Object storage is the source of truth (not a database)
- Metadata, access, and execution are controlled by Rust services
- Queries run directly over object storage via Arrow/Parquet
- AI inference is isolated and swappable
- The entire system is rebuildable from repository + docs alone
---
## Solution
A modular Rust service mesh over S3-compatible object storage.
### Locked Stack
| Layer | Technology | Locked |
|---|---|---|
| Frontend | Dioxus | Yes |
| API | Axum + Tokio | Yes |
| Object Storage Interface | Apache Arrow `object_store` | Yes |
| Storage Backend | RustFS (fallback: SeaweedFS) | Yes |
| Query Engine | DataFusion | Yes |
| Data Format | Parquet + Arrow | Yes |
| RPC (internal) | tonic (gRPC) | Yes |
| AI Runtime | Ollama (local models) | Yes |
| AI Boundary | Python FastAPI sidecar → Ollama HTTP API | Yes |
No new frameworks. No exceptions.
---
## Architecture
### Services
| Service | Responsibility |
|---|---|
| **gateway** | HTTP ingress, routing, auth envelope, middleware |
| **catalogd** | Metadata control plane — dataset registry, schema versions, manifest index |
| **storaged** | Object I/O — read/write/list/delete via `object_store` crate |
| **queryd** | SQL execution — DataFusion over registered Parquet datasets |
| **aibridge** | Rust↔Python boundary — HTTP client to FastAPI sidecar |
| **ui** | Dioxus frontend — dataset browser, query editor, results viewer |
| **shared** | Types, errors, Arrow helpers, protobuf definitions |
### AI Sidecar
Python FastAPI process that adapts Ollama's HTTP API into Arrow-compatible formats:
- `POST /embed` → `nomic-embed-text` via Ollama
- `POST /generate` → configurable model (qwen2.5, mistral, gemma2, llama3.2)
- `POST /rerank` → cross-encoder reranking via generate endpoint
No mocks. No stubs. Real models from day one. Ollama manages model lifecycle, GPU scheduling, caching. Sidecar is stateless passthrough.
### Data Flow
```
Client → gateway → catalogd (metadata lookup)
→ storaged (object read/write)
→ queryd (SQL execution over Parquet)
→ aibridge → sidecar → Ollama (inference)
```
### Invariants
1. Object storage = source of truth for all data
2. catalogd = sole metadata authority (datasets, schemas, manifests)
3. No raw data stored in catalog — only pointers (bucket, key, schema fingerprint)
4. storaged never interprets data — dumb pipe with presigned URLs
5. queryd registers tables via catalog pointers, not by scanning storage
6. aibridge is stateless — Python sidecar is replaceable without touching Rust
7. All services are modular and independently replaceable
### Dependency Graph
```
shared ← storaged ← catalogd ← queryd
shared ← aibridge
gateway → {storaged, catalogd, queryd, aibridge}
ui → gateway (HTTP only, no crate dependency)
```
---
## Phases
### Phase 0: Bootstrap
Workspace compiles, gateway serves health check, structured logging works.
**Gate:** `cargo build` clean, `GET /health` returns 200, logs on stdout, docs committed.
### Phase 1: Storage + Catalog
Write Parquet to object storage, register in catalog, read back.
**Gate:** Upload Parquet → register dataset → retrieve metadata → read back. All via gateway HTTP.
### Phase 2: Query Engine
SQL queries over registered Parquet datasets via DataFusion.
**Gate:** `SELECT * FROM dataset LIMIT 10` returns correct results. Resolution goes through catalog.
### Phase 3: AI Integration
Python sidecar with real Ollama models. Embeddings, generation, reranking.
**Gate:** Rust sends text → Python → Ollama → real embeddings return as Arrow-compatible floats.
### Phase 4: Frontend
Dioxus UI: dataset browser, query editor, results table.
**Gate:** User can browse datasets and run queries from browser.
### Phase 5: Hardening
gRPC internals, OpenTelemetry, auth, config-driven startup.
**Gate:** Services communicate via gRPC. Traces propagate. Auth enforced. System restartable from repo + config.
---
## Available Local Models
| Model | Use |
|---|---|
| `nomic-embed-text` | Embeddings (768d) |
| `qwen2.5` | Code generation, structured output |
| `mistral` | General generation |
| `gemma2` | General generation |
| `llama3.2` | General generation |
Model selection via environment variables. No hardcoded model names in Rust code.
---
## Non-Goals
- Multi-tenancy
- Streaming ingestion / CDC
- Custom file formats
- Query caching / materialized views
- Wrapping `object_store` with another abstraction
- Cloud deployment (local-first)
---
## Risks
| Risk | Severity | Mitigation |
|---|---|---|
| RustFS immaturity | High | Start with LocalFileSystem, test against MinIO, RustFS last. SeaweedFS fallback. |
| DataFusion table registration overhead | Medium | Lazy registration + LRU cache of SessionContext instances. |
| Catalog consistency without DB | Medium | Write-ahead: persist manifest before in-memory update. Rebuild from storage on restart. |
| Dioxus WASM gaps | Medium | Phase 4 is last. Fallback to plain HTML if blocked. |
| Schema evolution | Medium | Schema fingerprinting in Phase 1. Validate before query. |
---
## Operating Rules
1. PRD > architecture > phases > status > git
2. Git is memory, not chat
3. No undocumented changes
4. No silent architecture drift
5. Always work in smallest valid step
6. Always verify before moving on

28
justfile Normal file
View File

@ -0,0 +1,28 @@
# Lakehouse task runner — recipes for the whole Cargo workspace.

# List available recipes (default when running bare `just`)
default:
    @just --list

# Build all crates
build:
    cargo build --workspace

# Run all tests
test:
    cargo test --workspace

# Run gateway
run:
    cargo run --bin gateway

# Check without building
check:
    cargo check --workspace

# Format all code
fmt:
    cargo fmt --all

# Lint (warnings are errors)
clippy:
    cargo clippy --workspace -- -D warnings