Phase 0: bootstrap Rust workspace
- Cargo workspace with 6 crates: shared, storaged, catalogd, queryd, aibridge, gateway
- shared: types (DatasetId, ObjectRef, SchemaFingerprint, DatasetManifest) + error enum
- gateway: Axum HTTP entrypoint with nested service routers + tracing
- All services expose /health stubs
- justfile with build/test/run recipes
- PRD, phase tracker, and ADR docs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
commit
a52ca841c6
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
||||
# Build artifacts
/target

# Editor swap files
*.swp
*.swo

# Local secrets / environment overrides
.env
|
||||
1286
Cargo.lock
generated
Normal file
1286
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
22
Cargo.toml
Normal file
22
Cargo.toml
Normal file
@ -0,0 +1,22 @@
|
||||
[workspace]
resolver = "2"
members = [
    "crates/shared",
    "crates/storaged",
    "crates/catalogd",
    "crates/queryd",
    "crates/aibridge",
    "crates/gateway",
]

# Single source of truth for external dependency versions;
# member crates pull these in with `{ workspace = true }`.
[workspace.dependencies]
tokio = { version = "1", features = ["full"] }
axum = "0.8"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
thiserror = "2"
uuid = { version = "1", features = ["v4", "serde"] }
chrono = { version = "0.4", features = ["serde"] }
tower-http = { version = "0.6", features = ["cors", "trace"] }
|
||||
12
crates/aibridge/Cargo.toml
Normal file
12
crates/aibridge/Cargo.toml
Normal file
@ -0,0 +1,12 @@
|
||||
[package]
name = "aibridge"
version = "0.1.0"
edition = "2024"

[dependencies]
# Internal crates
shared = { path = "../shared" }

# Versions pinned at the workspace root
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }
1
crates/aibridge/src/lib.rs
Normal file
1
crates/aibridge/src/lib.rs
Normal file
@ -0,0 +1 @@
|
||||
pub mod service;
|
||||
9
crates/aibridge/src/service.rs
Normal file
9
crates/aibridge/src/service.rs
Normal file
@ -0,0 +1,9 @@
|
||||
use axum::{Router, routing::get};
|
||||
|
||||
pub fn router() -> Router {
|
||||
Router::new().route("/health", get(health))
|
||||
}
|
||||
|
||||
/// Liveness probe handler for the aibridge service.
async fn health() -> &'static str {
    "aibridge ok"
}
|
||||
12
crates/catalogd/Cargo.toml
Normal file
12
crates/catalogd/Cargo.toml
Normal file
@ -0,0 +1,12 @@
|
||||
[package]
name = "catalogd"
version = "0.1.0"
edition = "2024"

[dependencies]
# Internal crates
shared = { path = "../shared" }

# Versions pinned at the workspace root
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }
1
crates/catalogd/src/lib.rs
Normal file
1
crates/catalogd/src/lib.rs
Normal file
@ -0,0 +1 @@
|
||||
pub mod service;
|
||||
9
crates/catalogd/src/service.rs
Normal file
9
crates/catalogd/src/service.rs
Normal file
@ -0,0 +1,9 @@
|
||||
use axum::{Router, routing::get};
|
||||
|
||||
pub fn router() -> Router {
|
||||
Router::new().route("/health", get(health))
|
||||
}
|
||||
|
||||
/// Liveness probe handler for the catalogd service.
async fn health() -> &'static str {
    "catalogd ok"
}
|
||||
18
crates/gateway/Cargo.toml
Normal file
18
crates/gateway/Cargo.toml
Normal file
@ -0,0 +1,18 @@
|
||||
[package]
name = "gateway"
version = "0.1.0"
edition = "2024"

[dependencies]
# Internal crates — the gateway nests each service's router
shared = { path = "../shared" }
storaged = { path = "../storaged" }
catalogd = { path = "../catalogd" }
queryd = { path = "../queryd" }
aibridge = { path = "../aibridge" }

# Versions pinned at the workspace root
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
tower-http = { workspace = true }
||||
29
crates/gateway/src/main.rs
Normal file
29
crates/gateway/src/main.rs
Normal file
@ -0,0 +1,29 @@
|
||||
use axum::{Router, routing::get};
|
||||
use tower_http::trace::TraceLayer;
|
||||
use tracing_subscriber::{EnvFilter, fmt, layer::SubscriberExt, util::SubscriberInitExt};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
tracing_subscriber::registry()
|
||||
.with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")))
|
||||
.with(fmt::layer())
|
||||
.init();
|
||||
|
||||
let app = Router::new()
|
||||
.route("/health", get(health))
|
||||
.nest("/storage", storaged::service::router())
|
||||
.nest("/catalog", catalogd::service::router())
|
||||
.nest("/query", queryd::service::router())
|
||||
.nest("/ai", aibridge::service::router())
|
||||
.layer(TraceLayer::new_for_http());
|
||||
|
||||
let addr = "0.0.0.0:3100";
|
||||
tracing::info!("gateway listening on {addr}");
|
||||
|
||||
let listener = tokio::net::TcpListener::bind(addr).await.unwrap();
|
||||
axum::serve(listener, app).await.unwrap();
|
||||
}
|
||||
|
||||
/// Top-level liveness probe for the gateway itself.
async fn health() -> &'static str {
    "lakehouse ok"
}
|
||||
12
crates/queryd/Cargo.toml
Normal file
12
crates/queryd/Cargo.toml
Normal file
@ -0,0 +1,12 @@
|
||||
[package]
name = "queryd"
version = "0.1.0"
edition = "2024"

[dependencies]
# Internal crates
shared = { path = "../shared" }

# Versions pinned at the workspace root
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }
||||
1
crates/queryd/src/lib.rs
Normal file
1
crates/queryd/src/lib.rs
Normal file
@ -0,0 +1 @@
|
||||
pub mod service;
|
||||
9
crates/queryd/src/service.rs
Normal file
9
crates/queryd/src/service.rs
Normal file
@ -0,0 +1,9 @@
|
||||
use axum::{Router, routing::get};
|
||||
|
||||
pub fn router() -> Router {
|
||||
Router::new().route("/health", get(health))
|
||||
}
|
||||
|
||||
/// Liveness probe handler for the queryd service.
async fn health() -> &'static str {
    "queryd ok"
}
|
||||
11
crates/shared/Cargo.toml
Normal file
11
crates/shared/Cargo.toml
Normal file
@ -0,0 +1,11 @@
|
||||
[package]
name = "shared"
version = "0.1.0"
edition = "2024"

[dependencies]
# Versions pinned at the workspace root
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
uuid = { workspace = true }
chrono = { workspace = true }
||||
35
crates/shared/src/errors.rs
Normal file
35
crates/shared/src/errors.rs
Normal file
@ -0,0 +1,35 @@
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum LakehouseError {
|
||||
#[error("dataset not found: {0}")]
|
||||
DatasetNotFound(String),
|
||||
|
||||
#[error("object not found: {bucket}/{key}")]
|
||||
ObjectNotFound { bucket: String, key: String },
|
||||
|
||||
#[error("storage error: {0}")]
|
||||
Storage(String),
|
||||
|
||||
#[error("catalog error: {0}")]
|
||||
Catalog(String),
|
||||
|
||||
#[error("query error: {0}")]
|
||||
Query(String),
|
||||
|
||||
#[error("ai bridge error: {0}")]
|
||||
AiBridge(String),
|
||||
|
||||
#[error("serialization error: {0}")]
|
||||
Serialization(String),
|
||||
}
|
||||
|
||||
impl LakehouseError {
|
||||
pub fn status_code(&self) -> u16 {
|
||||
match self {
|
||||
Self::DatasetNotFound(_) | Self::ObjectNotFound { .. } => 404,
|
||||
Self::Storage(_) | Self::Catalog(_) | Self::Query(_) | Self::AiBridge(_) => 500,
|
||||
Self::Serialization(_) => 400,
|
||||
}
|
||||
}
|
||||
}
|
||||
2
crates/shared/src/lib.rs
Normal file
2
crates/shared/src/lib.rs
Normal file
@ -0,0 +1,2 @@
|
||||
pub mod types;
|
||||
pub mod errors;
|
||||
42
crates/shared/src/types.rs
Normal file
42
crates/shared/src/types.rs
Normal file
@ -0,0 +1,42 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Unique identifier for a dataset in the catalog.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct DatasetId(pub Uuid);
|
||||
|
||||
impl DatasetId {
|
||||
pub fn new() -> Self {
|
||||
Self(Uuid::new_v4())
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for DatasetId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Pointer to an object in storage. This is what the catalog stores — never raw data.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ObjectRef {
|
||||
pub bucket: String,
|
||||
pub key: String,
|
||||
pub size_bytes: u64,
|
||||
pub created_at: chrono::DateTime<chrono::Utc>,
|
||||
}
|
||||
|
||||
/// Schema fingerprint — deterministic hash of an Arrow schema.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct SchemaFingerprint(pub String);
|
||||
|
||||
/// Dataset manifest — the catalog's view of a dataset.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DatasetManifest {
|
||||
pub id: DatasetId,
|
||||
pub name: String,
|
||||
pub schema_fingerprint: SchemaFingerprint,
|
||||
pub objects: Vec<ObjectRef>,
|
||||
pub created_at: chrono::DateTime<chrono::Utc>,
|
||||
pub updated_at: chrono::DateTime<chrono::Utc>,
|
||||
}
|
||||
12
crates/storaged/Cargo.toml
Normal file
12
crates/storaged/Cargo.toml
Normal file
@ -0,0 +1,12 @@
|
||||
[package]
name = "storaged"
version = "0.1.0"
edition = "2024"

[dependencies]
# Internal crates
shared = { path = "../shared" }

# Versions pinned at the workspace root
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }
||||
1
crates/storaged/src/lib.rs
Normal file
1
crates/storaged/src/lib.rs
Normal file
@ -0,0 +1 @@
|
||||
pub mod service;
|
||||
9
crates/storaged/src/service.rs
Normal file
9
crates/storaged/src/service.rs
Normal file
@ -0,0 +1,9 @@
|
||||
use axum::{Router, routing::get};
|
||||
|
||||
pub fn router() -> Router {
|
||||
Router::new().route("/health", get(health))
|
||||
}
|
||||
|
||||
/// Liveness probe handler for the storaged service.
async fn health() -> &'static str {
    "storaged ok"
}
|
||||
26
docs/DECISIONS.md
Normal file
26
docs/DECISIONS.md
Normal file
@ -0,0 +1,26 @@
|
||||
# Architecture Decision Records
|
||||
|
||||
## ADR-001: Object storage as source of truth
|
||||
**Date:** 2026-03-27
|
||||
**Decision:** All data lives in S3-compatible object storage. No traditional database.
|
||||
**Rationale:** Eliminates DB operational overhead, enables infinite scale at storage tier, forces clean separation of data and metadata.
|
||||
|
||||
## ADR-002: Catalog metadata persistence
|
||||
**Date:** 2026-03-27
|
||||
**Decision:** catalogd persists manifests as Parquet files in object storage. In-memory index rebuilt on startup.
|
||||
**Rationale:** No external DB dependency. Storage is already the source of truth. Write-ahead pattern ensures consistency.
|
||||
|
||||
## ADR-003: Real models only (no mocks)
|
||||
**Date:** 2026-03-27
|
||||
**Decision:** AI sidecar hits Ollama with real models from Phase 3 onward. No stub/mock endpoints.
|
||||
**Rationale:** Local Ollama instance available with nomic-embed-text, qwen2.5, mistral, gemma2, llama3.2. Mocks hide integration bugs.
|
||||
|
||||
## ADR-004: Python sidecar as Ollama adapter
|
||||
**Date:** 2026-03-27
|
||||
**Decision:** Python FastAPI sidecar is a thin HTTP adapter over Ollama's API. No model loading in Python.
|
||||
**Rationale:** Ollama handles model lifecycle, GPU scheduling, caching. Sidecar stays stateless and lightweight — no torch/transformers deps.
|
||||
|
||||
## ADR-005: HTTP-first, gRPC later
|
||||
**Date:** 2026-03-27
|
||||
**Decision:** All inter-service communication uses HTTP through Phase 4. gRPC migration in Phase 5.
|
||||
**Rationale:** HTTP is simpler to debug, test, and iterate on. gRPC adds protobuf compilation and streaming complexity before APIs stabilize.
|
||||
57
docs/PHASES.md
Normal file
57
docs/PHASES.md
Normal file
@ -0,0 +1,57 @@
|
||||
# Phase Tracker
|
||||
|
||||
## Phase 0: Bootstrap
|
||||
- [ ] 0.1 — Cargo workspace with all crate stubs compiling
|
||||
- [ ] 0.2 — `shared` crate: error types, ObjectRef, DatasetId
|
||||
- [ ] 0.3 — `gateway` with Axum: GET /health → 200
|
||||
- [ ] 0.4 — tracing + tracing-subscriber wired in gateway
|
||||
- [ ] 0.5 — justfile with build, test, run recipes
|
||||
- [ ] 0.6 — docs committed to git
|
||||
|
||||
**Gate:** All crates compile. Gateway runs. Logs emit. Docs committed.
|
||||
|
||||
## Phase 1: Storage + Catalog
|
||||
- [ ] 1.1 — storaged: object_store backend init (LocalFileSystem → S3)
|
||||
- [ ] 1.2 — storaged: Axum endpoints (PUT/GET/DELETE /objects/{key})
|
||||
- [ ] 1.3 — shared/arrow.rs: RecordBatch ↔ Parquet helpers
|
||||
- [ ] 1.4 — catalogd/registry.rs: in-memory index + manifest persistence
|
||||
- [ ] 1.5 — catalogd/schema.rs: schema fingerprinting
|
||||
- [ ] 1.6 — catalogd service: POST/GET /datasets endpoints
|
||||
- [ ] 1.7 — gateway routes to storaged + catalogd
|
||||
|
||||
**Gate:** Upload Parquet → register → metadata → read back. All via gateway.
|
||||
|
||||
## Phase 2: Query Engine
|
||||
- [ ] 2.1 — queryd: SessionContext + object_store config
|
||||
- [ ] 2.2 — queryd: ListingTable from catalog ObjectRefs
|
||||
- [ ] 2.3 — queryd service: POST /query → Arrow IPC or JSON
|
||||
- [ ] 2.4 — queryd → catalogd wiring
|
||||
- [ ] 2.5 — gateway routes /query
|
||||
|
||||
**Gate:** SQL over Parquet returns correct results via catalog resolution.
|
||||
|
||||
## Phase 3: AI Integration
|
||||
- [ ] 3.1 — Python sidecar: FastAPI + Ollama (embed/generate/rerank)
|
||||
- [ ] 3.2 — Dockerfile for sidecar
|
||||
- [ ] 3.3 — aibridge/client.rs: HTTP client to sidecar
|
||||
- [ ] 3.4 — aibridge service: Axum proxy endpoints
|
||||
- [ ] 3.5 — Model config via env vars
|
||||
|
||||
**Gate:** Rust → Python → Ollama → real embeddings return.
|
||||
|
||||
## Phase 4: Frontend
|
||||
- [ ] 4.1 — Dioxus scaffold, WASM build
|
||||
- [ ] 4.2 — Dataset browser
|
||||
- [ ] 4.3 — Query editor + results table
|
||||
- [ ] 4.4 — Error display + loading states
|
||||
|
||||
**Gate:** Browse datasets and query from browser.
|
||||
|
||||
## Phase 5: Hardening
|
||||
- [ ] 5.1 — Proto definitions
|
||||
- [ ] 5.2 — Internal gRPC migration
|
||||
- [ ] 5.3 — OpenTelemetry tracing
|
||||
- [ ] 5.4 — Auth middleware
|
||||
- [ ] 5.5 — Config-driven startup
|
||||
|
||||
**Gate:** gRPC internals, traces, auth, restartable from repo + config.
|
||||
175
docs/PRD.md
Normal file
175
docs/PRD.md
Normal file
@ -0,0 +1,175 @@
|
||||
# PRD: Lakehouse — Rust-First Object Storage System
|
||||
|
||||
**Status:** Active
|
||||
**Created:** 2026-03-27
|
||||
**Owner:** J
|
||||
|
||||
---
|
||||
|
||||
## Problem
|
||||
|
||||
Traditional data platforms couple storage, compute, and metadata into monolithic databases. This creates vendor lock-in, scaling bottlenecks, and opaque data access. AI workloads bolt onto these systems awkwardly, sharing resources with transactional queries.
|
||||
|
||||
We need a system where:
|
||||
- Object storage is the source of truth (not a database)
|
||||
- Metadata, access, and execution are controlled by Rust services
|
||||
- Queries run directly over object storage via Arrow/Parquet
|
||||
- AI inference is isolated and swappable
|
||||
- The entire system is rebuildable from repository + docs alone
|
||||
|
||||
---
|
||||
|
||||
## Solution
|
||||
|
||||
A modular Rust service mesh over S3-compatible object storage.
|
||||
|
||||
### Locked Stack
|
||||
|
||||
| Layer | Technology | Locked |
|
||||
|---|---|---|
|
||||
| Frontend | Dioxus | Yes |
|
||||
| API | Axum + Tokio | Yes |
|
||||
| Object Storage Interface | Apache Arrow `object_store` | Yes |
|
||||
| Storage Backend | RustFS (fallback: SeaweedFS) | Yes |
|
||||
| Query Engine | DataFusion | Yes |
|
||||
| Data Format | Parquet + Arrow | Yes |
|
||||
| RPC (internal) | tonic (gRPC) | Yes |
|
||||
| AI Runtime | Ollama (local models) | Yes |
|
||||
| AI Boundary | Python FastAPI sidecar → Ollama HTTP API | Yes |
|
||||
|
||||
No new frameworks. No exceptions.
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
### Services
|
||||
|
||||
| Service | Responsibility |
|
||||
|---|---|
|
||||
| **gateway** | HTTP ingress, routing, auth envelope, middleware |
|
||||
| **catalogd** | Metadata control plane — dataset registry, schema versions, manifest index |
|
||||
| **storaged** | Object I/O — read/write/list/delete via `object_store` crate |
|
||||
| **queryd** | SQL execution — DataFusion over registered Parquet datasets |
|
||||
| **aibridge** | Rust↔Python boundary — HTTP client to FastAPI sidecar |
|
||||
| **ui** | Dioxus frontend — dataset browser, query editor, results viewer |
|
||||
| **shared** | Types, errors, Arrow helpers, protobuf definitions |
|
||||
|
||||
### AI Sidecar
|
||||
|
||||
Python FastAPI process that adapts Ollama's HTTP API into Arrow-compatible formats:
|
||||
- `POST /embed` → `nomic-embed-text` via Ollama
|
||||
- `POST /generate` → configurable model (qwen2.5, mistral, gemma2, llama3.2)
|
||||
- `POST /rerank` → cross-encoder reranking via generate endpoint
|
||||
|
||||
No mocks. No stubs. Real models from day one. Ollama manages model lifecycle, GPU scheduling, caching. Sidecar is stateless passthrough.
|
||||
|
||||
### Data Flow
|
||||
|
||||
```
|
||||
Client → gateway → catalogd (metadata lookup)
|
||||
→ storaged (object read/write)
|
||||
→ queryd (SQL execution over Parquet)
|
||||
→ aibridge → sidecar → Ollama (inference)
|
||||
```
|
||||
|
||||
### Invariants
|
||||
|
||||
1. Object storage = source of truth for all data
|
||||
2. catalogd = sole metadata authority (datasets, schemas, manifests)
|
||||
3. No raw data stored in catalog — only pointers (bucket, key, schema fingerprint)
|
||||
4. storaged never interprets data — dumb pipe with presigned URLs
|
||||
5. queryd registers tables via catalog pointers, not by scanning storage
|
||||
6. aibridge is stateless — Python sidecar is replaceable without touching Rust
|
||||
7. All services are modular and independently replaceable
|
||||
|
||||
### Dependency Graph
|
||||
|
||||
```
|
||||
shared ← storaged ← catalogd ← queryd
|
||||
shared ← aibridge
|
||||
gateway → {storaged, catalogd, queryd, aibridge}
|
||||
ui → gateway (HTTP only, no crate dependency)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phases
|
||||
|
||||
### Phase 0: Bootstrap
|
||||
Workspace compiles, gateway serves health check, structured logging works.
|
||||
|
||||
**Gate:** `cargo build` clean, `GET /health` returns 200, logs on stdout, docs committed.
|
||||
|
||||
### Phase 1: Storage + Catalog
|
||||
Write Parquet to object storage, register in catalog, read back.
|
||||
|
||||
**Gate:** Upload Parquet → register dataset → retrieve metadata → read back. All via gateway HTTP.
|
||||
|
||||
### Phase 2: Query Engine
|
||||
SQL queries over registered Parquet datasets via DataFusion.
|
||||
|
||||
**Gate:** `SELECT * FROM dataset LIMIT 10` returns correct results. Resolution goes through catalog.
|
||||
|
||||
### Phase 3: AI Integration
|
||||
Python sidecar with real Ollama models. Embeddings, generation, reranking.
|
||||
|
||||
**Gate:** Rust sends text → Python → Ollama → real embeddings return as Arrow-compatible floats.
|
||||
|
||||
### Phase 4: Frontend
|
||||
Dioxus UI: dataset browser, query editor, results table.
|
||||
|
||||
**Gate:** User can browse datasets and run queries from browser.
|
||||
|
||||
### Phase 5: Hardening
|
||||
gRPC internals, OpenTelemetry, auth, config-driven startup.
|
||||
|
||||
**Gate:** Services communicate via gRPC. Traces propagate. Auth enforced. System restartable from repo + config.
|
||||
|
||||
---
|
||||
|
||||
## Available Local Models
|
||||
|
||||
| Model | Use |
|
||||
|---|---|
|
||||
| `nomic-embed-text` | Embeddings (768d) |
|
||||
| `qwen2.5` | Code generation, structured output |
|
||||
| `mistral` | General generation |
|
||||
| `gemma2` | General generation |
|
||||
| `llama3.2` | General generation |
|
||||
|
||||
Model selection via environment variables. No hardcoded model names in Rust code.
|
||||
|
||||
---
|
||||
|
||||
## Non-Goals
|
||||
|
||||
- Multi-tenancy
|
||||
- Streaming ingestion / CDC
|
||||
- Custom file formats
|
||||
- Query caching / materialized views
|
||||
- Wrapping `object_store` with another abstraction
|
||||
- Cloud deployment (local-first)
|
||||
|
||||
---
|
||||
|
||||
## Risks
|
||||
|
||||
| Risk | Severity | Mitigation |
|
||||
|---|---|---|
|
||||
| RustFS immaturity | High | Start with LocalFileSystem, test against MinIO, RustFS last. SeaweedFS fallback. |
|
||||
| DataFusion table registration overhead | Medium | Lazy registration + LRU cache of SessionContext instances. |
|
||||
| Catalog consistency without DB | Medium | Write-ahead: persist manifest before in-memory update. Rebuild from storage on restart. |
|
||||
| Dioxus WASM gaps | Medium | Phase 4 is last. Fallback to plain HTML if blocked. |
|
||||
| Schema evolution | Medium | Schema fingerprinting in Phase 1. Validate before query. |
|
||||
|
||||
---
|
||||
|
||||
## Operating Rules
|
||||
|
||||
1. PRD > architecture > phases > status > git
|
||||
2. Git is memory, not chat
|
||||
3. No undocumented changes
|
||||
4. No silent architecture drift
|
||||
5. Always work in smallest valid step
|
||||
6. Always verify before moving on
|
||||
28
justfile
Normal file
28
justfile
Normal file
@ -0,0 +1,28 @@
|
||||
# Lakehouse task runner

# List available recipes when invoked bare
default:
    @just --list

# Build all crates
build:
    cargo build --workspace

# Run all tests
test:
    cargo test --workspace

# Run gateway
run:
    cargo run --bin gateway

# Check without building
check:
    cargo check --workspace

# Format all code
fmt:
    cargo fmt --all

# Lint (warnings are errors)
clippy:
    cargo clippy --workspace -- -D warnings
|
||||
Loading…
x
Reference in New Issue
Block a user