- ingestd crate: detect file type → parse → schema detection → Parquet → catalog
- CSV: auto-detect column types (int, float, bool, string), handles $, %, commas
  Strips dollar signs from amounts, flexible row parsing, sanitized column names
- JSON: array or newline-delimited, nested object flattening (a.b.c → a_b_c)
- PDF: text extraction via lopdf, one row per page (source_file, page_number, text)
- Text/SMS: line-based ingestion with line numbers
- Dedup: SHA-256 content hash, re-ingest same file = no-op
- Gateway: POST /ingest/file multipart upload, 256MB body limit
- Schema detection per ADR-010: ambiguous types default to String
- 12 unit tests passing (CSV parsing, JSON flattening, type inference, dedup)
- Tested: messy CSV with missing data, dollar amounts, N/A values → queryable

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
97 lines
3.3 KiB
Rust
97 lines
3.3 KiB
Rust
mod auth;
|
|
mod observability;
|
|
|
|
use axum::{Router, extract::DefaultBodyLimit, routing::get};
|
|
use proto::lakehouse::catalog_service_server::CatalogServiceServer;
|
|
use shared::config::Config;
|
|
use tower_http::cors::{Any, CorsLayer};
|
|
use tower_http::trace::TraceLayer;
|
|
|
|
#[tokio::main]
|
|
async fn main() {
|
|
// Load config
|
|
let config = Config::load_or_default();
|
|
|
|
// Initialize tracing + observability
|
|
observability::init_tracing(
|
|
&config.observability.service_name,
|
|
&config.observability.exporter,
|
|
);
|
|
|
|
tracing::info!("config loaded: gateway={}:{}, storage={}",
|
|
config.gateway.host, config.gateway.port, config.storage.root);
|
|
|
|
// Storage backend
|
|
let store = storaged::backend::init_local(&config.storage.root);
|
|
|
|
// Catalog
|
|
let registry = catalogd::registry::Registry::new(store.clone());
|
|
if let Err(e) = registry.rebuild().await {
|
|
tracing::warn!("catalog rebuild failed (empty store?): {e}");
|
|
}
|
|
|
|
// Query engine
|
|
let engine = queryd::context::QueryEngine::new(registry.clone(), store.clone());
|
|
|
|
// AI sidecar client
|
|
let ai_client = aibridge::client::AiClient::new(&config.sidecar.url);
|
|
|
|
// HTTP router
|
|
let mut app = Router::new()
|
|
.route("/health", get(health))
|
|
.nest("/storage", storaged::service::router(store.clone()))
|
|
.nest("/catalog", catalogd::service::router(registry.clone()))
|
|
.nest("/query", queryd::service::router(engine))
|
|
.nest("/ai", aibridge::service::router(ai_client))
|
|
.nest("/ingest", ingestd::service::router(ingestd::service::IngestState {
|
|
store: store.clone(),
|
|
registry: registry.clone(),
|
|
}));
|
|
|
|
// Auth middleware (if enabled)
|
|
if config.auth.enabled {
|
|
if let Some(ref key) = config.auth.api_key {
|
|
tracing::info!("API key auth enabled");
|
|
let api_key = auth::ApiKey(key.clone());
|
|
app = app.layer(axum::Extension(api_key));
|
|
// Note: auth middleware applied per-route in production
|
|
// For now, the ApiKey extension is available for handlers to check
|
|
} else {
|
|
tracing::warn!("auth enabled but no api_key set — all requests allowed");
|
|
}
|
|
}
|
|
|
|
app = app
|
|
.layer(DefaultBodyLimit::max(256 * 1024 * 1024)) // 256MB
|
|
.layer(CorsLayer::new()
|
|
.allow_origin(Any)
|
|
.allow_methods(Any)
|
|
.allow_headers(Any))
|
|
.layer(TraceLayer::new_for_http());
|
|
|
|
// Start gRPC server on port+1
|
|
let grpc_port = config.gateway.port + 1;
|
|
let catalog_grpc = catalogd::grpc::CatalogGrpc::new(registry);
|
|
let grpc_addr = format!("{}:{}", config.gateway.host, grpc_port).parse().unwrap();
|
|
|
|
tokio::spawn(async move {
|
|
tracing::info!("gRPC server listening on {grpc_addr}");
|
|
tonic::transport::Server::builder()
|
|
.add_service(CatalogServiceServer::new(catalog_grpc))
|
|
.serve(grpc_addr)
|
|
.await
|
|
.expect("gRPC server failed");
|
|
});
|
|
|
|
// Start HTTP server
|
|
let http_addr = format!("{}:{}", config.gateway.host, config.gateway.port);
|
|
tracing::info!("HTTP gateway listening on {http_addr}");
|
|
|
|
let listener = tokio::net::TcpListener::bind(&http_addr).await.unwrap();
|
|
axum::serve(listener, app).await.unwrap();
|
|
}
|
|
|
|
/// Handler for the `/health` route: replies with a fixed plain-text body.
async fn health() -> &'static str {
    // Static response body; no state is consulted.
    const HEALTH_BODY: &str = "lakehouse ok";
    HEALTH_BODY
}
|