root bb05c4412e Phase 6: Ingest pipeline — CSV, JSON, PDF, text file support
- ingestd crate: detect file type → parse → schema detection → Parquet → catalog
- CSV: auto-detect column types (int, float, bool, string), handles $, %, commas
  Strips dollar signs from amounts, flexible row parsing, sanitized column names
- JSON: array or newline-delimited, nested object flattening (a.b.c → a_b_c)
- PDF: text extraction via lopdf, one row per page (source_file, page_number, text)
- Text/SMS: line-based ingestion with line numbers
- Dedup: SHA-256 content hash, re-ingest same file = no-op
- Gateway: POST /ingest/file multipart upload, 256MB body limit
- Schema detection per ADR-010: ambiguous types default to String
- 12 unit tests passing (CSV parsing, JSON flattening, type inference, dedup)
- Tested: messy CSV with missing data, dollar amounts, N/A values → queryable

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 08:07:31 -05:00

97 lines
3.3 KiB
Rust

mod auth;
mod observability;
use axum::{Router, extract::DefaultBodyLimit, routing::get};
use proto::lakehouse::catalog_service_server::CatalogServiceServer;
use shared::config::Config;
use tower_http::cors::{Any, CorsLayer};
use tower_http::trace::TraceLayer;
#[tokio::main]
async fn main() {
// Load config
let config = Config::load_or_default();
// Initialize tracing + observability
observability::init_tracing(
&config.observability.service_name,
&config.observability.exporter,
);
tracing::info!("config loaded: gateway={}:{}, storage={}",
config.gateway.host, config.gateway.port, config.storage.root);
// Storage backend
let store = storaged::backend::init_local(&config.storage.root);
// Catalog
let registry = catalogd::registry::Registry::new(store.clone());
if let Err(e) = registry.rebuild().await {
tracing::warn!("catalog rebuild failed (empty store?): {e}");
}
// Query engine
let engine = queryd::context::QueryEngine::new(registry.clone(), store.clone());
// AI sidecar client
let ai_client = aibridge::client::AiClient::new(&config.sidecar.url);
// HTTP router
let mut app = Router::new()
.route("/health", get(health))
.nest("/storage", storaged::service::router(store.clone()))
.nest("/catalog", catalogd::service::router(registry.clone()))
.nest("/query", queryd::service::router(engine))
.nest("/ai", aibridge::service::router(ai_client))
.nest("/ingest", ingestd::service::router(ingestd::service::IngestState {
store: store.clone(),
registry: registry.clone(),
}));
// Auth middleware (if enabled)
if config.auth.enabled {
if let Some(ref key) = config.auth.api_key {
tracing::info!("API key auth enabled");
let api_key = auth::ApiKey(key.clone());
app = app.layer(axum::Extension(api_key));
// Note: auth middleware applied per-route in production
// For now, the ApiKey extension is available for handlers to check
} else {
tracing::warn!("auth enabled but no api_key set — all requests allowed");
}
}
app = app
.layer(DefaultBodyLimit::max(256 * 1024 * 1024)) // 256MB
.layer(CorsLayer::new()
.allow_origin(Any)
.allow_methods(Any)
.allow_headers(Any))
.layer(TraceLayer::new_for_http());
// Start gRPC server on port+1
let grpc_port = config.gateway.port + 1;
let catalog_grpc = catalogd::grpc::CatalogGrpc::new(registry);
let grpc_addr = format!("{}:{}", config.gateway.host, grpc_port).parse().unwrap();
tokio::spawn(async move {
tracing::info!("gRPC server listening on {grpc_addr}");
tonic::transport::Server::builder()
.add_service(CatalogServiceServer::new(catalog_grpc))
.serve(grpc_addr)
.await
.expect("gRPC server failed");
});
// Start HTTP server
let http_addr = format!("{}:{}", config.gateway.host, config.gateway.port);
tracing::info!("HTTP gateway listening on {http_addr}");
let listener = tokio::net::TcpListener::bind(&http_addr).await.unwrap();
axum::serve(listener, app).await.unwrap();
}
/// Liveness probe handler for `GET /health`.
///
/// Returns a static banner so load balancers and humans can confirm the
/// gateway is up without touching storage or the catalog.
async fn health() -> &'static str {
    const BANNER: &str = "lakehouse ok";
    BANNER
}