lakehouse/crates/queryd/src/context.rs
root 19bdfab227 Phase 2: DataFusion query engine over Parquet
- queryd: SessionContext with custom URL scheme to avoid path doubling with LocalFileSystem
- queryd: ListingTable registration from catalog ObjectRefs with schema inference
- queryd: POST /query/sql returns JSON {columns, rows, row_count}
- queryd→catalogd wiring: reads all datasets, registers as named tables
- gateway: wires QueryEngine with shared store + registry
- e2e verified: SELECT *, WHERE/ORDER BY, COUNT/AVG all correct

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 05:48:20 -05:00

use catalogd::registry::Registry;
use datafusion::datasource::file_format::parquet::ParquetFormat;
use datafusion::datasource::listing::{
    ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
};
use datafusion::prelude::*;
use object_store::ObjectStore;
use std::sync::Arc;
use url::Url;
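
/// URL scheme under which the object store is registered.
///
/// The store handed to [`QueryEngine`] is already rooted at its data
/// prefix (e.g. `./data`); registering it under `file://` would hand it
/// paths that still carry that prefix, so `LocalFileSystem` would apply
/// the prefix twice. A private scheme keeps table URLs as bare object
/// keys that resolve relative to the registered store.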
const STORE_SCHEME: &str = "lakehouse";

/// Query engine that runs DataFusion over catalog-registered Parquet datasets.
#[derive(Clone)]
pub struct QueryEngine {
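    /// Catalog registry used to enumerate datasets at query time.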
registry: Registry,
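    /// Object store holding the Parquet objects, rooted at the data prefix.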
store: Arc<dyn ObjectStore>,
}

impl QueryEngine {
pub fn new(registry: Registry, store: Arc<dyn ObjectStore>) -> Self {
Self { registry, store }
    }

/// Execute a SQL query. Resolves all catalog datasets as tables before execution.
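    ///
    /// # Example
    ///
    /// A minimal usage sketch, assuming `context` is a public module of the
    /// `queryd` crate and that a dataset has been registered in the catalog
    /// under the (hypothetical) name `trips`:
    ///
    /// ```no_run
    /// # async fn demo(engine: queryd::context::QueryEngine) -> Result<(), String> {
    /// let batches = engine.query("SELECT COUNT(*) AS n FROM trips").await?;
    /// println!("{} record batches", batches.len());
    /// # Ok(())
    /// # }
    /// ```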
pub async fn query(&self, sql: &str) -> Result<Vec<arrow::array::RecordBatch>, String> {
let ctx = self.build_context().await?;
let df = ctx.sql(sql).await.map_err(|e| format!("SQL error: {e}"))?;
let batches = df.collect().await.map_err(|e| format!("execution error: {e}"))?;
Ok(batches)
    }

/// Build a SessionContext with all catalog datasets registered as tables.
async fn build_context(&self) -> Result<SessionContext, String> {
let ctx = SessionContext::new();
// Register the object store under a custom scheme to avoid path doubling.
// The store already has the root prefix (e.g. ./data), so paths are relative keys.
let base_url = Url::parse(&format!("{STORE_SCHEME}://data/"))
.map_err(|e| format!("invalid store url: {e}"))?;
ctx.runtime_env().register_object_store(&base_url, self.store.clone());
// Register each catalog dataset as a table
let datasets = self.registry.list().await;
for dataset in &datasets {
if dataset.objects.is_empty() {
continue;
}
let opts = ListingOptions::new(Arc::new(ParquetFormat::default()));
let table_paths: Vec<ListingTableUrl> = dataset.objects.iter()
.filter_map(|o| {
let url_str = format!("{STORE_SCHEME}://data/{}", o.key);
ListingTableUrl::parse(&url_str).ok()
})
.collect();
if table_paths.is_empty() {
tracing::warn!("dataset {} has no valid paths, skipping", dataset.name);
continue;
}
            // Infer the schema from the first path; the remaining objects are assumed to share it.
let schema = opts.infer_schema(&ctx.state(), &table_paths[0]).await
.map_err(|e| format!("schema inference failed for {}: {e}", dataset.name))?;
let config = ListingTableConfig::new_with_multi_paths(table_paths)
.with_listing_options(opts)
.with_schema(schema);
let table = ListingTable::try_new(config)
.map_err(|e| format!("table creation failed for {}: {e}", dataset.name))?;
            ctx.register_table(dataset.name.as_str(), Arc::new(table))
.map_err(|e| format!("table registration failed for {}: {e}", dataset.name))?;
tracing::debug!("registered table: {}", dataset.name);
}
Ok(ctx)
}
}
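
The commit message says the HTTP side of this engine (POST /query/sql in the gateway) returns JSON shaped as {columns, rows, row_count}. Below is a minimal sketch of how a handler might flatten collected batches into that shape; `batches_to_json` is a hypothetical helper, `serde_json` is an assumed dependency, and values are rendered as strings via arrow's display utility, whereas the gateway's actual serialization may preserve native JSON types.

use arrow::array::RecordBatch;
use arrow::util::display::array_value_to_string;

/// Sketch: flatten record batches into a `{columns, rows, row_count}`
/// JSON value. Each row becomes an array of stringified cell values.
fn batches_to_json(batches: &[RecordBatch]) -> serde_json::Value {
    // Column names come from the first batch; an empty result has no columns.
    let columns: Vec<String> = batches
        .first()
        .map(|b| b.schema().fields().iter().map(|f| f.name().clone()).collect())
        .unwrap_or_default();
    let mut rows = Vec::new();
    for batch in batches {
        for row in 0..batch.num_rows() {
            let values = (0..batch.num_columns())
                .map(|col| {
                    // array_value_to_string renders any Arrow value as text.
                    array_value_to_string(batch.column(col), row)
                        .map(serde_json::Value::String)
                        .unwrap_or(serde_json::Value::Null)
                })
                .collect::<Vec<_>>();
            rows.push(serde_json::Value::Array(values));
        }
    }
    // Capture the count before `rows` is moved into the JSON object.
    let row_count = rows.len();
    serde_json::json!({ "columns": columns, "rows": rows, "row_count": row_count })
}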