Phase 2: DataFusion query engine over Parquet

- queryd: SessionContext with custom URL scheme to avoid path doubling with LocalFileSystem
- queryd: ListingTable registration from catalog ObjectRefs with schema inference
- queryd: POST /query/sql returns JSON {columns, rows, row_count} (illustrative example below)
- queryd→catalogd wiring: reads all datasets, registers as named tables
- gateway: wires QueryEngine with shared store + registry
- e2e verified: SELECT *, WHERE/ORDER BY, COUNT/AVG all correct
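
Illustrative request/response shape for POST /query/sql (values taken from the tests/e2e_query.sh dataset; exact Arrow type names and float formatting may vary):

  request:  {"sql": "SELECT * FROM scores"}
  response: {
    "columns": [
      {"name": "id", "data_type": "Int64"},
      {"name": "name", "data_type": "Utf8"},
      {"name": "score", "data_type": "Float64"}
    ],
    "rows": [{"id": 1, "name": "alice", "score": 9.5}, ...],
    "row_count": 5
  }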

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
commit 19bdfab227 (parent 655b6c0b37)
root, 2026-03-27 05:48:20 -05:00
9 changed files with 1259 additions and 15 deletions

Cargo.lock (generated): diff suppressed because it is too large.

@ -27,3 +27,4 @@ datafusion = "47"
bytes = "1"
futures = "0.3"
sha2 = "0.10"
url = "2"


@ -19,11 +19,14 @@ async fn main() {
        tracing::warn!("catalog rebuild failed (empty store?): {e}");
    }

    // Query engine — DataFusion over catalog-registered Parquet
    let engine = queryd::context::QueryEngine::new(registry.clone(), store.clone());

    let app = Router::new()
        .route("/health", get(health))
        .nest("/storage", storaged::service::router(store))
        .nest("/catalog", catalogd::service::router(registry))
-       .nest("/query", queryd::service::router())
        .nest("/query", queryd::service::router(engine))
        .nest("/ai", aibridge::service::router())
        .layer(TraceLayer::new_for_http());


@ -5,8 +5,16 @@ edition = "2024"
[dependencies]
shared = { path = "../shared" }
catalogd = { path = "../catalogd" }
storaged = { path = "../storaged" }
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }
datafusion = { workspace = true }
object_store = { workspace = true }
arrow = { workspace = true }
bytes = { workspace = true }
futures = { workspace = true }
url = { workspace = true }


@ -0,0 +1,82 @@
use catalogd::registry::Registry;
use datafusion::datasource::listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl};
use datafusion::datasource::file_format::parquet::ParquetFormat;
use datafusion::prelude::*;
use object_store::ObjectStore;
use std::sync::Arc;
use url::Url;

const STORE_SCHEME: &str = "lakehouse";

/// Query engine that runs DataFusion over catalog-registered Parquet datasets.
#[derive(Clone)]
pub struct QueryEngine {
    registry: Registry,
    store: Arc<dyn ObjectStore>,
}

impl QueryEngine {
    pub fn new(registry: Registry, store: Arc<dyn ObjectStore>) -> Self {
        Self { registry, store }
    }

    /// Execute a SQL query. Resolves all catalog datasets as tables before execution.
    pub async fn query(&self, sql: &str) -> Result<Vec<arrow::array::RecordBatch>, String> {
        let ctx = self.build_context().await?;
        let df = ctx.sql(sql).await.map_err(|e| format!("SQL error: {e}"))?;
        let batches = df.collect().await.map_err(|e| format!("execution error: {e}"))?;
        Ok(batches)
    }

    /// Build a SessionContext with all catalog datasets registered as tables.
    async fn build_context(&self) -> Result<SessionContext, String> {
        let ctx = SessionContext::new();

        // Register the object store under a custom scheme to avoid path doubling.
        // The store already has the root prefix (e.g. ./data), so paths are relative keys.
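        // (e.g. a table URL like lakehouse://data/datasets/scores.parquet maps to
        // the object-store key "datasets/scores.parquet")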
        let base_url = Url::parse(&format!("{STORE_SCHEME}://data/"))
            .map_err(|e| format!("invalid store url: {e}"))?;
        ctx.runtime_env().register_object_store(&base_url, self.store.clone());

        // Register each catalog dataset as a table
        let datasets = self.registry.list().await;
        for dataset in &datasets {
            if dataset.objects.is_empty() {
                continue;
            }
            let opts = ListingOptions::new(Arc::new(ParquetFormat::default()));
            let table_paths: Vec<ListingTableUrl> = dataset.objects.iter()
                .filter_map(|o| {
                    let url_str = format!("{STORE_SCHEME}://data/{}", o.key);
                    ListingTableUrl::parse(&url_str).ok()
                })
                .collect();
            if table_paths.is_empty() {
                tracing::warn!("dataset {} has no valid paths, skipping", dataset.name);
                continue;
            }
            // Infer schema from the first file
            let schema = opts.infer_schema(&ctx.state(), &table_paths[0]).await
                .map_err(|e| format!("schema inference failed for {}: {e}", dataset.name))?;
            let config = ListingTableConfig::new_with_multi_paths(table_paths)
                .with_listing_options(opts)
                .with_schema(schema);
            let table = ListingTable::try_new(config)
                .map_err(|e| format!("table creation failed for {}: {e}", dataset.name))?;
            ctx.register_table(&dataset.name, Arc::new(table))
                .map_err(|e| format!("table registration failed for {}: {e}", dataset.name))?;
            tracing::debug!("registered table: {}", dataset.name);
        }

        Ok(ctx)
    }
}


@ -1 +1,2 @@
pub mod context;
pub mod service;


@ -1,9 +1,89 @@
-use axum::{Router, routing::get};
use arrow::array::RecordBatch;
use arrow::json::writer::{JsonArray, Writer as JsonWriter};
use axum::{
    Json, Router,
    extract::State,
    http::StatusCode,
    response::IntoResponse,
    routing::{get, post},
};
use serde::{Deserialize, Serialize};

-pub fn router() -> Router {
-    Router::new().route("/health", get(health))
use crate::context::QueryEngine;

pub fn router(engine: QueryEngine) -> Router {
    Router::new()
        .route("/health", get(health))
        .route("/sql", post(execute_query))
        .with_state(engine)
}

async fn health() -> &'static str {
    "queryd ok"
}

#[derive(Deserialize)]
struct QueryRequest {
    sql: String,
}

#[derive(Serialize)]
struct QueryResponse {
    columns: Vec<ColumnInfo>,
    rows: serde_json::Value,
    row_count: usize,
}

#[derive(Serialize)]
struct ColumnInfo {
    name: String,
    data_type: String,
}

fn batches_to_json(batches: &[RecordBatch]) -> Result<serde_json::Value, String> {
    let mut buf = Vec::new();
    let mut writer = JsonWriter::<_, JsonArray>::new(&mut buf);
    for batch in batches {
        writer.write(batch).map_err(|e| format!("JSON write error: {e}"))?;
    }
    writer.finish().map_err(|e| format!("JSON finish error: {e}"))?;
    drop(writer);
    serde_json::from_slice(&buf).map_err(|e| format!("JSON parse error: {e}"))
}

async fn execute_query(
    State(engine): State<QueryEngine>,
    Json(req): Json<QueryRequest>,
) -> impl IntoResponse {
    tracing::info!("executing query: {}", req.sql);
    match engine.query(&req.sql).await {
        Ok(batches) => {
            if batches.is_empty() {
                return Ok(Json(QueryResponse {
                    columns: vec![],
                    rows: serde_json::Value::Array(vec![]),
                    row_count: 0,
                }));
            }
            let schema = batches[0].schema();
            let columns: Vec<ColumnInfo> = schema.fields().iter().map(|f| ColumnInfo {
                name: f.name().clone(),
                data_type: f.data_type().to_string(),
            }).collect();
            let rows = batches_to_json(&batches)
                .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?;
            let row_count = rows.as_array().map(|a| a.len()).unwrap_or(0);
            Ok(Json(QueryResponse {
                columns,
                rows,
                row_count,
            }))
        }
        Err(e) => Err((StatusCode::BAD_REQUEST, e)),
    }
}


@ -22,13 +22,13 @@
**Gate: PASSED** — PUT object → register dataset → list → get by name. All via gateway HTTP.
## Phase 2: Query Engine
- [ ] 2.1 — queryd: SessionContext + object_store config
- [ ] 2.2 — queryd: ListingTable from catalog ObjectRefs
- [ ] 2.3 — queryd service: POST /query → Arrow IPC or JSON
- [ ] 2.4 — queryd → catalogd wiring
- [ ] 2.5 — gateway routes /query
- [x] 2.1 — queryd: SessionContext + object_store config (custom scheme to avoid path doubling)
- [x] 2.2 — queryd: ListingTable from catalog ObjectRefs with schema inference
- [x] 2.3 — queryd service: POST /query/sql → JSON (columns + rows + row_count)
- [x] 2.4 — queryd → catalogd wiring (reads dataset list, registers as tables)
- [x] 2.5 — gateway routes /query with QueryEngine state
**Gate:** SQL over Parquet returns correct results via catalog resolution.
**Gate: PASSED** — SELECT *, WHERE/ORDER BY, COUNT/AVG all return correct results via catalog.
## Phase 3: AI Integration
- [ ] 3.1 — Python sidecar: FastAPI + Ollama (embed/generate/rerank)

tests/e2e_query.sh (new file)

@ -0,0 +1,57 @@
#!/usr/bin/env bash
# End-to-end test: upload Parquet → register dataset → SQL query
set -e
BASE="http://localhost:3100"
echo "=== Generate test Parquet file ==="
python3 -c "
import struct, io
# Minimal Parquet via pyarrow if available, else skip
try:
    import pyarrow as pa
    import pyarrow.parquet as pq
    table = pa.table({
        'id': [1, 2, 3, 4, 5],
        'name': ['alice', 'bob', 'carol', 'dave', 'eve'],
        'score': [9.5, 8.2, 7.8, 6.1, 9.9],
    })
    pq.write_table(table, '/tmp/test_data.parquet')
    print('generated with pyarrow')
except ImportError:
    print('pyarrow not available, generating via rust helper')
    exit(1)
"
echo "=== Upload Parquet to storage ==="
curl -s -X PUT "$BASE/storage/objects/datasets/scores.parquet" \
--data-binary @/tmp/test_data.parquet
echo ""
echo "=== Register dataset in catalog ==="
SIZE=$(stat -c%s /tmp/test_data.parquet)
curl -s -X POST "$BASE/catalog/datasets" \
-H "Content-Type: application/json" \
-d "{\"name\":\"scores\",\"schema_fingerprint\":\"test\",\"objects\":[{\"bucket\":\"data\",\"key\":\"datasets/scores.parquet\",\"size_bytes\":$SIZE}]}" | python3 -m json.tool
echo ""
echo "=== SQL: SELECT * FROM scores ==="
curl -s -X POST "$BASE/query/sql" \
-H "Content-Type: application/json" \
-d '{"sql":"SELECT * FROM scores"}' | python3 -m json.tool
echo ""
echo "=== SQL: SELECT name, score FROM scores WHERE score > 8.0 ORDER BY score DESC ==="
curl -s -X POST "$BASE/query/sql" \
-H "Content-Type: application/json" \
-d '{"sql":"SELECT name, score FROM scores WHERE score > 8.0 ORDER BY score DESC"}' | python3 -m json.tool
echo ""
echo "=== SQL: SELECT COUNT(*), AVG(score) FROM scores ==="
curl -s -X POST "$BASE/query/sql" \
-H "Content-Type: application/json" \
-d '{"sql":"SELECT COUNT(*) as cnt, AVG(score) as avg_score FROM scores"}' | python3 -m json.tool
echo ""
echo "=== DONE ==="