Phase 2: DataFusion query engine over Parquet

- queryd: SessionContext with custom URL scheme to avoid path doubling with LocalFileSystem
- queryd: ListingTable registration from catalog ObjectRefs with schema inference
- queryd: POST /query/sql returns JSON {columns, rows, row_count} (illustrative example below)
- queryd→catalogd wiring: reads all datasets, registers as named tables
- gateway: wires QueryEngine with shared store + registry
- e2e verified: SELECT *, WHERE/ORDER BY, COUNT/AVG all correct
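
Illustrative request/response shape for POST /query/sql (values taken from the tests/e2e_query.sh dataset; exact Arrow type names and float formatting may vary):

  request:  {"sql": "SELECT * FROM scores"}
  response: {
    "columns": [
      {"name": "id", "data_type": "Int64"},
      {"name": "name", "data_type": "Utf8"},
      {"name": "score", "data_type": "Float64"}
    ],
    "rows": [{"id": 1, "name": "alice", "score": 9.5}, ...],
    "row_count": 5
  }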

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
commit 19bdfab227 (parent 655b6c0b37)
root, 2026-03-27 05:48:20 -05:00
9 changed files with 1259 additions and 15 deletions

Cargo.lock (generated): diff suppressed because it is too large.

@ -27,3 +27,4 @@ datafusion = "47"
bytes = "1"
futures = "0.3"
sha2 = "0.10"
url = "2"


@ -19,11 +19,14 @@ async fn main() {
        tracing::warn!("catalog rebuild failed (empty store?): {e}");
    }

    // Query engine — DataFusion over catalog-registered Parquet
    let engine = queryd::context::QueryEngine::new(registry.clone(), store.clone());

    let app = Router::new()
        .route("/health", get(health))
        .nest("/storage", storaged::service::router(store))
        .nest("/catalog", catalogd::service::router(registry))
-       .nest("/query", queryd::service::router())
        .nest("/query", queryd::service::router(engine))
        .nest("/ai", aibridge::service::router())
        .layer(TraceLayer::new_for_http());


@ -5,8 +5,16 @@ edition = "2024"
[dependencies]
shared = { path = "../shared" }
catalogd = { path = "../catalogd" }
storaged = { path = "../storaged" }
tokio = { workspace = true }
axum = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }
datafusion = { workspace = true }
object_store = { workspace = true }
arrow = { workspace = true }
bytes = { workspace = true }
futures = { workspace = true }
url = { workspace = true }


@ -0,0 +1,82 @@
use catalogd::registry::Registry;
use datafusion::datasource::listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl};
use datafusion::datasource::file_format::parquet::ParquetFormat;
use datafusion::prelude::*;
use object_store::ObjectStore;
use std::sync::Arc;
use url::Url;

const STORE_SCHEME: &str = "lakehouse";

/// Query engine that runs DataFusion over catalog-registered Parquet datasets.
#[derive(Clone)]
pub struct QueryEngine {
    registry: Registry,
    store: Arc<dyn ObjectStore>,
}

impl QueryEngine {
    pub fn new(registry: Registry, store: Arc<dyn ObjectStore>) -> Self {
        Self { registry, store }
    }

    /// Execute a SQL query. Resolves all catalog datasets as tables before execution.
    pub async fn query(&self, sql: &str) -> Result<Vec<arrow::array::RecordBatch>, String> {
        let ctx = self.build_context().await?;
        let df = ctx.sql(sql).await.map_err(|e| format!("SQL error: {e}"))?;
        let batches = df.collect().await.map_err(|e| format!("execution error: {e}"))?;
        Ok(batches)
    }

    /// Build a SessionContext with all catalog datasets registered as tables.
    async fn build_context(&self) -> Result<SessionContext, String> {
        let ctx = SessionContext::new();

        // Register the object store under a custom scheme to avoid path doubling.
        // The store already has the root prefix (e.g. ./data), so paths are relative keys.
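        // (e.g. a table URL like lakehouse://data/datasets/scores.parquet maps to
        // the object-store key "datasets/scores.parquet")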
        let base_url = Url::parse(&format!("{STORE_SCHEME}://data/"))
            .map_err(|e| format!("invalid store url: {e}"))?;
        ctx.runtime_env().register_object_store(&base_url, self.store.clone());

        // Register each catalog dataset as a table
        let datasets = self.registry.list().await;
        for dataset in &datasets {
            if dataset.objects.is_empty() {
                continue;
            }
            let opts = ListingOptions::new(Arc::new(ParquetFormat::default()));
            let table_paths: Vec<ListingTableUrl> = dataset.objects.iter()
                .filter_map(|o| {
                    let url_str = format!("{STORE_SCHEME}://data/{}", o.key);
                    ListingTableUrl::parse(&url_str).ok()
                })
                .collect();
            if table_paths.is_empty() {
                tracing::warn!("dataset {} has no valid paths, skipping", dataset.name);
                continue;
            }
            // Infer schema from the first file
            let schema = opts.infer_schema(&ctx.state(), &table_paths[0]).await
                .map_err(|e| format!("schema inference failed for {}: {e}", dataset.name))?;
            let config = ListingTableConfig::new_with_multi_paths(table_paths)
                .with_listing_options(opts)
                .with_schema(schema);
            let table = ListingTable::try_new(config)
                .map_err(|e| format!("table creation failed for {}: {e}", dataset.name))?;
            ctx.register_table(&dataset.name, Arc::new(table))
                .map_err(|e| format!("table registration failed for {}: {e}", dataset.name))?;
            tracing::debug!("registered table: {}", dataset.name);
        }

        Ok(ctx)
    }
}


@ -1 +1,2 @@
pub mod context;
pub mod service;


@ -1,9 +1,89 @@
-use axum::{Router, routing::get};
use arrow::array::RecordBatch;
use arrow::json::writer::{JsonArray, Writer as JsonWriter};
use axum::{
    Json, Router,
    extract::State,
    http::StatusCode,
    response::IntoResponse,
    routing::{get, post},
};
use serde::{Deserialize, Serialize};

-pub fn router() -> Router {
-    Router::new().route("/health", get(health))
use crate::context::QueryEngine;

pub fn router(engine: QueryEngine) -> Router {
    Router::new()
        .route("/health", get(health))
        .route("/sql", post(execute_query))
        .with_state(engine)
}

async fn health() -> &'static str {
    "queryd ok"
}

#[derive(Deserialize)]
struct QueryRequest {
    sql: String,
}

#[derive(Serialize)]
struct QueryResponse {
    columns: Vec<ColumnInfo>,
    rows: serde_json::Value,
    row_count: usize,
}

#[derive(Serialize)]
struct ColumnInfo {
    name: String,
    data_type: String,
}

fn batches_to_json(batches: &[RecordBatch]) -> Result<serde_json::Value, String> {
    let mut buf = Vec::new();
    let mut writer = JsonWriter::<_, JsonArray>::new(&mut buf);
    for batch in batches {
        writer.write(batch).map_err(|e| format!("JSON write error: {e}"))?;
    }
    writer.finish().map_err(|e| format!("JSON finish error: {e}"))?;
    drop(writer);
    serde_json::from_slice(&buf).map_err(|e| format!("JSON parse error: {e}"))
}

async fn execute_query(
    State(engine): State<QueryEngine>,
    Json(req): Json<QueryRequest>,
) -> impl IntoResponse {
    tracing::info!("executing query: {}", req.sql);
    match engine.query(&req.sql).await {
        Ok(batches) => {
            if batches.is_empty() {
                return Ok(Json(QueryResponse {
                    columns: vec![],
                    rows: serde_json::Value::Array(vec![]),
                    row_count: 0,
                }));
            }
            let schema = batches[0].schema();
            let columns: Vec<ColumnInfo> = schema.fields().iter().map(|f| ColumnInfo {
                name: f.name().clone(),
                data_type: f.data_type().to_string(),
            }).collect();
            let rows = batches_to_json(&batches)
                .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?;
            let row_count = rows.as_array().map(|a| a.len()).unwrap_or(0);
            Ok(Json(QueryResponse {
                columns,
                rows,
                row_count,
            }))
        }
        Err(e) => Err((StatusCode::BAD_REQUEST, e)),
    }
}


@ -22,13 +22,13 @@
**Gate: PASSED** — PUT object → register dataset → list → get by name. All via gateway HTTP.
## Phase 2: Query Engine
- [ ] 2.1 — queryd: SessionContext + object_store config
- [ ] 2.2 — queryd: ListingTable from catalog ObjectRefs
- [ ] 2.3 — queryd service: POST /query → Arrow IPC or JSON
- [ ] 2.4 — queryd → catalogd wiring
- [ ] 2.5 — gateway routes /query
- [x] 2.1 — queryd: SessionContext + object_store config (custom scheme to avoid path doubling)
- [x] 2.2 — queryd: ListingTable from catalog ObjectRefs with schema inference
- [x] 2.3 — queryd service: POST /query/sql → JSON (columns + rows + row_count)
- [x] 2.4 — queryd → catalogd wiring (reads dataset list, registers as tables)
- [x] 2.5 — gateway routes /query with QueryEngine state
**Gate:** SQL over Parquet returns correct results via catalog resolution.
**Gate: PASSED** — SELECT *, WHERE/ORDER BY, COUNT/AVG all return correct results via catalog.
## Phase 3: AI Integration
- [ ] 3.1 — Python sidecar: FastAPI + Ollama (embed/generate/rerank)

tests/e2e_query.sh (new file)

@ -0,0 +1,57 @@
#!/usr/bin/env bash
# End-to-end test: upload Parquet → register dataset → SQL query
set -e
BASE="http://localhost:3100"
echo "=== Generate test Parquet file ==="
python3 -c "
import struct, io
# Minimal Parquet via pyarrow if available, else skip
try:
    import pyarrow as pa
    import pyarrow.parquet as pq
    table = pa.table({
        'id': [1, 2, 3, 4, 5],
        'name': ['alice', 'bob', 'carol', 'dave', 'eve'],
        'score': [9.5, 8.2, 7.8, 6.1, 9.9],
    })
    pq.write_table(table, '/tmp/test_data.parquet')
    print('generated with pyarrow')
except ImportError:
    print('pyarrow not available, generating via rust helper')
    exit(1)
"
echo "=== Upload Parquet to storage ==="
curl -s -X PUT "$BASE/storage/objects/datasets/scores.parquet" \
--data-binary @/tmp/test_data.parquet
echo ""
echo "=== Register dataset in catalog ==="
SIZE=$(stat -c%s /tmp/test_data.parquet)
curl -s -X POST "$BASE/catalog/datasets" \
-H "Content-Type: application/json" \
-d "{\"name\":\"scores\",\"schema_fingerprint\":\"test\",\"objects\":[{\"bucket\":\"data\",\"key\":\"datasets/scores.parquet\",\"size_bytes\":$SIZE}]}" | python3 -m json.tool
echo ""
echo "=== SQL: SELECT * FROM scores ==="
curl -s -X POST "$BASE/query/sql" \
-H "Content-Type: application/json" \
-d '{"sql":"SELECT * FROM scores"}' | python3 -m json.tool
echo ""
echo "=== SQL: SELECT name, score FROM scores WHERE score > 8.0 ORDER BY score DESC ==="
curl -s -X POST "$BASE/query/sql" \
-H "Content-Type: application/json" \
-d '{"sql":"SELECT name, score FROM scores WHERE score > 8.0 ORDER BY score DESC"}' | python3 -m json.tool
echo ""
echo "=== SQL: SELECT COUNT(*), AVG(score) FROM scores ==="
curl -s -X POST "$BASE/query/sql" \
-H "Content-Type: application/json" \
-d '{"sql":"SELECT COUNT(*) as cnt, AVG(score) as avg_score FROM scores"}' | python3 -m json.tool
echo ""
echo "=== DONE ==="