Phase 2: DataFusion query engine over Parquet
- queryd: SessionContext with custom URL scheme to avoid path doubling with LocalFileSystem
- queryd: ListingTable registration from catalog ObjectRefs with schema inference
- queryd: POST /query/sql returns JSON {columns, rows, row_count} (example response below)
- queryd→catalogd wiring: reads all datasets, registers as named tables
- gateway: wires QueryEngine with shared store + registry
- e2e verified: SELECT *, WHERE/ORDER BY, COUNT/AVG all correct
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
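
For illustration, the /query/sql response shape named above (hypothetical, abridged payload; column types follow the e2e dataset added in this commit):

    POST /query/sql   {"sql": "SELECT * FROM scores"}

    {
      "columns": [
        {"name": "id", "data_type": "Int64"},
        {"name": "name", "data_type": "Utf8"},
        {"name": "score", "data_type": "Float64"}
      ],
      "rows": [{"id": 1, "name": "alice", "score": 9.5}, ...],
      "row_count": 5
    }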
parent 655b6c0b37
commit 19bdfab227

Cargo.lock (generated): 1022 lines changed; diff suppressed because it is too large.
@@ -27,3 +27,4 @@ datafusion = "47"
 bytes = "1"
 futures = "0.3"
 sha2 = "0.10"
+url = "2"
@@ -19,11 +19,14 @@ async fn main() {
         tracing::warn!("catalog rebuild failed (empty store?): {e}");
     }

+    // Query engine — DataFusion over catalog-registered Parquet
+    let engine = queryd::context::QueryEngine::new(registry.clone(), store.clone());
+
     let app = Router::new()
         .route("/health", get(health))
         .nest("/storage", storaged::service::router(store))
         .nest("/catalog", catalogd::service::router(registry))
-        .nest("/query", queryd::service::router())
+        .nest("/query", queryd::service::router(engine))
         .nest("/ai", aibridge::service::router())
         .layer(TraceLayer::new_for_http());

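With this wiring, the gateway's route surface (read directly off the router above) is:

    GET  /health
    *    /storage/...  → storaged
    *    /catalog/...  → catalogd
    POST /query/sql    → queryd (new in this commit)
    *    /ai/...       → aibridge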
@@ -5,8 +5,16 @@ edition = "2024"

 [dependencies]
 shared = { path = "../shared" }
+catalogd = { path = "../catalogd" }
+storaged = { path = "../storaged" }
 tokio = { workspace = true }
 axum = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
 tracing = { workspace = true }
+datafusion = { workspace = true }
+object_store = { workspace = true }
+arrow = { workspace = true }
+bytes = { workspace = true }
+futures = { workspace = true }
+url = { workspace = true }
crates/queryd/src/context.rs (new file, 82 lines)
@@ -0,0 +1,82 @@
+use catalogd::registry::Registry;
+use datafusion::datasource::listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl};
+use datafusion::datasource::file_format::parquet::ParquetFormat;
+use datafusion::prelude::*;
+use object_store::ObjectStore;
+use std::sync::Arc;
+use url::Url;
+
+const STORE_SCHEME: &str = "lakehouse";
+
+/// Query engine that runs DataFusion over catalog-registered Parquet datasets.
+#[derive(Clone)]
+pub struct QueryEngine {
+    registry: Registry,
+    store: Arc<dyn ObjectStore>,
+}
+
+impl QueryEngine {
+    pub fn new(registry: Registry, store: Arc<dyn ObjectStore>) -> Self {
+        Self { registry, store }
+    }
+
+    /// Execute a SQL query. Resolves all catalog datasets as tables before execution.
+    pub async fn query(&self, sql: &str) -> Result<Vec<arrow::array::RecordBatch>, String> {
+        let ctx = self.build_context().await?;
+
+        let df = ctx.sql(sql).await.map_err(|e| format!("SQL error: {e}"))?;
+        let batches = df.collect().await.map_err(|e| format!("execution error: {e}"))?;
+        Ok(batches)
+    }
+
+    /// Build a SessionContext with all catalog datasets registered as tables.
+    async fn build_context(&self) -> Result<SessionContext, String> {
+        let ctx = SessionContext::new();
+
+        // Register the object store under a custom scheme to avoid path doubling.
+        // The store already has the root prefix (e.g. ./data), so paths are relative keys.
+        let base_url = Url::parse(&format!("{STORE_SCHEME}://data/"))
+            .map_err(|e| format!("invalid store url: {e}"))?;
+        ctx.runtime_env().register_object_store(&base_url, self.store.clone());
+
+        // Register each catalog dataset as a table
+        let datasets = self.registry.list().await;
+        for dataset in &datasets {
+            if dataset.objects.is_empty() {
+                continue;
+            }
+
+            let opts = ListingOptions::new(Arc::new(ParquetFormat::default()));
+
+            let table_paths: Vec<ListingTableUrl> = dataset.objects.iter()
+                .filter_map(|o| {
+                    let url_str = format!("{STORE_SCHEME}://data/{}", o.key);
+                    ListingTableUrl::parse(&url_str).ok()
+                })
+                .collect();
+
+            if table_paths.is_empty() {
+                tracing::warn!("dataset {} has no valid paths, skipping", dataset.name);
+                continue;
+            }
+
+            // Infer schema from the first file
+            let schema = opts.infer_schema(&ctx.state(), &table_paths[0]).await
+                .map_err(|e| format!("schema inference failed for {}: {e}", dataset.name))?;
+
+            let config = ListingTableConfig::new_with_multi_paths(table_paths)
+                .with_listing_options(opts)
+                .with_schema(schema);
+
+            let table = ListingTable::try_new(config)
+                .map_err(|e| format!("table creation failed for {}: {e}", dataset.name))?;
+
+            ctx.register_table(&dataset.name, Arc::new(table))
+                .map_err(|e| format!("table registration failed for {}: {e}", dataset.name))?;
+
+            tracing::debug!("registered table: {}", dataset.name);
+        }
+
+        Ok(ctx)
+    }
+}
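On the "path doubling" the commit message mentions: the store handed to QueryEngine is already rooted at the data directory, so a file:// registration would resolve each key against the on-disk path a second time; a private scheme keeps catalog keys relative to the store root. A minimal sketch of the resolution, assuming a hypothetical key (scheme and host exactly as in context.rs above):

    // Sketch: in lakehouse://data/<key>, scheme+host select the registered store;
    // the URL path is handed to the ObjectStore unchanged as the object key.
    use url::Url;

    fn table_url(key: &str) -> Url {
        // key = "datasets/scores.parquet" (hypothetical)
        //   -> lakehouse://data/datasets/scores.parquet
        Url::parse(&format!("lakehouse://data/{key}")).expect("store keys form valid URLs")
    }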
@@ -1 +1,2 @@
+pub mod context;
 pub mod service;
@@ -1,9 +1,89 @@
-use axum::{Router, routing::get};
+use arrow::array::RecordBatch;
+use arrow::json::writer::{JsonArray, Writer as JsonWriter};
+use axum::{
+    Json, Router,
+    extract::State,
+    http::StatusCode,
+    response::IntoResponse,
+    routing::{get, post},
+};
+use serde::{Deserialize, Serialize};

-pub fn router() -> Router {
-    Router::new().route("/health", get(health))
+use crate::context::QueryEngine;
+
+pub fn router(engine: QueryEngine) -> Router {
+    Router::new()
+        .route("/health", get(health))
+        .route("/sql", post(execute_query))
+        .with_state(engine)
 }

 async fn health() -> &'static str {
     "queryd ok"
 }
+
+#[derive(Deserialize)]
+struct QueryRequest {
+    sql: String,
+}
+
+#[derive(Serialize)]
+struct QueryResponse {
+    columns: Vec<ColumnInfo>,
+    rows: serde_json::Value,
+    row_count: usize,
+}
+
+#[derive(Serialize)]
+struct ColumnInfo {
+    name: String,
+    data_type: String,
+}
+
+fn batches_to_json(batches: &[RecordBatch]) -> Result<serde_json::Value, String> {
+    let mut buf = Vec::new();
+    let mut writer = JsonWriter::<_, JsonArray>::new(&mut buf);
+    for batch in batches {
+        writer.write(batch).map_err(|e| format!("JSON write error: {e}"))?;
+    }
+    writer.finish().map_err(|e| format!("JSON finish error: {e}"))?;
+    drop(writer);
+    serde_json::from_slice(&buf).map_err(|e| format!("JSON parse error: {e}"))
+}
+
+async fn execute_query(
+    State(engine): State<QueryEngine>,
+    Json(req): Json<QueryRequest>,
+) -> impl IntoResponse {
+    tracing::info!("executing query: {}", req.sql);
+
+    match engine.query(&req.sql).await {
+        Ok(batches) => {
+            if batches.is_empty() {
+                return Ok(Json(QueryResponse {
+                    columns: vec![],
+                    rows: serde_json::Value::Array(vec![]),
+                    row_count: 0,
+                }));
+            }
+
+            let schema = batches[0].schema();
+            let columns: Vec<ColumnInfo> = schema.fields().iter().map(|f| ColumnInfo {
+                name: f.name().clone(),
+                data_type: f.data_type().to_string(),
+            }).collect();
+
+            let rows = batches_to_json(&batches)
+                .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?;
+
+            let row_count = rows.as_array().map(|a| a.len()).unwrap_or(0);
+
+            Ok(Json(QueryResponse {
+                columns,
+                rows,
+                row_count,
+            }))
+        }
+        Err(e) => Err((StatusCode::BAD_REQUEST, e)),
+    }
+}
@@ -22,13 +22,13 @@
 **Gate: PASSED** — PUT object → register dataset → list → get by name. All via gateway HTTP.

 ## Phase 2: Query Engine
-- [ ] 2.1 — queryd: SessionContext + object_store config
-- [ ] 2.2 — queryd: ListingTable from catalog ObjectRefs
-- [ ] 2.3 — queryd service: POST /query → Arrow IPC or JSON
-- [ ] 2.4 — queryd → catalogd wiring
-- [ ] 2.5 — gateway routes /query
+- [x] 2.1 — queryd: SessionContext + object_store config (custom scheme to avoid path doubling)
+- [x] 2.2 — queryd: ListingTable from catalog ObjectRefs with schema inference
+- [x] 2.3 — queryd service: POST /query/sql → JSON (columns + rows + row_count)
+- [x] 2.4 — queryd → catalogd wiring (reads dataset list, registers as tables)
+- [x] 2.5 — gateway routes /query with QueryEngine state

-**Gate:** SQL over Parquet returns correct results via catalog resolution.
+**Gate: PASSED** — SELECT *, WHERE/ORDER BY, COUNT/AVG all return correct results via catalog.

 ## Phase 3: AI Integration
 - [ ] 3.1 — Python sidecar: FastAPI + Ollama (embed/generate/rerank)
tests/e2e_query.sh (new file, 57 lines)
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+# End-to-end test: upload Parquet → register dataset → SQL query
+set -e
+
+BASE="http://localhost:3100"
+
+echo "=== Generate test Parquet file ==="
+python3 -c "
+import struct, io
+
+# Minimal Parquet via pyarrow if available, else skip
+try:
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+    table = pa.table({
+        'id': [1, 2, 3, 4, 5],
+        'name': ['alice', 'bob', 'carol', 'dave', 'eve'],
+        'score': [9.5, 8.2, 7.8, 6.1, 9.9],
+    })
+    pq.write_table(table, '/tmp/test_data.parquet')
+    print('generated with pyarrow')
+except ImportError:
+    print('pyarrow not available, generating via rust helper')
+    exit(1)
+"
+
+echo "=== Upload Parquet to storage ==="
+curl -s -X PUT "$BASE/storage/objects/datasets/scores.parquet" \
+  --data-binary @/tmp/test_data.parquet
+echo ""
+
+echo "=== Register dataset in catalog ==="
+SIZE=$(stat -c%s /tmp/test_data.parquet)
+curl -s -X POST "$BASE/catalog/datasets" \
+  -H "Content-Type: application/json" \
+  -d "{\"name\":\"scores\",\"schema_fingerprint\":\"test\",\"objects\":[{\"bucket\":\"data\",\"key\":\"datasets/scores.parquet\",\"size_bytes\":$SIZE}]}" | python3 -m json.tool
+echo ""
+
+echo "=== SQL: SELECT * FROM scores ==="
+curl -s -X POST "$BASE/query/sql" \
+  -H "Content-Type: application/json" \
+  -d '{"sql":"SELECT * FROM scores"}' | python3 -m json.tool
+echo ""
+
+echo "=== SQL: SELECT name, score FROM scores WHERE score > 8.0 ORDER BY score DESC ==="
+curl -s -X POST "$BASE/query/sql" \
+  -H "Content-Type: application/json" \
+  -d '{"sql":"SELECT name, score FROM scores WHERE score > 8.0 ORDER BY score DESC"}' | python3 -m json.tool
+echo ""
+
+echo "=== SQL: SELECT COUNT(*), AVG(score) FROM scores ==="
+curl -s -X POST "$BASE/query/sql" \
+  -H "Content-Type: application/json" \
+  -d '{"sql":"SELECT COUNT(*) as cnt, AVG(score) as avg_score FROM scores"}' | python3 -m json.tool
+echo ""
+
+echo "=== DONE ==="
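For reference, the expected results follow from the five generated rows: the filtered query returns eve (9.9), alice (9.5), bob (8.2); the aggregate query returns cnt = 5 and avg_score = (9.5 + 8.2 + 7.8 + 6.1 + 9.9) / 5 = 8.3.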