use arrow::array::RecordBatch; use arrow::json::writer::{JsonArray, Writer as JsonWriter}; use axum::{ Json, Router, extract::{Path, Query, State}, http::StatusCode, response::IntoResponse, routing::{get, post}, }; use serde::{Deserialize, Serialize}; use std::sync::Arc; use truth::{RuleAction, TruthStore}; use crate::context::QueryEngine; use crate::delta; use crate::paged::ResultStore; #[derive(Clone)] pub struct QueryState { pub engine: QueryEngine, pub result_store: ResultStore, // Policy gate for incoming SQL. Every /sql and /paged request is // evaluated against this store before hitting DataFusion. Added for // P42-002 ("raw SQL forwarded without schema or policy gate") after // the scrum master's queryd/service.rs finding looped across iters // 3-5 without ever being reachable by the 6-line auto-applier. pub truth: Arc, } pub fn router(engine: QueryEngine) -> Router { router_with_truth(engine, Arc::new(truth::sql_query_guard_store())) } /// Test/integration hook: construct the router with a caller-supplied /// TruthStore so tests can assert reject/pass behavior deterministically /// without depending on the default needle list. pub fn router_with_truth(engine: QueryEngine, truth: Arc) -> Router { let state = QueryState { engine: engine.clone(), result_store: ResultStore::new(100, 50), // 100 rows/page, keep 50 results truth, }; Router::new() .route("/health", get(health)) .route("/sql", post(execute_query)) .route("/paged", post(paged_query)) .route("/page/{query_id}/{page}", get(get_page)) .route("/cache/pin", post(pin_dataset)) .route("/cache/evict", post(evict_dataset)) .route("/cache/stats", get(cache_stats)) .route("/compact", post(compact_dataset)) .with_state(state) } async fn health() -> &'static str { "queryd ok" } // --- SQL Query --- #[derive(Deserialize)] struct QueryRequest { sql: String, } #[derive(Serialize)] struct QueryResponse { columns: Vec, rows: serde_json::Value, row_count: usize, // Elapsed wall time from handler entry to response. Required for // audit-log parity — gateway's audit row previously stored null here. // Scrum iter 9 finding, populated from std::time::Instant captured // at the top of execute_query / paged_query. latency_ms: u64, } #[derive(Serialize)] struct ColumnInfo { name: String, data_type: String, } fn batches_to_json(batches: &[RecordBatch]) -> Result { let mut buf = Vec::new(); let mut writer = JsonWriter::<_, JsonArray>::new(&mut buf); for batch in batches { writer.write(batch).map_err(|e| format!("JSON write error: {e}"))?; } writer.finish().map_err(|e| format!("JSON finish error: {e}"))?; drop(writer); serde_json::from_slice(&buf).map_err(|e| format!("JSON parse error: {e}")) } /// Evaluate the request SQL against the configured TruthStore. Returns /// the Reject/Block message on the first failing mandatory rule so the /// handler can short-circuit. Returns None when all rules pass (or when /// the failures' declared action is non-mandatory like Redact/Pass). fn sql_policy_check(truth: &TruthStore, sql: &str) -> Option { let ctx = serde_json::json!({ "sql": sql }); for outcome in truth.evaluate("sql_query", &ctx) { if !outcome.passed { // FieldEmpty / FieldContainsAny etc. are enforced only when // condition HOLDS (i.e. passed=true). Below means "passed=false", // so the rule condition did not hold — no enforcement. continue; } match &outcome.action { RuleAction::Reject { message } | RuleAction::Block { message } => { return Some(message.clone()); } _ => {} } } None } async fn execute_query( State(state): State, Json(req): Json, ) -> impl IntoResponse { let started = std::time::Instant::now(); tracing::info!("executing query: {}", req.sql); if let Some(reason) = sql_policy_check(&state.truth, &req.sql) { tracing::warn!("sql rejected by truth gate: {reason}"); return Err((StatusCode::FORBIDDEN, reason)); } match state.engine.query(&req.sql).await { Ok(batches) => { if batches.is_empty() { return Ok(Json(QueryResponse { columns: vec![], rows: serde_json::Value::Array(vec![]), row_count: 0, latency_ms: started.elapsed().as_millis() as u64, })); } let schema = batches[0].schema(); let columns: Vec = schema.fields().iter().map(|f| ColumnInfo { name: f.name().clone(), data_type: f.data_type().to_string(), }).collect(); let rows = batches_to_json(&batches) .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?; let row_count = rows.as_array().map(|a| a.len()).unwrap_or(0); Ok(Json(QueryResponse { columns, rows, row_count, latency_ms: started.elapsed().as_millis() as u64, })) } Err(e) => Err((StatusCode::BAD_REQUEST, e)), } } // --- Paged Queries (large result sets) --- async fn paged_query( State(state): State, Json(req): Json, ) -> impl IntoResponse { tracing::info!("paged query: {}", req.sql); if let Some(reason) = sql_policy_check(&state.truth, &req.sql) { tracing::warn!("paged sql rejected by truth gate: {reason}"); return Err((StatusCode::FORBIDDEN, reason)); } match state.result_store.execute_and_store(&state.engine, &req.sql).await { Ok(handle) => Ok(Json(handle)), Err(e) => Err((StatusCode::BAD_REQUEST, e)), } } #[derive(Deserialize)] struct PageQuery { size: Option, } async fn get_page( State(state): State, Path((query_id, page)): Path<(String, usize)>, Query(q): Query, ) -> impl IntoResponse { match state.result_store.get_page(&query_id, page, q.size).await { Ok(result) => Ok(Json(result)), Err(e) => Err((StatusCode::NOT_FOUND, e)), } } // --- Cache Management --- #[derive(Deserialize)] struct CacheRequest { dataset: String, } async fn pin_dataset( State(state): State, Json(req): Json, ) -> impl IntoResponse { match state.engine.pin_dataset(&req.dataset).await { Ok(()) => Ok((StatusCode::OK, format!("pinned: {}", req.dataset))), Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)), } } async fn evict_dataset( State(state): State, Json(req): Json, ) -> impl IntoResponse { if state.engine.cache().evict(&req.dataset).await { (StatusCode::OK, format!("evicted: {}", req.dataset)) } else { (StatusCode::NOT_FOUND, format!("not cached: {}", req.dataset)) } } async fn cache_stats(State(state): State) -> impl IntoResponse { let stats = state.engine.cache().stats().await; Json(stats) } // --- Compaction --- #[derive(Deserialize)] struct CompactRequest { dataset: String, base_key: String, primary_key: Option, } async fn compact_dataset( State(state): State, Json(req): Json, ) -> impl IntoResponse { // Phase E: pull tombstones for this dataset and let compact physically // drop those rows. After a successful rewrite, clear the tombstone log // — the rows are gone from disk, the tombstones have done their job. let tombstones = state .engine .catalog() .list_tombstones(&req.dataset) .await .unwrap_or_default(); match delta::compact( state.engine.store(), &req.dataset, &req.base_key, req.primary_key.as_deref(), &tombstones, ).await { Ok(result) => { if result.rows_dropped_by_tombstones > 0 { if let Err(e) = state.engine.catalog().tombstones().clear(&req.dataset).await { tracing::warn!("post-compact tombstone clear failed: {e}"); } } Ok(Json(result)) } Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)), } } #[cfg(test)] mod sql_policy_tests { use super::*; use truth::sql_query_guard_store; // These tests exercise the policy gate without spinning up a DataFusion // engine — they only need `TruthStore`. Purpose: prove the P42-002 // enforcement point actually rejects destructive SQL. This is the // regression guard for the queryd/service.rs finding that looped // across scrum iters 3-5. #[test] fn blocks_drop_table() { let store = sql_query_guard_store(); let reason = sql_policy_check(&store, "DROP TABLE users").expect("must reject"); assert!(reason.contains("destructive"), "reason: {reason}"); } #[test] fn blocks_delete_from() { let store = sql_query_guard_store(); assert!(sql_policy_check(&store, "delete from t where 1=1").is_some()); } #[test] fn blocks_truncate() { let store = sql_query_guard_store(); assert!(sql_policy_check(&store, "TRUNCATE workers").is_some()); } #[test] fn blocks_empty_sql() { let store = sql_query_guard_store(); assert!(sql_policy_check(&store, "").is_some()); } #[test] fn allows_benign_select() { let store = sql_query_guard_store(); assert!(sql_policy_check(&store, "SELECT count(*) FROM workers").is_none()); } #[test] fn allows_select_with_deleted_word_in_column() { // Substring match is narrow ("delete from", not "delete"), so a // column named `deleted_at` doesn't trip the guard. Important // check — false positives on benign queries would make the gate // unusable in practice. let store = sql_query_guard_store(); assert!( sql_policy_check(&store, "SELECT deleted_at FROM t").is_none(), "column names containing 'delete' must not be rejected" ); } #[test] fn case_insensitive_match_catches_mixed_case() { let store = sql_query_guard_store(); assert!(sql_policy_check(&store, "Drop Table X").is_some()); } }