diff --git a/Cargo.lock b/Cargo.lock
index 13e4345..aedd846 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3963,6 +3963,7 @@ dependencies = [
  "axum",
  "bytes",
  "catalogd",
+ "chrono",
  "datafusion",
  "futures",
  "object_store",
diff --git a/crates/gateway/src/main.rs b/crates/gateway/src/main.rs
index b29f72b..5f577ff 100644
--- a/crates/gateway/src/main.rs
+++ b/crates/gateway/src/main.rs
@@ -30,8 +30,9 @@ async fn main() {
         tracing::warn!("catalog rebuild failed (empty store?): {e}");
     }
 
-    // Query engine
-    let engine = queryd::context::QueryEngine::new(registry.clone(), store.clone());
+    // Query engine with 16 GB memory cache (configurable)
+    let cache = queryd::cache::MemCache::new(16 * 1024 * 1024 * 1024); // 16 GB
+    let engine = queryd::context::QueryEngine::new(registry.clone(), store.clone(), cache);
 
     // AI sidecar client
     let ai_client = aibridge::client::AiClient::new(&config.sidecar.url);
diff --git a/crates/queryd/Cargo.toml b/crates/queryd/Cargo.toml
index e36b36e..af5ad07 100644
--- a/crates/queryd/Cargo.toml
+++ b/crates/queryd/Cargo.toml
@@ -18,3 +18,4 @@ arrow = { workspace = true }
 bytes = { workspace = true }
 futures = { workspace = true }
 url = { workspace = true }
+chrono = { workspace = true }
diff --git a/crates/queryd/src/cache.rs b/crates/queryd/src/cache.rs
new file mode 100644
index 0000000..288c850
--- /dev/null
+++ b/crates/queryd/src/cache.rs
@@ -0,0 +1,135 @@
+//! In-memory cache for hot datasets.
+//! Pinned datasets are loaded as Arrow RecordBatches and served from RAM.
+//! LRU eviction keeps memory bounded.
+
+use arrow::array::RecordBatch;
+use arrow::datatypes::SchemaRef;
+use std::collections::HashMap;
+use std::sync::Arc;
+use tokio::sync::RwLock;
+
+/// A cached dataset: schema plus all batches in memory.
+#[derive(Clone)]
+pub struct CachedDataset {
+    pub name: String,
+    pub schema: SchemaRef,
+    pub batches: Vec<RecordBatch>,
+    pub row_count: usize,
+    pub size_bytes: usize, // approximate
+    pub last_accessed: std::time::Instant,
+}
+
+/// Memory cache with LRU eviction.
+#[derive(Clone)]
+pub struct MemCache {
+    datasets: Arc<RwLock<HashMap<String, CachedDataset>>>,
+    max_bytes: usize,
+}
+
+impl MemCache {
+    pub fn new(max_bytes: usize) -> Self {
+        Self {
+            datasets: Arc::new(RwLock::new(HashMap::new())),
+            max_bytes,
+        }
+    }
+
+    /// Load a dataset into the cache.
+    pub async fn put(&self, name: &str, schema: SchemaRef, batches: Vec<RecordBatch>) {
+        let row_count: usize = batches.iter().map(|b| b.num_rows()).sum();
+        let size_bytes: usize = batches.iter()
+            .map(|b| b.get_array_memory_size())
+            .sum();
+
+        // Evict if needed
+        self.evict_to_fit(size_bytes).await;
+
+        let entry = CachedDataset {
+            name: name.to_string(),
+            schema,
+            batches,
+            row_count,
+            size_bytes,
+            last_accessed: std::time::Instant::now(),
+        };
+
+        tracing::info!("cached '{}': {} rows, {:.1} MB",
+            name, row_count, size_bytes as f64 / 1024.0 / 1024.0);
+
+        let mut cache = self.datasets.write().await;
+        cache.insert(name.to_string(), entry);
+    }
+
+    /// Get a cached dataset, updating its access time.
+    pub async fn get(&self, name: &str) -> Option<CachedDataset> {
+        let mut cache = self.datasets.write().await;
+        if let Some(entry) = cache.get_mut(name) {
+            entry.last_accessed = std::time::Instant::now();
+            Some(entry.clone())
+        } else {
+            None
+        }
+    }
+
+    /// Check if a dataset is cached.
+    pub async fn contains(&self, name: &str) -> bool {
+        self.datasets.read().await.contains_key(name)
+    }
+
+    /// Evict a specific dataset.
+    pub async fn evict(&self, name: &str) -> bool {
+        let mut cache = self.datasets.write().await;
+        cache.remove(name).is_some()
+    }
+
+    /// Get cache stats.
+    pub async fn stats(&self) -> CacheStats {
+        let cache = self.datasets.read().await;
+        let total_bytes: usize = cache.values().map(|d| d.size_bytes).sum();
+        let total_rows: usize = cache.values().map(|d| d.row_count).sum();
+        CacheStats {
+            datasets: cache.len(),
+            total_bytes,
+            total_rows,
+            max_bytes: self.max_bytes,
+            names: cache.keys().cloned().collect(),
+        }
+    }
+
+    /// Evict least-recently-used entries to make room for `needed_bytes`.
+    async fn evict_to_fit(&self, needed_bytes: usize) {
+        let mut cache = self.datasets.write().await;
+        let current: usize = cache.values().map(|d| d.size_bytes).sum();
+
+        if current + needed_bytes <= self.max_bytes {
+            return;
+        }
+
+        // Sort by last-accessed time (oldest first)
+        let mut entries: Vec<(String, std::time::Instant, usize)> = cache.iter()
+            .map(|(k, v)| (k.clone(), v.last_accessed, v.size_bytes))
+            .collect();
+        entries.sort_by_key(|(_, t, _)| *t);
+
+        let mut freed = 0usize;
+        let target = (current + needed_bytes).saturating_sub(self.max_bytes);
+
+        for (name, _, size) in entries {
+            if freed >= target {
+                break;
+            }
+            tracing::info!("evicting '{}' from cache ({:.1} MB)", name, size as f64 / 1024.0 / 1024.0);
+            cache.remove(&name);
+            freed += size;
+        }
+    }
+}
+
+#[derive(Debug, Clone, serde::Serialize)]
+pub struct CacheStats {
+    pub datasets: usize,
+    pub total_bytes: usize,
+    pub total_rows: usize,
+    pub max_bytes: usize,
+    pub names: Vec<String>,
+}
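For context, a minimal sketch of exercising MemCache on its own. This is illustrative, not part of the change: it assumes queryd is available as a workspace dependency with a tokio runtime (macros feature), and the dataset name and 64 MB budget are made up for the example (the gateway wires in 16 GB).

// Hypothetical standalone usage of MemCache; name "example" and the
// 64 MB budget are illustrative assumptions.
use std::sync::Arc;

use arrow::array::{Int64Array, RecordBatch};
use arrow::datatypes::{DataType, Field, Schema};
use queryd::cache::MemCache;

#[tokio::main]
async fn main() {
    let cache = MemCache::new(64 * 1024 * 1024); // 64 MB budget for the demo

    // Build a one-column batch to stand in for a real dataset.
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int64Array::from(vec![1i64, 2, 3]))],
    )
    .expect("batch construction");

    cache.put("example", schema, vec![batch]).await;
    assert!(cache.contains("example").await);

    let stats = cache.stats().await;
    println!("{} dataset(s), {} bytes cached", stats.datasets, stats.total_bytes);
}

Eviction is size-driven: put() calls evict_to_fit() first, which drops the oldest entries (by last_accessed) until the new entry fits under max_bytes.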
diff --git a/crates/queryd/src/context.rs b/crates/queryd/src/context.rs
index 7e2d0d7..76ba127 100644
--- a/crates/queryd/src/context.rs
+++ b/crates/queryd/src/context.rs
@@ -1,80 +1,131 @@
 use catalogd::registry::Registry;
 use datafusion::datasource::listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl};
+use datafusion::datasource::MemTable;
 use datafusion::datasource::file_format::parquet::ParquetFormat;
 use datafusion::prelude::*;
 use object_store::ObjectStore;
 use std::sync::Arc;
 use url::Url;
 
+use crate::cache::MemCache;
+use crate::delta;
+
 const STORE_SCHEME: &str = "lakehouse";
 
-/// Query engine that runs DataFusion over catalog-registered Parquet datasets.
+/// Query engine with in-memory cache and delta merge support.
 #[derive(Clone)]
 pub struct QueryEngine {
     registry: Registry,
     store: Arc<dyn ObjectStore>,
+    cache: MemCache,
 }
 
 impl QueryEngine {
-    pub fn new(registry: Registry, store: Arc<dyn ObjectStore>) -> Self {
-        Self { registry, store }
+    pub fn new(registry: Registry, store: Arc<dyn ObjectStore>, cache: MemCache) -> Self {
+        Self { registry, store, cache }
     }
 
-    /// Execute a SQL query. Resolves all catalog datasets as tables before execution.
+    pub fn cache(&self) -> &MemCache {
+        &self.cache
+    }
+
+    pub fn store(&self) -> &Arc<dyn ObjectStore> {
+        &self.store
+    }
+
+    /// Execute a SQL query. Uses the cache for hot data, falls back to Parquet.
     pub async fn query(&self, sql: &str) -> Result<Vec<RecordBatch>, String> {
         let ctx = self.build_context().await?;
-
         let df = ctx.sql(sql).await.map_err(|e| format!("SQL error: {e}"))?;
         let batches = df.collect().await.map_err(|e| format!("execution error: {e}"))?;
         Ok(batches)
     }
 
-    /// Build a SessionContext with all catalog datasets registered as tables.
-    async fn build_context(&self) -> Result<SessionContext, String> {
+    /// Pin a dataset into the memory cache.
+    pub async fn pin_dataset(&self, name: &str) -> Result<(), String> {
+        // Read from Parquet
         let ctx = SessionContext::new();
-
-        // Register the object store under a custom scheme to avoid path doubling.
-        // The store already has the root prefix (e.g. ./data), so paths are relative keys.
         let base_url = Url::parse(&format!("{STORE_SCHEME}://data/"))
             .map_err(|e| format!("invalid store url: {e}"))?;
         ctx.runtime_env().register_object_store(&base_url, self.store.clone());
 
-        // Register each catalog dataset as a table
+        let dataset = self.registry.get_by_name(name).await
+            .ok_or_else(|| format!("dataset not found: {name}"))?;
+
+        if dataset.objects.is_empty() {
+            return Err(format!("dataset '{name}' has no objects"));
+        }
+
+        let opts = ListingOptions::new(Arc::new(ParquetFormat::default()));
+        let table_paths: Vec<ListingTableUrl> = dataset.objects.iter()
+            .filter_map(|o| ListingTableUrl::parse(&format!("{STORE_SCHEME}://data/{}", o.key)).ok())
+            .collect();
+
+        // Guard against every path failing to parse; indexing [0] below would panic.
+        if table_paths.is_empty() {
+            return Err(format!("dataset '{name}' has no valid paths"));
+        }
+
+        let schema = opts.infer_schema(&ctx.state(), &table_paths[0]).await
+            .map_err(|e| format!("schema inference: {e}"))?;
+        let config = ListingTableConfig::new_with_multi_paths(table_paths)
+            .with_listing_options(opts)
+            .with_schema(schema.clone());
+        let table = ListingTable::try_new(config).map_err(|e| format!("table: {e}"))?;
+
+        ctx.register_table(name, Arc::new(table)).map_err(|e| format!("register: {e}"))?;
+        let df = ctx.sql(&format!("SELECT * FROM {name}")).await.map_err(|e| format!("read: {e}"))?;
+        let batches = df.collect().await.map_err(|e| format!("collect: {e}"))?;
+
+        self.cache.put(name, schema, batches).await;
+        Ok(())
+    }
+
+    /// Build a SessionContext with all catalog datasets registered as tables.
+    async fn build_context(&self) -> Result<SessionContext, String> {
+        let ctx = SessionContext::new();
+
+        let base_url = Url::parse(&format!("{STORE_SCHEME}://data/"))
+            .map_err(|e| format!("invalid store url: {e}"))?;
+        ctx.runtime_env().register_object_store(&base_url, self.store.clone());
+
         let datasets = self.registry.list().await;
         for dataset in &datasets {
             if dataset.objects.is_empty() {
                 continue;
             }
 
-            let opts = ListingOptions::new(Arc::new(ParquetFormat::default()));
+            // Check cache first
+            if let Some(cached) = self.cache.get(&dataset.name).await {
+                // Load any delta files and merge
+                let delta_batches = delta::load_deltas(&self.store, &dataset.name).await.unwrap_or_default();
 
-            let table_paths: Vec<ListingTableUrl> = dataset.objects.iter()
-                .filter_map(|o| {
-                    let url_str = format!("{STORE_SCHEME}://data/{}", o.key);
-                    ListingTableUrl::parse(&url_str).ok()
-                })
-                .collect();
+                let mut all_batches = cached.batches;
+                all_batches.extend(delta_batches);
 
-            if table_paths.is_empty() {
-                tracing::warn!("dataset {} has no valid paths, skipping", dataset.name);
+                let mem_table = MemTable::try_new(cached.schema, vec![all_batches])
+                    .map_err(|e| format!("MemTable error for {}: {e}", dataset.name))?;
+                ctx.register_table(&dataset.name, Arc::new(mem_table))
+                    .map_err(|e| format!("register cached {}: {e}", dataset.name))?;
+
+                tracing::debug!("using cached table: {}", dataset.name);
+                continue;
+            }
+
+            // Fall back to Parquet files
+            let opts = ListingOptions::new(Arc::new(ParquetFormat::default()));
+            let table_paths: Vec<ListingTableUrl> = dataset.objects.iter()
+                .filter_map(|o| ListingTableUrl::parse(&format!("{STORE_SCHEME}://data/{}", o.key)).ok())
+                .collect();
+
+            if table_paths.is_empty() {
                 continue;
             }
 
-            // Infer schema from the first file
             let schema = opts.infer_schema(&ctx.state(), &table_paths[0]).await
                 .map_err(|e| format!("schema inference failed for {}: {e}", dataset.name))?;
-
             let config = ListingTableConfig::new_with_multi_paths(table_paths)
                 .with_listing_options(opts)
                 .with_schema(schema);
-
             let table = ListingTable::try_new(config)
                 .map_err(|e| format!("table creation failed for {}: {e}", dataset.name))?;
 
             ctx.register_table(&dataset.name, Arc::new(table))
                 .map_err(|e| format!("table registration failed for {}: {e}", dataset.name))?;
-
-            tracing::debug!("registered table: {}", dataset.name);
         }
 
         Ok(ctx)
     }
 }
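For reviewers, a sketch of the intended call sequence against QueryEngine. The registry and store handles come from the existing catalogd/object_store wiring (as in crates/gateway/src/main.rs); the function name, cache budget, and SQL here are illustrative assumptions, while "candidates" is a real dataset from the catalog manifests below.

// Hypothetical flow: cold query over Parquet, then pin, then hot query.
use std::sync::Arc;

use catalogd::registry::Registry;
use object_store::ObjectStore;
use queryd::{cache::MemCache, context::QueryEngine};

async fn demo(registry: Registry, store: Arc<dyn ObjectStore>) -> Result<(), String> {
    let engine = QueryEngine::new(registry, store, MemCache::new(1024 * 1024 * 1024));

    // First query scans Parquet through a ListingTable.
    let cold = engine.query("SELECT count(*) FROM candidates").await?;

    // Pin the dataset; build_context() now registers a MemTable for it,
    // merged with any delta files, so subsequent queries run from RAM.
    engine.pin_dataset("candidates").await?;
    let hot = engine.query("SELECT count(*) FROM candidates").await?;

    println!("cold: {} batch(es), hot: {} batch(es)", cold.len(), hot.len());
    Ok(())
}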
diff --git a/crates/queryd/src/delta.rs b/crates/queryd/src/delta.rs
new file mode 100644
index 0000000..5fe87da
--- /dev/null
+++ b/crates/queryd/src/delta.rs
@@ -0,0 +1,131 @@
+//! Delta store for incremental updates.
+//! Instead of rewriting an entire Parquet file to change one row,
+//! we write small delta files. At query time, deltas are merged with the base.
+//! Periodic compaction merges deltas into the base file.
+
+use arrow::array::RecordBatch;
+use arrow::datatypes::SchemaRef;
+use object_store::ObjectStore;
+use std::sync::Arc;
+
+use shared::arrow_helpers::{parquet_to_record_batches, record_batch_to_parquet};
+use storaged::ops;
+
+/// Write a delta file for a dataset (new/updated rows).
+pub async fn write_delta(
+    store: &Arc<dyn ObjectStore>,
+    dataset_name: &str,
+    batch: &RecordBatch,
+) -> Result<String, String> {
+    let ts = chrono::Utc::now().timestamp_millis();
+    let key = format!("deltas/{dataset_name}/{ts}.parquet");
+    let parquet = record_batch_to_parquet(batch)?;
+    ops::put(store, &key, parquet).await?;
+    tracing::info!("wrote delta for '{}': {} rows at {}", dataset_name, batch.num_rows(), key);
+    Ok(key)
+}
+
+/// List all delta files for a dataset.
+pub async fn list_deltas(
+    store: &Arc<dyn ObjectStore>,
+    dataset_name: &str,
+) -> Result<Vec<String>, String> {
+    let prefix = format!("deltas/{dataset_name}/");
+    ops::list(store, Some(&prefix)).await
+}
+
+/// Load all delta batches for a dataset.
+pub async fn load_deltas(
+    store: &Arc<dyn ObjectStore>,
+    dataset_name: &str,
+) -> Result<Vec<RecordBatch>, String> {
+    let keys = list_deltas(store, dataset_name).await?;
+    let mut all_batches = Vec::new();
+
+    for key in &keys {
+        let data = ops::get(store, key).await?;
+        let (_, batches) = parquet_to_record_batches(&data)?;
+        all_batches.extend(batches);
+    }
+
+    if !all_batches.is_empty() {
+        let total_rows: usize = all_batches.iter().map(|b| b.num_rows()).sum();
+        tracing::debug!("loaded {} delta files ({} rows) for '{}'", keys.len(), total_rows, dataset_name);
+    }
+
+    Ok(all_batches)
+}
+
+/// Compact: merge base Parquet + all deltas into a single new base file.
+/// Optionally deduplicates by a primary key column.
+pub async fn compact(
+    store: &Arc<dyn ObjectStore>,
+    dataset_name: &str,
+    base_key: &str,
+    primary_key_col: Option<&str>,
+) -> Result<CompactResult, String> {
+    // Load base
+    let base_data = ops::get(store, base_key).await?;
+    let (schema, mut base_batches) = parquet_to_record_batches(&base_data)?;
+
+    // Load deltas
+    let delta_batches = load_deltas(store, dataset_name).await?;
+    let delta_count = delta_batches.len();
+
+    if delta_batches.is_empty() {
+        let base_rows: usize = base_batches.iter().map(|b| b.num_rows()).sum();
+        return Ok(CompactResult {
+            base_rows,
+            delta_rows: 0,
+            final_rows: base_rows,
+            deltas_merged: 0,
+        });
+    }
+
+    // Count base and delta rows before merging, so the stats are accurate.
+    let base_rows: usize = base_batches.iter().map(|b| b.num_rows()).sum();
+    let delta_rows: usize = delta_batches.iter().map(|b| b.num_rows()).sum();
+
+    base_batches.extend(delta_batches);
+
+    // If a primary key is specified, deduplicate (keep the last occurrence).
+    let final_batches = if let Some(_pk) = primary_key_col {
+        // For now, just concatenate. Full dedup requires sorting by PK
+        // and keeping the last row per key; this is a simplification.
+        // TODO: implement proper merge with dedup
+        base_batches
+    } else {
+        base_batches
+    };
+
+    let final_rows: usize = final_batches.iter().map(|b| b.num_rows()).sum();
+
+    // Write the merged base. Concatenating per-batch Parquet buffers would
+    // produce an invalid file (each buffer carries its own footer), so merge
+    // the batches into one and serialize once.
+    let merged = arrow::compute::concat_batches(&schema, &final_batches)
+        .map_err(|e| format!("concat: {e}"))?;
+    let merged_parquet = record_batch_to_parquet(&merged)?;
+    ops::put(store, base_key, merged_parquet).await?;
+
+    // Delete delta files
+    let delta_keys = list_deltas(store, dataset_name).await?;
+    for key in &delta_keys {
+        let _ = ops::delete(store, key).await;
+    }
+
+    tracing::info!("compacted '{}': {} deltas merged, {} + {} → {} rows",
+        dataset_name, delta_count, base_rows, delta_rows, final_rows);
+
+    Ok(CompactResult {
+        base_rows,
+        delta_rows,
+        final_rows,
+        deltas_merged: delta_count,
+    })
+}
+
+#[derive(Debug, Clone, serde::Serialize)]
+pub struct CompactResult {
+    pub base_rows: usize,
+    pub delta_rows: usize,
+    pub final_rows: usize,
+    pub deltas_merged: usize,
+}
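A sketch of the append-then-compact round trip. Assumptions are flagged in the comments: the store handle comes from the existing wiring, datasets/candidates.parquet stands in for the current base key, and the single-column schema is a placeholder (delta batches must match the base dataset's actual schema for compaction to succeed).

// Hypothetical delta round trip against the lakehouse object store.
use std::sync::Arc;

use arrow::array::{Int64Array, RecordBatch};
use arrow::datatypes::{DataType, Field, Schema};
use object_store::ObjectStore;
use queryd::delta;

async fn append_and_compact(store: Arc<dyn ObjectStore>) -> Result<(), String> {
    // One new row, written as a small file under deltas/candidates/.
    // Placeholder schema; a real delta must mirror the base schema.
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]));
    let batch = RecordBatch::try_new(
        schema,
        vec![Arc::new(Int64Array::from(vec![10_001i64]))],
    )
    .map_err(|e| e.to_string())?;
    let key = delta::write_delta(&store, "candidates", &batch).await?;
    println!("delta written: {key}");

    // Later (e.g. on a schedule): fold all deltas back into the base file.
    let result = delta::compact(&store, "candidates", "datasets/candidates.parquet", None).await?;
    println!("{} deltas merged, {} rows total", result.deltas_merged, result.final_rows);
    Ok(())
}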
diff --git a/crates/queryd/src/lib.rs b/crates/queryd/src/lib.rs
index 3690e02..ff9f6ac 100644
--- a/crates/queryd/src/lib.rs
+++ b/crates/queryd/src/lib.rs
@@ -1,2 +1,4 @@
+pub mod cache;
 pub mod context;
+pub mod delta;
 pub mod service;
diff --git a/crates/queryd/src/service.rs b/crates/queryd/src/service.rs
index 7e83c4b..1660b8c 100644
--- a/crates/queryd/src/service.rs
+++ b/crates/queryd/src/service.rs
@@ -9,12 +9,18 @@ use axum::{
 };
 
 use serde::{Deserialize, Serialize};
 
+use crate::cache::CacheStats;
 use crate::context::QueryEngine;
+use crate::delta;
 
 pub fn router(engine: QueryEngine) -> Router {
     Router::new()
         .route("/health", get(health))
         .route("/sql", post(execute_query))
+        .route("/cache/pin", post(pin_dataset))
+        .route("/cache/evict", post(evict_dataset))
+        .route("/cache/stats", get(cache_stats))
+        .route("/compact", post(compact_dataset))
         .with_state(engine)
 }
@@ -22,6 +28,8 @@ async fn health() -> &'static str {
     "queryd ok"
 }
 
+// --- SQL Query ---
+
 #[derive(Deserialize)]
 struct QueryRequest {
     sql: String,
@@ -87,3 +95,60 @@
         Err(e) => Err((StatusCode::BAD_REQUEST, e)),
     }
 }
+
+// --- Cache Management ---
+
+#[derive(Deserialize)]
+struct CacheRequest {
+    dataset: String,
+}
+
+async fn pin_dataset(
+    State(engine): State<QueryEngine>,
+    Json(req): Json<CacheRequest>,
+) -> impl IntoResponse {
+    match engine.pin_dataset(&req.dataset).await {
+        Ok(()) => Ok((StatusCode::OK, format!("pinned: {}", req.dataset))),
+        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
+    }
+}
+
+async fn evict_dataset(
+    State(engine): State<QueryEngine>,
+    Json(req): Json<CacheRequest>,
+) -> impl IntoResponse {
+    if engine.cache().evict(&req.dataset).await {
+        (StatusCode::OK, format!("evicted: {}", req.dataset))
+    } else {
+        (StatusCode::NOT_FOUND, format!("not cached: {}", req.dataset))
+    }
+}
+
+async fn cache_stats(State(engine): State<QueryEngine>) -> impl IntoResponse {
+    let stats: CacheStats = engine.cache().stats().await;
+    Json(stats)
+}
+
+// --- Compaction ---
+
+#[derive(Deserialize)]
+struct CompactRequest {
+    dataset: String,
+    base_key: String,
+    primary_key: Option<String>,
+}
+
+async fn compact_dataset(
+    State(engine): State<QueryEngine>,
+    Json(req): Json<CompactRequest>,
+) -> impl IntoResponse {
+    match delta::compact(
+        engine.store(),
+        &req.dataset,
+        &req.base_key,
+        req.primary_key.as_deref(),
+    ).await {
+        Ok(result) => Ok(Json(result)),
+        Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
+    }
+}
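A sketch of driving the new endpoints from a client, matching the payload shapes in service.rs. Everything else is assumed: the listen address 127.0.0.1:8080 is a placeholder, and the example uses the reqwest (json feature) and serde_json crates, which are not dependencies of this change.

// Hypothetical client calls against a locally running queryd.
use serde_json::json;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let http = reqwest::Client::new();
    let base = "http://127.0.0.1:8080"; // placeholder address

    // Pin a dataset into RAM, then inspect what the cache holds.
    http.post(format!("{base}/cache/pin"))
        .json(&json!({ "dataset": "candidates" }))
        .send().await?
        .error_for_status()?;

    let stats = http.get(format!("{base}/cache/stats")).send().await?.text().await?;
    println!("cache stats: {stats}");

    // Fold accumulated deltas back into the base Parquet file.
    // primary_key is optional and omitted here.
    http.post(format!("{base}/compact"))
        .json(&json!({
            "dataset": "candidates",
            "base_key": "datasets/candidates.parquet"
        }))
        .send().await?
        .error_for_status()?;

    Ok(())
}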
diff --git a/data/_catalog/manifests/021ac283-883b-4b13-83ce-5395bacdc33a.json b/data/_catalog/manifests/021ac283-883b-4b13-83ce-5395bacdc33a.json
deleted file mode 100644
index 89d98ae..0000000
--- a/data/_catalog/manifests/021ac283-883b-4b13-83ce-5395bacdc33a.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-  "id": "021ac283-883b-4b13-83ce-5395bacdc33a",
-  "name": "clients",
-  "schema_fingerprint": "auto",
-  "objects": [
-    {
-      "bucket": "data",
-      "key": "datasets/clients.parquet",
-      "size_bytes": 21971,
-      "created_at": "2026-03-27T13:15:18.000750302Z"
-    }
-  ],
-  "created_at": "2026-03-27T13:15:18.000757845Z",
-  "updated_at": "2026-03-27T13:15:18.000757845Z"
-}
\ No newline at end of file
diff --git a/data/_catalog/manifests/052cf81b-f5b6-4439-92d7-ecf09b24bd8b.json b/data/_catalog/manifests/052cf81b-f5b6-4439-92d7-ecf09b24bd8b.json
deleted file mode 100644
index 96de7c0..0000000
--- a/data/_catalog/manifests/052cf81b-f5b6-4439-92d7-ecf09b24bd8b.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-  "id": "052cf81b-f5b6-4439-92d7-ecf09b24bd8b",
-  "name": "candidates",
-  "schema_fingerprint": "auto",
-  "objects": [
-    {
-      "bucket": "data",
-      "key": "datasets/candidates.parquet",
-      "size_bytes": 10592165,
-      "created_at": "2026-03-27T13:15:17.989860994Z"
-    }
-  ],
-  "created_at": "2026-03-27T13:15:17.989869155Z",
-  "updated_at": "2026-03-27T13:15:17.989869155Z"
-}
\ No newline at end of file
diff --git a/data/_catalog/manifests/18d22cdd-24b3-4a65-bdcb-6624753e5ab7.json b/data/_catalog/manifests/18d22cdd-24b3-4a65-bdcb-6624753e5ab7.json
new file mode 100644
index 0000000..1b7439a
--- /dev/null
+++ b/data/_catalog/manifests/18d22cdd-24b3-4a65-bdcb-6624753e5ab7.json
@@ -0,0 +1,15 @@
+{
+  "id": "18d22cdd-24b3-4a65-bdcb-6624753e5ab7",
+  "name": "job_orders",
+  "schema_fingerprint": "auto",
+  "objects": [
+    {
+      "bucket": "data",
+      "key": "datasets/job_orders.parquet",
+      "size_bytes": 905534,
+      "created_at": "2026-03-27T13:36:42.130140103Z"
+    }
+  ],
+  "created_at": "2026-03-27T13:36:42.130146127Z",
+  "updated_at": "2026-03-27T13:36:42.130146127Z"
+}
\ No newline at end of file
diff --git a/data/_catalog/manifests/47756b77-9a2e-476c-8249-9b971f95fb2d.json b/data/_catalog/manifests/47756b77-9a2e-476c-8249-9b971f95fb2d.json
deleted file mode 100644
index 5ec264f..0000000
--- a/data/_catalog/manifests/47756b77-9a2e-476c-8249-9b971f95fb2d.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-  "id": "47756b77-9a2e-476c-8249-9b971f95fb2d",
-  "name": "call_log",
-  "schema_fingerprint": "auto",
-  "objects": [
-    {
-      "bucket": "data",
-      "key": "datasets/call_log.parquet",
-      "size_bytes": 35951077,
-      "created_at": "2026-03-27T13:15:26.607093971Z"
-    }
-  ],
-  "created_at": "2026-03-27T13:15:26.607099665Z",
-  "updated_at": "2026-03-27T13:15:26.607099665Z"
-}
\ No newline at end of file
diff --git a/data/_catalog/manifests/8fa7cb8b-ab6b-4e64-9384-d2480e79dd7c.json b/data/_catalog/manifests/8fa7cb8b-ab6b-4e64-9384-d2480e79dd7c.json
new file mode 100644
index 0000000..e3ba151
--- /dev/null
+++ b/data/_catalog/manifests/8fa7cb8b-ab6b-4e64-9384-d2480e79dd7c.json
@@ -0,0 +1,15 @@
+{
+  "id": "8fa7cb8b-ab6b-4e64-9384-d2480e79dd7c",
+  "name": "clients",
+  "schema_fingerprint": "auto",
+  "objects": [
+    {
+      "bucket": "data",
+      "key": "datasets/clients.parquet",
+      "size_bytes": 21971,
+      "created_at": "2026-03-27T13:36:42.025701092Z"
+    }
+  ],
+  "created_at": "2026-03-27T13:36:42.025707574Z",
+  "updated_at": "2026-03-27T13:36:42.025707574Z"
+}
\ No newline at end of file
diff --git a/data/_catalog/manifests/b5d49316-9c9b-4a57-8221-13b6dcda551a.json b/data/_catalog/manifests/b5d49316-9c9b-4a57-8221-13b6dcda551a.json
new file mode 100644
index 0000000..1030c07
--- /dev/null
+++ b/data/_catalog/manifests/b5d49316-9c9b-4a57-8221-13b6dcda551a.json
@@ -0,0 +1,15 @@
+{
+  "id": "b5d49316-9c9b-4a57-8221-13b6dcda551a",
+  "name": "placements",
+  "schema_fingerprint": "auto",
+  "objects": [
+    {
+      "bucket": "data",
+      "key": "datasets/placements.parquet",
+      "size_bytes": 1213820,
+      "created_at": "2026-03-27T13:36:42.237756183Z"
+    }
+  ],
+  "created_at": "2026-03-27T13:36:42.237762120Z",
+  "updated_at": "2026-03-27T13:36:42.237762120Z"
+}
\ No newline at end of file
diff --git a/data/_catalog/manifests/c00465bd-c562-419a-b40e-c557ba9054bf.json b/data/_catalog/manifests/c00465bd-c562-419a-b40e-c557ba9054bf.json
new file mode 100644
index 0000000..be20bf8
--- /dev/null
+++ b/data/_catalog/manifests/c00465bd-c562-419a-b40e-c557ba9054bf.json
@@ -0,0 +1,15 @@
+{
+  "id": "c00465bd-c562-419a-b40e-c557ba9054bf",
+  "name": "candidates",
+  "schema_fingerprint": "auto",
+  "objects": [
+    {
+      "bucket": "data",
+      "key": "datasets/candidates.parquet",
+      "size_bytes": 10592165,
+      "created_at": "2026-03-27T13:36:42.018896280Z"
+    }
+  ],
+  "created_at": "2026-03-27T13:36:42.018904245Z",
+  "updated_at": "2026-03-27T13:36:42.018904245Z"
+}
\ No newline at end of file
diff --git a/data/_catalog/manifests/c0224239-a265-4b15-a1e2-ebbc96aee60c.json b/data/_catalog/manifests/c0224239-a265-4b15-a1e2-ebbc96aee60c.json
deleted file mode 100644
index 1f3bb69..0000000
--- a/data/_catalog/manifests/c0224239-a265-4b15-a1e2-ebbc96aee60c.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-  "id": "c0224239-a265-4b15-a1e2-ebbc96aee60c",
-  "name": "email_log",
-  "schema_fingerprint": "auto",
-  "objects": [
-    {
-      "bucket": "data",
-      "key": "datasets/email_log.parquet",
-      "size_bytes": 16768671,
-      "created_at": "2026-03-27T13:15:28.446541739Z"
-    }
-  ],
-  "created_at": "2026-03-27T13:15:28.446547070Z",
-  "updated_at": "2026-03-27T13:15:28.446547070Z"
-}
\ No newline at end of file
diff --git a/data/_catalog/manifests/c8c9d519-b8b5-4d04-ba2b-5acf53c41bc2.json b/data/_catalog/manifests/c8c9d519-b8b5-4d04-ba2b-5acf53c41bc2.json
deleted file mode 100644
index b29bb14..0000000
--- a/data/_catalog/manifests/c8c9d519-b8b5-4d04-ba2b-5acf53c41bc2.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-  "id": "c8c9d519-b8b5-4d04-ba2b-5acf53c41bc2",
-  "name": "timesheets",
-  "schema_fingerprint": "auto",
-  "objects": [
-    {
-      "bucket": "data",
-      "key": "datasets/timesheets.parquet",
-      "size_bytes": 17539932,
-      "created_at": "2026-03-27T13:15:23.111118100Z"
-    }
-  ],
-  "created_at": "2026-03-27T13:15:23.111124272Z",
-  "updated_at": "2026-03-27T13:15:23.111124272Z"
-}
\ No newline at end of file
diff --git a/data/_catalog/manifests/dcca449b-a2f6-4c1f-99b6-c69dcdbdd204.json b/data/_catalog/manifests/dcca449b-a2f6-4c1f-99b6-c69dcdbdd204.json
deleted file mode 100644
index 6c99fb2..0000000
--- a/data/_catalog/manifests/dcca449b-a2f6-4c1f-99b6-c69dcdbdd204.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-  "id": "dcca449b-a2f6-4c1f-99b6-c69dcdbdd204",
-  "name": "placements",
-  "schema_fingerprint": "auto",
-  "objects": [
-    {
-      "bucket": "data",
-      "key": "datasets/placements.parquet",
-      "size_bytes": 1213820,
-      "created_at": "2026-03-27T13:15:18.264258909Z"
-    }
-  ],
-  "created_at": "2026-03-27T13:15:18.264266375Z",
-  "updated_at": "2026-03-27T13:15:18.264266375Z"
-}
\ No newline at end of file
diff --git a/data/_catalog/manifests/dce14141-f679-481b-9b48-13438cbfe057.json b/data/_catalog/manifests/dce14141-f679-481b-9b48-13438cbfe057.json
new file mode 100644
index 0000000..dcd69a8
--- /dev/null
+++ b/data/_catalog/manifests/dce14141-f679-481b-9b48-13438cbfe057.json
@@ -0,0 +1,15 @@
+{
+  "id": "dce14141-f679-481b-9b48-13438cbfe057",
+  "name": "email_log",
+  "schema_fingerprint": "auto",
+  "objects": [
+    {
+      "bucket": "data",
+      "key": "datasets/email_log.parquet",
+      "size_bytes": 16768671,
+      "created_at": "2026-03-27T13:36:52.383853471Z"
+    }
+  ],
+  "created_at": "2026-03-27T13:36:52.383859356Z",
+  "updated_at": "2026-03-27T13:36:52.383859356Z"
+}
\ No newline at end of file
diff --git a/data/_catalog/manifests/e0bcb8de-a2c1-4706-bf2d-73c1b989a70d.json b/data/_catalog/manifests/e0bcb8de-a2c1-4706-bf2d-73c1b989a70d.json
new file mode 100644
index 0000000..193e655
--- /dev/null
+++ b/data/_catalog/manifests/e0bcb8de-a2c1-4706-bf2d-73c1b989a70d.json
@@ -0,0 +1,15 @@
+{
+  "id": "e0bcb8de-a2c1-4706-bf2d-73c1b989a70d",
+  "name": "timesheets",
+  "schema_fingerprint": "auto",
+  "objects": [
+    {
+      "bucket": "data",
+      "key": "datasets/timesheets.parquet",
+      "size_bytes": 17539932,
+      "created_at": "2026-03-27T13:36:46.998375016Z"
+    }
+  ],
+  "created_at": "2026-03-27T13:36:46.998383728Z",
+  "updated_at": "2026-03-27T13:36:46.998383728Z"
+}
\ No newline at end of file
diff --git a/data/_catalog/manifests/e47b637f-31c6-4209-ab3c-557f8c67c812.json b/data/_catalog/manifests/e47b637f-31c6-4209-ab3c-557f8c67c812.json
new file mode 100644
index 0000000..173cdf5
--- /dev/null
+++ b/data/_catalog/manifests/e47b637f-31c6-4209-ab3c-557f8c67c812.json
@@ -0,0 +1,15 @@
+{
+  "id": "e47b637f-31c6-4209-ab3c-557f8c67c812",
+  "name": "call_log",
+  "schema_fingerprint": "auto",
+  "objects": [
+    {
+      "bucket": "data",
+      "key": "datasets/call_log.parquet",
+      "size_bytes": 35951077,
+      "created_at": "2026-03-27T13:36:50.546706609Z"
+    }
+  ],
+  "created_at": "2026-03-27T13:36:50.546712358Z",
+  "updated_at": "2026-03-27T13:36:50.546712358Z"
+}
\ No newline at end of file
diff --git a/data/_catalog/manifests/e8cc1ad2-114e-4441-a526-b8e6de10cb59.json b/data/_catalog/manifests/e8cc1ad2-114e-4441-a526-b8e6de10cb59.json
deleted file mode 100644
index c4cd6dd..0000000
--- a/data/_catalog/manifests/e8cc1ad2-114e-4441-a526-b8e6de10cb59.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-  "id": "e8cc1ad2-114e-4441-a526-b8e6de10cb59",
-  "name": "job_orders",
-  "schema_fingerprint": "auto",
-  "objects": [
-    {
-      "bucket": "data",
-      "key": "datasets/job_orders.parquet",
-      "size_bytes": 905534,
-      "created_at": "2026-03-27T13:15:18.114659931Z"
-    }
-  ],
-  "created_at": "2026-03-27T13:15:18.114667579Z",
-  "updated_at": "2026-03-27T13:15:18.114667579Z"
-}
\ No newline at end of file
diff --git a/data/vectors/resumes_10k.parquet b/data/vectors/resumes_10k.parquet
deleted file mode 100644
index 54e4d7d..0000000
Binary files a/data/vectors/resumes_10k.parquet and /dev/null differ