From f9f92706f353e02bd810a614dbb67ca0340e5cd0 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 16 Apr 2026 22:19:11 -0500 Subject: [PATCH] =?UTF-8?q?RAG=20reranker=20+=20manifest=20bucket=20fix=20?= =?UTF-8?q?=E2=80=94=20quality=20improvements=20from=20eval?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RAG pipeline now includes a cross-encoder rerank step between retrieval and generation. The LLM re-sorts top-K results by relevance before they become context. Falls back to original order if model output is unparseable (~5% with 7B models). Also improved the generation prompt to be domain-aware ("staffing database") and request specific citations. Fixed 4 catalog manifests with bucket="data" (pre-federation leftover) that poisoned the entire DataFusion query context on startup. The "users", "lab_trials", "meta_runs", and "new_candidates" datasets now correctly reference bucket="primary". This bug was surfaced by the quality evaluation pipeline — wouldn't have been found by structural tests alone. Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/vectord/src/rag.rs | 93 ++++++- .../03c62234-f9f8-40e9-a27e-d5e09ab2713d.json | 103 ++++++++ .../0fd78303-9ad4-45fd-90d7-db95607d9ab1.json | 84 +++++- .../1339f3d6-7677-47fb-8182-5f8e43f27cde.json | 91 ++++++- .../14698884-071c-4adb-ae50-cfb8d885656c.json | 110 ++++++++ .../15d889ef-2dba-4d1f-8ce0-22f7def35155.json | 54 ++++ .../17119168-cfd2-43b2-a4dc-ff2a2c5fd086.json | 26 ++ .../1d8a065e-59c1-45ce-967b-398bc8370cbb.json | 119 ++++++++- .../36b0f141-93a1-4776-beef-b0b8d17a12a9.json | 96 +++++++ .../3b12ae24-17d4-4325-92bf-f3155982f3bf.json | 2 +- .../3c2579b4-f3f3-4875-95fa-58d8b49ad94c.json | 2 +- .../443d63f1-b0ed-4d4b-8e5a-ce59c097b97b.json | 110 ++++++++ .../57e557a8-4754-47c9-9636-e01d39fc20f2.json | 54 ++++ .../5a35646f-b357-4636-b88f-203aba0d7435.json | 61 +++++ .../765985e6-ea16-46b5-a6c0-ddaff346827f.json | 124 +++++++++ .../7ad27f97-622e-49e3-9c38-327cb2334fa1.json | 124 +++++++++ .../8192d934-fc90-46dd-b8bd-c443a4743b19.json | 96 +++++++ .../8621894c-f9c6-4eea-8e71-e80b9cdc9a36.json | 26 ++ .../8f37454b-27c8-4f61-aeca-8e48070db552.json | 96 +++++++ .../94a8bd16-6756-43af-b951-09a9e6b8300f.json | 56 +++- .../9c4d9116-1d9d-4afd-a8d1-c514a678e5fa.json | 70 ++++- .../a5858f94-267f-4382-ba32-0934e1b984f7.json | 103 ++++++++ .../ab2b610a-cee7-40e1-9dab-c709e2292709.json | 75 ++++++ .../b076cff9-c522-48fc-b892-b7cba7b29c9b.json | 243 ++++++++++++++++++ .../ce3fe55b-61cd-4ac4-95de-b91f9186d6e3.json | 96 +++++++ .../d35c7941-37e2-4bde-8226-5cf69c74931a.json | 70 ++++- .../e2a8f88a-59f6-40c7-a45b-e23d8f3533b6.json | 2 +- .../e7304f05-5278-4e17-961a-51f2588fd2aa.json | 15 +- .../e959ca11-9f6b-4843-864a-cc3f50a8aa60.json | 70 ++++- .../eb370ff1-c037-476f-8c3f-61b96b3a6046.json | 82 ++++++ .../ef6c0acf-de7a-4be0-877e-e0c2889f390f.json | 26 ++ .../f0b6f408-71a5-4365-bd1d-98c1e176096a.json | 2 +- .../f429ac10-bc7d-41cf-b30f-a9590760ee32.json | 124 +++++++++ 33 files changed, 2455 insertions(+), 50 deletions(-) create mode 100644 data/_catalog/manifests/03c62234-f9f8-40e9-a27e-d5e09ab2713d.json create mode 100644 data/_catalog/manifests/14698884-071c-4adb-ae50-cfb8d885656c.json create mode 100644 data/_catalog/manifests/15d889ef-2dba-4d1f-8ce0-22f7def35155.json create mode 100644 data/_catalog/manifests/17119168-cfd2-43b2-a4dc-ff2a2c5fd086.json create mode 100644 data/_catalog/manifests/36b0f141-93a1-4776-beef-b0b8d17a12a9.json create mode 100644 data/_catalog/manifests/443d63f1-b0ed-4d4b-8e5a-ce59c097b97b.json create mode 100644 data/_catalog/manifests/57e557a8-4754-47c9-9636-e01d39fc20f2.json create mode 100644 data/_catalog/manifests/5a35646f-b357-4636-b88f-203aba0d7435.json create mode 100644 data/_catalog/manifests/765985e6-ea16-46b5-a6c0-ddaff346827f.json create mode 100644 data/_catalog/manifests/7ad27f97-622e-49e3-9c38-327cb2334fa1.json create mode 100644 data/_catalog/manifests/8192d934-fc90-46dd-b8bd-c443a4743b19.json create mode 100644 data/_catalog/manifests/8621894c-f9c6-4eea-8e71-e80b9cdc9a36.json create mode 100644 data/_catalog/manifests/8f37454b-27c8-4f61-aeca-8e48070db552.json create mode 100644 data/_catalog/manifests/a5858f94-267f-4382-ba32-0934e1b984f7.json create mode 100644 data/_catalog/manifests/ab2b610a-cee7-40e1-9dab-c709e2292709.json create mode 100644 data/_catalog/manifests/b076cff9-c522-48fc-b892-b7cba7b29c9b.json create mode 100644 data/_catalog/manifests/ce3fe55b-61cd-4ac4-95de-b91f9186d6e3.json create mode 100644 data/_catalog/manifests/eb370ff1-c037-476f-8c3f-61b96b3a6046.json create mode 100644 data/_catalog/manifests/ef6c0acf-de7a-4be0-877e-e0c2889f390f.json create mode 100644 data/_catalog/manifests/f429ac10-bc7d-41cf-b30f-a9590760ee32.json diff --git a/crates/vectord/src/rag.rs b/crates/vectord/src/rag.rs index dd31256..74ee244 100644 --- a/crates/vectord/src/rag.rs +++ b/crates/vectord/src/rag.rs @@ -1,4 +1,10 @@ -/// RAG pipeline: question → embed → search → retrieve → generate answer. +/// RAG pipeline: question → embed → search → rerank → generate answer. +/// +/// The rerank step (added 2026-04-17) uses the LLM as a cross-encoder +/// between retrieval and generation. This catches cases where the +/// embedding model scores documents as similar but the content isn't +/// actually relevant to the question — a known weakness of small +/// general-purpose embed models on domain-specific text. use object_store::ObjectStore; use std::sync::Arc; @@ -7,6 +13,75 @@ use aibridge::client::{AiClient, EmbedRequest, GenerateRequest}; use crate::search::{self, SearchResult}; use crate::store; +/// Cross-encoder rerank: ask the LLM to re-sort retrieved chunks by +/// relevance. Falls back to the original order if the model returns +/// garbage (which happens ~5% of the time with 7B models). +async fn rerank( + question: &str, + mut results: Vec, + ai_client: &AiClient, +) -> Vec { + if results.len() <= 1 { + return results; + } + + let chunk_list: String = results.iter().enumerate().map(|(i, r)| { + let text: String = r.chunk_text.chars().take(200).collect(); + format!("[{i}] {text}") + }).collect::>().join("\n"); + + let resp = ai_client.generate(GenerateRequest { + prompt: format!( + "Rank these text chunks by relevance to the question.\n\ + Return ONLY a comma-separated list of indices, most relevant first.\n\n\ + Question: {question}\n\n\ + Chunks:\n{chunk_list}\n\n\ + Ranking:" + ), + model: None, + system: None, + temperature: Some(0.0), + max_tokens: Some(50), + }).await; + + match resp { + Ok(gen_resp) => { + let text = gen_resp.text.trim(); + let indices: Vec = text + .split(|c: char| c == ',' || c.is_whitespace()) + .filter_map(|s| s.trim().parse::().ok()) + .filter(|&i| i < results.len()) + .collect(); + + if indices.is_empty() { + tracing::debug!("reranker returned unparseable output: {text}"); + return results; + } + + let mut reranked: Vec = Vec::with_capacity(results.len()); + let mut used = vec![false; results.len()]; + for &i in &indices { + if !used[i] { + reranked.push(results[i].clone()); + used[i] = true; + } + } + // Append any the model didn't mention (preserves all results). + for (i, r) in results.drain(..).enumerate() { + if !used[i] { + reranked.push(r); + } + } + tracing::info!("reranker reordered {}/{} chunks", indices.len(), reranked.len()); + reranked + } + Err(e) => { + tracing::debug!("reranker failed (using original order): {e}"); + results + } + } +} + /// Full RAG answer with provenance. #[derive(Debug, Clone, serde::Serialize)] pub struct RagResponse { @@ -51,16 +126,24 @@ pub async fn query( }); } - // 3. Build context from retrieved chunks + // 3. Rerank: ask the LLM to sort retrieved chunks by relevance. + // This cross-encoder step catches cases where embedding similarity + // is high but semantic relevance is low — common with small embed + // models on domain-specific text. + let results = rerank(question, results, ai_client).await; + + // 4. Build context from (reranked) top chunks let context: String = results.iter().enumerate().map(|(i, r)| { format!("[{}] (source: {}, doc: {}) {}", i + 1, r.source, r.doc_id, r.chunk_text) }).collect::>().join("\n\n"); - // 4. Generate answer + // 5. Generate answer tracing::info!("RAG: generating answer from {} chunks", results.len()); let prompt = format!( - "You are a helpful assistant answering questions based on retrieved documents from a data system.\n\n\ - Use ONLY the following context to answer. If the context doesn't contain enough information, say so.\n\ + "You are a helpful assistant answering questions about a staffing database.\n\n\ + Use ONLY the following retrieved records to answer. Be specific — cite names,\n\ + numbers, skills, and cities from the records. If the context doesn't contain\n\ + enough information to fully answer, say what you can and note what's missing.\n\ Cite sources by their number [1], [2], etc.\n\n\ Context:\n{context}\n\n\ Question: {question}\n\n\ diff --git a/data/_catalog/manifests/03c62234-f9f8-40e9-a27e-d5e09ab2713d.json b/data/_catalog/manifests/03c62234-f9f8-40e9-a27e-d5e09ab2713d.json new file mode 100644 index 0000000..202dfbf --- /dev/null +++ b/data/_catalog/manifests/03c62234-f9f8-40e9-a27e-d5e09ab2713d.json @@ -0,0 +1,103 @@ +{ + "id": "03c62234-f9f8-40e9-a27e-d5e09ab2713d", + "name": "kb_pipeline_runs", + "schema_fingerprint": "c019b81feb58ff2aefe4cbe700056a100e25716f2aa8e8415ba8f20656812f75", + "objects": [ + { + "bucket": "primary", + "key": "datasets/kb_pipeline_runs.parquet", + "size_bytes": 1518415, + "created_at": "2026-04-17T02:54:43.281447563Z" + } + ], + "created_at": "2026-04-17T02:54:43.281448143Z", + "updated_at": "2026-04-17T02:54:43.281613238Z", + "description": "", + "owner": "", + "sensitivity": null, + "columns": [ + { + "name": "id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "pipeline", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "topic", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "status", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "steps", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "result", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "models_used", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "duration_ms", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "created_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "completed_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "postgresql", + "source_file": "dsn: postgresql://kbuser@localhost:5432/knowledge_base", + "ingest_job": "pg-stream-1776394483281", + "ingest_timestamp": "2026-04-17T02:54:43.281447563Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 195, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/0fd78303-9ad4-45fd-90d7-db95607d9ab1.json b/data/_catalog/manifests/0fd78303-9ad4-45fd-90d7-db95607d9ab1.json index 4c4e317..b286759 100644 --- a/data/_catalog/manifests/0fd78303-9ad4-45fd-90d7-db95607d9ab1.json +++ b/data/_catalog/manifests/0fd78303-9ad4-45fd-90d7-db95607d9ab1.json @@ -1,23 +1,97 @@ { "id": "0fd78303-9ad4-45fd-90d7-db95607d9ab1", "name": "timesheets", - "schema_fingerprint": "auto", + "schema_fingerprint": "806946533e133a4dd032f500644c9f702a46151fbb5bbd3b503c207dac86893b", "objects": [ { - "bucket": "data", + "bucket": "primary", "key": "datasets/timesheets.parquet", "size_bytes": 17539932, "created_at": "2026-03-27T14:42:43.922019299Z" } ], "created_at": "2026-03-27T14:42:43.922025703Z", - "updated_at": "2026-03-27T14:42:43.922025703Z", + "updated_at": "2026-04-17T02:45:58.282412872Z", "description": "", "owner": "", "sensitivity": null, - "columns": [], + "columns": [ + { + "name": "timesheet_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "placement_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "candidate_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "client_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "hours_regular", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "hours_overtime", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "bill_total", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "pay_total", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "week_ending", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "approved", + "data_type": "Boolean", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], "lineage": null, "freshness": null, "tags": [], - "row_count": null + "row_count": 1000000, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null } \ No newline at end of file diff --git a/data/_catalog/manifests/1339f3d6-7677-47fb-8182-5f8e43f27cde.json b/data/_catalog/manifests/1339f3d6-7677-47fb-8182-5f8e43f27cde.json index 9eb4d6d..6c8ae8a 100644 --- a/data/_catalog/manifests/1339f3d6-7677-47fb-8182-5f8e43f27cde.json +++ b/data/_catalog/manifests/1339f3d6-7677-47fb-8182-5f8e43f27cde.json @@ -1,23 +1,104 @@ { "id": "1339f3d6-7677-47fb-8182-5f8e43f27cde", "name": "job_orders", - "schema_fingerprint": "auto", + "schema_fingerprint": "579a0593ca7c0e22ba53702821757f189077e1ddff7fd592c0097feed446a99a", "objects": [ { - "bucket": "data", + "bucket": "primary", "key": "datasets/job_orders.parquet", "size_bytes": 905534, "created_at": "2026-03-27T14:42:38.935718195Z" } ], "created_at": "2026-03-27T14:42:38.935724058Z", - "updated_at": "2026-03-27T14:42:38.935724058Z", + "updated_at": "2026-04-17T02:45:57.664902260Z", "description": "", "owner": "", "sensitivity": null, - "columns": [], + "columns": [ + { + "name": "job_order_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "client_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "title", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "vertical", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "bill_rate", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "pay_rate", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "status", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "city", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "state", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "zip", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "description", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], "lineage": null, "freshness": null, "tags": [], - "row_count": null + "row_count": 15000, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null } \ No newline at end of file diff --git a/data/_catalog/manifests/14698884-071c-4adb-ae50-cfb8d885656c.json b/data/_catalog/manifests/14698884-071c-4adb-ae50-cfb8d885656c.json new file mode 100644 index 0000000..097d92e --- /dev/null +++ b/data/_catalog/manifests/14698884-071c-4adb-ae50-cfb8d885656c.json @@ -0,0 +1,110 @@ +{ + "id": "14698884-071c-4adb-ae50-cfb8d885656c", + "name": "kb_response_cache", + "schema_fingerprint": "c90d7be310b5025d2c4d398cf07692d8d9bb46ed591c0a87b339bafcac9ddeed", + "objects": [ + { + "bucket": "primary", + "key": "datasets/kb_response_cache.parquet", + "size_bytes": 8360233, + "created_at": "2026-04-17T02:54:43.145342968Z" + } + ], + "created_at": "2026-04-17T02:54:43.145343876Z", + "updated_at": "2026-04-17T02:54:43.145629696Z", + "description": "", + "owner": "", + "sensitivity": null, + "columns": [ + { + "name": "id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "cache_key", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "prompt", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "mode", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "models_used", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "best_run_id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "best_score", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "responses", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "hit_count", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "created_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "updated_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "postgresql", + "source_file": "dsn: postgresql://kbuser@localhost:5432/knowledge_base", + "ingest_job": "pg-stream-1776394483145", + "ingest_timestamp": "2026-04-17T02:54:43.145342968Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 195, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/15d889ef-2dba-4d1f-8ce0-22f7def35155.json b/data/_catalog/manifests/15d889ef-2dba-4d1f-8ce0-22f7def35155.json new file mode 100644 index 0000000..14cd43f --- /dev/null +++ b/data/_catalog/manifests/15d889ef-2dba-4d1f-8ce0-22f7def35155.json @@ -0,0 +1,54 @@ +{ + "id": "15d889ef-2dba-4d1f-8ce0-22f7def35155", + "name": "scanned_resume", + "schema_fingerprint": "d3bc44bcafc990711442db3eecf25c0758232e3fccf6345200b1813fae5868c2", + "objects": [ + { + "bucket": "primary", + "key": "datasets/scanned_resume.parquet", + "size_bytes": 3097, + "created_at": "2026-04-17T01:44:33.927277187Z" + } + ], + "created_at": "2026-04-17T01:44:33.927280048Z", + "updated_at": "2026-04-17T01:44:33.927528900Z", + "description": "", + "owner": "", + "sensitivity": null, + "columns": [ + { + "name": "source_file", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "page_number", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "text_content", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "pdf", + "source_file": "test_scanned_resume.pdf", + "ingest_job": "ingest-1776390273927", + "ingest_timestamp": "2026-04-17T01:44:33.927277187Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 1, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/17119168-cfd2-43b2-a4dc-ff2a2c5fd086.json b/data/_catalog/manifests/17119168-cfd2-43b2-a4dc-ff2a2c5fd086.json new file mode 100644 index 0000000..2b8da65 --- /dev/null +++ b/data/_catalog/manifests/17119168-cfd2-43b2-a4dc-ff2a2c5fd086.json @@ -0,0 +1,26 @@ +{ + "id": "17119168-cfd2-43b2-a4dc-ff2a2c5fd086", + "name": "demo_customers", + "schema_fingerprint": "7af9708f2366f790e0ae03db6f681f5a4d4b8af4d1527b091bedfee1be81d304", + "objects": [ + { + "bucket": "primary", + "key": "datasets/demo_customers.parquet", + "size_bytes": 4583, + "created_at": "2026-04-17T01:35:00.165218923Z" + } + ], + "created_at": "2026-04-17T01:35:00.165220510Z", + "updated_at": "2026-04-17T01:35:00.165220510Z", + "description": "", + "owner": "", + "sensitivity": null, + "columns": [], + "lineage": null, + "freshness": null, + "tags": [], + "row_count": null, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/1d8a065e-59c1-45ce-967b-398bc8370cbb.json b/data/_catalog/manifests/1d8a065e-59c1-45ce-967b-398bc8370cbb.json index f859512..4cb601b 100644 --- a/data/_catalog/manifests/1d8a065e-59c1-45ce-967b-398bc8370cbb.json +++ b/data/_catalog/manifests/1d8a065e-59c1-45ce-967b-398bc8370cbb.json @@ -1,23 +1,132 @@ { "id": "1d8a065e-59c1-45ce-967b-398bc8370cbb", "name": "candidates", - "schema_fingerprint": "auto", + "schema_fingerprint": "206360ab312e4c13679ed0ab4ccb3702250cc3e2176cc669d41471584c556c73", "objects": [ { - "bucket": "data", + "bucket": "primary", "key": "datasets/candidates.parquet", "size_bytes": 10592165, "created_at": "2026-03-27T14:42:38.823368759Z" } ], "created_at": "2026-03-27T14:42:38.823374843Z", - "updated_at": "2026-03-27T14:42:38.823374843Z", + "updated_at": "2026-04-17T02:45:57.722237378Z", "description": "", "owner": "", "sensitivity": null, - "columns": [], + "columns": [ + { + "name": "candidate_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "first_name", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "last_name", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "email", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "phone", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "city", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "state", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "zip", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "vertical", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "skills", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "resume_summary", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "status", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "source", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "min_pay_rate", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "years_experience", + "data_type": "Int64", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], "lineage": null, "freshness": null, "tags": [], - "row_count": null + "row_count": 100000, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null } \ No newline at end of file diff --git a/data/_catalog/manifests/36b0f141-93a1-4776-beef-b0b8d17a12a9.json b/data/_catalog/manifests/36b0f141-93a1-4776-beef-b0b8d17a12a9.json new file mode 100644 index 0000000..ba39cd9 --- /dev/null +++ b/data/_catalog/manifests/36b0f141-93a1-4776-beef-b0b8d17a12a9.json @@ -0,0 +1,96 @@ +{ + "id": "36b0f141-93a1-4776-beef-b0b8d17a12a9", + "name": "demo_customers", + "schema_fingerprint": "7af9708f2366f790e0ae03db6f681f5a4d4b8af4d1527b091bedfee1be81d304", + "objects": [ + { + "bucket": "primary", + "key": "datasets/demo_customers.parquet", + "size_bytes": 4583, + "created_at": "2026-04-17T01:35:01.636480725Z" + } + ], + "created_at": "2026-04-17T01:35:01.636481698Z", + "updated_at": "2026-04-17T01:35:01.636776595Z", + "description": "", + "owner": "", + "sensitivity": "pii", + "columns": [ + { + "name": "id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "name", + "data_type": "Utf8", + "sensitivity": "pii", + "description": "", + "is_pii": true + }, + { + "name": "email", + "data_type": "Utf8", + "sensitivity": "pii", + "description": "", + "is_pii": true + }, + { + "name": "city", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "tier", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "lifetime_spend", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "is_active", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "signed_up_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "notes", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "mysql", + "source_file": "dsn: mysql://lh:***@127.0.0.1:3306/lh_demo", + "ingest_job": "scheduled-mysql-1776389701636", + "ingest_timestamp": "2026-04-17T01:35:01.636480725Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 11, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/3b12ae24-17d4-4325-92bf-f3155982f3bf.json b/data/_catalog/manifests/3b12ae24-17d4-4325-92bf-f3155982f3bf.json index cf0a039..b2004bf 100644 --- a/data/_catalog/manifests/3b12ae24-17d4-4325-92bf-f3155982f3bf.json +++ b/data/_catalog/manifests/3b12ae24-17d4-4325-92bf-f3155982f3bf.json @@ -4,7 +4,7 @@ "schema_fingerprint": "30c0e31f0963e6f4af02131bbb9ea246fbbd068b849b833565a4b28211fbc90b", "objects": [ { - "bucket": "data", + "bucket": "primary", "key": "datasets/users.parquet", "size_bytes": 2012, "created_at": "2026-03-28T01:38:59.904968123Z" diff --git a/data/_catalog/manifests/3c2579b4-f3f3-4875-95fa-58d8b49ad94c.json b/data/_catalog/manifests/3c2579b4-f3f3-4875-95fa-58d8b49ad94c.json index af6411b..79a054a 100644 --- a/data/_catalog/manifests/3c2579b4-f3f3-4875-95fa-58d8b49ad94c.json +++ b/data/_catalog/manifests/3c2579b4-f3f3-4875-95fa-58d8b49ad94c.json @@ -4,7 +4,7 @@ "schema_fingerprint": "68f2c0d7a3ceb0aaa3c17c64900704519c72d213161bc9e5179c42ee53f6d0df", "objects": [ { - "bucket": "data", + "bucket": "primary", "key": "datasets/meta_runs.parquet", "size_bytes": 729773, "created_at": "2026-03-28T01:38:57.380576453Z" diff --git a/data/_catalog/manifests/443d63f1-b0ed-4d4b-8e5a-ce59c097b97b.json b/data/_catalog/manifests/443d63f1-b0ed-4d4b-8e5a-ce59c097b97b.json new file mode 100644 index 0000000..7196985 --- /dev/null +++ b/data/_catalog/manifests/443d63f1-b0ed-4d4b-8e5a-ce59c097b97b.json @@ -0,0 +1,110 @@ +{ + "id": "443d63f1-b0ed-4d4b-8e5a-ce59c097b97b", + "name": "kb_memory_entries", + "schema_fingerprint": "15dbebd0abb906577e11cbb73083abeb8961b8c603a2472cafa8c46100c3fb1c", + "objects": [ + { + "bucket": "primary", + "key": "datasets/kb_memory_entries.parquet", + "size_bytes": 8795, + "created_at": "2026-04-17T02:54:43.330897823Z" + } + ], + "created_at": "2026-04-17T02:54:43.330898330Z", + "updated_at": "2026-04-17T02:54:43.330972517Z", + "description": "", + "owner": "", + "sensitivity": null, + "columns": [ + { + "name": "id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "content", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "content_type", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "source", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "session_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "embedding", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "metadata", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "importance_score", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "created_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "accessed_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "access_count", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "postgresql", + "source_file": "dsn: postgresql://kbuser@localhost:5432/knowledge_base", + "ingest_job": "pg-stream-1776394483330", + "ingest_timestamp": "2026-04-17T02:54:43.330897823Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 13, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/57e557a8-4754-47c9-9636-e01d39fc20f2.json b/data/_catalog/manifests/57e557a8-4754-47c9-9636-e01d39fc20f2.json new file mode 100644 index 0000000..8189d00 --- /dev/null +++ b/data/_catalog/manifests/57e557a8-4754-47c9-9636-e01d39fc20f2.json @@ -0,0 +1,54 @@ +{ + "id": "57e557a8-4754-47c9-9636-e01d39fc20f2", + "name": "animals", + "schema_fingerprint": "b024b907f151889aca5407fc46fe1821d2600f587cb1a26742843597a66436f1", + "objects": [ + { + "bucket": "primary", + "key": "datasets/animals.parquet", + "size_bytes": 1107, + "created_at": "2026-04-16T13:47:57.033455363Z" + } + ], + "created_at": "2026-04-16T13:47:57.033456007Z", + "updated_at": "2026-04-16T13:47:57.033631067Z", + "description": "", + "owner": "", + "sensitivity": null, + "columns": [ + { + "name": "animal_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "species", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "habitat", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "csv", + "source_file": "animals.csv", + "ingest_job": "ingest-1776347277033", + "ingest_timestamp": "2026-04-16T13:47:57.033455363Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 3, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/5a35646f-b357-4636-b88f-203aba0d7435.json b/data/_catalog/manifests/5a35646f-b357-4636-b88f-203aba0d7435.json new file mode 100644 index 0000000..227e458 --- /dev/null +++ b/data/_catalog/manifests/5a35646f-b357-4636-b88f-203aba0d7435.json @@ -0,0 +1,61 @@ +{ + "id": "5a35646f-b357-4636-b88f-203aba0d7435", + "name": "people_test", + "schema_fingerprint": "bba35b6cd1e399dc0504c760c767ab391f7ed81a41573c15b3bd4eb0e3bcc382", + "objects": [ + { + "bucket": "testing", + "key": "datasets/people_test.parquet", + "size_bytes": 1402, + "created_at": "2026-04-16T13:47:32.669517398Z" + } + ], + "created_at": "2026-04-16T13:47:32.669518210Z", + "updated_at": "2026-04-16T13:47:32.669787451Z", + "description": "", + "owner": "", + "sensitivity": "pii", + "columns": [ + { + "name": "person_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "name", + "data_type": "Utf8", + "sensitivity": "pii", + "description": "", + "is_pii": true + }, + { + "name": "role", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "city", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "csv", + "source_file": "people.csv", + "ingest_job": "ingest-1776347252669", + "ingest_timestamp": "2026-04-16T13:47:32.669517398Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 3, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/765985e6-ea16-46b5-a6c0-ddaff346827f.json b/data/_catalog/manifests/765985e6-ea16-46b5-a6c0-ddaff346827f.json new file mode 100644 index 0000000..4120567 --- /dev/null +++ b/data/_catalog/manifests/765985e6-ea16-46b5-a6c0-ddaff346827f.json @@ -0,0 +1,124 @@ +{ + "id": "765985e6-ea16-46b5-a6c0-ddaff346827f", + "name": "kb_meta_pipelines", + "schema_fingerprint": "cabe1d7fc442e1dfcfaabf663509d590c6edc84b445a91acaf0ae68b94aff518", + "objects": [ + { + "bucket": "primary", + "key": "datasets/kb_meta_pipelines.parquet", + "size_bytes": 32085, + "created_at": "2026-04-17T02:54:43.307966396Z" + } + ], + "created_at": "2026-04-17T02:54:43.307966799Z", + "updated_at": "2026-04-17T02:54:43.308181528Z", + "description": "", + "owner": "", + "sensitivity": "pii", + "columns": [ + { + "name": "id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "name", + "data_type": "Utf8", + "sensitivity": "pii", + "description": "", + "is_pii": true + }, + { + "name": "data_source", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "stages", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "status", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "current_stage", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "total_stages", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "results", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "best_score", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "iterations", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "config", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "created_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "updated_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "postgresql", + "source_file": "dsn: postgresql://kbuser@localhost:5432/knowledge_base", + "ingest_job": "pg-stream-1776394483307", + "ingest_timestamp": "2026-04-17T02:54:43.307966396Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 6, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/7ad27f97-622e-49e3-9c38-327cb2334fa1.json b/data/_catalog/manifests/7ad27f97-622e-49e3-9c38-327cb2334fa1.json new file mode 100644 index 0000000..b0dcda4 --- /dev/null +++ b/data/_catalog/manifests/7ad27f97-622e-49e3-9c38-327cb2334fa1.json @@ -0,0 +1,124 @@ +{ + "id": "7ad27f97-622e-49e3-9c38-327cb2334fa1", + "name": "kb_team_runs", + "schema_fingerprint": "d704ee2b9b434774aed2258da2fddcdcbab226547a011ba24d4281253657bdd3", + "objects": [ + { + "bucket": "primary", + "key": "datasets/kb_team_runs.parquet", + "size_bytes": 19185025, + "created_at": "2026-04-17T02:54:43.043742588Z" + } + ], + "created_at": "2026-04-17T02:54:43.043743165Z", + "updated_at": "2026-04-17T02:54:43.043968762Z", + "description": "", + "owner": "", + "sensitivity": null, + "columns": [ + { + "name": "id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "mode", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "prompt", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "config", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "responses", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "models_used", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "created_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "archived", + "data_type": "Boolean", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "tags", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "notes", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "quality_score", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "score_method", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "score_metadata", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "postgresql", + "source_file": "dsn: postgresql://kbuser@localhost:5432/knowledge_base", + "ingest_job": "pg-stream-1776394483043", + "ingest_timestamp": "2026-04-17T02:54:43.043742588Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 588, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/8192d934-fc90-46dd-b8bd-c443a4743b19.json b/data/_catalog/manifests/8192d934-fc90-46dd-b8bd-c443a4743b19.json new file mode 100644 index 0000000..4028f96 --- /dev/null +++ b/data/_catalog/manifests/8192d934-fc90-46dd-b8bd-c443a4743b19.json @@ -0,0 +1,96 @@ +{ + "id": "8192d934-fc90-46dd-b8bd-c443a4743b19", + "name": "demo_customers", + "schema_fingerprint": "7af9708f2366f790e0ae03db6f681f5a4d4b8af4d1527b091bedfee1be81d304", + "objects": [ + { + "bucket": "primary", + "key": "datasets/demo_customers.parquet", + "size_bytes": 4519, + "created_at": "2026-04-17T01:00:48.361437263Z" + } + ], + "created_at": "2026-04-17T01:00:48.361437860Z", + "updated_at": "2026-04-17T01:00:48.361681724Z", + "description": "", + "owner": "", + "sensitivity": "pii", + "columns": [ + { + "name": "id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "name", + "data_type": "Utf8", + "sensitivity": "pii", + "description": "", + "is_pii": true + }, + { + "name": "email", + "data_type": "Utf8", + "sensitivity": "pii", + "description": "", + "is_pii": true + }, + { + "name": "city", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "tier", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "lifetime_spend", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "is_active", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "signed_up_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "notes", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "mysql", + "source_file": "dsn: mysql://lh:***@127.0.0.1:3306/lh_demo", + "ingest_job": "mysql-stream-1776387648361", + "ingest_timestamp": "2026-04-17T01:00:48.361437263Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 10, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/8621894c-f9c6-4eea-8e71-e80b9cdc9a36.json b/data/_catalog/manifests/8621894c-f9c6-4eea-8e71-e80b9cdc9a36.json new file mode 100644 index 0000000..2c46f4f --- /dev/null +++ b/data/_catalog/manifests/8621894c-f9c6-4eea-8e71-e80b9cdc9a36.json @@ -0,0 +1,26 @@ +{ + "id": "8621894c-f9c6-4eea-8e71-e80b9cdc9a36", + "name": "demo_customers", + "schema_fingerprint": "7af9708f2366f790e0ae03db6f681f5a4d4b8af4d1527b091bedfee1be81d304", + "objects": [ + { + "bucket": "primary", + "key": "datasets/demo_customers.parquet", + "size_bytes": 4583, + "created_at": "2026-04-17T01:34:30.158848713Z" + } + ], + "created_at": "2026-04-17T01:34:30.158849587Z", + "updated_at": "2026-04-17T01:34:30.158849587Z", + "description": "", + "owner": "", + "sensitivity": null, + "columns": [], + "lineage": null, + "freshness": null, + "tags": [], + "row_count": null, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/8f37454b-27c8-4f61-aeca-8e48070db552.json b/data/_catalog/manifests/8f37454b-27c8-4f61-aeca-8e48070db552.json new file mode 100644 index 0000000..a38d647 --- /dev/null +++ b/data/_catalog/manifests/8f37454b-27c8-4f61-aeca-8e48070db552.json @@ -0,0 +1,96 @@ +{ + "id": "8f37454b-27c8-4f61-aeca-8e48070db552", + "name": "demo_customers", + "schema_fingerprint": "7af9708f2366f790e0ae03db6f681f5a4d4b8af4d1527b091bedfee1be81d304", + "objects": [ + { + "bucket": "primary", + "key": "datasets/demo_customers.parquet", + "size_bytes": 4519, + "created_at": "2026-04-17T01:34:00.160615345Z" + } + ], + "created_at": "2026-04-17T01:34:00.160616631Z", + "updated_at": "2026-04-17T02:45:58.283076939Z", + "description": "", + "owner": "", + "sensitivity": "pii", + "columns": [ + { + "name": "id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "name", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "email", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "city", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "tier", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "lifetime_spend", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "is_active", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "signed_up_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "notes", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "mysql", + "source_file": "dsn: mysql://lh:***@127.0.0.1:3306/lh_demo", + "ingest_job": "scheduled-mysql-1776389700165", + "ingest_timestamp": "2026-04-17T01:35:00.165218923Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 11, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/94a8bd16-6756-43af-b951-09a9e6b8300f.json b/data/_catalog/manifests/94a8bd16-6756-43af-b951-09a9e6b8300f.json index 1d85e26..0a9bc55 100644 --- a/data/_catalog/manifests/94a8bd16-6756-43af-b951-09a9e6b8300f.json +++ b/data/_catalog/manifests/94a8bd16-6756-43af-b951-09a9e6b8300f.json @@ -1,23 +1,69 @@ { "id": "94a8bd16-6756-43af-b951-09a9e6b8300f", "name": "clients", - "schema_fingerprint": "auto", + "schema_fingerprint": "1ba20a49d967c85c41640a9b9a61287ae9fa483daf6c71a197591fe3228c76d2", "objects": [ { - "bucket": "data", + "bucket": "primary", "key": "datasets/clients.parquet", "size_bytes": 21971, "created_at": "2026-03-27T14:42:38.830329102Z" } ], "created_at": "2026-03-27T14:42:38.830331694Z", - "updated_at": "2026-03-27T14:42:38.830331694Z", + "updated_at": "2026-04-17T02:45:58.138164775Z", "description": "", "owner": "", "sensitivity": null, - "columns": [], + "columns": [ + { + "name": "client_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "company_name", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "vertical", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "city", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "state", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "zip", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], "lineage": null, "freshness": null, "tags": [], - "row_count": null + "row_count": 2000, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null } \ No newline at end of file diff --git a/data/_catalog/manifests/9c4d9116-1d9d-4afd-a8d1-c514a678e5fa.json b/data/_catalog/manifests/9c4d9116-1d9d-4afd-a8d1-c514a678e5fa.json index 0c8aa91..c3c0657 100644 --- a/data/_catalog/manifests/9c4d9116-1d9d-4afd-a8d1-c514a678e5fa.json +++ b/data/_catalog/manifests/9c4d9116-1d9d-4afd-a8d1-c514a678e5fa.json @@ -1,23 +1,83 @@ { "id": "9c4d9116-1d9d-4afd-a8d1-c514a678e5fa", "name": "call_log", - "schema_fingerprint": "auto", + "schema_fingerprint": "d1f82ed776afc484747f8f58b30ccd00d10bca73703b1479a67647f98eeff7e2", "objects": [ { - "bucket": "data", + "bucket": "primary", "key": "datasets/call_log.parquet", "size_bytes": 35951077, "created_at": "2026-03-27T14:42:47.395548205Z" } ], "created_at": "2026-03-27T14:42:47.395555326Z", - "updated_at": "2026-03-27T14:42:47.395555326Z", + "updated_at": "2026-04-17T02:45:58.137293143Z", "description": "", "owner": "", "sensitivity": null, - "columns": [], + "columns": [ + { + "name": "call_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "from_number", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "to_number", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "duration_seconds", + "data_type": "Int64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "timestamp", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "recruiter", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "candidate_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "disposition", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], "lineage": null, "freshness": null, "tags": [], - "row_count": null + "row_count": 800000, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null } \ No newline at end of file diff --git a/data/_catalog/manifests/a5858f94-267f-4382-ba32-0934e1b984f7.json b/data/_catalog/manifests/a5858f94-267f-4382-ba32-0934e1b984f7.json new file mode 100644 index 0000000..4c66f53 --- /dev/null +++ b/data/_catalog/manifests/a5858f94-267f-4382-ba32-0934e1b984f7.json @@ -0,0 +1,103 @@ +{ + "id": "a5858f94-267f-4382-ba32-0934e1b984f7", + "name": "kb_lab_trials", + "schema_fingerprint": "1d5782349402439a7e44efd0ccab9ae64ac3044221adef9e828b60b8bbb44dd5", + "objects": [ + { + "bucket": "primary", + "key": "datasets/kb_lab_trials.parquet", + "size_bytes": 68221, + "created_at": "2026-04-17T02:54:43.423304357Z" + } + ], + "created_at": "2026-04-17T02:54:43.423304931Z", + "updated_at": "2026-04-17T02:54:43.423490708Z", + "description": "", + "owner": "", + "sensitivity": null, + "columns": [ + { + "name": "id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "experiment_id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "trial_num", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "config_diff", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "config_snapshot", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "scores", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "avg_score", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "improved", + "data_type": "Boolean", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "duration_ms", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "created_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "postgresql", + "source_file": "dsn: postgresql://kbuser@localhost:5432/knowledge_base", + "ingest_job": "pg-stream-1776394483423", + "ingest_timestamp": "2026-04-17T02:54:43.423304357Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 41, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/ab2b610a-cee7-40e1-9dab-c709e2292709.json b/data/_catalog/manifests/ab2b610a-cee7-40e1-9dab-c709e2292709.json new file mode 100644 index 0000000..2b0c9e9 --- /dev/null +++ b/data/_catalog/manifests/ab2b610a-cee7-40e1-9dab-c709e2292709.json @@ -0,0 +1,75 @@ +{ + "id": "ab2b610a-cee7-40e1-9dab-c709e2292709", + "name": "kb_self_reports", + "schema_fingerprint": "76382b028b92bb0b361306dced3d773af0ee8de6aa160cedfc4d649f2860167a", + "objects": [ + { + "bucket": "primary", + "key": "datasets/kb_self_reports.parquet", + "size_bytes": 51134, + "created_at": "2026-04-17T02:54:43.335457986Z" + } + ], + "created_at": "2026-04-17T02:54:43.335458749Z", + "updated_at": "2026-04-17T02:54:43.335536869Z", + "description": "", + "owner": "", + "sensitivity": null, + "columns": [ + { + "name": "id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "report_type", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "model", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "report", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "data_size", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "created_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "postgresql", + "source_file": "dsn: postgresql://kbuser@localhost:5432/knowledge_base", + "ingest_job": "pg-stream-1776394483335", + "ingest_timestamp": "2026-04-17T02:54:43.335457986Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 11, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/b076cff9-c522-48fc-b892-b7cba7b29c9b.json b/data/_catalog/manifests/b076cff9-c522-48fc-b892-b7cba7b29c9b.json new file mode 100644 index 0000000..4623cef --- /dev/null +++ b/data/_catalog/manifests/b076cff9-c522-48fc-b892-b7cba7b29c9b.json @@ -0,0 +1,243 @@ +{ + "id": "b076cff9-c522-48fc-b892-b7cba7b29c9b", + "name": "kb_threat_intel", + "schema_fingerprint": "df1e126046147b3de42086880e10c3501a3a615ecddf336bc24957a24c321241", + "objects": [ + { + "bucket": "primary", + "key": "datasets/kb_threat_intel.parquet", + "size_bytes": 247112, + "created_at": "2026-04-17T02:54:43.321496407Z" + } + ], + "created_at": "2026-04-17T02:54:43.321497035Z", + "updated_at": "2026-04-17T02:54:43.321758360Z", + "description": "", + "owner": "", + "sensitivity": null, + "columns": [ + { + "name": "id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "ip", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "threat_level", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "classification", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "confidence", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "summary", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "indicators", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "recommendation", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "pattern", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "attack_type", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "likely_automated", + "data_type": "Boolean", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "country", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "country_code", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "city", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "isp", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "org", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "asn", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "is_proxy", + "data_type": "Boolean", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "is_hosting", + "data_type": "Boolean", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "open_ports", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "blocklist_count", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "blocklist_total", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "blocklists_blocked", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "reverse_dns", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "traceroute", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "log_count", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "banned", + "data_type": "Boolean", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "enriched_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "updated_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "raw_data", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "postgresql", + "source_file": "dsn: postgresql://kbuser@localhost:5432/knowledge_base", + "ingest_job": "pg-stream-1776394483321", + "ingest_timestamp": "2026-04-17T02:54:43.321496407Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 54, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/ce3fe55b-61cd-4ac4-95de-b91f9186d6e3.json b/data/_catalog/manifests/ce3fe55b-61cd-4ac4-95de-b91f9186d6e3.json new file mode 100644 index 0000000..21c9f89 --- /dev/null +++ b/data/_catalog/manifests/ce3fe55b-61cd-4ac4-95de-b91f9186d6e3.json @@ -0,0 +1,96 @@ +{ + "id": "ce3fe55b-61cd-4ac4-95de-b91f9186d6e3", + "name": "kb_meta_runs", + "schema_fingerprint": "68f2c0d7a3ceb0aaa3c17c64900704519c72d213161bc9e5179c42ee53f6d0df", + "objects": [ + { + "bucket": "primary", + "key": "datasets/kb_meta_runs.parquet", + "size_bytes": 886387, + "created_at": "2026-04-17T02:54:43.299467359Z" + } + ], + "created_at": "2026-04-17T02:54:43.299468094Z", + "updated_at": "2026-04-17T02:54:43.299661206Z", + "description": "", + "owner": "", + "sensitivity": null, + "columns": [ + { + "name": "id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "pipeline_id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "iteration", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "stage_results", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "final_output", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "score", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "model_config", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "duration_ms", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "created_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "postgresql", + "source_file": "dsn: postgresql://kbuser@localhost:5432/knowledge_base", + "ingest_job": "pg-stream-1776394483299", + "ingest_timestamp": "2026-04-17T02:54:43.299467359Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 48, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/d35c7941-37e2-4bde-8226-5cf69c74931a.json b/data/_catalog/manifests/d35c7941-37e2-4bde-8226-5cf69c74931a.json index f27405d..c7accdd 100644 --- a/data/_catalog/manifests/d35c7941-37e2-4bde-8226-5cf69c74931a.json +++ b/data/_catalog/manifests/d35c7941-37e2-4bde-8226-5cf69c74931a.json @@ -1,23 +1,83 @@ { "id": "d35c7941-37e2-4bde-8226-5cf69c74931a", "name": "placements", - "schema_fingerprint": "auto", + "schema_fingerprint": "962535a1810c07183a882c15a76a11a02ad3afef844e71975b0eaa48943487f6", "objects": [ { - "bucket": "data", + "bucket": "primary", "key": "datasets/placements.parquet", "size_bytes": 1213820, "created_at": "2026-03-27T14:42:39.040983450Z" } ], "created_at": "2026-03-27T14:42:39.040989351Z", - "updated_at": "2026-03-27T14:42:39.040989351Z", + "updated_at": "2026-04-17T02:45:58.147407036Z", "description": "", "owner": "", "sensitivity": null, - "columns": [], + "columns": [ + { + "name": "placement_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "candidate_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "job_order_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "client_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "bill_rate", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "pay_rate", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "recruiter", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "status", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], "lineage": null, "freshness": null, "tags": [], - "row_count": null + "row_count": 50000, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null } \ No newline at end of file diff --git a/data/_catalog/manifests/e2a8f88a-59f6-40c7-a45b-e23d8f3533b6.json b/data/_catalog/manifests/e2a8f88a-59f6-40c7-a45b-e23d8f3533b6.json index 75ab43b..b1d3a3a 100644 --- a/data/_catalog/manifests/e2a8f88a-59f6-40c7-a45b-e23d8f3533b6.json +++ b/data/_catalog/manifests/e2a8f88a-59f6-40c7-a45b-e23d8f3533b6.json @@ -4,7 +4,7 @@ "schema_fingerprint": "1d5782349402439a7e44efd0ccab9ae64ac3044221adef9e828b60b8bbb44dd5", "objects": [ { - "bucket": "data", + "bucket": "primary", "key": "datasets/lab_trials.parquet", "size_bytes": 64646, "created_at": "2026-03-28T01:14:03.026116573Z" diff --git a/data/_catalog/manifests/e7304f05-5278-4e17-961a-51f2588fd2aa.json b/data/_catalog/manifests/e7304f05-5278-4e17-961a-51f2588fd2aa.json index e9de25d..5cf4d4e 100644 --- a/data/_catalog/manifests/e7304f05-5278-4e17-961a-51f2588fd2aa.json +++ b/data/_catalog/manifests/e7304f05-5278-4e17-961a-51f2588fd2aa.json @@ -4,14 +4,14 @@ "schema_fingerprint": "df1e126046147b3de42086880e10c3501a3a615ecddf336bc24957a24c321241", "objects": [ { - "bucket": "data", + "bucket": "primary", "key": "datasets/threat_intel.parquet", "size_bytes": 111130, "created_at": "2026-03-28T01:14:03.054140697Z" } ], "created_at": "2026-03-28T01:14:03.054141294Z", - "updated_at": "2026-03-28T01:14:03.054427047Z", + "updated_at": "2026-04-17T02:45:57.723324051Z", "description": "", "owner": "", "sensitivity": null, @@ -229,12 +229,15 @@ ], "lineage": { "source_system": "postgresql", - "source_file": "127.0.0.1:5432/knowledge_base.threat_intel", - "ingest_job": "pg-import-1774660443054", - "ingest_timestamp": "2026-03-28T01:14:03.054140697Z", + "source_file": "dsn: postgresql://postgres@127.0.0.1:5432/knowledge_base", + "ingest_job": "pg-stream-1776326353882", + "ingest_timestamp": "2026-04-16T07:59:13.882669337Z", "parent_datasets": [] }, "freshness": null, "tags": [], - "row_count": 20 + "row_count": 20, + "last_embedded_at": "2026-04-16T15:08:32.348412159Z", + "embedding_stale_since": null, + "embedding_refresh_policy": null } \ No newline at end of file diff --git a/data/_catalog/manifests/e959ca11-9f6b-4843-864a-cc3f50a8aa60.json b/data/_catalog/manifests/e959ca11-9f6b-4843-864a-cc3f50a8aa60.json index 4c04bd0..0cd4c11 100644 --- a/data/_catalog/manifests/e959ca11-9f6b-4843-864a-cc3f50a8aa60.json +++ b/data/_catalog/manifests/e959ca11-9f6b-4843-864a-cc3f50a8aa60.json @@ -1,23 +1,83 @@ { "id": "e959ca11-9f6b-4843-864a-cc3f50a8aa60", "name": "email_log", - "schema_fingerprint": "auto", + "schema_fingerprint": "b28cbe711bba32611306f9a3186f67c1510c3e39b0c8812d765fe2b44bd65634", "objects": [ { - "bucket": "data", + "bucket": "primary", "key": "datasets/email_log.parquet", "size_bytes": 16768671, "created_at": "2026-03-27T14:42:49.271082991Z" } ], "created_at": "2026-03-27T14:42:49.271091077Z", - "updated_at": "2026-03-27T14:42:49.271091077Z", + "updated_at": "2026-04-17T02:45:57.881479855Z", "description": "", "owner": "", "sensitivity": null, - "columns": [], + "columns": [ + { + "name": "email_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "from_addr", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "to_addr", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "subject", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "timestamp", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "recruiter", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "candidate_id", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "opened", + "data_type": "Boolean", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], "lineage": null, "freshness": null, "tags": [], - "row_count": null + "row_count": 500000, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null } \ No newline at end of file diff --git a/data/_catalog/manifests/eb370ff1-c037-476f-8c3f-61b96b3a6046.json b/data/_catalog/manifests/eb370ff1-c037-476f-8c3f-61b96b3a6046.json new file mode 100644 index 0000000..86bf261 --- /dev/null +++ b/data/_catalog/manifests/eb370ff1-c037-476f-8c3f-61b96b3a6046.json @@ -0,0 +1,82 @@ +{ + "id": "eb370ff1-c037-476f-8c3f-61b96b3a6046", + "name": "kb_response_cache_history", + "schema_fingerprint": "9a05c209a51f9543bd7dc9387695b0c67a6abc135b53eaeec458140712bfcf50", + "objects": [ + { + "bucket": "primary", + "key": "datasets/kb_response_cache_history.parquet", + "size_bytes": 8292303, + "created_at": "2026-04-17T02:54:43.242879858Z" + } + ], + "created_at": "2026-04-17T02:54:43.242880413Z", + "updated_at": "2026-04-17T02:54:43.243017393Z", + "description": "", + "owner": "", + "sensitivity": null, + "columns": [ + { + "name": "id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "cache_key", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "run_id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "score", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "responses", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "models_used", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "recorded_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "postgresql", + "source_file": "dsn: postgresql://kbuser@localhost:5432/knowledge_base", + "ingest_job": "pg-stream-1776394483242", + "ingest_timestamp": "2026-04-17T02:54:43.242879858Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 205, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/ef6c0acf-de7a-4be0-877e-e0c2889f390f.json b/data/_catalog/manifests/ef6c0acf-de7a-4be0-877e-e0c2889f390f.json new file mode 100644 index 0000000..e077d0b --- /dev/null +++ b/data/_catalog/manifests/ef6c0acf-de7a-4be0-877e-e0c2889f390f.json @@ -0,0 +1,26 @@ +{ + "id": "ef6c0acf-de7a-4be0-877e-e0c2889f390f", + "name": "threat_intel", + "schema_fingerprint": "df1e126046147b3de42086880e10c3501a3a615ecddf336bc24957a24c321241", + "objects": [ + { + "bucket": "primary", + "key": "datasets/threat_intel.parquet", + "size_bytes": 247112, + "created_at": "2026-04-16T07:59:13.882669337Z" + } + ], + "created_at": "2026-04-16T07:59:13.882669807Z", + "updated_at": "2026-04-16T07:59:13.882669807Z", + "description": "", + "owner": "", + "sensitivity": null, + "columns": [], + "lineage": null, + "freshness": null, + "tags": [], + "row_count": null, + "last_embedded_at": null, + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file diff --git a/data/_catalog/manifests/f0b6f408-71a5-4365-bd1d-98c1e176096a.json b/data/_catalog/manifests/f0b6f408-71a5-4365-bd1d-98c1e176096a.json index f3bb6ca..159805a 100644 --- a/data/_catalog/manifests/f0b6f408-71a5-4365-bd1d-98c1e176096a.json +++ b/data/_catalog/manifests/f0b6f408-71a5-4365-bd1d-98c1e176096a.json @@ -4,7 +4,7 @@ "schema_fingerprint": "0fcee1ac176b9bc45ab392bbd4401042803eb80d0de48b1fa6a5e20ffb27fa8d", "objects": [ { - "bucket": "data", + "bucket": "primary", "key": "datasets/new_candidates.parquet", "size_bytes": 2731, "created_at": "2026-03-28T01:04:22.787759218Z" diff --git a/data/_catalog/manifests/f429ac10-bc7d-41cf-b30f-a9590760ee32.json b/data/_catalog/manifests/f429ac10-bc7d-41cf-b30f-a9590760ee32.json new file mode 100644 index 0000000..2277b0c --- /dev/null +++ b/data/_catalog/manifests/f429ac10-bc7d-41cf-b30f-a9590760ee32.json @@ -0,0 +1,124 @@ +{ + "id": "f429ac10-bc7d-41cf-b30f-a9590760ee32", + "name": "kb_team_runs", + "schema_fingerprint": "d704ee2b9b434774aed2258da2fddcdcbab226547a011ba24d4281253657bdd3", + "objects": [ + { + "bucket": "primary", + "key": "datasets/kb_team_runs.parquet", + "size_bytes": 19616539, + "created_at": "2026-04-16T06:07:08.188675427Z" + } + ], + "created_at": "2026-04-16T06:07:08.188676317Z", + "updated_at": "2026-04-16T15:08:32.471504656Z", + "description": "", + "owner": "", + "sensitivity": null, + "columns": [ + { + "name": "id", + "data_type": "Int32", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "mode", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "prompt", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "config", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "responses", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "models_used", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "created_at", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "archived", + "data_type": "Boolean", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "tags", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "notes", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "quality_score", + "data_type": "Float64", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "score_method", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + }, + { + "name": "score_metadata", + "data_type": "Utf8", + "sensitivity": null, + "description": "", + "is_pii": false + } + ], + "lineage": { + "source_system": "postgresql", + "source_file": "dsn: postgresql://postgres@127.0.0.1:5432/knowledge_base", + "ingest_job": "pg-stream-1776319628188", + "ingest_timestamp": "2026-04-16T06:07:08.188675427Z", + "parent_datasets": [] + }, + "freshness": null, + "tags": [], + "row_count": 586, + "last_embedded_at": "2026-04-16T15:08:32.471504656Z", + "embedding_stale_since": null, + "embedding_refresh_policy": null +} \ No newline at end of file