lakehouse/data/_catalog/manifests/0efdef2d-52bc-4154-82d4-056bb3a1915d.json
root a710896db2 Ingest Ethereal 10K worker profiles — domain data in the substrate
10,000 staffing worker profiles from the profit/ethereal repo, flattened
JSON → CSV → Parquet. Indexed with HNSW (9.5s) and Lance IVF_PQ (7.2s).

SQL hybrid verified: a query for forklift operators in IL with reliability > 0.8
returned exact matches. Vector search alone missed the state filter, which
confirms the need for hybrid SQL+vector routing identified in the quality eval.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 22:26:19 -05:00

159 lines
3.4 KiB
JSON

{
"id": "0efdef2d-52bc-4154-82d4-056bb3a1915d",
"name": "ethereal_workers",
"schema_fingerprint": "9a1286ffada5390b459b217f56263e5ebffec1520ea29bab0be2efc6d5381adc",
"objects": [
{
"bucket": "primary",
"key": "datasets/ethereal_workers.parquet",
"size_bytes": 6991716,
"created_at": "2026-04-17T03:23:29.399966984Z"
}
],
"created_at": "2026-04-17T03:23:29.399967772Z",
"updated_at": "2026-04-17T03:23:29.400227081Z",
"description": "",
"owner": "",
"sensitivity": "pii",
"columns": [
{
"name": "worker_id",
"data_type": "Int64",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "name",
"data_type": "Utf8",
"sensitivity": "pii",
"description": "",
"is_pii": true
},
{
"name": "role",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "email",
"data_type": "Utf8",
"sensitivity": "pii",
"description": "",
"is_pii": true
},
{
"name": "phone",
"data_type": "Int64",
"sensitivity": "pii",
"description": "",
"is_pii": true
},
{
"name": "city",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "state",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "zip",
"data_type": "Int64",
"sensitivity": "pii",
"description": "",
"is_pii": true
},
{
"name": "skills",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "certifications",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "archetype",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "reliability",
"data_type": "Float64",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "responsiveness",
"data_type": "Float64",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "engagement",
"data_type": "Float64",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "compliance",
"data_type": "Float64",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "availability",
"data_type": "Float64",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "communications",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "resume_text",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
}
],
"lineage": {
"source_system": "csv",
"source_file": "ethereal_workers.csv",
"ingest_job": "ingest-1776396209399",
"ingest_timestamp": "2026-04-17T03:23:29.399966984Z",
"parent_datasets": []
},
"freshness": null,
"tags": [],
"row_count": 10000,
"last_embedded_at": null,
"embedding_stale_since": null,
"embedding_refresh_policy": null
}