lakehouse/data/_catalog/manifests/e2a8f88a-59f6-40c7-a45b-e23d8f3533b6.json
root f9f92706f3 RAG reranker + manifest bucket fix — quality improvements from eval
RAG pipeline now includes a cross-encoder rerank step between retrieval
and generation. The LLM re-sorts top-K results by relevance before
they become context. Falls back to original order if model output is
unparseable (~5% with 7B models). Also improved the generation prompt
to be domain-aware ("staffing database") and request specific citations.

Fixed 4 catalog manifests with bucket="data" (pre-federation leftover)
that poisoned the entire DataFusion query context on startup. The
"users", "lab_trials", "meta_runs", and "new_candidates" datasets
now correctly reference bucket="primary". This bug was surfaced by
the quality evaluation pipeline — it would not have been found by
structural tests alone.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 22:19:11 -05:00

100 lines
2.2 KiB
JSON

{
"id": "e2a8f88a-59f6-40c7-a45b-e23d8f3533b6",
"name": "lab_trials",
"schema_fingerprint": "1d5782349402439a7e44efd0ccab9ae64ac3044221adef9e828b60b8bbb44dd5",
"objects": [
{
"bucket": "primary",
"key": "datasets/lab_trials.parquet",
"size_bytes": 64646,
"created_at": "2026-03-28T01:14:03.026116573Z"
}
],
"created_at": "2026-03-28T01:14:03.026117277Z",
"updated_at": "2026-03-28T01:14:03.026247826Z",
"description": "",
"owner": "",
"sensitivity": null,
"columns": [
{
"name": "id",
"data_type": "Int32",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "experiment_id",
"data_type": "Int32",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "trial_num",
"data_type": "Int32",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "config_diff",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "config_snapshot",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "scores",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "avg_score",
"data_type": "Float64",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "improved",
"data_type": "Boolean",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "duration_ms",
"data_type": "Int32",
"sensitivity": null,
"description": "",
"is_pii": false
},
{
"name": "created_at",
"data_type": "Utf8",
"sensitivity": null,
"description": "",
"is_pii": false
}
],
"lineage": {
"source_system": "postgresql",
"source_file": "127.0.0.1:5432/knowledge_base.lab_trials",
"ingest_job": "pg-import-1774660443026",
"ingest_timestamp": "2026-03-28T01:14:03.026116573Z",
"parent_datasets": []
},
"freshness": null,
"tags": [],
"row_count": 40
}