RAG pipeline now includes a cross-encoder rerank step between retrieval
and generation. The LLM re-sorts top-K results by relevance before
they become context. Falls back to original order if model output is
unparseable (~5% with 7B models). Also improved the generation prompt
to be domain-aware ("staffing database") and request specific citations.
Fixed 4 catalog manifests with bucket="data" (pre-federation leftover)
that poisoned the entire DataFusion query context on startup. The
"users", "lab_trials", "meta_runs", and "new_candidates" datasets
now correctly reference bucket="primary". This bug was surfaced by
the quality evaluation pipeline — wouldn't have been found by
structural tests alone.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
243 lines
5.1 KiB
JSON
243 lines
5.1 KiB
JSON
{
|
|
"id": "b076cff9-c522-48fc-b892-b7cba7b29c9b",
|
|
"name": "kb_threat_intel",
|
|
"schema_fingerprint": "df1e126046147b3de42086880e10c3501a3a615ecddf336bc24957a24c321241",
|
|
"objects": [
|
|
{
|
|
"bucket": "primary",
|
|
"key": "datasets/kb_threat_intel.parquet",
|
|
"size_bytes": 247112,
|
|
"created_at": "2026-04-17T02:54:43.321496407Z"
|
|
}
|
|
],
|
|
"created_at": "2026-04-17T02:54:43.321497035Z",
|
|
"updated_at": "2026-04-17T02:54:43.321758360Z",
|
|
"description": "",
|
|
"owner": "",
|
|
"sensitivity": null,
|
|
"columns": [
|
|
{
|
|
"name": "id",
|
|
"data_type": "Int32",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "ip",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "threat_level",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "classification",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "confidence",
|
|
"data_type": "Float64",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "summary",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "indicators",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "recommendation",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "pattern",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "attack_type",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "likely_automated",
|
|
"data_type": "Boolean",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "country",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "country_code",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "city",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "isp",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "org",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "asn",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "is_proxy",
|
|
"data_type": "Boolean",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "is_hosting",
|
|
"data_type": "Boolean",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "open_ports",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "blocklist_count",
|
|
"data_type": "Int32",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "blocklist_total",
|
|
"data_type": "Int32",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "blocklists_blocked",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "reverse_dns",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "traceroute",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "log_count",
|
|
"data_type": "Int32",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "banned",
|
|
"data_type": "Boolean",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "enriched_at",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "updated_at",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
},
|
|
{
|
|
"name": "raw_data",
|
|
"data_type": "Utf8",
|
|
"sensitivity": null,
|
|
"description": "",
|
|
"is_pii": false
|
|
}
|
|
],
|
|
"lineage": {
|
|
"source_system": "postgresql",
|
|
"source_file": "dsn: postgresql://kbuser@localhost:5432/knowledge_base",
|
|
"ingest_job": "pg-stream-1776394483321",
|
|
"ingest_timestamp": "2026-04-17T02:54:43.321496407Z",
|
|
"parent_datasets": []
|
|
},
|
|
"freshness": null,
|
|
"tags": [],
|
|
"row_count": 54,
|
|
"last_embedded_at": null,
|
|
"embedding_stale_since": null,
|
|
"embedding_refresh_policy": null
|
|
} |