root 9e53caaec3 Phase 10: Rich catalog v2 — metadata as product
- DatasetManifest expanded: description, owner, sensitivity, columns,
  lineage, freshness contract, tags, row_count
- All new fields use #[serde(default)] for backward compatibility
- PII auto-detection: scans column names for email, phone, SSN, salary,
  address, DOB, medical terms — flags as PII/PHI/Financial
- Column-level metadata: name, type, sensitivity, is_pii flag
- Lineage tracking: source_system, source_file, ingest_job, timestamp
- Ingest pipeline auto-populates: PII scan, column meta, lineage, row count
- PATCH /catalog/datasets/by-name/{name}/metadata — update metadata
- Catalog responses now include all rich fields
- 25 unit tests passing (5 new PII detection tests)

Per ADR-013: datasets without metadata become mystery files.
This makes every ingested file self-describing from day one.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 09:15:09 -05:00

117 lines
3.9 KiB
Rust

/// Auto-detect sensitive (PII / PHI / financial) columns by name patterns.
/// Deliberately over-inclusive: substring matching may flag harmless columns,
/// but obvious cases are not missed.
use crate::types::Sensitivity;
/// Check if a column name suggests sensitive content.
///
/// The name is trimmed, lower-cased, and has whitespace, `-` and `.` mapped to
/// `_` before matching, so raw headers such as `"First Name"`, `" SSN "` or
/// `"date-of-birth"` are classified like `first_name`, `ssn` and
/// `date_of_birth`.
///
/// Rule groups are evaluated in a fixed order and the first hit wins:
/// direct identifiers, personal names, contact info and addresses yield
/// [`Sensitivity::Pii`]; financial terms yield [`Sensitivity::Financial`];
/// health terms yield [`Sensitivity::Phi`]; date-of-birth names yield
/// [`Sensitivity::Pii`].
///
/// Returns `None` when no rule matches (including for an empty name).
pub fn detect_sensitivity(column_name: &str) -> Option<Sensitivity> {
    // Normalise separators so that human-formatted headers hit the same
    // underscore-based patterns as snake_case column names.
    let normalized: String = column_name
        .trim()
        .to_lowercase()
        .chars()
        .map(|c| if c.is_whitespace() || c == '-' || c == '.' { '_' } else { c })
        .collect();
    let name = normalized.as_str();
    let contains_any = |needles: &[&str]| needles.iter().any(|needle| name.contains(*needle));

    // Direct identifiers: exact match only, because short tokens such as
    // "sin" or "ssn" appear inside many unrelated words.
    if matches!(
        name,
        "ssn" | "social_security" | "social_security_number"
            | "sin" | "national_id" | "passport" | "passport_number"
            | "drivers_license" | "driver_license" | "dl_number"
    ) {
        return Some(Sensitivity::Pii);
    }

    // Personal names.
    if contains_any(&["first_name", "last_name", "full_name", "middle_name"])
        || matches!(name, "name" | "fname" | "lname")
    {
        return Some(Sensitivity::Pii);
    }

    // Contact info.
    // NOTE(review): the "cell" substring also matches words like "cancelled";
    // kept as-is because the detector is intentionally over-inclusive.
    if contains_any(&["email", "e_mail", "phone", "mobile", "cell", "fax"])
        || matches!(name, "tel" | "telephone")
    {
        return Some(Sensitivity::Pii);
    }

    // Postal address. "unzip" is excluded so archive-related columns are not
    // mistaken for zip codes.
    if contains_any(&["address", "street"])
        || (name.contains("zip") && !name.contains("unzip"))
        || matches!(name, "postal_code" | "postcode")
    {
        return Some(Sensitivity::Pii);
    }

    // Financial.
    if contains_any(&[
        "salary", "wage", "pay_rate", "bill_rate", "compensation", "revenue",
        "bank_account", "routing_number", "credit_card",
    ]) {
        return Some(Sensitivity::Financial);
    }

    // Health.
    if contains_any(&["diagnosis", "medication", "medical", "health", "patient_id", "mrn"]) {
        return Some(Sensitivity::Phi);
    }

    // Date of birth: exact match only.
    if matches!(name, "dob" | "date_of_birth" | "birthdate" | "birth_date") {
        return Some(Sensitivity::Pii);
    }

    None
}
/// Classify every column and return the most sensitive level found.
///
/// Each name is passed through `detect_sensitivity`; columns that yield
/// `None` are ignored. The running result is replaced only when the newly
/// found level outranks it, using the ordering
/// `Public < Internal < Financial < Pii < Phi`.
///
/// Returns `None` when no column is classified (including an empty slice).
pub fn detect_dataset_sensitivity(column_names: &[&str]) -> Option<Sensitivity> {
    column_names
        .iter()
        .filter_map(|column| detect_sensitivity(column))
        .fold(None, |current: Option<Sensitivity>, found| {
            // Decide whether `found` outranks what we have seen so far.
            let upgrade = match (&current, &found) {
                // Nothing yet, or only Public: take whatever was found.
                (None, _) | (Some(Sensitivity::Public), _) => true,
                // Internal yields to anything that is not Public/Internal.
                (Some(Sensitivity::Internal), candidate) => {
                    !matches!(candidate, Sensitivity::Public | Sensitivity::Internal)
                }
                // Financial yields to Pii and Phi.
                (Some(Sensitivity::Financial), Sensitivity::Pii | Sensitivity::Phi) => true,
                // Pii yields only to Phi.
                (Some(Sensitivity::Pii), Sensitivity::Phi) => true,
                // Every other combination keeps the current level.
                _ => false,
            };
            if upgrade {
                Some(found)
            } else {
                current
            }
        })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every listed column name is classified as `expected`.
    fn assert_all(columns: &[&str], expected: Option<Sensitivity>) {
        for column in columns {
            assert_eq!(detect_sensitivity(column), expected, "column `{}`", column);
        }
    }

    // Exact and substring matches on "email" both count as PII.
    #[test]
    fn detects_email_as_pii() {
        assert_all(&["email", "contact_email"], Some(Sensitivity::Pii));
    }

    // Pay-related columns are classified as financial.
    #[test]
    fn detects_salary_as_financial() {
        assert_all(&["salary", "bill_rate"], Some(Sensitivity::Financial));
    }

    // Direct identifiers are PII.
    #[test]
    fn detects_ssn() {
        assert_all(&["ssn"], Some(Sensitivity::Pii));
    }

    // Ordinary bookkeeping columns are not flagged.
    #[test]
    fn non_sensitive_returns_none() {
        assert_all(&["status", "created_at"], None);
    }

    // A mix of neutral and PII columns resolves to PII for the dataset.
    #[test]
    fn dataset_sensitivity_picks_highest() {
        let columns = ["id", "name", "email", "status"];
        let detected = detect_dataset_sensitivity(&columns);
        assert_eq!(detected, Some(Sensitivity::Pii));
    }
}