- DatasetManifest expanded: description, owner, sensitivity, columns,
lineage, freshness contract, tags, row_count
- All new fields use #[serde(default)] for backward compatibility
- PII auto-detection: scans column names for email, phone, SSN, salary,
address, DOB, medical terms — flags as PII/PHI/Financial
- Column-level metadata: name, type, sensitivity, is_pii flag
- Lineage tracking: source_system, source_file, ingest_job, timestamp
- Ingest pipeline auto-populates: PII scan, column meta, lineage, row count
- PATCH /catalog/datasets/by-name/{name}/metadata — update metadata
- Catalog responses now include all rich fields
- 25 unit tests passing (5 new PII detection tests)
Per ADR-013: datasets without metadata become mystery files.
This makes every ingested file self-describing from day one.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
117 lines
3.9 KiB
Rust
117 lines
3.9 KiB
Rust
/// Auto-detect PII columns by name patterns.
|
|
/// Conservative: flags likely PII, doesn't miss obvious cases.
|
|
|
|
use crate::types::Sensitivity;
|
|
|
|
/// Check if a column name suggests PII content.
|
|
pub fn detect_sensitivity(column_name: &str) -> Option<Sensitivity> {
|
|
let lower = column_name.to_lowercase();
|
|
|
|
// Direct PII identifiers
|
|
if matches!(lower.as_str(),
|
|
"ssn" | "social_security" | "social_security_number" |
|
|
"sin" | "national_id" | "passport" | "passport_number" |
|
|
"drivers_license" | "driver_license" | "dl_number"
|
|
) {
|
|
return Some(Sensitivity::Pii);
|
|
}
|
|
|
|
// Names
|
|
if lower.contains("first_name") || lower.contains("last_name") ||
|
|
lower.contains("full_name") || lower.contains("middle_name") ||
|
|
lower == "name" || lower == "fname" || lower == "lname" {
|
|
return Some(Sensitivity::Pii);
|
|
}
|
|
|
|
// Contact info
|
|
if lower.contains("email") || lower.contains("e_mail") ||
|
|
lower.contains("phone") || lower.contains("mobile") || lower.contains("cell") ||
|
|
lower.contains("fax") || lower == "tel" || lower == "telephone" {
|
|
return Some(Sensitivity::Pii);
|
|
}
|
|
|
|
// Address
|
|
if lower.contains("address") || lower.contains("street") ||
|
|
(lower.contains("zip") && !lower.contains("unzip")) ||
|
|
lower == "postal_code" || lower == "postcode" {
|
|
return Some(Sensitivity::Pii);
|
|
}
|
|
|
|
// Financial
|
|
if lower.contains("salary") || lower.contains("wage") ||
|
|
lower.contains("pay_rate") || lower.contains("bill_rate") ||
|
|
lower.contains("compensation") || lower.contains("revenue") ||
|
|
lower.contains("bank_account") || lower.contains("routing_number") ||
|
|
lower.contains("credit_card") {
|
|
return Some(Sensitivity::Financial);
|
|
}
|
|
|
|
// Health
|
|
if lower.contains("diagnosis") || lower.contains("medication") ||
|
|
lower.contains("medical") || lower.contains("health") ||
|
|
lower.contains("patient_id") || lower.contains("mrn") {
|
|
return Some(Sensitivity::Phi);
|
|
}
|
|
|
|
// Date of birth
|
|
if lower == "dob" || lower == "date_of_birth" || lower == "birthdate" || lower == "birth_date" {
|
|
return Some(Sensitivity::Pii);
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
/// Classify all columns and return the highest sensitivity found.
|
|
pub fn detect_dataset_sensitivity(column_names: &[&str]) -> Option<Sensitivity> {
|
|
let mut highest: Option<Sensitivity> = None;
|
|
|
|
for name in column_names {
|
|
if let Some(sens) = detect_sensitivity(name) {
|
|
highest = Some(match (&highest, &sens) {
|
|
(None, s) => s.clone(),
|
|
(Some(Sensitivity::Public), s) => s.clone(),
|
|
(Some(Sensitivity::Internal), s) if !matches!(s, Sensitivity::Public | Sensitivity::Internal) => s.clone(),
|
|
(Some(Sensitivity::Financial), Sensitivity::Pii | Sensitivity::Phi) => sens.clone(),
|
|
(Some(Sensitivity::Pii), Sensitivity::Phi) => Sensitivity::Phi,
|
|
(existing, _) => existing.clone().unwrap(),
|
|
});
|
|
}
|
|
}
|
|
|
|
highest
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    //! Unit tests for column-name sensitivity detection.

    use super::*;

    #[test]
    fn detects_email_as_pii() {
        assert_eq!(detect_sensitivity("email"), Some(Sensitivity::Pii));
        assert_eq!(detect_sensitivity("contact_email"), Some(Sensitivity::Pii));
    }

    #[test]
    fn detects_salary_as_financial() {
        assert_eq!(detect_sensitivity("salary"), Some(Sensitivity::Financial));
        assert_eq!(detect_sensitivity("bill_rate"), Some(Sensitivity::Financial));
    }

    #[test]
    fn detects_ssn() {
        assert_eq!(detect_sensitivity("ssn"), Some(Sensitivity::Pii));
    }

    #[test]
    fn detects_health_as_phi() {
        assert_eq!(detect_sensitivity("diagnosis"), Some(Sensitivity::Phi));
        assert_eq!(detect_sensitivity("medical_history"), Some(Sensitivity::Phi));
    }

    #[test]
    fn detects_date_of_birth() {
        assert_eq!(detect_sensitivity("dob"), Some(Sensitivity::Pii));
        assert_eq!(detect_sensitivity("date_of_birth"), Some(Sensitivity::Pii));
    }

    #[test]
    fn matching_ignores_case() {
        assert_eq!(detect_sensitivity("EMAIL"), Some(Sensitivity::Pii));
        assert_eq!(detect_sensitivity("SSN"), Some(Sensitivity::Pii));
    }

    #[test]
    fn non_sensitive_returns_none() {
        assert_eq!(detect_sensitivity("status"), None);
        assert_eq!(detect_sensitivity("created_at"), None);
    }

    #[test]
    fn dataset_sensitivity_picks_highest() {
        let cols = vec!["id", "name", "email", "status"];
        assert_eq!(detect_dataset_sensitivity(&cols), Some(Sensitivity::Pii));
    }

    #[test]
    fn dataset_sensitivity_ranks_phi_over_pii_over_financial() {
        // The result must not depend on column order.
        assert_eq!(
            detect_dataset_sensitivity(&["salary", "email"]),
            Some(Sensitivity::Pii)
        );
        assert_eq!(
            detect_dataset_sensitivity(&["email", "salary"]),
            Some(Sensitivity::Pii)
        );
        assert_eq!(
            detect_dataset_sensitivity(&["email", "diagnosis", "salary"]),
            Some(Sensitivity::Phi)
        );
    }

    #[test]
    fn dataset_without_flagged_columns_is_none() {
        let empty: Vec<&str> = Vec::new();
        assert_eq!(detect_dataset_sensitivity(&empty), None);
        assert_eq!(detect_dataset_sensitivity(&["id", "status"]), None);
    }
}
|