use sha2::{Digest, Sha256};

/// Detected file type from content inspection.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileType {
    /// Delimited tabular data (`.csv` / `.tsv` extension, or comma-heavy multi-line content).
    Csv,
    /// A single JSON document.
    Json,
    /// Newline-delimited JSON (one JSON object per line).
    NdJson,
    /// PDF document (`.pdf` extension or `%PDF` magic bytes).
    Pdf,
    /// Plain text, SMS logs, etc.
    Text,
    /// No extension or content heuristic matched.
    Unknown,
}

/// Detect file type from filename extension and content sniffing.
///
/// The (case-insensitive) extension of `filename` is consulted first. A `.json`
/// file is additionally inspected and reported as [`FileType::NdJson`] when its
/// head looks like newline-delimited JSON. If the extension is not recognised,
/// the first 4096 bytes of `content` are sniffed, in order: PDF magic bytes, a
/// leading `[` / `{` (JSON), comma/newline density (CSV), and finally UTF-8
/// validity (text).
///
/// Returns [`FileType::Unknown`] only when nothing matches and the sampled
/// bytes are not valid UTF-8. Empty content with an unrecognised extension is
/// reported as [`FileType::Text`].
pub fn detect_file_type(filename: &str, content: &[u8]) -> FileType {
    // Only the head of the file is inspected so detection cost stays bounded.
    const SNIFF_LEN: usize = 4096;
    let sample = &content[..content.len().min(SNIFF_LEN)];
    let truncated = content.len() > sample.len();

    // Extension-based first
    let lower = filename.to_lowercase();
    if lower.ends_with(".csv") || lower.ends_with(".tsv") {
        return FileType::Csv;
    }
    if lower.ends_with(".json") {
        // `.json` is frequently used for newline-delimited JSON as well.
        return if looks_like_ndjson(sample) {
            FileType::NdJson
        } else {
            FileType::Json
        };
    }
    if lower.ends_with(".ndjson") || lower.ends_with(".jsonl") {
        return FileType::NdJson;
    }
    if lower.ends_with(".pdf") {
        return FileType::Pdf;
    }
    if lower.ends_with(".txt") || lower.ends_with(".log") || lower.ends_with(".sms") {
        return FileType::Text;
    }

    // Content sniffing fallback
    if content.starts_with(b"%PDF") {
        return FileType::Pdf;
    }
    if content.starts_with(b"[") || content.starts_with(b"{") {
        return FileType::Json;
    }

    // Check if it looks like CSV (has commas and newlines in first chunk)
    let comma_count = sample.iter().filter(|&&b| b == b',').count();
    let newline_count = sample.iter().filter(|&&b| b == b'\n').count();
    if comma_count > 3 && newline_count > 1 {
        return FileType::Csv;
    }

    // If it's valid UTF-8, treat as text
    if sample_is_utf8(sample, truncated) {
        return FileType::Text;
    }

    FileType::Unknown
}

/// Heuristic: does the head of a `.json` file look like newline-delimited JSON?
///
/// Requires more than two newlines in `sample` and a first line that is a
/// complete object (`{ ... }`). The closing-brace check keeps pretty-printed
/// JSON objects, whose first line is just `{`, from being reported as NDJSON.
fn looks_like_ndjson(sample: &[u8]) -> bool {
    let newline_count = sample.iter().filter(|&&b| b == b'\n').count();
    if newline_count <= 2 {
        return false;
    }

    let first_line = sample.split(|&b| b == b'\n').next().unwrap_or(&[]);
    // Ignore trailing whitespace (including the `\r` of CRLF line endings).
    let last_byte = first_line
        .iter()
        .rev()
        .find(|b| !b.is_ascii_whitespace());

    first_line.starts_with(b"{") && last_byte == Some(&b'}')
}

/// Returns `true` when `sample` is valid UTF-8.
///
/// When `truncated` is set, the sample was cut at an arbitrary byte offset, so
/// an incomplete multi-byte sequence at the very end is tolerated.
fn sample_is_utf8(sample: &[u8], truncated: bool) -> bool {
    match std::str::from_utf8(sample) {
        Ok(_) => true,
        // `error_len() == None` means the input ended unexpectedly in the
        // middle of a character rather than containing an invalid byte.
        Err(err) => truncated && err.error_len().is_none(),
    }
}

/// Compute SHA-256 hash of content for deduplication.
///
/// The digest is rendered as a lowercase hexadecimal string.
pub fn content_hash(content: &[u8]) -> String {
    // One-shot digest over the whole buffer, formatted with `LowerHex`.
    format!("{:x}", Sha256::digest(content))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn detect_csv_by_extension() {
        let detected = detect_file_type("data.csv", b"a,b,c\n1,2,3");
        assert_eq!(detected, FileType::Csv);
    }

    #[test]
    fn detect_json_by_extension() {
        let detected = detect_file_type("data.json", b"[{\"a\":1}]");
        assert_eq!(detected, FileType::Json);
    }

    #[test]
    fn detect_pdf_by_magic() {
        // No usable extension, so the `%PDF` magic bytes decide.
        let detected = detect_file_type("unknown", b"%PDF-1.4 blah");
        assert_eq!(detected, FileType::Pdf);
    }

    #[test]
    fn detect_csv_by_content() {
        // Unrecognised extension: falls through to the comma/newline heuristic.
        let csv = b"name,age,city\nAlice,30,NYC\nBob,25,LA\n";
        let detected = detect_file_type("unknown.dat", csv);
        assert_eq!(detected, FileType::Csv);
    }

    #[test]
    fn content_hash_deterministic() {
        let first = content_hash(b"hello world");
        let second = content_hash(b"hello world");
        assert_eq!(first, second);
    }
}