- ingestd crate: detect file type → parse → schema detection → Parquet → catalog
- CSV: auto-detect column types (int, float, bool, string); handles $, %, commas; strips dollar signs from amounts; flexible row parsing; sanitized column names
- JSON: array or newline-delimited; nested object flattening (a.b.c → a_b_c)
- PDF: text extraction via lopdf, one row per page (source_file, page_number, text)
- Text/SMS: line-based ingestion with line numbers
- Dedup: SHA-256 content hash; re-ingesting the same file is a no-op
- Gateway: POST /ingest/file multipart upload, 256 MB body limit
- Schema detection per ADR-010: ambiguous types default to String
- 12 unit tests passing (CSV parsing, JSON flattening, type inference, dedup)
- Tested: messy CSV with missing data, dollar amounts, N/A values → queryable

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
104 lines · 2.9 KiB · Rust
use sha2::{Digest, Sha256};
|
|
|
|
/// Detected file type from content inspection.
///
/// Fieldless, so it is `Copy`; `Eq` + `Hash` allow use as a map/set key
/// (e.g. per-type ingestion counters).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileType {
    /// Comma- or tab-separated values.
    Csv,
    /// A single JSON document (object or array).
    Json,
    /// Newline-delimited JSON: one complete object per line.
    NdJson,
    /// PDF document.
    Pdf,
    /// Plain text, SMS logs, etc.
    Text,
    /// Could not be classified.
    Unknown,
}
|
|
|
|
/// Detect file type from filename extension and content sniffing.
|
|
pub fn detect_file_type(filename: &str, content: &[u8]) -> FileType {
|
|
// Extension-based first
|
|
let lower = filename.to_lowercase();
|
|
if lower.ends_with(".csv") || lower.ends_with(".tsv") {
|
|
return FileType::Csv;
|
|
}
|
|
if lower.ends_with(".json") {
|
|
// Check if it's newline-delimited JSON
|
|
if content.iter().take(4096).filter(|&&b| b == b'\n').count() > 2 {
|
|
let first_line = content.split(|&b| b == b'\n').next().unwrap_or(b"");
|
|
if first_line.starts_with(b"{") {
|
|
return FileType::NdJson;
|
|
}
|
|
}
|
|
return FileType::Json;
|
|
}
|
|
if lower.ends_with(".ndjson") || lower.ends_with(".jsonl") {
|
|
return FileType::NdJson;
|
|
}
|
|
if lower.ends_with(".pdf") {
|
|
return FileType::Pdf;
|
|
}
|
|
if lower.ends_with(".txt") || lower.ends_with(".log") || lower.ends_with(".sms") {
|
|
return FileType::Text;
|
|
}
|
|
|
|
// Content sniffing fallback
|
|
if content.starts_with(b"%PDF") {
|
|
return FileType::Pdf;
|
|
}
|
|
if content.starts_with(b"[") || content.starts_with(b"{") {
|
|
return FileType::Json;
|
|
}
|
|
|
|
// Check if it looks like CSV (has commas and newlines in first chunk)
|
|
let sample = &content[..content.len().min(4096)];
|
|
let comma_count = sample.iter().filter(|&&b| b == b',').count();
|
|
let newline_count = sample.iter().filter(|&&b| b == b'\n').count();
|
|
if comma_count > 3 && newline_count > 1 {
|
|
return FileType::Csv;
|
|
}
|
|
|
|
// If it's valid UTF-8, treat as text
|
|
if std::str::from_utf8(sample).is_ok() {
|
|
return FileType::Text;
|
|
}
|
|
|
|
FileType::Unknown
|
|
}
|
|
|
|
/// Compute SHA-256 hash of content for deduplication.
|
|
pub fn content_hash(content: &[u8]) -> String {
|
|
let mut hasher = Sha256::new();
|
|
hasher.update(content);
|
|
format!("{:x}", hasher.finalize())
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn csv_extension_wins() {
        let detected = detect_file_type("data.csv", b"a,b,c\n1,2,3");
        assert_eq!(detected, FileType::Csv);
    }

    #[test]
    fn json_extension_wins() {
        let detected = detect_file_type("data.json", br#"[{"a":1}]"#);
        assert_eq!(detected, FileType::Json);
    }

    #[test]
    fn pdf_magic_bytes_sniffed() {
        // No useful extension — the %PDF magic must decide.
        assert_eq!(detect_file_type("unknown", b"%PDF-1.4 blah"), FileType::Pdf);
    }

    #[test]
    fn csv_sniffed_from_content() {
        let body: &[u8] = b"name,age,city\nAlice,30,NYC\nBob,25,LA\n";
        assert_eq!(detect_file_type("unknown.dat", body), FileType::Csv);
    }

    #[test]
    fn hashing_is_deterministic() {
        assert_eq!(content_hash(b"hello world"), content_hash(b"hello world"));
    }
}
|