lakehouse/crates/ingestd/src/csv_ingest.rs
root bb05c4412e Phase 6: Ingest pipeline — CSV, JSON, PDF, text file support
- ingestd crate: detect file type → parse → schema detection → Parquet → catalog
- CSV: auto-detect column types (int, float, bool, string), handles $, %, commas
  Strips dollar signs from amounts, flexible row parsing, sanitized column names
- JSON: array or newline-delimited, nested object flattening (a.b.c → a_b_c)
- PDF: text extraction via lopdf, one row per page (source_file, page_number, text)
- Text/SMS: line-based ingestion with line numbers
- Dedup: SHA-256 content hash, re-ingest same file = no-op
- Gateway: POST /ingest/file multipart upload, 256MB body limit
- Schema detection per ADR-010: ambiguous types default to String
- 12 unit tests passing (CSV parsing, JSON flattening, type inference, dedup)
- Tested: messy CSV with missing data, dollar amounts, N/A values → queryable

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 08:07:31 -05:00

204 lines
7.0 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use arrow::array::{ArrayRef, Float64Array, Int64Array, StringArray, BooleanArray};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use std::sync::Arc;
/// Inferred column type from sampling data.
#[derive(Debug, Clone, PartialEq)]
enum InferredType {
    /// Sampled values parse as `i64` (commas allowed as thousands separators).
    Integer,
    /// Sampled values parse as `f64` after stripping `$`, `%`, and commas.
    Float,
    /// Sampled values are boolean tokens (true/false, yes/no, y/n, t/f, 1/0).
    Boolean,
    /// Fallback for empty, mixed, or ambiguous columns (per ADR-010).
    String,
}
/// Parse CSV bytes into Arrow RecordBatches with automatic schema detection.
/// Per ADR-010: ambiguous types default to String.
pub fn parse_csv(content: &[u8]) -> Result<(Arc<Schema>, Vec<RecordBatch>), String> {
let mut reader = csv::ReaderBuilder::new()
.flexible(true) // allow varying column counts
.trim(csv::Trim::All)
.from_reader(content);
let headers: Vec<String> = reader.headers()
.map_err(|e| format!("CSV header error: {e}"))?
.iter()
.enumerate()
.map(|(i, h)| {
let h = h.trim().to_string();
if h.is_empty() { format!("column_{i}") } else { sanitize_column_name(&h) }
})
.collect();
let n_cols = headers.len();
if n_cols == 0 {
return Err("CSV has no columns".into());
}
// Read all rows into string columns
let mut columns: Vec<Vec<String>> = vec![vec![]; n_cols];
let mut row_count = 0;
for result in reader.records() {
let record = result.map_err(|e| format!("CSV row error: {e}"))?;
for (i, field) in record.iter().enumerate() {
if i < n_cols {
columns[i].push(field.trim().to_string());
}
}
// Pad short rows with empty strings
for col in columns.iter_mut().skip(record.len().min(n_cols)) {
col.push(String::new());
}
row_count += 1;
}
if row_count == 0 {
return Err("CSV has no data rows".into());
}
tracing::info!("parsed CSV: {row_count} rows × {n_cols} columns");
// Infer types by sampling (look at all values)
let types: Vec<InferredType> = columns.iter().map(|col| infer_column_type(col)).collect();
// Build Arrow schema
let fields: Vec<Field> = headers.iter().zip(types.iter()).map(|(name, typ)| {
let dt = match typ {
InferredType::Integer => DataType::Int64,
InferredType::Float => DataType::Float64,
InferredType::Boolean => DataType::Boolean,
InferredType::String => DataType::Utf8,
};
Field::new(name, dt, true) // all nullable
}).collect();
let schema = Arc::new(Schema::new(fields));
// Build arrays
let arrays: Vec<ArrayRef> = columns.iter().zip(types.iter()).map(|(col, typ)| {
match typ {
InferredType::Integer => {
let vals: Vec<Option<i64>> = col.iter().map(|v| {
if v.is_empty() { None } else { v.replace(',', "").parse().ok() }
}).collect();
Arc::new(Int64Array::from(vals)) as ArrayRef
}
InferredType::Float => {
let vals: Vec<Option<f64>> = col.iter().map(|v| {
if v.is_empty() { None }
else { v.replace(',', "").replace('$', "").replace('%', "").parse().ok() }
}).collect();
Arc::new(Float64Array::from(vals)) as ArrayRef
}
InferredType::Boolean => {
let vals: Vec<Option<bool>> = col.iter().map(|v| {
match v.to_lowercase().as_str() {
"true" | "yes" | "1" | "y" | "t" => Some(true),
"false" | "no" | "0" | "n" | "f" => Some(false),
_ => None,
}
}).collect();
Arc::new(BooleanArray::from(vals)) as ArrayRef
}
InferredType::String => {
Arc::new(StringArray::from(col.clone())) as ArrayRef
}
}
}).collect();
let batch = RecordBatch::try_new(schema.clone(), arrays)
.map_err(|e| format!("RecordBatch error: {e}"))?;
Ok((schema, vec![batch]))
}
/// Infer a column's type from its raw string values.
/// Conservative per ADR-010: anything ambiguous falls back to String.
fn infer_column_type(values: &[String]) -> InferredType {
    // Drop empties and common null markers before inference; the remainder
    // is the sample we type-check.
    let sample: Vec<&str> = values.iter()
        .map(String::as_str)
        .filter(|v| !v.is_empty() && !matches!(*v, "NULL" | "null" | "N/A" | "n/a"))
        .collect();
    if sample.is_empty() {
        return InferredType::String;
    }

    // Boolean: every value must be a boolean token, at least two values must
    // exist, and at least one must be textual — otherwise a column of pure
    // "0"/"1" values is left for the numeric checks below.
    let is_bool_token = |v: &str| matches!(
        v.to_lowercase().as_str(),
        "true" | "false" | "yes" | "no" | "1" | "0" | "y" | "n" | "t" | "f"
    );
    let is_text_bool = |v: &str| matches!(
        v.to_lowercase().as_str(),
        "true" | "false" | "yes" | "no" | "y" | "n" | "t" | "f"
    );
    if sample.len() >= 2
        && sample.iter().all(|&v| is_bool_token(v))
        && sample.iter().any(|&v| is_text_bool(v))
    {
        return InferredType::Boolean;
    }

    // Numeric checks tolerate up to 5% unparseable values (those become
    // nulls when the array is built). Commas are treated as thousands
    // separators; floats additionally shed "$" and "%".
    let ratio = |hits: usize| hits as f64 / sample.len() as f64;
    let int_hits = sample.iter()
        .filter(|v| v.replace(',', "").parse::<i64>().is_ok())
        .count();
    if ratio(int_hits) > 0.95 {
        return InferredType::Integer;
    }
    let float_hits = sample.iter()
        .filter(|v| v.replace(',', "").replace('$', "").replace('%', "").parse::<f64>().is_ok())
        .count();
    if ratio(float_hits) > 0.95 {
        return InferredType::Float;
    }
    InferredType::String
}
/// Sanitize a column name for SQL compatibility.
///
/// Lowercases ASCII letters, replaces each run of non-alphanumeric characters
/// with a single underscore (literal underscores in the input are preserved),
/// and strips leading/trailing separators: `"Bill Rate ($)"` → `"bill_rate"`.
///
/// May return an empty string when the input contains no alphanumeric
/// characters (e.g. `"($)"`); callers should substitute a positional
/// fallback name in that case.
fn sanitize_column_name(name: &str) -> String {
    let mut out = String::with_capacity(name.len());
    for c in name.chars() {
        if c.is_alphanumeric() || c == '_' {
            out.push(c.to_ascii_lowercase());
        } else if !out.is_empty() && !out.ends_with('_') {
            // Collapse runs of separators ("a  (b)" → "a_b"), and drop
            // leading separators outright.
            out.push('_');
        }
    }
    // A trailing run of separators leaves one '_' behind — strip it.
    out.trim_end_matches('_').to_string()
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_simple_csv() {
        let input = b"Name,Age,Salary\nAlice,30,50000\nBob,25,45000\n";
        let (schema, batches) = parse_csv(input).unwrap();
        // Three sanitized columns, two data rows.
        assert_eq!(schema.fields().len(), 3);
        assert_eq!(batches[0].num_rows(), 2);
        // Both numeric columns are inferred as Int64.
        for idx in [1, 2] {
            assert_eq!(schema.field(idx).data_type(), &DataType::Int64);
        }
    }

    #[test]
    fn parse_csv_with_mixed_types() {
        let input = b"id,value\n1,hello\n2,world\n3,N/A\n";
        let (schema, _) = parse_csv(input).unwrap();
        assert_eq!(schema.field(0).data_type(), &DataType::Int64);
        // Text mixed with an N/A marker stays Utf8 per ADR-010.
        assert_eq!(schema.field(1).data_type(), &DataType::Utf8);
    }

    #[test]
    fn parse_csv_with_dollar_amounts() {
        let input = b"item,price\nWidget,$29.99\nGadget,$149.50\n";
        let (schema, _) = parse_csv(input).unwrap();
        // "$" is stripped during inference, so prices become Float64.
        assert_eq!(schema.field(1).data_type(), &DataType::Float64);
    }

    #[test]
    fn sanitize_names() {
        assert_eq!(sanitize_column_name("First Name"), "first_name");
        assert_eq!(sanitize_column_name("Bill Rate ($)"), "bill_rate");
    }
}