use arrow::array::{ArrayRef, Float64Array, Int64Array, StringArray, BooleanArray}; use arrow::datatypes::{DataType, Field, Schema}; use arrow::record_batch::RecordBatch; use std::collections::BTreeMap; use std::sync::Arc; /// Parse JSON (array of objects or newline-delimited) into Arrow RecordBatches. /// Nested objects are flattened: {"a": {"b": 1}} → column "a_b". pub fn parse_json(content: &[u8]) -> Result<(Arc, Vec), String> { let text = std::str::from_utf8(content).map_err(|e| format!("invalid UTF-8: {e}"))?; let text = text.trim(); // Parse into Vec let rows: Vec> = if text.starts_with('[') { // JSON array let arr: Vec = serde_json::from_str(text) .map_err(|e| format!("JSON parse error: {e}"))?; arr.into_iter() .filter_map(|v| v.as_object().cloned()) .collect() } else { // Newline-delimited JSON text.lines() .filter(|l| !l.trim().is_empty()) .map(|l| { let v: serde_json::Value = serde_json::from_str(l) .map_err(|e| format!("NDJSON line parse error: {e}"))?; v.as_object().cloned().ok_or_else(|| "NDJSON line is not an object".into()) }) .collect::, String>>()? }; if rows.is_empty() { return Err("JSON has no records".into()); } // Flatten and collect all columns let mut columns: BTreeMap> = BTreeMap::new(); let n_rows = rows.len(); for (row_idx, row) in rows.iter().enumerate() { let flat = flatten_object(row, ""); // Pad existing columns that aren't in this row for key in columns.keys().cloned().collect::>() { if !flat.contains_key(&key) { columns.get_mut(&key).unwrap().push(serde_json::Value::Null); } } // Add new columns or append values for (key, val) in flat { let col = columns.entry(key).or_insert_with(Vec::new); // Pad with nulls if this column is new and we've seen prior rows while col.len() < row_idx { col.push(serde_json::Value::Null); } col.push(val); } } // Pad short columns for col in columns.values_mut() { while col.len() < n_rows { col.push(serde_json::Value::Null); } } tracing::info!("parsed JSON: {n_rows} rows × {} columns", columns.len()); // Infer types and build schema let mut fields = Vec::new(); let mut arrays: Vec = Vec::new(); for (name, values) in &columns { let (dt, array) = json_column_to_arrow(values); fields.push(Field::new(name, dt, true)); arrays.push(array); } let schema = Arc::new(Schema::new(fields)); let batch = RecordBatch::try_new(schema.clone(), arrays) .map_err(|e| format!("RecordBatch error: {e}"))?; Ok((schema, vec![batch])) } /// Flatten nested JSON object: {"a": {"b": 1}} → {"a_b": 1} fn flatten_object(obj: &serde_json::Map, prefix: &str) -> BTreeMap { let mut result = BTreeMap::new(); for (key, val) in obj { let full_key = if prefix.is_empty() { key.clone() } else { format!("{prefix}_{key}") }; match val { serde_json::Value::Object(inner) => { result.extend(flatten_object(inner, &full_key)); } other => { result.insert(full_key, other.clone()); } } } result } /// Convert a column of JSON values to an Arrow array. fn json_column_to_arrow(values: &[serde_json::Value]) -> (DataType, ArrayRef) { // Check if all non-null values are the same type let non_null: Vec<&serde_json::Value> = values.iter() .filter(|v| !v.is_null()) .collect(); if non_null.is_empty() { return (DataType::Utf8, Arc::new(StringArray::from(vec![None::<&str>; values.len()]))); } let all_bool = non_null.iter().all(|v| v.is_boolean()); let all_i64 = non_null.iter().all(|v| v.is_i64() || v.is_u64()); let all_f64 = non_null.iter().all(|v| v.is_number()); if all_bool { let arr: Vec> = values.iter().map(|v| v.as_bool()).collect(); (DataType::Boolean, Arc::new(BooleanArray::from(arr))) } else if all_i64 { let arr: Vec> = values.iter().map(|v| v.as_i64()).collect(); (DataType::Int64, Arc::new(Int64Array::from(arr))) } else if all_f64 { let arr: Vec> = values.iter().map(|v| v.as_f64()).collect(); (DataType::Float64, Arc::new(Float64Array::from(arr))) } else { // Default to string let arr: Vec> = values.iter().map(|v| { match v { serde_json::Value::Null => None, serde_json::Value::String(s) => Some(s.clone()), other => Some(other.to_string()), } }).collect(); (DataType::Utf8, Arc::new(StringArray::from(arr))) } } #[cfg(test)] mod tests { use super::*; #[test] fn parse_json_array() { let json = br#"[{"name":"Alice","age":30},{"name":"Bob","age":25}]"#; let (schema, batches) = parse_json(json).unwrap(); assert_eq!(batches[0].num_rows(), 2); assert!(schema.field_with_name("age").unwrap().data_type() == &DataType::Int64); } #[test] fn parse_ndjson() { let json = b"{\"x\":1}\n{\"x\":2}\n{\"x\":3}\n"; let (_, batches) = parse_json(json).unwrap(); assert_eq!(batches[0].num_rows(), 3); } #[test] fn flatten_nested() { let json = br#"[{"user":{"name":"Alice","address":{"city":"NYC"}},"score":9.5}]"#; let (schema, _) = parse_json(json).unwrap(); let names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect(); assert!(names.contains(&"user_name")); assert!(names.contains(&"user_address_city")); } }