package ingestd

import (
	"bytes"
	"context"
	"fmt"
	"strings"
	"testing"

	"github.com/apache/arrow-go/v18/arrow/array"
	"github.com/apache/arrow-go/v18/arrow/memory"
	"github.com/apache/arrow-go/v18/parquet/file"
	"github.com/apache/arrow-go/v18/parquet/pqarrow"
)

// TestIngestCSV_Basic ingests a small CSV covering all four inferred column
// types plus empty cells, then checks the inferred schema, the row count,
// and that the produced Parquet payload round-trips through pqarrow.
func TestIngestCSV_Basic(t *testing.T) {
	csvText := strings.Join([]string{
		"id,name,salary,active",
		"1,Alice,50000,true",
		"2,Bob,60000,false",
		"3,Carol,,true",
		"4,Dave,75000,",
	}, "\n")

	res, err := IngestCSV(strings.NewReader(csvText), 0, 0)
	if err != nil {
		t.Fatalf("IngestCSV: %v", err)
	}
	if res.RowCount != 4 {
		t.Errorf("RowCount: got %d, want 4", res.RowCount)
	}
	if len(res.Schema) != 4 {
		t.Fatalf("schema cols: got %d, want 4", len(res.Schema))
	}
	want := []ColumnSpec{
		{Name: "id", Type: TypeInt64, Nullable: false},
		{Name: "name", Type: TypeString, Nullable: false},
		{Name: "salary", Type: TypeInt64, Nullable: true}, // empty cell on row 3
		{Name: "active", Type: TypeBool, Nullable: true},  // empty cell on row 4
	}
	for i, w := range want {
		if res.Schema[i] != w {
			t.Errorf("col %d: got %+v, want %+v", i, res.Schema[i], w)
		}
	}

	// Round-trip through the pqarrow reader.
	tbl, err := readParquetTable(res.Parquet)
	if err != nil {
		t.Fatalf("read parquet: %v", err)
	}
	defer tbl.Release()
	if tbl.NumRows() != 4 {
		t.Errorf("parquet rows: got %d, want 4", tbl.NumRows())
	}
}

// TestIngestCSV_StringFallback checks that a column with values of mixed
// types is inferred as string rather than erroring.
func TestIngestCSV_StringFallback(t *testing.T) {
	// Per ADR-010: "salary" with mixed values → string.
	csvText := "id,salary\n1,50000\n2,N/A\n3,60000\n"
	res, err := IngestCSV(strings.NewReader(csvText), 0, 0)
	if err != nil {
		t.Fatal(err)
	}
	// Guard before indexing so a short schema fails the test instead of
	// panicking the whole run.
	if len(res.Schema) != 2 {
		t.Fatalf("schema cols: got %d, want 2", len(res.Schema))
	}
	if res.Schema[1].Type != TypeString {
		t.Errorf("salary fell to %s, want string", res.Schema[1].Type)
	}
}

// TestIngestCSV_BatchBoundary exercises batching with a row count that is
// not a multiple of the batch size.
func TestIngestCSV_BatchBoundary(t *testing.T) {
	// 17 rows, batch size 5 → 4 batches (5+5+5+2). Tests the trailing-
	// partial-batch flush + the schema sample being smaller than rows
	// in the file.
	var sb strings.Builder
	sb.WriteString("id\n")
	for i := 0; i < 17; i++ {
		sb.WriteString("1\n")
	}
	res, err := IngestCSV(strings.NewReader(sb.String()), 5, 5)
	if err != nil {
		t.Fatal(err)
	}
	if res.RowCount != 17 {
		t.Errorf("RowCount: got %d, want 17", res.RowCount)
	}
}

// TestIngestCSV_EmptyFile checks that a zero-byte input is rejected.
func TestIngestCSV_EmptyFile(t *testing.T) {
	if _, err := IngestCSV(strings.NewReader(""), 0, 0); err == nil {
		t.Error("empty CSV should error")
	}
}

// TestIngestCSV_HeaderOnly checks that a header with no data rows yields
// zero rows and all-string columns.
func TestIngestCSV_HeaderOnly(t *testing.T) {
	res, err := IngestCSV(strings.NewReader("a,b,c\n"), 0, 0)
	if err != nil {
		t.Fatal(err)
	}
	if res.RowCount != 0 {
		t.Errorf("RowCount: got %d, want 0", res.RowCount)
	}
	// Fail loudly if the schema is empty; otherwise the loop below would
	// pass vacuously.
	if len(res.Schema) != 3 {
		t.Fatalf("schema cols: got %d, want 3", len(res.Schema))
	}
	// All-empty samples → all string columns per inferColumnType.
	for _, c := range res.Schema {
		if c.Type != TypeString {
			t.Errorf("col %q with no samples: got %s, want string", c.Name, c.Type)
		}
	}
}

// readParquetTable is a small test helper that decodes a Parquet payload
// into an Arrow table. The caller must Release the returned table.
func readParquetTable(b []byte) (interface {
	NumRows() int64
	Release()
}, error) {
	rdr, err := file.NewParquetReader(bytes.NewReader(b))
	if err != nil {
		return nil, fmt.Errorf("opening parquet reader: %w", err)
	}
	// ReadTable materializes the whole table into Arrow memory, so the
	// parquet reader can be closed when this helper returns. The original
	// never closed it at all.
	defer rdr.Close()

	pr, err := pqarrow.NewFileReader(rdr, pqarrow.ArrowReadProperties{}, memory.NewGoAllocator())
	if err != nil {
		return nil, fmt.Errorf("creating arrow reader: %w", err)
	}
	tbl, err := pr.ReadTable(context.Background())
	if err != nil {
		return nil, fmt.Errorf("reading table: %w", err)
	}
	return tbl, nil
}

var _ = array.NewRecordBuilder // keep import for the symbol path