// golangLAKEHOUSE/internal/ingestd/csv_test.go

package ingestd

import (
	"bytes"
	"context"
	"strings"
	"testing"

	"github.com/apache/arrow-go/v18/arrow/array"
	"github.com/apache/arrow-go/v18/arrow/memory"
	"github.com/apache/arrow-go/v18/parquet/file"
	"github.com/apache/arrow-go/v18/parquet/pqarrow"
)

// TestIngestCSV_Basic runs a small four-column CSV through IngestCSV and
// checks the reported row count, the inferred schema (including which
// columns become nullable because of empty cells), and that the emitted
// Parquet bytes can be read back with the same number of rows.
func TestIngestCSV_Basic(t *testing.T) {
	// Header plus four data rows; no trailing newline after the last row.
	const input = "id,name,salary,active\n" +
		"1,Alice,50000,true\n" +
		"2,Bob,60000,false\n" +
		"3,Carol,,true\n" +
		"4,Dave,75000,"

	got, err := IngestCSV(strings.NewReader(input), 0, 0)
	if err != nil {
		t.Fatalf("IngestCSV: %v", err)
	}

	if got.RowCount != 4 {
		t.Errorf("RowCount: got %d, want 4", got.RowCount)
	}
	// Fatal here: the per-column comparison below indexes into the schema.
	if n := len(got.Schema); n != 4 {
		t.Fatalf("schema cols: got %d, want 4", n)
	}

	expected := []ColumnSpec{
		{Name: "id", Type: TypeInt64, Nullable: false},
		{Name: "name", Type: TypeString, Nullable: false},
		{Name: "salary", Type: TypeInt64, Nullable: true}, // Carol's salary is empty
		{Name: "active", Type: TypeBool, Nullable: true},  // Dave's active flag is empty
	}
	for idx := range expected {
		if got.Schema[idx] != expected[idx] {
			t.Errorf("col %d: got %+v, want %+v", idx, got.Schema[idx], expected[idx])
		}
	}

	// Decode the Parquet output again to confirm it is readable.
	table, err := readParquetTable(got.Parquet)
	if err != nil {
		t.Fatalf("read parquet: %v", err)
	}
	defer table.Release()
	if rows := table.NumRows(); rows != 4 {
		t.Errorf("parquet rows: got %d, want 4", rows)
	}
}

// TestIngestCSV_StringFallback checks that a column whose cells mix numeric
// and non-numeric values is typed as a string (per ADR-010).
func TestIngestCSV_StringFallback(t *testing.T) {
	// "salary" holds 50000, N/A and 60000 → expected to fall back to string.
	csvText := "id,salary\n1,50000\n2,N/A\n3,60000\n"
	res, err := IngestCSV(strings.NewReader(csvText), 0, 0)
	if err != nil {
		t.Fatalf("IngestCSV: %v", err)
	}
	// Guard the index below: a schema with fewer than two columns should be
	// reported as a test failure, not crash the run with an out-of-range panic.
	if len(res.Schema) < 2 {
		t.Fatalf("schema cols: got %d, want at least 2", len(res.Schema))
	}
	if res.Schema[1].Type != TypeString {
		t.Errorf("salary fell to %s, want string", res.Schema[1].Type)
	}
}

// TestIngestCSV_BatchBoundary ingests 17 single-column rows with both size
// arguments set to 5, so the data spans 4 batches (5+5+5+2). It covers the
// flush of the trailing partial batch and the case where the schema sample
// is smaller than the number of rows in the file.
func TestIngestCSV_BatchBoundary(t *testing.T) {
	const rows = 17

	// Header line followed by `rows` identical data lines.
	input := "id\n" + strings.Repeat("1\n", rows)

	out, err := IngestCSV(strings.NewReader(input), 5, 5)
	if err != nil {
		t.Fatal(err)
	}
	if out.RowCount != rows {
		t.Errorf("RowCount: got %d, want 17", out.RowCount)
	}
}

// TestIngestCSV_EmptyFile expects IngestCSV to return an error when the
// input contains no bytes at all (not even a header line).
func TestIngestCSV_EmptyFile(t *testing.T) {
	_, err := IngestCSV(strings.NewReader(""), 0, 0)
	if err != nil {
		return // an error is the expected outcome
	}
	t.Error("empty CSV should error")
}

// TestIngestCSV_HeaderOnly ingests a CSV that has a header row but no data
// rows. It expects success, a zero row count, and every reported column to
// be typed as string, since there are no sample values to infer from.
func TestIngestCSV_HeaderOnly(t *testing.T) {
	out, err := IngestCSV(strings.NewReader("a,b,c\n"), 0, 0)
	if err != nil {
		t.Fatal(err)
	}
	if rows := out.RowCount; rows != 0 {
		t.Errorf("RowCount: got %d, want 0", rows)
	}

	// With no samples, inferColumnType is expected to yield string for
	// every column.
	for i := range out.Schema {
		col := out.Schema[i]
		if col.Type == TypeString {
			continue
		}
		t.Errorf("col %q with no samples: got %s, want string", col.Name, col.Type)
	}
}

// readParquetTable is a test helper that decodes in-memory Parquet bytes
// into a table via the pqarrow reader. The result is exposed through a
// minimal interface (NumRows and Release) so callers do not depend on the
// concrete table type. Any error from opening the file, building the Arrow
// reader, or reading the table is returned unchanged with a nil table.
func readParquetTable(b []byte) (interface{ NumRows() int64; Release() }, error) {
	pqFile, err := file.NewParquetReader(bytes.NewReader(b))
	if err != nil {
		return nil, err
	}

	alloc := memory.NewGoAllocator()
	arrowRdr, err := pqarrow.NewFileReader(pqFile, pqarrow.ArrowReadProperties{}, alloc)
	if err != nil {
		return nil, err
	}

	table, err := arrowRdr.ReadTable(context.Background())
	if err != nil {
		return nil, err
	}
	return table, nil
}

// Blank-identifier reference to array.NewRecordBuilder. Its only purpose is
// to make the arrow/array import count as used so the file compiles; no
// other use of the package is visible in this file.
var _ = array.NewRecordBuilder // keep import for the symbol path