// golangLAKEHOUSE/internal/ingestd/schema_test.go

package ingestd

import (
	"testing"
)

// TestInferSchema_CleanInts checks that columns whose samples are all
// integer literals are reported as TypeInt64 and not nullable.
func TestInferSchema_CleanInts(t *testing.T) {
	headers := []string{"id", "count"}
	samples := [][]string{{"1", "100"}, {"2", "200"}, {"3", "300"}}
	got, err := InferSchema(headers, samples)
	if err != nil {
		t.Fatal(err)
	}
	// Guard against a vacuous pass: with an empty schema the loop below
	// would never execute and the test would assert nothing.
	if len(got) != len(headers) {
		t.Fatalf("got %d columns, want %d", len(got), len(headers))
	}
	for _, c := range got {
		if c.Type != TypeInt64 {
			t.Errorf("%s: got %s, want int64", c.Name, c.Type)
		}
		if c.Nullable {
			t.Errorf("%s should not be nullable", c.Name)
		}
	}
}

// TestInferSchema_FloatColumns checks that a column mixing decimal and
// integer-looking literals is reported as TypeFloat64.
func TestInferSchema_FloatColumns(t *testing.T) {
	headers := []string{"price", "weight"}
	samples := [][]string{{"1.5", "2.0"}, {"100", "3.14"}, {"0.0", "0"}}
	got, err := InferSchema(headers, samples)
	if err != nil {
		t.Fatalf("InferSchema: %v", err)
	}
	// Fail with a message instead of panicking on the index expressions below.
	if len(got) != len(headers) {
		t.Fatalf("got %d columns, want %d", len(got), len(headers))
	}
	// "price" holds "1.5", "100" and "0.0": "100" alone would parse as an
	// int, but "1.5" does not, so the column as a whole must be float64.
	if got[0].Type != TypeFloat64 {
		t.Errorf("price: got %s, want float64", got[0].Type)
	}
	// "weight" likewise mixes decimals ("2.0", "3.14") with a bare "0".
	if got[1].Type != TypeFloat64 {
		t.Errorf("weight: got %s, want float64", got[1].Type)
	}
}

// TestInferSchema_AmbiguousFallsToString checks that a column mixing
// numeric text, a non-numeric token and an empty cell is reported as a
// nullable TypeString.
func TestInferSchema_AmbiguousFallsToString(t *testing.T) {
	// ADR-010: a column with "123", "N/A", and "" is a string, not int.
	headers := []string{"salary"}
	samples := [][]string{{"50000"}, {"N/A"}, {"60000"}, {""}}
	got, err := InferSchema(headers, samples)
	if err != nil {
		t.Fatalf("InferSchema: %v", err)
	}
	// Fail with a message instead of panicking on got[0] below.
	if len(got) != len(headers) {
		t.Fatalf("got %d columns, want %d", len(got), len(headers))
	}
	if got[0].Type != TypeString {
		t.Errorf("salary: got %s, want string (ADR-010 fallback)", got[0].Type)
	}
	if !got[0].Nullable {
		t.Errorf("salary: should be nullable (saw empty cell)")
	}
}

// TestInferSchema_BoolLiterals checks that columns of true/false literals in
// lower, title and upper case are reported as TypeBool.
func TestInferSchema_BoolLiterals(t *testing.T) {
	headers := []string{"active", "deleted"}
	samples := [][]string{{"true", "false"}, {"True", "False"}, {"TRUE", "FALSE"}}
	got, err := InferSchema(headers, samples)
	if err != nil {
		t.Fatalf("InferSchema: %v", err)
	}
	// Fail with a message instead of panicking on the index expressions below.
	if len(got) != len(headers) {
		t.Fatalf("got %d columns, want %d", len(got), len(headers))
	}
	if got[0].Type != TypeBool {
		t.Errorf("active: got %s, want bool", got[0].Type)
	}
	if got[1].Type != TypeBool {
		t.Errorf("deleted: got %s, want bool", got[1].Type)
	}
}

// TestInferSchema_OneZeroIsInt_NotBool checks that a column of small
// non-negative integers is reported as TypeInt64 rather than TypeBool.
func TestInferSchema_OneZeroIsInt_NotBool(t *testing.T) {
	// Keeps the type system honest — 1/0 columns in CRM data are
	// typically counts (children, certs), not flags.
	//
	// NOTE(review): the samples include "2", so a column made up solely of
	// "0" and "1" is not exercised here — confirm whether that stricter
	// case should be covered as well.
	headers := []string{"children"}
	samples := [][]string{{"0"}, {"1"}, {"2"}, {"0"}}
	got, err := InferSchema(headers, samples)
	if err != nil {
		t.Fatalf("InferSchema: %v", err)
	}
	// Fail with a message instead of panicking on got[0] below.
	if len(got) != len(headers) {
		t.Fatalf("got %d columns, want %d", len(got), len(headers))
	}
	if got[0].Type != TypeInt64 {
		t.Errorf("children: got %s, want int64 (1/0 is int, not bool)", got[0].Type)
	}
}

// TestInferSchema_EmptyHeader checks that InferSchema returns an error when
// given no headers at all, or a header list containing an empty name.
func TestInferSchema_EmptyHeader(t *testing.T) {
	cases := []struct {
		headers []string
		failMsg string
	}{
		{headers: nil, failMsg: "nil headers should error"},
		{headers: []string{"valid", ""}, failMsg: "empty header name should error"},
	}
	for _, tc := range cases {
		_, err := InferSchema(tc.headers, nil)
		if err != nil {
			continue
		}
		t.Error(tc.failMsg)
	}
}

// TestFingerprint_Deterministic checks that two schemas inferred from
// identical headers and samples produce the same fingerprint.
func TestFingerprint_Deterministic(t *testing.T) {
	headers := []string{"id", "name"}
	samples := [][]string{{"1", "alice"}}
	// Check both errors: if inference failed twice, the two zero-value
	// schemas would compare equal and the test would pass vacuously.
	s1, err := InferSchema(headers, samples)
	if err != nil {
		t.Fatalf("InferSchema (first call): %v", err)
	}
	s2, err := InferSchema(headers, samples)
	if err != nil {
		t.Fatalf("InferSchema (second call): %v", err)
	}
	f1, f2 := s1.Fingerprint(), s2.Fingerprint()
	if f1 != f2 {
		t.Errorf("fingerprint not deterministic: %s vs %s", f1, f2)
	}
}

// TestFingerprint_FlipsOnTypeChange checks that the fingerprint differs
// between a schema inferred from all-integer samples and one inferred from
// samples that include a non-numeric value for the same column name.
func TestFingerprint_FlipsOnTypeChange(t *testing.T) {
	// Surface inference failures directly rather than as a misleading
	// fingerprint comparison on zero-value schemas.
	intSchema, err := InferSchema([]string{"id"}, [][]string{{"1"}, {"2"}})
	if err != nil {
		t.Fatalf("InferSchema (int samples): %v", err)
	}
	strSchema, err := InferSchema([]string{"id"}, [][]string{{"1"}, {"abc"}})
	if err != nil {
		t.Fatalf("InferSchema (mixed samples): %v", err)
	}
	if intSchema.Fingerprint() == strSchema.Fingerprint() {
		t.Error("fingerprint should flip when column type changes")
	}
}

// TestFingerprint_StableUnderNullable checks that adding an empty cell to a
// column's samples leaves the schema fingerprint unchanged.
func TestFingerprint_StableUnderNullable(t *testing.T) {
	// Adding null cells doesn't flip the fingerprint — it's only
	// about (name, type), not nullability.
	a, err := InferSchema([]string{"id"}, [][]string{{"1"}, {"2"}})
	if err != nil {
		t.Fatalf("InferSchema (no empty cells): %v", err)
	}
	b, err := InferSchema([]string{"id"}, [][]string{{"1"}, {"2"}, {""}})
	if err != nil {
		t.Fatalf("InferSchema (with empty cell): %v", err)
	}
	// The error checks above matter here: two failed inferences would
	// otherwise compare equal and let the test pass without proving anything.
	if a.Fingerprint() != b.Fingerprint() {
		t.Error("fingerprint shouldn't flip when nullability changes")
	}
}

// TestFingerprint_RespectsColumnOrder checks that two schemas with the same
// columns in a different order produce different fingerprints.
func TestFingerprint_RespectsColumnOrder(t *testing.T) {
	// Same columns, swapped order → different fingerprint.
	a, err := InferSchema([]string{"id", "name"}, [][]string{{"1", "x"}})
	if err != nil {
		t.Fatalf("InferSchema (id, name): %v", err)
	}
	b, err := InferSchema([]string{"name", "id"}, [][]string{{"x", "1"}})
	if err != nil {
		t.Fatalf("InferSchema (name, id): %v", err)
	}
	if a.Fingerprint() == b.Fingerprint() {
		t.Error("fingerprint should be order-sensitive")
	}
}