#!/usr/bin/env python3
"""
fixup_phone_type.py — Decision D from the synthetic-data gap report.

Converts workers_500k.parquet `phone` column from int64 → string. Phones
in this dataset are 11-digit US numbers (1 + area + 7), e.g. 13122277740.
Stored as int64, the column compares fine numerically but breaks join
keys with string-typed phone columns elsewhere (formatted "+1...", or
loaded from a CSV).

Backs up the original to workers_500k.parquet.bak-<YYYY-MM-DD> before write.
Idempotent: detects when the fix has already been applied and exits 0.

Usage:
    python3 scripts/staffing/fixup_phone_type.py
"""

import datetime as dt
import shutil
import sys
from pathlib import Path

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq

REPO = Path(__file__).resolve().parents[2]
TARGET = REPO / "data" / "datasets" / "workers_500k.parquet"


def _write_atomically(table: pa.Table, dest: Path) -> None:
    """Write *table* to *dest* as snappy-compressed parquet via a temp file.

    The table is first written to a sibling ``<name>.tmp`` file and then
    renamed over *dest*, so an interrupted write never leaves a truncated
    file at *dest*.
    """
    tmp = dest.with_name(f"{dest.name}.tmp")
    try:
        pq.write_table(table, tmp, compression="snappy")
        tmp.replace(dest)
    finally:
        # After a successful replace the temp file is gone; this only
        # removes leftovers from a failed write.
        tmp.unlink(missing_ok=True)


def main() -> int:
    """Convert the `phone` column of TARGET to string, backing up first.

    Returns:
        0 when the column was converted or was already string-typed;
        1 when TARGET does not exist or has no (unique) `phone` column.
    """
    if not TARGET.exists():
        print(f"missing: {TARGET}", file=sys.stderr)
        return 1

    # Inspect only the schema first so the already-fixed case does not
    # have to load the whole table.
    schema = pq.read_schema(TARGET)
    phone_idx = schema.get_field_index("phone")
    if phone_idx < 0:
        # get_field_index returns -1 when the name is absent or ambiguous.
        print(f"no 'phone' column in {TARGET}", file=sys.stderr)
        return 1

    phone_field = schema.field(phone_idx)
    if phone_field.type == pa.string():
        print("phone is already string — no-op")
        return 0

    today = dt.date.today().isoformat()
    backup = TARGET.with_name(f"{TARGET.name}.bak-{today}")
    if not backup.exists():
        # Never overwrite an existing same-day backup: it holds the
        # pre-conversion data.
        shutil.copy2(TARGET, backup)
        print(f"backup: {backup.relative_to(REPO)}")

    table = pq.read_table(TARGET)
    phone_str = pc.cast(table["phone"], pa.string())
    new_table = table.set_column(
        phone_idx,
        # with_type keeps the field's name, nullability and metadata.
        phone_field.with_type(pa.string()),
        phone_str,
    )

    _write_atomically(new_table, TARGET)

    # Read the column back so the printed type and sample reflect what is
    # actually on disk.
    round_trip = pq.read_table(TARGET, columns=["phone"])
    sample = round_trip["phone"].slice(0, 3).to_pylist()
    print(f"wrote: {TARGET.relative_to(REPO)}")
    print(f"phone type: {round_trip.schema.field('phone').type}")
    print(f"sample: {sample}")
    return 0


if __name__ == "__main__":
    sys.exit(main())