From 940737daa77ba6ef08c3099336023fe21b7b4a96 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Apr 2026 10:45:38 -0500 Subject: [PATCH] =?UTF-8?q?staffing:=20D=20=E2=80=94=20workers=5F500k.phon?= =?UTF-8?q?e=20int=20=E2=86=92=20string=20fixup=20script?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Decision D from reports/staffing/synthetic-data-gap-report.md §7. Phones in workers_500k.parquet are 11-digit US numbers stored as int64 (e.g. 13122277740). Numerically fine, but breaks join keys against any other source that carries phone as string. Script casts the column to string in place, with non-destructive backup at data/datasets/workers_500k.parquet.bak-YYYY-MM-DD before write. Idempotent: if phone is already string, exits 0 with "no-op". Safe to re-run. The .parquet itself is too large to commit (75MB) and follows project convention of staying out of git. The script makes the conversion reproducible from the source dataset. --- scripts/staffing/fixup_phone_type.py | 65 ++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 scripts/staffing/fixup_phone_type.py diff --git a/scripts/staffing/fixup_phone_type.py b/scripts/staffing/fixup_phone_type.py new file mode 100644 index 0000000..2962efd --- /dev/null +++ b/scripts/staffing/fixup_phone_type.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +""" +fixup_phone_type.py — Decision D from the synthetic-data gap report. + +Converts workers_500k.parquet `phone` column from int64 → string. Phones +in this dataset are 11-digit US numbers (1 + area + 7), e.g. 13122277740. +Stored as int64, the column compares fine numerically but breaks join +keys with string-typed phone columns elsewhere (formatted "+1...", or +loaded from a CSV). + +Backs up the original to workers_500k.parquet.bak-YYYY-MM-DD before write. +Idempotent: detects when the fix has already been applied and exits 0. 
+ +Usage: + python3 scripts/staffing/fixup_phone_type.py +""" + +import datetime as dt +import shutil +import sys +from pathlib import Path + +import pyarrow as pa +import pyarrow.compute as pc +import pyarrow.parquet as pq + +REPO = Path(__file__).resolve().parents[2] +TARGET = REPO / "data" / "datasets" / "workers_500k.parquet" + + +def main() -> int: + if not TARGET.exists(): + print(f"missing: {TARGET}", file=sys.stderr) + return 1 + + table = pq.read_table(TARGET) + phone_field = table.schema.field("phone") + if phone_field.type == pa.string(): + print(f"phone is already string โ€” no-op") + return 0 + + today = dt.date.today().isoformat() + backup = TARGET.with_suffix(f".parquet.bak-{today}") + if not backup.exists(): + shutil.copy2(TARGET, backup) + print(f"backup: {backup.relative_to(REPO)}") + + phone_str = pc.cast(table["phone"], pa.string()) + new_table = table.set_column( + table.schema.get_field_index("phone"), + pa.field("phone", pa.string()), + phone_str, + ) + + pq.write_table(new_table, TARGET, compression="snappy") + rounds_trip = pq.read_table(TARGET, columns=["phone"]) + sample = rounds_trip["phone"].slice(0, 3).to_pylist() + print(f"wrote: {TARGET.relative_to(REPO)}") + print(f"phone type: {rounds_trip.schema.field('phone').type}") + print(f"sample: {sample}") + return 0 + + +if __name__ == "__main__": + sys.exit(main())