Session infrastructure: OpenRouter + tree-split reducer + observer→LLM Team + scrum_applier #11
65
scripts/staffing/fixup_phone_type.py
Normal file
65
scripts/staffing/fixup_phone_type.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
fixup_phone_type.py — Decision D from the synthetic-data gap report.
|
||||||
|
|
||||||
|
Converts workers_500k.parquet `phone` column from int64 → string. Phones
|
||||||
|
in this dataset are 11-digit US numbers (1 + area + 7), e.g. 13122277740.
|
||||||
|
Stored as int64, the column compares fine numerically but breaks join
|
||||||
|
keys with string-typed phone columns elsewhere (formatted "+1...", or
|
||||||
|
loaded from a CSV).
|
||||||
|
|
||||||
|
Backs up the original to workers_500k.parquet.bak-<date> before write.
|
||||||
|
Idempotent: detects when the fix has already been applied and exits 0.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/staffing/fixup_phone_type.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import datetime as dt
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pyarrow as pa
|
||||||
|
import pyarrow.compute as pc
|
||||||
|
import pyarrow.parquet as pq
|
||||||
|
|
||||||
|
# Repository root: two directory levels above the directory holding this
# script (parents[2] of the resolved script path).
REPO = Path(__file__).resolve().parents[2]

# The Parquet dataset whose `phone` column this script rewrites.
TARGET = REPO.joinpath("data", "datasets", "workers_500k.parquet")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Rewrite TARGET so that its `phone` column is stored as string.

    Reads the whole Parquet file, casts `phone` to string, copies the
    original file to a dated ``.parquet.bak-<date>`` sibling (skipped when
    that day's backup already exists), and then replaces TARGET with the
    converted table. The converted table is first written under a temporary
    name next to TARGET and renamed over it, so an interrupted write does
    not leave a partially written dataset at TARGET.

    Returns:
        0 when the column was converted or was already string; 1 when
        TARGET does not exist or has no unique `phone` column.
    """
    if not TARGET.exists():
        print(f"missing: {TARGET}", file=sys.stderr)
        return 1

    table = pq.read_table(TARGET)

    # get_field_index yields -1 when the name is absent or duplicated;
    # report that as a normal failure instead of a KeyError traceback.
    phone_idx = table.schema.get_field_index("phone")
    if phone_idx < 0:
        print(f"no unique `phone` column in: {TARGET}", file=sys.stderr)
        return 1

    phone_field = table.schema.field(phone_idx)
    if phone_field.type == pa.string():
        # Idempotency guard: a previous run already converted the column.
        print("phone is already string — no-op")
        return 0

    # Keep at most one backup per calendar day; an existing same-day backup
    # is left untouched so it keeps holding the pre-conversion data.
    today = dt.date.today().isoformat()
    backup = TARGET.with_suffix(f".parquet.bak-{today}")
    if not backup.exists():
        shutil.copy2(TARGET, backup)
        print(f"backup: {backup.relative_to(REPO)}")

    phone_str = pc.cast(table.column(phone_idx), pa.string())
    new_table = table.set_column(
        phone_idx,
        # with_type changes only the data type, so the field keeps its
        # original nullability and metadata.
        phone_field.with_type(pa.string()),
        phone_str,
    )

    # Stage the new file beside TARGET (same directory, so the rename stays
    # on one filesystem) and swap it in only once it is fully written.
    staged = TARGET.with_name(f"{TARGET.name}.tmp")
    try:
        pq.write_table(new_table, staged, compression="snappy")
        # The staged file is a fresh file; carry over TARGET's permission
        # bits so the swap does not change who can read the dataset.
        shutil.copymode(TARGET, staged)
        staged.replace(TARGET)
    finally:
        # After a successful replace the staged path is gone; this only
        # cleans up the leftover from a failed write.
        if staged.exists():
            staged.unlink()

    # Read the column back from disk to report what was actually written.
    round_trip = pq.read_table(TARGET, columns=["phone"])
    sample = round_trip["phone"].slice(0, 3).to_pylist()
    print(f"wrote: {TARGET.relative_to(REPO)}")
    print(f"phone type: {round_trip.schema.field('phone').type}")
    print(f"sample: {sample}")
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the fix-up and hand main()'s return value to the interpreter as
    # the process exit status.
    raise SystemExit(main())
|
||||||
Loading…
x
Reference in New Issue
Block a user