Session infrastructure: OpenRouter + tree-split reducer + observer→LLM Team + scrum_applier #11
65
scripts/staffing/fixup_phone_type.py
Normal file
65
scripts/staffing/fixup_phone_type.py
Normal file
@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
fixup_phone_type.py — Decision D from the synthetic-data gap report.
|
||||
|
||||
Converts workers_500k.parquet `phone` column from int64 → string. Phones
|
||||
in this dataset are 11-digit US numbers (1 + area + 7), e.g. 13122277740.
|
||||
Stored as int64, the column compares fine numerically but breaks join
|
||||
keys with string-typed phone columns elsewhere (formatted "+1...", or
|
||||
loaded from a CSV).
|
||||
|
||||
Backs up the original to workers_500k.parquet.bak-<date> before write.
|
||||
Idempotent: detects when the fix has already been applied and exits 0.
|
||||
|
||||
Usage:
|
||||
python3 scripts/staffing/fixup_phone_type.py
|
||||
"""
|
||||
|
||||
import datetime as dt
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pyarrow as pa
|
||||
import pyarrow.compute as pc
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
# Repository root: this script lives two directories below it
# (scripts/staffing/), so the root is the third ancestor of this file.
REPO = Path(__file__).resolve().parents[2]
# The parquet dataset whose `phone` column is rewritten in place.
TARGET = REPO.joinpath("data", "datasets", "workers_500k.parquet")
|
||||
|
||||
|
||||
def main() -> int:
    """Rewrite TARGET so its ``phone`` column is string-typed.

    The original file is copied to ``<name>.parquet.bak-<ISO date>`` (once
    per day) before anything is changed. The converted table is written to
    a sibling temp file, read back, and only then moved over TARGET, so a
    failed or interrupted write never leaves a truncated dataset at the
    canonical path.

    Returns:
        0 if the column was converted or was already string-typed;
        1 if TARGET does not exist or has no (unique) ``phone`` column.
    """
    if not TARGET.exists():
        print(f"missing: {TARGET}", file=sys.stderr)
        return 1

    table = pq.read_table(TARGET)

    # get_field_index returns -1 when the name is absent or duplicated;
    # report that cleanly instead of crashing with a KeyError.
    phone_idx = table.schema.get_field_index("phone")
    if phone_idx < 0:
        print(f"no 'phone' column in {TARGET}", file=sys.stderr)
        return 1

    # Idempotency guard: a previous run already converted the column.
    if table.schema.field(phone_idx).type == pa.string():
        print("phone is already string — no-op")
        return 0

    today = dt.date.today().isoformat()
    backup = TARGET.with_suffix(f".parquet.bak-{today}")
    # Never overwrite an existing backup from earlier the same day: it holds
    # the pre-conversion data.
    if not backup.exists():
        shutil.copy2(TARGET, backup)
        print(f"backup: {backup.relative_to(REPO)}")

    phone_str = pc.cast(table["phone"], pa.string())
    new_table = table.set_column(
        phone_idx,
        pa.field("phone", pa.string()),
        phone_str,
    )

    # Write next to TARGET (same filesystem) so the final rename is atomic.
    tmp = TARGET.with_name(TARGET.name + ".tmp")
    try:
        pq.write_table(new_table, tmp, compression="snappy")
        # Read the new file back before swapping it in, so an unreadable
        # write never replaces the good dataset.
        round_trip = pq.read_table(tmp, columns=["phone"])
        tmp.replace(TARGET)
    finally:
        # After a successful replace the temp path is gone; this only
        # cleans up a partial file left behind by a failure.
        if tmp.exists():
            tmp.unlink()

    sample = round_trip["phone"].slice(0, 3).to_pylist()
    print(f"wrote: {TARGET.relative_to(REPO)}")
    print(f"phone type: {round_trip.schema.field('phone').type}")
    print(f"sample: {sample}")
    return 0
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Loading…
x
Reference in New Issue
Block a user