lakehouse/scripts/staffing_demo.py
root 6740a017c7 PRD v2: production roadmap with ingest, vector search, hot cache phases
- Phase 6: Ingest pipeline (CSV/JSON → schema detect → Parquet → catalog)
- Phase 7: Vector index + RAG (embed → HNSW → semantic search → LLM answer)
- Phase 8: Hot cache + incremental updates (MemTable, delta files, merge-on-read)
- ADR-008 through ADR-011: embeddings as Parquet, delta files not Delta Lake,
  schema defaults to string, not a CRM replacement
- Staffing company reference dataset (286K rows, 7 tables)
- Honest risk assessment: vector search at scale and incremental updates are hard

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 07:54:24 -05:00

460 lines
23 KiB
Python

#!/usr/bin/env python3
"""
Realistic staffing company data generator.
Multiple source systems, overlapping data, real cross-reference problems.
Data sources (like a real staffing company):
- ATS (Applicant Tracking System) → candidates
- CRM → client companies + contacts
- Job board → job orders with descriptions
- Placements → who got placed where
- Timesheets → hours worked, bill/pay rates
- Phone system CDR → call detail records
- Email logs → communication tracking
"""
import random, json, urllib.request, hashlib, string, time
from datetime import datetime, timedelta
import pyarrow as pa, pyarrow.parquet as pq

# Base URL of the local lakehouse services (object storage + catalog).
API = "http://localhost:3100"
# Fixed seed: every run regenerates the exact same dataset.
random.seed(2026)
def upload(name, table):
    """Write *table* to a local Parquet file, PUT the bytes into object
    storage, then register the object with the catalog service.

    name:  dataset name; used for the temp file, storage key, and catalog entry.
    table: a pyarrow Table.
    Prints a one-line summary (row count, compressed size) when done.
    Raises urllib.error.URLError / HTTPError if either service call fails.
    """
    path = f"/tmp/{name}.parquet"
    pq.write_table(table, path, compression="snappy")
    with open(path, "rb") as f:
        data = f.read()
    key = f"datasets/{name}.parquet"
    # Upload the raw Parquet bytes to the object store.
    req = urllib.request.Request(f"{API}/storage/objects/{key}", data=data, method="PUT")
    # Fix: close the HTTP responses (original leaked the sockets).
    with urllib.request.urlopen(req):
        pass
    # Register the dataset in the catalog so it becomes queryable.
    body = json.dumps({"name": name, "schema_fingerprint": "auto",
                       "objects": [{"bucket": "data", "key": key, "size_bytes": len(data)}]}).encode()
    req = urllib.request.Request(f"{API}/catalog/datasets", data=body, method="POST",
                                 headers={"Content-Type": "application/json"})
    with urllib.request.urlopen(req):
        pass
    print(f" {name}: {table.num_rows:,} rows ({len(data)/1024:.0f} KB)")
# ============================================================
# Shared reference data
# ============================================================
# 100 common US first/last names. Sampling is with replacement, so
# duplicate people (same first+last) occur — as in a real ATS.
first_names = ["James","Mary","Robert","Patricia","John","Jennifer","Michael","Linda","David","Elizabeth",
               "William","Barbara","Richard","Susan","Joseph","Jessica","Thomas","Sarah","Christopher","Karen",
               "Charles","Lisa","Daniel","Nancy","Matthew","Betty","Anthony","Margaret","Mark","Sandra",
               "Donald","Ashley","Steven","Dorothy","Paul","Kimberly","Andrew","Emily","Joshua","Donna",
               "Kenneth","Michelle","Kevin","Carol","Brian","Amanda","George","Melissa","Timothy","Deborah",
               "Ronald","Stephanie","Edward","Rebecca","Jason","Sharon","Jeffrey","Laura","Ryan","Cynthia",
               "Jacob","Kathleen","Gary","Amy","Nicholas","Angela","Eric","Shirley","Jonathan","Anna",
               "Stephen","Brenda","Larry","Pamela","Justin","Emma","Scott","Nicole","Brandon","Helen",
               "Benjamin","Samantha","Samuel","Katherine","Raymond","Christine","Gregory","Debra","Frank","Rachel",
               "Alexander","Carolyn","Patrick","Janet","Jack","Catherine","Dennis","Maria","Jerry","Heather"]
last_names = ["Smith","Johnson","Williams","Brown","Jones","Garcia","Miller","Davis","Rodriguez","Martinez",
              "Hernandez","Lopez","Gonzalez","Wilson","Anderson","Thomas","Taylor","Moore","Jackson","Martin",
              "Lee","Perez","Thompson","White","Harris","Sanchez","Clark","Ramirez","Lewis","Robinson",
              "Walker","Young","Allen","King","Wright","Scott","Torres","Nguyen","Hill","Flores",
              "Green","Adams","Nelson","Baker","Hall","Rivera","Campbell","Mitchell","Carter","Roberts",
              "Gomez","Phillips","Evans","Turner","Diaz","Parker","Cruz","Edwards","Collins","Reyes",
              "Stewart","Morris","Morales","Murphy","Cook","Rogers","Gutierrez","Ortiz","Morgan","Cooper",
              "Peterson","Bailey","Reed","Kelly","Howard","Ramos","Kim","Cox","Ward","Richardson"]
# (city, state, zip) triples — several zips per metro so zip-level
# queries return non-trivial result sets.
cities_zips = [
    ("Chicago","IL","60601"),("Chicago","IL","60602"),("Chicago","IL","60603"),("Chicago","IL","60610"),
    ("Chicago","IL","60614"),("Chicago","IL","60616"),("Chicago","IL","60622"),("Chicago","IL","60647"),
    ("New York","NY","10001"),("New York","NY","10002"),("New York","NY","10003"),("New York","NY","10010"),
    ("New York","NY","10016"),("New York","NY","10019"),("New York","NY","10022"),("New York","NY","10036"),
    ("Los Angeles","CA","90001"),("Los Angeles","CA","90012"),("Los Angeles","CA","90024"),("Los Angeles","CA","90036"),
    ("Houston","TX","77001"),("Houston","TX","77002"),("Houston","TX","77003"),("Houston","TX","77019"),
    ("Dallas","TX","75201"),("Dallas","TX","75202"),("Dallas","TX","75204"),("Dallas","TX","75219"),
    ("Atlanta","GA","30301"),("Atlanta","GA","30303"),("Atlanta","GA","30305"),("Atlanta","GA","30309"),
    ("Denver","CO","80201"),("Denver","CO","80202"),("Denver","CO","80204"),("Denver","CO","80206"),
    ("Phoenix","AZ","85001"),("Phoenix","AZ","85003"),("Phoenix","AZ","85004"),("Phoenix","AZ","85006"),
    ("Seattle","WA","98101"),("Seattle","WA","98102"),("Seattle","WA","98103"),("Seattle","WA","98104"),
    ("Miami","FL","33101"),("Miami","FL","33125"),("Miami","FL","33130"),("Miami","FL","33132"),
]
# Skills keyed by staffing vertical; candidate/job skills are sampled
# from the matching vertical's pool.
skills_pool = {
    "IT": ["Java","Python","C#",".NET","JavaScript","TypeScript","React","Angular","Node.js","SQL",
           "AWS","Azure","GCP","Docker","Kubernetes","Linux","Git","REST APIs","GraphQL","MongoDB",
           "PostgreSQL","MySQL","Redis","Terraform","Jenkins","CI/CD","Agile","Scrum","DevOps","Microservices",
           "Spring Boot","Django","Flask","Ruby on Rails","Go","Rust","Swift","Kotlin","PHP","Vue.js"],
    "Healthcare": ["RN","LPN","CNA","BLS","ACLS","PALS","EMR","Epic","Cerner","Meditech",
                   "ICD-10","CPT","Medical Billing","Medical Coding","HIPAA","Phlebotomy","IV Therapy",
                   "Telemetry","ICU","OR","ER","Med-Surg","Labor & Delivery","Pediatrics","Oncology"],
    "Industrial": ["Forklift","OSHA 10","OSHA 30","Welding","MIG","TIG","CNC","PLC","Blueprint Reading",
                   "Quality Control","Six Sigma","Lean Manufacturing","AutoCAD","SolidWorks","GD&T",
                   "Mechanical Assembly","Electrical","Hydraulics","Pneumatics","Warehouse"],
    "Accounting": ["QuickBooks","SAP","Oracle","Accounts Payable","Accounts Receivable","General Ledger",
                   "Financial Reporting","Tax Preparation","CPA","Payroll","Budgeting","Forecasting",
                   "Audit","Compliance","Excel Advanced","Power BI","Tableau","GAAP","SOX"],
    "Admin": ["Microsoft Office","Data Entry","Customer Service","Scheduling","Filing","Receptionist",
              "Executive Assistant","Travel Coordination","Calendar Management","SAP","Salesforce",
              "CRM","Multi-line Phone","Typing 60+ WPM","Notary","Bilingual Spanish"],
}
verticals = list(skills_pool.keys())  # ["IT", "Healthcare", "Industrial", "Accounting", "Admin"]
# Personal (candidate-side) email providers; client contacts get corporate domains.
email_domains = ["gmail.com","yahoo.com","hotmail.com","outlook.com","aol.com","icloud.com","protonmail.com"]
def make_phone():
    """Generate a random US-style phone number, e.g. "(312) 555-0147"."""
    area = random.randint(200, 999)
    exchange = random.randint(200, 999)
    line = random.randint(1000, 9999)
    return f"({area}) {exchange}-{line}"
def make_email(first, last, domains=None):
    """Build a plausible personal email address for a first/last name.

    first, last: name parts, lower-cased into the address.
    domains: optional list of domains to draw from; defaults to the
        module-level ``email_domains`` pool (backward compatible).
    """
    if domains is None:
        domains = email_domains
    sep = random.choice([".", "_", ""])
    # NOTE: the numeric suffix candidate is built eagerly — randint is
    # drawn even when "" is chosen — preserving the original RNG stream.
    num = random.choice(["", str(random.randint(1,99))])
    return f"{first.lower()}{sep}{last.lower()}{num}@{random.choice(domains)}"
base_date = datetime(2026, 1, 1)  # anchor "today"; all generated dates are offsets back from this
# ============================================================
# 1. CANDIDATES — 15,000 from ATS
# ============================================================
print("Generating candidates (15K)...")
N_CAND = 15000
# One Python list per output Parquet column.
c_ids, c_first, c_last, c_emails, c_phones, c_phones_alt = [], [], [], [], [], []
c_city, c_state, c_zip = [], [], []
c_vertical, c_skills, c_resume_summary = [], [], []
c_status, c_source, c_pay_rate_min, c_created = [], [], [], []
c_availability, c_years_exp = [], []
for i in range(N_CAND):
    fn = random.choice(first_names)
    ln = random.choice(last_names)
    city, state, zipcode = random.choice(cities_zips)
    vert = random.choice(verticals)
    n_skills = random.randint(3, 12)
    # Some verticals have fewer skills than requested; cap the sample size.
    sk = random.sample(skills_pool[vert], min(n_skills, len(skills_pool[vert])))
    yrs = random.randint(0, 25)
    # Fix: original f-string ran the name and vertical together
    # ("Jane DoeIT professional ..."); insert the missing ", " separator.
    resume = f"{fn} {ln}, {vert} professional with {yrs} years experience. "
    resume += f"Based in {city}, {state} {zipcode}. "
    resume += f"Key skills: {', '.join(sk)}. "
    # All three sentence candidates are built eagerly (each f-string draws
    # from the RNG) before one is chosen — kept as-is so the deterministic
    # RNG stream matches the original generator exactly.
    resume += random.choice([
        f"Previously worked at {random.choice(['Acme Corp','TechFlow','GlobalStaff','MedPro','BuildRight'])} as a {random.choice(['Senior','Lead','Staff','Junior'])} {vert} specialist.",
        f"Seeking {random.choice(['contract','full-time','temp-to-hire'])} opportunities in the {city} metro area.",
        f"Available {random.choice(['immediately','in 2 weeks','after current contract ends'])}. Open to {random.choice(['remote','hybrid','on-site'])} work.",
    ])
    c_ids.append(f"CAND-{i+1:05d}")
    c_first.append(fn)
    c_last.append(ln)
    c_emails.append(make_email(fn, ln))
    c_phones.append(make_phone())
    c_phones_alt.append(make_phone() if random.random() < 0.3 else "")  # ~30% have an alternate number
    c_city.append(city)
    c_state.append(state)
    c_zip.append(zipcode)
    c_vertical.append(vert)
    c_skills.append("|".join(sk))  # pipe-delimited skill list
    c_resume_summary.append(resume)
    # Repeated entries weight the draw: "active" is 4x as likely.
    c_status.append(random.choice(["active","active","active","active","inactive","do_not_contact","placed"]))
    c_source.append(random.choice(["Indeed","LinkedIn","Referral","Walk-in","Monster","CareerBuilder","Website","Job Fair"]))
    c_pay_rate_min.append(round(random.uniform(12, 85), 2))
    c_created.append((base_date - timedelta(days=random.randint(0, 1095))).strftime("%Y-%m-%d"))  # within ~3 years
    c_availability.append(random.choice(["immediate","1_week","2_weeks","1_month","not_available"]))
    c_years_exp.append(yrs)
candidates = pa.table({
    "candidate_id": c_ids, "first_name": c_first, "last_name": c_last,
    "email": c_emails, "phone": c_phones, "phone_alt": c_phones_alt,
    "city": c_city, "state": c_state, "zip": c_zip,
    "vertical": c_vertical, "skills": c_skills, "resume_summary": c_resume_summary,
    "status": c_status, "source": c_source, "min_pay_rate": c_pay_rate_min,
    "created_date": c_created, "availability": c_availability, "years_experience": c_years_exp,
})
upload("candidates", candidates)
# ============================================================
# 2. CLIENTS — 500 companies
# ============================================================
print("Generating clients (500)...")
# 20 prefixes x 15 suffixes = 300 distinct names for 500 clients,
# so duplicate company names occur — a realistic CRM data problem.
company_prefixes = ["Apex","Summit","Core","First","National","Metro","Pacific","Atlantic","Central","Premier",
                    "Global","United","Alliance","Pinnacle","Elite","Horizon","Pioneer","Titan","Quantum","Vertex"]
company_suffixes = ["Industries","Solutions","Systems","Group","Corp","Technologies","Services","Partners",
                    "Holdings","Enterprises","Manufacturing","Healthcare","Logistics","Financial","Engineering"]
cl_ids, cl_names, cl_verticals, cl_contacts, cl_contact_emails, cl_contact_phones = [], [], [], [], [], []
cl_city, cl_state, cl_zip, cl_bill_rate_avg, cl_status, cl_since = [], [], [], [], [], []
for i in range(500):
    name = f"{random.choice(company_prefixes)} {random.choice(company_suffixes)}"
    city, state, zipcode = random.choice(cities_zips)
    vert = random.choice(verticals)
    contact_fn = random.choice(first_names)
    contact_ln = random.choice(last_names)
    cl_ids.append(f"CLI-{i+1:04d}")
    cl_names.append(name)
    cl_verticals.append(vert)
    cl_contacts.append(f"{contact_fn} {contact_ln}")
    # Corporate contact email derived from the company name (spaces stripped).
    cl_contact_emails.append(f"{contact_fn.lower()}.{contact_ln.lower()}@{name.lower().replace(' ','')}.com")
    cl_contact_phones.append(make_phone())
    cl_city.append(city)
    cl_state.append(state)
    cl_zip.append(zipcode)
    cl_bill_rate_avg.append(round(random.uniform(25, 150), 2))
    # Weighted draw: "active" 3x as likely as "inactive" or "prospect".
    cl_status.append(random.choice(["active","active","active","inactive","prospect"]))
    cl_since.append((base_date - timedelta(days=random.randint(30, 2000))).strftime("%Y-%m-%d"))
clients = pa.table({
    "client_id": cl_ids, "company_name": cl_names, "vertical": cl_verticals,
    "contact_name": cl_contacts, "contact_email": cl_contact_emails, "contact_phone": cl_contact_phones,
    "city": cl_city, "state": cl_state, "zip": cl_zip,
    "avg_bill_rate": cl_bill_rate_avg, "status": cl_status, "client_since": cl_since,
})
upload("clients", clients)
# ============================================================
# 3. JOB ORDERS — 3,000 open/filled/closed
# ============================================================
print("Generating job_orders (3K)...")
# Representative job titles per staffing vertical.
titles = {
    "IT": ["Software Developer","Java Developer",".NET Developer","DevOps Engineer","Data Analyst",
           "QA Engineer","Systems Admin","Help Desk","Network Engineer","Cloud Architect",
           "Full Stack Developer","Python Developer","React Developer","DBA","Security Analyst"],
    "Healthcare": ["Registered Nurse","LPN","CNA","Medical Assistant","Phlebotomist",
                   "Radiology Tech","Pharmacy Tech","Medical Coder","Billing Specialist","Case Manager"],
    "Industrial": ["Forklift Operator","Welder","CNC Machinist","Quality Inspector","Maintenance Tech",
                   "Electrician","Warehouse Associate","Assembly Technician","Production Supervisor","Shipping Clerk"],
    "Accounting": ["Staff Accountant","AP Specialist","AR Specialist","Payroll Clerk","Tax Preparer",
                   "Financial Analyst","Bookkeeper","Audit Associate","Controller","Cost Accountant"],
    "Admin": ["Administrative Assistant","Executive Assistant","Receptionist","Data Entry Clerk","Office Manager",
              "Customer Service Rep","HR Coordinator","Legal Secretary","Office Coordinator","Scheduler"],
}
jo_ids, jo_client_ids, jo_titles, jo_verticals, jo_descriptions = [], [], [], [], []
jo_city, jo_state, jo_zip = [], [], []
jo_bill_rate, jo_pay_rate, jo_status, jo_openings, jo_created = [], [], [], [], []
jo_work_type, jo_duration = [], []
for i in range(3000):
    vert = random.choice(verticals)
    title = random.choice(titles[vert])
    # Fix: derive the index bound from the actual client list instead of the
    # hard-coded 499. Same RNG draw today (len(cl_ids) == 500), but no longer
    # silently wrong if the client count changes.
    ci = random.randint(0, len(cl_ids) - 1)
    city, state, zipcode = random.choice(cities_zips)
    bill = round(random.uniform(25, 150), 2)
    # Pay is 55-75% of bill: the agency keeps a 25-45% spread.
    pay = round(bill * random.uniform(0.55, 0.75), 2)
    req_skills = random.sample(skills_pool[vert], min(random.randint(3, 6), len(skills_pool[vert])))
    desc = f"{title} needed for {cl_names[ci]} in {city}, {state}. "
    desc += f"Requirements: {', '.join(req_skills)}. "
    desc += f"{random.randint(1,10)}+ years experience preferred. "
    desc += f"Bill rate: ${bill}/hr. "
    desc += random.choice([
        "Background check required.",
        "Drug screen required.",
        "Must have reliable transportation.",
        "Steel-toe boots required on site.",
        "Remote work available.",
        "Hybrid schedule: 3 days on-site.",
    ])
    jo_ids.append(f"JO-{i+1:05d}")
    jo_client_ids.append(cl_ids[ci])
    jo_titles.append(title)
    jo_verticals.append(vert)
    jo_descriptions.append(desc)
    jo_city.append(city)
    jo_state.append(state)
    jo_zip.append(zipcode)
    jo_bill_rate.append(bill)
    jo_pay_rate.append(pay)
    # Weighted statuses: open 3/7, filled 2/7, closed and on_hold 1/7 each.
    jo_status.append(random.choice(["open","open","open","filled","filled","closed","on_hold"]))
    jo_openings.append(random.randint(1, 5))
    jo_created.append((base_date - timedelta(days=random.randint(0, 365))).strftime("%Y-%m-%d"))
    # "contract" listed twice, so it is twice as likely as the other types.
    jo_work_type.append(random.choice(["contract","temp_to_hire","direct_hire","contract"]))
    jo_duration.append(random.choice(["3 months","6 months","12 months","ongoing","project-based"]))
job_orders = pa.table({
    "job_order_id": jo_ids, "client_id": jo_client_ids, "title": jo_titles,
    "vertical": jo_verticals, "description": jo_descriptions,
    "city": jo_city, "state": jo_state, "zip": jo_zip,
    "bill_rate": jo_bill_rate, "pay_rate": jo_pay_rate, "status": jo_status,
    "openings": jo_openings, "created_date": jo_created,
    "work_type": jo_work_type, "duration": jo_duration,
})
upload("job_orders", job_orders)
# ============================================================
# 4. PLACEMENTS — 8,000 candidate-job matches
# ============================================================
print("Generating placements (8K)...")
p_ids, p_cand_ids, p_job_ids, p_client_ids = [], [], [], []
p_start, p_end, p_status, p_bill, p_pay, p_recruiter = [], [], [], [], [], []
# Pool of 30 recruiter names, shared with the call/email logs below.
recruiters = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(30)]
for i in range(8000):
    ci = random.randint(0, N_CAND - 1)
    # Fix: derive the bound from the job-order list instead of the
    # hard-coded 2999 (same RNG draw today — len(jo_ids) == 3000).
    ji = random.randint(0, len(jo_ids) - 1)
    start = base_date - timedelta(days=random.randint(0, 730))
    end = start + timedelta(days=random.randint(30, 365))
    p_ids.append(f"PL-{i+1:05d}")
    p_cand_ids.append(c_ids[ci])
    p_job_ids.append(jo_ids[ji])
    p_client_ids.append(jo_client_ids[ji])
    p_start.append(start.strftime("%Y-%m-%d"))
    # ~30% of placements are still open (empty end_date); the end date is
    # still drawn so the RNG stream stays aligned with the original.
    p_end.append(end.strftime("%Y-%m-%d") if random.random() < 0.7 else "")
    p_status.append(random.choice(["active","active","completed","completed","terminated","no_show"]))
    # Rates are copied from the job order so the tables can be cross-checked.
    p_bill.append(jo_bill_rate[ji])
    p_pay.append(jo_pay_rate[ji])
    p_recruiter.append(random.choice(recruiters))
placements = pa.table({
    "placement_id": p_ids, "candidate_id": p_cand_ids, "job_order_id": p_job_ids,
    "client_id": p_client_ids, "start_date": p_start, "end_date": p_end,
    "status": p_status, "bill_rate": p_bill, "pay_rate": p_pay, "recruiter": p_recruiter,
})
upload("placements", placements)
# ============================================================
# 5. TIMESHEETS — 120K weekly entries
# ============================================================
print("Generating timesheets (120K)...")
ts_ids, ts_placement_ids, ts_cand_ids, ts_client_ids = [], [], [], []
ts_week_ending, ts_hours_reg, ts_hours_ot, ts_bill_total, ts_pay_total = [], [], [], [], []
ts_approved, ts_approved_by = [], []
for i in range(120000):
    # Fix: derive the bound from the placement list instead of the
    # hard-coded 7999 (same RNG draw today — len(p_ids) == 8000).
    pi = random.randint(0, len(p_ids) - 1)
    # Mostly full 40-hour weeks with occasional short weeks.
    hrs_reg = round(random.choice([40, 40, 40, 32, 24, 20, 8]), 1)
    # Half the weeks have no overtime at all.
    hrs_ot = round(random.choice([0, 0, 0, 0, 4, 8, 12, 16]), 1)
    bill = p_bill[pi]
    pay = p_pay[pi]
    ts_ids.append(f"TS-{i+1:06d}")
    ts_placement_ids.append(p_ids[pi])
    ts_cand_ids.append(p_cand_ids[pi])
    ts_client_ids.append(p_client_ids[pi])
    ts_week_ending.append((base_date - timedelta(weeks=random.randint(0, 104))).strftime("%Y-%m-%d"))
    ts_hours_reg.append(hrs_reg)
    ts_hours_ot.append(hrs_ot)
    # Overtime billed and paid at time-and-a-half.
    ts_bill_total.append(round(hrs_reg * bill + hrs_ot * bill * 1.5, 2))
    ts_pay_total.append(round(hrs_reg * pay + hrs_ot * pay * 1.5, 2))
    ts_approved.append(random.choice([True, True, True, True, False]))  # ~80% approved
    # Approver is a client contact, only recorded for approved sheets.
    ts_approved_by.append(random.choice(cl_contacts) if ts_approved[-1] else "")
timesheets = pa.table({
    "timesheet_id": ts_ids, "placement_id": ts_placement_ids,
    "candidate_id": ts_cand_ids, "client_id": ts_client_ids,
    "week_ending": ts_week_ending, "hours_regular": ts_hours_reg, "hours_overtime": ts_hours_ot,
    "bill_total": ts_bill_total, "pay_total": ts_pay_total,
    "approved": ts_approved, "approved_by": ts_approved_by,
})
upload("timesheets", timesheets)
# ============================================================
# 6. CALL LOG — 80K phone records (CDR)
# ============================================================
print("Generating call_log (80K)...")
call_ids, call_from, call_to, call_direction = [], [], [], []
call_duration, call_timestamp, call_recruiter, call_cand_id, call_disposition = [], [], [], [], []
dispositions = ["connected","voicemail","no_answer","busy","wrong_number","callback_scheduled","declined"]
for i in range(80000):
    ci = random.randint(0, N_CAND - 1)
    rec = random.choice(recruiters)
    # 3:1 outbound-to-inbound mix via repeated list entries.
    direction = random.choice(["outbound","outbound","outbound","inbound"])
    call_ids.append(f"CALL-{i+1:06d}")
    if direction == "outbound":
        # NOTE: the recruiter-side number is freshly randomized per call, so
        # a recruiter has no stable line — a deliberate cross-reference wrinkle.
        call_from.append(make_phone()) # recruiter's line
        call_to.append(c_phones[ci])
    else:
        call_from.append(c_phones[ci])
        call_to.append(make_phone())
    call_direction.append(direction)
    call_duration.append(random.randint(0, 1800))  # 0..30 minutes, in seconds
    call_timestamp.append((base_date - timedelta(seconds=random.randint(0, 86400 * 365))).isoformat())
    call_recruiter.append(rec)
    call_cand_id.append(c_ids[ci])
    call_disposition.append(random.choice(dispositions))
call_log = pa.table({
    "call_id": call_ids, "from_number": call_from, "to_number": call_to,
    "direction": call_direction, "duration_seconds": call_duration,
    "timestamp": call_timestamp, "recruiter": call_recruiter,
    "candidate_id": call_cand_id, "disposition": call_disposition,
})
upload("call_log", call_log)
# ============================================================
# 7. EMAIL LOG — 60K email records
# ============================================================
print("Generating email_log (60K)...")
em_ids, em_from, em_to, em_subject, em_timestamp = [], [], [], [], []
em_recruiter, em_cand_id, em_direction, em_opened = [], [], [], []
# Subject templates; placeholders are filled per-email below. str.format
# ignores unused keyword arguments, so every template gets all four.
subjects = [
    "New job opportunity — {title} in {city}",
    "Following up on your application",
    "Interview scheduled — {title}",
    "Timesheet reminder for week ending {date}",
    "Your background check is complete",
    "New assignment details — {client}",
    "Pay rate update for your current assignment",
    "Re: Availability for {title} position",
    "Welcome to {client} — your first day info",
    "Reference check request",
]
for i in range(60000):
    ci = random.randint(0, N_CAND - 1)
    # Fix: derive index bounds from the source lists instead of the
    # hard-coded 2999 / 499 (same RNG draws today; robust to size changes).
    ji = random.randint(0, len(jo_ids) - 1)
    rec = random.choice(recruiters)
    # 3:1 outbound-to-inbound mix via repeated list entries.
    direction = random.choice(["outbound","outbound","outbound","inbound"])
    subj = random.choice(subjects).format(
        title=jo_titles[ji], city=jo_city[ji], date="2026-01-05", client=cl_names[random.randint(0, len(cl_names) - 1)]
    )
    em_ids.append(f"EM-{i+1:06d}")
    if direction == "outbound":
        em_from.append(f"{rec.replace(' ','.').lower()}@acmestaffing.com")
        em_to.append(c_emails[ci])
    else:
        em_from.append(c_emails[ci])
        em_to.append(f"{rec.replace(' ','.').lower()}@acmestaffing.com")
    em_subject.append(subj)
    em_timestamp.append((base_date - timedelta(seconds=random.randint(0, 86400 * 365))).isoformat())
    em_recruiter.append(rec)
    em_cand_id.append(c_ids[ci])
    em_direction.append(direction)
    # Outbound emails are opened ~60% of the time; inbound count as opened.
    em_opened.append(random.random() < 0.6 if direction == "outbound" else True)
email_log = pa.table({
    "email_id": em_ids, "from_addr": em_from, "to_addr": em_to,
    "subject": em_subject, "timestamp": em_timestamp,
    "recruiter": em_recruiter, "candidate_id": em_cand_id,
    "direction": em_direction, "opened": em_opened,
})
upload("email_log", email_log)
# ============================================================
# Final summary of everything uploaded above.
all_tables = (candidates, clients, job_orders, placements, timesheets, call_log, email_log)
total = sum(t.num_rows for t in all_tables)
divider = "=" * 60
print("\n" + divider)
print(f"Staffing company data loaded: {total:,} total rows across 7 tables")
print(divider)
print("""
Cross-reference queries to try:
"Find all Java developers in Chicago who are available immediately"
"Which recruiter has the most placements this year?"
"Show me the total revenue by client for Q1 2026"
"Find candidates who were called more than 5 times but never placed"
"What's the average bill rate for .NET developers in New York?"
"Which clients have the highest overtime hours?"
"Show candidates in zip 60601 with Healthcare skills"
"Find the spread (bill - pay) by vertical"
"Which candidates have worked for multiple different clients?"
"Show email open rates by recruiter"
""")