# lakehouse/scripts/generate_workers.py

#!/usr/bin/env python3
"""Generate realistic staffing worker profiles at scale.

Usage:
  python3 generate_workers.py 100000 > /tmp/workers_100k.csv
  curl -X POST "http://localhost:3100/ingest/file?name=workers_100k" \
       -F "file=@/tmp/workers_100k.csv"

Design: combinatorial generation with industry-realistic distributions.
No LLM dependency — runs in seconds. Each worker has:
  - Name, email, phone (not guaranteed unique; worker_id is the unique key)
  - Role drawn from weighted staffing-industry distribution
  - City/state from real Midwest/South geography (staffing agency footprint)
  - Skills per role (realistic combos, not random)
  - Certifications (names only; no expiry date is written to the CSV)
  - Behavioral archetype + numeric scores
  - SMS communication history (templated from real patterns)
  - Resume summary text (for embedding)

The generated data is designed to stress-test:
  - SQL filters (WHERE role=X AND state=Y AND reliability>Z)
  - Vector search (resume_text embeddings)
  - Hybrid SQL+vector (structured + semantic together)
  - Profile-scoped search (bound_datasets filtering)
  - Concurrent query load at scale
"""

import csv, random, sys, hashlib
from datetime import datetime, timedelta

# ─── Configuration ───

# Number of worker rows to generate: the first command-line argument when one
# is given, otherwise 100,000. A non-integer argument raises ValueError here.
N = int(sys.argv[1]) if len(sys.argv) > 1 else 100000
# Fixed seed for the module-level `random` generator. It is applied before any
# draw below, so every run starts from the same generator state.
SEED = 2026
random.seed(SEED)

# ─── Realistic data pools ───

# Given-name pool. The main loop picks one entry per worker with a uniform
# random.choice, so reordering or resizing this list changes which names a
# given seed produces.
FIRST_NAMES = [
    "James", "Mary", "Robert", "Patricia", "John", "Jennifer", "Michael", "Linda",
    "David", "Elizabeth", "William", "Barbara", "Richard", "Susan", "Joseph", "Jessica",
    "Thomas", "Sarah", "Christopher", "Karen", "Charles", "Lisa", "Daniel", "Nancy",
    "Matthew", "Betty", "Anthony", "Margaret", "Mark", "Sandra", "Donald", "Ashley",
    "Steven", "Kimberly", "Paul", "Emily", "Andrew", "Donna", "Joshua", "Michelle",
    "Kenneth", "Carol", "Kevin", "Amanda", "Brian", "Dorothy", "George", "Melissa",
    "Timothy", "Deborah", "Ronald", "Stephanie", "Edward", "Rebecca", "Jason", "Sharon",
    "Jeffrey", "Laura", "Ryan", "Cynthia", "Jacob", "Kathleen", "Gary", "Amy",
    "Nicholas", "Angela", "Eric", "Shirley", "Jonathan", "Anna", "Stephen", "Brenda",
    "Larry", "Pamela", "Justin", "Emma", "Scott", "Nicole", "Brandon", "Helen",
    "Benjamin", "Samantha", "Samuel", "Katherine", "Raymond", "Christine", "Gregory", "Debra",
    "Frank", "Rachel", "Alexander", "Carolyn", "Patrick", "Janet", "Jack", "Catherine",
    "Dennis", "Maria", "Jerry", "Heather", "Tyler", "Diane", "Aaron", "Ruth",
    "Jose", "Julie", "Adam", "Olivia", "Nathan", "Joyce", "Henry", "Virginia",
    "Douglas", "Victoria", "Zachary", "Kelly", "Peter", "Lauren", "Kyle", "Christina",
    "Jamal", "Terrence", "Marcus", "DeShawn", "Malik", "Andre", "Carlos", "Miguel",
    "Luis", "Sofia", "Rosa", "Carmen", "Alejandro", "Roberto", "Priya", "Aisha",
    "Wei", "Yuki", "Omar", "Fatima", "Raj", "Mei", "Olga", "Ivan",
]

# Family-name pool. The main loop picks one entry per worker with a uniform
# random.choice, independently of the first name. The lowercased value is also
# used in the e-mail local part by gen_email.
LAST_NAMES = [
    "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis",
    "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson",
    "Thomas", "Taylor", "Moore", "Jackson", "Martin", "Lee", "Perez", "Thompson",
    "White", "Harris", "Sanchez", "Clark", "Ramirez", "Lewis", "Robinson", "Walker",
    "Young", "Allen", "King", "Wright", "Scott", "Torres", "Nguyen", "Hill",
    "Flores", "Green", "Adams", "Nelson", "Baker", "Hall", "Rivera", "Campbell",
    "Mitchell", "Carter", "Roberts", "Gomez", "Phillips", "Evans", "Turner", "Diaz",
    "Parker", "Cruz", "Edwards", "Collins", "Reyes", "Stewart", "Morris", "Morales",
    "Murphy", "Cook", "Rogers", "Gutierrez", "Ortiz", "Morgan", "Cooper", "Peterson",
    "Bailey", "Reed", "Kelly", "Howard", "Ramos", "Kim", "Cox", "Ward",
    "Richardson", "Watson", "Brooks", "Chavez", "Wood", "James", "Bennett", "Gray",
    "Mendoza", "Ruiz", "Hughes", "Price", "Alvarez", "Castillo", "Sanders", "Patel",
    "Myers", "Long", "Ross", "Foster", "Jimenez", "Powell", "Jenkins", "Perry",
]

# Real Midwest/South cities — staffing agency footprint.
# Each entry is (city, state abbreviation, 3-character ZIP prefix). The main
# loop picks one tuple uniformly and appends two random digits (10-99) to the
# prefix to form the 5-character `zip` column. City names repeat across states
# (e.g. Springfield, Bloomington), so city alone does not identify a location.
CITIES = [
    ("Chicago", "IL", "606"), ("Springfield", "IL", "627"), ("Rockford", "IL", "611"),
    ("Peoria", "IL", "616"), ("Champaign", "IL", "618"), ("Decatur", "IL", "625"),
    ("Bloomington", "IL", "617"), ("Joliet", "IL", "604"), ("Mattoon", "IL", "619"),
    ("Danville", "IL", "618"), ("Quincy", "IL", "623"), ("Galesburg", "IL", "614"),
    ("Indianapolis", "IN", "462"), ("Fort Wayne", "IN", "468"), ("Evansville", "IN", "477"),
    ("South Bend", "IN", "466"), ("Terre Haute", "IN", "478"), ("Bloomington", "IN", "474"),
    ("Columbus", "OH", "432"), ("Cleveland", "OH", "441"), ("Cincinnati", "OH", "452"),
    ("Dayton", "OH", "454"), ("Toledo", "OH", "436"), ("Akron", "OH", "443"),
    ("St. Louis", "MO", "631"), ("Kansas City", "MO", "641"), ("Springfield", "MO", "658"),
    ("Columbia", "MO", "652"), ("Jefferson City", "MO", "651"),
    ("Nashville", "TN", "372"), ("Memphis", "TN", "381"), ("Knoxville", "TN", "379"),
    ("Louisville", "KY", "402"), ("Lexington", "KY", "405"),
    ("Milwaukee", "WI", "532"), ("Madison", "WI", "537"), ("Green Bay", "WI", "543"),
    ("Detroit", "MI", "481"), ("Grand Rapids", "MI", "495"), ("Lansing", "MI", "489"),
    ("Des Moines", "IA", "503"), ("Cedar Rapids", "IA", "524"),
    ("Minneapolis", "MN", "554"), ("St. Paul", "MN", "551"),
]

# Roles with industry-realistic weights.
# Each entry is (role name, relative weight). The weights are passed to
# random.choices, so only their ratios matter; they are not probabilities.
ROLES = [
    ("Forklift Operator", 12), ("Material Handler", 11), ("Machine Operator", 10),
    ("Assembler", 9), ("Production Worker", 9), ("Warehouse Associate", 8),
    ("Quality Tech", 7), ("Shipping Clerk", 6), ("Loader", 6),
    ("Inventory Clerk", 5), ("Line Lead", 4), ("Maintenance Tech", 4),
    ("Welder", 3), ("CNC Operator", 3), ("Sanitation Worker", 3),
    ("Packaging Operator", 3), ("Electrician", 2), ("Tool & Die Maker", 2),
    ("Safety Coordinator", 1), ("Logistics Coordinator", 1),
]
# Parallel lists split out of ROLES in the same order, in the population/weights
# form that random.choices expects.
ROLE_NAMES = [r for r, _ in ROLES]
ROLE_WEIGHTS = [w for _, w in ROLES]

# Skills per role family (realistic combos).
# Maps a pool name to the skill strings gen_skills may sample from it. The pool
# names are the values referenced by ROLE_SKILL_MAP. Some skills (e.g. "SPC")
# appear in more than one pool; gen_skills de-duplicates them per worker.
SKILL_POOLS = {
    "warehouse": ["forklift", "pallet jack", "RF scanner", "inventory", "shipping",
                   "receiving", "pick-to-light", "packaging", "cold storage", "loading dock"],
    "production": ["assembly", "line work", "quality inspection", "lean manufacturing",
                    "6S", "SPC", "conveyor ops", "batch processing", "labeling"],
    "machine": ["CNC", "lathe", "mill", "grinder", "press brake", "EDM",
                 "blueprint reading", "GD&T", "micrometer", "calipers"],
    "maintenance": ["preventive maintenance", "troubleshooting", "PLC", "electrical",
                     "hydraulics", "pneumatics", "welding", "lockout/tagout", "CMMS"],
    "quality": ["inspection", "CMM", "SPC", "gauge R&R", "ISO 9001", "root cause analysis",
                 "first article", "nonconformance", "calibration"],
    "general": ["Excel", "SAP", "first aid", "hazmat", "confined space",
                 "overhead crane", "team lead", "training", "bilingual"],
}

# Maps each role name from ROLES to the SKILL_POOLS keys its skills are drawn
# from. gen_skills samples from every listed pool, in list order. A role missing
# from this map falls back to ["general"] in gen_skills.
ROLE_SKILL_MAP = {
    "Forklift Operator": ["warehouse", "general"],
    "Material Handler": ["warehouse", "general"],
    "Machine Operator": ["machine", "production"],
    "Assembler": ["production", "quality"],
    "Production Worker": ["production", "general"],
    "Warehouse Associate": ["warehouse", "general"],
    "Quality Tech": ["quality", "production"],
    "Shipping Clerk": ["warehouse", "general"],
    "Loader": ["warehouse", "general"],
    "Inventory Clerk": ["warehouse", "general"],
    "Line Lead": ["production", "general"],
    "Maintenance Tech": ["maintenance", "general"],
    "Welder": ["maintenance", "machine"],
    "CNC Operator": ["machine", "quality"],
    "Sanitation Worker": ["general"],
    "Packaging Operator": ["production", "warehouse"],
    "Electrician": ["maintenance"],
    "Tool & Die Maker": ["machine", "maintenance"],
    "Safety Coordinator": ["quality", "general"],
    "Logistics Coordinator": ["warehouse", "general"],
}

# Certification table: (certification name, probability a worker holds it).
# gen_certs tests every entry independently with `random.random() < probability`,
# so the probabilities need not sum to 1 and a worker can hold several or none.
CERTS = [
    ("OSHA-10", 0.35), ("OSHA-30", 0.15), ("Forklift", 0.30), ("Hazmat", 0.12),
    ("First Aid/CPR", 0.25), ("Reach Truck", 0.10), ("Order Picker", 0.08),
    ("ServSafe", 0.05), ("MSDS", 0.07), ("Confined Space", 0.06),
    ("Lockout/Tagout", 0.08), ("Fire Safety", 0.04), ("ISO 9001", 0.03),
]

# Behavioral archetypes: (archetype name, relative weight for random.choices).
# The weights happen to sum to 100, so each one reads as a percentage.
# The archetype selects the score distribution in gen_scores and, for
# "silent" and "communicator", the message count in gen_comms.
ARCHETYPES = [
    ("reliable", 25), ("communicator", 25), ("flexible", 20),
    ("leader", 15), ("specialist", 10), ("improving", 3),
    ("erratic", 1), ("silent", 1),
]
# Parallel lists split out of ARCHETYPES in the same order, in the
# population/weights form that random.choices expects.
ARCHETYPE_NAMES = [a for a, _ in ARCHETYPES]
ARCHETYPE_WEIGHTS = [w for _, w in ARCHETYPES]

# SMS message templates rendered by gen_comms with str.format.
# Placeholders in use: {min}, {time}, {shift}, {highway}, {task}, {client}, {day}.
# gen_comms supplies exactly these keyword arguments, so a template that adds a
# new placeholder name needs a matching argument there or formatting raises
# KeyError. gen_comms joins messages with " | ", so templates should not contain
# that separator.
SMS_TEMPLATES = [
    "On my way, running about {min} minutes late.",
    "Got it, I'll be there at {time}.",
    "Can I switch to the {shift} shift this week?",
    "Thanks for the update!",
    "Is overtime available this Saturday?",
    "I need to call in tomorrow, family emergency.",
    "Stuck in traffic on I-{highway}, might be {min} late.",
    "Hey, just confirming my start time is {time}?",
    "I finished the {task} ahead of schedule.",
    "Can you send me the address for the new site?",
    "Do I need steel toes for this assignment?",
    "What's the dress code at {client}?",
    "My certification expires next month, where do I renew?",
    "I'm available for any extra shifts this week.",
    "Is there parking on site or do I need to take the bus?",
    "The supervisor said I did great today!",
    "I have a doctor's appointment {day}, can I come in late?",
    "Weather looks bad tomorrow, is the site still open?",
    "I completed the safety orientation.",
    "Thanks for getting me this placement, really appreciate it.",
]

# Client company names; gen_comms picks one uniformly to fill the {client}
# placeholder in SMS_TEMPLATES.
CLIENTS = [
    "Midwest Logistics", "Precision Mfg", "AutoParts Direct", "CleanSpace",
    "Summit Packaging", "Great Lakes Steel", "Heartland Foods", "Prairie Wind Energy",
    "River City Plastics", "Cardinal Health", "TechFlow Assembly", "Union Pacific",
]

# ─── Generator ───

def gen_skills(role, skill_map=None, skill_pools=None):
    """Return a sorted, de-duplicated list of skills for a role.

    For every skill pool mapped to the role, between 2 and 5 skills are
    sampled without replacement (fewer when the pool is smaller), and the
    samples from all pools are merged.

    Args:
        role: Role name to look up. Unknown roles use the "general" pool.
        skill_map: Optional mapping of role name -> list of pool names.
            Defaults to the module-level ROLE_SKILL_MAP.
        skill_pools: Optional mapping of pool name -> list of skill strings.
            Defaults to the module-level SKILL_POOLS.

    Returns:
        Alphabetically sorted list of unique skill strings. It is empty when
        every referenced pool is missing or empty.
    """
    if skill_map is None:
        skill_map = ROLE_SKILL_MAP
    if skill_pools is None:
        skill_pools = SKILL_POOLS

    skills = set()
    for pool in skill_map.get(role, ["general"]):
        available = skill_pools.get(pool, [])
        if not available:
            # A missing or empty pool contributes nothing. Without this guard
            # random.randint(2, 0) raises ValueError (empty range).
            continue
        upper = min(5, len(available))
        # Clamp the lower bound too, so a one-skill pool is still a valid range.
        # For pools with two or more skills this is the original randint(2, upper).
        count = random.randint(min(2, upper), upper)
        skills.update(random.sample(available, count))
    return sorted(skills)

def gen_certs(cert_table=None):
    """Pick the certifications a worker holds.

    Every ``(name, probability)`` entry is tested independently with one
    ``random.random()`` draw, so a worker may hold any subset, including none.

    Args:
        cert_table: Optional iterable of ``(name, probability)`` pairs.
            Defaults to the module-level CERTS table.

    Returns:
        List of certification names in table order. Only names are returned;
        no expiry date is attached.
    """
    if cert_table is None:
        cert_table = CERTS

    certs = []
    for name, prob in cert_table:
        if random.random() < prob:
            # This function used to build an expiry datetime from this draw and
            # then discard it. The dead datetime arithmetic is gone, but the
            # draw itself is kept so the seeded random stream (and therefore
            # every row generated after this call) stays identical.
            random.randint(30, 730)
            certs.append(name)
    return certs

def gen_scores(archetype):
    """Draw the five behavioral scores for a worker of the given archetype.

    Each archetype has a (mean, standard deviation) pair. Four scores are
    Gaussian draws around that mean with small per-score offsets; availability
    is drawn from a fixed distribution that ignores the archetype. Every score
    is clamped to the range 0..1.

    Args:
        archetype: Archetype name. Unknown names use (0.70, 0.10).

    Returns:
        Dict with keys "reliability", "responsiveness", "engagement",
        "compliance" and "availability", in that order.
    """
    profiles = {
        "reliable": (0.85, 0.05), "communicator": (0.70, 0.10),
        "flexible": (0.75, 0.08), "leader": (0.80, 0.07),
        "specialist": (0.78, 0.06), "improving": (0.60, 0.15),
        "erratic": (0.40, 0.20), "silent": (0.65, 0.12),
    }
    center, spread = profiles.get(archetype, (0.70, 0.10))

    def clamp(value):
        # Same expression as before, so out-of-range draws still collapse to
        # the integer bounds 0 or 1.
        return max(0, min(1, value))

    # The draws happen in this exact order; the seeded stream depends on it.
    reliability = clamp(random.gauss(center, spread))
    responsiveness = clamp(random.gauss(center - 0.05, spread + 0.05))
    engagement = clamp(random.gauss(center - 0.03, spread))
    compliance = clamp(random.gauss(center + 0.02, spread - 0.02))
    availability = clamp(random.gauss(0.75, 0.15))

    return {
        "reliability": reliability,
        "responsiveness": responsiveness,
        "engagement": engagement,
        "compliance": compliance,
        "availability": availability,
    }

def gen_comms(archetype, n=5):
    """Build a worker's SMS history as one " | "-separated string.

    Args:
        archetype: Archetype name. "communicator" replaces n with a random
            count from 5 to 10, and "silent" with 0 or 1. Any other value
            keeps the caller's n.
        n: Number of messages to generate for other archetypes.

    Returns:
        The rendered messages joined with " | ", or "" when no message is
        generated.
    """
    if archetype == "communicator":
        n = random.randint(5, 10)
    elif archetype == "silent":
        n = random.randint(0, 1)

    rendered = []
    for _ in range(n):
        template = random.choice(SMS_TEMPLATES)
        # Every field is drawn for every message, in this fixed order, whether
        # or not the chosen template uses it. This keeps the random stream
        # independent of which template was picked.
        minutes = random.randint(5, 25)
        hour = random.randint(5,8)
        quarter = random.choice(['00','15','30','45'])
        shift_name = random.choice(["morning", "evening", "night"])
        highway_no = random.randint(55, 94)
        task_name = random.choice(["pallet count", "safety check", "machine setup", "inventory audit"])
        client_name = random.choice(CLIENTS)
        weekday = random.choice(["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"])
        fields = {
            "min": minutes,
            "time": f"{hour}:{quarter} AM",
            "shift": shift_name,
            "highway": highway_no,
            "task": task_name,
            "client": client_name,
            "day": weekday,
        }
        rendered.append(template.format(**fields))
    return " | ".join(rendered)

def gen_email(first, last, wid):
    """Build an e-mail address of the form first.last[suffix]@domain.

    Args:
        first: First name; lowercased for the local part.
        last: Last name; lowercased for the local part.
        wid: Worker id, one of the possible suffixes.

    Returns:
        The address string. The suffix is chosen uniformly from: nothing, a
        random number from 1 to 99, or the worker id.
    """
    domains = ("gmail.com", "yahoo.com", "outlook.com", "protonmail.com", "mail.com")
    domain = random.choice(domains)
    # The numeric suffix is drawn even when it is not selected, matching the
    # fixed draw order domain -> number -> suffix choice.
    numeric_suffix = str(random.randint(1,99))
    suffix = random.choice(("", numeric_suffix, str(wid)))
    local_part = ".".join((first.lower(), last.lower())) + suffix
    return local_part + "@" + domain

def gen_phone():
    """Return a random phone number string: "+1", a 3-digit area code, 7 digits.

    The area code is picked uniformly from a fixed list and the remaining
    number is drawn from 2000000 to 9999999, so it never starts with 0 or 1.
    """
    area_codes = (
        312, 773, 217, 309, 618, 219, 317, 812, 614, 513,
        216, 314, 615, 502, 414, 608, 313, 515, 612,
    )
    area_code = random.choice(area_codes)
    subscriber = random.randint(2000000,9999999)
    return "+1" + str(area_code) + str(subscriber)

# ─── Main ───

# Stream the dataset to stdout as CSV. Progress messages go to stderr, so
# redirecting stdout to a file captures only the data.
out = csv.writer(sys.stdout)
header = [
    "worker_id", "name", "role", "email", "phone", "city", "state", "zip",
    "skills", "certifications", "archetype",
    "reliability", "responsiveness", "engagement", "compliance", "availability",
    "communications", "resume_text",
]
out.writerow(header)

# Score columns in the order they appear in the header.
score_columns = ("reliability", "responsiveness", "engagement", "compliance", "availability")

for worker_id in range(1, N + 1):
    # The order of the random draws in this loop is fixed; reordering them
    # changes the dataset produced for a given seed.
    first_name = random.choice(FIRST_NAMES)
    last_name = random.choice(LAST_NAMES)
    # Names repeat at scale, which is realistic; worker_id is the unique key.
    # Past 15,000 workers a middle initial, cycled A-Z from the id, adds variety.
    if worker_id <= 15000:
        full_name = f"{first_name} {last_name}"
    else:
        initial = chr(ord("A") + worker_id % 26)
        full_name = f"{first_name} {initial}. {last_name}"

    job = random.choices(ROLE_NAMES, weights=ROLE_WEIGHTS, k=1)[0]
    town, st, zip3 = random.choice(CITIES)
    zip_code = zip3 + str(random.randint(10,99))
    skill_list = gen_skills(job)
    cert_list = gen_certs()
    persona = random.choices(ARCHETYPE_NAMES, weights=ARCHETYPE_WEIGHTS, k=1)[0]
    metrics = gen_scores(persona)
    sms_log = gen_comms(persona)

    # Free-text summary used as the resume_text column. It is pipe-separated
    # inside, unlike the comma-separated skills and certifications columns.
    skills_pipe = "|".join(skill_list)
    certs_pipe = "|".join(cert_list) if cert_list else "none"
    resume_text = (
        f"{full_name} — {job} in {town}, {st}. "
        f"Skills: {skills_pipe}. "
        f"Certs: {certs_pipe}. "
        f"Archetype: {persona}. "
        f"Reliability: {metrics['reliability']:.2f}, "
        f"Availability: {metrics['availability']:.2f}"
    )

    # E-mail and phone are drawn last, e-mail first.
    email = gen_email(first_name, last_name, worker_id)
    phone = gen_phone()

    row = [
        worker_id, full_name, job, email, phone,
        town, st, zip_code,
        ", ".join(skill_list), ", ".join(cert_list), persona,
    ]
    row.extend(f"{metrics[col]:.4f}" for col in score_columns)
    row += [sms_log, resume_text]
    out.writerow(row)

    if not worker_id % 25000:
        print(f"  generated {worker_id:,}/{N:,}...", file=sys.stderr)

print(f"Done: {N:,} workers", file=sys.stderr)