- Phase 6: Ingest pipeline (CSV/JSON → schema detect → Parquet → catalog) - Phase 7: Vector index + RAG (embed → HNSW → semantic search → LLM answer) - Phase 8: Hot cache + incremental updates (MemTable, delta files, merge-on-read) - ADR-008 through ADR-011: embeddings as Parquet, delta files not Delta Lake, schema defaults to string, not a CRM replacement - Staffing company reference dataset (286K rows, 7 tables) - Honest risk assessment: vector search at scale and incremental updates are hard Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
460 lines
23 KiB
Python
460 lines
23 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Realistic staffing company data generator.
|
|
Multiple source systems, overlapping data, real cross-reference problems.
|
|
|
|
Data sources (like a real staffing company):
|
|
- ATS (Applicant Tracking System) → candidates
|
|
- CRM → client companies + contacts
|
|
- Job board → job orders with descriptions
|
|
- Placements → who got placed where
|
|
- Timesheets → hours worked, bill/pay rates
|
|
- Phone system CDR → call detail records
|
|
- Email logs → communication tracking
|
|
"""
|
|
|
|
import hashlib
import io
import json
import random
import string
import time
import urllib.request
from datetime import datetime, timedelta

import pyarrow as pa
import pyarrow.parquet as pq
|
|
|
|
# Base URL of the local ingest API (object storage + dataset catalog endpoints).
API = "http://localhost:3100"
# Fixed seed so every run regenerates the exact same reference dataset.
random.seed(2026)
|
|
|
|
def upload(name, table):
    """Serialize *table* to Parquet and register it with the catalog.

    The table is written (snappy-compressed) to an in-memory buffer, PUT to
    the storage service under ``datasets/{name}.parquet``, and then a dataset
    record is POSTed to the catalog so the table is discoverable by name.

    Args:
        name: Dataset name; also used to build the object key.
        table: pyarrow.Table to serialize and upload.

    Raises:
        urllib.error.URLError / HTTPError: if either HTTP call fails.
    """
    # Serialize in memory instead of via a /tmp file — the original left
    # /tmp/{name}.parquet behind on every run (and risked collisions).
    buf = io.BytesIO()
    pq.write_table(table, buf, compression="snappy")
    data = buf.getvalue()
    key = f"datasets/{name}.parquet"
    # Upload the raw Parquet bytes to object storage.
    req = urllib.request.Request(f"{API}/storage/objects/{key}", data=data, method="PUT")
    with urllib.request.urlopen(req):
        pass  # close the response promptly; body is not needed
    # Register the uploaded object with the catalog.
    body = json.dumps({"name": name, "schema_fingerprint": "auto",
                       "objects": [{"bucket": "data", "key": key, "size_bytes": len(data)}]}).encode()
    req = urllib.request.Request(f"{API}/catalog/datasets", data=body, method="POST",
                                 headers={"Content-Type": "application/json"})
    with urllib.request.urlopen(req):
        pass
    print(f" {name}: {table.num_rows:,} rows ({len(data)/1024:.0f} KB)")
|
|
|
|
# ============================================================
|
|
# Shared reference data
|
|
# ============================================================
|
|
|
|
# Common US first/last names (100 + 80). Candidates, client contacts, and
# recruiters all draw from these same pools, so duplicate full names across
# tables are possible — a deliberate cross-reference hazard.
first_names = ["James","Mary","Robert","Patricia","John","Jennifer","Michael","Linda","David","Elizabeth",
               "William","Barbara","Richard","Susan","Joseph","Jessica","Thomas","Sarah","Christopher","Karen",
               "Charles","Lisa","Daniel","Nancy","Matthew","Betty","Anthony","Margaret","Mark","Sandra",
               "Donald","Ashley","Steven","Dorothy","Paul","Kimberly","Andrew","Emily","Joshua","Donna",
               "Kenneth","Michelle","Kevin","Carol","Brian","Amanda","George","Melissa","Timothy","Deborah",
               "Ronald","Stephanie","Edward","Rebecca","Jason","Sharon","Jeffrey","Laura","Ryan","Cynthia",
               "Jacob","Kathleen","Gary","Amy","Nicholas","Angela","Eric","Shirley","Jonathan","Anna",
               "Stephen","Brenda","Larry","Pamela","Justin","Emma","Scott","Nicole","Brandon","Helen",
               "Benjamin","Samantha","Samuel","Katherine","Raymond","Christine","Gregory","Debra","Frank","Rachel",
               "Alexander","Carolyn","Patrick","Janet","Jack","Catherine","Dennis","Maria","Jerry","Heather"]

last_names = ["Smith","Johnson","Williams","Brown","Jones","Garcia","Miller","Davis","Rodriguez","Martinez",
              "Hernandez","Lopez","Gonzalez","Wilson","Anderson","Thomas","Taylor","Moore","Jackson","Martin",
              "Lee","Perez","Thompson","White","Harris","Sanchez","Clark","Ramirez","Lewis","Robinson",
              "Walker","Young","Allen","King","Wright","Scott","Torres","Nguyen","Hill","Flores",
              "Green","Adams","Nelson","Baker","Hall","Rivera","Campbell","Mitchell","Carter","Roberts",
              "Gomez","Phillips","Evans","Turner","Diaz","Parker","Cruz","Edwards","Collins","Reyes",
              "Stewart","Morris","Morales","Murphy","Cook","Rogers","Gutierrez","Ortiz","Morgan","Cooper",
              "Peterson","Bailey","Reed","Kelly","Howard","Ramos","Kim","Cox","Ward","Richardson"]
|
|
|
|
# (city, state, zip) triples for 11 US metros, several ZIP codes each.
# Candidates, clients, and job orders all sample from this one list, so
# geographic joins ("candidates near this job") are possible by zip/city.
cities_zips = [
    ("Chicago","IL","60601"),("Chicago","IL","60602"),("Chicago","IL","60603"),("Chicago","IL","60610"),
    ("Chicago","IL","60614"),("Chicago","IL","60616"),("Chicago","IL","60622"),("Chicago","IL","60647"),
    ("New York","NY","10001"),("New York","NY","10002"),("New York","NY","10003"),("New York","NY","10010"),
    ("New York","NY","10016"),("New York","NY","10019"),("New York","NY","10022"),("New York","NY","10036"),
    ("Los Angeles","CA","90001"),("Los Angeles","CA","90012"),("Los Angeles","CA","90024"),("Los Angeles","CA","90036"),
    ("Houston","TX","77001"),("Houston","TX","77002"),("Houston","TX","77003"),("Houston","TX","77019"),
    ("Dallas","TX","75201"),("Dallas","TX","75202"),("Dallas","TX","75204"),("Dallas","TX","75219"),
    ("Atlanta","GA","30301"),("Atlanta","GA","30303"),("Atlanta","GA","30305"),("Atlanta","GA","30309"),
    ("Denver","CO","80201"),("Denver","CO","80202"),("Denver","CO","80204"),("Denver","CO","80206"),
    ("Phoenix","AZ","85001"),("Phoenix","AZ","85003"),("Phoenix","AZ","85004"),("Phoenix","AZ","85006"),
    ("Seattle","WA","98101"),("Seattle","WA","98102"),("Seattle","WA","98103"),("Seattle","WA","98104"),
    ("Miami","FL","33101"),("Miami","FL","33125"),("Miami","FL","33130"),("Miami","FL","33132"),
]
|
|
|
|
# Skill taxonomy per staffing vertical. Candidate skills and job-order
# requirements are both sampled from these same pools, so skills are
# joinable/matchable across the candidates and job_orders tables.
skills_pool = {
    "IT": ["Java","Python","C#",".NET","JavaScript","TypeScript","React","Angular","Node.js","SQL",
           "AWS","Azure","GCP","Docker","Kubernetes","Linux","Git","REST APIs","GraphQL","MongoDB",
           "PostgreSQL","MySQL","Redis","Terraform","Jenkins","CI/CD","Agile","Scrum","DevOps","Microservices",
           "Spring Boot","Django","Flask","Ruby on Rails","Go","Rust","Swift","Kotlin","PHP","Vue.js"],
    "Healthcare": ["RN","LPN","CNA","BLS","ACLS","PALS","EMR","Epic","Cerner","Meditech",
                   "ICD-10","CPT","Medical Billing","Medical Coding","HIPAA","Phlebotomy","IV Therapy",
                   "Telemetry","ICU","OR","ER","Med-Surg","Labor & Delivery","Pediatrics","Oncology"],
    "Industrial": ["Forklift","OSHA 10","OSHA 30","Welding","MIG","TIG","CNC","PLC","Blueprint Reading",
                   "Quality Control","Six Sigma","Lean Manufacturing","AutoCAD","SolidWorks","GD&T",
                   "Mechanical Assembly","Electrical","Hydraulics","Pneumatics","Warehouse"],
    "Accounting": ["QuickBooks","SAP","Oracle","Accounts Payable","Accounts Receivable","General Ledger",
                   "Financial Reporting","Tax Preparation","CPA","Payroll","Budgeting","Forecasting",
                   "Audit","Compliance","Excel Advanced","Power BI","Tableau","GAAP","SOX"],
    "Admin": ["Microsoft Office","Data Entry","Customer Service","Scheduling","Filing","Receptionist",
              "Executive Assistant","Travel Coordination","Calendar Management","SAP","Salesforce",
              "CRM","Multi-line Phone","Typing 60+ WPM","Notary","Bilingual Spanish"],
}

# Vertical names are derived from the pool keys so the two never drift apart.
verticals = list(skills_pool.keys())
# Consumer email domains, used for candidate (not client-contact) addresses.
email_domains = ["gmail.com","yahoo.com","hotmail.com","outlook.com","aol.com","icloud.com","protonmail.com"]
|
|
|
|
def make_phone():
    """Return a random US-style phone number formatted "(NNN) NNN-NNNN"."""
    area = random.randint(200, 999)
    exchange = random.randint(200, 999)
    line = random.randint(1000, 9999)
    return f"({area}) {exchange}-{line}"
|
|
|
|
def make_email(first, last):
    """Build a plausible personal email address from a first and last name.

    Joins the lowercased names with a random separator (".", "_", or
    nothing), optionally appends a 1-2 digit number, and picks a consumer
    domain from the module-level ``email_domains`` list.
    """
    # NOTE: the randint inside the list literal is always evaluated, even
    # when the empty-string alternative is chosen — kept as-is so the
    # random stream (and thus the seeded dataset) is unchanged.
    separator = random.choice([".", "_", ""])
    suffix = random.choice(["", str(random.randint(1, 99))])
    domain = random.choice(email_domains)
    return f"{first.lower()}{separator}{last.lower()}{suffix}@{domain}"
|
|
|
|
# All generated dates are computed backwards from this anchor ("today").
base_date = datetime(2026, 1, 1)

# ============================================================
# 1. CANDIDATES — 15,000 from ATS
# ============================================================
print("Generating candidates (15K)...")

N_CAND = 15000
# Column accumulators — one Python list per output Parquet column.
c_ids, c_first, c_last, c_emails, c_phones, c_phones_alt = [], [], [], [], [], []
c_city, c_state, c_zip = [], [], []
c_vertical, c_skills, c_resume_summary = [], [], []
c_status, c_source, c_pay_rate_min, c_created = [], [], [], []
c_availability, c_years_exp = [], []

for i in range(N_CAND):
    fn = random.choice(first_names)
    ln = random.choice(last_names)
    city, state, zipcode = random.choice(cities_zips)
    vert = random.choice(verticals)
    n_skills = random.randint(3, 12)
    # Sample without replacement; cap at pool size for the smaller verticals.
    sk = random.sample(skills_pool[vert], min(n_skills, len(skills_pool[vert])))
    yrs = random.randint(0, 25)

    # Free-text resume summary with name, location, and skills baked in —
    # the natural target field for text/semantic search over candidates.
    resume = f"{fn} {ln} — {vert} professional with {yrs} years experience. "
    resume += f"Based in {city}, {state} {zipcode}. "
    resume += f"Key skills: {', '.join(sk)}. "
    resume += random.choice([
        f"Previously worked at {random.choice(['Acme Corp','TechFlow','GlobalStaff','MedPro','BuildRight'])} as a {random.choice(['Senior','Lead','Staff','Junior'])} {vert} specialist.",
        f"Seeking {random.choice(['contract','full-time','temp-to-hire'])} opportunities in the {city} metro area.",
        f"Available {random.choice(['immediately','in 2 weeks','after current contract ends'])}. Open to {random.choice(['remote','hybrid','on-site'])} work.",
    ])

    c_ids.append(f"CAND-{i+1:05d}")
    c_first.append(fn)
    c_last.append(ln)
    c_emails.append(make_email(fn, ln))
    c_phones.append(make_phone())
    # ~30% of candidates get a secondary phone; empty string otherwise.
    c_phones_alt.append(make_phone() if random.random() < 0.3 else "")
    c_city.append(city)
    c_state.append(state)
    c_zip.append(zipcode)
    c_vertical.append(vert)
    # Pipe-delimited skills string (flat schema; no list-typed column).
    c_skills.append("|".join(sk))
    c_resume_summary.append(resume)
    # Repeated "active" entries weight the distribution toward active.
    c_status.append(random.choice(["active","active","active","active","inactive","do_not_contact","placed"]))
    c_source.append(random.choice(["Indeed","LinkedIn","Referral","Walk-in","Monster","CareerBuilder","Website","Job Fair"]))
    c_pay_rate_min.append(round(random.uniform(12, 85), 2))
    # Created sometime in the ~3 years (1095 days) before base_date.
    c_created.append((base_date - timedelta(days=random.randint(0, 1095))).strftime("%Y-%m-%d"))
    c_availability.append(random.choice(["immediate","1_week","2_weeks","1_month","not_available"]))
    c_years_exp.append(yrs)

candidates = pa.table({
    "candidate_id": c_ids, "first_name": c_first, "last_name": c_last,
    "email": c_emails, "phone": c_phones, "phone_alt": c_phones_alt,
    "city": c_city, "state": c_state, "zip": c_zip,
    "vertical": c_vertical, "skills": c_skills, "resume_summary": c_resume_summary,
    "status": c_status, "source": c_source, "min_pay_rate": c_pay_rate_min,
    "created_date": c_created, "availability": c_availability, "years_experience": c_years_exp,
})
upload("candidates", candidates)
|
|
|
|
# ============================================================
# 2. CLIENTS — 500 companies
# ============================================================
print("Generating clients (500)...")

# Name parts: 20 prefixes x 15 suffixes = 300 distinct combinations for 500
# clients, so duplicate company names occur by construction — intentional
# "dirty CRM" data for cross-reference exercises.
company_prefixes = ["Apex","Summit","Core","First","National","Metro","Pacific","Atlantic","Central","Premier",
                    "Global","United","Alliance","Pinnacle","Elite","Horizon","Pioneer","Titan","Quantum","Vertex"]
company_suffixes = ["Industries","Solutions","Systems","Group","Corp","Technologies","Services","Partners",
                    "Holdings","Enterprises","Manufacturing","Healthcare","Logistics","Financial","Engineering"]

# Column accumulators for the clients table.
cl_ids, cl_names, cl_verticals, cl_contacts, cl_contact_emails, cl_contact_phones = [], [], [], [], [], []
cl_city, cl_state, cl_zip, cl_bill_rate_avg, cl_status, cl_since = [], [], [], [], [], []

for i in range(500):
    name = f"{random.choice(company_prefixes)} {random.choice(company_suffixes)}"
    city, state, zipcode = random.choice(cities_zips)
    vert = random.choice(verticals)
    contact_fn = random.choice(first_names)
    contact_ln = random.choice(last_names)

    cl_ids.append(f"CLI-{i+1:04d}")
    cl_names.append(name)
    cl_verticals.append(vert)
    cl_contacts.append(f"{contact_fn} {contact_ln}")
    # Corporate-style address on a domain derived from the company name.
    cl_contact_emails.append(f"{contact_fn.lower()}.{contact_ln.lower()}@{name.lower().replace(' ','')}.com")
    cl_contact_phones.append(make_phone())
    cl_city.append(city)
    cl_state.append(state)
    cl_zip.append(zipcode)
    cl_bill_rate_avg.append(round(random.uniform(25, 150), 2))
    # Weighted 3:1:1 toward active accounts.
    cl_status.append(random.choice(["active","active","active","inactive","prospect"]))
    cl_since.append((base_date - timedelta(days=random.randint(30, 2000))).strftime("%Y-%m-%d"))

clients = pa.table({
    "client_id": cl_ids, "company_name": cl_names, "vertical": cl_verticals,
    "contact_name": cl_contacts, "contact_email": cl_contact_emails, "contact_phone": cl_contact_phones,
    "city": cl_city, "state": cl_state, "zip": cl_zip,
    "avg_bill_rate": cl_bill_rate_avg, "status": cl_status, "client_since": cl_since,
})
upload("clients", clients)
|
|
|
|
# ============================================================
# 3. JOB ORDERS — 3,000 open/filled/closed
# ============================================================
print("Generating job_orders (3K)...")

# Job titles per vertical; keys mirror skills_pool so a job's title and its
# required skills always come from the same vertical.
titles = {
    "IT": ["Software Developer","Java Developer",".NET Developer","DevOps Engineer","Data Analyst",
           "QA Engineer","Systems Admin","Help Desk","Network Engineer","Cloud Architect",
           "Full Stack Developer","Python Developer","React Developer","DBA","Security Analyst"],
    "Healthcare": ["Registered Nurse","LPN","CNA","Medical Assistant","Phlebotomist",
                   "Radiology Tech","Pharmacy Tech","Medical Coder","Billing Specialist","Case Manager"],
    "Industrial": ["Forklift Operator","Welder","CNC Machinist","Quality Inspector","Maintenance Tech",
                   "Electrician","Warehouse Associate","Assembly Technician","Production Supervisor","Shipping Clerk"],
    "Accounting": ["Staff Accountant","AP Specialist","AR Specialist","Payroll Clerk","Tax Preparer",
                   "Financial Analyst","Bookkeeper","Audit Associate","Controller","Cost Accountant"],
    "Admin": ["Administrative Assistant","Executive Assistant","Receptionist","Data Entry Clerk","Office Manager",
              "Customer Service Rep","HR Coordinator","Legal Secretary","Office Coordinator","Scheduler"],
}

# Column accumulators for the job_orders table.
jo_ids, jo_client_ids, jo_titles, jo_verticals, jo_descriptions = [], [], [], [], []
jo_city, jo_state, jo_zip = [], [], []
jo_bill_rate, jo_pay_rate, jo_status, jo_openings, jo_created = [], [], [], [], []
jo_work_type, jo_duration = [], []

for i in range(3000):
    vert = random.choice(verticals)
    title = random.choice(titles[vert])
    # Random client chosen by positional index into the 500 clients above.
    ci = random.randint(0, 499)
    city, state, zipcode = random.choice(cities_zips)
    bill = round(random.uniform(25, 150), 2)
    # Pay rate is 55–75% of bill; the remainder is the staffing spread.
    pay = round(bill * random.uniform(0.55, 0.75), 2)

    # Free-text description embedding client name, location, skills, rate.
    req_skills = random.sample(skills_pool[vert], min(random.randint(3, 6), len(skills_pool[vert])))
    desc = f"{title} needed for {cl_names[ci]} in {city}, {state}. "
    desc += f"Requirements: {', '.join(req_skills)}. "
    desc += f"{random.randint(1,10)}+ years experience preferred. "
    desc += f"Bill rate: ${bill}/hr. "
    desc += random.choice([
        "Background check required.",
        "Drug screen required.",
        "Must have reliable transportation.",
        "Steel-toe boots required on site.",
        "Remote work available.",
        "Hybrid schedule: 3 days on-site.",
    ])

    jo_ids.append(f"JO-{i+1:05d}")
    jo_client_ids.append(cl_ids[ci])
    jo_titles.append(title)
    jo_verticals.append(vert)
    jo_descriptions.append(desc)
    # NOTE: the job's city is sampled independently of the client's city,
    # so a job order's location may differ from its client's address.
    jo_city.append(city)
    jo_state.append(state)
    jo_zip.append(zipcode)
    jo_bill_rate.append(bill)
    jo_pay_rate.append(pay)
    # Weighted: 3 open, 2 filled, 1 closed, 1 on_hold.
    jo_status.append(random.choice(["open","open","open","filled","filled","closed","on_hold"]))
    jo_openings.append(random.randint(1, 5))
    jo_created.append((base_date - timedelta(days=random.randint(0, 365))).strftime("%Y-%m-%d"))
    # "contract" listed twice → 50% of orders are contract work.
    jo_work_type.append(random.choice(["contract","temp_to_hire","direct_hire","contract"]))
    jo_duration.append(random.choice(["3 months","6 months","12 months","ongoing","project-based"]))

job_orders = pa.table({
    "job_order_id": jo_ids, "client_id": jo_client_ids, "title": jo_titles,
    "vertical": jo_verticals, "description": jo_descriptions,
    "city": jo_city, "state": jo_state, "zip": jo_zip,
    "bill_rate": jo_bill_rate, "pay_rate": jo_pay_rate, "status": jo_status,
    "openings": jo_openings, "created_date": jo_created,
    "work_type": jo_work_type, "duration": jo_duration,
})
upload("job_orders", job_orders)
|
|
|
|
# ============================================================
# 4. PLACEMENTS — 8,000 candidate-job matches
# ============================================================
print("Generating placements (8K)...")

# Column accumulators for the placements table.
p_ids, p_cand_ids, p_job_ids, p_client_ids = [], [], [], []
p_start, p_end, p_status, p_bill, p_pay, p_recruiter = [], [], [], [], [], []

# Pool of 30 recruiter names, reused later by the call and email logs so
# recruiter activity can be correlated across tables.
recruiters = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(30)]

for i in range(8000):
    # Candidate and job are paired by random positional index — no skill or
    # vertical matching is attempted.
    ci = random.randint(0, N_CAND - 1)
    ji = random.randint(0, 2999)
    start = base_date - timedelta(days=random.randint(0, 730))
    end = start + timedelta(days=random.randint(30, 365))

    p_ids.append(f"PL-{i+1:05d}")
    p_cand_ids.append(c_ids[ci])
    p_job_ids.append(jo_ids[ji])
    # client_id denormalized from the job order for direct joins.
    p_client_ids.append(jo_client_ids[ji])
    p_start.append(start.strftime("%Y-%m-%d"))
    # ~30% of placements have an empty end_date (still running).
    p_end.append(end.strftime("%Y-%m-%d") if random.random() < 0.7 else "")
    p_status.append(random.choice(["active","active","completed","completed","terminated","no_show"]))
    # Bill/pay rates are inherited unchanged from the matched job order.
    p_bill.append(jo_bill_rate[ji])
    p_pay.append(jo_pay_rate[ji])
    p_recruiter.append(random.choice(recruiters))

placements = pa.table({
    "placement_id": p_ids, "candidate_id": p_cand_ids, "job_order_id": p_job_ids,
    "client_id": p_client_ids, "start_date": p_start, "end_date": p_end,
    "status": p_status, "bill_rate": p_bill, "pay_rate": p_pay, "recruiter": p_recruiter,
})
upload("placements", placements)
|
|
|
|
# ============================================================
# 5. TIMESHEETS — 120K weekly entries
# ============================================================
print("Generating timesheets (120K)...")

# Column accumulators for the timesheets table.
ts_ids, ts_placement_ids, ts_cand_ids, ts_client_ids = [], [], [], []
ts_week_ending, ts_hours_reg, ts_hours_ot, ts_bill_total, ts_pay_total = [], [], [], [], []
ts_approved, ts_approved_by = [], []

for i in range(120000):
    # Random placement; candidate/client ids are denormalized from it.
    pi = random.randint(0, 7999)
    # Weighted toward a full 40-hour week; OT is 0 half the time.
    hrs_reg = round(random.choice([40, 40, 40, 32, 24, 20, 8]), 1)
    hrs_ot = round(random.choice([0, 0, 0, 0, 4, 8, 12, 16]), 1)
    bill = p_bill[pi]
    pay = p_pay[pi]

    ts_ids.append(f"TS-{i+1:06d}")
    ts_placement_ids.append(p_ids[pi])
    ts_cand_ids.append(p_cand_ids[pi])
    ts_client_ids.append(p_client_ids[pi])
    # Week-ending dates are NOT constrained to the placement's start/end
    # window — another realistic data-quality wrinkle.
    ts_week_ending.append((base_date - timedelta(weeks=random.randint(0, 104))).strftime("%Y-%m-%d"))
    ts_hours_reg.append(hrs_reg)
    ts_hours_ot.append(hrs_ot)
    # Overtime is billed and paid at time-and-a-half.
    ts_bill_total.append(round(hrs_reg * bill + hrs_ot * bill * 1.5, 2))
    ts_pay_total.append(round(hrs_reg * pay + hrs_ot * pay * 1.5, 2))
    # ~80% approved; the approver is drawn from ALL client contacts, not
    # necessarily the contact of this timesheet's client.
    ts_approved.append(random.choice([True, True, True, True, False]))
    ts_approved_by.append(random.choice(cl_contacts) if ts_approved[-1] else "")

timesheets = pa.table({
    "timesheet_id": ts_ids, "placement_id": ts_placement_ids,
    "candidate_id": ts_cand_ids, "client_id": ts_client_ids,
    "week_ending": ts_week_ending, "hours_regular": ts_hours_reg, "hours_overtime": ts_hours_ot,
    "bill_total": ts_bill_total, "pay_total": ts_pay_total,
    "approved": ts_approved, "approved_by": ts_approved_by,
})
upload("timesheets", timesheets)
|
|
|
|
# ============================================================
# 6. CALL LOG — 80K phone records (CDR)
# ============================================================
print("Generating call_log (80K)...")

# Column accumulators for the call_log table.
call_ids, call_from, call_to, call_direction = [], [], [], []
call_duration, call_timestamp, call_recruiter, call_cand_id, call_disposition = [], [], [], [], []

dispositions = ["connected","voicemail","no_answer","busy","wrong_number","callback_scheduled","declined"]

for i in range(80000):
    ci = random.randint(0, N_CAND - 1)
    rec = random.choice(recruiters)
    # 3:1 outbound vs inbound.
    direction = random.choice(["outbound","outbound","outbound","inbound"])

    call_ids.append(f"CALL-{i+1:06d}")
    # The candidate side of the call uses the candidate's real stored phone
    # number (cross-referenceable); the recruiter side is a fresh random
    # number per call, so recruiter lines are NOT stable across records.
    if direction == "outbound":
        call_from.append(make_phone())  # recruiter's line
        call_to.append(c_phones[ci])
    else:
        call_from.append(c_phones[ci])
        call_to.append(make_phone())
    call_direction.append(direction)
    # 0..1800 seconds; zero-length calls coexist with any disposition.
    call_duration.append(random.randint(0, 1800))
    call_timestamp.append((base_date - timedelta(seconds=random.randint(0, 86400 * 365))).isoformat())
    call_recruiter.append(rec)
    call_cand_id.append(c_ids[ci])
    call_disposition.append(random.choice(dispositions))

call_log = pa.table({
    "call_id": call_ids, "from_number": call_from, "to_number": call_to,
    "direction": call_direction, "duration_seconds": call_duration,
    "timestamp": call_timestamp, "recruiter": call_recruiter,
    "candidate_id": call_cand_id, "disposition": call_disposition,
})
upload("call_log", call_log)
|
|
|
|
# ============================================================
# 7. EMAIL LOG — 60K email records
# ============================================================
print("Generating email_log (60K)...")

# Column accumulators for the email_log table.
em_ids, em_from, em_to, em_subject, em_timestamp = [], [], [], [], []
em_recruiter, em_cand_id, em_direction, em_opened = [], [], [], []

# Subject templates; placeholders are filled via str.format below.
subjects = [
    "New job opportunity — {title} in {city}",
    "Following up on your application",
    "Interview scheduled — {title}",
    "Timesheet reminder for week ending {date}",
    "Your background check is complete",
    "New assignment details — {client}",
    "Pay rate update for your current assignment",
    "Re: Availability for {title} position",
    "Welcome to {client} — your first day info",
    "Reference check request",
]

for i in range(60000):
    ci = random.randint(0, N_CAND - 1)
    ji = random.randint(0, 2999)
    rec = random.choice(recruiters)
    # 3:1 outbound vs inbound, matching the call log.
    direction = random.choice(["outbound","outbound","outbound","inbound"])
    # All keys are supplied regardless of template; str.format ignores
    # unused keyword arguments, so templates without placeholders are fine.
    subj = random.choice(subjects).format(
        title=jo_titles[ji], city=jo_city[ji], date="2026-01-05", client=cl_names[random.randint(0,499)]
    )

    em_ids.append(f"EM-{i+1:06d}")
    # Recruiter corporate address is derived from the recruiter's name;
    # candidate side reuses the candidate's stored personal email.
    if direction == "outbound":
        em_from.append(f"{rec.replace(' ','.').lower()}@acmestaffing.com")
        em_to.append(c_emails[ci])
    else:
        em_from.append(c_emails[ci])
        em_to.append(f"{rec.replace(' ','.').lower()}@acmestaffing.com")
    em_subject.append(subj)
    em_timestamp.append((base_date - timedelta(seconds=random.randint(0, 86400 * 365))).isoformat())
    em_recruiter.append(rec)
    em_cand_id.append(c_ids[ci])
    em_direction.append(direction)
    # Outbound emails are "opened" ~60% of the time; inbound are always True.
    em_opened.append(random.random() < 0.6 if direction == "outbound" else True)

email_log = pa.table({
    "email_id": em_ids, "from_addr": em_from, "to_addr": em_to,
    "subject": em_subject, "timestamp": em_timestamp,
    "recruiter": em_recruiter, "candidate_id": em_cand_id,
    "direction": em_direction, "opened": em_opened,
})
upload("email_log", email_log)
|
|
|
|
# ============================================================
# Final report: grand total across all seven uploaded tables, plus example
# natural-language questions that exercise cross-table joins.
total = sum([candidates.num_rows, clients.num_rows, job_orders.num_rows,
             placements.num_rows, timesheets.num_rows, call_log.num_rows, email_log.num_rows])
print(f"\n{'='*60}")
print(f"Staffing company data loaded: {total:,} total rows across 7 tables")
print(f"{'='*60}")
print(f"""
Cross-reference queries to try:
"Find all Java developers in Chicago who are available immediately"
"Which recruiter has the most placements this year?"
"Show me the total revenue by client for Q1 2026"
"Find candidates who were called more than 5 times but never placed"
"What's the average bill rate for .NET developers in New York?"
"Which clients have the highest overtime hours?"
"Show candidates in zip 60601 with Healthcare skills"
"Find the spread (bill - pay) by vertical"
"Which candidates have worked for multiple different clients?"
"Show email open rates by recruiter"
""")
|