#!/usr/bin/env python3
"""
Scale test: 2.5M rows across staffing tables + 100K vector embeddings.
Designed for 128GB RAM machine.
"""
import io
import random, json, urllib.request, time
from datetime import datetime, timedelta
import pyarrow as pa, pyarrow.parquet as pq

API = "http://localhost:3100"
random.seed(2026)  # fixed seed: every run generates identical data


def upload(name, table):
    """Serialize *table* to Parquet, PUT it into object storage, and register
    it in the catalog service.

    Args:
        name:  dataset name; also used to build the object key.
        table: a ``pyarrow.Table``.

    Raises:
        urllib.error.HTTPError / URLError on any non-2xx or failed request
        (``urlopen`` raises by default, so reaching the final print means
        both calls succeeded).
    """
    # Serialize in memory instead of round-tripping through a /tmp file:
    # avoids a disk write/read per dataset and leftover temp files.
    buf = io.BytesIO()
    pq.write_table(table, buf, compression="snappy")
    data = buf.getvalue()
    key = f"datasets/{name}.parquet"
    req = urllib.request.Request(f"{API}/storage/objects/{key}", data=data, method="PUT")
    # Use `with` so the HTTP response objects are closed (the originals were
    # leaked; each upload left two open responses behind).
    with urllib.request.urlopen(req):
        pass
    body = json.dumps({"name": name, "schema_fingerprint": "auto", "objects": [{"bucket": "data", "key": key, "size_bytes": len(data)}]}).encode()
    req = urllib.request.Request(f"{API}/catalog/datasets", data=body, method="POST", headers={"Content-Type": "application/json"})
    with urllib.request.urlopen(req):
        pass
    print(f" {name}: {table.num_rows:,} rows ({len(data)/1024/1024:.1f} MB)")


# --- Shared reference data used by every generator below --------------------
first_names = ["James","Mary","Robert","Patricia","John","Jennifer","Michael","Linda","David","Elizabeth",
               "William","Barbara","Richard","Susan","Joseph","Jessica","Thomas","Sarah","Christopher","Karen",
               "Charles","Lisa","Daniel","Nancy","Matthew","Betty","Anthony","Margaret","Mark","Sandra",
               "Donald","Ashley","Steven","Dorothy","Paul","Kimberly","Andrew","Emily","Joshua","Donna",
               "Kenneth","Michelle","Kevin","Carol","Brian","Amanda","George","Melissa","Timothy","Deborah"]
last_names = ["Smith","Johnson","Williams","Brown","Jones","Garcia","Miller","Davis","Rodriguez","Martinez",
              "Hernandez","Lopez","Gonzalez","Wilson","Anderson","Thomas","Taylor","Moore","Jackson","Martin",
              "Lee","Perez","Thompson","White","Harris","Sanchez","Clark","Ramirez","Lewis","Robinson",
              "Walker","Young","Allen","King","Wright","Scott","Torres","Nguyen","Hill","Flores"]
# (city, state, zip) triples; several zips per metro so geography clusters.
cities_zips = [
    ("Chicago","IL","60601"),("Chicago","IL","60610"),("Chicago","IL","60614"),("Chicago","IL","60622"),
    ("New York","NY","10001"),("New York","NY","10016"),("New York","NY","10022"),("New York","NY","10036"),
    ("Los Angeles","CA","90001"),("Los Angeles","CA","90024"),("Houston","TX","77001"),("Houston","TX","77019"),
    ("Dallas","TX","75201"),("Dallas","TX","75219"),("Atlanta","GA","30301"),("Atlanta","GA","30309"),
    ("Denver","CO","80201"),("Denver","CO","80206"),("Phoenix","AZ","85001"),("Phoenix","AZ","85006"),
    ("Seattle","WA","98101"),("Seattle","WA","98104"),("Miami","FL","33101"),("Miami","FL","33132"),
]
verticals = ["IT","Healthcare","Industrial","Accounting","Admin"]
# Skill pool per vertical; candidates/jobs sample a subset of these.
skills_by_vert = {
    "IT": ["Java","Python","C#",".NET","JavaScript","React","Angular","Node.js","SQL","AWS","Azure","Docker","Kubernetes","Linux","Git","REST APIs","MongoDB","PostgreSQL","Redis","Terraform","Jenkins","Agile","Spring Boot","Django","Go","Rust","TypeScript","GraphQL","Microservices","CI/CD"],
    "Healthcare": ["RN","LPN","CNA","BLS","ACLS","EMR","Epic","Cerner","ICD-10","CPT","HIPAA","Phlebotomy","ICU","OR","ER","Med-Surg","Pediatrics","Oncology","Telemetry","IV Therapy"],
    "Industrial": ["Forklift","OSHA 10","OSHA 30","Welding","CNC","PLC","Blueprint Reading","Quality Control","Six Sigma","AutoCAD","SolidWorks","Mechanical Assembly","Electrical","Hydraulics","Warehouse","Lean Manufacturing"],
    "Accounting": ["QuickBooks","SAP","Oracle","Accounts Payable","Accounts Receivable","General Ledger","Financial Reporting","CPA","Payroll","Budgeting","Excel Advanced","Power BI","Tableau","GAAP","SOX","Audit"],
    "Admin": ["Microsoft Office","Data Entry","Customer Service","Scheduling","Receptionist","Executive Assistant","Calendar Management","Salesforce","CRM","Multi-line Phone","Typing 60+ WPM","Bilingual Spanish","Notary"],
}
email_domains = ["gmail.com","yahoo.com","hotmail.com","outlook.com","icloud.com","protonmail.com"]
# Anchor date: all timestamps/week-endings are generated backwards from here.
base_date = datetime(2026, 1, 1)


def make_phone():
    """Return a random US-style phone number string, e.g. '(312) 555-0199'."""
    return f"({random.randint(200,999)}) {random.randint(200,999)}-{random.randint(1000,9999)}"


print("=" * 60)
print("SCALE TEST: 2.5M rows + 100K vectors")
print("=" * 60)
t_start = time.time()

# ============================================================
# 100K CANDIDATES
# ============================================================
print("\nGenerating candidates (100K)...")
t0 = time.time()
N = 100_000
# Column-oriented accumulators (one list per Arrow column).
c_ids, c_first, c_last, c_emails, c_phones = [], [], [], [], []
c_city, c_state, c_zip = [], [], []
c_vertical, c_skills, c_resume = [], [], []
c_status, c_source, c_pay, c_years = [], [], [], []
for i in range(N):
    fn = random.choice(first_names)
    ln = random.choice(last_names)
    city, state, zc = random.choice(cities_zips)
    vert = random.choice(verticals)
    # 3-10 skills, capped by the vertical's pool size.
    sk = random.sample(skills_by_vert[vert], min(random.randint(3, 10), len(skills_by_vert[vert])))
    yrs = random.randint(0, 30)
    resume = f"{fn} {ln} — {vert} professional with {yrs} years experience in {city}, {state} {zc}. Skills: {', '.join(sk)}. "
    resume += random.choice([
        f"Previously at {random.choice(['Acme','TechFlow','GlobalStaff','MedPro','BuildRight','CoreSys','Apex','Summit'])} Corp.",
        f"Seeking {random.choice(['contract','full-time','temp-to-hire'])} in {city} metro.",
        f"Available {random.choice(['immediately','in 2 weeks','after current assignment'])}.",
        f"Open to {random.choice(['remote','hybrid','on-site'])} work arrangements.",
        f"Certified in {random.choice(sk)} with hands-on project experience.",
    ])
    c_ids.append(f"CAND-{i+1:06d}")
    c_first.append(fn)
    c_last.append(ln)
    c_emails.append(f"{fn.lower()}.{ln.lower()}{random.randint(1,99)}@{random.choice(email_domains)}")
    c_phones.append(make_phone())
    c_city.append(city)
    c_state.append(state)
    c_zip.append(zc)
    c_vertical.append(vert)
    c_skills.append("|".join(sk))
    c_resume.append(resume)
    # Repeated entries weight the distribution (~60% active).
    c_status.append(random.choice(["active","active","active","inactive","placed"]))
    c_source.append(random.choice(["Indeed","LinkedIn","Referral","Walk-in","Monster","Website"]))
    c_pay.append(round(random.uniform(12, 95), 2))
    c_years.append(yrs)
candidates = pa.table({
    "candidate_id": c_ids, "first_name": c_first, "last_name": c_last,
    "email": c_emails, "phone": c_phones, "city": c_city, "state": c_state, "zip": c_zip,
    "vertical": c_vertical, "skills": c_skills, "resume_summary": c_resume,
    "status": c_status, "source": c_source, "min_pay_rate": c_pay, "years_experience": c_years,
})
upload("candidates", candidates)
print(f" Generated in {time.time()-t0:.1f}s")

# ============================================================
# 2K CLIENTS
# ============================================================
print("\nGenerating clients (2K)...")
prefixes = ["Apex","Summit","Core","National","Metro","Pacific","Global","United","Pinnacle","Horizon","Pioneer","Titan","Quantum","Vertex","Elite"]
suffixes = ["Industries","Solutions","Systems","Group","Corp","Technologies","Services","Partners","Manufacturing","Healthcare"]
cl_ids, cl_names, cl_verts, cl_city, cl_state, cl_zip = [], [], [], [], [], []
for i in range(2000):
    city, state, zc = random.choice(cities_zips)
    cl_ids.append(f"CLI-{i+1:05d}")
    cl_names.append(f"{random.choice(prefixes)} {random.choice(suffixes)}")
    cl_verts.append(random.choice(verticals))
    cl_city.append(city)
    cl_state.append(state)
    cl_zip.append(zc)
clients = pa.table({"client_id": cl_ids, "company_name": cl_names, "vertical": cl_verts, "city": cl_city, "state": cl_state, "zip": cl_zip})
upload("clients", clients)

# ============================================================
# 15K JOB ORDERS
# ============================================================
print("\nGenerating job_orders (15K)...")
titles_map = {
    "IT": ["Software Developer","Java Developer",".NET Developer","DevOps Engineer","Data Analyst","QA Engineer","Cloud Architect","React Developer","DBA","Security Analyst","Python Developer","Full Stack Developer"],
    "Healthcare": ["Registered Nurse","LPN","CNA","Medical Assistant","Phlebotomist","Radiology Tech","Pharmacy Tech","Medical Coder"],
    "Industrial": ["Forklift Operator","Welder","CNC Machinist","Quality Inspector","Maintenance Tech","Electrician","Warehouse Associate","Assembly Tech"],
    "Accounting": ["Staff Accountant","AP Specialist","AR Specialist","Payroll Clerk","Financial Analyst","Bookkeeper","Controller","Tax Preparer"],
    "Admin": ["Administrative Assistant","Executive Assistant","Receptionist","Data Entry Clerk","Office Manager","Customer Service Rep","HR Coordinator"],
}
jo_ids, jo_clients, jo_titles, jo_verts, jo_bills, jo_pays, jo_status = [], [], [], [], [], [], []
jo_city, jo_state, jo_zip, jo_desc = [], [], [], []
for i in range(15000):
    vert = random.choice(verticals)
    title = random.choice(titles_map[vert])
    # Index into the client table; derive the bound from the data rather than
    # a magic literal so a changed client count can't silently desync.
    # (randint(0, k-1) == randrange(k): identical RNG stream and values.)
    ci = random.randrange(len(cl_ids))
    city, state, zc = random.choice(cities_zips)
    bill = round(random.uniform(25, 150), 2)
    # Pay is 55-75% of bill (the rest is margin).
    pay = round(bill * random.uniform(0.55, 0.75), 2)
    req_sk = random.sample(skills_by_vert[vert], min(random.randint(3, 6), len(skills_by_vert[vert])))
    desc = f"{title} for {cl_names[ci]} in {city}, {state}. Requires: {', '.join(req_sk)}. {random.randint(1,10)}+ years exp. ${bill}/hr."
    jo_ids.append(f"JO-{i+1:06d}")
    jo_clients.append(cl_ids[ci])
    jo_titles.append(title)
    jo_verts.append(vert)
    jo_bills.append(bill)
    jo_pays.append(pay)
    jo_status.append(random.choice(["open","open","filled","filled","closed"]))
    jo_city.append(city)
    jo_state.append(state)
    jo_zip.append(zc)
    jo_desc.append(desc)
job_orders = pa.table({"job_order_id": jo_ids, "client_id": jo_clients, "title": jo_titles, "vertical": jo_verts, "bill_rate": jo_bills, "pay_rate": jo_pays, "status": jo_status, "city": jo_city, "state": jo_state, "zip": jo_zip, "description": jo_desc})
upload("job_orders", job_orders)

# ============================================================
# 50K PLACEMENTS
# ============================================================
print("\nGenerating placements (50K)...")
recruiters = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(100)]
p_ids, p_cands, p_jobs, p_clients, p_bills, p_pays, p_recs, p_status = [], [], [], [], [], [], [], []
for i in range(50000):
    # Bounds derived from the source tables (was randint with hard-coded
    # literals; randrange(k) draws the same values from the same stream).
    ci = random.randrange(N)
    ji = random.randrange(len(jo_ids))
    p_ids.append(f"PL-{i+1:06d}")
    p_cands.append(c_ids[ci])
    p_jobs.append(jo_ids[ji])
    p_clients.append(jo_clients[ji])
    # Rates are denormalized from the job order onto the placement.
    p_bills.append(jo_bills[ji])
    p_pays.append(jo_pays[ji])
    p_recs.append(random.choice(recruiters))
    p_status.append(random.choice(["active","active","completed","completed","terminated"]))
placements = pa.table({"placement_id": p_ids, "candidate_id": p_cands, "job_order_id": p_jobs, "client_id": p_clients, "bill_rate": p_bills, "pay_rate": p_pays, "recruiter": p_recs, "status": p_status})
upload("placements", placements)

# ============================================================
# 1M TIMESHEETS
# ============================================================
print("\nGenerating timesheets (1M)...")
ts_ids, ts_placements, ts_cands, ts_clients = [], [], [], []
ts_hrs_reg, ts_hrs_ot, ts_bill_total, ts_pay_total, ts_weeks, ts_approved = [], [], [], [], [], []
for i in range(1_000_000):
    # Bound derived from the placements actually generated (was a literal).
    pi = random.randrange(len(p_ids))
    hrs = random.choice([40.0, 40.0, 40.0, 32.0, 24.0, 20.0])
    ot = random.choice([0.0, 0.0, 0.0, 4.0, 8.0, 12.0])
    b = p_bills[pi]
    p = p_pays[pi]
    ts_ids.append(f"TS-{i+1:07d}")
    ts_placements.append(p_ids[pi])
    ts_cands.append(p_cands[pi])
    ts_clients.append(p_clients[pi])
    ts_hrs_reg.append(hrs)
    ts_hrs_ot.append(ot)
    # Overtime bills and pays at 1.5x.
    ts_bill_total.append(round(hrs * b + ot * b * 1.5, 2))
    ts_pay_total.append(round(hrs * p + ot * p * 1.5, 2))
    # Week-ending spread over the 3 years (156 weeks) before base_date.
    ts_weeks.append((base_date - timedelta(weeks=random.randint(0, 156))).strftime("%Y-%m-%d"))
    ts_approved.append(random.random() < 0.85)
timesheets = pa.table({"timesheet_id": ts_ids, "placement_id": ts_placements, "candidate_id": ts_cands, "client_id": ts_clients, "hours_regular": ts_hrs_reg, "hours_overtime": ts_hrs_ot, "bill_total": ts_bill_total, "pay_total": ts_pay_total, "week_ending": ts_weeks, "approved": ts_approved})
upload("timesheets", timesheets)

# ============================================================
# 800K CALL LOG
# ============================================================
print("\nGenerating call_log (800K)...")
call_ids, call_from, call_to, call_dur, call_ts, call_rec, call_cand, call_disp = [], [], [], [], [], [], [], []
disps = ["connected","voicemail","no_answer","busy","wrong_number","callback_scheduled"]
for i in range(800_000):
    # randrange(N) == randint(0, N-1): same RNG stream, clearer intent.
    ci = random.randrange(N)
    call_ids.append(f"CALL-{i+1:07d}")
    call_from.append(make_phone())
    call_to.append(c_phones[ci])
    call_dur.append(random.randint(0, 1800))  # 0-30 minutes
    # Timestamp spread uniformly over the year before base_date.
    call_ts.append((base_date - timedelta(seconds=random.randint(0, 86400*365))).isoformat())
    call_rec.append(random.choice(recruiters))
    call_cand.append(c_ids[ci])
    call_disp.append(random.choice(disps))
call_log = pa.table({"call_id": call_ids, "from_number": call_from, "to_number": call_to, "duration_seconds": call_dur, "timestamp": call_ts, "recruiter": call_rec, "candidate_id": call_cand, "disposition": call_disp})
upload("call_log", call_log)

# ============================================================
# 500K EMAIL LOG
# ============================================================
print("\nGenerating email_log (500K)...")
em_ids, em_from, em_to, em_subj, em_ts, em_rec, em_cand, em_opened = [], [], [], [], [], [], [], []
# "{}" is filled with a job title; subjects without a placeholder ignore the arg.
subjects = ["New opportunity: {}", "Following up", "Interview scheduled", "Timesheet reminder", "Background check complete", "Assignment details", "Rate update", "Welcome aboard"]
for i in range(500_000):
    ci = random.randrange(N)
    # Bound derived from the job-order table (was the magic literal 14999,
    # which would silently desync if the job-order count changed).
    ji = random.randrange(len(jo_titles))
    rec = random.choice(recruiters)
    em_ids.append(f"EM-{i+1:07d}")
    em_from.append(f"{rec.replace(' ','.').lower()}@acmestaffing.com")
    em_to.append(c_emails[ci])
    em_subj.append(random.choice(subjects).format(jo_titles[ji]))
    em_ts.append((base_date - timedelta(seconds=random.randint(0, 86400*365))).isoformat())
    em_rec.append(rec)
    em_cand.append(c_ids[ci])
    em_opened.append(random.random() < 0.55)  # ~55% open rate
email_log = pa.table({"email_id": em_ids, "from_addr": em_from, "to_addr": em_to, "subject": em_subj, "timestamp": em_ts, "recruiter": em_rec, "candidate_id": em_cand, "opened": em_opened})
upload("email_log", email_log)

# Derive the total from the tables actually built, so the summary cannot
# drift from the generator sizes above (was a hand-summed literal).
total = sum(t.num_rows for t in (candidates, clients, job_orders, placements, timesheets, call_log, email_log))
t_total = time.time() - t_start
print(f"\n{'='*60}")
print(f"LOADED: {total:,} rows in {t_total:.0f}s")
print(f"{'='*60}")