Benchmarks on 128GB RAM server: - 100K candidate filter (skills+city+status): 257ms - 1M timesheet aggregation (revenue by client): 942ms - 800K call log cross-reference (cold leads): 642ms - Triple JOIN recruiter performance: 487ms - 500K email open rate aggregation: 259ms - COUNT all 2.47M rows: 84ms - 10K vector search (cosine similarity): ~450ms - Embedding throughput: 49 chunks/sec via Ollama - RAG correctly refuses to hallucinate when no match exists Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
265 lines
14 KiB
Python
265 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Scale test: 2.5M rows across staffing tables + 100K vector embeddings.
|
|
Designed for 128GB RAM machine.
|
|
"""
|
|
|
|
import random, json, urllib.request, time
|
|
from datetime import datetime, timedelta
|
|
import pyarrow as pa, pyarrow.parquet as pq
|
|
|
|
API = "http://localhost:3100"
|
|
random.seed(2026)
|
|
|
|
def upload(name, table):
|
|
path = f"/tmp/{name}.parquet"
|
|
pq.write_table(table, path, compression="snappy")
|
|
with open(path, "rb") as f:
|
|
data = f.read()
|
|
key = f"datasets/{name}.parquet"
|
|
req = urllib.request.Request(f"{API}/storage/objects/{key}", data=data, method="PUT")
|
|
urllib.request.urlopen(req)
|
|
body = json.dumps({"name": name, "schema_fingerprint": "auto",
|
|
"objects": [{"bucket": "data", "key": key, "size_bytes": len(data)}]}).encode()
|
|
req = urllib.request.Request(f"{API}/catalog/datasets", data=body, method="POST",
|
|
headers={"Content-Type": "application/json"})
|
|
urllib.request.urlopen(req)
|
|
print(f" {name}: {table.num_rows:,} rows ({len(data)/1024/1024:.1f} MB)")
|
|
|
|
# Shared data
|
|
first_names = ["James","Mary","Robert","Patricia","John","Jennifer","Michael","Linda","David","Elizabeth",
|
|
"William","Barbara","Richard","Susan","Joseph","Jessica","Thomas","Sarah","Christopher","Karen",
|
|
"Charles","Lisa","Daniel","Nancy","Matthew","Betty","Anthony","Margaret","Mark","Sandra",
|
|
"Donald","Ashley","Steven","Dorothy","Paul","Kimberly","Andrew","Emily","Joshua","Donna",
|
|
"Kenneth","Michelle","Kevin","Carol","Brian","Amanda","George","Melissa","Timothy","Deborah"]
|
|
last_names = ["Smith","Johnson","Williams","Brown","Jones","Garcia","Miller","Davis","Rodriguez","Martinez",
|
|
"Hernandez","Lopez","Gonzalez","Wilson","Anderson","Thomas","Taylor","Moore","Jackson","Martin",
|
|
"Lee","Perez","Thompson","White","Harris","Sanchez","Clark","Ramirez","Lewis","Robinson",
|
|
"Walker","Young","Allen","King","Wright","Scott","Torres","Nguyen","Hill","Flores"]
|
|
cities_zips = [
|
|
("Chicago","IL","60601"),("Chicago","IL","60610"),("Chicago","IL","60614"),("Chicago","IL","60622"),
|
|
("New York","NY","10001"),("New York","NY","10016"),("New York","NY","10022"),("New York","NY","10036"),
|
|
("Los Angeles","CA","90001"),("Los Angeles","CA","90024"),("Houston","TX","77001"),("Houston","TX","77019"),
|
|
("Dallas","TX","75201"),("Dallas","TX","75219"),("Atlanta","GA","30301"),("Atlanta","GA","30309"),
|
|
("Denver","CO","80201"),("Denver","CO","80206"),("Phoenix","AZ","85001"),("Phoenix","AZ","85006"),
|
|
("Seattle","WA","98101"),("Seattle","WA","98104"),("Miami","FL","33101"),("Miami","FL","33132"),
|
|
]
|
|
verticals = ["IT","Healthcare","Industrial","Accounting","Admin"]
|
|
skills_by_vert = {
|
|
"IT": ["Java","Python","C#",".NET","JavaScript","React","Angular","Node.js","SQL","AWS","Azure","Docker","Kubernetes","Linux","Git","REST APIs","MongoDB","PostgreSQL","Redis","Terraform","Jenkins","Agile","Spring Boot","Django","Go","Rust","TypeScript","GraphQL","Microservices","CI/CD"],
|
|
"Healthcare": ["RN","LPN","CNA","BLS","ACLS","EMR","Epic","Cerner","ICD-10","CPT","HIPAA","Phlebotomy","ICU","OR","ER","Med-Surg","Pediatrics","Oncology","Telemetry","IV Therapy"],
|
|
"Industrial": ["Forklift","OSHA 10","OSHA 30","Welding","CNC","PLC","Blueprint Reading","Quality Control","Six Sigma","AutoCAD","SolidWorks","Mechanical Assembly","Electrical","Hydraulics","Warehouse","Lean Manufacturing"],
|
|
"Accounting": ["QuickBooks","SAP","Oracle","Accounts Payable","Accounts Receivable","General Ledger","Financial Reporting","CPA","Payroll","Budgeting","Excel Advanced","Power BI","Tableau","GAAP","SOX","Audit"],
|
|
"Admin": ["Microsoft Office","Data Entry","Customer Service","Scheduling","Receptionist","Executive Assistant","Calendar Management","Salesforce","CRM","Multi-line Phone","Typing 60+ WPM","Bilingual Spanish","Notary"],
|
|
}
|
|
email_domains = ["gmail.com","yahoo.com","hotmail.com","outlook.com","icloud.com","protonmail.com"]
|
|
base_date = datetime(2026, 1, 1)
|
|
|
|
def make_phone():
|
|
return f"({random.randint(200,999)}) {random.randint(200,999)}-{random.randint(1000,9999)}"
|
|
|
|
print("=" * 60)
|
|
print("SCALE TEST: 2.5M rows + 100K vectors")
|
|
print("=" * 60)
|
|
t_start = time.time()
|
|
|
|
# ============================================================
|
|
# 100K CANDIDATES
|
|
# ============================================================
|
|
print("\nGenerating candidates (100K)...")
|
|
t0 = time.time()
|
|
N = 100_000
|
|
|
|
c_ids, c_first, c_last, c_emails, c_phones = [], [], [], [], []
|
|
c_city, c_state, c_zip = [], [], []
|
|
c_vertical, c_skills, c_resume = [], [], []
|
|
c_status, c_source, c_pay, c_years = [], [], [], []
|
|
|
|
for i in range(N):
|
|
fn = random.choice(first_names)
|
|
ln = random.choice(last_names)
|
|
city, state, zc = random.choice(cities_zips)
|
|
vert = random.choice(verticals)
|
|
sk = random.sample(skills_by_vert[vert], min(random.randint(3, 10), len(skills_by_vert[vert])))
|
|
yrs = random.randint(0, 30)
|
|
|
|
resume = f"{fn} {ln} — {vert} professional with {yrs} years experience in {city}, {state} {zc}. Skills: {', '.join(sk)}. "
|
|
resume += random.choice([
|
|
f"Previously at {random.choice(['Acme','TechFlow','GlobalStaff','MedPro','BuildRight','CoreSys','Apex','Summit'])} Corp.",
|
|
f"Seeking {random.choice(['contract','full-time','temp-to-hire'])} in {city} metro.",
|
|
f"Available {random.choice(['immediately','in 2 weeks','after current assignment'])}.",
|
|
f"Open to {random.choice(['remote','hybrid','on-site'])} work arrangements.",
|
|
f"Certified in {random.choice(sk)} with hands-on project experience.",
|
|
])
|
|
|
|
c_ids.append(f"CAND-{i+1:06d}")
|
|
c_first.append(fn)
|
|
c_last.append(ln)
|
|
c_emails.append(f"{fn.lower()}.{ln.lower()}{random.randint(1,99)}@{random.choice(email_domains)}")
|
|
c_phones.append(make_phone())
|
|
c_city.append(city)
|
|
c_state.append(state)
|
|
c_zip.append(zc)
|
|
c_vertical.append(vert)
|
|
c_skills.append("|".join(sk))
|
|
c_resume.append(resume)
|
|
c_status.append(random.choice(["active","active","active","inactive","placed"]))
|
|
c_source.append(random.choice(["Indeed","LinkedIn","Referral","Walk-in","Monster","Website"]))
|
|
c_pay.append(round(random.uniform(12, 95), 2))
|
|
c_years.append(yrs)
|
|
|
|
candidates = pa.table({
|
|
"candidate_id": c_ids, "first_name": c_first, "last_name": c_last,
|
|
"email": c_emails, "phone": c_phones,
|
|
"city": c_city, "state": c_state, "zip": c_zip,
|
|
"vertical": c_vertical, "skills": c_skills, "resume_summary": c_resume,
|
|
"status": c_status, "source": c_source, "min_pay_rate": c_pay, "years_experience": c_years,
|
|
})
|
|
upload("candidates", candidates)
|
|
print(f" Generated in {time.time()-t0:.1f}s")
|
|
|
|
# ============================================================
|
|
# 2K CLIENTS
|
|
# ============================================================
|
|
print("\nGenerating clients (2K)...")
|
|
prefixes = ["Apex","Summit","Core","National","Metro","Pacific","Global","United","Pinnacle","Horizon","Pioneer","Titan","Quantum","Vertex","Elite"]
|
|
suffixes = ["Industries","Solutions","Systems","Group","Corp","Technologies","Services","Partners","Manufacturing","Healthcare"]
|
|
cl_ids, cl_names, cl_verts, cl_city, cl_state, cl_zip = [], [], [], [], [], []
|
|
for i in range(2000):
|
|
city, state, zc = random.choice(cities_zips)
|
|
cl_ids.append(f"CLI-{i+1:05d}")
|
|
cl_names.append(f"{random.choice(prefixes)} {random.choice(suffixes)}")
|
|
cl_verts.append(random.choice(verticals))
|
|
cl_city.append(city)
|
|
cl_state.append(state)
|
|
cl_zip.append(zc)
|
|
clients = pa.table({"client_id": cl_ids, "company_name": cl_names, "vertical": cl_verts, "city": cl_city, "state": cl_state, "zip": cl_zip})
|
|
upload("clients", clients)
|
|
|
|
# ============================================================
|
|
# 15K JOB ORDERS
|
|
# ============================================================
|
|
print("\nGenerating job_orders (15K)...")
|
|
titles_map = {
|
|
"IT": ["Software Developer","Java Developer",".NET Developer","DevOps Engineer","Data Analyst","QA Engineer","Cloud Architect","React Developer","DBA","Security Analyst","Python Developer","Full Stack Developer"],
|
|
"Healthcare": ["Registered Nurse","LPN","CNA","Medical Assistant","Phlebotomist","Radiology Tech","Pharmacy Tech","Medical Coder"],
|
|
"Industrial": ["Forklift Operator","Welder","CNC Machinist","Quality Inspector","Maintenance Tech","Electrician","Warehouse Associate","Assembly Tech"],
|
|
"Accounting": ["Staff Accountant","AP Specialist","AR Specialist","Payroll Clerk","Financial Analyst","Bookkeeper","Controller","Tax Preparer"],
|
|
"Admin": ["Administrative Assistant","Executive Assistant","Receptionist","Data Entry Clerk","Office Manager","Customer Service Rep","HR Coordinator"],
|
|
}
|
|
jo_ids, jo_clients, jo_titles, jo_verts, jo_bills, jo_pays, jo_status = [], [], [], [], [], [], []
|
|
jo_city, jo_state, jo_zip, jo_desc = [], [], [], []
|
|
for i in range(15000):
|
|
vert = random.choice(verticals)
|
|
title = random.choice(titles_map[vert])
|
|
ci = random.randint(0, 1999)
|
|
city, state, zc = random.choice(cities_zips)
|
|
bill = round(random.uniform(25, 150), 2)
|
|
pay = round(bill * random.uniform(0.55, 0.75), 2)
|
|
req_sk = random.sample(skills_by_vert[vert], min(random.randint(3, 6), len(skills_by_vert[vert])))
|
|
desc = f"{title} for {cl_names[ci]} in {city}, {state}. Requires: {', '.join(req_sk)}. {random.randint(1,10)}+ years exp. ${bill}/hr."
|
|
jo_ids.append(f"JO-{i+1:06d}")
|
|
jo_clients.append(cl_ids[ci])
|
|
jo_titles.append(title)
|
|
jo_verts.append(vert)
|
|
jo_bills.append(bill)
|
|
jo_pays.append(pay)
|
|
jo_status.append(random.choice(["open","open","filled","filled","closed"]))
|
|
jo_city.append(city)
|
|
jo_state.append(state)
|
|
jo_zip.append(zc)
|
|
jo_desc.append(desc)
|
|
job_orders = pa.table({"job_order_id": jo_ids, "client_id": jo_clients, "title": jo_titles, "vertical": jo_verts, "bill_rate": jo_bills, "pay_rate": jo_pays, "status": jo_status, "city": jo_city, "state": jo_state, "zip": jo_zip, "description": jo_desc})
|
|
upload("job_orders", job_orders)
|
|
|
|
# ============================================================
|
|
# 50K PLACEMENTS
|
|
# ============================================================
|
|
print("\nGenerating placements (50K)...")
|
|
recruiters = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(100)]
|
|
p_ids, p_cands, p_jobs, p_clients, p_bills, p_pays, p_recs, p_status = [], [], [], [], [], [], [], []
|
|
for i in range(50000):
|
|
ci = random.randint(0, N-1)
|
|
ji = random.randint(0, 14999)
|
|
p_ids.append(f"PL-{i+1:06d}")
|
|
p_cands.append(c_ids[ci])
|
|
p_jobs.append(jo_ids[ji])
|
|
p_clients.append(jo_clients[ji])
|
|
p_bills.append(jo_bills[ji])
|
|
p_pays.append(jo_pays[ji])
|
|
p_recs.append(random.choice(recruiters))
|
|
p_status.append(random.choice(["active","active","completed","completed","terminated"]))
|
|
placements = pa.table({"placement_id": p_ids, "candidate_id": p_cands, "job_order_id": p_jobs, "client_id": p_clients, "bill_rate": p_bills, "pay_rate": p_pays, "recruiter": p_recs, "status": p_status})
|
|
upload("placements", placements)
|
|
|
|
# ============================================================
|
|
# 1M TIMESHEETS
|
|
# ============================================================
|
|
print("\nGenerating timesheets (1M)...")
|
|
ts_ids, ts_placements, ts_cands, ts_clients = [], [], [], []
|
|
ts_hrs_reg, ts_hrs_ot, ts_bill_total, ts_pay_total, ts_weeks, ts_approved = [], [], [], [], [], []
|
|
for i in range(1_000_000):
|
|
pi = random.randint(0, 49999)
|
|
hrs = random.choice([40.0, 40.0, 40.0, 32.0, 24.0, 20.0])
|
|
ot = random.choice([0.0, 0.0, 0.0, 4.0, 8.0, 12.0])
|
|
b = p_bills[pi]
|
|
p = p_pays[pi]
|
|
ts_ids.append(f"TS-{i+1:07d}")
|
|
ts_placements.append(p_ids[pi])
|
|
ts_cands.append(p_cands[pi])
|
|
ts_clients.append(p_clients[pi])
|
|
ts_hrs_reg.append(hrs)
|
|
ts_hrs_ot.append(ot)
|
|
ts_bill_total.append(round(hrs * b + ot * b * 1.5, 2))
|
|
ts_pay_total.append(round(hrs * p + ot * p * 1.5, 2))
|
|
ts_weeks.append((base_date - timedelta(weeks=random.randint(0, 156))).strftime("%Y-%m-%d"))
|
|
ts_approved.append(random.random() < 0.85)
|
|
timesheets = pa.table({"timesheet_id": ts_ids, "placement_id": ts_placements, "candidate_id": ts_cands, "client_id": ts_clients, "hours_regular": ts_hrs_reg, "hours_overtime": ts_hrs_ot, "bill_total": ts_bill_total, "pay_total": ts_pay_total, "week_ending": ts_weeks, "approved": ts_approved})
|
|
upload("timesheets", timesheets)
|
|
|
|
# ============================================================
|
|
# 800K CALL LOG
|
|
# ============================================================
|
|
print("\nGenerating call_log (800K)...")
|
|
call_ids, call_from, call_to, call_dur, call_ts, call_rec, call_cand, call_disp = [], [], [], [], [], [], [], []
|
|
disps = ["connected","voicemail","no_answer","busy","wrong_number","callback_scheduled"]
|
|
for i in range(800_000):
|
|
ci = random.randint(0, N-1)
|
|
call_ids.append(f"CALL-{i+1:07d}")
|
|
call_from.append(make_phone())
|
|
call_to.append(c_phones[ci])
|
|
call_dur.append(random.randint(0, 1800))
|
|
call_ts.append((base_date - timedelta(seconds=random.randint(0, 86400*365))).isoformat())
|
|
call_rec.append(random.choice(recruiters))
|
|
call_cand.append(c_ids[ci])
|
|
call_disp.append(random.choice(disps))
|
|
call_log = pa.table({"call_id": call_ids, "from_number": call_from, "to_number": call_to, "duration_seconds": call_dur, "timestamp": call_ts, "recruiter": call_rec, "candidate_id": call_cand, "disposition": call_disp})
|
|
upload("call_log", call_log)
|
|
|
|
# ============================================================
|
|
# 500K EMAIL LOG
|
|
# ============================================================
|
|
print("\nGenerating email_log (500K)...")
|
|
em_ids, em_from, em_to, em_subj, em_ts, em_rec, em_cand, em_opened = [], [], [], [], [], [], [], []
|
|
subjects = ["New opportunity: {}", "Following up", "Interview scheduled", "Timesheet reminder", "Background check complete", "Assignment details", "Rate update", "Welcome aboard"]
|
|
for i in range(500_000):
|
|
ci = random.randint(0, N-1)
|
|
ji = random.randint(0, 14999)
|
|
rec = random.choice(recruiters)
|
|
em_ids.append(f"EM-{i+1:07d}")
|
|
em_from.append(f"{rec.replace(' ','.').lower()}@acmestaffing.com")
|
|
em_to.append(c_emails[ci])
|
|
em_subj.append(random.choice(subjects).format(jo_titles[ji]))
|
|
em_ts.append((base_date - timedelta(seconds=random.randint(0, 86400*365))).isoformat())
|
|
em_rec.append(rec)
|
|
em_cand.append(c_ids[ci])
|
|
em_opened.append(random.random() < 0.55)
|
|
email_log = pa.table({"email_id": em_ids, "from_addr": em_from, "to_addr": em_to, "subject": em_subj, "timestamp": em_ts, "recruiter": em_rec, "candidate_id": em_cand, "opened": em_opened})
|
|
upload("email_log", email_log)
|
|
|
|
total = 100_000 + 2_000 + 15_000 + 50_000 + 1_000_000 + 800_000 + 500_000
|
|
t_total = time.time() - t_start
|
|
print(f"\n{'='*60}")
|
|
print(f"LOADED: {total:,} rows in {t_total:.0f}s")
|
|
print(f"{'='*60}")
|