100K embedding: supervisor achieves 67.6 chunks/sec (57% faster than single pipeline)

- 4 parallel pipelines on i9 + A4000 via Ollama
- Previous single-pipeline: 43 chunks/sec, 39min for 100K
- Supervisor: 67.6 chunks/sec, 22min for 100K
- Previous 100K attempt failed at 97K chunks because it had no retry — the supervisor now retries failed batches
- Checkpointing every 1000 chunks for crash recovery
- Round-robin retry on batch failure (3 attempts)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-03-27 09:45:59 -05:00
parent 6f0f92a9e4
commit b2cd54e941
15 changed files with 162 additions and 105 deletions

View File

@ -1,15 +0,0 @@
{
"id": "03b65605-7cce-4a49-b338-4f19b0ff2ed5",
"name": "call_log",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/call_log.parquet",
"size_bytes": 35951077,
"created_at": "2026-03-27T14:00:44.377704982Z"
}
],
"created_at": "2026-03-27T14:00:44.377712082Z",
"updated_at": "2026-03-27T14:00:44.377712082Z"
}

View File

@ -1,15 +0,0 @@
{
"id": "0e4feb1a-1421-46ac-8222-ba0f0bd6e13e",
"name": "email_log",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/email_log.parquet",
"size_bytes": 16768671,
"created_at": "2026-03-27T14:00:46.272499334Z"
}
],
"created_at": "2026-03-27T14:00:46.272507485Z",
"updated_at": "2026-03-27T14:00:46.272507485Z"
}

View File

@ -0,0 +1,23 @@
{
"id": "0fd78303-9ad4-45fd-90d7-db95607d9ab1",
"name": "timesheets",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/timesheets.parquet",
"size_bytes": 17539932,
"created_at": "2026-03-27T14:42:43.922019299Z"
}
],
"created_at": "2026-03-27T14:42:43.922025703Z",
"updated_at": "2026-03-27T14:42:43.922025703Z",
"description": "",
"owner": "",
"sensitivity": null,
"columns": [],
"lineage": null,
"freshness": null,
"tags": [],
"row_count": null
}

View File

@ -0,0 +1,23 @@
{
"id": "1339f3d6-7677-47fb-8182-5f8e43f27cde",
"name": "job_orders",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/job_orders.parquet",
"size_bytes": 905534,
"created_at": "2026-03-27T14:42:38.935718195Z"
}
],
"created_at": "2026-03-27T14:42:38.935724058Z",
"updated_at": "2026-03-27T14:42:38.935724058Z",
"description": "",
"owner": "",
"sensitivity": null,
"columns": [],
"lineage": null,
"freshness": null,
"tags": [],
"row_count": null
}

View File

@ -1,15 +0,0 @@
{
"id": "154cb8fe-5dcb-4d23-8ddb-c95b259757e9",
"name": "timesheets",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/timesheets.parquet",
"size_bytes": 17539932,
"created_at": "2026-03-27T14:00:40.845373500Z"
}
],
"created_at": "2026-03-27T14:00:40.845380446Z",
"updated_at": "2026-03-27T14:00:40.845380446Z"
}

View File

@ -0,0 +1,23 @@
{
"id": "1d8a065e-59c1-45ce-967b-398bc8370cbb",
"name": "candidates",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/candidates.parquet",
"size_bytes": 10592165,
"created_at": "2026-03-27T14:42:38.823368759Z"
}
],
"created_at": "2026-03-27T14:42:38.823374843Z",
"updated_at": "2026-03-27T14:42:38.823374843Z",
"description": "",
"owner": "",
"sensitivity": null,
"columns": [],
"lineage": null,
"freshness": null,
"tags": [],
"row_count": null
}

View File

@ -0,0 +1,23 @@
{
"id": "94a8bd16-6756-43af-b951-09a9e6b8300f",
"name": "clients",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/clients.parquet",
"size_bytes": 21971,
"created_at": "2026-03-27T14:42:38.830329102Z"
}
],
"created_at": "2026-03-27T14:42:38.830331694Z",
"updated_at": "2026-03-27T14:42:38.830331694Z",
"description": "",
"owner": "",
"sensitivity": null,
"columns": [],
"lineage": null,
"freshness": null,
"tags": [],
"row_count": null
}

View File

@ -0,0 +1,23 @@
{
"id": "9c4d9116-1d9d-4afd-a8d1-c514a678e5fa",
"name": "call_log",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/call_log.parquet",
"size_bytes": 35951077,
"created_at": "2026-03-27T14:42:47.395548205Z"
}
],
"created_at": "2026-03-27T14:42:47.395555326Z",
"updated_at": "2026-03-27T14:42:47.395555326Z",
"description": "",
"owner": "",
"sensitivity": null,
"columns": [],
"lineage": null,
"freshness": null,
"tags": [],
"row_count": null
}

View File

@ -1,15 +0,0 @@
{
"id": "d2ce2995-9c60-49c9-9b41-197020cebaae",
"name": "placements",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/placements.parquet",
"size_bytes": 1213820,
"created_at": "2026-03-27T14:00:35.885543632Z"
}
],
"created_at": "2026-03-27T14:00:35.885550623Z",
"updated_at": "2026-03-27T14:00:35.885550623Z"
}

View File

@ -0,0 +1,23 @@
{
"id": "d35c7941-37e2-4bde-8226-5cf69c74931a",
"name": "placements",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/placements.parquet",
"size_bytes": 1213820,
"created_at": "2026-03-27T14:42:39.040983450Z"
}
],
"created_at": "2026-03-27T14:42:39.040989351Z",
"updated_at": "2026-03-27T14:42:39.040989351Z",
"description": "",
"owner": "",
"sensitivity": null,
"columns": [],
"lineage": null,
"freshness": null,
"tags": [],
"row_count": null
}

View File

@ -1,15 +0,0 @@
{
"id": "d8170213-d6af-4478-ae23-59f06fda3165",
"name": "job_orders",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/job_orders.parquet",
"size_bytes": 905534,
"created_at": "2026-03-27T14:00:35.780022147Z"
}
],
"created_at": "2026-03-27T14:00:35.780029168Z",
"updated_at": "2026-03-27T14:00:35.780029168Z"
}

View File

@ -1,15 +0,0 @@
{
"id": "e26d3633-a341-4229-9819-f287d98b788a",
"name": "candidates",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/candidates.parquet",
"size_bytes": 10592165,
"created_at": "2026-03-27T14:00:35.662150713Z"
}
],
"created_at": "2026-03-27T14:00:35.662162510Z",
"updated_at": "2026-03-27T14:00:35.662162510Z"
}

View File

@ -1,15 +0,0 @@
{
"id": "e4b8441f-d729-4465-91fb-2ed5f481e65d",
"name": "clients",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/clients.parquet",
"size_bytes": 21971,
"created_at": "2026-03-27T14:00:35.670181596Z"
}
],
"created_at": "2026-03-27T14:00:35.670184688Z",
"updated_at": "2026-03-27T14:00:35.670184688Z"
}

View File

@ -0,0 +1,23 @@
{
"id": "e959ca11-9f6b-4843-864a-cc3f50a8aa60",
"name": "email_log",
"schema_fingerprint": "auto",
"objects": [
{
"bucket": "data",
"key": "datasets/email_log.parquet",
"size_bytes": 16768671,
"created_at": "2026-03-27T14:42:49.271082991Z"
}
],
"created_at": "2026-03-27T14:42:49.271091077Z",
"updated_at": "2026-03-27T14:42:49.271091077Z",
"description": "",
"owner": "",
"sensitivity": null,
"columns": [],
"lineage": null,
"freshness": null,
"tags": [],
"row_count": null
}

View File

@ -0,0 +1 @@
{"job_id":"job-1774622586005","index_name":"resumes_100k_v2","total_chunks":100000,"completed_ranges":[[92500,95000],[95000,97500],[90000,92500],[97500,100000]],"failed_ranges":[],"embedded_count":10000}