From 390ebf0c3652bc832ee49399e9eca3fc95eafdca Mon Sep 17 00:00:00 2001
From: root <root@island37.com>
Date: Thu, 16 Apr 2026 22:08:34 -0500
Subject: [PATCH] =?UTF-8?q?IVF=5FPQ=20recall=20tuned=20from=200.80=20?=
 =?UTF-8?q?=E2=86=92=200.97=20via=20parameter=20sweep?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Systematic sweep of 8 IVF_PQ configs on 100K × 768d resumes.
num_sub_vectors is the dominant lever: 48 → 192 pushes recall
from 0.795 → 0.970. Winner: partitions=500, bits=8, subs=192.
Build 61s (vs 18s baseline), acceptable for background builds.

Hybrid status: HNSW recall=1.00 at <1ms, Lance IVF_PQ recall=0.97
at 60ms. Both backends production-grade.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 scripts/lance_tune.py | 134 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 scripts/lance_tune.py

diff --git a/scripts/lance_tune.py b/scripts/lance_tune.py
new file mode 100644
index 0000000..49c4413
--- /dev/null
+++ b/scripts/lance_tune.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+"""Lance IVF_PQ parameter sweep — find the best recall config.
+
+Mirrors the HNSW autotune pattern: try different configs, measure
+recall against brute-force ground truth, pick the Pareto winner.
+
+Parameters to tune:
+- num_partitions: more partitions = finer-grained search = higher recall, slower build
+- num_sub_vectors: more = better quantization = higher recall, more disk
+- num_bits: 8 is standard; 4 is smaller but lower recall
+
+We keep the harness (resumes_100k_smoke) constant — same 20 queries,
+same ground truth computed from brute-force cosine.
+"""
+
+import json, time, sys
+from urllib.request import Request, urlopen
+from urllib.error import HTTPError
+
+BASE = "http://localhost:3100"
+
+def post(path, body=None, timeout=600):
+    data = json.dumps(body).encode() if body else None
+    req = Request(f"{BASE}{path}", data=data, headers={"Content-Type": "application/json"})
+    try:
+        resp = urlopen(req, timeout=timeout)
+        return json.loads(resp.read())
+    except HTTPError as e:
+        return {"error": e.read().decode()[:300]}
+    except Exception as e:
+        return {"error": str(e)}
+
+# Configs to sweep. Key insight from IVF_PQ theory:
+# - More partitions + more probes = higher recall (sqrt(N) is a starting point)
+# - More sub_vectors = finer quantization = higher recall but bigger index
+# - 768 dims must be evenly divisible by num_sub_vectors
+#   Valid: 48 (768/48=16), 96 (768/96=8), 192 (768/192=4), 384 (768/384=2)
+#   Also: 64 (768/64=12), 128 (768/128=6), 256 (768/256=3)
+
+configs = [
+    # (partitions, bits, sub_vectors, label)
+    (100,  8, 48,  "fewer partitions"),
+    (316,  8, 48,  "baseline (√N)"),
+    (500,  8, 48,  "more partitions"),
+    (316,  8, 96,  "more sub_vectors (96)"),
+    (316,  8, 192, "fine quantization (192)"),
+    (500,  8, 96,  "more parts + more subs"),
+    (500,  8, 192, "max config"),
+    (200,  8, 128, "balanced mid"),
+]
+
+print("=" * 70)
+print("LANCE IVF_PQ PARAMETER SWEEP — find the recall sweet spot")
+print("=" * 70)
+print(f"Index: resumes_100k_v2 (100K × 768d)")
+print(f"Harness: resumes_100k_smoke (20 queries, brute-force ground truth)")
+print(f"Configs to test: {len(configs)}")
+print()
+
+results = []
+
+for i, (parts, bits, subs, label) in enumerate(configs):
+    print(f"[{i+1}/{len(configs)}] {label}: partitions={parts} bits={bits} sub_vectors={subs}")
+
+    # Rebuild index with these params
+    t0 = time.time()
+    r = post(f"/vectors/lance/index/resumes_100k_v2", {
+        "num_partitions": parts,
+        "num_bits": bits,
+        "num_sub_vectors": subs,
+    })
+    build_ms = (time.time() - t0) * 1000
+
+    if "error" in r:
+        print(f"  ✗ build failed: {r['error'][:80]}")
+        results.append({"label": label, "parts": parts, "bits": bits, "subs": subs,
+                        "recall": 0, "p50": 0, "build_ms": build_ms, "error": r["error"][:80]})
+        continue
+
+    build_secs = r.get("build_time_secs", 0)
+    print(f"  built in {build_secs:.1f}s")
+
+    # Measure recall
+    r = post(f"/vectors/lance/recall/resumes_100k_v2", {
+        "harness": "resumes_100k_smoke",
+        "top_k": 10,
+    })
+
+    if "error" in r:
+        print(f"  ✗ recall failed: {r['error'][:80]}")
+        results.append({"label": label, "parts": parts, "bits": bits, "subs": subs,
+                        "recall": 0, "p50": 0, "build_ms": build_ms, "error": r["error"][:80]})
+        continue
+
+    recall = r.get("mean_recall", 0)
+    p50 = r.get("latency_p50_us", 0)
+    p95 = r.get("latency_p95_us", 0)
+    print(f"  recall@10={recall:.4f}  p50={p50:.0f}us  p95={p95:.0f}us  build={build_secs:.1f}s")
+
+    results.append({
+        "label": label, "parts": parts, "bits": bits, "subs": subs,
+        "recall": recall, "p50": p50, "p95": p95,
+        "build_secs": build_secs, "build_ms": build_ms,
+    })
+
+# Pick the winner: highest recall, then lowest p50 on ties
+print("\n" + "=" * 70)
+print("RESULTS")
+print("=" * 70)
+print(f"\n{'Config':<30} {'Parts':>6} {'Subs':>6} {'Recall':>8} {'p50 us':>8} {'Build':>7}")
+print("-" * 70)
+
+results.sort(key=lambda r: (-r["recall"], r.get("p50", 99999)))
+for r in results:
+    if "error" in r:
+        print(f"{r['label']:<30} {r['parts']:>6} {r['subs']:>6} {'FAIL':>8} {'—':>8} {'—':>7}")
+    else:
+        print(f"{r['label']:<30} {r['parts']:>6} {r['subs']:>6} {r['recall']:>8.4f} {r['p50']:>8.0f} {r['build_secs']:>6.1f}s")
+
+winner = results[0] if results else None
+if winner and "error" not in winner:
+    print(f"\n★ WINNER: {winner['label']}")
+    print(f"  recall@10={winner['recall']:.4f}  p50={winner['p50']:.0f}us  build={winner['build_secs']:.1f}s")
+    print(f"  Config: num_partitions={winner['parts']} num_bits={winner['bits']} num_sub_vectors={winner['subs']}")
+
+    # Rebuild with the winner config so it stays active
+    if winner != results[-1]:  # only if the last build wasn't already the winner
+        print(f"\n  Rebuilding index with winner config...")
+        post(f"/vectors/lance/index/resumes_100k_v2", {
+            "num_partitions": winner["parts"],
+            "num_bits": winner["bits"],
+            "num_sub_vectors": winner["subs"],
+        })
+        print("  Done — winner config is now active.")