From 08a086779b7de3bf1dd46fcc383b8ae250404976 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 30 Apr 2026 16:31:45 -0500 Subject: [PATCH] =?UTF-8?q?multi=5Fcoord=5Fstress:=20fresh=5Fworkers=20two?= =?UTF-8?q?-tier=20index=20=E2=80=94=20fresh-resume=20now=20top-1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Runs #003-#009 surfaced the same finding: fresh workers added mid-run to the main 'workers' vectord index (5K items) reliably *absorbed* (HTTP 200) but failed to *surface* in semantic queries even with content-matching prompts. Distances on the verify queries sat at 0.25-0.65 against existing workers; fresh items were beyond top-K. Better embedder (v2-moe) didn't help — distances got TIGHTER on existing items, pushing fresh items further out of reach. Root cause: coder/hnsw incremental adds to a populated graph land in poorly-connected regions and disappear from search traversal. Known property of HNSW post-build adds; not a bug. Fix: two-tier index pattern (canonical NRT search architecture). Fresh content goes to a small "hot" corpus (fresh_workers); main queries include it in the corpora list and merge results. Hot corpus has no recall crowding because it's tiny; periodic batch job (post- G3) merges it into the main index. Implementation: - ensureFreshIndex(hc, gw, name, dim) — idempotent POST /v1/vectors/index. 409 from re-create treated as "already there." - ingestFreshWorker now takes idx parameter so callers can target fresh_workers instead of workers. - multi_coord_stress phase 1b creates fresh_workers index + ingests 3 fresh workers there + searches verifyCorpora=[workers, ethereal_workers, fresh_workers]. 
Run #010 result: fresh-001 (Senior tower crane rigger NCCCO Chicago) top-1: fresh-001 from fresh_workers, distance 0.143 fresh-002 (Bilingual Spanish/English OSHA trainer Indianapolis) top-1: fresh-002 from fresh_workers, distance 0.146 fresh-003 (FAA Part 107 drone surveyor Chicago) top-1: fresh-003 from fresh_workers, distance 0.129 3/3 fresh workers surface at top-1 — the absorption-but-not- findable issue from runs #003-#009 is closed. All other metrics held: diversity 0.007, determinism 1.000, verbatim handover 4/4, paraphrase handover 4/4, swap Jaccard 0.000, inbox burst all 6 events accepted + traced to Langfuse. This is the final structural fix for the multi-coord stress suite. Phase 3 is feature-complete. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../reality-tests/multi_coord_stress_010.md | 82 +++++++++++++++++++ scripts/multi_coord_stress/main.go | 54 ++++++++++-- 2 files changed, 128 insertions(+), 8 deletions(-) create mode 100644 reports/reality-tests/multi_coord_stress_010.md diff --git a/reports/reality-tests/multi_coord_stress_010.md b/reports/reality-tests/multi_coord_stress_010.md new file mode 100644 index 0000000..6dfa1d6 --- /dev/null +++ b/reports/reality-tests/multi_coord_stress_010.md @@ -0,0 +1,82 @@ +# Multi-Coordinator Stress Test — Run 010 + +**Generated:** 2026-04-30T21:30:38.434794788Z +**Coordinators:** alice / bob / carol (each with own playbook namespace: `playbook_alice` / `playbook_bob` / `playbook_carol`) +**Contracts:** alpha_milwaukee_distribution / beta_indianapolis_manufacturing / gamma_chicago_construction +**Corpora:** `workers,ethereal_workers` +**K per query:** 8 +**Total events captured:** 67 +**Evidence:** `reports/reality-tests/multi_coord_stress_010.json` + +--- + +## Diversity — is the system locking into scenarios or cycling? 
+ +| Metric | Mean Jaccard | n pairs | Interpretation | +|---|---:|---:|---| +| Same role across different contracts | 0.007407407407407408 | 9 | Lower = more diverse (different region/cert mix → different workers) | +| Different roles within same contract | 0.026455026455026454 | 18 | Should be near-zero (different roles = different worker pools) | + +**Healthy ranges:** +- Same role across contracts: < 0.30 means the system is genuinely picking different workers per region/contract. +- Different roles same contract: < 0.10 means role-specific retrieval is working. +- If either is > 0.50, the system is "cycling" the same handful of workers regardless of query intent. + +--- + +## Determinism — same query reissued, top-K stability + +| Metric | Value | +|---|---:| +| Mean Jaccard on retrieval-only reissue | 1 | +| Number of reissue pairs | 12 | + +**Interpretation:** +- ≥ 0.95: HNSW retrieval is highly deterministic; reissues land on near-identical top-K. Good — system locks into a stable view of "best workers for this query." +- 0.80 – 0.95: Some HNSW or embed variance, acceptable. +- < 0.80: Retrieval is unstable — reissues see substantially different results, suggesting either embed nondeterminism (Ollama returning slightly different vectors) or vectord nondeterminism (HNSW insertion order affecting recall). + +--- + +## Learning — handover hit rate + +Bob takes Alice's contract using Alice's playbook namespace. Did Alice's recorded answers surface in Bob's results? 
+ +| Metric | Value | +|---|---:| +| Verbatim handover queries run | 4 | +| Alice's recorded answer at Bob's top-1 (verbatim) | 4 | +| Alice's recorded answer in Bob's top-K (verbatim) | 4 | +| **Verbatim handover hit rate (top-1)** | **1** | +| Paraphrase handover queries run | 4 | +| Alice's recorded answer at Bob's top-1 (paraphrase) | 4 | +| Alice's recorded answer in Bob's top-K (paraphrase) | 4 | +| **Paraphrase handover hit rate (top-1)** | **1** | + +**Interpretation:** +- Verbatim hit rate ≈ 1.0: trivial case — Bob runs identical queries; should always hit. +- Paraphrase hit rate ≥ 0.5: institutional memory survives wording change — the harder learning property. +- Paraphrase hit rate ≈ 0.0: Bob's paraphrases drift past the inject threshold, so Alice's recordings don't activate. Same caveat as the playbook_lift paraphrase pass. + +--- + +## Per-event capture + +All matrix.search responses live in the JSON — top-K with worker IDs, distances, and per-corpus counts. Search by phase: + +```bash +jq '.events[] | select(.phase == "merge")' reports/reality-tests/multi_coord_stress_010.json +jq '.events[] | select(.coordinator == "alice" and .phase == "baseline")' reports/reality-tests/multi_coord_stress_010.json +jq '.events[] | select(.role == "warehouse worker") | {phase, contract, top_k_ids: [.top_k[].id]}' reports/reality-tests/multi_coord_stress_010.json +``` + +--- + +## What's NOT in this run (Phase 1 deliberately defers) + +- **48-hour clock.** Events fire as discrete steps, not on a timeline. +- **Email / SMS ingest.** No endpoints exist on the Go side yet. +- **New-resume injection mid-run.** The corpus is fixed at the start. +- **Langfuse traces.** Need Go-side wiring. + +These are Phase 2/3. The Phase 1 substrate is what the time-based runner will mount on top of. 
diff --git a/scripts/multi_coord_stress/main.go b/scripts/multi_coord_stress/main.go index e0dba2a..1fa6fab 100644 --- a/scripts/multi_coord_stress/main.go +++ b/scripts/multi_coord_stress/main.go @@ -302,8 +302,12 @@ func main() { Verify: "FAA Part 107 drone surveyor UAV pilot GIS construction site mapping Chicago", }, } + const freshIdx = "fresh_workers" + if err := ensureFreshIndex(hc, *gateway, freshIdx, 768); err != nil { + log.Fatalf("ensure fresh_workers index: %v", err) + } for _, fw := range freshWorkers { - if err := ingestFreshWorker(hc, *gateway, fw.ID, fw.Resume, map[string]any{ + if err := ingestFreshWorker(hc, *gateway, freshIdx, fw.ID, fw.Resume, map[string]any{ "name": fw.ID, "role": "fresh-resume", "source": "phase-1b-injection", @@ -311,8 +315,13 @@ func main() { log.Fatalf("ingest fresh worker %s: %v", fw.ID, err) } } + // Verify queries search across main + fresh corpora — the small + // fresh corpus should surface the freshly-added worker because + // it has no recall competition there. + verifyCorpora := append([]string{}, corpora...) + verifyCorpora = append(verifyCorpora, freshIdx) for _, fw := range freshWorkers { - resp := must(matrixSearch(hc, *gateway, fw.Verify, corpora, *k, false, "")) + resp := must(matrixSearch(hc, *gateway, fw.Verify, verifyCorpora, *k, false, "")) ev := captureEvent("new-resume-verify", 6, "system", "fresh-resume-pool", "fresh", fw.Verify, 1, false, "", resp) // Find the fresh worker's rank in top-K (rank 0 = top-1). freshRank := -1 @@ -1002,12 +1011,41 @@ func matrixSearch(hc *http.Client, gw, query string, corpora []string, k int, us return &out, nil } +// ensureFreshIndex creates the fresh_workers vectord index if it +// doesn't exist yet. Idempotent — re-creating returns 409 which we +// treat as "already there." Two-tier search pattern: fresh content +// goes to a small "hot" index and the search merges it with the +// main workers index. 
Solves the HNSW post-build add recall issue +// surfaced in runs #003-#005 (incremental adds to a 5K+ HNSW graph +// can land in poorly-connected regions and disappear from search; +// a small hot index has no such crowding). +func ensureFreshIndex(hc *http.Client, gw, indexName string, dim int) error { + body, _ := json.Marshal(map[string]any{ + "name": indexName, + "dimension": dim, + "distance": "cosine", + }) + req, _ := http.NewRequest("POST", gw+"/v1/vectors/index", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + resp, err := hc.Do(req) + if err != nil { + return fmt.Errorf("create index: %w", err) + } + defer resp.Body.Close() + if resp.StatusCode == http.StatusConflict || resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusCreated { + return nil + } + rb, _ := io.ReadAll(resp.Body) + return fmt.Errorf("create index %d: %s", resp.StatusCode, string(rb)) +} + // ingestFreshWorker embeds + adds a single fresh worker to the -// vectord 'workers' index. Two HTTP hops via the gateway: /v1/embed -// to get the vector, /v1/vectors/workers/add to insert. Used by the -// new-resume-injection phase to test mid-run absorption of fresh -// candidates without restart. -func ingestFreshWorker(hc *http.Client, gw, id, text string, metadata map[string]any) error { +// given vectord index. Two HTTP hops via the gateway: /v1/embed for +// the vector, /v1/vectors/index/{idx}/add to insert. The idx +// parameter exists so callers can target a separate hot index +// (fresh_workers) rather than the main 5K-item workers index, where +// HNSW post-build recall is unreliable. 
+func ingestFreshWorker(hc *http.Client, gw, idx, id, text string, metadata map[string]any) error { embedBs, _ := json.Marshal(map[string]any{ "texts": []string{text}, "model": "nomic-embed-text-v2-moe", @@ -1039,7 +1077,7 @@ func ingestFreshWorker(hc *http.Client, gw, id, text string, metadata map[string {"id": id, "vector": er.Vectors[0], "metadata": json.RawMessage(metaBs)}, }, }) - req2, _ := http.NewRequest("POST", gw+"/v1/vectors/index/workers/add", bytes.NewReader(addBs)) + req2, _ := http.NewRequest("POST", gw+"/v1/vectors/index/"+idx+"/add", bytes.NewReader(addBs)) req2.Header.Set("Content-Type", "application/json") resp2, err := hc.Do(req2) if err != nil {