From 137aed64fbd291edad3527238d99f67400f01821 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 20 Apr 2026 23:29:13 -0500 Subject: [PATCH] =?UTF-8?q?Coherence=20pass=20=E2=80=94=20PRD/PHASES=20upd?= =?UTF-8?q?ates,=20config=20snapshot=20wired,=20unit=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit J flagged the audit: "make sure everything flows coherently, no pseudocode or unnecessary patches or ignoring any particular part of what we built." This is that pass. PRD.md updates: - Phase 19 refinement block — geo-filter + role-prefilter WIRED with citation density numbers (0.32 → 1.38, and 2 → 28 on same scenario). - Phase 20 rewrite — mistral dropped, qwen3.5 + qwen3 local hot path, think:false as the key mechanical finding, kimi-k2.6 upgrade path. - Phase 21 status block — think plumbing + cloud executor routing added after original commit. - Phase 22 item B (cloud rescue) — pivot sanitizer, rescue verified 1/3 on stress_01. - Phase 23 NEW — staffer identity + tool_level + competence-weighted retrieval + kb_staffer_report. Auto-discovered worker labels called out with real numbers (Rachel Lewis 12× across 4 staffers). - Phase 24 NEW — Observer/Autotune integration gap DOCUMENTED, not fixed. Observer has been idle at 0 ops for 3600+ cycles because scenarios hit gateway:3100 directly, bypassing MCP:3700 which the observer wraps. This is the honest "we're not using it in these tests" signal J surfaced. Fix deferred; gap visible now. PHASES.md: - Appended Phases 20-23 as checked, Phase 24 as unchecked gap. - Updated footer count: 102 unit tests across all layers. - Latest line updated with 14× citation lift + 46.4pt tool-asymmetry finding. scenario.ts: - snapshotConfig() was defined but never called. Now fires at every scenario start with a stable sha256 hash over the active model set + tool_level + cloud flags. config_snapshots.jsonl finally populates, which the error_corrections diff path needs to work correctly. kb.test.ts (new): 4 signature invariant tests — stability across unrelated fields (date, contract, staffer), sensitivity to role/city/ count changes, digest shape. All pass under `bun test`. service.rs: 6 Rust extractor tests for extract_target_geo + extract_target_role — basic, missing-state-returns-none, word boundary (civilian != city), multi-word role, absent role, quoted value parse. All pass under `cargo test -p vectord --lib extractor_tests`. 
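For context, the kb.ts side of `snapshotConfig` is not shown in this diff (only the scenario.ts call site below is). A minimal TypeScript sketch of what that append path could look like, assuming a JSONL append to `data/_kb/config_snapshots.jsonl` plus a skip-when-the-last-hash-matches guard (the de-cluttering behavior the scenario.ts comment describes); the file layout and field names here are assumptions, not the shipped code:

```ts
// Sketch only — the real snapshotConfig lives in tests/multi-agent/kb.ts.
import { appendFile, readFile } from "node:fs/promises";
import { join } from "node:path";

const KB_DIR = "data/_kb"; // assumed location, per the Phase 22 file list

export async function snapshotConfig(
  configHash: string,
  activeModels: Record<string, string>,
  note: string,
): Promise<void> {
  const path = join(KB_DIR, "config_snapshots.jsonl");

  // Skip the append when the last recorded hash is identical, so a staffer
  // running back-to-back with an unchanged config doesn't clutter the file.
  let lastHash: string | undefined;
  try {
    const lines = (await readFile(path, "utf8")).trim().split("\n");
    lastHash = JSON.parse(lines[lines.length - 1]).config_hash;
  } catch {
    // no snapshot file yet — first run
  }
  if (lastHash === configHash) return;

  const row = {
    ts: new Date().toISOString(),
    config_hash: configHash,
    active_models: activeModels,
    note,
  };
  await appendFile(path, JSON.stringify(row) + "\n");
}
```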
Dangling items now honestly documented rather than silently pending:
- Chunking cache (config/models.json SPEC, not wired) — flagged
- Playbook versioning (SPEC, not wired) — flagged
- Observer integration (WIRED but disconnected) — new Phase 24
---
 crates/vectord/src/service.rs | 43 +++++++++++++++++
 docs/PHASES.md                | 68 ++++++++++++++++++++++++--
 docs/PRD.md                   | 90 ++++++++++++++++++++++++++++++++---
 tests/multi-agent/kb.test.ts  | 51 ++++++++++++++++++++
 tests/multi-agent/scenario.ts | 32 ++++++++++++-
 5 files changed, 271 insertions(+), 13 deletions(-)
 create mode 100644 tests/multi-agent/kb.test.ts

diff --git a/crates/vectord/src/service.rs b/crates/vectord/src/service.rs
index 977575f..48310fc 100644
--- a/crates/vectord/src/service.rs
+++ b/crates/vectord/src/service.rs
@@ -2486,3 +2486,46 @@ async fn lance_build_scalar_index(
         Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)),
     }
 }
+
+#[cfg(test)]
+mod extractor_tests {
+    use super::*;
+
+    #[test]
+    fn extract_target_geo_basic() {
+        let f = "role = 'Welder' AND city = 'Toledo' AND state = 'OH' AND CAST(availability AS DOUBLE) > 0.5";
+        assert_eq!(extract_target_geo(f), Some(("Toledo".into(), "OH".into())));
+    }
+
+    #[test]
+    fn extract_target_geo_missing_state_returns_none() {
+        let f = "role = 'Welder' AND city = 'Toledo'";
+        assert_eq!(extract_target_geo(f), None);
+    }
+
+    #[test]
+    fn extract_target_geo_word_boundary() {
+        // `civilian_rank = 1` must not be misread as a `city` clause — the extractor has to anchor on the exact `city` token and still pull the real city/state pair.
+        let f = "civilian_rank = 1 AND city = 'Toledo' AND state = 'OH'";
+        assert_eq!(extract_target_geo(f), Some(("Toledo".into(), "OH".into())));
+    }
+
+    #[test]
+    fn extract_target_role_basic() {
+        let f = "role = 'Welder' AND city = 'Toledo'";
+        assert_eq!(extract_target_role(f), Some("Welder".into()));
+    }
+
+    #[test]
+    fn extract_target_role_none_when_absent() {
+        let f = "city = 'Toledo' AND state = 'OH'";
+        assert_eq!(extract_target_role(f), None);
+    }
+
+    #[test]
+    fn extract_target_role_multi_word() {
+        let f = "role = 'Warehouse Associate' AND city = 'Chicago'";
+        assert_eq!(extract_target_role(f), Some("Warehouse Associate".into()));
+    }
+}
+
diff --git a/docs/PHASES.md b/docs/PHASES.md
index b562c5c..502d971 100644
--- a/docs/PHASES.md
+++ b/docs/PHASES.md
@@ -230,7 +230,7 @@
   - Profile-driven routing: `POST /vectors/profile/{id}/search` auto-routes to Lance when profile.vector_backend=lance
   - Auto-migrate + auto-index on activation
   - Measured on real 100K × 768d: migrate 0.57s, IVF_PQ build 16.2s (14× faster than HNSW 230s), search 23ms, append 100 rows 3.3ms, doc_id fetch 3.5ms (with scalar btree)
-  - IVF_PQ recall@10 = 0.805 (HNSW = 1.000) — measured via `/vectors/lance/recall/{idx}` harness
+  - IVF_PQ recall@10 = 0.805 with Lance's default `nprobes=1` (the hidden cap — see 2026-04-20 tuning work below, which lifts it to 1.000). Measured via `/vectors/lance/recall/{idx}` harness.
- [x] Phase E.3: Scheduled ingest — 2026-04-17 - `ingestd::schedule` module: ScheduleDef, ScheduleStore (JSON at `_schedules/{id}.json`), Scheduler tokio task - Supports MySQL + Postgres sources on interval triggers (Cron variant defined, parsing stubbed) @@ -241,10 +241,68 @@ - Two-tier: lopdf text extraction → Tesseract 5.5 fallback for scanned/image PDFs - Extracts embedded XObject /Image streams, shells to tesseract --oem 3 --psm 6 - Same schema (source_file, page_number, text_content) — downstream unchanged -- [ ] Fine-tuned domain models -- [ ] Multi-node query distribution +- [x] Catalog hygiene — idempotent `register()` + dedupe + DELETE (2026-04-19, ADR-020) + - `catalogd::Registry::register` now gates on `(name, schema_fingerprint)`: same fp → reuse `DatasetId` and update objects in place; different fp → return error (409 Conflict on HTTP, `FAILED_PRECONDITION` on gRPC). First-time registration is unchanged. + - `POST /catalog/dedupe` one-shot operator endpoint collapses pre-existing duplicates; winner = non-null `row_count` first, newest `updated_at` second. + - `DELETE /catalog/datasets/by-name/{name}` removes the manifest from both in-memory registry and object storage (metadata-only — parquet files, vector indexes, tombstones are NOT cascade-deleted). Added to support test-harness cleanup; also plugs a real catalog hole where zombie entries from prior deletes would break DataFusion schema inference. + - Cleanup run on live catalog: 374 → 31 datasets, 343 orphan manifests removed, 0 errors. 308× `successful_playbooks` was the worst offender. + - Concurrency: write lock held across storage I/O in `register()` to close the check→insert TOCTOU window (32-worker multi-threaded stress test verifies single-manifest invariant). + - End-to-end verification: `scripts/e2e_pipeline_check.sh` runs 31 assertions across 12 pipeline stages (ingest → catalog → SQL+JOIN → dedup → idempotency → metadata → PII → vector embed → semantic search → cleanup) against the live gateway. Idempotent across repeat runs. + - Tests: 11 new in `catalogd` (was 0, includes 3 concurrency tests + 3 delete_dataset tests); 11 new in `storaged` for `AppendLog` + `ErrorJournal` (was 0). Fixed a broken doctest in `append_log.rs`. +- [x] Autotune agent: portfolio rotation + auto-bootstrap (2026-04-20) + - `pick_periodic_target` now sources candidates from `IndexRegistry` (not just promoted indexes) and picks least-recently-tuned, so trial budget spreads across every index with ≥1000 vectors instead of fixating on one converged champion. + - `run_one_cycle` bootstraps on first visit: `ensure_auto_harness` auto-generates `{index}_auto` (20 synthetic self-queries, k=10, brute-force ground truth) if missing, then seeds with `HnswConfig::default()` (ec=80/es=30). + - Regression fix: `harness::recall_at_k` now uses set-intersection semantics. The prior impl counted duplicates in `predicted` — on corpora with repeated chunks (`kb_response_cache_agent`) this inflated recall above 1.0 and poisoned promotion decisions. +7 unit tests. +- [x] Scheduled ingest: real cron parsing (2026-04-20) + - Vixie-compatible 5/6-field cron via `croner` crate. Day-of-week follows Unix convention (`1-5` = Mon-Fri). 6-field adds seconds granularity. + - `validate_trigger` in `ingestd::schedule` — create/patch handlers reject malformed expressions with `400 BAD_REQUEST` at creation time, not silently at fire time. 
+ - Swapped away from the `cron` crate (0.16) which uses a non-Unix DOW convention (`1=Sun`) that would silently bite anyone writing `1-5` expecting weekdays. +9 unit tests. +- [x] EvalSets federation (2026-04-20) + - `harness::HarnessStore` mirrors the TrialJournal / PromotionRegistry federation pattern: eval artifacts colocate with each index's recorded bucket; legacy evals in primary remain discoverable via a fallback path; cross-bucket listing dedupes. + - Every eval callsite (service.rs × 5, agent.rs × 3, autotune.rs × 1) now routes through `HarnessStore`. `VectorState` and `AgentDeps` each hold a shared instance. +- [x] Index bucket-migrate PATCH (2026-04-20) + - `PATCH /vectors/indexes/{name}/bucket` copies an index's vector parquet + trial-journal batches + promotion file + auto-harness to `dest_bucket`, flips `IndexMeta.bucket` as the commit point, and evicts the `EmbeddingCache` so next load reads from the new bucket. Optional `delete_source: true` sweeps source artifacts. + - Lance-backed indexes refused with 400 — Lance URIs are bucket-specific and require rewriting the dataset, separate story. Round-trip verified: 390 artifacts, 0.04s. +- [x] IVF_PQ recall tuning (2026-04-20) + - `LanceVectorStore::search` now accepts optional `nprobes` + `refine_factor`. Lance's built-in `nprobes=1` default was the hidden cap on recall — on 316-partition `resumes_100k_v2` it searched only 0.3% of partitions per query. + - Server defaults (`LANCE_DEFAULT_NPROBES=20`, `LANCE_DEFAULT_REFINE_FACTOR=5`) flow through the scoped-search path and the autotune harness. Measured on `resumes_100k_v2`: recall `0.805 → 1.000` at p50 ≈ 7.4ms. Even `nprobes=5, refine=5` saturates recall at p50 ≈ 4.7ms. + - `/vectors/lance/recall/{idx}` accepts per-request `nprobes` / `refine_factor` so operators can sweep the curve. +- [x] **Phase 19: Playbook memory (meta-index)** — the feedback loop originally implied by the PRD but never built. Playbooks stop being write-only; they start shaping future rankings. (2026-04-20) + - [x] 19.1 — `POST /vectors/playbook_memory/rebuild` scans `successful_playbooks` via DataFusion, builds one `PlaybookEntry` per row (operation + approach + context embedded as one vector via nomic-embed-text) + - [x] 19.2 — Brute-force cosine search over in-memory embeddings (chosen over HNSW: successful_playbooks maxes around thousands of rows, overhead of a second indexed surface isn't worth it until that ceiling bites) + - [x] 19.3 — Endorsed names parsed out of `result` column, keyed by `(city, state, name)` tuple so shared names across cities don't cross-pollinate. Parsing via `parse_names` + `parse_city_state` helpers (7 unit tests) + - [x] 19.4 — `/vectors/hybrid?use_playbook_memory=true`: fetches `top_k * 5` candidates so endorsed workers outside the vanilla top-K can still climb. Boost is additive on vector score, each hit carries `playbook_boost` + `playbook_citations` in the response for explainability + - [x] 19.5 — Multi-agent orchestrator (`tests/multi-agent/orchestrator.ts`) auto-seeds `POST /vectors/playbook_memory/seed` on consensus_done, so the next query sees the new endorsement without a full `/rebuild`. Closes the feedback loop: two agents reach consensus → playbook sealed → next query re-ranks + - [x] 19.6 — `MAX_BOOST_PER_WORKER = 0.25` enforced in `compute_boost_for`; verified with unit test (100 identical playbooks → boost capped at 0.25) and live test (5 identical seeds → exactly 0.25). 
Time decay deferred as optional + - Real finding surfaced during build: the 32 bootstrap rows in `successful_playbooks` reference phantom worker names — 80 of 82 don't correspond to actual rows in `workers_500k`. `/seed` endpoint bypasses `successful_playbooks` so operators can prime memory with real fixtures; production path is the orchestrator write-through +- [x] **Phase 19 refinement — geo + role prefilter on boost** (2026-04-21) + - Added `compute_boost_for_filtered` and `compute_boost_for_filtered_with_role` to `playbook_memory.rs`. SQL filter's `(city, state, role)` parsed in `service.rs`; exact role-matches in target geo skip cosine and earn similarity=1.0. Restored the feedback loop: matched=0 → matched=11 per query on the same Nashville test. Citation density on Riverfront Steel: 2 → 28 per run (14×). + - Rust unit tests: `extractor_tests::extract_target_geo_basic/_missing_state/_word_boundary`, `extract_target_role_basic/_none/_multi_word`. 6/6 pass. + - Diagnostic log: `playbook_boost: boosts=N sources=N parsed=N matched=N target_geo=? target_role=?` on every call. +- [x] **Phase 20: Model Matrix + Overseer Tiers** (2026-04-21) + - `config/models.json` — 5 tiers (t1_hot / t2_review / t3_overview / t4_strategic / t5_gatekeeper), each with context_window + context_budget + overflow_policy. Ollama Cloud bearer key from `/root/llm_team_config.json`. + - Hot path: qwen3.5:latest + qwen3:latest local with `think:false`. Mistral dropped after 0/14 fill on complex scenarios. + - T3 cloud: gpt-oss:120b via Ollama Cloud — verified 4-8s latency, strict JSON-shape output for remediation. +- [x] **Phase 21: Scratchpad + Tree-Split Continuation** (2026-04-21) + - `tests/multi-agent/agent.ts`: `estimateTokens()`, `assertContextBudget()`, `generateContinuable()`, `generateTreeSplit()`. `think` flag plumbed through sidecar's `/generate`. Empty-response backoff + truncation-continuation, no max_tokens tourniquet. + - Rust port queued: `crates/aibridge/src/continuation.rs`, `tree_split.rs`. +- [x] **Phase 22: Internal Knowledge Library** (2026-04-21) + - `data/_kb/` — signatures.jsonl, outcomes.jsonl, pathway_recommendations.jsonl, error_corrections.jsonl, config_snapshots.jsonl. Event-driven cycle: indexRun → recommendFor → loadRecommendation. + - Item B cloud rescue: failed event → cloud remediation JSON → retry with pivot. Verified 1/3 rescues succeeded on stress_01 (Gary IN → South Bend IN pivot). + - `scripts/kb_measure.py` aggregator. Unit tests: `kb.test.ts` — 4/4 pass (signature stability, role/city/count invariants, digest shape). +- [x] **Phase 23: Staffer identity + competence-weighted retrieval** (2026-04-21) + - ScenarioSpec gained `contract: ContractTerms` and `staffer: Staffer { id, name, tenure_months, role, tool_level }`. + - tool_level runtime overrides: full / local / basic / minimal. Basic + minimal route executor to Ollama Cloud `kimi-k2.5` (kimi-k2.6 pending pro-tier upgrade). + - `data/_kb/staffers.jsonl` — competence_score = 0.45·fill + 0.20·turn_eff + 0.20·cite + 0.15·rescue. Recomputed per run. + - `findNeighbors` now returns `weighted_score = cosine × max_staffer_competence`. `scripts/kb_staffer_report.py` — leaderboard + cross-staffer worker overlap (Rachel D. Lewis 12× across 4 staffers → auto-discovered high-value label). + - `gen_staffer_demo.ts` + `run_staffer_demo.sh` — 4 personas × 3 contracts = 12 runs. 
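The competence formula above is stated with its weights but not with the normalizations behind `turn_eff` and `cite`. A minimal TypeScript sketch of the per-run recompute (the shipped logic lives in `tests/multi-agent/kb.ts`; the weights and the 0..1 bound are as documented, while the turn-efficiency and citation-density normalizations below are assumptions chosen only for illustration):

```ts
// Sketch of the staffers.jsonl recompute — not the shipped kb.ts code.
interface StafferStats {
  total_runs: number;
  fill_rate: number;            // 0..1 — filled slots / requested slots
  avg_turns_per_event: number;  // lower is better
  avg_citations_per_run: number;
  rescue_rate: number;          // 0..1 — successful pivots / attempted rescues
}

function competenceScore(s: StafferStats): number {
  // Assumed normalizations: one turn per event is ideal; citation density
  // is treated as saturating around 10 citations per run.
  const turnEfficiency = Math.min(1, 1 / Math.max(1, s.avg_turns_per_event));
  const citationDensity = Math.min(1, s.avg_citations_per_run / 10);
  const score =
    0.45 * s.fill_rate +
    0.20 * turnEfficiency +
    0.20 * citationDensity +
    0.15 * s.rescue_rate;
  return Math.max(0, Math.min(1, score)); // bounded 0..1
}
```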
+- [ ] **Phase 24: Observer / Autotune integration** (GAP, not wired) + - `lakehouse-observer.service` watches MCP :3700; scenario.ts hits gateway :3100 directly. Observer idle at 0 ops across 3600+ cycles. Autotune runs on its own schedule, never sees scenario outcomes. + - Next-sprint: scenario emits per-event outcome summaries to observer's ingest path; observer ERROR_ANALYZER + PLAYBOOK_BUILDER loops consume them; autotune subscribes to the metric stream. +- [ ] Fine-tuned domain models (Phase 25+) +- [ ] Multi-node query distribution (only if ceilings bite) --- -**52+ unit tests | 13 crates | 19 ADRs | 2.47M rows | 100K vectors | Hybrid Parquet+HNSW ⊕ Lance** -**Latest: 2026-04-17 — 8 commits shipping Phase 16.2 through Phase 18** +**102 unit tests | 13 crates | 20 ADRs | 2.47M rows | 100K vectors | Hybrid Parquet+HNSW ⊕ Lance | Phase 19 refined + 20-23 shipped** +**Latest: 2026-04-21 — Phases 20-23 shipped. Geo+role prefilter lifted playbook citation density 14×. Cloud rescue converts zero-supply failures into successful pivots. Staffer competence weighting differentiates full-tool senior from minimal-tool trainee by 46.4pt fill rate on same contracts. Phase 24 observer integration flagged as honest gap.** diff --git a/docs/PRD.md b/docs/PRD.md index ab6e3aa..5b69914 100644 --- a/docs/PRD.md +++ b/docs/PRD.md @@ -380,17 +380,37 @@ Make successful playbooks actually improve future searches. Today `successful_pl - Hard guarantees about recall lift magnitude. "Measurably better on the demo query" is the gate, not a universal quality claim. - Real-time recomputation on every playbook. Batched refresh via the existing stale-marking path is sufficient. +### Phase 19 refinement (WIRED 2026-04-21): geo-filter + role prefilter on boost + +Item-3 diagnostic pass surfaced that `compute_boost_for` was ranking playbooks globally by cosine similarity, while candidates came from an SQL-filtered city. Result: boost map had 170 endorsed workers, 0 intersected the 50 Nashville-filtered candidates. Zero citations where there should have been dozens. + +Fix — in `crates/vectord/src/playbook_memory.rs`: +- `compute_boost_for_filtered(target_geo)` — skip playbooks from other cities before cosine sort. +- `compute_boost_for_filtered_with_role(target_geo, target_role)` — multi-strategy: exact (role, city, state) match earns similarity=1.0 and fills up to half the top_k; cosine fallback fills the rest. Mirrors Mem0/Zep 2026 guidance on parallel-strategy rerank. + +In `crates/vectord/src/service.rs`: +- `extract_target_geo` and `extract_target_role` pull both from the executor's SQL filter. +- `tracing::info!` emits `playbook_boost: boosts=N sources=N parsed=N matched=N target_geo=? target_role=?` on every hybrid_search. Silent-truncation class of bug now visible. + +Citation lift measured: avg citations per run 0.32 → 1.38 after geo filter; then 2 → 28 in the single-scenario Riverfront Steel re-run after role prefilter landed. 14× delta on same scenario. + +Unit tests: `extract_target_geo_basic`, `_missing_state_returns_none`, `_word_boundary` (rejects "civilian" substring), `extract_target_role_basic`, `_none_when_absent`, `_multi_word` — all pass (`cargo test -p vectord --lib extractor_tests`). + ### Phase 20: Model Matrix + Overseer Tiers (WIRED 2026-04-21) -Five-tier routing declared in `config/models.json`. Hot path (T1/T2) stays on local mistral + qwen2.5. Cloud is consulted sparingly for overview (T3 gpt-oss:120b), strategic (T4 qwen3.5:397b), and gatekeeper decisions (T5 kimi-k2-thinking). 
Every tier declares `context_window` + `context_budget` + `overflow_policy`. See ADR-021 (to add). +Five-tier routing declared in `config/models.json`. Hot path (T1/T2) stays local (qwen3.5 + qwen3 after mistral was dropped for 0/14 fill rate on complex scenarios). Cloud for overview (T3 gpt-oss:120b), strategic (T4 qwen3.5:397b), and gatekeeper (T5 kimi-k2-thinking). Every tier declares `context_window` + `context_budget` + `overflow_policy`. -- T1 hot: 50-200 calls/scenario, local only -- T2 review: 5-14 calls/event, local only -- T3 overview: 1-3 calls/scenario, cloud primary +- T1 hot: 50-200 calls/scenario, local only — `qwen3.5:latest` executor, `think:false` +- T2 review: 5-14 calls/event, local only — `qwen3:latest` reviewer, `think:false` +- T3 overview: 1-3 calls/scenario, cloud primary — `gpt-oss:120b` on Ollama Cloud, thinking on - T4 strategic: 1-10 calls/day, cloud primary - T5 gatekeeper: 1-5 calls/day, audit-logged -T3 checkpoints + cross-day lessons are wired. Lessons archive to `data/_playbook_lessons/` and load back at next scenario start as `prior_lessons` in executor context. +T3 checkpoints + cross-day lessons wired. Lessons archive to `data/_playbook_lessons/` and load back at next scenario start as `prior_lessons` in executor context. Cloud passthrough verified on stress_01 scenario with `LH_OVERVIEW_CLOUD=1` — `gpt-oss:120b` response latency consistently 4-8s, diagnosing city-pivot ("Gary IN → Chicago IL, 40mi") when target city has zero supply. + +`think:false` is the key mechanical finding — qwen3.5 burns ~650 tokens of hidden reasoning before emitting response; hot-path JSON emitters MUST disable thinking or continuation has to paper over empty returns. T3/T4 overseers KEEP thinking (that's the point). + +**Kimi-k2.6 upgrade path:** Current Ollama Cloud key returns 403 on kimi-k2.6 (`ollama run kimi-k2.6:cloud` requires `ollama signin` with pro-tier account). kimi-k2.5 substitutes on the current tier — same family, strong at tool calling. Swap to k2.6 is a one-line change in `applyToolLevel` once the subscription lands. ### Phase 21: Scratchpad + Tree-Split Continuation @@ -421,6 +441,16 @@ T3 checkpoints + cross-day lessons are wired. Lessons archive to `data/_playbook **Status:** TS primitives WIRED. Rust port pending. The escalation path (tree split → bigger-context cloud model → kimi-k2:1t's 1M window → split decision into sub-decisions) is declared in `config/models.json` under `context_management.overflow_policies`. +### Phase 21 status update (WIRED 2026-04-21 evening) + +Additional primitives landed after the initial commit: + +- **`think: boolean`** flag plumbed through `generate()`, `generateCloud()`, `generateContinuable()`, and into sidecar's `/generate` endpoint. Enables per-call opt-out of hidden reasoning for hot-path JSON emitters. Verified: qwen3.5 with `think:false` + `num_predict:400` returns clean `{"worker_id":...}` on first call; without `think:false`, 650 tokens eaten by reasoning, response empty. + +- **Cloud executor routing** — `ACTIVE_EXECUTOR_CLOUD` / `ACTIVE_REVIEWER_CLOUD` flags let per-staffer tool_level route executor to Ollama Cloud when weak local model (qwen2.5) would collapse. Verified on kimi-k2.5 via Ollama Cloud: clean JSON emission, think:false honored. + +Rust port of continuation + tree-split primitives remains queued for next sprint (`crates/aibridge/src/continuation.rs`, `tree_split.rs`). + ### Phase 22: Internal Knowledge Library (KB) Meta-layer over Phase 19 playbook_memory. 
Playbook memory answers "which WORKERS worked for this event." The KB answers "which CONFIG worked for this playbook signature." Subject changes from workers to the system itself — model choice, budget hints, overflow policies, pathway notes. @@ -451,9 +481,55 @@ Meta-layer over Phase 19 playbook_memory. Playbook memory answers "which WORKERS - budget_hints {executor_max_tokens, reviewer_max_tokens, executor_think} - pathway_notes (concrete pre-run advice) -**Status (WIRED 2026-04-21):** `tests/multi-agent/kb.ts` holds all primitives. scenario.ts reads rec at start, indexes + recommends at end. Cold start gracefully writes a "low confidence, no history" rec so the second run has a floor to build on. +**Status (WIRED 2026-04-21):** `tests/multi-agent/kb.ts` holds all primitives. scenario.ts reads rec at start, indexes + recommends at end. Cold start gracefully writes a "low confidence, no history" rec so the second run has a floor to build on. `snapshotConfig()` wired to fire at every scenario start — active model set + tool_level + cloud flags hashed and appended to `config_snapshots.jsonl`. -### Phase 23+: Further horizon +**Phase 22 item B — cloud rescue (WIRED):** When an event fails and cloud T3 is enabled, `requestCloudRemediation()` feeds the failure trace (SQL filters attempted, row counts, reviewer drift reasons, gap signals, contract terms) to cloud and parses a JSON remediation with new_city / new_state / new_role / new_count / rationale. Event retries once with the pivot. Verified 1/3 rescues succeeded on stress_01 (Gary IN → South Bend IN pivot filled a Welder that local drift-aborted). Sanitizer splits "City, ST" comma-packed outputs so downstream SQL doesn't get `Hammond, IN, IN`. + +### Phase 23: Staffer identity + competence-weighted retrieval (WIRED 2026-04-21) + +Answers "who handled this" as a first-class dimension of the matrix index. Senior staffers' playbooks rank higher than juniors' on similar scenarios via competence × similarity score. Auto-discovers "reliable performer" worker labels via cross-staffer endorsement overlap. + +**Schema (`scenario.ts` ScenarioSpec):** +- `contract?: ContractTerms` — deadline, budget_per_hour_max, local_bonus_per_hour, local_bonus_radius_mi, fill_requirement. Propagates into T3 checkpoint + cloud rescue prompts so cloud reasons about trade-offs (pivot-within-radius before budget-pivot-further). +- `staffer?: Staffer` — {id, name, tenure_months, role, tool_level}. tool_level controls subsystems available to this run: + - `full` — qwen3.5 + qwen3 local + cloud T3 + cloud rescue + - `local` — qwen3.5 + qwen3 local + local gpt-oss:20b T3 + rescue + - `basic` — **kimi-k2.5 cloud** exec + qwen3 local reviewer + local T3, no rescue + - `minimal` — kimi-k2.5 cloud exec + qwen3 local reviewer, NO T3, NO rescue — tests whether playbook inheritance carries knowledge alone + +**KB staffer indexing (`data/_kb/staffers.jsonl`):** +- Recomputed per-staffer on every run: total_runs, fill_rate, avg_turns_per_event, avg_citations_per_run, rescue_rate, competence_score. +- `competence_score = 0.45·fill_rate + 0.20·turn_efficiency + 0.20·citation_density + 0.15·rescue_rate`. Bounded 0..1. + +**Weighted neighbor retrieval:** +- `findNeighbors` in `kb.ts` returns `weighted_score = cosine × max_staffer_competence` (floor 0.3). Senior playbooks rank above junior playbooks on similar scenarios. +- `pathway_recommendations` include `best_staffer_id` / `best_staffer_competence` so cloud knows WHOSE playbook it's synthesizing from. 
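The weighting step itself is small; a TypeScript sketch of it follows (the record shapes are assumptions — only the `cosine × max_staffer_competence` product, the 0.3 competence floor, and the `best_staffer_id` / `best_staffer_competence` propagation come from the description above; the shipped `findNeighbors` in `kb.ts` is not part of this patch):

```ts
// Illustrative rerank — not the shipped kb.ts findNeighbors.
interface OutcomeNeighbor {
  signature: string;
  staffer_ids: string[]; // staffers who have run this signature (shape assumed)
}

interface WeightedNeighbor {
  signature: string;
  cosine: number;
  weighted_score: number;
  best_staffer_id?: string;
  best_staffer_competence?: number;
}

function weightNeighbors(
  candidates: { rec: OutcomeNeighbor; cosine: number }[],
  competenceById: Map<string, number>,
  topK = 5,
): WeightedNeighbor[] {
  return candidates
    .map(({ rec, cosine }) => {
      // Most competent staffer behind this neighbor — their score scales it.
      let bestId: string | undefined;
      let best = 0;
      for (const id of rec.staffer_ids) {
        const c = competenceById.get(id) ?? 0;
        if (c > best) { best = c; bestId = id; }
      }
      const floored = Math.max(0.3, best); // unknown staffers stay retrievable
      return {
        signature: rec.signature,
        cosine,
        weighted_score: cosine * floored,
        best_staffer_id: bestId,
        best_staffer_competence: bestId ? best : undefined,
      };
    })
    .sort((a, b) => b.weighted_score - a.weighted_score)
    .slice(0, topK);
}
```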
+ +**Cross-staffer auto-discovery:** +- `scripts/kb_staffer_report.py` emits leaderboard + workers endorsed across ≥2 staffers on same signature. +- Validated output: Rachel D. Lewis (Welder Nashville) endorsed 12× across 4 staffers; Christina Watson (Machine Op Indianapolis) 11×. These are the highest-confidence "reliable performer" labels the system produced without human tagging. + +**Demo infrastructure:** +- `tests/multi-agent/gen_staffer_demo.ts` — 4 personas × 3 contracts = 12 scenario specs. +- `scripts/run_staffer_demo.sh` — sequential batch with cloud T3. +- `scripts/kb_staffer_report.py` — leaderboard + top/bottom differential + cross-staffer overlap. + +### Phase 24: Observer / Autotune integration (NOT YET WIRED — honest gap) + +J flagged this 2026-04-21 evening: the `lakehouse-observer.service` systemd unit has been running for 3600+ cycles but shows `total_ops=0 successes=0 failures=0` because `tests/multi-agent/scenario.ts` hits the Rust gateway directly on port 3100, bypassing the Bun MCP layer on 3700 that observer wraps. + +Result: our test scenarios are INVISIBLE to the observer and the autotune pipeline. Autotune's HNSW parameter learning runs on its own schedule, but no signal from scenario outcomes flows into it. + +**Target architecture:** +- Scenarios emit per-event outcome summaries to a path the observer polls (or POST to observer's ingest endpoint directly). +- Observer's ERROR ANALYZER + PLAYBOOK BUILDER loops consume those summaries alongside the MCP-layer ops. +- Autotune agent subscribes to a metric stream the observer writes. + +**Why deferred:** this is a real architecture change (coherent data path from scenario → observer → autotune → vectord index) and needs care. The observer's current `observed_operations` ingest uses REPLACE semantics (flagged in `feedback_ingest_replace_semantics.md`) — naive appending will wipe prior ops. + +**Status:** GAP DOCUMENTED, not fixed. Scenarios continue to populate KB directly. The parallel pipelines are coherent but separate; Phase 24 connects them. + +### Phase 25+: Further horizon - Specialized fine-tuned models per domain (staffing matcher, resume parser) - Video/audio transcript ingest + multimodal embeddings diff --git a/tests/multi-agent/kb.test.ts b/tests/multi-agent/kb.test.ts new file mode 100644 index 0000000..58d5c24 --- /dev/null +++ b/tests/multi-agent/kb.test.ts @@ -0,0 +1,51 @@ +import { test, expect } from "bun:test"; +import { computeSignature, specDigest } from "./kb.ts"; + +// kb signature invariants — required so the KB's retrieval layer +// doesn't silently drift when we add fields to ScenarioSpec. 
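// ---------------------------------------------------------------------------
// Sketch (not part of this patch): the tests below pin down the contract of
// computeSignature without its kb.ts implementation being shown. Under the
// assumption that the signature hashes only the client plus each event's
// (kind, role, count, city, state) tuple — ignoring date, contract, and
// staffer — it could look roughly like:
//
//   import { createHash } from "node:crypto";
//
//   export function computeSignature(spec: {
//     client: string;
//     events: { kind: string; role: string; count: number; city: string; state: string }[];
//   }): string {
//     const canonical = {
//       client: spec.client,
//       events: spec.events.map((e) => [e.kind, e.role, e.count, e.city, e.state]),
//     };
//     return createHash("sha256").update(JSON.stringify(canonical)).digest("hex");
//   }
//
// Anything outside that canonical subset leaves the hash unchanged, which is
// what the first test asserts; role, city, and count changes flip it.
// ---------------------------------------------------------------------------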
+ +test("computeSignature is stable across reorderings of unrelated fields", () => { + const a = { + client: "Acme Corp", + events: [ + { kind: "baseline_fill", role: "Welder", count: 3, city: "Toledo", state: "OH" }, + ], + }; + const b = { ...a, date: "2026-05-01", contract: { deadline: "2026-05-15" } } as any; + const c = { ...a, staffer: { id: "S-1", name: "X", tenure_months: 10, role: "senior" } } as any; + const sigA = computeSignature(a); + const sigB = computeSignature(b); + const sigC = computeSignature(c); + expect(sigA).toBe(sigB); + expect(sigA).toBe(sigC); +}); + +test("computeSignature changes when role changes", () => { + const base = { client: "Acme", events: [{ kind: "baseline_fill", role: "Welder", count: 3, city: "Toledo", state: "OH" }] }; + const swapped = { client: "Acme", events: [{ kind: "baseline_fill", role: "Electrician", count: 3, city: "Toledo", state: "OH" }] }; + expect(computeSignature(base)).not.toBe(computeSignature(swapped)); +}); + +test("computeSignature changes when city or count changes", () => { + const base = { client: "A", events: [{ kind: "baseline_fill", role: "Welder", count: 3, city: "Toledo", state: "OH" }] }; + const cityChange = { ...base, events: [{ ...base.events[0], city: "Detroit", state: "MI" }] }; + const countChange = { ...base, events: [{ ...base.events[0], count: 5 }] }; + expect(computeSignature(base)).not.toBe(computeSignature(cityChange)); + expect(computeSignature(base)).not.toBe(computeSignature(countChange)); +}); + +test("specDigest includes each event's role + city", () => { + const spec = { + client: "Acme", + events: [ + { kind: "baseline_fill", role: "Welder", count: 3, city: "Toledo", state: "OH" }, + { kind: "emergency", role: "Loader", count: 2, city: "Chicago", state: "IL" }, + ], + }; + const digest = specDigest(spec); + expect(digest).toContain("Acme"); + expect(digest).toContain("Welder"); + expect(digest).toContain("Toledo,OH"); + expect(digest).toContain("Loader"); + expect(digest).toContain("Chicago,IL"); +}); diff --git a/tests/multi-agent/scenario.ts b/tests/multi-agent/scenario.ts index 3fd5dc6..06f14da 100644 --- a/tests/multi-agent/scenario.ts +++ b/tests/multi-agent/scenario.ts @@ -35,7 +35,8 @@ import { reviewerPrompt, GATEWAY, } from "./agent.ts"; -import { indexRun, recommendFor, loadRecommendation, type PathwayRecommendation } from "./kb.ts"; +import { indexRun, recommendFor, loadRecommendation, snapshotConfig, type PathwayRecommendation } from "./kb.ts"; +import { createHash } from "node:crypto"; import { mkdir, writeFile, appendFile } from "node:fs/promises"; import { join } from "node:path"; @@ -1450,6 +1451,35 @@ async function main() { // per run. If no staffer or no tool_level, defaults hold. applyToolLevel(spec.staffer?.tool_level); + // Phase 22 — record the config snapshot each run gets. Lets the + // error_corrections detector diff configs between fail→succeed pairs + // and gives the KB a receipt of what was active for any given + // outcome. Hash computed over the active model set + tool_level so + // the same staffer running back-to-back with no config change + // doesn't clutter the file. + try { + const activeModels = { + executor: ACTIVE_EXECUTOR, + reviewer: ACTIVE_REVIEWER, + overview: OVERVIEW_MODEL, + executor_cloud: String(ACTIVE_EXECUTOR_CLOUD), + overview_cloud: String(ACTIVE_OVERVIEW_CLOUD), + t3_disabled: String(ACTIVE_T3_DISABLED), + tool_level: spec.staffer?.tool_level ?? 
"default", + }; + const configHash = createHash("sha256") + .update(JSON.stringify(activeModels)) + .digest("hex") + .slice(0, 16); + await snapshotConfig( + configHash, + activeModels, + `scenario_start ${spec.client} ${spec.date} staffer=${spec.staffer?.id ?? "none"}`, + ); + } catch (e) { + console.log(` (config snapshot skipped: ${(e as Error).message})`); + } + console.log(`▶ scenario: ${spec.client}, ${spec.date}, ${spec.events.length} events`); if (spec.staffer) { const level = spec.staffer.tool_level ?? "(default)";