From 25b7e6c3a782c11b22216baa7833628394d6383f Mon Sep 17 00:00:00 2001 From: root Date: Mon, 20 Apr 2026 06:21:13 -0500 Subject: [PATCH] Phase 19 wiring + Path 1/2 work + chain integrity fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backend: - crates/vectord/src/playbook_memory.rs (new): Phase 19 in-memory boost store with seed/rebuild/snapshot, plus temporal decay (e^-age/30 per playbook), persist_to_sql endpoint backing successful_playbooks_live, and discover_patterns endpoint for meta-index pattern aggregation (recurring certs/skills/archetype/reliability across similar past fills). - DEFAULT_TOP_K_PLAYBOOKS bumped 5 → 25; old default silently missed most boosts when memory had > 25 entries. - service.rs: new routes /vectors/playbook_memory/{seed,rebuild,stats, persist_sql,patterns}. Bun staffing co-pilot (mcp-server/): - /search, /match, /verify, /proof, /simulation/run, MCP tools all forward use_playbook_memory:true and playbook_memory_k:25 to the hybrid endpoint. Boost was previously dark across the entire app. - /log no longer POSTs to /ingest/file — that endpoint REPLACES the dataset's object list, so single-row CSV writes were wiping all prior rows in successful_playbooks (sp_rows went 33→1 in one /log call). /log now seeds playbook_memory with canonical short text and calls /persist_sql to keep successful_playbooks_live in sync. - /simulation/run cumulative end-of-week CSV write removed for the same reason. Per-day per-contract /seed (added in this session) is the accumulating feedback path now. - search.html addWorkerInsight renders a green "Endorsed · N playbooks" chip with playbook citations when boost > 0. Internal Dioxus UI (crates/ui/): - Dashboard phase list rewritten through Phase 19 (was stuck at "Phase 16: File Watcher" / "Phase 17: DB Connector" — both wrong). - Removed fabricated "27ms" stat label. 
- Ask tab examples + SQL default replaced with real staffing prompts against candidates/clients/job_orders (was referencing nonexistent employees/products/events). - New Playbook tab exposes /vectors/playbook_memory/{stats,rebuild} and side-by-side hybrid search (boost OFF vs ON) with citations. Tests (tests/multi-agent/): - run_e2e_rated.ts: parallel two-agent (mistral + qwen2.5) build phase + verifier rating (geo, auth, persist, boost, speed → /10). - network_proving.ts: continuous build → verify → repeat with staffing-recruiter profile hot-swap; geo-discrimination check. - chain_of_custody.ts: single recruiter operation traced through every layer (Bun /search, direct /vectors/hybrid parity, /log, SQL, playbook_memory growth, profile activation, post-op boost lift). --- crates/ui/assets/style.css | 10 + crates/ui/src/main.rs | 461 +++++++++++++- crates/vectord/src/lib.rs | 1 + crates/vectord/src/playbook_memory.rs | 825 ++++++++++++++++++++++++++ crates/vectord/src/service.rs | 519 +++++++++++++++- mcp-server/index.ts | 140 ++++- mcp-server/search.html | 15 +- tests/multi-agent/agent.ts | 351 +++++++++++ tests/multi-agent/chain_of_custody.ts | 335 +++++++++++ tests/multi-agent/network_proving.ts | 469 +++++++++++++++ tests/multi-agent/orchestrator.ts | 302 ++++++++++ tests/multi-agent/run_e2e_rated.ts | 400 +++++++++++++ tests/multi-agent/scenario.ts | 822 +++++++++++++++++++++++++ 13 files changed, 4566 insertions(+), 84 deletions(-) create mode 100644 crates/vectord/src/playbook_memory.rs create mode 100644 tests/multi-agent/agent.ts create mode 100644 tests/multi-agent/chain_of_custody.ts create mode 100644 tests/multi-agent/network_proving.ts create mode 100644 tests/multi-agent/orchestrator.ts create mode 100644 tests/multi-agent/run_e2e_rated.ts create mode 100644 tests/multi-agent/scenario.ts diff --git a/crates/ui/assets/style.css b/crates/ui/assets/style.css index 1764800..85bac9c 100644 --- a/crates/ui/assets/style.css +++ b/crates/ui/assets/style.css 
@@ -205,3 +205,13 @@ tr:hover td { background: var(--accent-glow); } padding: 8px 12px; border-bottom: 1px solid var(--border); font-size: 13px; } .table-item:hover { background: var(--accent-glow); } + +/* Phase 19 — Playbook panel */ +.boosted-row { background: rgba(120, 200, 120, 0.10); } +.boosted-row td { border-top: 1px solid rgba(120, 200, 120, 0.30); } +.mono-cell { + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + font-size: 11px; color: var(--text-dim); + max-width: 220px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; +} +.panel-section + .panel-section { margin-top: 18px; } diff --git a/crates/ui/src/main.rs b/crates/ui/src/main.rs index 24d8c35..53c9ad2 100644 --- a/crates/ui/src/main.rs +++ b/crates/ui/src/main.rs @@ -178,9 +178,116 @@ enum Tab { Explore, Sql, Ingest, + Playbook, Status, } +// --- Playbook memory types (Phase 19) --- + +#[derive(Debug, Clone, Deserialize, PartialEq)] +struct PlaybookStats { + entries: usize, + entries_with_embeddings: usize, + #[serde(default)] + total_names_endorsed: usize, + #[serde(default)] + sample: Vec, +} + +#[derive(Debug, Clone, Deserialize, PartialEq)] +struct PlaybookSample { + id: String, + operation: String, + #[serde(default)] + city: Option, + #[serde(default)] + state: Option, + #[serde(default)] + endorsed: Vec, +} + +#[derive(Debug, Clone, Deserialize, PartialEq)] +struct HybridResp { + #[serde(default)] + sql_matches: usize, + #[serde(default)] + vector_reranked: usize, + #[serde(default)] + method: String, + #[serde(default)] + duration_ms: u64, + #[serde(default)] + answer: Option, + #[serde(default)] + sources: Vec, +} + +#[derive(Debug, Clone, Deserialize, PartialEq)] +struct HybridSource { + doc_id: String, + chunk_text: String, + score: f32, + #[serde(default)] + sql_verified: bool, + #[serde(default)] + playbook_boost: f32, + #[serde(default)] + playbook_citations: Vec, +} + +#[derive(Debug, Clone, Deserialize, PartialEq)] +struct IndexInfo { + index_name: 
String, + source: String, + #[serde(default)] + chunk_count: usize, + #[serde(default)] + vector_backend: String, +} + +async fn fetch_playbook_stats() -> Result { + let resp = reqwest::get(&format!("{}/vectors/playbook_memory/stats", api_base())) + .await.map_err(|e| e.to_string())?; + if !resp.status().is_success() { + return Err(format!("HTTP {}: {}", resp.status(), resp.text().await.unwrap_or_default())); + } + resp.json().await.map_err(|e| e.to_string()) +} + +async fn rebuild_playbook_memory() -> Result { + let client = reqwest::Client::new(); + let resp = client.post(&format!("{}/vectors/playbook_memory/rebuild", api_base())) + .json(&serde_json::json!({})) + .send().await.map_err(|e| e.to_string())?; + if !resp.status().is_success() { + return Err(format!("HTTP {}: {}", resp.status(), resp.text().await.unwrap_or_default())); + } + resp.json().await.map_err(|e| e.to_string()) +} + +async fn fetch_indexes() -> Result, String> { + let resp = reqwest::get(&format!("{}/vectors/indexes", api_base())) + .await.map_err(|e| e.to_string())?; + resp.json().await.map_err(|e| e.to_string()) +} + +async fn hybrid_search(index_name: &str, question: &str, use_playbook: bool, top_k: usize) -> Result { + let client = reqwest::Client::new(); + let resp = client.post(&format!("{}/vectors/hybrid", api_base())) + .json(&serde_json::json!({ + "index_name": index_name, + "question": question, + "top_k": top_k, + "generate": false, + "use_playbook_memory": use_playbook, + })) + .send().await.map_err(|e| e.to_string())?; + if !resp.status().is_success() { + return Err(format!("HTTP {}: {}", resp.status(), resp.text().await.unwrap_or_default())); + } + resp.json().await.map_err(|e| e.to_string()) +} + // --- App --- #[component] @@ -239,6 +346,11 @@ fn App() -> Element { onclick: move |_| active_tab.set(Tab::Ingest), "Ingest" } + button { + class: if *active_tab.read() == Tab::Playbook { "tab active" } else { "tab" }, + onclick: move |_| active_tab.set(Tab::Playbook), + "Playbook" + 
} button { class: if *active_tab.read() == Tab::Status { "tab active" } else { "tab" }, onclick: move |_| active_tab.set(Tab::Status), @@ -260,6 +372,7 @@ fn App() -> Element { Tab::Explore => rsx! { ExplorePanel { datasets: datasets.read().clone() } }, Tab::Sql => rsx! { SqlPanel {} }, Tab::Ingest => rsx! { IngestPanel {} }, + Tab::Playbook => rsx! { PlaybookPanel {} }, Tab::Status => rsx! { StatusPanel {} }, } } @@ -354,14 +467,14 @@ fn AskPanel(datasets: Vec) -> Element { div { class: "panel ask-panel", div { class: "ask-hero", h2 { "Ask your data anything" } - p { class: "subtitle", "Natural language → SQL → Results. Powered by local AI." } + p { class: "subtitle", "Natural language → SQL → Results. Powered by local AI over the staffing dataset." } } div { class: "ask-input-row", input { class: "ask-input", value: "{question}", - placeholder: "e.g. Which department has the highest average salary?", + placeholder: "e.g. Which clients placed the most candidates last quarter?", oninput: move |e| question.set(e.value()), onkeydown: move |e| { if e.key() == Key::Enter { @@ -432,10 +545,12 @@ fn AskPanel(datasets: Vec) -> Element { div { class: "ask-examples", "Try: " - button { class: "example-btn", onclick: move |_| question.set("Which department has the highest average salary?".into()), "highest avg salary by dept" } - button { class: "example-btn", onclick: move |_| question.set("Show me the top 3 most expensive products".into()), "top 3 expensive products" } - button { class: "example-btn", onclick: move |_| question.set("How many events per action type?".into()), "events by action" } - button { class: "example-btn", onclick: move |_| question.set("List all employees who earn more than 90000".into()), "employees > 90k" } + button { class: "example-btn", onclick: move |_| question.set("How many candidates do we have by city?".into()), "candidates by city" } + button { class: "example-btn", onclick: move |_| question.set("Top 10 clients by total 
placements".into()), "top clients by placements" } + button { class: "example-btn", onclick: move |_| question.set("Open job orders ordered by bill rate descending".into()), "open jobs by rate" } + button { class: "example-btn", onclick: move |_| question.set("Recruiters with the highest placement count".into()), "top recruiters" } + button { class: "example-btn", onclick: move |_| question.set("Total billed hours per client last month".into()), "hours per client" } + button { class: "example-btn", onclick: move |_| question.set("Cold leads: candidates we called more than 5 times but never placed".into()), "cold leads" } } if let Some(sql) = generated_sql.read().as_ref() { @@ -578,7 +693,7 @@ fn ExplorePanel(datasets: Vec) -> Element { #[component] fn SqlPanel() -> Element { - let mut query_text = use_signal(|| String::from("SELECT * FROM employees LIMIT 10")); + let mut query_text = use_signal(|| String::from("SELECT candidate_id, first_name, last_name, city, status FROM candidates LIMIT 10")); let mut result = use_signal(|| None::>); let mut loading = use_signal(|| false); @@ -727,7 +842,7 @@ fn DashboardPanel() -> Element { } div { class: "stat-card accent", div { class: "stat-value", "{s[\"hnsw_loaded\"]}" } - div { class: "stat-label", "HNSW Indexes (27ms)" } + div { class: "stat-label", "HNSW Indexes Loaded" } } div { class: "stat-card", div { class: "stat-value", "{s[\"tools\"]}" } @@ -750,27 +865,27 @@ fn DashboardPanel() -> Element { div { class: "arch-grid", div { class: "arch-card", div { class: "arch-title", "Ingest" } - div { class: "arch-items", "CSV, JSON, PDF, Text, PostgreSQL, File Watcher" } + div { class: "arch-items", "CSV · JSON · PDF (+OCR) · Text · Postgres · MySQL · Inbox watcher · Cron schedules" } } div { class: "arch-card", div { class: "arch-title", "Storage" } - div { class: "arch-items", "Parquet on Object Storage, Delta Writes, Compaction" } + div { class: "arch-items", "Parquet on Object Storage · Delta writes · Compaction · 
Tombstones · Multi-bucket federation + rescue" } } div { class: "arch-card", div { class: "arch-title", "Query" } - div { class: "arch-items", "DataFusion SQL, MemCache (9.8x), Hot/Cold" } + div { class: "arch-items", "DataFusion SQL · MemCache (9.8× hot) · Merge-on-read · AI-safe views" } } div { class: "arch-card", - div { class: "arch-title", "AI" } - div { class: "arch-items", "Ollama (local), Embed, Generate, RAG, HNSW" } + div { class: "arch-title", "AI / Vector" } + div { class: "arch-items", "Ollama (local) · Embed/Generate/RAG · HNSW (Parquet) · Lance IVF_PQ · Hybrid SQL+vector · Profile-scoped" } + } + div { class: "arch-card", + div { class: "arch-title", "Learning loop" } + div { class: "arch-items", "Playbook memory · Endorsement boost · Multi-agent orchestrator · Autotune agent (Pareto-promote)" } } div { class: "arch-card", div { class: "arch-title", "Governance" } - div { class: "arch-items", "Event Journal, PII Detection, Tool Registry, Access Control" } - } - div { class: "arch-card", - div { class: "arch-title", "Agents" } - div { class: "arch-items", "Workspaces, Handoff, Shortlists, Activity Logs" } + div { class: "arch-items", "Event journal · PII detection · Tool registry · Access control · Audit log · Catalog v2 metadata" } } } } @@ -779,20 +894,23 @@ fn DashboardPanel() -> Element { h3 { "Build Progression" } div { class: "phase-list", {rsx! 
{ - PhaseItem { num: "0-5", name: "Foundation", detail: "Storage, Catalog, DataFusion, AI, UI, gRPC" } - PhaseItem { num: "6", name: "Ingest Pipeline", detail: "CSV/JSON/PDF/Text auto-schema" } - PhaseItem { num: "7", name: "Vector + RAG", detail: "Embed, Search, LLM Answers" } - PhaseItem { num: "8", name: "Hot Cache", detail: "9.8x speedup, Delta Writes" } - PhaseItem { num: "8.5", name: "Agent Workspaces", detail: "Per-contract, Instant Handoff" } - PhaseItem { num: "9", name: "Event Journal", detail: "Append-only Mutation History" } - PhaseItem { num: "10", name: "Rich Catalog", detail: "PII Detection, Lineage" } - PhaseItem { num: "11", name: "Embedding Versioning", detail: "Model-proof Vectors" } - PhaseItem { num: "12", name: "Tool Registry", detail: "6 Governed Actions + Audit" } - PhaseItem { num: "13", name: "Access Control", detail: "Role-based, Field-level" } - PhaseItem { num: "14", name: "Schema Evolution", detail: "Diff Detection, AI Migration" } - PhaseItem { num: "15", name: "HNSW Index", detail: "100K Search in 27ms" } - PhaseItem { num: "16", name: "File Watcher", detail: "Auto-ingest from Inbox" } - PhaseItem { num: "17", name: "DB Connector", detail: "PostgreSQL Import" } + PhaseItem { num: "0-5", name: "Foundation", detail: "Storage · Catalog · DataFusion · Ollama · UI · gRPC" } + PhaseItem { num: "6", name: "Ingest Pipeline", detail: "CSV · JSON · PDF · Text · auto-schema · dedupe" } + PhaseItem { num: "7", name: "Vector + RAG", detail: "Embed · brute-force cosine · LLM grounded answers" } + PhaseItem { num: "8", name: "Hot Cache + Deltas", detail: "MemTable LRU · 9.8× speedup · merge-on-read · compaction" } + PhaseItem { num: "8.5", name: "Agent Workspaces", detail: "Per-contract · daily/weekly/monthly tiers · zero-copy handoff" } + PhaseItem { num: "9", name: "Event Journal", detail: "Append-only mutation log · time-travel · audit" } + PhaseItem { num: "10", name: "Rich Catalog v2", detail: "PII auto-detection · lineage · freshness SLA · 
sensitivity" } + PhaseItem { num: "11", name: "Embedding Versioning", detail: "Per-index model+version · A/B · incremental re-embed" } + PhaseItem { num: "12", name: "Tool Registry", detail: "Governed actions · param validation · audit · MCP-ready" } + PhaseItem { num: "13", name: "Access Control", detail: "Roles · field-level sensitivity · column masking · query audit" } + PhaseItem { num: "14", name: "Schema Evolution", detail: "Diff detection · AI migration prompts · versioned schemas" } + PhaseItem { num: "15", name: "HNSW + Trials", detail: "100K vectors · p50 873µs · trial journal · eval harness" } + PhaseItem { num: "16", name: "Hot-swap + Autotune", detail: "Promotion registry · rollback · ε-greedy agent · Pareto winner" } + PhaseItem { num: "17", name: "Model Profiles + VRAM", detail: "ModelProfile manifests · scoped search · sequential model swap" } + PhaseItem { num: "18", name: "Lance hybrid backend", detail: "IVF_PQ build 14× faster · random fetch 112× · S3-native · per-profile routing" } + PhaseItem { num: "19", name: "Playbook memory", detail: "Feedback loop · endorsement boost (cap 0.25) · orchestrator write-through · citations" } + PhaseItem { num: "+", name: "Federation + Schedules", detail: "Multi-bucket · rescue fallback · error journal · MySQL · PDF OCR · cron ingest · catalog dedupe" } }} } } @@ -816,6 +934,285 @@ fn PhaseItem(num: String, name: String, detail: String) -> Element { } } +// === PLAYBOOK — Phase 19 meta-index feedback loop === + +#[component] +fn PlaybookPanel() -> Element { + let mut stats = use_signal(|| None::>); + let mut indexes = use_signal(Vec::::new); + let mut rebuild_status = use_signal(|| None::>); + let mut rebuilding = use_signal(|| false); + let mut loaded = use_signal(|| false); + + // Comparison state + let mut selected_index = use_signal(|| String::new()); + let mut question = use_signal(|| String::from("reliable assembler in Detroit")); + let mut top_k = use_signal(|| 10usize); + let mut compare_loading = 
use_signal(|| false); + let mut hits_off = use_signal(|| None::>); + let mut hits_on = use_signal(|| None::>); + + let load_all = move || { + spawn(async move { + stats.set(Some(fetch_playbook_stats().await)); + if let Ok(ix) = fetch_indexes().await { + if selected_index.read().is_empty() { + if let Some(default) = ix.iter().find(|i| i.source == "workers_500k").or_else(|| ix.first()) { + selected_index.set(default.index_name.clone()); + } + } + indexes.set(ix); + } + }); + }; + + use_effect(move || { + if !*loaded.read() { + loaded.set(true); + load_all(); + } + }); + + let do_rebuild = move |_| { + spawn(async move { + rebuilding.set(true); + rebuild_status.set(None); + match rebuild_playbook_memory().await { + Ok(v) => rebuild_status.set(Some(Ok(format!("rebuild ok — {}", v)))), + Err(e) => rebuild_status.set(Some(Err(e))), + } + // Refresh stats afterward + stats.set(Some(fetch_playbook_stats().await)); + rebuilding.set(false); + }); + }; + + let do_compare = move |_| { + let idx = selected_index.read().clone(); + let q = question.read().clone(); + let k = *top_k.read(); + if idx.is_empty() || q.trim().is_empty() { return; } + spawn(async move { + compare_loading.set(true); + hits_off.set(None); + hits_on.set(None); + // Run both sequentially so the embedding cache is shared + hits_off.set(Some(hybrid_search(&idx, &q, false, k).await)); + hits_on.set(Some(hybrid_search(&idx, &q, true, k).await)); + compare_loading.set(false); + }); + }; + + rsx! { + div { class: "panel", + div { class: "ask-hero", + h2 { "Playbook Memory" } + p { class: "subtitle", + "Phase 19 feedback loop: past successful playbooks boost future search rankings. \ + Endorsed workers from semantically similar past operations re-rank toward the top, \ + with citations back to the playbook that endorsed them." + } + } + + // Stats card + div { class: "panel-section", + match stats.read().as_ref() { + None => rsx! { div { class: "loading", "loading playbook stats..." } }, + Some(Err(e)) => rsx! 
{ div { class: "error", "stats: {e}" } }, + Some(Ok(s)) => rsx! { + div { class: "stat-grid", + div { class: "stat-card", + div { class: "stat-value", "{s.entries}" } + div { class: "stat-label", "Playbooks in Memory" } + } + div { class: "stat-card", + div { class: "stat-value", "{s.entries_with_embeddings}" } + div { class: "stat-label", "Embedded" } + } + div { class: "stat-card accent", + div { class: "stat-value", "{s.total_names_endorsed}" } + div { class: "stat-label", "Endorsed Worker-Tags" } + } + } + } + } + div { class: "sql-actions", + button { + class: "btn", + disabled: *rebuilding.read(), + onclick: do_rebuild, + if *rebuilding.read() { "rebuilding from successful_playbooks..." } else { "Rebuild from successful_playbooks" } + } + } + if let Some(s) = rebuild_status.read().as_ref() { + match s { + Ok(msg) => rsx! { div { class: "result-box", "{msg}" } }, + Err(e) => rsx! { div { class: "error", "{e}" } }, + } + } + } + + // Sample playbooks + if let Some(Ok(s)) = stats.read().as_ref() { + if !s.sample.is_empty() { + div { class: "panel-section", + h3 { "Sample playbooks" } + div { class: "table-wrap", + table { + thead { tr { + th { "ID" } + th { "Operation" } + th { "Location" } + th { "Endorsed" } + } } + tbody { + for pb in s.sample.iter() { + { + let loc = match (&pb.city, &pb.state) { + (Some(c), Some(st)) => format!("{c}, {st}"), + _ => "—".into(), + }; + let endorsed = if pb.endorsed.is_empty() { + "—".to_string() + } else { + pb.endorsed.join(", ") + }; + let pid = pb.id.clone(); + let op = pb.operation.clone(); + rsx! { + tr { + td { class: "mono-cell", title: "{pid}", "{pid}" } + td { "{op}" } + td { "{loc}" } + td { "{endorsed}" } + } + } + } + } + } + } + } + } + } + } + + // Side-by-side comparison: boost OFF vs ON + div { class: "panel-section", + h3 { "See the boost — search compared" } + p { class: "hint", + "Run the same query against the same index twice — once with playbook boost OFF and once ON. 
\ + Hits with non-zero playbook_boost and citations are workers that past similar playbooks endorsed." + } + div { class: "form-row", + label { "Index" } + select { + value: "{selected_index}", + onchange: move |e| selected_index.set(e.value()), + for ix in indexes.read().iter() { + option { value: "{ix.index_name}", "{ix.index_name} ({ix.source}, {ix.chunk_count} chunks, {ix.vector_backend})" } + } + } + } + div { class: "form-row", + label { "Question" } + input { + value: "{question}", + oninput: move |e| question.set(e.value()), + placeholder: "e.g. reliable assembler in Detroit" + } + } + div { class: "form-row", + label { "Top K" } + input { + r#type: "number", + value: "{top_k}", + oninput: move |e| { + if let Ok(n) = e.value().parse::() { top_k.set(n.clamp(1, 50)); } + } + } + } + button { + class: "btn btn-ask", + disabled: *compare_loading.read(), + onclick: do_compare, + if *compare_loading.read() { "running both queries..." } else { "Run comparison" } + } + + div { class: "explore-grid", + div { class: "ds-detail", + h3 { "Boost OFF (vanilla)" } + match hits_off.read().as_ref() { + None => rsx! { div { class: "empty", "—" } }, + Some(Err(e)) => rsx! { div { class: "error", "{e}" } }, + Some(Ok(r)) => rsx! { HybridHitTable { resp: r.clone() } }, + } + } + div { class: "ds-detail", + h3 { "Boost ON (Phase 19)" } + match hits_on.read().as_ref() { + None => rsx! { div { class: "empty", "—" } }, + Some(Err(e)) => rsx! { div { class: "error", "{e}" } }, + Some(Ok(r)) => rsx! { HybridHitTable { resp: r.clone() } }, + } + } + } + } + } + } +} + +#[component] +fn HybridHitTable(resp: HybridResp) -> Element { + rsx! 
{ + div { class: "results-info", + "{resp.sources.len()} hits · {resp.duration_ms}ms · method={resp.method}" + } + if resp.sources.is_empty() { + div { class: "empty-sm", "no hits" } + } else { + div { class: "table-wrap", + table { + thead { tr { + th { "#" } + th { "Doc" } + th { "Score" } + th { "Boost" } + th { "Citations" } + th { "Snippet" } + } } + tbody { + for (i, h) in resp.sources.iter().enumerate() { + { + let snippet: String = h.chunk_text.chars().take(120).collect(); + let cites = if h.playbook_citations.is_empty() { + "—".to_string() + } else { + h.playbook_citations.join(", ") + }; + let row_class = if h.playbook_boost > 0.0 { "boosted-row" } else { "" }; + let rank = i + 1; + let did = h.doc_id.clone(); + let score = format!("{:.3}", h.score); + let boost = if h.playbook_boost > 0.0 { format!("+{:.3}", h.playbook_boost) } else { "—".into() }; + rsx! { + tr { class: "{row_class}", + td { "{rank}" } + td { class: "mono-cell", "{did}" } + td { "{score}" } + td { "{boost}" } + td { class: "mono-cell", title: "{cites}", "{cites}" } + td { "{snippet}" } + } + } + } + } + } + } + } + } + } +} + // === INGEST — Data on-ramp === #[component] diff --git a/crates/vectord/src/lib.rs b/crates/vectord/src/lib.rs index e72c41e..0f0be40 100644 --- a/crates/vectord/src/lib.rs +++ b/crates/vectord/src/lib.rs @@ -7,6 +7,7 @@ pub mod harness; pub mod hnsw; pub mod index_registry; pub mod jobs; +pub mod playbook_memory; pub mod promotion; pub mod refresh; pub mod store; diff --git a/crates/vectord/src/playbook_memory.rs b/crates/vectord/src/playbook_memory.rs new file mode 100644 index 0000000..4a5a380 --- /dev/null +++ b/crates/vectord/src/playbook_memory.rs @@ -0,0 +1,825 @@ +//! Phase 19: Playbook memory — the feedback loop that makes the index +//! learn from real outcomes instead of just logging them. +//! +//! When an agent (multi-agent orchestrator or human operator) seals a +//! successful playbook, it lands in the `successful_playbooks` dataset. +//! 
Historically that was a write-only log. This module turns it into a +//! re-ranking signal: +//! +//! 1. `rebuild` reads every row of `successful_playbooks`, embeds the +//! operation+approach+context as one vector per playbook, parses +//! out the worker names from the `result` column, and stores both +//! the vectors and the (playbook → names) endorsement map in memory. +//! +//! 2. At query time, `compute_boost_for` takes the embedding of a new +//! operation text (e.g. "fill: Welder x2 in Toledo, OH"), brute-force +//! ranks past playbooks by cosine similarity, and returns a boost +//! map keyed by (city, state, worker_name) → `BoostEntry`. Each +//! entry carries its accumulated boost and the citing playbook_ids, +//! so explanations ("ranked higher because of 3 similar past fills +//! in Toledo") are free. +//! +//! 3. The `use_playbook_memory` flag on `/vectors/hybrid` adds those +//! boosts to matching search hits and re-sorts. +//! +//! Why brute force instead of another HNSW: `successful_playbooks` grows +//! by operators, not automation. A few thousand rows is the realistic +//! ceiling for years. Brute force at 10K × 768d is <10ms on this hardware +//! — not worth the operational cost of another indexed surface. +//! +//! Persistence: the full in-memory state round-trips through +//! `_playbook_memory/state.json` in primary storage so the cache +//! survives restarts without a full rebuild. + +use std::collections::HashMap; +use std::sync::Arc; +use serde::{Deserialize, Serialize}; +use tokio::sync::RwLock; + +use aibridge::client::{AiClient, EmbedRequest}; +use object_store::ObjectStore; +use storaged::ops; + +const STATE_KEY: &str = "_playbook_memory/state.json"; + +/// Maximum boost a single worker can accumulate across all similar past +/// playbooks. Prevents one very popular worker from always winning. +pub const MAX_BOOST_PER_WORKER: f32 = 0.25; + +/// Default number of past playbooks to consider when ranking the current +/// operation. 
Bumped 5 → 25 on 2026-04-20 because at >100 entries in +/// memory the old default missed too many relevant playbooks — boost +/// silently failed even when the seeded workers were ideal matches. +/// 25 is brute-force-cheap (sub-ms) and covers most live operator memory. +pub const DEFAULT_TOP_K_PLAYBOOKS: usize = 25; + +/// Decay time constant (days) for a playbook's boost contribution. Weight is +/// exp(-age_days / BOOST_HALF_LIFE_DAYS): despite the name this is an e-folding +/// time, not a half-life — 30 days old ≈ 37% of fresh, 60 days ≈ 14%. Path 1 +/// (deepen statistical): stale endorsements shouldn't dominate fresh signal. Recruiter trust depends on this. +pub const BOOST_HALF_LIFE_DAYS: f32 = 30.0; + +/// Shape of one playbook in memory. The embedding is optional so we can +/// round-trip a cached state without re-embedding; the rebuild path +/// populates it. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PlaybookEntry { + pub playbook_id: String, + pub operation: String, + pub approach: String, + pub context: String, + pub timestamp: String, + /// Parsed out of `result` (e.g. "2/2 filled → Matthew Roberts, Amy Davis"). + /// Stored as raw names; matching against search results happens on + /// (city, state, name) tuples at boost time. + pub endorsed_names: Vec, + /// City + state parsed out of the operation string. Kept separately + /// so boost matching doesn't re-parse on every query. + pub city: Option, + pub state: Option, + /// Embedding of `operation + approach + context`. Option so persisted + /// state can omit it on first load and have a later embed() fill in. + #[serde(default)] + pub embedding: Option>, +} + +/// Persisted / in-memory state. +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +struct PlaybookMemoryState { + entries: Vec, + /// Unix epoch millis when the last rebuild completed. Caller can + /// use this to gate "stale > N hours → trigger rebuild" behavior. + last_rebuilt_at: i64, +} + +/// Per-worker boost payload. 
`citations` lets the response layer show +/// "boosted because of these past fills" without a second lookup. +#[derive(Debug, Clone, Serialize)] +pub struct BoostEntry { + pub boost: f32, + pub citations: Vec, // playbook_ids that endorsed this worker +} + +/// Live handle passed around the service. Clone-cheap (all state is +/// inside one Arc). +#[derive(Clone)] +pub struct PlaybookMemory { + state: Arc>, + store: Arc, +} + +impl PlaybookMemory { + pub fn new(store: Arc) -> Self { + Self { + state: Arc::new(RwLock::new(PlaybookMemoryState::default())), + store, + } + } + + /// Best-effort load from primary storage. Missing = empty memory; the + /// first `/rebuild` call will hydrate it. + pub async fn load_from_storage(&self) -> Result { + let data = match ops::get(&self.store, STATE_KEY).await { + Ok(d) => d, + Err(_) => return Ok(0), + }; + let persisted: PlaybookMemoryState = serde_json::from_slice(&data) + .map_err(|e| format!("parse playbook_memory state: {e}"))?; + let n = persisted.entries.len(); + *self.state.write().await = persisted; + tracing::info!("playbook_memory: loaded {n} entries from {STATE_KEY}"); + Ok(n) + } + + async fn persist(&self) -> Result<(), String> { + let snapshot = self.state.read().await.clone(); + let bytes = serde_json::to_vec_pretty(&snapshot).map_err(|e| e.to_string())?; + ops::put(&self.store, STATE_KEY, bytes.into()).await + } + + /// Replace the full in-memory state atomically and persist. 
+ pub async fn set_entries(&self, entries: Vec) -> Result<(), String> { + let mut s = self.state.write().await; + s.entries = entries; + s.last_rebuilt_at = chrono::Utc::now().timestamp_millis(); + drop(s); + self.persist().await + } + + pub async fn entry_count(&self) -> usize { + self.state.read().await.entries.len() + } + + pub async fn snapshot(&self) -> Vec { + self.state.read().await.entries.clone() + } + + /// Given an operation's embedding, find the top-K most similar past + /// playbooks (by cosine similarity) and return a per-worker boost map + /// keyed by (city, state, name). Worker is matched by the tuple so a + /// shared name across cities doesn't cross-pollinate. + /// + /// Boost formula: each qualifying playbook contributes + /// `similarity * base_weight * decay / n_workers` to each worker it endorsed + /// (`decay` is the age-based temporal decay factor), where `base_weight` is + /// tuned to keep the cap realistic without forcing every result to saturate. + /// Total per worker is capped at `MAX_BOOST_PER_WORKER`. + pub async fn compute_boost_for( + &self, + query_embedding: &[f32], + top_k_playbooks: usize, + base_weight: f32, + ) -> HashMap<(String, String, String), BoostEntry> { + let entries = self.state.read().await.entries.clone(); + + // Brute-force cosine. Empty / missing embeddings just skip. + let mut scored: Vec<(f32, &PlaybookEntry)> = entries + .iter() + .filter_map(|e| e.embedding.as_ref().map(|v| (cosine(query_embedding, v), e))) + .collect(); + scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal)); + scored.truncate(top_k_playbooks.max(1)); + + let now = chrono::Utc::now(); + let mut boosts: HashMap<(String, String, String), BoostEntry> = HashMap::new(); + for (similarity, pb) in &scored { + // Negative or near-zero similarity = not actually related; + // skip so we don't inject noise when the memory is sparse. 
+ if *similarity <= 0.05 { continue; } + let Some(city) = &pb.city else { continue; }; + let Some(state) = &pb.state else { continue; }; + let n_workers = pb.endorsed_names.len().max(1); + // Path 1 — temporal decay. Older playbooks weight less. Failure + // to parse the timestamp degrades to "no decay" (treat as fresh) + // rather than dropping the entry entirely; keeps backward + // compatibility with seed payloads that omitted timestamp. + let decay = chrono::DateTime::parse_from_rfc3339(&pb.timestamp) + .ok() + .map(|t| { + let age_days = (now.signed_duration_since(t.with_timezone(&chrono::Utc)) + .num_seconds() as f32) / 86400.0; + if age_days <= 0.0 { 1.0 } + else { (-age_days / BOOST_HALF_LIFE_DAYS).exp() } + }) + .unwrap_or(1.0); + let per_worker = similarity * base_weight * decay / (n_workers as f32); + for name in &pb.endorsed_names { + let key = (city.clone(), state.clone(), name.clone()); + let entry = boosts.entry(key).or_insert(BoostEntry { + boost: 0.0, + citations: Vec::new(), + }); + entry.boost = (entry.boost + per_worker).min(MAX_BOOST_PER_WORKER); + if !entry.citations.contains(&pb.playbook_id) { + entry.citations.push(pb.playbook_id.clone()); + } + } + } + boosts + } +} + +/// Cosine similarity — pulled out so rebuild/boost share one impl. +fn cosine(a: &[f32], b: &[f32]) -> f32 { + let (mut dot, mut na, mut nb) = (0.0_f32, 0.0_f32, 0.0_f32); + let n = a.len().min(b.len()); + for i in 0..n { + dot += a[i] * b[i]; + na += a[i] * a[i]; + nb += b[i] * b[i]; + } + if na == 0.0 || nb == 0.0 { return 0.0; } + dot / (na.sqrt() * nb.sqrt()) +} + +// ---------------- Pattern discovery (Path 2 — meta-index) ---------------- +// +// Phase 19's boost path answers "for THIS exact city + role, which workers +// have we used before?" Pattern discovery answers a different question: +// "for queries like this one, what TRAITS have past successful fills had +// in common — even if no exact prior playbook covers this geo?" 
+// +// The discovered pattern surfaces signals the operator didn't query for: +// e.g. "every successful Welder fill we've seen carried OSHA-10 + lockout +// /tagout — you may want to filter on those." That's the meta-index +// dimension of the original PRD: identify things we didn't know about. + +#[derive(Debug, Clone, Serialize)] +pub struct PatternReport { + pub query: String, + pub matched_playbooks: usize, + pub total_workers_examined: usize, + pub common_certifications: Vec, + pub common_skills: Vec, + pub modal_archetype: Option, + pub reliability_p50: f64, + pub reliability_min: f64, + pub reliability_max: f64, + pub matched_playbook_ids: Vec, + pub discovered_pattern: String, + pub duration_secs: f32, +} + +#[derive(Debug, Clone, Serialize)] +pub struct TraitFreq { + pub name: String, + pub count: usize, + pub frequency: f32, +} + +pub async fn discover_patterns( + memory: &PlaybookMemory, + ai_client: &AiClient, + catalog: &catalogd::registry::Registry, + buckets: &Arc, + query: &str, + top_k_playbooks: usize, + min_trait_frequency: f32, +) -> Result { + let t0 = std::time::Instant::now(); + + // 1. Embed the query through the same nomic-embed-text model used + // for playbook embeddings, so cosine is meaningful. + let resp = ai_client + .embed(EmbedRequest { texts: vec![query.into()], model: None }) + .await + .map_err(|e| format!("embed query: {e}"))?; + if resp.embeddings.is_empty() { + return Err("embed returned no vectors".into()); + } + let qv: Vec = resp.embeddings[0].iter().map(|x| *x as f32).collect(); + + // 2. Find top-K most similar past playbooks (cosine over embeddings). 
+ let entries = memory.snapshot().await; + let mut scored: Vec<(f32, &PlaybookEntry)> = entries + .iter() + .filter_map(|e| e.embedding.as_ref().map(|v| (cosine(&qv, v), e))) + .collect(); + scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal)); + scored.truncate(top_k_playbooks); + let matched: Vec<(f32, PlaybookEntry)> = scored + .into_iter() + .filter(|(s, _)| *s > 0.05) + .map(|(s, e)| (s, e.clone())) + .collect(); + + if matched.is_empty() { + return Ok(PatternReport { + query: query.into(), + matched_playbooks: 0, + total_workers_examined: 0, + common_certifications: vec![], + common_skills: vec![], + modal_archetype: None, + reliability_p50: 0.0, reliability_min: 0.0, reliability_max: 0.0, + matched_playbook_ids: vec![], + discovered_pattern: "No similar past playbooks found.".into(), + duration_secs: t0.elapsed().as_secs_f32(), + }); + } + + // 3. Pull each endorsed worker's full profile from workers_500k. + // Restrict by (name, city, state) tuple so cross-city homonyms + // don't pollute the aggregate. 
+ let mut conditions: Vec = Vec::new(); + let mut matched_ids: Vec = Vec::new(); + for (_, pb) in &matched { + matched_ids.push(pb.playbook_id.clone()); + let (Some(city), Some(state)) = (pb.city.as_ref(), pb.state.as_ref()) else { continue }; + for name in &pb.endorsed_names { + let esc = |s: &str| s.replace('\'', "''"); + conditions.push(format!( + "(name = '{}' AND city = '{}' AND state = '{}')", + esc(name), esc(city), esc(state) + )); + } + } + if conditions.is_empty() { + return Ok(PatternReport { + query: query.into(), + matched_playbooks: matched.len(), + total_workers_examined: 0, + common_certifications: vec![], common_skills: vec![], + modal_archetype: None, reliability_p50: 0.0, + reliability_min: 0.0, reliability_max: 0.0, + matched_playbook_ids: matched_ids, + discovered_pattern: "Matched playbooks but no endorsed names with city/state to lookup.".into(), + duration_secs: t0.elapsed().as_secs_f32(), + }); + } + + let sql = format!( + "SELECT name, role, city, state, certifications, skills, archetype, \ + CAST(reliability AS DOUBLE) as reliability \ + FROM workers_500k WHERE {} LIMIT 500", + conditions.join(" OR ") + ); + let engine = queryd::context::QueryEngine::new( + catalog.clone(), buckets.clone(), queryd::cache::MemCache::new(0), + ); + let batches = engine.query(&sql).await.map_err(|e| format!("worker lookup: {e}"))?; + + // 4. Aggregate. Pipe-separated cert/skill lists, single-string archetype, + // numeric reliability. Frequencies are share-of-workers. 
+ use arrow::array::{Array, AsArray}; + let mut cert_counts: HashMap = HashMap::new(); + let mut skill_counts: HashMap = HashMap::new(); + let mut arch_counts: HashMap = HashMap::new(); + let mut reliabilities: Vec = Vec::new(); + let mut total = 0usize; + + let get_string = |b: &arrow::record_batch::RecordBatch, col: &str, row: usize| -> String { + let Some(c) = b.column_by_name(col) else { return String::new(); }; + if let Some(arr) = c.as_string_view_opt() { + if arr.is_null(row) { return String::new(); } + return arr.value(row).to_string(); + } + if let Some(arr) = c.as_string_opt::() { + if arr.is_null(row) { return String::new(); } + return arr.value(row).to_string(); + } + String::new() + }; + let get_f64 = |b: &arrow::record_batch::RecordBatch, col: &str, row: usize| -> f64 { + let Some(c) = b.column_by_name(col) else { return 0.0; }; + if let Some(arr) = c.as_primitive_opt::() { + if arr.is_null(row) { return 0.0; } + return arr.value(row); + } + 0.0 + }; + + for b in &batches { + for row in 0..b.num_rows() { + total += 1; + let certs = get_string(b, "certifications", row); + for c in certs.split(['|', ',']).map(|s| s.trim()).filter(|s| !s.is_empty() && *s != "none") { + *cert_counts.entry(c.to_string()).or_insert(0) += 1; + } + let skills = get_string(b, "skills", row); + for s in skills.split(['|', ',']).map(|s| s.trim()).filter(|s| !s.is_empty()) { + *skill_counts.entry(s.to_string()).or_insert(0) += 1; + } + let arch = get_string(b, "archetype", row); + if !arch.is_empty() { + *arch_counts.entry(arch).or_insert(0) += 1; + } + let rel = get_f64(b, "reliability", row); + if rel > 0.0 { reliabilities.push(rel); } + } + } + + let total_f = total.max(1) as f32; + let to_freq = |m: HashMap, min: f32| -> Vec { + let mut v: Vec = m.into_iter() + .map(|(name, count)| TraitFreq { name, count, frequency: count as f32 / total_f }) + .filter(|t| t.frequency >= min) + .collect(); + v.sort_by(|a, b| b.count.cmp(&a.count)); + v.truncate(8); + v + }; + let 
common_certifications = to_freq(cert_counts, min_trait_frequency); + let common_skills = to_freq(skill_counts, min_trait_frequency); + let modal_archetype = arch_counts.into_iter() + .max_by_key(|(_, c)| *c) + .map(|(name, _)| name); + + reliabilities.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + let p50 = if reliabilities.is_empty() { 0.0 } else { reliabilities[reliabilities.len() / 2] }; + let rmin = reliabilities.first().copied().unwrap_or(0.0); + let rmax = reliabilities.last().copied().unwrap_or(0.0); + + // Build a human-readable discovered-pattern summary + let mut parts: Vec = vec![ + format!("Across {} similar past playbooks ({} workers examined)", matched.len(), total), + ]; + if !common_certifications.is_empty() { + let head: Vec = common_certifications.iter().take(3) + .map(|t| format!("{} ({:.0}%)", t.name, t.frequency * 100.0)).collect(); + parts.push(format!("recurring certifications: {}", head.join(", "))); + } + if !common_skills.is_empty() { + let head: Vec = common_skills.iter().take(3) + .map(|t| format!("{} ({:.0}%)", t.name, t.frequency * 100.0)).collect(); + parts.push(format!("recurring skills: {}", head.join(", "))); + } + if let Some(a) = &modal_archetype { parts.push(format!("archetype mostly: {a}")); } + if !reliabilities.is_empty() { + parts.push(format!("reliability median {:.2} (range {:.2}–{:.2})", p50, rmin, rmax)); + } + let discovered_pattern = parts.join(" · "); + + Ok(PatternReport { + query: query.into(), + matched_playbooks: matched.len(), + total_workers_examined: total, + common_certifications, common_skills, + modal_archetype, reliability_p50: p50, + reliability_min: rmin, reliability_max: rmax, + matched_playbook_ids: matched_ids, + discovered_pattern, + duration_secs: t0.elapsed().as_secs_f32(), + }) +} + +// ---------------- Persist memory → SQL (Path 2 foundation) ---------------- + +#[derive(Debug, Clone, Serialize)] +pub struct PersistReport { + pub rows_persisted: usize, + pub dataset_name: 
String, + pub fingerprint: String, + pub duration_secs: f32, +} + +/// Dump current in-memory state to a queryable Parquet under +/// `successful_playbooks_live`. Registers fresh objects each call — safe +/// because in-memory state is the source of truth here, so REPLACING the +/// objects list reflects the real state, not destroying it. +/// +/// Distinct from the existing `successful_playbooks` dataset (which is +/// read by `rebuild()`), so this never collides with operator imports of +/// historical playbook data. Recruiter-facing SQL surfaces should query +/// `successful_playbooks_live` for current operator activity. +pub async fn persist_to_sql( + memory: &PlaybookMemory, + catalog: &catalogd::registry::Registry, +) -> Result { + use arrow::array::StringArray; + use arrow::datatypes::{DataType, Field, Schema}; + use arrow::record_batch::RecordBatch; + + let t0 = std::time::Instant::now(); + let entries = memory.snapshot().await; + + let schema = Arc::new(Schema::new(vec![ + Field::new("timestamp", DataType::Utf8, true), + Field::new("operation", DataType::Utf8, true), + Field::new("approach", DataType::Utf8, true), + Field::new("result", DataType::Utf8, true), + Field::new("context", DataType::Utf8, true), + ])); + + let timestamps: Vec<&str> = entries.iter().map(|e| e.timestamp.as_str()).collect(); + let operations: Vec<&str> = entries.iter().map(|e| e.operation.as_str()).collect(); + let approaches: Vec<&str> = entries.iter().map(|e| e.approach.as_str()).collect(); + let contexts: Vec<&str> = entries.iter().map(|e| e.context.as_str()).collect(); + // Result column is reconstructed from endorsed_names so SQL queries + // against successful_playbooks_live see the same shape as the original + // CSV-fed successful_playbooks ("N/N filled → Name1, Name2"). 
+ let results: Vec = entries.iter().map(|e| { + if e.endorsed_names.is_empty() { + String::new() + } else { + let n = e.endorsed_names.len(); + format!("{}/{} filled → {}", n, n, e.endorsed_names.join(", ")) + } + }).collect(); + let result_refs: Vec<&str> = results.iter().map(|s| s.as_str()).collect(); + + let batch = RecordBatch::try_new(schema.clone(), vec![ + Arc::new(StringArray::from(timestamps)), + Arc::new(StringArray::from(operations)), + Arc::new(StringArray::from(approaches)), + Arc::new(StringArray::from(result_refs)), + Arc::new(StringArray::from(contexts)), + ]).map_err(|e| format!("build record batch: {e}"))?; + + let parquet_bytes = shared::arrow_helpers::record_batch_to_parquet(&batch)?; + let fp = shared::arrow_helpers::fingerprint_schema(&schema); + + let key = "datasets/successful_playbooks_live.parquet"; + ops::put(&memory.store, key, parquet_bytes.clone()).await?; + + let obj = shared::types::ObjectRef { + bucket: "primary".into(), + key: key.into(), + size_bytes: parquet_bytes.len() as u64, + created_at: chrono::Utc::now(), + }; + + let manifest = catalog.register( + "successful_playbooks_live".into(), + fp.clone(), + vec![obj], + ).await?; + + Ok(PersistReport { + rows_persisted: entries.len(), + dataset_name: manifest.name, + fingerprint: fp.0, + duration_secs: t0.elapsed().as_secs_f32(), + }) +} + +// ---------------- Rebuild (the core of Phase 19) ---------------- + +#[derive(Debug, Clone, Serialize)] +pub struct RebuildReport { + pub rows_scanned: usize, + pub entries_built: usize, + pub total_names_endorsed: usize, + pub duration_secs: f32, +} + +/// Full rebuild: scan `successful_playbooks`, extract endorsements, embed +/// each row's operation+approach+context, replace the in-memory state. +/// +/// Returns the report so callers can show operators what happened. 
+pub async fn rebuild( + memory: &PlaybookMemory, + ai_client: &AiClient, + catalog: &catalogd::registry::Registry, + buckets: &Arc, +) -> Result { + let t0 = std::time::Instant::now(); + + // 1. Pull every row of successful_playbooks through the query engine. + let sql = "SELECT timestamp, operation, approach, result, context \ + FROM successful_playbooks"; + let engine = queryd::context::QueryEngine::new( + catalog.clone(), + buckets.clone(), + queryd::cache::MemCache::new(0), + ); + let batches = engine + .query(sql) + .await + .map_err(|e| format!("query successful_playbooks: {e}"))?; + + let mut rows: Vec<(String, String, String, String, String)> = Vec::new(); + for b in &batches { + let n = b.num_rows(); + let get = |col: &str, row: usize| -> String { + use arrow::array::{Array, AsArray}; + let Some(c) = b.column_by_name(col) else { return String::new(); }; + if let Some(arr) = c.as_string_view_opt() { + if arr.is_null(row) { return String::new(); } + return arr.value(row).to_string(); + } + if let Some(arr) = c.as_string_opt::() { + if arr.is_null(row) { return String::new(); } + return arr.value(row).to_string(); + } + String::new() + }; + for row in 0..n { + rows.push(( + get("timestamp", row), + get("operation", row), + get("approach", row), + get("result", row), + get("context", row), + )); + } + } + let rows_scanned = rows.len(); + + // 2. For each row, build a PlaybookEntry (no embedding yet). Parse + // the operation for (city, state) and the result for names. + let mut entries: Vec = rows + .into_iter() + .map(|(ts, op, approach, result, ctx)| { + let (city, state) = parse_city_state(&op); + let names = parse_names(&result); + PlaybookEntry { + playbook_id: stable_id(&ts, &op), + operation: op, + approach, + context: ctx, + timestamp: ts, + endorsed_names: names, + city, + state, + embedding: None, + } + }) + .collect(); + + // 3. Embed in one batch. Sidecar's embed handles batching internally; + // chunk here to ~64 per request to keep memory flat. 
+ const EMBED_BATCH: usize = 64; + for chunk_start in (0..entries.len()).step_by(EMBED_BATCH) { + let end = (chunk_start + EMBED_BATCH).min(entries.len()); + let texts: Vec = entries[chunk_start..end] + .iter() + .map(embed_text) + .collect(); + let req = EmbedRequest { texts, model: None }; + let resp = ai_client + .embed(req) + .await + .map_err(|e| format!("embed batch [{chunk_start}..{end}]: {e}"))?; + for (i, v) in resp.embeddings.iter().enumerate() { + let f32v: Vec = v.iter().map(|&x| x as f32).collect(); + entries[chunk_start + i].embedding = Some(f32v); + } + } + + let total_names_endorsed: usize = entries.iter().map(|e| e.endorsed_names.len()).sum(); + let entries_built = entries.len(); + + memory.set_entries(entries).await?; + + Ok(RebuildReport { + rows_scanned, + entries_built, + total_names_endorsed, + duration_secs: t0.elapsed().as_secs_f32(), + }) +} + +fn embed_text(e: &PlaybookEntry) -> String { + // Compact one-liner per playbook. Excludes timestamp (no semantic + // signal) and includes the fills as words (they're occasionally + // meaningful — "Luis Harris" might semantically correlate with + // Spanish-speaker names in future queries). + format!( + "{} | {} | {} | fills: {}", + e.operation, + e.approach, + e.context, + e.endorsed_names.join(", "), + ) +} + +/// Derive a stable id from (timestamp, operation). Two playbooks with +/// identical timestamp+operation collapse to one — benign dedup. +fn stable_id(ts: &str, op: &str) -> String { + use sha2::{Digest, Sha256}; + let mut h = Sha256::new(); + h.update(ts.as_bytes()); + h.update(b"|"); + h.update(op.as_bytes()); + let bytes = h.finalize(); + format!("pb-{}", hex_short(&bytes, 12)) +} + +fn hex_short(b: &[u8], n: usize) -> String { + let mut s = String::with_capacity(n * 2); + for byte in &b[..b.len().min(n)] { + s.push_str(&format!("{byte:02x}")); + } + s +} + +/// Parse "fill: Welder x2 in Toledo, OH" → ("Toledo", "OH"). +/// Returns None for malformed operations. 
+fn parse_city_state(op: &str) -> (Option, Option) { + // Split on " in " then parse "City, ST" + let after_in = match op.split(" in ").nth(1) { + Some(s) => s, + None => return (None, None), + }; + let parts: Vec<&str> = after_in.splitn(2, ',').collect(); + if parts.len() != 2 { + return (None, None); + } + let city = parts[0].trim().to_string(); + // state might be followed by more context; take leading alpha chars + let state: String = parts[1].trim() + .chars() + .take_while(|c| c.is_ascii_alphabetic()) + .collect(); + if city.is_empty() || state.is_empty() { + return (None, None); + } + (Some(city), Some(state)) +} + +/// Parse "2/2 filled → Matthew Roberts, Amy Davis" → ["Matthew Roberts", "Amy Davis"]. +fn parse_names(result: &str) -> Vec { + // Everything after the arrow; split on ", ". + let after_arrow = match result.split('→').nth(1) { + Some(s) => s.trim(), + None => return Vec::new(), + }; + // Strip trailing noise like "(and N more)" that some emitters add. + let cleaned = after_arrow.split(" (").next().unwrap_or(after_arrow); + cleaned + .split(',') + .map(|n| n.trim().to_string()) + .filter(|n| !n.is_empty()) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_city_state_extracts_both() { + let (c, s) = parse_city_state("fill: Welder x2 in Toledo, OH"); + assert_eq!(c.as_deref(), Some("Toledo")); + assert_eq!(s.as_deref(), Some("OH")); + } + + #[test] + fn parse_city_state_handles_multiword_city() { + let (c, s) = parse_city_state("fill: Loader x1 in Grand Rapids, MI"); + assert_eq!(c.as_deref(), Some("Grand Rapids")); + assert_eq!(s.as_deref(), Some("MI")); + } + + #[test] + fn parse_city_state_malformed_returns_none() { + let (c, s) = parse_city_state("fill: something weird"); + assert!(c.is_none()); + assert!(s.is_none()); + } + + #[test] + fn parse_names_extracts_after_arrow() { + let ns = parse_names("2/2 filled → Matthew Roberts, Amy Davis"); + assert_eq!(ns, vec!["Matthew Roberts".to_string(), "Amy 
Davis".to_string()]); + } + + #[test] + fn parse_names_handles_single_fill() { + let ns = parse_names("1/1 filled → Jose Reed"); + assert_eq!(ns, vec!["Jose Reed".to_string()]); + } + + #[test] + fn parse_names_handles_no_arrow() { + let ns = parse_names("0/2 filled"); + assert!(ns.is_empty()); + } + + #[test] + fn stable_id_is_deterministic() { + let a = stable_id("2026-04-20T00:00:00Z", "fill: Welder x2 in Toledo, OH"); + let b = stable_id("2026-04-20T00:00:00Z", "fill: Welder x2 in Toledo, OH"); + assert_eq!(a, b); + assert!(a.starts_with("pb-")); + } + + #[test] + fn boost_caps_per_worker() { + // Even with 100 similar playbooks all endorsing the same name, the + // boost never exceeds MAX_BOOST_PER_WORKER. + let pm = PlaybookMemory::new(Arc::new(object_store::memory::InMemory::new())); + let entries: Vec = (0..100) + .map(|i| PlaybookEntry { + playbook_id: format!("pb-{i}"), + operation: "fill: Welder x1 in Toledo, OH".into(), + approach: "transfer".into(), + context: "".into(), + timestamp: "2026-04-20".into(), + endorsed_names: vec!["Deborah Powell".into()], + city: Some("Toledo".into()), + state: Some("OH".into()), + embedding: Some(vec![1.0, 0.0, 0.0]), + }) + .collect(); + tokio::runtime::Runtime::new().unwrap().block_on(async { + pm.set_entries(entries).await.unwrap(); + let boosts = pm.compute_boost_for(&[1.0, 0.0, 0.0], 100, 0.5).await; + let key = ("Toledo".into(), "OH".into(), "Deborah Powell".into()); + let entry = boosts.get(&key).expect("boost entry present"); + assert!(entry.boost <= MAX_BOOST_PER_WORKER + 1e-6, + "boost {} exceeded cap {}", entry.boost, MAX_BOOST_PER_WORKER); + }); + } +} diff --git a/crates/vectord/src/service.rs b/crates/vectord/src/service.rs index 5bd6366..565c747 100644 --- a/crates/vectord/src/service.rs +++ b/crates/vectord/src/service.rs @@ -12,7 +12,7 @@ use std::sync::Arc; use aibridge::client::{AiClient, EmbedRequest, GenerateRequest}; use catalogd::registry::Registry as CatalogRegistry; use 
storaged::registry::BucketRegistry; -use crate::{agent, autotune, chunker, embedding_cache, harness, hnsw, index_registry, jobs, lance_backend, promotion, rag, refresh, search, store, supervisor, trial}; +use crate::{agent, autotune, chunker, embedding_cache, harness, hnsw, index_registry, jobs, lance_backend, playbook_memory, promotion, rag, refresh, search, store, supervisor, trial}; #[derive(Clone)] pub struct VectorState { @@ -23,6 +23,9 @@ pub struct VectorState { pub hnsw_store: hnsw::HnswStore, pub embedding_cache: embedding_cache::EmbeddingCache, pub trial_journal: trial::TrialJournal, + /// Federation-aware harness store — resolves eval artifacts to each + /// index's recorded bucket, falling back to primary for legacy evals. + pub harness_store: harness::HarnessStore, /// Catalog registry — needed by the Phase C refresh path to mark/clear /// staleness and look up dataset manifests. pub catalog: CatalogRegistry, @@ -46,6 +49,10 @@ pub struct VectorState { /// ADR-019 hybrid: handles to Lance datasets keyed by index name. /// Lazy-created on first /vectors/lance/* call. pub lance: lance_backend::LanceRegistry, + /// Phase 19 — meta-index feedback. Embeds past successful_playbooks + /// and, when `use_playbook_memory` is set on /vectors/hybrid, boosts + /// workers that were actually filled in semantically-similar past ops. + pub playbook_memory: playbook_memory::PlaybookMemory, } /// What the active-profile singleton records. 
Narrow — we don't need the @@ -63,6 +70,7 @@ pub fn router(state: VectorState) -> Router { .route("/index", post(create_index)) .route("/indexes", get(list_indexes)) .route("/indexes/{name}", get(get_index_meta)) + .route("/indexes/{name}/bucket", axum::routing::patch(migrate_index_bucket)) .route("/jobs", get(list_jobs)) .route("/jobs/{id}", get(get_job)) .route("/search", post(search_index)) @@ -110,6 +118,12 @@ pub fn router(state: VectorState) -> Router { .route("/lance/stats/{index_name}", get(lance_stats)) .route("/lance/scalar-index/{index_name}/{column}", post(lance_build_scalar_index)) .route("/lance/recall/{index_name}", post(lance_recall_harness)) + // Phase 19: playbook memory — the meta-index feedback loop + .route("/playbook_memory/rebuild", post(rebuild_playbook_memory)) + .route("/playbook_memory/stats", get(playbook_memory_stats)) + .route("/playbook_memory/seed", post(seed_playbook_memory)) + .route("/playbook_memory/persist_sql", post(persist_playbook_memory_sql)) + .route("/playbook_memory/patterns", post(discover_playbook_patterns)) .with_state(state) } @@ -259,6 +273,174 @@ async fn get_index_meta( } } +#[derive(Deserialize)] +struct MigrateBucketRequest { + dest_bucket: String, + /// If true, delete artifacts from the source bucket after the pointer + /// flip. Default false — keeping source copies means a failed migration + /// is recoverable by editing IndexMeta.bucket back, and a successful + /// migration leaves inspectable forensics until an operator sweeps. + #[serde(default)] + delete_source: bool, +} + +#[derive(Serialize)] +struct MigrateBucketReport { + index_name: String, + source_bucket: String, + dest_bucket: String, + /// Artifact keys that were copied (or attempted). Order follows copy order. + copied: Vec, + /// Artifact prefixes that had nothing to copy (optional files missing, + /// trial journal empty, etc). + skipped: Vec, + /// Subset of `copied` that was subsequently deleted from the source. 
+ deleted_source: Vec, + duration_secs: f32, +} + +/// Move an index's artifacts from its current bucket to `dest_bucket`. +/// Parquet-backed indexes only — Lance migration needs URI rewriting that +/// isn't in scope for this endpoint. Copies the vector data, trial journal, +/// promotion file, and auto-generated harness; updates `IndexMeta.bucket` +/// last so a mid-flight failure leaves the index still usable at its +/// original location. Evicts the `EmbeddingCache` entry so the next load +/// re-reads from the new bucket. +async fn migrate_index_bucket( + State(state): State, + Path(name): Path, + Json(req): Json, +) -> Result, (StatusCode, String)> { + let t0 = std::time::Instant::now(); + + let mut meta = state + .index_registry + .get(&name) + .await + .ok_or_else(|| (StatusCode::NOT_FOUND, format!("index '{name}' not found")))?; + + if meta.vector_backend == shared::types::VectorBackend::Lance { + return Err(( + StatusCode::BAD_REQUEST, + "Lance-backed indexes cannot be migrated via this endpoint — \ + Lance URIs are bucket-specific; a separate migrate_lance tool \ + is needed".into(), + )); + } + + if !state.bucket_registry.contains(&req.dest_bucket) { + return Err(( + StatusCode::BAD_REQUEST, + format!("dest bucket '{}' not registered", req.dest_bucket), + )); + } + + let source_bucket = meta.bucket.clone(); + if source_bucket == req.dest_bucket { + return Err(( + StatusCode::BAD_REQUEST, + format!("source and dest are both '{source_bucket}' — nothing to migrate"), + )); + } + + let src = state + .bucket_registry + .get(&source_bucket) + .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?; + let dst = state + .bucket_registry + .get(&req.dest_bucket) + .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?; + + let mut copied: Vec = Vec::new(); + let mut skipped: Vec = Vec::new(); + + // 1. Vector data (single parquet file for this backend). 
+ copy_key(&src, &dst, &meta.storage_key) + .await + .map_err(|e| { + (StatusCode::INTERNAL_SERVER_ERROR, + format!("copy {}: {e}", meta.storage_key)) + })?; + copied.push(meta.storage_key.clone()); + + // 2. Trial journal batches — per-index directory of JSONL files. + let trial_prefix = format!("_hnsw_trials/{name}/"); + let trial_keys = storaged::ops::list(&src, Some(&trial_prefix)) + .await + .unwrap_or_default(); + if trial_keys.is_empty() { + skipped.push(trial_prefix); + } + for k in &trial_keys { + copy_key(&src, &dst, k) + .await + .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("copy {k}: {e}")))?; + copied.push(k.clone()); + } + + // 3. Promotion file (optional — absent for never-promoted indexes). + let promo_key = format!("_hnsw_promotions/{name}.json"); + match copy_key(&src, &dst, &promo_key).await { + Ok(()) => copied.push(promo_key), + Err(_) => skipped.push(promo_key), + } + + // 4. Auto-generated harness (optional — absent if agent never ran). + let harness_key = format!("_hnsw_evals/{name}_auto.json"); + match copy_key(&src, &dst, &harness_key).await { + Ok(()) => copied.push(harness_key), + Err(_) => skipped.push(harness_key), + } + + // 5. Pointer flip — IndexMeta.bucket now points at destination. This + // is the commit point; earlier failures leave copies in dest but the + // index still usable at source. + meta.bucket = req.dest_bucket.clone(); + state + .index_registry + .register(meta) + .await + .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("update meta: {e}")))?; + + // 6. Cache eviction — next load reads the new bucket's parquet. + state.embedding_cache.evict(&name).await; + + // 7. Optional source cleanup. 
+ let mut deleted_source: Vec = Vec::new(); + if req.delete_source { + for k in &copied { + if storaged::ops::delete(&src, k).await.is_ok() { + deleted_source.push(k.clone()); + } + } + } + + Ok(Json(MigrateBucketReport { + index_name: name, + source_bucket, + dest_bucket: req.dest_bucket, + copied, + skipped, + deleted_source, + duration_secs: t0.elapsed().as_secs_f32(), + })) +} + +/// Stream a single object from one bucket to another. Uses the existing +/// `storaged::ops` get + put primitives — no native copy in object_store +/// across heterogeneous backends (local ↔ S3), so an in-memory hop is +/// unavoidable. Bounded by individual object size, which for our parquet +/// + jsonl artifacts tops out around a few hundred MB. +async fn copy_key( + src: &Arc, + dst: &Arc, + key: &str, +) -> Result<(), String> { + let data = storaged::ops::get(src, key).await?; + storaged::ops::put(dst, key, data).await +} + // --- unused legacy function below, kept for reference --- #[allow(dead_code)] @@ -420,6 +602,15 @@ struct HybridRequest { /// If false, just return the ranked matches (faster, no Ollama gen). #[serde(default = "default_true")] generate: bool, + /// Phase 19: consult `playbook_memory` and boost workers that past + /// similar playbooks successfully filled. Off by default so current + /// callers keep deterministic ranking; opt-in unlocks the feedback. + #[serde(default)] + use_playbook_memory: bool, + /// Number of past playbooks to consider when `use_playbook_memory` + /// is on. Ignored otherwise. Defaults to 5. + #[serde(default)] + playbook_memory_k: Option, } fn default_true() -> bool { true } @@ -442,8 +633,18 @@ struct HybridSource { chunk_text: String, score: f32, sql_verified: bool, + /// Phase 19: how much the playbook_memory boost lifted this hit's + /// score. 0.0 when `use_playbook_memory=false` or no past playbook + /// endorsed this worker. 
+ #[serde(default, skip_serializing_if = "is_zero")] + playbook_boost: f32, + /// playbook_ids whose endorsement contributed to `playbook_boost`. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + playbook_citations: Vec, } +fn is_zero(x: &f32) -> bool { x.abs() < 1e-6 } + async fn hybrid_search( State(state): State, Json(req): Json, @@ -556,6 +757,11 @@ async fn hybrid_search( .and_then(|m| m.id_prefix.clone()); let sql_count = valid_ids.as_ref().map(|s| s.len()).unwrap_or(0); + // Phase 19: when playbook_memory is consulted, pull a wider candidate + // pool so endorsed workers outside the vanilla top-K can still be + // boosted into visibility. 5× is a conservative multiplier — plenty + // for a +0.25 boost to flip rankings without dragging the cost up. + let fetch_k = if req.use_playbook_memory { req.top_k * 5 } else { req.top_k }; let filtered: Vec = if let Some(ref ids) = valid_ids { all_results.into_iter() .filter(|r| { @@ -572,20 +778,54 @@ async fn hybrid_search( }; ids.contains(raw_id) }) - .take(req.top_k) + .take(fetch_k) .collect() } else { - all_results.into_iter().take(req.top_k).collect() + all_results.into_iter().take(fetch_k).collect() }; // Step 4: Build sources with SQL-verified flag. - let sources: Vec = filtered.iter().map(|r| HybridSource { + let mut sources: Vec = filtered.iter().map(|r| HybridSource { doc_id: r.doc_id.clone(), chunk_text: r.chunk_text.clone(), score: r.score, sql_verified: valid_ids.is_some(), + playbook_boost: 0.0, + playbook_citations: Vec::new(), }).collect(); + // Step 4b (Phase 19): if use_playbook_memory, look up semantically + // similar past playbooks and boost workers they endorsed. Name-match + // is on the tuple (city, state, name) extracted from chunk_text — + // hybrid_search's SQL filter already narrowed to one city+state, so + // this just needs to check the name against each playbook's endorsed + // set. Additive boost on the existing vector score, then re-sort. 
+ if req.use_playbook_memory { + let boost_k = req.playbook_memory_k.unwrap_or(playbook_memory::DEFAULT_TOP_K_PLAYBOOKS); + // We embedded the question as `qv` above — reuse it for the + // playbook similarity lookup so we don't double-pay Ollama. + let boosts = state.playbook_memory.compute_boost_for(&qv, boost_k, 0.5).await; + for src in sources.iter_mut() { + // Parse "{Name} — {Role} in {City}, {State}. …" chunk. Being + // defensive: chunks from other datasets may not follow this + // exact shape, so absent fields just skip the boost. + if let Some((name, city, state)) = parse_worker_chunk(&src.chunk_text) { + let key = (city, state, name); + if let Some(entry) = boosts.get(&key) { + src.score += entry.boost; + src.playbook_boost = entry.boost; + src.playbook_citations = entry.citations.clone(); + } + } + } + // Re-rank: boosted scores can flip ordering. + sources.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal)); + // Finally trim to the caller's requested top_k — we pulled fetch_k + // (5× wider) above specifically so the boost could reach workers + // that would otherwise have been trimmed pre-boost. + sources.truncate(req.top_k); + } + // Step 5: Generate answer if requested. 
let answer = if req.generate && !sources.is_empty() { let context: String = sources.iter().enumerate().map(|(i, s)| { @@ -734,7 +974,7 @@ async fn run_trial( State(state): State, Json(req): Json, ) -> Result, (StatusCode, String)> { - let mut harness_set = harness::EvalSet::load(&state.store, &req.harness) + let mut harness_set = state.harness_store.load_for_index(&req.index_name, &req.harness) .await .map_err(|e| (StatusCode::NOT_FOUND, format!("harness not found: {e}")))?; @@ -764,8 +1004,8 @@ async fn run_trial( .await .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("ground truth: {e}")))?; tracing::info!("trial: ground truth built in {:.1}s", t0.elapsed().as_secs_f32()); - harness_set - .save(&state.store) + state.harness_store + .save(&harness_set) .await .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("save harness: {e}")))?; } @@ -890,17 +1130,14 @@ async fn best_trial( // --- Harness management --- async fn list_evals(State(state): State) -> impl IntoResponse { - match harness::EvalSet::list(&state.store).await { - Ok(names) => Ok(Json(names)), - Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)), - } + Json(state.harness_store.list_all().await) } async fn get_eval( State(state): State, Path(name): Path, ) -> impl IntoResponse { - match harness::EvalSet::load(&state.store, &name).await { + match state.harness_store.get_any(&name).await { Ok(e) => Ok(Json(e)), Err(err) => Err((StatusCode::NOT_FOUND, err)), } @@ -916,7 +1153,7 @@ async fn put_eval( .queries .iter() .all(|q| q.ground_truth.is_some()); - match harness_set.save(&state.store).await { + match state.harness_store.save(&harness_set).await { Ok(()) => Ok(Json(harness_set)), Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)), } @@ -957,8 +1194,8 @@ async fn autogen_eval( .await .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("ground truth: {e}")))?; - harness_set - .save(&state.store) + state.harness_store + .save(&harness_set) .await .map_err(|e| 
(StatusCode::INTERNAL_SERVER_ERROR, format!("save: {e}")))?; @@ -1407,7 +1644,12 @@ async fn profile_scoped_search( let lance_store = state.lance.store_for(&req.index_name).await .map_err(|e| (StatusCode::BAD_REQUEST, e))?; let t0 = std::time::Instant::now(); - match lance_store.search(&query_vec, top_k).await { + match lance_store.search( + &query_vec, + top_k, + Some(LANCE_DEFAULT_NPROBES), + Some(LANCE_DEFAULT_REFINE_FACTOR), + ).await { Ok(hits) => Ok(Json(serde_json::json!({ "profile": profile.id, "source": index_meta.source, @@ -1516,6 +1758,7 @@ async fn run_autotune_endpoint( &state.index_registry, &state.trial_journal, &state.promotion_registry, + &state.harness_store, &state.job_tracker, ).await { Ok(result) => Ok(Json(result)), @@ -1636,8 +1879,25 @@ struct LanceSearchRequest { query: String, #[serde(default = "default_top_k")] top_k: usize, + /// IVF partitions to probe. `None` uses Lance's built-in default of + /// 1, which caps recall well below the index's real capability. + /// Recommended: 5–10% of num_partitions (≈20 for a 316-partition + /// index). Omitting it here picks the server-side default. + #[serde(default)] + nprobes: Option, + /// Refine factor — re-rank `top_k * factor` PQ-approximate candidates + /// with exact distances before returning `top_k`. Recovers recall + /// lost to product quantization. + #[serde(default)] + refine_factor: Option, } +/// Server-side defaults when the caller doesn't pin nprobes / refine +/// themselves. Tuned for the ~100K × 768d reference workload; see +/// docs/ADR-019-vector-storage.md for the recall / latency trade-off. +const LANCE_DEFAULT_NPROBES: usize = 20; +const LANCE_DEFAULT_REFINE_FACTOR: u32 = 5; + fn default_top_k() -> usize { 5 } /// Vector search against a Lance dataset. 
Embeds the query text via the @@ -1660,7 +1920,9 @@ async fn lance_search( .map_err(|e| (StatusCode::BAD_REQUEST, e))?; let t0 = std::time::Instant::now(); - let hits = lance_store.search(&qv, req.top_k).await + let nprobes = req.nprobes.or(Some(LANCE_DEFAULT_NPROBES)); + let refine = req.refine_factor.or(Some(LANCE_DEFAULT_REFINE_FACTOR)); + let hits = lance_store.search(&qv, req.top_k, nprobes, refine).await .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e))?; Ok(Json(serde_json::json!({ @@ -1761,6 +2023,12 @@ struct LanceRecallRequest { harness: String, #[serde(default = "default_top_k")] top_k: usize, + /// Override server defaults so operators can sweep nprobes / + /// refine_factor to chart the recall-vs-latency curve. + #[serde(default)] + nprobes: Option, + #[serde(default)] + refine_factor: Option, } #[derive(serde::Serialize)] @@ -1784,6 +2052,214 @@ struct LanceRecallQuery { hits_returned: usize, } +// --- Phase 19: playbook memory endpoints --- + +/// Extract (name, city, state) from a chunk formatted like +/// "{Name} — {Role} in {City}, {State}. Skills: …". +/// Returns None if the chunk doesn't match the shape; callers simply +/// skip the boost for that hit. +fn parse_worker_chunk(chunk: &str) -> Option<(String, String, String)> { + // "Name — Role in City, ST. …" → split on "—" then " in " then "," + let (name_part, rest) = chunk.split_once('—')?; + let rest = rest.trim(); + let (_role, loc_part) = rest.split_once(" in ")?; + let loc_part = loc_part.trim(); + let (city, state_plus) = loc_part.split_once(',')?; + let state: String = state_plus.trim() + .chars() + .take_while(|c| c.is_ascii_alphabetic()) + .collect(); + let name = name_part.trim().to_string(); + let city = city.trim().to_string(); + if name.is_empty() || city.is_empty() || state.is_empty() { + return None; + } + Some((name, city, state)) +} + +#[derive(Deserialize)] +struct SeedPlaybookRequest { + /// One playbook with {operation, approach, context, endorsed_names}. 
+ /// City + state are parsed from the operation text. + operation: String, + #[serde(default)] + approach: String, + #[serde(default)] + context: String, + endorsed_names: Vec, + /// Append to the existing memory rather than replacing. Default true — + /// seeding is a bootstrap/demo tool, not a rebuild substitute. + #[serde(default = "default_true")] + append: bool, +} + +/// Bootstrap / test-only: inject a playbook entry directly into +/// `playbook_memory` without going through `successful_playbooks`. Useful +/// when the source dataset has stale or phantom entries (as the initial +/// staffing seed did — names that don't correspond to real workers), and +/// you want to demonstrate the feedback loop with a known-good fixture. +/// +/// Production path is always `/rebuild` — this endpoint is for operators +/// who need to prime the memory before real playbooks accumulate. +async fn seed_playbook_memory( + State(state): State, + Json(req): Json, +) -> impl IntoResponse { + // Embed the entry through the same text shape `rebuild` uses so + // similarity math is comparable across seed + real entries. 
+ let tmp_entry = playbook_memory::PlaybookEntry { + playbook_id: String::new(), + operation: req.operation.clone(), + approach: req.approach.clone(), + context: req.context.clone(), + timestamp: chrono::Utc::now().to_rfc3339(), + endorsed_names: req.endorsed_names.clone(), + city: None, state: None, embedding: None, + }; + let text = format!( + "{} | {} | {} | fills: {}", + tmp_entry.operation, tmp_entry.approach, tmp_entry.context, + tmp_entry.endorsed_names.join(", "), + ); + let resp = match state.ai_client.embed(EmbedRequest { texts: vec![text], model: None }).await { + Ok(r) => r, + Err(e) => return Err((StatusCode::BAD_GATEWAY, format!("embed seed: {e}"))), + }; + if resp.embeddings.is_empty() { + return Err((StatusCode::BAD_GATEWAY, "embed returned nothing".into())); + } + let emb: Vec = resp.embeddings[0].iter().map(|&x| x as f32).collect(); + + // Parse city/state from the operation ("fill: Role xN in City, ST"). + // Parser lives in playbook_memory::rebuild — expose via a tiny helper + // or inline the same logic here; duplicated briefly since this seed + // path is stable but infrequently called. + let (city, state_) = { + let after_in = req.operation.split(" in ").nth(1).unwrap_or(""); + let mut parts = after_in.splitn(2, ','); + let city = parts.next().map(|s| s.trim().to_string()).filter(|s| !s.is_empty()); + let state = parts.next().map(|s| s.trim().chars().take_while(|c| c.is_ascii_alphabetic()).collect::()).filter(|s| !s.is_empty()); + (city, state) + }; + if city.is_none() || state_.is_none() { + return Err((StatusCode::BAD_REQUEST, + "operation must match 'fill: Role xN in City, ST' shape".into())); + } + + // Stable id: hash of timestamp + operation. Callers get the id back + // so they can reference it in citations. 
+ let ts = chrono::Utc::now().to_rfc3339(); + use sha2::{Digest, Sha256}; + let mut h = Sha256::new(); + h.update(ts.as_bytes()); + h.update(b"|"); + h.update(req.operation.as_bytes()); + let bytes = h.finalize(); + let pid = format!("pb-seed-{}", bytes.iter().take(8).map(|b| format!("{b:02x}")).collect::()); + + let new_entry = playbook_memory::PlaybookEntry { + playbook_id: pid.clone(), + operation: req.operation, + approach: req.approach, + context: req.context, + timestamp: ts, + endorsed_names: req.endorsed_names, + city, state: state_, + embedding: Some(emb), + }; + + let mut current = state.playbook_memory.snapshot().await; + if req.append { + current.push(new_entry); + } else { + current = vec![new_entry]; + } + if let Err(e) = state.playbook_memory.set_entries(current).await { + return Err((StatusCode::INTERNAL_SERVER_ERROR, format!("persist: {e}"))); + } + Ok(Json(serde_json::json!({ "playbook_id": pid, "entries_after": state.playbook_memory.entry_count().await }))) +} + +async fn rebuild_playbook_memory( + State(state): State, +) -> impl IntoResponse { + match playbook_memory::rebuild( + &state.playbook_memory, + &state.ai_client, + &state.catalog, + &state.bucket_registry, + ).await { + Ok(report) => Ok(Json(report)), + Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)), + } +} + +// Path 2 foundation — dump in-memory playbook_memory state to a fresh +// `successful_playbooks_live` dataset. Cheap to call (writes one parquet, +// updates one manifest), so /log can call it after every seed to keep the +// SQL-queryable surface honest without the destructive REPLACE bug that +// /ingest/file has. 
+async fn persist_playbook_memory_sql( + State(state): State, +) -> impl IntoResponse { + match playbook_memory::persist_to_sql(&state.playbook_memory, &state.catalog).await { + Ok(report) => Ok(Json(report)), + Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)), + } +} + +#[derive(Deserialize)] +struct PatternsRequest { + query: String, + #[serde(default = "default_pattern_k")] + top_k_playbooks: usize, + /// Minimum frequency (0.0-1.0) for a trait to make the report. + /// Default 0.4 — at least 40% of examined workers must share it. + #[serde(default = "default_pattern_min_freq")] + min_trait_frequency: f32, +} +fn default_pattern_k() -> usize { 10 } +fn default_pattern_min_freq() -> f32 { 0.4 } + +// Path 2 — meta-index discovery surface. "What did past similar fills +// have in common that I didn't ask about?" — surfaces signals like +// recurring certifications, skill clusters, archetype tendencies. +async fn discover_playbook_patterns( + State(state): State, + Json(req): Json, +) -> impl IntoResponse { + match playbook_memory::discover_patterns( + &state.playbook_memory, + &state.ai_client, + &state.catalog, + &state.bucket_registry, + &req.query, + req.top_k_playbooks, + req.min_trait_frequency, + ).await { + Ok(report) => Ok(Json(report)), + Err(e) => Err((StatusCode::INTERNAL_SERVER_ERROR, e)), + } +} + +async fn playbook_memory_stats( + State(state): State, +) -> impl IntoResponse { + let entries = state.playbook_memory.snapshot().await; + Json(serde_json::json!({ + "entries": entries.len(), + "total_names_endorsed": entries.iter().map(|e| e.endorsed_names.len()).sum::(), + "entries_with_embeddings": entries.iter().filter(|e| e.embedding.is_some()).count(), + "sample": entries.iter().take(3).map(|e| serde_json::json!({ + "id": e.playbook_id, + "operation": e.operation, + "city": e.city, + "state": e.state, + "endorsed": e.endorsed_names, + })).collect::>(), + })) +} + async fn lance_recall_harness( State(state): State, Path(index_name): Path, @@ 
-1791,7 +2267,7 @@ async fn lance_recall_harness( ) -> impl IntoResponse { let t0 = std::time::Instant::now(); - let harness_set = harness::EvalSet::load(&state.store, &req.harness).await + let harness_set = state.harness_store.load_for_index(&index_name, &req.harness).await .map_err(|e| (StatusCode::NOT_FOUND, format!("harness: {e}")))?; if !harness_set.ground_truth_built { return Err((StatusCode::BAD_REQUEST, @@ -1817,7 +2293,12 @@ async fn lance_recall_harness( }; let qt0 = std::time::Instant::now(); - let hits = lance_store.search(qv, k).await + let hits = lance_store.search( + qv, + k, + Some(req.nprobes.unwrap_or(LANCE_DEFAULT_NPROBES)), + Some(req.refine_factor.unwrap_or(LANCE_DEFAULT_REFINE_FACTOR)), + ).await .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("search: {e}")))?; let lat_us = qt0.elapsed().as_micros() as f32; diff --git a/mcp-server/index.ts b/mcp-server/index.ts index 54fa444..4f3a839 100644 --- a/mcp-server/index.ts +++ b/mcp-server/index.ts @@ -74,7 +74,10 @@ server.tool( top_k: z.number().default(5), }, async ({ question, sql_filter, dataset, id_column, top_k }) => { - const body: any = { question, index_name: "workers_500k_v1", filter_dataset: dataset, id_column, top_k, generate: true }; + const body: any = { + question, index_name: "workers_500k_v1", filter_dataset: dataset, id_column, top_k, generate: true, + use_playbook_memory: true, + }; if (sql_filter) body.sql_filter = sql_filter; const r = await api("POST", "/vectors/hybrid", body); return { content: [{ type: "text" as const, text: JSON.stringify(r, null, 2) }] }; @@ -109,6 +112,7 @@ server.tool( index_name: "workers_500k_v1", sql_filter: filter, filter_dataset: "ethereal_workers", id_column: "worker_id", top_k: headcount * 2, generate: false, + use_playbook_memory: true, }); let matches = r.sources || []; if (required_certs.length > 0) { @@ -384,6 +388,11 @@ async function main() { question: b.question, index_name: b.index || "workers_500k_v1", sql_filter: b.sql_filter, 
filter_dataset: b.dataset || "ethereal_workers", id_column: b.id_column || "worker_id", top_k: b.top_k || 5, generate: b.generate !== false, + use_playbook_memory: b.use_playbook_memory !== false, + // Forward explicitly so Bun /search isn't capped by the + // server's default — boost silently misses good matches when + // memory has >25 entries and only top-5 playbooks are scanned. + playbook_memory_k: b.playbook_memory_k ?? 25, })); } @@ -403,6 +412,8 @@ async function main() { index_name: b.index || "workers_500k_v1", sql_filter: filter, filter_dataset: b.dataset || "ethereal_workers", id_column: "worker_id", top_k: (b.headcount || 5) * 2, generate: false, + use_playbook_memory: true, + playbook_memory_k: 25, })); } @@ -418,14 +429,57 @@ async function main() { return ok(await api("POST", "/vectors/rag", { index_name: b.index || "workers_500k_v1", question: b.question, top_k: b.top_k || 5 })); } - // Tool: log success + // Tool: log success. + // + // BUG FIX 2026-04-20: previously this also POSTed a 1-row CSV to + // /ingest/file?name=successful_playbooks. That endpoint REPLACES + // the dataset's object list rather than appending — so every /log + // call destroyed all prior rows in the SQL-queryable + // successful_playbooks table. Chain-of-custody trace caught it: + // sp_rows went 33 → 1 in a single /log call. + // + // Until a proper append endpoint exists (Phase 8 delta write + // surface for the SQL table), /log writes ONLY to playbook_memory + // (in-memory append-only store, works correctly for boost). The + // SQL successful_playbooks table is now treated as derived state + // that gets rebuilt explicitly via /vectors/playbook_memory/rebuild + // — never written to by the recruiter path. 
if (url.pathname === "/log") { const b = await json(); - const csv = `timestamp,operation,approach,result,context\n"${new Date().toISOString()}","${(b.operation||"").replace(/"/g,'""')}","${(b.approach||"").replace(/"/g,'""')}","${(b.result||"").replace(/"/g,'""')}","${(b.context||"").replace(/"/g,'""')}"`; - const form = new FormData(); - form.append("file", new Blob([csv], { type: "text/csv" }), "playbook.csv"); - const r = await fetch(`${BASE}/ingest/file?name=successful_playbooks`, { method: "POST", body: form }); - return ok({ logged: true, response: await r.text() }); + // Result format expected: "{filled}/{needed} filled → Name1, Name2, Name3" + const result = String(b.result || ""); + const arrowIdx = result.indexOf("→"); + const namesPart = arrowIdx >= 0 ? result.slice(arrowIdx + 1) : ""; + const endorsed = namesPart.split(",").map(s => s.trim()).filter(Boolean); + let seeded = 0; + let persisted_rows = 0; + if (endorsed.length && /fill:.+ in .+,.+/i.test(String(b.operation || ""))) { + const canonicalApproach = `${(b.approach || "manual log").split(/[\.\n]/)[0]}`.slice(0, 80); + const canonicalContext = `${(b.context || "").split(/[\.\n]/)[0]}`.slice(0, 80); + const seedRes = await api("POST", "/vectors/playbook_memory/seed", { + operation: b.operation, + approach: canonicalApproach, + context: canonicalContext, + endorsed_names: endorsed, + append: true, + }).catch(() => null) as any; + if (seedRes && seedRes.playbook_id) { + seeded = endorsed.length; + // After every successful seed, persist memory → SQL so the + // successful_playbooks_live table reflects current operator + // activity. /persist_sql writes the FULL state, which is safe + // because in-memory playbook_memory IS the source of truth + // (no concurrent writer outside this process modifies it). 
+ const pr = await api("POST", "/vectors/playbook_memory/persist_sql", {}).catch(() => null) as any; + if (pr && typeof pr.rows_persisted === "number") persisted_rows = pr.rows_persisted; + } + } + return ok({ + logged: true, + seeded, + persisted_to_sql: persisted_rows, + note: "successful_playbooks_live (NOT successful_playbooks) is the SQL surface for live operator activity. /log is non-destructive.", + }); } // Tool: get playbooks @@ -480,6 +534,7 @@ async function main() { question: "reliable forklift operator", index_name: "workers_500k_v1", sql_filter: "role = 'Forklift Operator' AND state = 'IL' AND CAST(reliability AS DOUBLE) > 0.8", filter_dataset: "workers_500k", id_column: "worker_id", top_k: 5, generate: false, + use_playbook_memory: true, }); tests.push({ name: "Hybrid SQL+Vector Search", ms: Date.now() - ht0, @@ -987,6 +1042,7 @@ tr:hover{background:#111827} question: "reliable forklift operator", index_name: "workers_500k_v1", sql_filter: "role = 'Forklift Operator' AND state = 'IL' AND CAST(reliability AS DOUBLE) > 0.8", filter_dataset: "workers_500k", id_column: "worker_id", top_k: 5, generate: false, + use_playbook_memory: true, }); tests.push({ name: "Hybrid SQL+Vector", ms: Date.now() - ht0, @@ -1435,6 +1491,26 @@ const SCENARIOS = [ function pick(arr: T[]): T { return arr[Math.floor(Math.random() * arr.length)]; } +// Seed playbook_memory from a filled contract so the next hybrid query +// ranks against it. Used by both runWeekSimulation (per-day) and the /log +// endpoint (per manual logging). Fail-soft — seeding is best-effort. 
+async function seedPlaybookFromContract(c: any) { + const names = (c.matches || []).slice(0, 5) + .map((m: any) => m.name || m.doc_id) + .filter((n: string) => n && !n.startsWith("W500-")); + if (!names.length) return; + const op = `fill: ${c.role} x${c.headcount} in ${c.city}, ${c.state}`; + try { + await api("POST", "/vectors/playbook_memory/seed", { + operation: op, + approach: `${c.situation || c.priority || "fill"} → hybrid search`, + context: `client=${c.client || ""} start=${c.start || ""}`, + endorsed_names: names, + append: true, + }); + } catch {} +} + async function runWeekSimulation() { const days = ["Monday","Tuesday","Wednesday","Thursday","Friday"]; const staffers = ["Sarah (Lead)","Mike (Senior)","Kim (Junior)"]; @@ -1468,7 +1544,7 @@ async function runWeekSimulation() { if (priority === "urgent") emergencies++; totalNeeded += headcount; - // Run hybrid search + // Run hybrid search — Phase 19: boost on so past playbooks shape ranking let filled = 0; let matches: any[] = []; try { @@ -1481,12 +1557,15 @@ async function runWeekSimulation() { id_column: "worker_id", top_k: headcount + 2, generate: false, + use_playbook_memory: true, }); matches = (r.sources || []).slice(0, headcount).map((s: any) => ({ doc_id: s.doc_id, name: s.chunk_text?.split("—")[0]?.trim() || s.doc_id, score: s.score, chunk_text: s.chunk_text || "", + playbook_boost: s.playbook_boost || 0, + playbook_citations: s.playbook_citations || [], })); filled = matches.length; } catch {} @@ -1501,7 +1580,15 @@ async function runWeekSimulation() { }); } - // End of day: log playbook + prepare handoff + // End of day: seed playbook_memory with TODAY's filled contracts so + // tomorrow's hybrid search ranks against them. This is the in-week + // feedback loop — without this, day 5 doesn't benefit from day 1. 
+ for (const c of contracts) { + if (c.matches && c.matches.length) { + await seedPlaybookFromContract(c).catch(() => {}); + } + } + if (d < 4) { handoffs++; try { @@ -1530,29 +1617,18 @@ async function runWeekSimulation() { playbook_entries: playbookEntries, }; - // Log every filled contract as a playbook entry — this is the training data - try { - const ts = new Date().toISOString(); - const rows: string[] = []; - for (const day of results) { - for (const c of day.contracts) { - if (c.matches && c.matches.length > 0) { - const workerNames = c.matches.slice(0, 3).map((m: any) => m.name || m.doc_id).join(", "); - const op = `fill: ${c.role} x${c.headcount} in ${c.city}, ${c.state}`; - const approach = `${c.situation} (${c.priority}) → hybrid search`; - const result = `${c.filled}/${c.headcount} filled → ${workerNames}`; - const context = `client=${c.client} start=${c.start} scenario=${c.situation}`; - rows.push(`"${ts}","${op.replace(/"/g,'""')}","${approach}","${result.replace(/"/g,'""')}","${context.replace(/"/g,'""')}"`); - } - } - } - if (rows.length) { - const csv = `timestamp,operation,approach,result,context\n${rows.join("\n")}`; - const form = new FormData(); - form.append("file", new Blob([csv], { type: "text/csv" }), "playbook.csv"); - await fetch(`${BASE}/ingest/file?name=successful_playbooks`, { method: "POST", body: form }); - } - } catch {} + // BUG FIX 2026-04-20: previously this POSTed a multi-row CSV to + // /ingest/file?name=successful_playbooks at end of every simulation. + // That endpoint REPLACES the dataset's object list — so each + // /simulation/run wiped the prior simulation's rows. The SQL + // successful_playbooks table was never accumulating; it always reflected + // only the most-recent simulation batch. + // + // Per-day per-contract seeding via /vectors/playbook_memory/seed + // (added Pass 1, runs inside the day loop above) is the path that + // actually accumulates feedback. 
The SQL successful_playbooks table is + // intentionally not written by /simulation/run anymore until a proper + // append surface exists. return { days: results, summary }; } diff --git a/mcp-server/search.html b/mcp-server/search.html index d36c8e7..c502423 100644 --- a/mcp-server/search.html +++ b/mcp-server/search.html @@ -384,11 +384,13 @@ function addContractInsight(parent,c,isUrgent){ if(isUrgent&&i===0)label='FIRST CHOICE — highest match score, call first'; else if(isUrgent&&i>0&&i=c.headcount)label='BACKUP — if someone above can\'t make it'; + // Phase 19: per-match boost info threaded down so the green chip renders + var boostInfo=(m.playbook_boost>0)?{boost:m.playbook_boost,citations:m.playbook_citations||[]}:null; addWorkerInsight(cd,w.nm, [w.role,w.loc].filter(Boolean).join(' · '), label||buildWhyText(w,c),i, isUrgent&&i===0?'#f85149':isUrgent&&i>=c.headcount?'#484f58':null, - w); + w,boostInfo); }); var remaining=c.matches.length-showCount; if(remaining>0){ @@ -570,12 +572,23 @@ function addWorkerInsight(parent,name,detail,why,idx,highlight){ if(highlight)w.style.borderLeft='3px solid '+highlight; w.style.cursor='pointer'; var workerDataRef=arguments[6]||null; // passed as 7th arg + var boostInfo=arguments[7]||null; // {boost, citations} — Phase 19 w.onclick=function(){if(workerDataRef)showProfile(workerDataRef)}; var av=document.createElement('div');av.className='av';av.style.background=AC[(idx||0)%AC.length]; av.textContent=(name||'?').split(' ').map(function(n){return(n[0]||'').toUpperCase()}).join('').substring(0,2); w.appendChild(av); var info=document.createElement('div');info.className='info'; var nm=document.createElement('div');nm.className='nm';nm.textContent=name; + // Phase 19: when a past playbook endorsed this worker, show a green chip + // next to the name. Hover reveals the citation IDs. 
+ if(boostInfo && boostInfo.boost > 0){ + var chip=document.createElement('span'); + chip.style.cssText='display:inline-block;margin-left:8px;padding:2px 7px;border-radius:9px;font-size:10px;font-weight:600;background:#0d2818;border:1px solid #2ea043;color:#3fb950;vertical-align:middle'; + var n=(boostInfo.citations && boostInfo.citations.length) || 0; + chip.textContent='Endorsed · '+n+' playbook'+(n!==1?'s':''); + chip.title='Boosted by past playbooks: '+(boostInfo.citations||[]).join(', '); + nm.appendChild(chip); + } var dt=document.createElement('div');dt.className='detail';dt.textContent=detail; info.appendChild(nm);info.appendChild(dt); if(why){var wh=document.createElement('div');wh.className='why';wh.textContent=why;info.appendChild(wh)} diff --git a/tests/multi-agent/agent.ts b/tests/multi-agent/agent.ts new file mode 100644 index 0000000..167e9c2 --- /dev/null +++ b/tests/multi-agent/agent.ts @@ -0,0 +1,351 @@ +// Shared runtime for one agent. An agent is a role (executor or reviewer), +// a model name, and a conversation the orchestrator hands it. The agent +// produces ONE structured Action per turn; the orchestrator applies tool +// calls and feeds results back. +// +// Fail-fast: every HTTP error, parse error, and Ollama error throws. The +// orchestrator catches at the top and exits non-zero with the full log. + +export const GATEWAY = "http://localhost:3100"; +export const SIDECAR = "http://localhost:3200"; + +// --- Shared types --- + +export type Role = "executor" | "reviewer"; + +export interface TaskSpec { + id: string; + operation: string; // "fill: Welder x2 in Columbus, OH" + target_role: string; // "Welder" + target_count: number; // 2 + target_city: string; // "Columbus" + target_state: string; // "OH" + approach_hint?: string; // e.g. 
"hybrid search"; agent is free to ignore +} + +export interface LogEntry { + turn: number; + role: Role; + model: string; + at: string; + kind: + | "plan" + | "tool_call" + | "tool_result" + | "critique" + | "propose_done" + | "consensus_done" + | "error"; + content: any; +} + +// Action = what an agent returns on one turn. Strict shape so we can +// enforce it at parse time rather than prompt-engineer around malformed +// JSON. +export type Action = + | { kind: "tool_call"; tool: string; args: Record; rationale: string } + | { kind: "propose_done"; fills: Fill[]; rationale: string } + | { kind: "critique"; verdict: "continue" | "drift" | "approve_done"; notes: string } + | { kind: "plan"; steps: string[] }; + +export interface Fill { + candidate_id: string; + name: string; + reason: string; +} + +// --- HTTP helpers (fail-fast) --- + +async function http(method: string, url: string, body?: any): Promise { + const res = await fetch(url, { + method, + headers: { "Content-Type": "application/json" }, + body: body ? JSON.stringify(body) : undefined, + }); + if (!res.ok) { + const text = await res.text(); + throw new Error(`${method} ${url} → ${res.status}: ${text}`); + } + return (await res.json()) as T; +} + +// Tool calls land in the Phase 12 audit log keyed by this agent name. +// Distinguishable from human-driven calls (agent=="operator" or similar) +// so post-hoc queries can separate multi-agent runs. +export const TOOL_AGENT_ID = "multi-agent-test"; + +export async function callTool(tool: string, args: Record): Promise { + return http("POST", `${GATEWAY}/tools/${tool}/call`, { + params: args, + agent: TOOL_AGENT_ID, + }); +} + +export async function hybridSearch(sql_filter: string, question: string, k = 10): Promise { + return http("POST", `${GATEWAY}/vectors/hybrid`, { sql_filter, question, k }); +} + +export async function sqlQuery(sql: string): Promise { + return http("POST", `${GATEWAY}/query/sql`, { sql, format: "json" }); +} + +// Sidecar generate. 
Ollama's default keep_alive (5 min) keeps the model
+// warm between turns on its own, so we don't need to pass it through.
+//
+// Returns the generated text. Throws when the sidecar response carries no
+// non-empty string `text`, so callers never parse an empty completion.
+export async function generate(model: string, prompt: string, opts: {
+  max_tokens?: number;
+  temperature?: number;
+  system?: string;
+} = {}): Promise<string> {
+  const body: Record<string, unknown> = {
+    model,
+    prompt,
+    temperature: opts.temperature ?? 0.3,
+    max_tokens: opts.max_tokens ?? 800,
+  };
+  if (opts.system) body.system = opts.system;
+  const r = await http("POST", `${SIDECAR}/generate`, body);
+  const text = r.text ?? "";
+  if (!text || typeof text !== "string") {
+    throw new Error(`generate returned empty text from ${model}: ${JSON.stringify(r).slice(0, 200)}`);
+  }
+  return text;
+}
+
+// --- Prompt construction ---
+
+// Tool reference embedded verbatim in the executor prompt. The sql example
+// uses an integer worker_id so it agrees with the ID-mapping rule below it.
+const TOOL_CATALOG = `
+Available tools (each takes a JSON "args" object):
+
+- hybrid_search(sql_filter: string, question: string, index_name: string, k?: number)
+  → Narrow workers via SQL WHERE clause, then rank by semantic match.
+  → Canonical production tool for fill tasks. Always use this FIRST.
+  → Example args:
+    {"index_name":"workers_500k_v1",
+     "sql_filter":"LOWER(role) LIKE '%weld%' AND city = 'Toledo' AND state = 'OH' AND availability > 0.5",
+     "question":"reliable welder with OSHA certs",
+     "k":10}
+
+- sql(query: string)
+  → Raw read-only SELECT. Use for verification (confirm a worker exists,
+    check city/role/availability) after hybrid_search surfaces candidates.
+  → Schema of workers_500k: worker_id, name, role, email, phone, city,
+    state, zip, skills, certifications, archetype, reliability,
+    responsiveness, engagement, communications, compliance, availability,
+    resume_text.
+  → Example args:
+    {"query":"SELECT worker_id, name, role, city, state, availability FROM workers_500k WHERE worker_id = 7995"}
+
+Rules:
+- hybrid_search returns sources[] each with {doc_id, chunk_text, score, sql_verified}.
+- **ID mapping:** vector doc_ids look like "W500K-7995" (prefix + number).
+  The SQL worker_id is an INTEGER. To go from doc_id to SQL, strip the
+  "W500K-" prefix and cast:
+    SELECT ... FROM workers_500k WHERE worker_id = CAST(SUBSTR('W500K-7995', 7) AS BIGINT)
+  or more simply: WHERE worker_id = 7995.
+- Names are NOT unique. Always identify by worker_id, never by name alone.
+- Return EXACTLY ONE JSON object per turn. No prose outside the JSON.
+`;
+
+// Smart per-kind summary so agents see the substance of each prior turn
+// without a raw-JSON wall of text. hybrid_search results especially need
+// this — raw JSON buries sources[] past any reasonable 400-char truncation.
+function summarizeEntry(e: LogEntry): string {
+  const c = e.content ?? {};
+  switch (e.kind) {
+    case "plan":
+      return `PLAN: ${(c.steps ?? []).map((s: string, i: number) => `${i + 1}.${s}`).join(" ")}`;
+    case "tool_call":
+      return `TOOL_CALL ${c.tool}(${JSON.stringify(c.args ?? {}).slice(0, 250)})${c.rationale ? ` — ${c.rationale}` : ""}`;
+    case "tool_result": {
+      if (c.error) return `TOOL_RESULT error: ${c.error}`;
+      // hybrid_search response
+      if (Array.isArray(c.sources)) {
+        // Number(...) so a score that arrives as a string cannot make
+        // .toFixed throw while we are only building a prompt summary.
+        const head = c.sources.slice(0, 5).map((s: any) =>
+          `${s.doc_id}${s.sql_verified ? "✓" : ""} score=${Number(s.score ?? 0).toFixed(2)}: ${String(s.chunk_text ?? "").slice(0, 80)}`
+        ).join(" | ");
+        return `TOOL_RESULT hybrid: sql_matches=${c.sql_matches} vector_reranked=${c.vector_reranked} sources=[${head}${c.sources.length > 5 ? ` +${c.sources.length - 5} more` : ""}]`;
+      }
+      // sql response
+      if (Array.isArray(c.rows)) {
+        const head = c.rows.slice(0, 5).map((r: any) => JSON.stringify(r)).join(" | ");
+        return `TOOL_RESULT sql: ${c.rows.length} rows${c.rows.length > 0 ? ` — ${head}${c.rows.length > 5 ? ` +${c.rows.length - 5} more` : ""}` : ""}`;
+      }
+      // fallback
+      return `TOOL_RESULT ${JSON.stringify(c).slice(0, 250)}`;
+    }
+    case "critique":
+      return `CRITIQUE verdict=${c.verdict} notes: ${String(c.notes ?? "").slice(0, 200)}`;
+    case "propose_done":
+      return `PROPOSE_DONE fills=[${(c.fills ?? []).map((f: Fill) => `${f.candidate_id}:${f.name}`).join(", ")}] rationale: ${String(c.rationale ?? "").slice(0, 120)}`;
+    case "consensus_done":
+      return `CONSENSUS ✓`;
+    case "error":
+      return `ERROR ${c.message ?? JSON.stringify(c)}`;
+  }
+  return JSON.stringify(c).slice(0, 200);
+}
+
+// Render the last 12 log entries, one summarized line per entry, for
+// inclusion in a prompt. An empty log renders as a fixed placeholder.
+function renderLogForPrompt(log: LogEntry[]): string {
+  if (log.length === 0) return "(no turns yet)";
+  return log.slice(-12).map(e =>
+    `[t${e.turn} ${e.role}] ${summarizeEntry(e)}`
+  ).join("\n");
+}
+
+// Crawl the log for every hybrid_search tool_result and collect the
+// worker names + ids seen so far. LLMs routinely "forget" earlier turns
+// once the conversation grows, so we surface a running ledger in the
+// prompt as orchestrator-maintained state. The executor doesn't have to
+// track this itself — it just reads it.
+function candidatesSeen(log: LogEntry[]): Array<{ doc_id: string; name: string; city: string; state: string }> {
+  // Keyed by doc_id; the first sighting of a worker wins.
+  const seen = new Map<string, { doc_id: string; name: string; city: string; state: string }>();
+  for (const e of log) {
+    if (e.kind !== "tool_result") continue;
+    const sources = (e.content as any)?.sources;
+    if (!Array.isArray(sources)) continue;
+    for (const s of sources) {
+      // chunk_text shape "Name — Role in City, ST. …"
+      const t = String(s.chunk_text ?? "");
+      const [namePart, rest] = t.split("—", 2);
+      if (!namePart || !rest) continue;
+      const loc = rest.split(" in ")[1] ?? "";
+      const [city, stateRaw] = loc.split(",", 2);
+      // Keep only the leading letters of the state ("OH. …" → "OH").
+      const state = (stateRaw ?? "").trim().replace(/[^A-Za-z].*/, "");
+      if (!s.doc_id || !namePart.trim() || !city?.trim() || !state) continue;
+      if (!seen.has(s.doc_id)) {
+        seen.set(s.doc_id, {
+          doc_id: s.doc_id,
+          name: namePart.trim(),
+          city: city.trim(),
+          state,
+        });
+      }
+    }
+  }
+  return Array.from(seen.values());
+}
+
+// Build the executor's per-turn prompt: task header, tool catalog, the
+// orchestrator-tracked candidate ledger, the recent shared log, and the
+// three JSON action shapes the executor may answer with.
+export function executorPrompt(task: TaskSpec, log: LogEntry[]): string {
+  const logStr = renderLogForPrompt(log);
+  const seen = candidatesSeen(log);
+  const seenBlock = seen.length === 0
+    ? "(no candidates surfaced yet — start with hybrid_search)"
+    : seen.map(s => ` - ${s.doc_id} ${s.name} (${s.city}, ${s.state})`).join("\n");
+
+  return `You are the EXECUTOR agent. Your job is to complete this task:
+
+OPERATION: ${task.operation}
+TARGET: ${task.target_count} × ${task.target_role} in ${task.target_city}, ${task.target_state}
+${task.approach_hint ? `HINT: ${task.approach_hint}` : ""}
+
+The REVIEWER agent is watching every turn. They will flag drift. Stay on target.
+
+${TOOL_CATALOG}
+
+CANDIDATES SURFACED SO FAR (orchestrator-tracked, do not forget these):
+${seenBlock}
+
+SHARED LOG (recent turns):
+${logStr}
+
+Your next action MUST be a JSON object matching one of these shapes:
+{"kind":"plan","steps":["short step 1","short step 2",...]}
+  — use on turn 1 to outline your approach. Steps must be concrete.
+{"kind":"tool_call","tool":"...","args":{...},"rationale":"why"}
+  — call a tool and see its result next turn.
+{"kind":"propose_done","fills":[{"candidate_id":"...","name":"First Last","reason":"why them"}],"rationale":"..."}
+  — propose you've met the target. fills MUST have EXACTLY ${task.target_count} entries — count twice before emitting.
+
+Strategy tip: once "CANDIDATES SURFACED SO FAR" has ≥ ${task.target_count} entries in ${task.target_city}, ${task.target_state} matching ${task.target_role}, verify ONE via the sql tool (to satisfy the reviewer's SQL-verification criterion) and then propose_done with the top ${task.target_count}. Don't keep re-searching.
+
+Respond with ONLY the JSON object. No markdown fences, no prose.`;
+}
+
+// Build the reviewer's per-turn prompt: task header, drift checklist, the
+// recent shared log, the critique JSON shape and the approval criteria.
+export function reviewerPrompt(task: TaskSpec, log: LogEntry[]): string {
+  const logStr = renderLogForPrompt(log);
+
+  // If the most recent executor action was propose_done, the reviewer
+  // must commit to an up-or-down vote this turn — "continue" would stall
+  // the orchestrator forever. The wider prompt still describes all three
+  // verdicts, but we add a hard rule at the end that the model must obey.
+  const lastExec = [...log].reverse().find(e => e.role === "executor");
+  const awaitingApproval = lastExec?.kind === "propose_done";
+
+  return `You are the REVIEWER agent. The EXECUTOR is trying to complete this task:
+
+OPERATION: ${task.operation}
+TARGET: ${task.target_count} × ${task.target_role} in ${task.target_city}, ${task.target_state}
+
+Your job: catch drift. Agents often wander from the actual objective. Specifically watch for:
+- Proposing candidates who aren't in ${task.target_city}, ${task.target_state}.
+- Proposing candidates who don't have ${task.target_role} skill.
+- Proposing fewer or more than ${task.target_count} fills.
+- Irrelevant tool calls (e.g. revenue_by_client when the task is a fill).
+
+Available tools (for reference, but YOU don't call them):
+- hybrid_search(sql_filter, question, index_name, k) — production fill path
+- sql(query) — read-only SELECT for verification
+
+SHARED LOG (recent turns):
+${logStr}
+
+Your next action MUST be a JSON object:
+{"kind":"critique","verdict":"continue" | "drift" | "approve_done","notes":"..."}
+
+- "continue" → executor is on a reasonable path, let them keep going.
+- "drift" → executor is off-track; notes MUST tell them how to redirect.
+- "approve_done" → executor's propose_done meets the criteria. Seal it.
+
+APPROVAL CRITERIA (use these only for propose_done):
+1. Exactly ${task.target_count} fills.
+2. Each fill's name appears in a prior tool_result from ${task.target_city}, ${task.target_state} matching role "${task.target_role}".
+3. Executor has SQL-verified at least one of the fills (any prior sql tool_result with that worker).
+If 1–3 all hold, return approve_done. Do not demand further verification.
+${awaitingApproval ? `
+
+HARD RULE: The executor's most recent action was propose_done. On this turn you CANNOT return "continue" — it would stall the task. Choose approve_done (proposal is valid by the 3 criteria above) or drift (it fails one; state which in notes).` : ""}
+
+Respond with ONLY the JSON object.`;
+}
+
+// Parse an agent's response into an Action, or throw.
+//
+// Takes the text between the first "{" and the last "}" of the reply
+// (after stripping a leading ``` fence), JSON-parses it, and validates the
+// shape expected for `role`. Throws Error with a truncated copy of the
+// offending text when no JSON is found, it fails to parse, or the shape
+// does not match.
+export function parseAction(raw: string, role: Role): Action {
+  // Models sometimes wrap JSON in ```json fences; strip them.
+  let s = raw.trim();
+  if (s.startsWith("```")) {
+    s = s.replace(/^```(?:json)?\n?/, "").replace(/```$/, "").trim();
+  }
+  // Find the first {...} block.
+  const start = s.indexOf("{");
+  const end = s.lastIndexOf("}");
+  if (start < 0 || end <= start) {
+    throw new Error(`no JSON object in ${role} response: ${raw.slice(0, 300)}`);
+  }
+  const json = s.slice(start, end + 1);
+  let obj: any;
+  try {
+    obj = JSON.parse(json);
+  } catch (e) {
+    throw new Error(`invalid JSON from ${role}: ${(e as Error).message} | raw: ${json.slice(0, 300)}`);
+  }
+
+  if (role === "executor") {
+    if (obj.kind === "plan" && Array.isArray(obj.steps)) return obj as Action;
+    // typeof null and typeof [] are both "object"; require a real key/value
+    // object so the tool dispatcher never receives null or an array as args.
+    const argsOk = obj.args !== null && typeof obj.args === "object" && !Array.isArray(obj.args);
+    if (obj.kind === "tool_call" && typeof obj.tool === "string" && argsOk) return obj as Action;
+    if (obj.kind === "propose_done" && Array.isArray(obj.fills)) return obj as Action;
+    throw new Error(`executor returned unexpected shape: ${JSON.stringify(obj).slice(0, 200)}`);
+  } else {
+    // Normalize: some models (qwen2.5, mistral) emit the verdict AS the
+    // `kind` field directly instead of nesting it under a "critique"
+    // wrapper. Accept both shapes rather than hard-failing — the
+    // semantic content is identical, and rejecting would stall the
+    // orchestrator on a cosmetic schema miss.
+    if (obj.kind === "critique" && ["continue", "drift", "approve_done"].includes(obj.verdict)) {
+      return obj as Action;
+    }
+    if (["continue", "drift", "approve_done"].includes(obj.kind)) {
+      return { kind: "critique", verdict: obj.kind, notes: obj.notes ?? "" } as Action;
+    }
+    throw new Error(`reviewer returned unexpected shape: ${JSON.stringify(obj).slice(0, 200)}`);
+  }
+}
diff --git a/tests/multi-agent/chain_of_custody.ts b/tests/multi-agent/chain_of_custody.ts
new file mode 100644
index 0000000..ddd1685
--- /dev/null
+++ b/tests/multi-agent/chain_of_custody.ts
@@ -0,0 +1,335 @@
+// Chain-of-custody trace test.
+//
+// J's framing: "we have enough synthetic data, we've run enough AI responses
+// saved to the database. Test true quality. Don't ignore chain of custody.
+// Use real applications. Understand each aspect of the flow — not just
+// 'write a file or directory and open it'."
+//
+// One real recruiter operation, traced end-to-end through EVERY layer of the
+// live substrate. Every layer must record the operation correctly. Any layer
+// that drops it = chain-of-custody break = surfaced as a real bug.
+//
+// Layers verified:
+//   L0  Bun /search — recruiter app surface (NOT bare /vectors/hybrid)
+//   L1  /vectors/hybrid — direct gateway (parity check vs L0)
+//   L2  /vectors/playbook_memory/stats — feedback loop count
+//   L3  Bun /log — recruiter records the pick
+//   L4  successful_playbooks — SQL-queryable table of past fills
+//   L5  /vectors/playbook_memory/stats — count grew
+//   L6  tools/audit — Phase 12 governance trail
+//   L7  /access/audit — Phase 13 access trail
+//   L8  /journal/recent — Phase 9 mutation events
+//   L9  /storage/errors — Federation error journal (no new errors)
+//   L10 /vectors/profile/{id}/activate — Phase 17 hot-swap
+//   L11 Bun /search again — boost lifts the just-logged worker
+//   L12 verifier qwen2.5 — reads cross-layer state, judges integrity
+//
+// Run: bun run tests/multi-agent/chain_of_custody.ts
+//
+// Prints per-layer BEFORE/AFTER/DELTA. Exit non-zero on any chain break.
+
+import { generate, GATEWAY } from "./agent.ts";
+
+const BUN = "http://localhost:3700";
+const PROFILE_ID = "staffing-recruiter";
+
+// The trace operation — small, deterministic, real city/role with supply.
+// Helen Sanchez (worker_id 4661) is a known Toledo Welder; we record her
+// as the manual pick the recruiter would make from the /search results.
+const OPERATION = "fill: Welder x1 in Toledo, OH";
+const OP_ROLE = "Welder";
+const OP_CITY = "Toledo";
+const OP_STATE = "OH";
+const PICKED_WORKER = "Helen Sanchez"; // verified earlier to be a Toledo OH Welder
+
+// ─────────────────────── helpers ───────────────────────
+
+// GET `url` and return the parsed JSON body, or null on any failure
+// (network error, non-2xx status, unparseable body). Best-effort by design:
+// snapshot() maps null to a sentinel instead of aborting the trace.
+async function getJSON<T = any>(url: string): Promise<T | null> {
+  try {
+    const r = await fetch(url);
+    if (!r.ok) return null;
+    // await inside the try: returning the bare promise would let a JSON
+    // parse rejection escape the catch below.
+    return (await r.json()) as T;
+  } catch { return null; }
+}
+
+// POST `body` as JSON and return the parsed response. Never throws: any
+// failure is returned as `{ _error: string }` so callers can report it as a
+// per-layer result.
+async function postJSON<T = any>(url: string, body: any): Promise<T> {
+  try {
+    const r = await fetch(url, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify(body) });
+    if (!r.ok) return { _error: `${r.status}: ${await r.text()}` } as any;
+    // Awaited for the same reason as in getJSON.
+    return (await r.json()) as T;
+  } catch (e) { return { _error: (e as Error).message } as any; }
+}
+
+// Run a SQL query through the gateway's /query/sql endpoint.
+async function sql(query: string): Promise<{ rows?: any[]; error?: string } | null> {
+  return postJSON(`${GATEWAY}/query/sql`, { sql: query });
+}
+
+// Counters read from each layer. -1 means "could not be read" for every
+// field except storage_errors, which falls back to 0.
+interface Snapshot {
+  pm_entries: number;
+  pm_names: number;
+  sp_rows: number;        // successful_playbooks_live SQL row count
+  audit_count: number;    // tools/audit count
+  access_count: number;   // access/audit count
+  journal_count: number;  // journal/stats events
+  storage_errors: number; // bucket error journal
+}
+
+// Read every layer's counter once. Requests stay sequential, as in the
+// original trace.
+async function snapshot(): Promise<Snapshot> {
+  const pm = await getJSON<any>(`${GATEWAY}/vectors/playbook_memory/stats`);
+  // successful_playbooks_live is the live SQL surface populated by /log
+  // via /vectors/playbook_memory/persist_sql. The original
+  // successful_playbooks table is now legacy/historical (no writes).
+  const sp = await sql(`SELECT COUNT(*) AS c FROM successful_playbooks_live`);
+  const audit = await getJSON<any>(`${GATEWAY}/tools/audit`);
+  const access = await getJSON<any>(`${GATEWAY}/access/audit`);
+  const journalStats = await getJSON<any>(`${GATEWAY}/journal/stats`);
+  const storageErrors = await getJSON<any>(`${GATEWAY}/storage/errors`);
+
+  return {
+    pm_entries: pm?.entries ?? -1,
+    pm_names: pm?.total_names_endorsed ?? -1,
+    sp_rows: Number(sp?.rows?.[0]?.c ?? -1),
+    audit_count: Array.isArray(audit) ? audit.length : (audit as any)?.events?.length ?? -1,
+    access_count: Array.isArray(access) ? access.length : (access as any)?.events?.length ?? (access as any)?.audit?.length ?? -1,
+    journal_count: journalStats?.event_count ?? journalStats?.total_events ?? journalStats?.events ?? -1,
+    storage_errors: Array.isArray(storageErrors) ? storageErrors.length : (storageErrors as any)?.events?.length ?? 0,
+  };
+}
+
+// Field-wise after − before. A delta is only meaningful when neither side
+// is the -1 "unreadable" sentinel; runTrace checks that before trusting it.
+function delta(b: Snapshot, a: Snapshot): Record<keyof Snapshot, number> {
+  return {
+    pm_entries: a.pm_entries - b.pm_entries,
+    pm_names: a.pm_names - b.pm_names,
+    sp_rows: a.sp_rows - b.sp_rows,
+    audit_count: a.audit_count - b.audit_count,
+    access_count: a.access_count - b.access_count,
+    journal_count: a.journal_count - b.journal_count,
+    storage_errors: a.storage_errors - b.storage_errors,
+  };
+}
+
+// One aligned "label before → after delta" row for the after-snapshot table.
+function fmtRow(label: string, b: number, a: number): string {
+  const d = a - b;
+  const dStr = d === 0 ? " · " : d > 0 ? ` +${d}` : ` ${d}`;
+  return ` ${label.padEnd(28)} ${String(b).padStart(6)} → ${String(a).padStart(6)} ${dStr}`;
+}
+
+// ─────────────────────── trace ───────────────────────
+
+interface TraceResult {
+  layer: string;
+  ok: boolean;
+  detail: string;
+}
+
+// Run the single recruiter operation through each layer, printing and
+// collecting one TraceResult per check. Layers whose name starts with
+// "CHAIN BREAK" are what main() treats as a hard failure.
+async function runTrace(): Promise<TraceResult[]> {
+  const out: TraceResult[] = [];
+  const note = (layer: string, ok: boolean, detail: string) => {
+    out.push({ layer, ok, detail });
+    console.log(` ${ok ? "✓" : "✗"} ${layer.padEnd(32)} ${detail}`);
+  };
+
+  console.log(`\n▶ Trace operation: ${OPERATION} → pick=${PICKED_WORKER}\n`);
+
+  // ── BEFORE snapshot ──
+  console.log(`▶ Before-snapshot:`);
+  const before = await snapshot();
+  console.log(` pm_entries=${before.pm_entries} pm_names=${before.pm_names} sp_rows=${before.sp_rows} ` +
+    `audit=${before.audit_count} access=${before.access_count} journal=${before.journal_count} ` +
+    `storage_errors=${before.storage_errors}\n`);
+
+  // ── L0: Bun /search ──
+  console.log(`▶ L0 — Bun /search (recruiter app surface)`);
+  const sql_filter = `role = '${OP_ROLE}' AND state = '${OP_STATE}' AND city = '${OP_CITY}'`;
+  const bunSearch = await postJSON(`${BUN}/search`, {
+    question: `${OP_ROLE} in ${OP_CITY}, ${OP_STATE}`,
+    sql_filter, top_k: 5, generate: false,
+    id_column: "worker_id", dataset: "workers_500k", use_playbook_memory: true,
+  });
+  if (bunSearch?._error) {
+    note("L0 Bun /search", false, `error: ${bunSearch._error}`);
+  } else {
+    const sources = bunSearch?.sources ?? [];
+    const boostedHits = sources.filter((s: any) => (s.playbook_boost ?? 0) > 0).length;
+    note("L0 Bun /search", true, `sources=${sources.length} boosted=${boostedHits} sql_matches=${bunSearch?.sql_matches}`);
+  }
+
+  // ── L1: direct /vectors/hybrid (parity check) ──
+  console.log(`\n▶ L1 — Direct /vectors/hybrid (parity check vs Bun)`);
+  const directSearch = await postJSON(`${GATEWAY}/vectors/hybrid`, {
+    index_name: "workers_500k_v1", filter_dataset: "workers_500k", id_column: "worker_id",
+    sql_filter, question: `${OP_ROLE} in ${OP_CITY}, ${OP_STATE}`,
+    top_k: 5, generate: false, use_playbook_memory: true, playbook_memory_k: 15,
+  });
+  const directBoosted = (directSearch?.sources ?? []).filter((s: any) => (s.playbook_boost ?? 0) > 0).length;
+  // Mirror L0: a failed gateway call must not be recorded as a pass.
+  if (directSearch?._error) {
+    note("L1 Direct /vectors/hybrid", false, `error: ${directSearch._error}`);
+  } else {
+    note("L1 Direct /vectors/hybrid", true, `boosted=${directBoosted} sql=${directSearch?.sql_matches}`);
+  }
+
+  const bunBoosted = (bunSearch?.sources ?? []).filter((s: any) => (s.playbook_boost ?? 0) > 0).length;
+  if (bunBoosted < directBoosted) {
+    note("CHAIN BREAK: Bun↔Direct parity", false,
+      `Bun=${bunBoosted} boosted vs Direct=${directBoosted}. Bun /search likely missing playbook_memory_k forward.`);
+  }
+
+  // ── L3: Bun /log (recruiter records the pick) ──
+  console.log(`\n▶ L3 — Bun /log (recruiter records the pick)`);
+  const logged = await postJSON(`${BUN}/log`, {
+    operation: OPERATION,
+    approach: "chain-of-custody trace",
+    result: `1/1 filled → ${PICKED_WORKER}`,
+    context: `client=COC-${Date.now()} start=08:00 scenario=trace`,
+  });
+  if (logged?._error) note("L3 Bun /log", false, `error: ${logged._error}`);
+  else note("L3 Bun /log", true, `logged=${logged?.logged} seeded=${logged?.seeded}`);
+
+  // The /log response carries the result of the underlying /ingest/file too.
+  // If "response" mentions "different schema" or "error", the SQL-queryable
+  // path is broken even though seed succeeded. That's a chain break.
+  // NOTE(review): the commit message says /log no longer POSTs to
+  // /ingest/file, so `response` may always be absent now — confirm whether
+  // L3a should inspect the persist_sql result instead.
+  // Only inspect the ingest response when /log itself succeeded; otherwise
+  // the empty string would be reported as "ingest accepted".
+  if (!logged?._error) {
+    const logResp = String((logged as any)?.response ?? "");
+    if (logResp.includes("error") || logResp.includes("different schema") || logResp.includes("Error")) {
+      note("CHAIN BREAK: Bun /log → SQL ingest", false,
+        `successful_playbooks ingest failed. Bun returned logged=true but /log's underlying ingest reported: ${logResp.slice(0, 150)}`);
+    } else {
+      note("L3a /log → /ingest/file", true, "ingest accepted");
+    }
+  }
+
+  // Give the system a beat for any async fan-out (audit/journal/etc).
+  await new Promise(r => setTimeout(r, 500));
+
+  // ── AFTER snapshot ──
+  console.log(`\n▶ After-snapshot:`);
+  const after = await snapshot();
+  const d = delta(before, after);
+  console.log(fmtRow("playbook_memory.entries", before.pm_entries, after.pm_entries));
+  console.log(fmtRow("playbook_memory.names", before.pm_names, after.pm_names));
+  console.log(fmtRow("successful_playbooks.rows", before.sp_rows, after.sp_rows));
+  console.log(fmtRow("tools/audit.count", before.audit_count, after.audit_count));
+  console.log(fmtRow("access/audit.count", before.access_count, after.access_count));
+  console.log(fmtRow("journal.events", before.journal_count, after.journal_count));
+  console.log(fmtRow("storage/errors.count", before.storage_errors, after.storage_errors));
+
+  // ── L5: playbook_memory grew? ──
+  // A -1 sentinel on either side makes the delta meaningless (e.g. -1 → 33
+  // would look like +34), so an unreadable counter is a failure, not growth.
+  if (before.pm_entries < 0 || after.pm_entries < 0) {
+    note("L5 playbook_memory growth", false,
+      `playbook_memory stats unreadable (before=${before.pm_entries} after=${after.pm_entries})`);
+  } else if (d.pm_entries === 1) note("L5 playbook_memory growth", true, "+1 entry as expected");
+  else note("L5 playbook_memory growth", d.pm_entries > 0,
+    `delta=${d.pm_entries} (expected 1 — seed-after-log path)`);
+
+  // ── L4: successful_playbooks SQL row appeared? ──
+  if (before.sp_rows < 0 || after.sp_rows < 0) {
+    note("L4 successful_playbooks SQL", false,
+      `row count unreadable (before=${before.sp_rows} after=${after.sp_rows})`);
+  } else if (d.sp_rows >= 1) note("L4 successful_playbooks SQL", true, `+${d.sp_rows} row(s)`);
+  else note("L4 successful_playbooks SQL", false,
+    `delta=${d.sp_rows} — Bun /log claims success but SQL table didn't grow. Recruiter querying via SQL would miss this fill.`);
+
+  // ── L9: storage errors stayed quiet ──
+  if (d.storage_errors === 0) note("L9 storage error journal", true, "no new bucket op errors");
+  else note("L9 storage error journal", false, `+${d.storage_errors} new errors`);
+
+  // ── L10: Phase 17 profile activation ──
+  console.log(`\n▶ L10 — Activate profile ${PROFILE_ID}`);
+  const act = await postJSON(`${GATEWAY}/vectors/profile/${PROFILE_ID}/activate`, {});
+  if (act?._error) note("L10 profile activation", false, `error: ${act._error}`);
+  else note("L10 profile activation", true,
+    `warmed=${(act?.warmed_indexes ?? []).length} duration_ms=${act?.duration_ms ?? "?"}`);
+
+  // ── L11: Bun /search again — boost should now lift PICKED_WORKER ──
+  console.log(`\n▶ L11 — Bun /search second time (boost lift verification)`);
+  const search2 = await postJSON(`${BUN}/search`, {
+    question: `${OP_ROLE} in ${OP_CITY}, ${OP_STATE}`,
+    sql_filter, top_k: 10, generate: false,
+    id_column: "worker_id", dataset: "workers_500k", use_playbook_memory: true,
+  });
+  const sources2 = search2?.sources ?? [];
+  const pickedHit = sources2.find((s: any) => String(s.chunk_text ?? "").includes(PICKED_WORKER));
+  if (!pickedHit) {
+    note("L11 boost lifts logged pick (Bun)", false,
+      `${PICKED_WORKER} not in top-10 via Bun /search. Could be Bun-not-forwarding-playbook_memory_k bug from L1.`);
+  } else if ((pickedHit.playbook_boost ?? 0) > 0) {
+    note("L11 boost lifts logged pick (Bun)", true,
+      `${PICKED_WORKER} boost=+${(pickedHit.playbook_boost as number).toFixed(3)} cites=${(pickedHit.playbook_citations ?? []).length}`);
+  } else {
+    note("L11 boost lifts logged pick (Bun)", false,
+      `${PICKED_WORKER} present but boost=0 — playbook_memory_k forward bug likely`);
+  }
+
+  // Same probe via direct gateway to isolate Bun vs gateway
+  const direct2 = await postJSON(`${GATEWAY}/vectors/hybrid`, {
+    index_name: "workers_500k_v1", filter_dataset: "workers_500k", id_column: "worker_id",
+    sql_filter, question: `${OP_ROLE} in ${OP_CITY}, ${OP_STATE}`,
+    top_k: 10, generate: false, use_playbook_memory: true, playbook_memory_k: 15,
+  });
+  const sources2d = direct2?.sources ?? [];
+  const pickedHitD = sources2d.find((s: any) => String(s.chunk_text ?? "").includes(PICKED_WORKER));
+  if (pickedHitD && (pickedHitD.playbook_boost ?? 0) > 0) {
+    note("L11b boost via direct gateway", true,
+      `${PICKED_WORKER} boost=+${(pickedHitD.playbook_boost as number).toFixed(3)} cites=${(pickedHitD.playbook_citations ?? []).length}`);
+  } else {
+    note("L11b boost via direct gateway", false, `direct call also did not boost ${PICKED_WORKER}`);
+  }
+
+  return out;
+}
+
+// ─────────────────────── verifier (fresh agent) ───────────────────────
+
+// Ask a fresh qwen2.5 agent to judge the per-layer trace. Never throws:
+// model or parse failures come back as a verdict string with confidence 0.
+async function verifierJudgment(trace: TraceResult[]): Promise<{ verdict: string; confidence: number }> {
+  const summary = trace.map(t => ` ${t.ok ? "ok" : "FAIL"} ${t.layer}: ${t.detail}`).join("\n");
+  // The verdict placeholder must be non-empty: an example of "" invites the
+  // model to echo an empty verdict.
+  const prompt = `You are the CHAIN-OF-CUSTODY VERIFIER agent. A real recruiter operation was just
+traced through every layer of the staffing substrate. Read the per-layer results and judge
+whether the system kept chain of custody intact (every layer recorded the operation as
+expected) or where it broke.
+
+Per-layer trace:
+${summary}
+
+Reply with ONE JSON object only:
+{"verdict": "<one or two sentences naming any layer that broke>", "confidence": 0-100}
+
+Be specific about which layer broke if any. confidence is how sure you are about the verdict.`;
+
+  try {
+    const raw = await generate("qwen2.5:latest", prompt, { temperature: 0.1, max_tokens: 200 });
+    const start = raw.indexOf("{"), end = raw.lastIndexOf("}");
+    if (start < 0 || end <= start) return { verdict: "verifier could not produce JSON", confidence: 0 };
+    const j = JSON.parse(raw.slice(start, end + 1));
+    // Clamp to the 0-100 range the prompt asks for; non-numeric → 0.
+    const conf = Number(j.confidence);
+    const confidence = Number.isFinite(conf) ? Math.min(100, Math.max(0, conf)) : 0;
+    return { verdict: j.verdict ?? "no verdict", confidence };
+  } catch (e) {
+    return { verdict: `verifier error: ${(e as Error).message}`, confidence: 0 };
+  }
+}
+
+// ─────────────────────── main ───────────────────────
+
+// Run the trace, get the verifier's opinion, print the summary. Exits 1
+// only on an explicit "CHAIN BREAK" layer; other failed layers are reported
+// as non-blocking and exit 0.
+async function main() {
+  console.log(`▶ Chain-of-custody trace — single real recruiter operation through every layer`);
+
+  const trace = await runTrace();
+
+  console.log(`\n▶ L12 — Verifier (fresh qwen2.5 agent reads the cross-layer trace)`);
+  const v = await verifierJudgment(trace);
+  console.log(` verdict (${v.confidence}%): ${v.verdict}`);
+
+  // Hard gate: any explicit CHAIN BREAK note = fail
+  const breaks = trace.filter(t => !t.ok && t.layer.startsWith("CHAIN BREAK"));
+  const fails = trace.filter(t => !t.ok);
+
+  console.log(`\n▶ Summary:`);
+  console.log(` passing layers: ${trace.filter(t => t.ok).length}/${trace.length}`);
+  console.log(` chain breaks: ${breaks.length}`);
+  console.log(` total failures: ${fails.length}`);
+  console.log(` verifier confidence: ${v.confidence}%`);
+
+  if (breaks.length > 0) {
+    console.log(`\n✗ Chain of custody BROKEN at ${breaks.length} layer(s):`);
+    for (const b of breaks) console.log(` - ${b.layer}: ${b.detail}`);
+    process.exit(1);
+  }
+  if (fails.length > 0) {
+    console.log(`\n◑ Trace completed with ${fails.length} non-blocking failures (no formal chain break)`);
+    process.exit(0);
+  }
+  console.log(`\n✓ Chain of custody intact across all layers`);
+  process.exit(0);
+}
+
+main().catch(e => {
+  console.error(`\n✗ ${(e as Error).message}`);
+  if ((e as any).stack) console.error((e as any).stack);
+  process.exit(1);
+});
diff --git a/tests/multi-agent/network_proving.ts b/tests/multi-agent/network_proving.ts
new file mode 100644
index 0000000..c488488
--- /dev/null
+++ b/tests/multi-agent/network_proving.ts
@@ -0,0 +1,469 @@
+// Network proving: continuous build → verify → repeat with hot-swap profile.
+// +// J's framing: "have them guide each other, when the test is complete we have +// a successful playbook, then spin up another agent that tests the viability +// of our network with the playbook and the hot-swap profile. Keep spinning up +// agents and testing — pass theory, real-world execution, not isolated unit +// tests." +// +// Each round = TWO phases: +// +// 1. BUILD phase. Two agents (mistral executor + qwen2.5 reviewer) work +// on a real staffing fill task. They guide each other via the critique +// loop. On consensus → seal a playbook with CANONICAL short seed text +// (the Pass 1 lesson — verbose seeds silently kill boost). Real Ollama, +// real workers_500k, real /vectors/hybrid path. +// +// 2. VERIFY phase. A FRESH qwen2.5 agent spins up, activates the +// staffing-recruiter profile (Phase 17 hot-swap), runs a probe query +// against the same network, and judges from the live response whether +// prior rounds' playbooks actually surface relevant workers higher. +// The verifier writes a verdict: did the network learn? +// +// Three rounds, progressively harder: +// R0: Welder x2 in Toledo, OH — baseline +// R1: Welder x2 in Cleveland, OH — same role, different city +// → tests geo discrimination +// (Toledo workers MUST NOT +// bleed into Cleveland boost) +// R2: Welder x3 in Toledo, OH — re-fill same city, bigger +// count → tests compounding +// (R0's endorsements should +// still rank up here) +// +// Run: bun run tests/multi-agent/network_proving.ts +// +// Fail-fast: any HTTP error or model crash bubbles to top-level, exits 1. 
+ +import { + type LogEntry, + type TaskSpec, + type Action, + type Fill, + GATEWAY, + generate, + parseAction, + executorPrompt, + reviewerPrompt, + sqlQuery, + callTool, +} from "./agent.ts"; + +const EXECUTOR_MODEL = "mistral:latest"; +const REVIEWER_MODEL = "qwen2.5:latest"; +const VERIFIER_MODEL = "qwen2.5:latest"; +const PROFILE_ID = "staffing-recruiter"; +const INDEX_NAME = "workers_500k_v1"; +const MAX_TURNS = 12; +const MAX_TOOL_ERRORS = 3; +const MAX_DRIFTS = 3; + +const TASK_DECK: TaskSpec[] = [ + { + id: "R0", operation: "fill: Welder x2 in Toledo, OH", + target_role: "Welder", target_count: 2, target_city: "Toledo", target_state: "OH", + approach_hint: "hybrid_search workers_500k_v1 with sql_filter role+state+city, then sql verify", + }, + { + id: "R1", operation: "fill: Welder x2 in Cleveland, OH", + target_role: "Welder", target_count: 2, target_city: "Cleveland", target_state: "OH", + approach_hint: "hybrid_search workers_500k_v1 with sql_filter role+state+city, then sql verify", + }, + { + id: "R2", operation: "fill: Welder x3 in Toledo, OH", + target_role: "Welder", target_count: 3, target_city: "Toledo", target_state: "OH", + approach_hint: "hybrid_search workers_500k_v1 with sql_filter role+state+city, then sql verify", + }, +]; + +interface BuildResult { + ok: boolean; + task: TaskSpec; + fills: Fill[]; + turns: number; + duration_secs: number; + playbook_id?: string; + entries_after_seed?: number; + error?: string; +} + +interface VerifyResult { + profile_activated: boolean; + warmed_indexes: number; + probe_boost_total: number; // sum of playbook_boost across top-K + probe_boosted_hits: number; // how many hits had boost > 0 + probe_top_citations: string[]; // playbook_ids cited + geo_discrimination_ok: boolean; // when prior playbook is in different city, boost should NOT bleed + verdict: string; // qwen2.5's natural-language judgment + confidence: number; // 0-100 self-rated + duration_secs: number; +} + +interface RoundLedger { + round: 
number; + task: TaskSpec; + build: BuildResult; + verify: VerifyResult; + score: number; // /10 per round + notes: string[]; +} + +// ─────────────────────── BUILD phase (two-agent loop) ─────────────────────── + +async function executeToolCall(name: string, args: Record): Promise { + if (name === "hybrid_search") { + const { sql_filter, question, index_name, k } = args; + if (!sql_filter || !question || !index_name) { + throw new Error(`hybrid_search needs sql_filter+question+index_name, got keys=${Object.keys(args).join(",")}`); + } + const r = await fetch(`${GATEWAY}/vectors/hybrid`, { + method: "POST", headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + sql_filter, question, index_name, + filter_dataset: "workers_500k", id_column: "worker_id", + top_k: k ?? 10, generate: false, use_playbook_memory: true, + }), + }); + if (!r.ok) throw new Error(`hybrid → ${r.status}: ${await r.text()}`); + return r.json(); + } + if (name === "sql") { + if (!args.query) throw new Error("sql needs query"); + if (!/^\s*SELECT/i.test(args.query)) throw new Error("sql allows SELECT only"); + return sqlQuery(args.query); + } + return callTool(name, args); +} + +function trim(r: any) { + if (r && Array.isArray(r.rows)) return { ...r, rows: r.rows.slice(0, 20) }; + if (r && Array.isArray(r.sources)) return { ...r, sources: r.sources.slice(0, 12) }; + return r; +} + +function fmtTurn(prefix: string, e: Omit): string { + const c: any = e.content ?? {}; + const head = `[${prefix} t${e.turn.toString().padStart(2, "0")} ${e.role.padEnd(8)} ${e.kind.padEnd(14)}]`; + if (e.kind === "tool_call") return `${head} ${c.tool}(${JSON.stringify(c.args ?? 
// BUILD phase for one round: drives an executor/reviewer pair until they
// reach consensus on a set of fills, then seeds playbook_memory with a
// short canonical description of the sealed playbook.
//
// Parameters:
//   task   — the staffing task being filled (role, city, state, target_count).
//   prefix — label passed to fmtTurn and used in the seed log lines.
//
// Returns the build record for the round. The function never throws: any
// error raised inside the loop (parse failure, too many tool errors or
// drifts, malformed consensus, turn cap reached) is converted into an
// `{ ok: false, error }` record. A failed seed request is only warned about
// and leaves `playbook_id` / `entries_after_seed` undefined.
//
// The return type is spelled as RoundLedger["build"] because that is the
// slot main() assigns this result to.
async function buildPhase(task: TaskSpec, prefix: string): Promise<RoundLedger["build"]> {
  const t0 = Date.now();
  const log: LogEntry[] = [];
  let turn = 0, sealed: { fills: Fill[]; approach: string } | null = null;
  let toolErrors = 0, drifts = 0;

  // Stamp the entry with the current time, add it to the shared log (which
  // both prompts are built from) and echo a one-line summary to stdout.
  const append = (e: Omit<LogEntry, "at">): LogEntry => {
    const full: LogEntry = { ...e, at: new Date().toISOString() };
    log.push(full);
    console.log(fmtTurn(prefix, e));
    return full;
  };

  try {
    while (turn < MAX_TURNS && !sealed) {
      turn += 1;

      // Executor turn: one action per turn, derived from the log so far.
      const execRaw = await generate(EXECUTOR_MODEL, executorPrompt(task, log), { temperature: 0.2, max_tokens: 600 });
      const execAction = parseAction(execRaw, "executor");
      append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: execAction.kind as any, content: execAction });

      if (execAction.kind === "tool_call") {
        try {
          const r = await executeToolCall(execAction.tool, execAction.args);
          append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "tool_result", content: trim(r) });
          toolErrors = 0;
        } catch (e) {
          // Tool failures are logged as a tool_result so the executor can
          // see its own error on the next turn; only a streak aborts.
          append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "tool_result",
            content: { error: (e as Error).message, tool: execAction.tool, args: execAction.args } });
          toolErrors += 1;
          if (toolErrors >= MAX_TOOL_ERRORS) throw new Error(`${MAX_TOOL_ERRORS} consecutive tool errors`);
        }
      }

      // Reviewer turn: sees the log including the executor's latest entry.
      const revRaw = await generate(REVIEWER_MODEL, reviewerPrompt(task, log), { temperature: 0.1, max_tokens: 400 });
      const revAction = parseAction(revRaw, "reviewer");
      append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "critique", content: revAction });

      if (revAction.kind !== "critique") throw new Error(`reviewer non-critique`);
      if (revAction.verdict === "drift") {
        drifts += 1;
        if (drifts >= MAX_DRIFTS) throw new Error(`${MAX_DRIFTS} consecutive drifts`);
      } else drifts = 0;

      // Consensus requires both sides in the same turn: the executor
      // proposed done and the reviewer approved it.
      if (execAction.kind === "propose_done" && revAction.verdict === "approve_done") {
        if (execAction.fills.length !== task.target_count) {
          throw new Error(`fills=${execAction.fills.length} target=${task.target_count}`);
        }
        append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "consensus_done", content: { fills: execAction.fills } });
        sealed = { fills: execAction.fills, approach: (execAction as any).rationale ?? "multi-agent" };
      }
    }

    if (!sealed) throw new Error(`no consensus after ${MAX_TURNS} turns`);

    // Phase 19 seed — CANONICAL short text (Pass 1 lesson). The verbose
    // executor rationale stays out of the embedding; we keep a separate
    // human-readable record in the playbook log.
    const canonicalApproach = `${task.target_role.toLowerCase()} fill via hybrid search`;
    const canonicalContext = `${task.target_role} fill in ${task.target_city}, ${task.target_state}`;
    let playbook_id: string | undefined;
    let entries_after_seed: number | undefined;
    try {
      const sr = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
        method: "POST", headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
          operation: task.operation,
          approach: canonicalApproach,
          context: canonicalContext,
          endorsed_names: sealed.fills.map(f => f.name),
          append: true,
        }),
      });
      if (sr.ok) {
        const j = await sr.json() as any;
        playbook_id = j.playbook_id;
        entries_after_seed = j.entries_after;
        console.log(`[${prefix}] ↳ seeded id=${playbook_id} entries=${entries_after_seed}`);
      } else {
        console.warn(`[${prefix}] ↳ seed failed: ${sr.status} ${await sr.text()}`);
      }
    } catch (e) {
      // Seeding is best-effort: the build still counts as sealed.
      console.warn(`[${prefix}] ↳ seed errored: ${(e as Error).message}`);
    }

    return {
      ok: true, task, fills: sealed.fills, turns: turn,
      duration_secs: Math.round((Date.now() - t0) / 1000),
      playbook_id, entries_after_seed,
    };
  } catch (e) {
    return {
      ok: false, task, fills: [], turns: turn,
      duration_secs: Math.round((Date.now() - t0) / 1000),
      error: (e as Error).message,
    };
  }
}
// Runs one hybrid query for `task` with playbook memory enabled and
// summarises how much boost came back.
//
// Returns:
//   sources     — the raw `sources` array from the response ([] if absent)
//   boostedHits — how many sources carry playbook_boost > 0
//   totalBoost  — sum of playbook_boost over all returned sources
//   cites       — up to 5 distinct playbook citations
//   topNames    — display label for the first 5 sources
//
// Throws if the gateway answers with a non-2xx status.
async function probeWithBoost(task: TaskSpec) {
  // Every interpolated value goes through the same quote-doubling. The
  // state used to be interpolated raw, unlike role and city.
  const quote = (value: string) => String(value).replace(/'/g, "''");
  const sql_filter = `role = '${quote(task.target_role)}' `
    + `AND state = '${quote(task.target_state)}' `
    + `AND city = '${quote(task.target_city)}'`;
  const r = await fetch(`${GATEWAY}/vectors/hybrid`, {
    method: "POST", headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      index_name: INDEX_NAME, filter_dataset: "workers_500k", id_column: "worker_id",
      sql_filter, question: `${task.target_role} in ${task.target_city}, ${task.target_state}`,
      top_k: 10, generate: false, use_playbook_memory: true, playbook_memory_k: 15,
    }),
  });
  if (!r.ok) throw new Error(`probe → ${r.status}: ${await r.text()}`);
  const j = (await r.json()) as any;
  const sources: any[] = j.sources ?? [];
  const boostedHits = sources.filter(s => (s.playbook_boost ?? 0) > 0).length;
  const totalBoost = sources.reduce((s, x) => s + (x.playbook_boost ?? 0), 0);
  const cites = Array.from(new Set(sources.flatMap(s => s.playbook_citations ?? []))).slice(0, 5);
  const topNames = sources.slice(0, 5).map(s => {
    // The label is the chunk text before the first em dash. split()[0] is
    // always a string, so the doc_id fallback must test for emptiness;
    // a `??` here never fires.
    const label = String(s.chunk_text ?? "").split("—")[0].trim();
    return label || s.doc_id;
  });
  return { sources, boostedHits, totalBoost, cites, topNames };
}
A fresh round just sealed a playbook on a real staffing +substrate. Your job: judge whether the system learned from prior rounds. + +CURRENT ROUND: + task: ${task.operation} + in city: ${task.target_city}, ${task.target_state} + +PRIOR PLAYBOOKS (in playbook_memory): +${priorBlock} + +I activated the staffing-recruiter profile and ran a hybrid query for this exact task with +use_playbook_memory=true. Live result from the substrate: + - top-5 surfaced workers: ${probe.topNames.join(", ")} + - hits with non-zero playbook_boost: ${probe.boostedHits} / 10 + - total boost across top-10: ${probe.totalBoost.toFixed(3)} + - playbook citations: [${probe.cites.join(", ")}] + +JUDGE: +1. If a prior playbook covered this same city + role, the boost should fire on the workers + it endorsed (boostedHits > 0, citations non-empty). +2. If no prior playbook covers this combo, boost should be ~0 — that means the system is + correctly NOT bleeding endorsements across geos. +3. Anything in between (e.g. some boost but for the wrong reason) is a partial pass. + +Respond with ONE JSON object only: +{"learned": true|false, "verdict": "", "confidence": 0-100} + +learned=true means the network behaved as expected for this round (whether that's "boost fired +because it should" or "boost stayed zero because it should"). learned=false means the system +either failed to learn from a relevant prior playbook OR bled an irrelevant one. 
confidence is +how sure you are.`; +} + +async function verifyPhase(task: TaskSpec, ledger: RoundLedger[]): Promise { + const t0 = Date.now(); + const act = await activateProfile(); + const probe = await probeWithBoost(task); + + // Decide what counts as geo-correct based on prior playbooks + const priorMatchesThisGeo = ledger.some(r => + r.build.ok && + r.task.target_city === task.target_city && + r.task.target_state === task.target_state && + r.task.target_role === task.target_role + ); + const priorOtherGeo = ledger.some(r => + r.build.ok && + r.task.target_role === task.target_role && + !(r.task.target_city === task.target_city && r.task.target_state === task.target_state) + ); + + let geo_discrimination_ok: boolean; + if (priorMatchesThisGeo) { + geo_discrimination_ok = probe.boostedHits > 0; // expected lift + } else if (priorOtherGeo) { + geo_discrimination_ok = probe.boostedHits === 0; // must NOT bleed + } else { + geo_discrimination_ok = true; // no signal expected either way + } + + // Spin up the fresh verifier agent + const priorPlaybooks = ledger.filter(r => r.build.ok).map(r => ({ + op: r.task.operation, fills: r.build.fills.map(f => f.name), + })); + + let verdict = "verifier failed to respond"; let confidence = 0; + try { + const raw = await generate(VERIFIER_MODEL, verifierPrompt(task, priorPlaybooks, probe), { + temperature: 0.1, max_tokens: 250, + }); + const start = raw.indexOf("{"), end = raw.lastIndexOf("}"); + if (start >= 0 && end > start) { + const j = JSON.parse(raw.slice(start, end + 1)); + verdict = j.verdict ?? 
// Turns one round's ledger entry into a score out of 10 plus the list of
// human-readable notes explaining each component:
//   build sealed        3
//   playbook seeded     1
//   profile activated   1
//   geo discrimination  3
//   verifier confidence 2 (needs >= 60)
// Failed build / geo / confidence checks add a note but no points; a
// missing seed or profile activation adds nothing at all.
function scoreRound(r: RoundLedger): { score: number; notes: string[] } {
  const { build, verify } = r;
  const notes: string[] = [];
  let score = 0;

  // Record a note and, when points > 0, credit them to the round.
  const award = (points: number, note: string): void => {
    score += points;
    notes.push(note);
  };

  if (build.ok) {
    const sealedNames = build.fills.map(fill => fill.name).join(", ");
    award(3, `✓ build sealed (${sealedNames})`);
  } else {
    award(0, `✗ build failed: ${build.error}`);
  }

  if (build.playbook_id) {
    award(1, `✓ seeded id=${build.playbook_id}`);
  }

  if (verify.profile_activated) {
    award(1, `✓ profile activated (warmed=${verify.warmed_indexes})`);
  }

  if (verify.geo_discrimination_ok) {
    award(3, `✓ geo discrimination correct (boostedHits=${verify.probe_boosted_hits})`);
  } else {
    award(0, `✗ geo discrimination failed (boostedHits=${verify.probe_boosted_hits})`);
  }

  if (verify.confidence >= 60) {
    award(2, `✓ verifier confident (${verify.confidence}%): ${verify.verdict}`);
  } else {
    award(0, `◑ verifier confidence ${verify.confidence}%: ${verify.verdict}`);
  }

  return { score, notes };
}
(two agents collaborating)`); + const build = await buildPhase(task, task.id); + + console.log(`\n[${task.id}] VERIFY phase (fresh agent + hot-swap profile)`); + const verify = await verifyPhase(task, ledger); + console.log(` profile=${verify.profile_activated ? "ok" : "fail"} warmed=${verify.warmed_indexes} ` + + `boosted=${verify.probe_boosted_hits}/10 totalBoost=${verify.probe_boost_total.toFixed(3)} ` + + `cites=${verify.probe_top_citations.length} confidence=${verify.confidence}%`); + console.log(` verdict: ${verify.verdict}`); + + const round: RoundLedger = { round: i, task, build, verify, score: 0, notes: [] }; + const sc = scoreRound(round); + round.score = sc.score; round.notes = sc.notes; + ledger.push(round); + + console.log(`\n Round ${i} score: ${round.score}/10`); + for (const n of round.notes) console.log(` ${n}`); + } + + console.log(`\n══════════ Network viability summary ══════════`); + const total = ledger.reduce((s, r) => s + r.score, 0); + const max = ledger.length * 10; + const avg = total / ledger.length; + for (const r of ledger) console.log(` R${r.round} ${r.task.target_city.padEnd(10)} ${r.task.target_role.padEnd(20)} ${r.score}/10`); + console.log(`\n TOTAL: ${total}/${max} AVG: ${avg.toFixed(1)}/10`); + + // Hard gate: at least 2/3 rounds must show the verifier is confident enough + // AND build phase succeeded + const passed = ledger.filter(r => r.build.ok && r.score >= 6).length; + if (passed < Math.ceil(ledger.length * 2 / 3)) { + throw new Error(`network proving gate failed — only ${passed}/${ledger.length} rounds passed (need ≥${Math.ceil(ledger.length * 2 / 3)})`); + } + console.log(`\n✓ Network proven over ${passed}/${ledger.length} rounds`); + process.exit(0); +} + +main().catch(e => { + console.error(`\n✗ ${(e as Error).message}`); + if ((e as any).stack) console.error((e as any).stack); + process.exit(1); +}); diff --git a/tests/multi-agent/orchestrator.ts b/tests/multi-agent/orchestrator.ts new file mode 100644 index 
0000000..f73c68c --- /dev/null +++ b/tests/multi-agent/orchestrator.ts @@ -0,0 +1,302 @@ +// Two-agent orchestrator. Both agents run as concurrent async loops +// coordinated through a shared in-memory log; one turn of executor then +// one turn of reviewer, interleaved until consensus_done, drift-cycle +// blown, or hard turn cap. On success writes a playbook JSON; on failure +// exits non-zero with the full log for inspection. +// +// Fail-fast: every caught error is appended to the log AND rethrown, so +// the orchestrator top-level catches, dumps, and exits with code 1. The +// test harness reads the exit code to decide if the substrate is healthy. + +import { + type LogEntry, + type TaskSpec, + type Action, + type Fill, + callTool, + hybridSearch, + sqlQuery, + generate, + parseAction, + executorPrompt, + reviewerPrompt, + GATEWAY, +} from "./agent.ts"; +import { mkdir, writeFile } from "node:fs/promises"; +import { join } from "node:path"; + +const EXECUTOR_MODEL = "mistral:latest"; +const REVIEWER_MODEL = "qwen2.5:latest"; +const MAX_TURNS = 12; // executor turns; reviewer gets one per +const MAX_CONSECUTIVE_DRIFTS = 3; // drift-cycle blown → give up + +// Default task. Override via argv[2] if you want something else; see +// `parseTaskFromArg`. Picked from the real-world staffing pattern but +// not in the existing successful_playbooks list — this is a fresh fill. +// Default task lifted from the production pattern in successful_playbooks. +// Toledo, OH has 342 welders in workers_500k so supply is ample — the test +// is about collaboration and drift correction, not needle-in-haystack. 
// Builds the TaskSpec for this run from process.argv[2].
//
//   (no argument)  → DEFAULT_TASK
//   path/to/x.json → the parsed JSON file, returned as-is
//   "role:Welder count:2 city:Columbus state:OH [hint:...]" → ad-hoc task
//
// In the key:value form a token without a colon continues the previous
// value, so multi-word roles and cities ("role:Forklift Operator",
// "city:New York") work. Throws an Error when role, count, city or state is
// missing, or when count is not a positive integer. Previously those cases
// produced a task like "fill: undefined xNaN in undefined, undefined".
function parseTaskFromArg(): TaskSpec {
  const arg = process.argv[2];
  if (!arg) return DEFAULT_TASK;
  // Accept "role:Welder count:2 city:Columbus state:OH" style for ad-hoc
  // tasks without standing up a JSON file. Anything more complex, feed
  // it a JSON path.
  if (arg.endsWith(".json")) {
    return JSON.parse(require("node:fs").readFileSync(arg, "utf-8"));
  }

  const kv: Record<string, string> = {};
  let currentKey: string | null = null;
  for (const token of arg.trim().split(/\s+/)) {
    const sep = token.indexOf(":");
    if (sep > 0) {
      // "key:value" — anything after the first colon belongs to the value.
      currentKey = token.slice(0, sep);
      kv[currentKey] = token.slice(sep + 1);
    } else if (currentKey !== null) {
      // Bare word: part of the previous key's multi-word value.
      kv[currentKey] = `${kv[currentKey]} ${token}`.trim();
    }
  }

  const missing = ["role", "count", "city", "state"].filter(key => !kv[key]);
  if (missing.length > 0) {
    throw new Error(`task spec is missing ${missing.join(", ")} — expected "role:Welder count:2 city:Columbus state:OH", got "${arg}"`);
  }
  const count = Number(kv.count);
  if (!Number.isInteger(count) || count <= 0) {
    throw new Error(`task spec count must be a positive integer, got "${kv.count}"`);
  }

  return {
    id: `task-${Date.now()}`,
    operation: `fill: ${kv.role} x${kv.count} in ${kv.city}, ${kv.state}`,
    target_role: kv.role,
    target_count: count,
    target_city: kv.city,
    target_state: kv.state,
    approach_hint: kv.hint ?? "hybrid search",
  };
}
// Execute one tool call. The tool catalog in the prompt lists both the
// registered Phase 12 tools AND a pseudo-tool "hybrid_search" for the
// /vectors/hybrid endpoint — unify here so the executor doesn't need to
// know which surface a capability lives on.
//
//   "hybrid_search" → POST ${GATEWAY}/vectors/hybrid; needs sql_filter,
//                     question and index_name (k is optional, default 10)
//   "sql"           → sqlQuery(query); only statements starting with SELECT
//   anything else   → callTool(name, args)
//
// Throws on missing/invalid arguments and on a non-2xx gateway response, so
// the caller can log the failure as a tool_result for the executor to read.
async function executeToolCall(name: string, args: Record<string, any>): Promise<any> {
  // Models occasionally omit `args` altogether; treat that as "no args"
  // so the validation errors below fire instead of a destructuring TypeError.
  const safeArgs: Record<string, any> = args ?? {};

  if (name === "hybrid_search") {
    const { sql_filter, question, index_name, k } = safeArgs;
    if (!sql_filter || !question || !index_name) {
      throw new Error(`hybrid_search needs sql_filter + question + index_name, got ${JSON.stringify(args)}`);
    }
    // Pass through to /vectors/hybrid. NOTE(review): id_column is not sent
    // here; the original comment says the server defaults it to worker_id.
    const body: any = { sql_filter, question, index_name, top_k: k ?? 10, generate: false };
    // Use the shared GATEWAY base (same as the playbook_memory seed call in
    // this file) rather than a hard-coded host.
    const res = await fetch(`${GATEWAY}/vectors/hybrid`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(body),
    });
    // Surface HTTP failures explicitly instead of handing an error payload
    // (or a JSON parse failure) to the executor as if it were a result.
    if (!res.ok) throw new Error(`hybrid_search → ${res.status}: ${await res.text()}`);
    return res.json();
  }

  if (name === "sql") {
    const { query } = safeArgs;
    if (!query || typeof query !== "string") throw new Error(`sql needs query (string), got ${JSON.stringify(args)}`);
    // NOTE(review): this is only a prefix check; it does not stop a second
    // statement after a semicolon. Confirm the SQL endpoint is read-only.
    if (!/^\s*SELECT/i.test(query)) throw new Error(`sql tool allows SELECT only: ${query}`);
    return sqlQuery(query);
  }

  // Fall through to Phase 12 registry for any other named tool.
  return callTool(name, safeArgs);
}
+ if (execAction.kind === "tool_call") { + try { + const result = await executeToolCall(execAction.tool, execAction.args); + const trimmed = trimResult(result); + append({ turn, role: "executor", model: EXECUTOR_MODEL, + kind: "tool_result", content: trimmed }); + } catch (e) { + append({ turn, role: "executor", model: EXECUTOR_MODEL, + kind: "tool_result", + content: { error: (e as Error).message, tool: execAction.tool, args: execAction.args } }); + // Count as a soft drift — if the executor keeps throwing tool + // errors, consecutiveDrifts still trips the abort. + consecutiveDrifts += 1; + if (consecutiveDrifts >= MAX_CONSECUTIVE_DRIFTS) { + throw new Error(`aborting — ${MAX_CONSECUTIVE_DRIFTS} consecutive tool errors, executor can't self-correct`); + } + } + } + + // --- REVIEWER TURN --- + const revRaw = await generate(REVIEWER_MODEL, reviewerPrompt(task, log), { + temperature: 0.1, + max_tokens: 400, + }); + let revAction: Action; + try { + revAction = parseAction(revRaw, "reviewer"); + } catch (e) { + append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "error", + content: { message: (e as Error).message, raw: revRaw.slice(0, 400) } }); + throw e; + } + append({ turn, role: "reviewer", model: REVIEWER_MODEL, + kind: "critique", content: revAction }); + + if (revAction.kind !== "critique") throw new Error(`reviewer emitted non-critique: ${revAction.kind}`); + + if (revAction.verdict === "drift") { + consecutiveDrifts += 1; + if (consecutiveDrifts >= MAX_CONSECUTIVE_DRIFTS) { + throw new Error(`aborting — ${MAX_CONSECUTIVE_DRIFTS} consecutive drift flags, executor can't self-correct`); + } + } else { + consecutiveDrifts = 0; + } + + // Consensus: executor proposed done AND reviewer approved. 
+ if (execAction.kind === "propose_done" && revAction.verdict === "approve_done") { + if (execAction.fills.length !== task.target_count) { + throw new Error(`consensus malformed — ${execAction.fills.length} fills vs target ${task.target_count}`); + } + append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "consensus_done", + content: { fills: execAction.fills } }); + sealed = { fills: execAction.fills, approach: execAction.rationale }; + } + } + + if (!sealed) throw new Error(`no consensus after ${MAX_TURNS} turns — task incomplete`); + + // Write playbook entry matching the successful_playbooks schema. + const playbook = { + timestamp: new Date().toISOString(), + operation: task.operation, + approach: sealed.approach, + result: `${sealed.fills.length}/${task.target_count} filled → ${sealed.fills.map(f => f.name).join(", ")}`, + context: `executor=${EXECUTOR_MODEL} reviewer=${REVIEWER_MODEL} turns=${turn}`, + task, + fills: sealed.fills, + log, + }; + await mkdir("./tests/multi-agent/playbooks", { recursive: true }); + const path = join("./tests/multi-agent/playbooks", `${task.id}.json`); + await writeFile(path, JSON.stringify(playbook, null, 2)); + console.log(`\n✓ playbook written: ${path}`); + console.log(` ${playbook.result}`); + + // Phase 19.5: write-through to playbook_memory. The sealed fills are + // the endorsement; next semantically-similar query will surface them + // higher. /seed bypasses the successful_playbooks ingest round-trip + // — when that ingest path ships, this block should switch to append + // + rebuild instead. + try { + // Seed context is what the embedding model actually sees alongside + // the operation — so it has to carry task-semantic content (role, + // city, scenario) rather than orchestrator bookkeeping. We stash + // the bookkeeping in the full playbook JSON instead (see playbook + // object above) where operators can grep it without it polluting + // the ranking signal. + const seedContext = task.approach_hint + ?? 
// Caps a tool result before it is written to the shared log. Results with
// a `rows` array keep only the first 20 rows and gain a `_trimmed` field:
// "<n> more rows" when rows were dropped, undefined otherwise. Anything
// without a `rows` array is returned untouched.
function trimResult(r: any): any {
  const MAX_ROWS = 20;
  if (!r || !Array.isArray(r.rows)) return r;

  const dropped = r.rows.length - MAX_ROWS;
  let _trimmed: string | undefined;
  if (dropped > 0) _trimmed = `${dropped} more rows`;

  return { ...r, rows: r.rows.slice(0, MAX_ROWS), _trimmed };
}
// Runs one executor/reviewer pair on `task` until consensus, then seeds
// playbook_memory with the sealed fills.
//
// Parameters:
//   task   — the staffing fill to drive.
//   prefix — label prepended to every log line so interleaved output from
//            concurrently running pairs can be told apart.
//
// Returns a RunResult. The function never throws: every error raised in the
// loop (parse failure, too many tool errors or drifts, malformed consensus,
// turn cap) is reported as `{ ok: false, error }` together with the log
// collected so far. A failed seed request only produces a warning.
async function runOrchestrator(task: TaskSpec, prefix: string): Promise<RunResult> {
  const start = Date.now();
  const log: LogEntry[] = [];
  let turn = 0;
  let consecutiveDrifts = 0;
  // Track tool errors separately from drift verdicts. Reviewer saying
  // "continue" or "approve_done" should NOT reset a streak of malformed
  // tool calls — that's a different failure mode (model can't form the
  // call) than "executor is on the wrong path" (model is off-topic).
  let consecutiveToolErrors = 0;
  let sealed: { fills: Fill[]; approach: string } | null = null;

  // Stamp the entry with the current time, add it to the shared log (which
  // both prompts are built from) and echo a one-line summary to stdout.
  const append = (e: Omit<LogEntry, "at">): LogEntry => {
    const full: LogEntry = { ...e, at: new Date().toISOString() };
    log.push(full);
    console.log(`[${prefix}] [t${e.turn.toString().padStart(2, "0")} ${e.role.padEnd(8)} ${e.kind.padEnd(14)}] ${shortContent(e)}`);
    return full;
  };

  try {
    while (turn < MAX_TURNS && !sealed) {
      turn += 1;

      // Executor
      const execRaw = await generate(EXECUTOR_MODEL, executorPrompt(task, log), { temperature: 0.2, max_tokens: 600 });
      const execAction = parseAction(execRaw, "executor");
      append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: execAction.kind as any, content: execAction });

      if (execAction.kind === "tool_call") {
        try {
          const result = await executeToolCall(execAction.tool, execAction.args);
          append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "tool_result", content: trimResult(result) });
          consecutiveToolErrors = 0;
        } catch (e) {
          // Logged as a tool_result so the executor sees its own error on
          // the next turn; only a streak of failures aborts the run.
          append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "tool_result",
            content: { error: (e as Error).message, tool: execAction.tool, args: execAction.args } });
          consecutiveToolErrors += 1;
          // The tool-error streak reuses the drift threshold constant.
          if (consecutiveToolErrors >= MAX_CONSECUTIVE_DRIFTS) {
            throw new Error(`${MAX_CONSECUTIVE_DRIFTS} consecutive tool errors — executor can't form a valid call`);
          }
        }
      }

      // Reviewer
      const revRaw = await generate(REVIEWER_MODEL, reviewerPrompt(task, log), { temperature: 0.1, max_tokens: 400 });
      const revAction = parseAction(revRaw, "reviewer");
      append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "critique", content: revAction });

      if (revAction.kind !== "critique") throw new Error(`reviewer non-critique: ${revAction.kind}`);
      if (revAction.verdict === "drift") {
        consecutiveDrifts += 1;
        if (consecutiveDrifts >= MAX_CONSECUTIVE_DRIFTS) throw new Error(`${MAX_CONSECUTIVE_DRIFTS} consecutive drifts`);
      } else consecutiveDrifts = 0;

      // Consensus requires both sides in the same turn.
      if (execAction.kind === "propose_done" && revAction.verdict === "approve_done") {
        if (execAction.fills.length !== task.target_count) {
          throw new Error(`fills=${execAction.fills.length} target=${task.target_count}`);
        }
        append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "consensus_done", content: { fills: execAction.fills } });
        sealed = { fills: execAction.fills, approach: (execAction as any).rationale ?? "multi-agent → hybrid" };
      }
    }

    if (!sealed) throw new Error(`no consensus after ${MAX_TURNS} turns`);

    // Phase 19 write-through: seed playbook_memory so the next semantically
    // similar query benefits from this fill. Mirrors orchestrator.ts. Names
    // are the consensus fills' display names — that's what the boost-key
    // matcher (city, state, name) will look up against worker chunks.
    try {
      const seedRes = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
        method: "POST", headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
          operation: task.operation,
          approach: sealed.approach || "multi-agent → hybrid search",
          context: task.approach_hint ?? `${task.target_role} fill in ${task.target_city}, ${task.target_state}`,
          endorsed_names: sealed.fills.map(f => f.name),
          append: true,
        }),
      });
      if (!seedRes.ok) {
        console.warn(`[${prefix}] seed warning: ${seedRes.status} ${await seedRes.text()}`);
      } else {
        const j = await seedRes.json() as any;
        console.log(`[${prefix}] ↳ seeded playbook_memory: id=${j.playbook_id} entries=${j.entries_after}`);
      }
    } catch (e) {
      // Seeding is best-effort: the run still counts as sealed.
      console.warn(`[${prefix}] seed errored: ${(e as Error).message}`);
    }

    return {
      task, ok: true, turns: turn, fills: sealed.fills, approach: sealed.approach,
      duration_secs: Math.round((Date.now() - start) / 1000), log,
    };
  } catch (e) {
    return {
      task, ok: false, turns: turn, fills: [], approach: "",
      duration_secs: Math.round((Date.now() - start) / 1000), log,
      error: (e as Error).message,
    };
  }
}
// Caps a tool result before it goes into the shared log: a `rows` array is
// cut to its first 20 entries, otherwise a `sources` array is cut to its
// first 12. Only the first matching field is trimmed; any other value is
// returned as-is.
function trimResult(r: any): any {
  if (!r) return r;

  const caps: Array<[string, number]> = [["rows", 20], ["sources", 12]];
  for (const [field, cap] of caps) {
    if (Array.isArray(r[field])) {
      return { ...r, [field]: r[field].slice(0, cap) };
    }
  }
  return r;
}
// Try to resolve a fill's candidate_id to a workers_500k row. Accepts
// "W500K-7995" (vector doc_id with prefix) and "7995" (raw worker_id),
// with or without surrounding whitespace or other decoration, and also a
// numeric id (models sometimes emit the id as a JSON number).
//
// Returns the first matching row, or null when the id contains no usable
// digits or no row matches. Errors from sqlQuery propagate to the caller.
async function lookupWorker(candidate_id: string): Promise<{ worker_id: number; name: string; city: string; state: string; role: string } | null> {
  // Coerce first: a numeric or missing candidate_id must not throw on
  // .replace. The "W500K-" prefix is removed wherever it appears rather
  // than only at position 0. With the anchored match, a leading space or
  // quote left the prefix in place and its digits ("500") were folded into
  // the id, turning " W500K-7995" into worker 5007995.
  const numStr = String(candidate_id ?? "")
    .replace(/W500K-?/i, "")
    .replace(/[^\d]/g, "");
  if (!numStr) return null;
  const num = Number.parseInt(numStr, 10);
  // Reject ids too large to be represented exactly; they cannot be a real
  // worker_id and would be interpolated into the query in a lossy form.
  if (!Number.isSafeInteger(num)) return null;
  // `num` is a validated integer, so interpolating it is safe here.
  const r = await sqlQuery(`SELECT worker_id, name, city, state, role FROM workers_500k WHERE worker_id = ${num} LIMIT 1`);
  return r?.rows?.[0] ?? null;
}
// Score one finished run out of 10 and explain the score in `notes`.
//
//   geo           0-2  one point per resolved fill located in the task's
//                      target city + state (capped at 2)
//   authenticity  0-2  one point per fill whose candidate_id resolves via
//                      lookupWorker (capped at 2)
//   persistence   0-2  2 if statsAfter.entries - statsBefore.entries is
//                      exactly 1, 1 if it is larger, 0 otherwise
//   boost_firing  0-3  from verifyBoostFires(result.task); 0 if that
//                      re-query fails
//   speed         0-1  1 when result.duration_secs <= 240
//
// Never rejects because of a failed lookup or a failed boost re-query:
// both are turned into notes and scored as zero for that component.
async function ratePlaybook(
  result: RunResult,
  statsBefore: MemoryStats,
  statsAfter: MemoryStats,
): Promise<Rating> {
  const notes: string[] = [];
  const { task } = result;
  // Place names are compared leniently: case and stray whitespace are
  // ignored, and a missing column counts as "" instead of throwing.
  const norm = (s: unknown): string => String(s ?? "").trim().toLowerCase();

  // 1. Geo + authenticity per fill. The lookups are independent, so they
  // run together; Promise.all keeps fill order, so notes stay ordered.
  const lookups = await Promise.all(result.fills.map(async (f) => {
    try {
      return { fill: f, worker: await lookupWorker(f.candidate_id), failure: null as string | null };
    } catch (e) {
      return { fill: f, worker: null, failure: String((e as Error)?.message ?? e) };
    }
  }));

  let geo = 0;
  let authenticity = 0;
  for (const { fill, worker, failure } of lookups) {
    if (failure !== null) {
      // A failed query is not evidence that the worker does not exist —
      // report it as such rather than as a missing row.
      notes.push(`✗ lookup for candidate_id ${fill.candidate_id} failed: ${failure}`);
      continue;
    }
    if (!worker) {
      notes.push(`✗ candidate_id ${fill.candidate_id} not in workers_500k`);
      continue;
    }
    authenticity += 1;
    const sameCity = norm(worker.city) === norm(task.target_city);
    const sameState = norm(worker.state) === norm(task.target_state);
    if (sameCity && sameState) {
      geo += 1;
    } else {
      notes.push(`◑ ${worker.name} (id=${worker.worker_id}) is in ${worker.city}, ${worker.state}, not ${task.target_city}, ${task.target_state}`);
    }
  }
  geo = Math.min(geo, 2);
  authenticity = Math.min(authenticity, 2);

  // 2. Persistence
  let persistence = 0;
  const grew = statsAfter.entries - statsBefore.entries;
  if (grew === 1) {
    persistence = 2;
    notes.push(`✓ playbook_memory grew by exactly 1`);
  } else if (grew > 1) {
    persistence = 1;
    notes.push(`◑ playbook_memory grew by ${grew} (expected 1)`);
  } else {
    notes.push(`✗ playbook_memory did not grow (before=${statsBefore.entries} after=${statsAfter.entries})`);
  }

  // 3. Boost firing — re-run the same query and see if it lifts anything
  let boost_firing = 0;
  const v = await verifyBoostFires(task).catch(e => {
    notes.push(`✗ verify hybrid failed: ${(e as Error).message}`);
    return null;
  });
  if (v) {
    if (v.boostedHits >= 2) boost_firing = 3;
    else if (v.boostedHits === 1) boost_firing = 2;
    // Guard only: verifyBoostFires counts a hit as boosted whenever its
    // boost is > 0, so a positive topBoost with zero boosted hits is not
    // expected to occur.
    else if (v.topBoost > 0) boost_firing = 1;
    notes.push(`boost re-query: ${v.boostedHits}/10 hits boosted, top=+${v.topBoost.toFixed(3)}, citations=${v.sampleCitations.slice(0, 3).join(",")}`);
  }

  // 4. Speed
  let speed = 0;
  if (result.duration_secs <= 240) speed = 1;
  else notes.push(`◑ slow: ${result.duration_secs}s (>240)`);

  const total = geo + authenticity + persistence + boost_firing + speed;
  return { geo, authenticity, persistence, boost_firing, speed, total, notes };
}
""}`); + console.log(` B: ok=${resB.ok} turns=${resB.turns} ${resB.duration_secs}s ${resB.error ?? ""}`); + + if (!resA.ok && !resB.ok) { + throw new Error(`both orchestrators failed — substrate or models in bad state`); + } + + const statsMid = await fetchMemoryStats(); + console.log(`\n▶ playbook_memory after both runs: ${statsMid.entries} entries (+${statsMid.entries - statsBefore.entries})\n`); + + // Rate each successful playbook. We compute persistence per task by + // splitting the growth — both seeded sequentially-ish, so each should + // contribute 1. + const ratings: Array<{ id: string; ok: boolean; rating?: Rating; error?: string }> = []; + + if (resA.ok) { + const beforeForA: MemoryStats = { entries: statsBefore.entries, total_names_endorsed: statsBefore.total_names_endorsed }; + const afterForA: MemoryStats = { entries: statsBefore.entries + (resA.fills.length > 0 ? 1 : 0), total_names_endorsed: statsBefore.total_names_endorsed }; + // Use real measured numbers when they're unambiguous (only one task succeeded) + const ra = await ratePlaybook(resA, beforeForA, resB.ok ? afterForA : statsMid); + ratings.push({ id: "A", ok: true, rating: ra }); + } else ratings.push({ id: "A", ok: false, error: resA.error }); + + if (resB.ok) { + const beforeForB: MemoryStats = resA.ok + ? 
{ entries: statsBefore.entries + 1, total_names_endorsed: statsBefore.total_names_endorsed } + : statsBefore; + const rb = await ratePlaybook(resB, beforeForB, statsMid); + ratings.push({ id: "B", ok: true, rating: rb }); + } else ratings.push({ id: "B", ok: false, error: resB.error }); + + console.log(`\n▶ Per-playbook ratings:\n`); + for (const r of ratings) { + if (!r.ok) { + console.log(` ${r.id}: FAILED — ${r.error}`); + continue; + } + console.log(` ${r.id}: ${fmtRating(r.rating!)}`); + for (const n of r.rating!.notes) console.log(` ${n}`); + } + + const totals = ratings.filter(r => r.rating).map(r => r.rating!.total); + if (totals.length === 0) { + throw new Error(`no playbooks rated — both orchestrators failed`); + } + const min = Math.min(...totals); + const avg = totals.reduce((s, t) => s + t, 0) / totals.length; + console.log(`\n▶ Summary: avg=${avg.toFixed(1)}/10 min=${min}/10`); + + // Hard gate: any rating below 5 means the loop is broken end-to-end. + if (min < 5) throw new Error(`rating gate failed — min ${min}/10 (need ≥5)`); + + console.log(`\n✓ end-to-end real-world test passed`); + process.exit(0); +} + +main().catch(e => { + console.error(`\n✗ ${(e as Error).message}`); + if ((e as any).stack) console.error((e as any).stack); + process.exit(1); +}); diff --git a/tests/multi-agent/scenario.ts b/tests/multi-agent/scenario.ts new file mode 100644 index 0000000..d7faa0a --- /dev/null +++ b/tests/multi-agent/scenario.ts @@ -0,0 +1,822 @@ +// A day in the life — the real-world scenario test. +// +// Runs six events against the live substrate: baseline_fill, recurring, +// expansion, emergency, misplacement, retrospective. Each event +// exercises a different pressure pattern; each one produces actionable +// artifacts (SMS drafts, client emails, dispatch log) alongside the +// ranking output; the run as a whole is self-audited at EOD against six +// gap categories (supply, embedding, fairness, drift, tool, write-through). 
+// +// Design notes: +// - Compressed clock. The "08:00" in an event spec is a label for the +// output, not a wall-clock gate. The full scenario runs in minutes. +// - One script, shared state. Each event mutates the same roster + +// gap_signals + artifacts in-memory, then persists at EOD. +// - Fail-soft per event. A drift-abort or tool error on one event +// records a gap_signal and moves on; we explicitly want to see which +// events the substrate can't handle, not abort the whole run. +// - Every fill event routes through the same executor/reviewer loop as +// the single-task orchestrator — just driven in sequence rather than +// standalone, with event-specific extra constraints in the prompt. + +import { + type LogEntry, + type TaskSpec, + type Action, + type Fill, + callTool, + hybridSearch, + sqlQuery, + generate, + parseAction, + executorPrompt, + reviewerPrompt, + GATEWAY, +} from "./agent.ts"; +import { mkdir, writeFile, appendFile } from "node:fs/promises"; +import { join } from "node:path"; + +const EXECUTOR_MODEL = "mistral:latest"; +const REVIEWER_MODEL = "qwen2.5:latest"; +const DRAFT_MODEL = "qwen2.5:latest"; // artifact generation; short outputs +const MAX_TURNS = 14; +const MAX_CONSECUTIVE_DRIFTS = 3; +const WORKERS_INDEX = "workers_500k_v1"; +const WORKERS_DATASET = "workers_500k"; + +// =================== Event + scenario types =================== + +type EventKind = "baseline_fill" | "recurring" | "expansion" | "emergency" | "misplacement"; + +interface FillEvent { + kind: EventKind; + at: string; // display label like "08:00" + role: string; + count: number; + city: string; + state: string; + shift_start?: string; // "08:00 AM" for SMS/email drafts + scenario_note?: string; // extra context the agents should know + deadline?: string; // emergency events carry this, shown to reviewer + exclude_worker_ids?: string[]; // misplacement: the lost worker + replaces_event?: string; // misplacement back-ref for reporting +} + +interface 
ScenarioSpec { + client: string; + date: string; + events: FillEvent[]; +} + +interface EventResult { + event: FillEvent; + ok: boolean; + fills: Fill[]; + turns: number; + duration_secs: number; + error?: string; + gap_signals: string[]; // pulled into the cross-event gap report + sources_first_score?: number; + sources_last_score?: number; + pool_size?: number; // sql_matches from the first hybrid_search + playbook_citations?: string[]; +} + +interface RosterEntry { + worker_id: string; + name: string; + booked_for: string; // event at-label + role: string; + city: string; + state: string; + status: "confirmed" | "no_show" | "rebooked_elsewhere"; +} + +interface ScenarioContext { + spec: ScenarioSpec; + out_dir: string; + roster: RosterEntry[]; + results: EventResult[]; + gap_signals: Array<{ event: string; category: string; detail: string }>; +} + +// =================== Default scenario =================== + +const DEFAULT_SCENARIO: ScenarioSpec = { + client: "Riverfront Steel", + date: "2026-04-21", + events: [ + { + kind: "baseline_fill", + at: "08:00", + role: "Warehouse Associate", + count: 3, + city: "Toledo", + state: "OH", + shift_start: "08:00 AM", + scenario_note: "Regular Monday morning shift, 8-hour.", + }, + { + kind: "recurring", + at: "10:30", + role: "Machine Operator", + count: 2, + city: "Toledo", + state: "OH", + shift_start: "11:00 AM", + scenario_note: "Recurring Tuesday/Thursday slot — prior workers may still be available.", + }, + { + kind: "expansion", + at: "12:15", + role: "Forklift Operator", + count: 5, + city: "Toledo", + state: "OH", + shift_start: "01:00 PM", + scenario_note: "New warehouse location opening, five-worker team needed.", + }, + { + kind: "emergency", + at: "14:00", + role: "Loader", + count: 4, + city: "Toledo", + state: "OH", + shift_start: "04:00 PM same day", + deadline: "16:00", + scenario_note: "Walkoff incident — replacement crew needed by 16:00 sharp.", + }, + { + kind: "misplacement", + at: "15:45", + role: 
// Minimal JSON-over-HTTP helper: POSTs `body` as JSON when one is given,
// otherwise issues a GET, and parses the response as JSON.
//
// - `T` is the expected response shape. It is NOT validated at runtime —
//   the parsed JSON is simply cast — and it defaults to `any` so call
//   sites that pass no type argument keep working.
// - Any non-nullish body is sent, including falsy-but-valid JSON values
//   such as 0, "" or false (a truthiness test would have silently turned
//   those requests into GETs).
// - Throws Error("<status> <response text>") on a non-2xx response.
async function httpJson<T = any>(url: string, body?: any): Promise<T> {
  const hasBody = body !== undefined && body !== null;
  const res = await fetch(url, {
    method: hasBody ? "POST" : "GET",
    headers: { "Content-Type": "application/json" },
    body: hasBody ? JSON.stringify(body) : undefined,
  });
  if (!res.ok) throw new Error(`${res.status} ${await res.text()}`);
  return (await res.json()) as T;
}
// Dispatch one tool call requested by the executor model.
//
//   "hybrid_search"  POSTs to `${GATEWAY}/vectors/hybrid`; requires
//                    sql_filter, question and index_name, and takes an
//                    optional `k` (default 10) as top_k.
//   "sql"            runs a read-only query through sqlQuery; only
//                    statements that start with the keyword SELECT pass.
//   anything else    is forwarded unchanged to callTool.
//
// Throws on missing/invalid arguments; errors from the underlying calls
// propagate to the caller.
async function executeToolCall(name: string, args: Record<string, any>): Promise<any> {
  // Models occasionally omit `args` altogether; treat that as "no
  // arguments" so the validation below reports it instead of crashing
  // inside the destructuring.
  const params: Record<string, any> = args ?? {};

  if (name === "hybrid_search") {
    const { sql_filter, question, index_name, k } = params;
    if (!sql_filter || !question || !index_name) {
      throw new Error(`hybrid_search needs sql_filter + question + index_name, got ${JSON.stringify(args)}`);
    }
    // Every fill event uses the playbook_memory boost — that's the point
    // of the run-as-a-whole: earlier events seed later ones.
    return httpJson(`${GATEWAY}/vectors/hybrid`, {
      sql_filter, question, index_name,
      top_k: k ?? 10, generate: false,
      use_playbook_memory: true,
      playbook_memory_k: 10,
    });
  }

  if (name === "sql") {
    const { query } = params;
    if (!query || typeof query !== "string") throw new Error(`sql needs query string`);
    // \b so that only the SELECT keyword itself passes, not any token
    // that merely begins with those six letters.
    if (!/^\s*SELECT\b/i.test(query)) throw new Error(`sql allows SELECT only`);
    return sqlQuery(query);
  }

  return callTool(name, args);
}
scenario-specific guidance + exclusions + // injected as an extra block. Reuses the base prompt so drift detection + // and output-shape rules are unchanged. + const withExtras = (base: string): string => { + let addon = ""; + if (extra_guidance) addon += `\n\nEVENT-SPECIFIC GUIDANCE:\n${extra_guidance}`; + if (exclude_worker_ids.length > 0) { + addon += `\n\nEXCLUDE these workers (already booked / unavailable today): ${exclude_worker_ids.join(", ")}\nIf your tool results include them, skip them — never propose them.`; + } + return base + addon; + }; + + while (turn < MAX_TURNS && !sealed) { + turn += 1; + + const execRaw = await generate( + EXECUTOR_MODEL, + withExtras(executorPrompt(task, log)), + { temperature: 0.2, max_tokens: 600 }, + ); + let execAction: Action; + try { + execAction = parseAction(execRaw, "executor"); + } catch (e) { + append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "error", + content: { message: (e as Error).message, raw: execRaw.slice(0, 300) } }); + throw e; + } + append({ turn, role: "executor", model: EXECUTOR_MODEL, + kind: execAction.kind as any, content: execAction }); + + if (execAction.kind === "tool_call") { + try { + const result = await executeToolCall(execAction.tool, execAction.args); + // Filter tool results to enforce the exclusion list — defense in + // depth since the prompt alone isn't enough for weak models. + const filtered = maskExclusions(result, exclude_worker_ids); + // Capture the first hybrid_search pool stats for gap detection. + if (execAction.tool === "hybrid_search" && first_sql_matches === undefined) { + first_sql_matches = (filtered as any).sql_matches; + const sources = (filtered as any).sources ?? 
[]; + if (sources.length > 0) { + first_pool_first = sources[0].score; + first_pool_last = sources[sources.length - 1].score; + } + } + const trimmed = trimResult(filtered); + append({ turn, role: "executor", model: EXECUTOR_MODEL, + kind: "tool_result", content: trimmed }); + + // Accumulate playbook citations from any hybrid result that + // carried them — the scenario-level report needs them. + if (Array.isArray((filtered as any).sources)) { + for (const s of (filtered as any).sources) { + for (const c of s.playbook_citations ?? []) { + playbook_citations.add(c); + } + } + } + } catch (e) { + append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "tool_result", + content: { error: (e as Error).message, tool: execAction.tool } }); + consecutiveDrifts += 1; + if (consecutiveDrifts >= MAX_CONSECUTIVE_DRIFTS) { + throw new Error(`aborted — ${MAX_CONSECUTIVE_DRIFTS} consecutive tool errors`); + } + } + } + + const revRaw = await generate( + REVIEWER_MODEL, + withExtras(reviewerPrompt(task, log)), + { temperature: 0.1, max_tokens: 400 }, + ); + let revAction: Action; + try { + revAction = parseAction(revRaw, "reviewer"); + } catch (e) { + append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "error", + content: { message: (e as Error).message, raw: revRaw.slice(0, 300) } }); + throw e; + } + append({ turn, role: "reviewer", model: REVIEWER_MODEL, + kind: "critique", content: revAction }); + + if (revAction.kind !== "critique") throw new Error(`reviewer emitted non-critique: ${revAction.kind}`); + + if (revAction.verdict === "drift") { + consecutiveDrifts += 1; + if (consecutiveDrifts >= MAX_CONSECUTIVE_DRIFTS) { + throw new Error(`aborted — ${MAX_CONSECUTIVE_DRIFTS} consecutive drift flags`); + } + } else { + consecutiveDrifts = 0; + } + + if (execAction.kind === "propose_done" && revAction.verdict === "approve_done") { + if (execAction.fills.length !== task.target_count) { + throw new Error(`consensus malformed — ${execAction.fills.length} fills vs 
// Remove excluded workers from a tool result before the models see it.
//
// Hybrid results are filtered on `sources[].doc_id`; SQL results on
// `rows[].worker_id` (falling back to `rows[].doc_id`). Entries that
// carry no id at all are kept. Returns a shallow copy when filtering
// applies, otherwise the input unchanged.
//
// Ids are compared in canonical form (the "W500K-" prefix is dropped):
// exclusion ids are matched against candidate_ids elsewhere in this file,
// and resolveWorkerIds canonicalises those to "W500K-<worker_id>", while
// SQL rows expose the bare worker_id. Comparing the raw strings therefore
// never masked a SQL row.
// NOTE(review): assumes every id in play belongs to the workers index —
// confirm if other doc_id prefixes can appear in tool results.
function maskExclusions(result: any, exclude: string[]): any {
  if (exclude.length === 0 || !result) return result;

  const canonical = (id: unknown): string => String(id).trim().replace(/^W500K-/i, "");
  const banned = new Set(exclude.map(canonical));
  const allowed = (id: unknown): boolean =>
    id === undefined || id === null || !banned.has(canonical(id));

  if (Array.isArray(result.sources)) {
    return { ...result, sources: result.sources.filter((s: any) => allowed(s?.doc_id)) };
  }
  if (Array.isArray(result.rows)) {
    return { ...result, rows: result.rows.filter((r: any) => allowed(r?.worker_id ?? r?.doc_id)) };
  }
  return result;
}
// Build the event-specific guidance paragraph that is appended to the
// agent prompts for one fill event.
//
// - Optional fields fall back to neutral wording ("at start time",
//   "end of day", "in progress").
// - For misplacement events the shift being refilled comes from
//   `event.replaces_event`; "08:00" is used only when the spec leaves it
//   out, which matches the previously hard-coded text.
// - An unrecognised kind (possible when the spec is loaded from JSON)
//   yields "" — i.e. no extra guidance — instead of `undefined`.
function guidanceFor(event: FillEvent, ctx: ScenarioContext): string {
  const client = ctx.spec.client;
  const shiftStart = event.shift_start ?? "at start time";

  switch (event.kind) {
    case "baseline_fill":
      return `Standard Monday fill. Client ${client}. Shift starts ${shiftStart}. Take the top candidates by semantic match and availability.`;
    case "recurring":
      return `RECURRING slot — ${client} runs this shift every Tues/Thurs. If playbook_memory surfaces candidates endorsed by past similar fills (you'll see 'cites' on hybrid sources), those are the preferred workers. Shift starts ${shiftStart}.`;
    case "expansion":
      return `EXPANSION at ${client}. New location, ${event.count}-worker team needed at once — search broadly and prefer workers with team/collaboration signals (engagement, communications scores). Shift starts ${shiftStart}.`;
    case "emergency":
      return `EMERGENCY walkoff — ${client} needs ${event.count} ${event.role}s BY ${event.deadline ?? "end of day"}. Prioritize availability over perfect skill match. A good-enough worker who can report today beats a perfect worker who can't.`;
    case "misplacement": {
      const missedShift = event.replaces_event ?? "08:00";
      return `MISPLACEMENT refill. A worker from the ${missedShift} shift no-showed. You must replace them WITHOUT proposing the same worker or anyone already booked today (see EXCLUDE list). Shift is ${event.shift_start ?? "in progress"} so speed matters.`;
    }
    default:
      return "";
  }
}
// Draft the outbound messages for one sealed fill: an SMS block (one
// message per filled worker) and a confirmation email to the client.
//
// Issues two `generate` calls against DRAFT_MODEL concurrently and
// returns both replies trimmed. A rejection from either call propagates
// to the caller.
//
// A fill without a `reason` is rendered with an empty reason instead of
// throwing, so one sparse fill cannot cost the event both drafts.
async function generateArtifacts(event: FillEvent, outcome: AgentFillOutcome, ctx: ScenarioContext): Promise<ArtifactBundle> {
  const smsWorkerLines = outcome.fills
    .map(f => `- ${f.name} (id ${f.candidate_id})`)
    .join("\n");
  const emailWorkerLines = outcome.fills
    .map(f => `- ${f.name} (${String(f.reason ?? "").slice(0, 60)})`)
    .join("\n");

  const smsPrompt = `Generate short, friendly, professional SMS messages to confirm a shift for each worker. ONE message per worker. Format as:

TO: {Name}
{message body under 180 chars}

---

Details:
- Client: ${ctx.spec.client}
- Role: ${event.role}
- Location: ${event.city}, ${event.state}
- Shift starts: ${event.shift_start ?? "TBD"}
- Scenario: ${event.scenario_note ?? ""}

Workers to message:
${smsWorkerLines}

Respond with only the message blocks, separated by "---". No commentary.`;

  const emailPrompt = `Generate a short professional email confirmation to the staffing client.

TO: staffing@${ctx.spec.client.toLowerCase().replace(/ /g, "")}.example
FROM: dispatch@lakehouse.example
SUBJECT: (3-word subject)

Body (4-6 lines max). Be specific about:
- Number of workers filled (${outcome.fills.length} of ${event.count})
- Roles: ${event.role}
- Names filled
- Shift start: ${event.shift_start ?? "TBD"}
- Any scenario flag: ${event.scenario_note ?? "(none)"}

Workers:
${emailWorkerLines}

Respond with only the email. No commentary.`;

  // Short outputs, low temperature — these are drafts, not creative writing.
  const [sms, email] = await Promise.all([
    generate(DRAFT_MODEL, smsPrompt, { temperature: 0.3, max_tokens: 500 }),
    generate(DRAFT_MODEL, emailPrompt, { temperature: 0.3, max_tokens: 400 }),
  ]);

  return { sms: sms.trim(), email: email.trim() };
}
+ const task: TaskSpec = { + id: `${ctx.spec.date}-${event.at.replace(":", "")}-${event.kind}`, + operation: `fill: ${event.role} x${event.count} in ${event.city}, ${event.state}`, + target_role: event.role, + target_count: event.count, + target_city: event.city, + target_state: event.state, + approach_hint: `hybrid search against ${WORKERS_INDEX} for ${event.kind}`, + }; + + // Exclusion set: everyone already in today's roster + any explicit + // exclusions from the event spec. + const excludeIds = [ + ...ctx.roster + .filter(r => r.status === "confirmed") + .map(r => r.worker_id), + ...(event.exclude_worker_ids ?? []), + ]; + + const gap_signals: string[] = []; + let outcome: AgentFillOutcome; + try { + outcome = await runAgentFill(task, guidanceFor(event, ctx), excludeIds); + } catch (e) { + return { + event, + ok: false, + fills: [], + turns: 0, + duration_secs: (Date.now() - t0) / 1000, + error: (e as Error).message, + gap_signals: [`drift_or_tool: ${(e as Error).message}`], + }; + } + + // Resolve worker_ids via SQL so the roster has stable IDs (models + // sometimes return names-only). Best-effort — if name lookup finds + // zero or many matches, we flag a gap. + const resolved = await resolveWorkerIds(outcome.fills, event); + + // Roster double-book check. + for (const r of resolved) { + const conflict = ctx.roster.find(e => e.worker_id === r.worker_id && e.status === "confirmed"); + if (conflict) { + gap_signals.push(`double_book: ${r.worker_id} ${r.name} already booked for ${conflict.booked_for}`); + } + ctx.roster.push({ + worker_id: r.worker_id, + name: r.name, + booked_for: event.at, + role: event.role, + city: event.city, + state: event.state, + status: "confirmed", + }); + } + + // Pool-size signal (Gap 1 — supply). + const supply_threshold = event.count * 3; + if ((outcome.first_sql_matches ?? 
0) < supply_threshold) { + gap_signals.push( + `supply: only ${outcome.first_sql_matches} candidates for ${event.count}× ${event.role} in ${event.city} (< ${supply_threshold}, our 3× comfort margin)` + ); + } + + // Score-spread signal (Gap 2 — embedding). + const spread = (outcome.first_pool_first_score ?? 0) - (outcome.first_pool_last_score ?? 0); + if (spread > 0 && spread < 0.02) { + gap_signals.push( + `embedding: top-K score spread ${spread.toFixed(3)} < 0.02 — model struggles to differentiate` + ); + } + + // Generate artifacts (SMS + email) — fail-soft; artifacts are cosmetic + // relative to the consensus itself. + let bundle: ArtifactBundle | null = null; + try { + bundle = await generateArtifacts(event, { ...outcome, fills: resolved }, ctx); + await appendFile(join(ctx.out_dir, "sms.md"), + `\n## ${event.at} ${event.kind} — ${event.role} x${event.count} in ${event.city}, ${event.state}\n\n${bundle.sms}\n`); + await appendFile(join(ctx.out_dir, "emails.md"), + `\n## ${event.at} ${event.kind} — ${event.role} x${event.count}\n\n${bundle.email}\n`); + } catch (e) { + gap_signals.push(`artifact: ${(e as Error).message}`); + } + + // Dispatch log (structured). + await appendFile(join(ctx.out_dir, "dispatch.jsonl"), + JSON.stringify({ + at: event.at, + kind: event.kind, + operation: task.operation, + fills: resolved, + turns: outcome.turns, + duration_secs: outcome.duration_secs, + pool_size: outcome.first_sql_matches, + playbook_citations: outcome.playbook_citations, + }) + "\n"); + + // Always seed playbook_memory after a sealed fill — keep the learning + // loop tight across the whole day so recurring/misplacement events + // later in the run benefit from earlier ones. + try { + await httpJson(`${GATEWAY}/vectors/playbook_memory/seed`, { + operation: task.operation, + approach: outcome.approach || `${event.kind} → hybrid search`, + context: `client=${ctx.spec.client} scenario=${event.kind} shift=${event.shift_start ?? 
// Models emit candidate_ids or names in propose_done. Some return the
// W500K-XXX doc_id, others just the name, others a random tag. Resolve
// to a canonical candidate_id ("W500K-<worker_id>") so the roster is
// reliable.
//
// Per fill, in order of preference:
//   1. candidate_id already looks like W500K-NNN (any letter case,
//      surrounding whitespace ignored) — normalised to "W500K-NNN".
//   2. candidate_id is a bare integer — promoted to "W500K-N".
//   3. otherwise look the worker up by (name, city, state) and take the
//      first match.
// A fill that cannot be resolved (no usable name, no match, or a failed
// query) is returned unchanged — this step is best-effort and never
// rejects. Output order matches input order.
async function resolveWorkerIds(fills: Fill[], event: FillEvent): Promise<Fill[]> {
  // Escape a value for use inside a single-quoted SQL string literal.
  const sqlQuote = (s: unknown): string => String(s ?? "").replace(/'/g, "''");

  const resolveOne = async (f: Fill): Promise<Fill> => {
    const rawId = String(f.candidate_id ?? "").trim();

    const prefixed = /^W500K-(\d+)$/i.exec(rawId);
    if (prefixed) {
      const canonical = `W500K-${prefixed[1]}`;
      return canonical === f.candidate_id ? f : { ...f, candidate_id: canonical };
    }
    if (/^\d+$/.test(rawId)) {
      return { ...f, candidate_id: `W500K-${rawId}` };
    }

    // Without a name there is nothing to search on; a model that omitted
    // it must not abort the whole event.
    if (typeof f.name !== "string" || f.name.trim() === "") return f;

    const q = `SELECT worker_id FROM ${WORKERS_DATASET} WHERE name = '${sqlQuote(f.name)}' AND city = '${sqlQuote(event.city)}' AND state = '${sqlQuote(event.state)}' LIMIT 1`;
    try {
      const r = await sqlQuery(q);
      if (r.rows && r.rows.length > 0) {
        return { ...f, candidate_id: `W500K-${r.rows[0].worker_id}` };
      }
      // No match — keep the fill with its candidate_id as-is.
      return f;
    } catch {
      // Deliberately best-effort: a failed lookup leaves the fill
      // unresolved rather than failing the event.
      return f;
    }
  };

  // The lookups are independent; Promise.all preserves input order.
  return Promise.all(fills.map(resolveOne));
}
0) + 1); + const multis = [...counts.entries()].filter(([_, n]) => n > 1); + if (multis.length > 0) { + bycat["fairness"] = bycat["fairness"] ?? []; + for (const [id, n] of multis) { + const name = ctx.roster.find(r => r.worker_id === id)?.name ?? id; + bycat["fairness"].push(`_cross-event_ — ${name} (${id}) booked ${n} times today`); + } + } + + // Gap 5 — tool errors already captured per-event via gap_signals. + + // Gap 6 — write-through coverage: compare # events vs # new playbook_memory entries. + try { + const stats = await httpJson(`${GATEWAY}/vectors/playbook_memory/stats`); + bycat["write_through_audit"] = bycat["write_through_audit"] ?? []; + bycat["write_through_audit"].push(`_post-run_ — playbook_memory has ${stats.entries} entries (ran ${ctx.results.length} events, expected ≥ ${ctx.results.filter(r => r.ok).length} new entries from this run)`); + } catch { /* non-fatal */ } + + lines.push("## Gap signals"); + lines.push(""); + if (Object.keys(bycat).length === 0) { + lines.push("_None surfaced — either everything worked or detection is under-tuned._"); + } else { + for (const [cat, items] of Object.entries(bycat)) { + lines.push(`### ${cat}`); + for (const item of items) lines.push(`- ${item}`); + lines.push(""); + } + } + + // --- Narrative summary --- + lines.push("## Narrative"); + lines.push(""); + lines.push(`- ${ctx.results.filter(r => r.ok).length}/${ctx.results.length} events reached consensus.`); + lines.push(`- Final roster: ${ctx.roster.length} bookings across ${new Set(ctx.roster.map(r => r.worker_id)).size} distinct workers.`); + const totalCites = ctx.results.reduce((a, r) => a + (r.playbook_citations?.length ?? 
// =================== Main driver ===================

/**
 * Scenario entry point.
 *
 * Loads the scenario spec from the JSON file named by the first CLI
 * argument (or falls back to DEFAULT_SCENARIO), creates a timestamped
 * output directory, runs every event in order through `runEvent`, then
 * writes `roster.json`, `results.json` and the retrospective report.
 *
 * Exits the process with 0 when every event succeeded and 2 when at least
 * one did not. Unexpected crashes are handled by the `.catch` below, which
 * exits with 1.
 */
async function main() {
  const specPath = process.argv[2];
  const spec: ScenarioSpec = specPath
    ? JSON.parse(await Bun.file(specPath).text())
    : DEFAULT_SCENARIO;
  if (!Array.isArray(spec?.events)) {
    // Fail with a readable message instead of an opaque TypeError later.
    throw new Error(`scenario spec ${specPath ?? "(default)"} has no "events" array`);
  }

  const stamp = new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19);
  const out_dir = join("tests/multi-agent/playbooks", `scenario-${stamp}`);
  await mkdir(out_dir, { recursive: true });

  const ctx: ScenarioContext = {
    spec,
    out_dir,
    roster: [],
    results: [],
    gap_signals: [],
  };

  // Initialize output files
  await writeFile(join(out_dir, "sms.md"), `# SMS drafts — ${spec.client}, ${spec.date}\n`);
  await writeFile(join(out_dir, "emails.md"), `# Client emails — ${spec.client}, ${spec.date}\n`);
  await writeFile(join(out_dir, "dispatch.jsonl"), "");

  console.log(`▶ scenario: ${spec.client}, ${spec.date}, ${spec.events.length} events`);
  console.log(`▶ out: ${out_dir}\n`);

  for (const event of spec.events) {
    // Expand misplacement-style exclusions from the current roster: it
    // wants to replace a worker from a prior event, so grab everyone
    // booked at that at-label and add as exclusions.
    if (event.kind === "misplacement" && event.replaces_event) {
      const priorEntries = ctx.roster.filter(
        r => r.booked_for === event.replaces_event && r.status === "confirmed"
      );
      if (priorEntries.length > 0) {
        const priorBooked = priorEntries.map(r => r.worker_id);
        // Pick one arbitrarily to mark as no_show — in a real system the
        // external signal would pick. For the test, first one works.
        // Take the entry from the filtered list so the booking marked is
        // the one for the replaced event, even when the same worker also
        // holds a booking for another event.
        const lostEntry = priorEntries[0];
        lostEntry.status = "no_show";
        console.log(` (misplacement: marking ${lostEntry.worker_id} ${lostEntry.name} as no-show)`);
        // Exclude all prior bookings so the refill doesn't pick anyone
        // already scheduled for today. Merge with exclusions the spec
        // already supplied instead of overwriting them.
        event.exclude_worker_ids = [
          ...new Set([...(event.exclude_worker_ids ?? []), ...priorBooked]),
        ];
      }
    }

    const result = await runEvent(event, ctx);
    ctx.results.push(result);
    for (const s of result.gap_signals) {
      // Signals are "category: detail"; the detail may itself contain ":".
      const [category, ...rest] = s.split(":");
      ctx.gap_signals.push({ event: event.at, category: category.trim(), detail: rest.join(":").trim() });
    }

    // Small breather to not hammer Ollama on back-to-back runs.
    await new Promise(r => setTimeout(r, 500));
  }

  // Persist structured state for forensics.
  await writeFile(join(out_dir, "roster.json"), JSON.stringify(ctx.roster, null, 2));
  await writeFile(join(out_dir, "results.json"), JSON.stringify(ctx.results, null, 2));

  await writeRetrospective(ctx);

  const okCount = ctx.results.filter(r => r.ok).length;
  if (okCount < ctx.results.length) {
    console.log(`\n⚠ ${okCount}/${ctx.results.length} events succeeded. See ${out_dir}/report.md for gaps.`);
    process.exit(2);
  }
  console.log(`\n✓ ${okCount}/${ctx.results.length} events succeeded. See ${out_dir}/report.md.`);
  process.exit(0);
}

main().catch(e => {
  console.error(`\n✗ scenario driver crashed: ${(e as Error).message}`);
  console.error((e as Error).stack);
  process.exit(1);
});