From ad0edbe29ce9a26e55de2e7a480afc9a9d8e4ed8 Mon Sep 17 00:00:00 2001
From: root <root@island37.com>
Date: Mon, 20 Apr 2026 23:20:07 -0500
Subject: [PATCH] Cloud kimi-k2.5 executor for weak tiers + multi-strategy
 playbook retrieval
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two coupled changes from the 2026 agent-memory research + tool
asymmetry findings.

SCENARIO (weak-tier cloud substitute):
qwen2.5 collapsed to 0/14 across the basic/minimal tool_levels.
Replace with cloud kimi-k2.5 on Ollama Cloud — same family as k2.6
(pro-tier locked today, on J's upgrade path). Plumb cloud flag
through ACTIVE_EXECUTOR_CLOUD / ACTIVE_REVIEWER_CLOUD into
generateContinuable so executor/reviewer can route to cloud when the
tool_level requires it. think:false is supported by the Kimi family.

Tool level mapping (revised):
  full     — qwen3.5 local + qwen3 local + cloud gpt-oss:120b T3 + rescue
  local    — qwen3.5 local + qwen3 local + local gpt-oss:20b T3 + rescue
  basic    — kimi-k2.5 cloud + qwen3 local + local T3, no rescue
  minimal  — kimi-k2.5 cloud + qwen3 local, no T3, no rescue.
             Playbook inheritance alone on the decision path.

This is the honest version of J's "minimal tools still works via
inheritance" hypothesis — with the executor no longer broken at the
tokenizer level, we can actually measure whether playbook retrieval
substitutes for missing overseers.

PLAYBOOK_MEMORY (multi-strategy retrieval):
Zep / Mem0 research shows multi-strategy rerank (semantic + keyword +
graph + temporal) outperforms single-strategy cosine. Lakehouse now
has a two-tier retrieval path:

  1. Exact (role, city, state) match: skip cosine, assign similarity=1.0,
     take up to top_k/2+1 slots. These are identity-class neighbors —
     the strongest possible signal.
  2. Cosine fallback within the same (city, state) but different role:
     fills remaining slots.

Exposed as compute_boost_for_filtered_with_role(target_geo, target_role).
Backwards-compatible: compute_boost_for_filtered forwards with role=None
so existing callers keep their current behavior.

Service.rs wires both: extract_target_geo and extract_target_role pull
from the executor's SQL filter. grab_eq_value is factored out of
extract_target_geo so both lookups share one parser. Diagnostic log
now prints target_role alongside target_geo for every hybrid_search:

  playbook_boost: boosts=88 sources=39 parsed=39 matched=5
    target_geo=Some(("Nashville", "TN")) target_role=Some("Welder")

Verified: Nashville Welder query returns 5/10 boosted workers in
top_k with clean role+geo provenance.

Research sources: atlan.com Agent Memory Frameworks 2026, Mem0 paper
(arxiv 2504.19413), Zep/Graphiti LongMemEval comparison, ossinsight
Agent Memory Race 2026.

kimi-k2.6 on the current key returns 403 — pro-tier upgrade required.
kimi-k2.5 is the substitute today; swap to k2.6 by changing the model
name in applyToolLevel's basic and minimal cases once the subscription
lands.
---
 crates/vectord/src/playbook_memory.rs | 71 ++++++++++++++++---
 crates/vectord/src/service.rs         | 98 ++++++++++++++-------------
 tests/multi-agent/scenario.ts         | 22 ++++--
 3 files changed, 133 insertions(+), 58 deletions(-)

diff --git a/crates/vectord/src/playbook_memory.rs b/crates/vectord/src/playbook_memory.rs
index dd5ae5e..d13f71a 100644
--- a/crates/vectord/src/playbook_memory.rs
+++ b/crates/vectord/src/playbook_memory.rs
@@ -235,6 +235,26 @@ impl PlaybookMemory {
         top_k_playbooks: usize,
         base_weight: f32,
         target_geo: Option<(&str, &str)>,
+    ) -> HashMap<(String, String, String), BoostEntry> {
+        self.compute_boost_for_filtered_with_role(query_embedding, top_k_playbooks, base_weight, target_geo, None).await
+    }
+
+    /// Variant that also accepts a target role for pre-filtering.
+    /// Multi-strategy retrieval: exact (role, city, state) matches skip
+    /// cosine entirely and earn the maximum boost, since identity on
+    /// those three fields is the strongest possible similarity signal.
+    /// Remaining entries (within the same city+state but different
+    /// role, or unknown role) go through the normal cosine path as a
+    /// fallback. This addresses the 2026 agent-memory finding that
+    /// multi-strategy parallel retrieval with rerank outperforms
+    /// single-strategy semantic search.
+    pub async fn compute_boost_for_filtered_with_role(
+        &self,
+        query_embedding: &[f32],
+        top_k_playbooks: usize,
+        base_weight: f32,
+        target_geo: Option<(&str, &str)>,
+        target_role: Option<&str>,
     ) -> HashMap<(String, String, String), BoostEntry> {
         let state = self.state.read().await;
         let entries = state.entries.clone();
@@ -246,11 +266,11 @@ impl PlaybookMemory {
         }
         drop(state);
 
-        // Brute-force cosine. Empty / missing embeddings just skip.
-        // When target_geo is set, pre-filter to matching playbooks BEFORE
-        // cosine sort — that way top-k is within the city, not across
-        // all cities.
-        let mut scored: Vec<(f32, &PlaybookEntry)> = entries
+        // Pre-filter by target_geo (city, state) before cosine. When
+        // target_geo is set, only playbooks from that city go into the
+        // ranking pool — prevents globally-popular semantic neighbors
+        // from drowning out the city's local successful playbooks.
+        let geo_filtered: Vec<&PlaybookEntry> = entries
             .iter()
             .filter(|e| match (target_geo, &e.city, &e.state) {
                 (None, _, _) => true,
@@ -259,10 +279,45 @@ impl PlaybookMemory {
                 }
                 _ => false,
             })
-            .filter_map(|e| e.embedding.as_ref().map(|v| (cosine(query_embedding, v), e)))
             .collect();
-        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
-        scored.truncate(top_k_playbooks.max(1));
+
+        // Multi-strategy: split the geo-filtered pool into (exact role
+        // match) vs (other). Exact matches skip cosine — they're already
+        // the strongest signal possible. Operations are shaped
+        // "fill: Welder x3 in Toledo, OH" so we match role by checking
+        // whether `fill: {role} ` appears in the operation string,
+        // case-insensitive.
+        let mut exact_matches: Vec<&PlaybookEntry> = Vec::new();
+        let mut cosine_pool: Vec<(f32, &PlaybookEntry)> = Vec::new();
+        let role_needle = target_role
+            .map(|r| format!("fill: {} ", r).to_ascii_lowercase());
+        for e in geo_filtered {
+            let is_exact = role_needle.as_ref()
+                .map(|needle| e.operation.to_ascii_lowercase().contains(needle))
+                .unwrap_or(false);
+            if is_exact {
+                exact_matches.push(e);
+            } else if let Some(v) = &e.embedding {
+                cosine_pool.push((cosine(query_embedding, v), e));
+            }
+        }
+        cosine_pool.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
+
+        // Allocate top_k across the two pools — exact matches get first
+        // priority (up to min(exact_count, top_k/2 + 1) slots), then cosine
+        // fills the rest. This is rerank with hard preference for
+        // identity matches.
+        let exact_take = exact_matches.len().min(top_k_playbooks.max(1) / 2 + 1);
+        let cosine_take = top_k_playbooks.saturating_sub(exact_take);
+
+        // Score exact matches with max similarity (1.0) so downstream
+        // weighting treats them as the strongest possible signal.
+        let mut scored: Vec<(f32, &PlaybookEntry)> = exact_matches
+            .into_iter()
+            .take(exact_take)
+            .map(|e| (1.0_f32, e))
+            .collect();
+        scored.extend(cosine_pool.into_iter().take(cosine_take));
 
         let now = chrono::Utc::now();
         let mut boosts: HashMap<(String, String, String), BoostEntry> = HashMap::new();
diff --git a/crates/vectord/src/service.rs b/crates/vectord/src/service.rs
index 73ea114..977575f 100644
--- a/crates/vectord/src/service.rs
+++ b/crates/vectord/src/service.rs
@@ -803,20 +803,23 @@ async fn hybrid_search(
     // set. Additive boost on the existing vector score, then re-sort.
     if req.use_playbook_memory {
         let boost_k = req.playbook_memory_k.unwrap_or(playbook_memory::DEFAULT_TOP_K_PLAYBOOKS);
-        // Extract target (city, state) from the SQL filter so
-        // compute_boost_for can skip playbooks from other cities that
-        // would never intersect the candidate pool. The executor's
-        // filter shape is stable: `... city = 'Toledo' AND state = 'OH' ...`.
-        // Case-insensitive match, tolerant of single quotes and spaces.
+        // Extract target (city, state, role) from the SQL filter so
+        // compute_boost_for can skip playbooks from other cities AND
+        // prioritize exact role matches via the multi-strategy path.
+        // The executor's filter shape is stable:
+        //   `... role = 'Welder' AND city = 'Toledo' AND state = 'OH' ...`.
+        // Case-insensitive match, tolerant of single quotes.
         let target_geo = req.sql_filter.as_deref().and_then(extract_target_geo);
+        let target_role = req.sql_filter.as_deref().and_then(extract_target_role);
         // We embedded the question as `qv` above — reuse it for the
         // playbook similarity lookup so we don't double-pay Ollama.
         let boosts = state.playbook_memory
-            .compute_boost_for_filtered(
+            .compute_boost_for_filtered_with_role(
                 &qv,
                 boost_k,
                 0.5,
                 target_geo.as_ref().map(|(c, s)| (c.as_str(), s.as_str())),
+                target_role.as_deref(),
             )
             .await;
 
@@ -850,12 +853,13 @@ async fn hybrid_search(
             }
         }
         tracing::info!(
-            "playbook_boost: boosts={} sources={} parsed={} matched={} target_geo={:?} (query='{}')",
+            "playbook_boost: boosts={} sources={} parsed={} matched={} target_geo={:?} target_role={:?} (query='{}')",
             boosts.len(),
             sources.len(),
             parsed_count,
             matched_count,
             target_geo,
+            target_role,
             req.question.chars().take(60).collect::<String>(),
         );
         // Re-rank: boosted scores can flip ordering.
@@ -2098,51 +2102,53 @@ struct LanceRecallQuery {
 /// "{Name} — {Role} in {City}, {State}. Skills: …".
 /// Returns None if the chunk doesn't match the shape; callers simply
 /// skip the boost for that hit.
+/// Extract role from an SQL filter matching `role = 'Welder'` style.
+/// Case-insensitive on the column name. Quoted value; quotes not
+/// included in returned string.
+fn extract_target_role(sql_filter: &str) -> Option<String> {
+    grab_eq_value(sql_filter, "role")
+}
+
+/// Shared equality-value extractor for (city, state, role) lookups.
+fn grab_eq_value(src: &str, col: &str) -> Option<String> {
+    let lower = src.to_ascii_lowercase();
+    let col_lower = col.to_ascii_lowercase();
+    let mut search_from = 0usize;
+    while let Some(off) = lower[search_from..].find(&col_lower) {
+        let pos = search_from + off;
+        let prior_ok = pos == 0
+            || !lower.as_bytes()[pos - 1].is_ascii_alphanumeric()
+               && lower.as_bytes()[pos - 1] != b'_';
+        let after = pos + col_lower.len();
+        if !prior_ok || after >= src.len() {
+            search_from = pos + col_lower.len();
+            continue;
+        }
+        let mut i = after;
+        while i < src.len() && src.as_bytes()[i] == b' ' { i += 1; }
+        if i >= src.len() || src.as_bytes()[i] != b'=' { search_from = pos + col_lower.len(); continue; }
+        i += 1;
+        while i < src.len() && src.as_bytes()[i] == b' ' { i += 1; }
+        if i >= src.len() || src.as_bytes()[i] != b'\'' { search_from = pos + col_lower.len(); continue; }
+        i += 1;
+        let start = i;
+        while i < src.len() && src.as_bytes()[i] != b'\'' { i += 1; }
+        if i > start {
+            return Some(src[start..i].to_string());
+        }
+        search_from = pos + col_lower.len();
+    }
+    None
+}
+
 /// Pull (city, state) out of a SQL filter that uses
 /// `city = 'Toledo' AND state = 'OH'` style equality. Returns None if
 /// either is missing — the caller keeps the original global boost map
 /// behavior (no geo narrowing). Case-insensitive on the column name
 /// so `CITY=` or `City =` also work.
 fn extract_target_geo(sql_filter: &str) -> Option<(String, String)> {
-    fn grab_eq(src: &str, col: &str) -> Option<String> {
-        // Very small parser, resilient enough for the executor's
-        // filter shapes. Matches `col = 'value'` or `col='value'` with
-        // case-insensitive column name.
-        let lower = src.to_ascii_lowercase();
-        let col_lower = col.to_ascii_lowercase();
-        let mut search_from = 0usize;
-        while let Some(off) = lower[search_from..].find(&col_lower) {
-            let pos = search_from + off;
-            // Require word boundary before the column name so "city"
-            // inside "civilian_rank" doesn't false-match.
-            let prior_ok = pos == 0
-                || !lower.as_bytes()[pos - 1].is_ascii_alphanumeric()
-                   && lower.as_bytes()[pos - 1] != b'_';
-            let after = pos + col_lower.len();
-            if !prior_ok || after >= src.len() {
-                search_from = pos + col_lower.len();
-                continue;
-            }
-            // Walk past whitespace, require '='.
-            let mut i = after;
-            while i < src.len() && src.as_bytes()[i] == b' ' { i += 1; }
-            if i >= src.len() || src.as_bytes()[i] != b'=' { search_from = pos + col_lower.len(); continue; }
-            i += 1;
-            while i < src.len() && src.as_bytes()[i] == b' ' { i += 1; }
-            // Value is single-quoted literal; extract until the next '.
-            if i >= src.len() || src.as_bytes()[i] != b'\'' { search_from = pos + col_lower.len(); continue; }
-            i += 1;
-            let start = i;
-            while i < src.len() && src.as_bytes()[i] != b'\'' { i += 1; }
-            if i > start {
-                return Some(src[start..i].to_string());
-            }
-            search_from = pos + col_lower.len();
-        }
-        None
-    }
-    let city = grab_eq(sql_filter, "city")?;
-    let state = grab_eq(sql_filter, "state")?;
+    let city = grab_eq_value(sql_filter, "city")?;
+    let state = grab_eq_value(sql_filter, "state")?;
     Some((city, state))
 }
 
diff --git a/tests/multi-agent/scenario.ts b/tests/multi-agent/scenario.ts
index 597fd86..3fd5dc6 100644
--- a/tests/multi-agent/scenario.ts
+++ b/tests/multi-agent/scenario.ts
@@ -92,6 +92,8 @@ const RETRY_ON_FAIL = process.env.LH_RETRY_ON_FAIL !== "0";
 // based on staffer.tool_level before calling anything else.
 let ACTIVE_EXECUTOR = EXECUTOR_MODEL;
 let ACTIVE_REVIEWER = REVIEWER_MODEL;
+let ACTIVE_EXECUTOR_CLOUD = false;
+let ACTIVE_REVIEWER_CLOUD = false;
 let ACTIVE_T3_DISABLED = T3_DISABLED;
 let ACTIVE_OVERVIEW_CLOUD = OVERVIEW_CLOUD;
 let ACTIVE_RETRY_ON_FAIL = RETRY_ON_FAIL;
@@ -101,6 +103,8 @@ function applyToolLevel(level: Staffer["tool_level"] | undefined): void {
   // don't leak.
   ACTIVE_EXECUTOR = EXECUTOR_MODEL;
   ACTIVE_REVIEWER = REVIEWER_MODEL;
+  ACTIVE_EXECUTOR_CLOUD = false;
+  ACTIVE_REVIEWER_CLOUD = false;
   ACTIVE_T3_DISABLED = T3_DISABLED;
   ACTIVE_OVERVIEW_CLOUD = OVERVIEW_CLOUD;
   ACTIVE_RETRY_ON_FAIL = RETRY_ON_FAIL;
@@ -113,14 +117,22 @@ function applyToolLevel(level: Staffer["tool_level"] | undefined): void {
       ACTIVE_OVERVIEW_CLOUD = false;
       break;
     case "basic":
-      ACTIVE_EXECUTOR = "qwen2.5:latest";
-      ACTIVE_REVIEWER = "qwen2.5:latest";
+      // qwen2.5 collapsed on this workload (0/14 fill). Replace with
+      // cloud kimi-k2.5 — same family as k2.6 (which requires a paid
+      // tier), strong at tool calling. kimi-k2.6 is targeted when the
+      // subscription upgrades.
+      ACTIVE_EXECUTOR = "kimi-k2.5";
+      ACTIVE_EXECUTOR_CLOUD = true;
+      ACTIVE_REVIEWER = "qwen3:latest";  // local reviewer stays cheap
       ACTIVE_OVERVIEW_CLOUD = false;
       ACTIVE_RETRY_ON_FAIL = false;
       break;
     case "minimal":
-      ACTIVE_EXECUTOR = "qwen2.5:latest";
-      ACTIVE_REVIEWER = "qwen2.5:latest";
+      // Same executor as basic but strip the overseer + rescue.
+      // Proves whether playbook inheritance alone carries the load.
+      ACTIVE_EXECUTOR = "kimi-k2.5";
+      ACTIVE_EXECUTOR_CLOUD = true;
+      ACTIVE_REVIEWER = "qwen3:latest";
       ACTIVE_T3_DISABLED = true;
       ACTIVE_OVERVIEW_CLOUD = false;
       ACTIVE_RETRY_ON_FAIL = false;
@@ -514,6 +526,7 @@ async function runAgentFill(
         shape: "json",
         max_continuations: 3,
         think: false,
+        cloud: ACTIVE_EXECUTOR_CLOUD,
         on_continuation: (n, len) =>
           append({ turn, role: "executor", model: ACTIVE_EXECUTOR, kind: "note",
             content: { continuation: n, combined_chars: len } }),
@@ -577,6 +590,7 @@ async function runAgentFill(
         shape: "json",
         max_continuations: 3,
         think: false,
+        cloud: ACTIVE_REVIEWER_CLOUD,
         on_continuation: (n, len) =>
           append({ turn, role: "reviewer", model: ACTIVE_REVIEWER, kind: "note",
             content: { continuation: n, combined_chars: len } }),