From a7fc8e22566cb0a1c54f472e5dc70fb409fb3973 Mon Sep 17 00:00:00 2001
From: root
Date: Mon, 20 Apr 2026 22:01:45 -0500
Subject: [PATCH] =?UTF-8?q?Item=20B=20=E2=80=94=20cloud-rescue=20retry=20o?=
 =?UTF-8?q?n=20event=20failure?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a scenario event fails (drift abort or other error) and
LH_RETRY_ON_FAIL is on (default when cloud T3 is enabled), ask cloud
for a concrete pivot — new city, role, or count — then re-run the
event with the remediation's fields. Capped at 1 retry per event so a
genuinely impossible scenario can't burn budget.

requestCloudRemediation(event, result):
- Feeds cloud the same diagnostic bundle T3 checkpoints get (SQL
  filters, row counts, SQL errors, reviewer drift reasons, gap
  signals).
- Prompt demands structured JSON:
  {retry, new_city, new_role, new_count, rationale} (see the sketch
  after this list).
- Cloud is instructed to pivot to the NEAREST alternate city when
  zero supply is detected, broaden the role when it is uniquely
  scarce, reduce the count when it is clearly unachievable, or return
  retry=false when no pivot seems viable.
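
A remediation then parses into CloudRemediation (defined in the diff
below). Illustrative literal, shaped after the Gary run described
under VERIFIED; values are hypothetical, not verbatim model output:

  const example: CloudRemediation = {
    retry: true,
    new_city: "South Bend",
    new_role: "Welder",   // hypothetical; the run's role isn't recorded here
    new_count: 1,
    rationale: "Gary surfaced zero supply; South Bend is the nearest comparable pool.",
  };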

EventResult additions:
- retry_attempt, retry_remediation (with rationale + cloud_model +
  duration), retry_result (full inner result shape), original_event.
- If the retry succeeded, it becomes the primary result and
  original_event preserves what was attempted first. If the retry
  also failed, the primary stays the failure and the retry is
  recorded alongside.

Sanitizer on cloud output: the model sometimes emits "Hammond, IN" in
new_city with "IN" in a non-existent new_state field, producing
"Hammond, IN, IN" downstream. Split new_city on the comma, take the
first token as the city, and extract the state if one trails the
comma. The original event's state is the fallback, as sketched below.
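
The intended mapping, sketched with hypothetical inputs (this is the
sanitizeCity/sanitizeState pair in the diff below):

  new_city "Hammond, IN"  → city "Hammond", state "IN"
  new_city "Hammond"      → city "Hammond", state = event.state
  new_city missing        → event.city and event.state kept as-is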

VERIFIED on stress_01.json with LH_OVERVIEW_CLOUD=1:
- Without rescue (item A baseline): 1/5 events ok
- With rescue (item B): 3/5 events ok
- Gary IN misplacement: drift → cloud proposed South Bend IN → retry
  filled 1/1. Rationale stored in retry_remediation for forensics.

Known limits surfaced (future work):
- City-field mangling failed one rescue before the sanitizer landed;
  the next run will pick up the fix.
- Cloud picks alternate cities without knowing ground-truth supply.
  Flint → Saginaw pivoted, but Saginaw also had sparse Welders.
  Future: expose a /vectors/supply-estimate endpoint cloud can
  consult before proposing a pivot.
---
 tests/multi-agent/scenario.ts | 189 +++++++++++++++++++++++++++++++++-
 1 file changed, 187 insertions(+), 2 deletions(-)

diff --git a/tests/multi-agent/scenario.ts b/tests/multi-agent/scenario.ts
index ceaf8ec..44220d8 100644
--- a/tests/multi-agent/scenario.ts
+++ b/tests/multi-agent/scenario.ts
@@ -79,6 +79,11 @@ const OVERVIEW_MODEL = process.env.LH_OVERVIEW_MODEL
     : (T3_TIER?.local_fallback?.model ?? "gpt-oss:20b"));
 const T3_CHECKPOINT_EVERY = Number(process.env.LH_T3_CHECKPOINT_EVERY ?? 3);
 const T3_DISABLED = process.env.LH_T3_DISABLE === "1";
+// Phase 22 item B — cloud-assisted retry on event failure. Default ON
+// when cloud T3 is enabled (LH_OVERVIEW_CLOUD=1) since that's the only
+// path where the rescue call gets a model worth asking. Disable with
+// LH_RETRY_ON_FAIL=0 to compare baseline outcomes without rescue.
+const RETRY_ON_FAIL = process.env.LH_RETRY_ON_FAIL !== "0";
 
 // Dispatcher: route T3 calls to local sidecar or Ollama Cloud depending
 // on the LH_OVERVIEW_CLOUD flag. Hot-path T1/T2 always stay local.
@@ -140,6 +145,31 @@ interface EventResult {
                                 // what T3 needs to diagnose supply issues
                                 // vs surface symptoms. Present on both
                                 // failed and successful events.
+  // Phase 22 item B — cloud-assisted retry fields. When a first attempt
+  // fails and LH_RETRY_ON_FAIL is on, T3 cloud proposes a pivot; if
+  // adopted, the retry re-runs the event with the new (city, role,
+  // count) and records the result in `retry_result`. original_event is
+  // the pre-pivot spec, kept for the retrospective so we can show the
+  // coordinator what changed.
+  retry_attempt?: number;
+  retry_remediation?: {
+    proposed_city?: string;
+    proposed_role?: string;
+    proposed_count?: number;
+    rationale: string;
+    cloud_model: string;
+    cloud_duration_secs: number;
+  };
+  retry_result?: Omit<EventResult, "retry_attempt" | "retry_remediation" | "retry_result" | "original_event" | "diagnostic_log">;
+  original_event?: FillEvent;   // what the event was before pivot
+}
+
+interface CloudRemediation {
+  retry: boolean;
+  new_city?: string;
+  new_role?: string;
+  new_count?: number;
+  rationale: string;
 }
 
 interface RosterEntry {
@@ -643,7 +673,7 @@ Body (4-6 lines max). Be specific about:
 - Any scenario flag: ${event.scenario_note ?? "(none)"}
 
 Workers:
-${outcome.fills.map(f => `- ${f.name} (${f.reason.slice(0, 60)})`).join("\n")}
+${outcome.fills.map(f => `- ${f.name}${f.reason ? " (" + f.reason.slice(0, 60) + ")" : ""}`).join("\n")}
 
 Respond with only the email. No commentary.`;
 
@@ -997,6 +1027,71 @@ HINT: `;
   };
 }
 
+// Phase 22 item B — rescue path. When an event fails, feed the
+// failure trace (SQL, pool, drift reasons, gap signals) to cloud T3
+// and ask for a concrete pivot: new city, new role, or new count.
+// Returns null if cloud can't help or the verdict says "impossible,
+// don't retry". Same diagnostic enrichment as the checkpoint prompt
+// so cloud reasons from raw signals, not symptoms.
+async function requestCloudRemediation(
+  event: FillEvent,
+  result: EventResult,
+): Promise<{ remediation: CloudRemediation; duration_secs: number } | null> {
+  if (T3_DISABLED) return null;
+  const start = Date.now();
+  const diag = extractDiagnostics(result.diagnostic_log);
+
+  const diagBlock = `SQL filters attempted (first 3):\n${diag.sql_filters.slice(0, 3).map(f => "  - " + f).join("\n") || "  (none)"}\n`
+    + `Row counts per tool call: ${diag.hybrid_row_counts.slice(0, 5).join(", ") || "(none)"}\n`
+    + (diag.sql_errors.length > 0 ? `SQL errors: ${diag.sql_errors.slice(0, 2).join(" | ")}\n` : "")
+    + `Reviewer drift reasons (first 3): ${diag.drift_reasons.slice(0, 3).join(" | ") || "(none recorded)"}\n`
+    + `Gap signals: ${result.gap_signals.join(" | ") || "none"}`;
+
+  const prompt = `You are the rescue agent for a staffing coordinator system. An event just FAILED. Diagnose the root cause, then propose ONE concrete retry plan.
+
+FAILED EVENT:
+  at=${event.at} kind=${event.kind} role=${event.role} count=${event.count} city=${event.city} state=${event.state}
+  outcome: ${result.error ?? "unknown failure"}
+  turns used: ${result.turns}
+  pool surfaced: ${result.pool_size ?? "n/a"}
+
+RAW DIAGNOSTICS (what the agent actually saw):
+${diagBlock}
+
+Respond with a JSON object (NOTHING else, no prose before or after, no markdown):
+{
+  "retry": true | false,
+  "new_city": "string (same as original if no pivot)",
+  "new_role": "string (same as original if no pivot)",
+  "new_count": <number>,
+  "rationale": "2-3 sentences explaining the fix or why retry is futile"
+}
+
+RULES:
+- Set retry=true only if you believe the pivot will likely succeed.
+- If the city has genuinely zero supply, pivot to the NEAREST alternate city with a comparable labor pool (name a specific one).
+- If the role is uniquely scarce, either broaden to a synonym role OR reduce the count to something achievable.
+- If no pivot seems viable, set retry=false — wasting a retry is worse than declaring the event impossible.
+- Keep new_count realistic. Don't propose 5× in a city that clearly only has 2 workers.`;

+  let raw = "";
+  try {
+    raw = await overviewGenerate(prompt, { temperature: 0.2, max_tokens: 900 });
+  } catch {
+    return null;
+  }
+  const duration_secs = (Date.now() - start) / 1000;
+  try {
+    const m = raw.match(/\{[\s\S]*\}/);
+    if (!m) return null;
+    const parsed = JSON.parse(m[0]) as CloudRemediation;
+    if (typeof parsed.retry !== "boolean") return null;
+    return { remediation: parsed, duration_secs };
+  } catch {
+    return null;
+  }
+}
+
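+// Shape note (illustrative, hypothetical values; the real call site is
+// the rescue block in main() below). The greedy /\{[\s\S]*\}/ match
+// above tolerates stray prose around the JSON, so a success resolves
+// roughly to:
+//   { remediation: { retry: true, new_city: "South Bend", new_count: 1,
+//       rationale: "..." }, duration_secs: 8.4 }
+// Cloud errors, unparseable output, and T3_DISABLED all resolve to null.
+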
 async function runCrossDayLesson(ctx: ScenarioContext, checkpoints: OverviewCheckpoint[]): Promise<CrossDayLesson | null> {
   if (T3_DISABLED) return null;
 
@@ -1258,7 +1353,97 @@ async function main() {
       }
     }
 
-    const result = await runEvent(event, ctx);
+    let result = await runEvent(event, ctx);
+
+    // Phase 22 item B — cloud rescue on failure. When an event fails
+    // and LH_RETRY_ON_FAIL is on (default on when cloud T3 is on), ask
+    // cloud for a concrete pivot and re-run the event with the new
+    // (city, role, count). Capped at 1 retry per event to keep the
+    // budget bounded and avoid infinite loops on genuinely impossible
+    // scenarios.
+    if (!result.ok && RETRY_ON_FAIL && !T3_DISABLED) {
+      console.log(`  ▶ cloud rescue requested for ${event.at} ${event.kind}…`);
+      const rescue = await requestCloudRemediation(event, result);
+      if (rescue && rescue.remediation.retry) {
+        const r = rescue.remediation;
+        // Sanitize cloud's fields — the model sometimes emits "Hammond, IN"
+        // as new_city and "IN" as new_state, producing "Hammond, IN, IN"
+        // downstream. Split on the comma and take the first token as the city.
+        const sanitizeCity = (c: string | undefined) => (c ?? "").split(",")[0].trim();
+        const sanitizeState = (c: string | undefined, stateFromCity: string) => {
+          const explicit = (c ?? "").trim();
+          // No explicit state exists on CloudRemediation, so the caller
+          // passes undefined; fall through to the state embedded in the
+          // city string (trailing ", XX"), then to the original event's.
+          return explicit || stateFromCity || event.state;
+        };
+        const cityRaw = r.new_city ?? event.city;
+        const cityClean = sanitizeCity(cityRaw);
+        const stateFromCity = (cityRaw.match(/,\s*([A-Z]{2})/) ?? [])[1] ?? "";
+        const newEvent: FillEvent = {
+          ...event,
+          city: cityClean || event.city,
+          state: sanitizeState(undefined, stateFromCity) || event.state,
+          role: r.new_role ?? event.role,
+          count: r.new_count ?? event.count,
+          scenario_note: `[cloud-rescue ${rescue.duration_secs.toFixed(1)}s] ${r.rationale}`,
+        };
+        const noChange = newEvent.city === event.city
+          && newEvent.role === event.role
+          && newEvent.count === event.count;
+        if (noChange) {
+          console.log(`    (cloud rescue declined — no actual pivot proposed)`);
+        } else {
+          console.log(`    retry: ${event.role}×${event.count} ${event.city},${event.state} → ${newEvent.role}×${newEvent.count} ${newEvent.city},${newEvent.state}`);
+          const retryResult = await runEvent(newEvent, ctx);
+          // Annotate the original result with the retry attempt so the
+          // retrospective can show both. The retry becomes THE result if
+          // it succeeded; otherwise the original stays the primary
+          // outcome and the retry is recorded alongside.
+          const originalForRecord = event;
+          const retryAnnotation = {
+            retry_attempt: 1,
+            retry_remediation: {
+              proposed_city: r.new_city,
+              proposed_role: r.new_role,
+              proposed_count: r.new_count,
+              rationale: r.rationale,
+              cloud_model: OVERVIEW_MODEL,
+              cloud_duration_secs: rescue.duration_secs,
+            },
+            retry_result: {
+              event: retryResult.event,
+              ok: retryResult.ok,
+              fills: retryResult.fills,
+              turns: retryResult.turns,
+              duration_secs: retryResult.duration_secs,
+              error: retryResult.error,
+              gap_signals: retryResult.gap_signals,
+              sources_first_score: retryResult.sources_first_score,
+              sources_last_score: retryResult.sources_last_score,
+              pool_size: retryResult.pool_size,
+              playbook_citations: retryResult.playbook_citations,
+              discovered_pattern: retryResult.discovered_pattern,
+            },
+            original_event: originalForRecord,
+          };
+          if (retryResult.ok) {
+            // Promote the retry to the primary result — the coordinator
+            // cares about the final outcome. Keep original_event so the
+            // pivot stays visible in the retrospective.
+            result = { ...retryResult, ...retryAnnotation };
+          } else {
+            // The retry also failed. Keep the original as primary,
+            // annotated with what was tried.
+            result = { ...result, ...retryAnnotation };
+          }
+          console.log(`    retry outcome: ${retryResult.ok ? "✓ filled " + retryResult.fills.length + "/" + newEvent.count : "✗ " + (retryResult.error?.slice(0, 80) ?? "unknown")}`);
+        }
+      } else {
+        console.log(`    (cloud rescue: ${rescue ? "retry=false" : "unavailable"})`);
+      }
+    }
+
     ctx.results.push(result);
     for (const s of result.gap_signals) {
       const [category, ...rest] = s.split(":");