From a7fc8e22566cb0a1c54f472e5dc70fb409fb3973 Mon Sep 17 00:00:00 2001
From: root
Date: Mon, 20 Apr 2026 22:01:45 -0500
Subject: [PATCH] =?UTF-8?q?Item=20B=20=E2=80=94=20cloud-rescue=20retry=20o?=
 =?UTF-8?q?n=20event=20failure?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a scenario event fails (drift abort or other error) and
LH_RETRY_ON_FAIL is on (default when cloud T3 is enabled), ask cloud
for a concrete pivot — new city, role, or count — then re-run the
event with the remediation's fields. Capped at 1 retry per event so a
genuinely impossible scenario can't burn budget.

requestCloudRemediation(event, result):
- Feeds cloud the same diagnostic bundle T3 checkpoints get (SQL
  filters, row counts, SQL errors, reviewer drift reasons, gap
  signals).
- Prompt demands structured JSON:
  {retry, new_city, new_role, new_count, rationale} (see the sketch
  after this list).
- Cloud is instructed to pivot to the NEAREST alternate city when
  zero supply is detected, broaden the role when it is uniquely
  scarce, reduce the count when it is clearly unachievable, or return
  retry=false when no pivot seems viable.
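
A remediation then parses into CloudRemediation (defined in the diff
below). Illustrative literal, shaped after the Gary run described
under VERIFIED; values are hypothetical, not verbatim model output:

  const example: CloudRemediation = {
    retry: true,
    new_city: "South Bend",
    new_role: "Welder",   // hypothetical; the run's role isn't recorded here
    new_count: 1,
    rationale: "Gary surfaced zero supply; South Bend is the nearest comparable pool.",
  };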

EventResult additions:
- retry_attempt, retry_remediation (with rationale + cloud_model +
  duration), retry_result (full inner result shape), original_event.
- If the retry succeeded, it becomes the primary result and
  original_event preserves what was attempted first. If the retry
  also failed, the primary stays the failure and the retry is
  recorded alongside.

Sanitizer on cloud output: the model sometimes emits "Hammond, IN" in
new_city with "IN" in a non-existent new_state field, producing
"Hammond, IN, IN" downstream. Split new_city on the comma, take the
first token as the city, and extract the state if one trails the
comma. The original event's state is the fallback, as sketched below.
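
The intended mapping, sketched with hypothetical inputs (this is the
sanitizeCity/sanitizeState pair in the diff below):

  new_city "Hammond, IN"  → city "Hammond", state "IN"
  new_city "Hammond"      → city "Hammond", state = event.state
  new_city missing        → event.city and event.state kept as-is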

VERIFIED on stress_01.json with LH_OVERVIEW_CLOUD=1:
- Without rescue (item A baseline): 1/5 events ok
- With rescue (item B): 3/5 events ok
- Gary IN misplacement: drift → cloud proposed South Bend IN → retry
  filled 1/1. Rationale stored in retry_remediation for forensics.

Known limits surfaced (future work):
- City-field mangling failed one rescue before the sanitizer landed;
  the next run will pick up the fix.
- Cloud picks alternate cities without knowing ground-truth supply.
  Flint → Saginaw pivoted, but Saginaw also had sparse Welders.
  Future: expose a /vectors/supply-estimate endpoint cloud can
  consult before proposing a pivot.
---
 tests/multi-agent/scenario.ts | 189 +++++++++++++++++++++++++++++++++-
 1 file changed, 187 insertions(+), 2 deletions(-)

diff --git a/tests/multi-agent/scenario.ts b/tests/multi-agent/scenario.ts
index ceaf8ec..44220d8 100644
--- a/tests/multi-agent/scenario.ts
+++ b/tests/multi-agent/scenario.ts
@@ -79,6 +79,11 @@ const OVERVIEW_MODEL = process.env.LH_OVERVIEW_MODEL
     : (T3_TIER?.local_fallback?.model ?? "gpt-oss:20b"));
 const T3_CHECKPOINT_EVERY = Number(process.env.LH_T3_CHECKPOINT_EVERY ?? 3);
 const T3_DISABLED = process.env.LH_T3_DISABLE === "1";
+// Phase 22 item B — cloud-assisted retry on event failure. Default ON
+// when cloud T3 is enabled (LH_OVERVIEW_CLOUD=1) since that's the only
+// path where the rescue call gets a model worth asking. Disable with
+// LH_RETRY_ON_FAIL=0 to compare baseline outcomes without rescue.
+const RETRY_ON_FAIL = process.env.LH_RETRY_ON_FAIL !== "0";
 
 // Dispatcher: route T3 calls to local sidecar or Ollama Cloud depending
 // on the LH_OVERVIEW_CLOUD flag. Hot-path T1/T2 always stay local.
@@ -140,6 +145,31 @@ interface EventResult {
                                 // what T3 needs to diagnose supply issues
                                 // vs surface symptoms. Present on both
                                 // failed and successful events.
+  // Phase 22 item B — cloud-assisted retry fields. When a first attempt
+  // fails and LH_RETRY_ON_FAIL is on, T3 cloud proposes a pivot; if
+  // adopted, the retry re-runs the event with the new (city, role,
+  // count) and records the result in `retry_result`. original_event is
+  // the pre-pivot spec, kept for the retrospective so we can show the
+  // coordinator what changed.
+  retry_attempt?: number;
+  retry_remediation?: {
+    proposed_city?: string;
+    proposed_role?: string;
+    proposed_count?: number;
+    rationale: string;
+    cloud_model: string;
+    cloud_duration_secs: number;
+  };
+  retry_result?: Omit<EventResult, "retry_attempt" | "retry_remediation" | "retry_result" | "original_event" | "diagnostic_log">;
+  original_event?: FillEvent;   // what the event was before pivot
+}
+
+interface CloudRemediation {
+  retry: boolean;
+  new_city?: string;
+  new_role?: string;
+  new_count?: number;
+  rationale: string;
 }
 
 interface RosterEntry {
@@ -643,7 +673,7 @@ Body (4-6 lines max). Be specific about:
 - Any scenario flag: ${event.scenario_note ?? "(none)"}
 
 Workers:
-${outcome.fills.map(f => `- ${f.name} (${f.reason.slice(0, 60)})`).join("\n")}
+${outcome.fills.map(f => `- ${f.name}${f.reason ? " (" + f.reason.slice(0, 60) + ")" : ""}`).join("\n")}
 
 Respond with only the email. No commentary.`;
 
@@ -997,6 +1027,71 @@ HINT: `;
   };
 }
 
+// Phase 22 item B — rescue path. When an event fails, feed the
+// failure trace (SQL, pool, drift reasons, gap signals) to cloud T3
+// and ask for a concrete pivot: new city, new role, or new count.
+// Returns null if cloud can't help or the verdict says "impossible,
+// don't retry". Same diagnostic enrichment as the checkpoint prompt
+// so cloud reasons from raw signals, not symptoms.
+async function requestCloudRemediation(
+  event: FillEvent,
+  result: EventResult,
+): Promise<{ remediation: CloudRemediation; duration_secs: number } | null> {
+  if (T3_DISABLED) return null;
+  const start = Date.now();
+  const diag = extractDiagnostics(result.diagnostic_log);
+
+  const diagBlock = `SQL filters attempted (first 3):\n${diag.sql_filters.slice(0, 3).map(f => "  - " + f).join("\n") || "  (none)"}\n`
+    + `Row counts per tool call: ${diag.hybrid_row_counts.slice(0, 5).join(", ") || "(none)"}\n`
+    + (diag.sql_errors.length > 0 ? `SQL errors: ${diag.sql_errors.slice(0, 2).join(" | ")}\n` : "")
+    + `Reviewer drift reasons (first 3): ${diag.drift_reasons.slice(0, 3).join(" | ") || "(none recorded)"}\n`
+    + `Gap signals: ${result.gap_signals.join(" | ") || "none"}`;
+
+  const prompt = `You are the rescue agent for a staffing coordinator system. An event just FAILED. Diagnose the root cause, then propose ONE concrete retry plan.
+
+FAILED EVENT:
+  at=${event.at} kind=${event.kind} role=${event.role} count=${event.count} city=${event.city} state=${event.state}
+  outcome: ${result.error ?? "unknown failure"}
+  turns used: ${result.turns}
+  pool surfaced: ${result.pool_size ?? "n/a"}
+
+RAW DIAGNOSTICS (what the agent actually saw):
+${diagBlock}
+
+Respond with a JSON object (NOTHING else, no prose before or after, no markdown):
+{
+  "retry": true | false,
+  "new_city": "string (same as original if no pivot)",
+  "new_role": "string (same as original if no pivot)",
+  "new_count": <number>,
+  "rationale": "2-3 sentences explaining the fix or why retry is futile"
+}
+
+RULES:
+- Set retry=true only if you believe the pivot will likely succeed.
+- If the city has genuinely zero supply, pivot to the NEAREST alternate city with a comparable labor pool (name a specific one).
+- If the role is uniquely scarce, either broaden to a synonym role OR reduce the count to something achievable.
+- If no pivot seems viable, set retry=false — wasting a retry is worse than declaring the event impossible.
+- Keep new_count realistic. Don't propose 5× in a city that clearly only has 2 workers.`;

+  let raw = "";
+  try {
+    raw = await overviewGenerate(prompt, { temperature: 0.2, max_tokens: 900 });
+  } catch {
+    return null;
+  }
+  const duration_secs = (Date.now() - start) / 1000;
+  try {
+    const m = raw.match(/\{[\s\S]*\}/);
+    if (!m) return null;
+    const parsed = JSON.parse(m[0]) as CloudRemediation;
+    if (typeof parsed.retry !== "boolean") return null;
+    return { remediation: parsed, duration_secs };
+  } catch {
+    return null;
+  }
+}
+
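+// Shape note (illustrative, hypothetical values; the real call site is
+// the rescue block in main() below). The greedy /\{[\s\S]*\}/ match
+// above tolerates stray prose around the JSON, so a success resolves
+// roughly to:
+//   { remediation: { retry: true, new_city: "South Bend", new_count: 1,
+//       rationale: "..." }, duration_secs: 8.4 }
+// Cloud errors, unparseable output, and T3_DISABLED all resolve to null.
+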
 async function runCrossDayLesson(ctx: ScenarioContext, checkpoints: OverviewCheckpoint[]): Promise<CrossDayLesson | null> {
   if (T3_DISABLED) return null;
 
@@ -1258,7 +1353,97 @@ async function main() {
       }
     }
 
-    const result = await runEvent(event, ctx);
+    let result = await runEvent(event, ctx);
+
+    // Phase 22 item B — cloud rescue on failure. When an event fails
+    // and LH_RETRY_ON_FAIL is on (default on when cloud T3 is on), ask
+    // cloud for a concrete pivot and re-run the event with the new
+    // (city, role, count). Capped at 1 retry per event to keep the
+    // budget bounded and avoid infinite loops on genuinely impossible
+    // scenarios.
+    if (!result.ok && RETRY_ON_FAIL && !T3_DISABLED) {
+      console.log(`  ▶ cloud rescue requested for ${event.at} ${event.kind}…`);
+      const rescue = await requestCloudRemediation(event, result);
+      if (rescue && rescue.remediation.retry) {
+        const r = rescue.remediation;
+        // Sanitize cloud's fields — the model sometimes emits "Hammond, IN"
+        // as new_city and "IN" as new_state, producing "Hammond, IN, IN"
+        // downstream. Split on the comma and take the first token as the city.
+        const sanitizeCity = (c: string | undefined) => (c ?? "").split(",")[0].trim();
+        const sanitizeState = (c: string | undefined, stateFromCity: string) => {
+          const explicit = (c ?? "").trim();
+          // No explicit state exists on CloudRemediation, so the caller
+          // passes undefined; fall through to the state embedded in the
+          // city string (trailing ", XX"), then to the original event's.
+          return explicit || stateFromCity || event.state;
+        };
+        const cityRaw = r.new_city ?? event.city;
+        const cityClean = sanitizeCity(cityRaw);
+        const stateFromCity = (cityRaw.match(/,\s*([A-Z]{2})/) ?? [])[1] ?? "";
+        const newEvent: FillEvent = {
+          ...event,
+          city: cityClean || event.city,
+          state: sanitizeState(undefined, stateFromCity) || event.state,
+          role: r.new_role ?? event.role,
+          count: r.new_count ?? event.count,
+          scenario_note: `[cloud-rescue ${rescue.duration_secs.toFixed(1)}s] ${r.rationale}`,
+        };
+        const noChange = newEvent.city === event.city
+          && newEvent.role === event.role
+          && newEvent.count === event.count;
+        if (noChange) {
+          console.log(`    (cloud rescue declined — no actual pivot proposed)`);
+        } else {
+          console.log(`    retry: ${event.role}×${event.count} ${event.city},${event.state} → ${newEvent.role}×${newEvent.count} ${newEvent.city},${newEvent.state}`);
+          const retryResult = await runEvent(newEvent, ctx);
+          // Annotate the original result with the retry attempt so the
+          // retrospective can show both. The retry becomes THE result if
+          // it succeeded; otherwise the original stays the primary
+          // outcome and the retry is recorded alongside.
+          const originalForRecord = event;
+          const retryAnnotation = {
+            retry_attempt: 1,
+            retry_remediation: {
+              proposed_city: r.new_city,
+              proposed_role: r.new_role,
+              proposed_count: r.new_count,
+              rationale: r.rationale,
+              cloud_model: OVERVIEW_MODEL,
+              cloud_duration_secs: rescue.duration_secs,
+            },
+            retry_result: {
+              event: retryResult.event,
+              ok: retryResult.ok,
+              fills: retryResult.fills,
+              turns: retryResult.turns,
+              duration_secs: retryResult.duration_secs,
+              error: retryResult.error,
+              gap_signals: retryResult.gap_signals,
+              sources_first_score: retryResult.sources_first_score,
+              sources_last_score: retryResult.sources_last_score,
+              pool_size: retryResult.pool_size,
+              playbook_citations: retryResult.playbook_citations,
+              discovered_pattern: retryResult.discovered_pattern,
+            },
+            original_event: originalForRecord,
+          };
+          if (retryResult.ok) {
+            // Promote the retry to the primary result — the coordinator
+            // cares about the final outcome. Keep original_event so the
+            // pivot stays visible in the retrospective.
+            result = { ...retryResult, ...retryAnnotation };
+          } else {
+            // The retry also failed. Keep the original as primary,
+            // annotated with what was tried.
+            result = { ...result, ...retryAnnotation };
+          }
+          console.log(`    retry outcome: ${retryResult.ok ? "✓ filled " + retryResult.fills.length + "/" + newEvent.count : "✗ " + (retryResult.error?.slice(0, 80) ?? "unknown")}`);
+        }
+      } else {
+        console.log(`    (cloud rescue: ${rescue ? "retry=false" : "unavailable"})`);
+      }
+    }
+
     ctx.results.push(result);
     for (const s of result.gap_signals) {
       const [category, ...rest] = s.split(":");