Item B — cloud-rescue retry on event failure
When a scenario event fails (drift abort or other error) and
LH_RETRY_ON_FAIL is on (default when cloud T3 is enabled), ask cloud
for a concrete pivot — new city, role, or count — then re-run the
event with the remediation's fields. Capped at 1 retry per event so a
genuinely-impossible scenario can't burn budget.
requestCloudRemediation(event, result):
- Feeds the same diagnostic bundle T3 checkpoints get (SQL filters,
row counts, SQL errors, reviewer drift reasons, gap signals).
- Prompt demands structured JSON: {retry, new_city, new_role,
new_count, rationale}.
- Cloud is instructed to pivot to NEAREST alternate city when
zero-supply detected, broaden role when uniquely scarce, reduce
count when clearly unachievable, or return retry=false when no
pivot seems viable.
EventResult additions:
- retry_attempt, retry_remediation (with rationale + cloud_model +
duration), retry_result (full inner result shape), original_event.
- If retry succeeded, it becomes the primary result and original_event
preserves what was attempted first. If retry also failed, the
primary stays the failure and retry is recorded alongside.
Sanitizer on cloud output: model sometimes emits "Hammond, IN" in
new_city with "IN" in a non-existent new_state field, producing
"Hammond, IN, IN" downstream. Split new_city on comma, take first
token as city, extract state if present after the comma. Original
event's state is the fallback.
VERIFIED on stress_01.json with LH_OVERVIEW_CLOUD=1:
Without rescue (item A baseline): 1/5 events ok
With rescue (item B): 3/5 events ok
Gary IN misplacement: drift → cloud proposed South Bend IN → retry
filled 1/1. Rationale stored in retry_remediation for forensics.
Known limits surfaced (future work):
- City-field mangling ("Hammond, IN, IN") caused one rescue to fail
  before the sanitizer landed; the next run will pick up the fix.
- Cloud picks alternate cities without knowing ground-truth supply.
Flint → Saginaw pivoted but Saginaw also had sparse Welders.
Future: expose a /vectors/supply-estimate endpoint cloud can consult
before proposing a pivot.
This commit is contained in:
parent
c21b261877
commit
a7fc8e2256
@ -79,6 +79,11 @@ const OVERVIEW_MODEL = process.env.LH_OVERVIEW_MODEL
|
||||
: (T3_TIER?.local_fallback?.model ?? "gpt-oss:20b"));
|
||||
// Cadence (in events) at which T3 checkpoints fire; defaults to every 3
// events when LH_T3_CHECKPOINT_EVERY is unset. NOTE(review): Number("")
// is 0 and Number("junk") is NaN, so an empty or malformed env value
// silently changes the cadence — confirm that is acceptable.
const T3_CHECKPOINT_EVERY = Number(process.env.LH_T3_CHECKPOINT_EVERY ?? 3);

// Master kill switch: LH_T3_DISABLE=1 turns the whole T3 tier off.
const T3_DISABLED = process.env.LH_T3_DISABLE === "1";

// Phase 22 item B — cloud-assisted retry on event failure. Default ON
// when cloud T3 is enabled (LH_OVERVIEW_CLOUD=1) since that's the only
// path where the rescue call gets a model worth asking. Disable with
// LH_RETRY_ON_FAIL=0 to compare baseline outcomes without rescue.
// (Any value other than the literal string "0" leaves it enabled.)
const RETRY_ON_FAIL = process.env.LH_RETRY_ON_FAIL !== "0";
|
||||
// Dispatcher: route T3 calls to local sidecar or Ollama Cloud depending
|
||||
// on the LH_OVERVIEW_CLOUD flag. Hot-path T1/T2 always stay local.
|
||||
@ -140,6 +145,31 @@ interface EventResult {
|
||||
// what T3 needs to diagnose supply issues
|
||||
// vs surface symptoms. Present on both
|
||||
// failed and successful events.
|
||||
// Phase 22 item B — cloud-assisted retry fields. When a first attempt
|
||||
// fails and LH_RETRY_ON_FAIL is on, T3 cloud proposes a pivot; if
|
||||
// adopted, the retry re-runs the event with the new (city, role,
|
||||
// count) and records the result in `retry_result`. original_event is
|
||||
// the pre-pivot spec, kept for the retrospective so we can show the
|
||||
// coordinator what changed.
|
||||
retry_attempt?: number;
|
||||
retry_remediation?: {
|
||||
proposed_city?: string;
|
||||
proposed_role?: string;
|
||||
proposed_count?: number;
|
||||
rationale: string;
|
||||
cloud_model: string;
|
||||
cloud_duration_secs: number;
|
||||
};
|
||||
retry_result?: Omit<EventResult, "retry_attempt" | "retry_remediation" | "retry_result" | "diagnostic_log">;
|
||||
original_event?: FillEvent; // what the event was before pivot
|
||||
}
|
||||
|
||||
/**
 * Structured pivot proposal parsed from the cloud rescue response in
 * requestCloudRemediation. Mirrors the JSON schema demanded by the
 * rescue prompt: {retry, new_city, new_role, new_count, rationale}.
 */
interface CloudRemediation {
  // false means the cloud judged the event genuinely impossible — don't retry.
  retry: boolean;
  // Pivot city; may arrive as "City, ST" and is sanitized by the caller.
  new_city?: string;
  // Broader/synonym role when the original role is uniquely scarce.
  new_role?: string;
  // Reduced headcount when the original count is unachievable.
  new_count?: number;
  // 2-3 sentence explanation; stored in retry_remediation for forensics.
  rationale: string;
}
|
||||
|
||||
interface RosterEntry {
|
||||
@ -643,7 +673,7 @@ Body (4-6 lines max). Be specific about:
|
||||
- Any scenario flag: ${event.scenario_note ?? "(none)"}
|
||||
|
||||
Workers:
|
||||
${outcome.fills.map(f => `- ${f.name} (${f.reason.slice(0, 60)})`).join("\n")}
|
||||
${outcome.fills.map(f => `- ${f.name}${f.reason ? " (" + f.reason.slice(0, 60) + ")" : ""}`).join("\n")}
|
||||
|
||||
Respond with only the email. No commentary.`;
|
||||
|
||||
@ -997,6 +1027,71 @@ HINT: <hint>`;
|
||||
};
|
||||
}
|
||||
|
||||
// Phase 22 item B — rescue path. When an event fails, feed the
|
||||
// failure trace (SQL, pool, drift reasons, gap signals) to cloud T3
|
||||
// and ask for a concrete pivot: new city, new role, or new count.
|
||||
// Returns null if cloud can't help or flag says "impossible, don't
|
||||
// retry". Same diagnostic enrichment as the checkpoint prompt so
|
||||
// cloud reasons from raw signals, not symptoms.
|
||||
async function requestCloudRemediation(
|
||||
event: FillEvent,
|
||||
result: EventResult,
|
||||
): Promise<{ remediation: CloudRemediation; duration_secs: number } | null> {
|
||||
if (T3_DISABLED) return null;
|
||||
const start = Date.now();
|
||||
const diag = extractDiagnostics(result.diagnostic_log);
|
||||
|
||||
const diagBlock = `SQL filters attempted (first 3):\n${diag.sql_filters.slice(0, 3).map(f => " - " + f).join("\n") || " (none)"}\n`
|
||||
+ `Row counts per tool call: ${diag.hybrid_row_counts.slice(0, 5).join(", ") || "(none)"}\n`
|
||||
+ (diag.sql_errors.length > 0 ? `SQL errors: ${diag.sql_errors.slice(0, 2).join(" | ")}\n` : "")
|
||||
+ `Reviewer drift reasons (first 3): ${diag.drift_reasons.slice(0, 3).join(" | ") || "(none recorded)"}\n`
|
||||
+ `Gap signals: ${result.gap_signals.join(" | ") || "none"}`;
|
||||
|
||||
const prompt = `You are the rescue agent for a staffing coordinator system. An event just FAILED. Diagnose root cause, then propose ONE concrete retry plan.
|
||||
|
||||
FAILED EVENT:
|
||||
at=${event.at} kind=${event.kind} role=${event.role} count=${event.count} city=${event.city} state=${event.state}
|
||||
outcome: ${result.error ?? "unknown failure"}
|
||||
turns used: ${result.turns}
|
||||
pool surfaced: ${result.pool_size ?? "n/a"}
|
||||
|
||||
RAW DIAGNOSTICS (what the agent actually saw):
|
||||
${diagBlock}
|
||||
|
||||
Respond with a JSON object (NOTHING else, no prose before or after, no markdown):
|
||||
{
|
||||
"retry": true | false,
|
||||
"new_city": "string (same as original if no pivot)",
|
||||
"new_role": "string (same as original if no pivot)",
|
||||
"new_count": <integer (same as original if no pivot)>,
|
||||
"rationale": "2-3 sentences explaining the fix or why retry is futile"
|
||||
}
|
||||
|
||||
RULES:
|
||||
- Set retry=true only if you believe the pivot will likely succeed.
|
||||
- If the city has genuine zero supply, pivot to the NEAREST alternate city with comparable labor pool (name a specific one).
|
||||
- If the role is uniquely scarce, either broaden to a synonym role OR reduce count to something achievable.
|
||||
- If no pivot seems viable, set retry=false — wasting a retry is worse than declaring impossible.
|
||||
- Keep new_count realistic. Don't propose 5× in a city that clearly only has 2 workers.`;
|
||||
|
||||
let raw = "";
|
||||
try {
|
||||
raw = await overviewGenerate(prompt, { temperature: 0.2, max_tokens: 900 });
|
||||
} catch (e) {
|
||||
return null;
|
||||
}
|
||||
const duration_secs = (Date.now() - start) / 1000;
|
||||
try {
|
||||
const m = raw.match(/\{[\s\S]*\}/);
|
||||
if (!m) return null;
|
||||
const parsed = JSON.parse(m[0]) as CloudRemediation;
|
||||
if (typeof parsed.retry !== "boolean") return null;
|
||||
return { remediation: parsed, duration_secs };
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
async function runCrossDayLesson(ctx: ScenarioContext, checkpoints: OverviewCheckpoint[]): Promise<string | null> {
|
||||
if (T3_DISABLED) return null;
|
||||
|
||||
@ -1258,7 +1353,97 @@ async function main() {
|
||||
}
|
||||
}
|
||||
|
||||
const result = await runEvent(event, ctx);
|
||||
let result = await runEvent(event, ctx);
|
||||
|
||||
// Phase 22 item B — cloud rescue on failure. When an event fails
// and LH_RETRY_ON_FAIL is on (default on when cloud T3 is on), ask
// cloud for a concrete pivot and re-run the event with the new
// (city, role, count). Capped at 1 retry per event to keep the
// budget bounded and avoid infinite loops on genuinely-impossible
// scenarios.
if (!result.ok && RETRY_ON_FAIL && !T3_DISABLED) {
  console.log(` ▶ cloud rescue requested for ${event.at} ${event.kind}…`);
  const rescue = await requestCloudRemediation(event, result);
  if (rescue && rescue.remediation.retry) {
    const r = rescue.remediation;
    // Sanitize cloud's fields — model sometimes emits "Hammond, IN"
    // as new_city and "IN" as new_state, producing "Hammond, IN, IN"
    // downstream. Split on comma and take the first token for city.
    const sanitizeCity = (c: string | undefined) => (c ?? "").split(",")[0].trim();
    // NOTE(review): sanitizeState is only ever called with c=undefined
    // below, so `explicit` is always "" and the state comes from the
    // ", XX" suffix of new_city or falls back to the original event's
    // state — confirm whether a real new_state argument was intended.
    const sanitizeState = (c: string | undefined, stateFromCity: string) => {
      const explicit = (c ?? "").trim();
      // If explicit state is empty or matches original, try to
      // extract from the city string if it had a trailing ", XX".
      return explicit || stateFromCity || event.state;
    };
    const cityRaw = r.new_city ?? event.city;
    const cityClean = sanitizeCity(cityRaw);
    // Pull a trailing two-letter state code out of "City, ST" if present.
    const stateFromCity = (cityRaw.match(/,\s*([A-Z]{2})/) ?? [])[1] ?? "";
    // Pivoted event: cloud fields where provided, original fields as
    // fallback; rationale is threaded into scenario_note for the agent.
    const newEvent: FillEvent = {
      ...event,
      city: cityClean || event.city,
      state: sanitizeState(undefined, stateFromCity) || event.state,
      role: r.new_role ?? event.role,
      count: r.new_count ?? event.count,
      scenario_note: `[cloud-rescue ${rescue.duration_secs.toFixed(1)}s] ${r.rationale}`,
    };
    // If the cloud echoed the original spec back, there is nothing to
    // retry — treat it as a declined rescue.
    const noChange = newEvent.city === event.city
      && newEvent.role === event.role
      && newEvent.count === event.count;
    if (noChange) {
      console.log(` (cloud rescue declined — no actual pivot proposed)`);
    } else {
      console.log(` retry: ${event.role}×${event.count} ${event.city},${event.state} → ${newEvent.role}×${newEvent.count} ${newEvent.city},${newEvent.state}`);
      const retryResult = await runEvent(newEvent, ctx);
      // Annotate original result with the retry attempt so the
      // retrospective can show both. Retry becomes THE result if it
      // succeeded; otherwise original stays the primary outcome and
      // retry is recorded alongside.
      const originalForRecord = event;
      // NOTE(review): origTurnsDur is computed but never read below —
      // confirm whether first-attempt turns/duration were meant to be
      // folded into the promoted result.
      const origTurnsDur = { turns: result.turns, duration_secs: result.duration_secs };
      const retryAnnotation = {
        retry_attempt: 1,
        retry_remediation: {
          proposed_city: r.new_city,
          proposed_role: r.new_role,
          proposed_count: r.new_count,
          rationale: r.rationale,
          cloud_model: OVERVIEW_MODEL,
          cloud_duration_secs: rescue.duration_secs,
        },
        // Inner result with the retry/diagnostic fields stripped, matching
        // the Omit<…> shape of EventResult.retry_result.
        retry_result: {
          event: retryResult.event,
          ok: retryResult.ok,
          fills: retryResult.fills,
          turns: retryResult.turns,
          duration_secs: retryResult.duration_secs,
          error: retryResult.error,
          gap_signals: retryResult.gap_signals,
          sources_first_score: retryResult.sources_first_score,
          sources_last_score: retryResult.sources_last_score,
          pool_size: retryResult.pool_size,
          playbook_citations: retryResult.playbook_citations,
          discovered_pattern: retryResult.discovered_pattern,
        },
        original_event: originalForRecord,
      };
      if (retryResult.ok) {
        // Promote retry to primary result — the coordinator cares
        // about the final outcome. Keep original_event so the
        // pivot is visible in the retrospective.
        // NOTE(review): the trailing retry_attempt: 1 is redundant —
        // retryAnnotation already sets it.
        result = { ...retryResult, ...retryAnnotation, retry_attempt: 1 };
      } else {
        // Retry also failed. Keep original as primary, annotate
        // with what was tried.
        result = { ...result, ...retryAnnotation };
      }
      console.log(` retry outcome: ${retryResult.ok ? "✓ filled " + retryResult.fills.length + "/" + newEvent.count : "✗ " + (retryResult.error?.slice(0, 80) ?? "unknown")}`);
    }
  } else {
    console.log(` (cloud rescue: ${rescue ? "retry=false" : "unavailable"})`);
  }
}
|
||||
|
||||
ctx.results.push(result);
|
||||
for (const s of result.gap_signals) {
|
||||
const [category, ...rest] = s.split(":");
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user