diff --git a/tests/multi-agent/scenario.ts b/tests/multi-agent/scenario.ts index 17ff135..ceaf8ec 100644 --- a/tests/multi-agent/scenario.ts +++ b/tests/multi-agent/scenario.ts @@ -136,6 +136,10 @@ interface EventResult { pool_size?: number; // sql_matches from the first hybrid_search playbook_citations?: string[]; discovered_pattern?: string; // Path 2 meta-index snapshot per event + diagnostic_log?: LogEntry[]; // SQL filters, pool sizes, drift reasons — + // what T3 needs to diagnose supply issues + // vs surface symptoms. Present on both + // failed and successful events. } interface RosterEntry { @@ -341,9 +345,12 @@ async function runAgentFill( task: TaskSpec, extra_guidance: string, exclude_worker_ids: string[], + sharedLog?: LogEntry[], // If provided, runAgentFill appends here too so + // callers can diagnose AFTER a throw. Empty by + // default — passing this in is opt-in. ): Promise { const t0 = Date.now(); - const log: LogEntry[] = []; + const log: LogEntry[] = sharedLog ?? []; let turn = 0; let consecutiveDrifts = 0; let sealed: { fills: Fill[]; approach: string } | null = null; @@ -677,8 +684,14 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise - `- ${p.event.at} ${p.event.kind} ${p.event.role}×${p.event.count} → ${p.ok ? p.fills.length + "/" + p.event.count + " filled" : "FAIL"}; pool=${p.pool_size ?? "?"}; cites=${p.playbook_citations?.length ?? 0}` + `- ${p.event.at} ${p.event.kind} ${p.event.role}×${p.event.count} in ${p.event.city},${p.event.state} → ${p.ok ? p.fills.length + "/" + p.event.count + " filled" : "FAIL(" + (p.error?.slice(0, 40) ?? "?") + ")"}; pool=${p.pool_size ?? "?"}; cites=${p.playbook_citations?.length ?? 0}` ).join("\n"); + const diag = extractDiagnostics(result.diagnostic_log); + const diagBlock = result.ok + ? 
"" // only include for failures — successful events don't need the forensic detail + : `\n\nFAILURE FORENSICS (raw signals the agent saw — use to diagnose root cause):\n` + + ` SQL filters attempted (first 3):\n${diag.sql_filters.slice(0, 3).map(f => " - " + f).join("\n") || " (none)"}\n` + + ` Row counts per tool call: ${diag.hybrid_row_counts.slice(0, 5).join(", ") || "(none)"}\n` + + (diag.sql_errors.length > 0 ? ` SQL errors: ${diag.sql_errors.slice(0, 2).join(" | ")}\n` : "") + + ` Reviewer drift reasons (first 2): ${diag.drift_reasons.slice(0, 2).join(" | ") || "(none recorded)"}\n` + + ` KEY CHECK: if row counts were non-zero but the reviewer said "no matches for ${event.city}", the SQL filter yielded 0 rows in the target city and vector search FELL BACK to workers in OTHER cities. That is a ZERO-SUPPLY condition — pivot city or broaden role.`; + const thisOne = `This event: ${event.at} ${event.kind} ${event.role}×${event.count} in ${event.city}, ${event.state}. ` + `Outcome: ${result.ok ? "filled " + result.fills.length + "/" + event.count : "FAILED: " + (result.error ?? "unknown")}. ` + `Pool size: ${result.pool_size ?? "n/a"}. Turns: ${result.turns}. Playbook citations: ${result.playbook_citations?.length ?? 0}. ` @@ -899,9 +959,9 @@ async function runOverviewCheckpoint( Recent events (most recent last): ${priorSummary || "(no prior events)"} -${thisOne} +${thisOne}${diagBlock} -Your job: emit ONE risk flag (≤6 words) and ONE actionable hint (≤25 words) for the NEXT event. Be concrete: name the role, city, or worker class if relevant. Do not restate what happened. Think step by step, then output strictly as: +Your job: emit ONE risk flag (≤8 words) and ONE actionable hint (≤40 words) for the NEXT event. Be concrete: name the role, city, worker class, OR a geographic pivot (e.g. "pivot Gary IN → Chicago IL, 40min drive"). Do not restate what happened. If the failure was zero-supply, your hint MUST propose a specific alternative city or role.
Think step by step, then output strictly as: RISK: HINT: `; @@ -940,9 +1000,12 @@ HINT: `; async function runCrossDayLesson(ctx: ScenarioContext, checkpoints: OverviewCheckpoint[]): Promise { if (T3_DISABLED) return null; - const eventDigest = ctx.results.map(r => - `- ${r.event.at} ${r.event.kind} ${r.event.role}×${r.event.count} ${r.event.city},${r.event.state} → ${r.ok ? r.fills.length + " filled" : "FAIL"}; pool=${r.pool_size ?? "?"}; turns=${r.turns}; cites=${r.playbook_citations?.length ?? 0}; gaps=${r.gap_signals.length}` - ).join("\n"); + const eventDigest = ctx.results.map(r => { + const diag = extractDiagnostics(r.diagnostic_log); + const zeroSupply = !r.ok && diag.drift_reasons.some(n => /no match|no candidat|0 rows/i.test(n)); + const supplyTag = zeroSupply ? " [ZERO-SUPPLY: pivot city needed]" : ""; + return `- ${r.event.at} ${r.event.kind} ${r.event.role}×${r.event.count} ${r.event.city},${r.event.state} → ${r.ok ? r.fills.length + " filled" : "FAIL(" + (r.error?.slice(0, 40) ?? "?") + ")"}; pool=${r.pool_size ?? "?"}; turns=${r.turns}; cites=${r.playbook_citations?.length ?? 0}; gaps=${r.gap_signals.length}${supplyTag}`; + }).join("\n"); const checkpointDigest = checkpoints.length > 0 ? checkpoints.map(c => `- after ${c.after_event} (${c.event_kind}): risk="${c.risk}" hint="${c.hint}"`).join("\n") @@ -960,6 +1023,8 @@ ${checkpointDigest} Your job: write ONE actionable lesson for future runs that face similar setups. Target audience: the agent tomorrow. Keep the lesson to 3-5 sentences. No filler, no restating. Think step by step about what pattern repeated, what to pre-fetch, or what to avoid — then write the lesson as plain prose. +PRIORITY: If ANY event carries the [ZERO-SUPPLY: pivot city needed] tag, your lesson MUST name the specific cities that had zero supply and propose the nearest alternate city with roughly the required role count. That is the single most valuable signal for future runs — do not bury it under generic advice. 
Example: "Gary, IN has no Electricians in the corpus — pivot to Chicago, IL (40mi) or Hammond, IN (10mi) for future fills." + LESSON:`; try { diff --git a/tests/multi-agent/scenarios/stress_01.json b/tests/multi-agent/scenarios/stress_01.json new file mode 100644 index 0000000..fa1057c --- /dev/null +++ b/tests/multi-agent/scenarios/stress_01.json @@ -0,0 +1,58 @@ +{ + "client": "Ironclad Industrial", + "date": "2026-04-22", + "events": [ + { + "kind": "baseline_fill", + "at": "07:00", + "role": "Electrician", + "count": 5, + "city": "Gary", + "state": "IN", + "shift_start": "07:00 AM", + "scenario_note": "Gary IN has ZERO Electricians in the index. Local WILL fail this. Cloud should diagnose no-supply and recommend pivoting to Chicago IL (40min drive) or relaxing to 'Maintenance Tech'." + }, + { + "kind": "expansion", + "at": "09:30", + "role": "Safety Coordinator", + "count": 8, + "city": "Peoria", + "state": "IL", + "shift_start": "09:30 AM", + "scenario_note": "Safety Coordinator is the rarest role overall (~4500 nationally). 8× in a mid-sized city with availability > 0.5 is genuinely tight. Cloud should either confirm or suggest multi-city sourcing." + }, + { + "kind": "emergency", + "at": "11:45", + "role": "Welder", + "count": 3, + "city": "Flint", + "state": "MI", + "shift_start": "12:00 PM", + "deadline": "13:30", + "scenario_note": "Flint MI has ZERO workers indexed — total data desert. Cloud must flag 'impossible supply' and recommend pivot (Detroit 60mi, Saginaw 40mi)." + }, + { + "kind": "expansion", + "at": "14:00", + "role": "Tool & Die Maker", + "count": 4, + "city": "Grand Rapids", + "state": "MI", + "shift_start": "02:00 PM", + "scenario_note": "Tool & Die Maker is scarce (~9000 total). 4× in Grand Rapids, availability > 0.5 AND reliability > 0.75. Tight but solvable if playbook_memory has history; cloud should prioritize proven performers."
+ }, + { + "kind": "misplacement", + "at": "15:30", + "role": "Electrician", + "count": 1, + "city": "Gary", + "state": "IN", + "shift_start": "03:30 PM", + "replaces_event": "07:00", + "scenario_note": "Refilling 1× Electrician in Gary after a no-show. Same data desert as event 1 — cloud should recognize the repeat and recommend the SAME pivot it gave earlier, proving it learns within-run." + } + ] +}