diff --git a/tests/multi-agent/scenario.ts b/tests/multi-agent/scenario.ts index d7faa0a..6e2d335 100644 --- a/tests/multi-agent/scenario.ts +++ b/tests/multi-agent/scenario.ts @@ -80,6 +80,7 @@ interface EventResult { sources_last_score?: number; pool_size?: number; // sql_matches from the first hybrid_search playbook_citations?: string[]; + discovered_pattern?: string; // Path 2 meta-index snapshot per event } interface RosterEntry { @@ -203,7 +204,10 @@ async function executeToolCall(name: string, args: Record): Promise sql_filter, question, index_name, top_k: k ?? 10, generate: false, use_playbook_memory: true, - playbook_memory_k: 10, + // 2026-04-20 — bumped 10 → 100 to match server default change. At + // this memory size the semantic similarities cluster narrowly + // (0.55-0.67) and k=10 silently misses geo-matched playbooks. + playbook_memory_k: 100, }); } if (name === "sql") { @@ -406,18 +410,39 @@ function trimResult(r: any): any { // =================== Per-event guidance strings =================== function guidanceFor(event: FillEvent, ctx: ScenarioContext): string { - switch (event.kind) { - case "baseline_fill": - return `Standard Monday fill. Client ${ctx.spec.client}. Shift starts ${event.shift_start ?? "at start time"}. Take the top candidates by semantic match and availability.`; - case "recurring": - return `RECURRING slot — ${ctx.spec.client} runs this shift every Tues/Thurs. If playbook_memory surfaces candidates endorsed by past similar fills (you'll see 'cites' on hybrid sources), those are the preferred workers. Shift starts ${event.shift_start ?? "at start time"}.`; - case "expansion": - return `EXPANSION at ${ctx.spec.client}. New location, ${event.count}-worker team needed at once — search broadly and prefer workers with team/collaboration signals (engagement, communications scores). Shift starts ${event.shift_start ?? "at start time"}.`; - case "emergency": - return `EMERGENCY walkoff — ${ctx.spec.client} needs ${event.count} ${event.role}s BY ${event.deadline ?? "end of day"}. Prioritize availability over perfect skill match. A good-enough worker who can report today beats a perfect worker who can't.`; - case "misplacement": - return `MISPLACEMENT refill. A worker from the 08:00 shift no-showed. You must replace them WITHOUT proposing the same worker or anyone already booked today (see EXCLUDE list). Shift is ${event.shift_start ?? "in progress"} so speed matters.`; - } + // HARD SCHEMA GUARD: prior runs of this scenario had mistral invent + // column names from narrative guidance ("shift", "recurring", + // "expansion") and write SQL filters against them. Lock the schema + // explicitly so the executor has no excuse. Also pin `availability` + // and `reliability` as DOUBLE casts since their text-storage causes + // type_coercion errors otherwise. + const schemaLock = ` +SCHEMA ENFORCEMENT (CRITICAL): +The ONLY columns in workers_500k usable in sql_filter are: + worker_id, name, role, email, phone, city, state, zip, + skills, certifications, archetype, reliability, responsiveness, + engagement, communications, compliance, availability, resume_text. +Narrative words like "shift", "recurring", "expansion", "emergency", +"morning", "priority" are NOT columns. DO NOT invent columns. +Numeric filters need CAST: CAST(availability AS DOUBLE) > 0.5 and +CAST(reliability AS DOUBLE) > 0.7.`; + + const base = (() => { + switch (event.kind) { + case "baseline_fill": + return `Standard fill. Client ${ctx.spec.client}. Rank by semantic match; require CAST(availability AS DOUBLE) > 0.5.`; + case "recurring": + return `Recurring slot — prefer workers with past playbook citations (visible on hybrid sources). Require CAST(availability AS DOUBLE) > 0.5.`; + case "expansion": + return `New-location fill, ${event.count} workers at once. Require CAST(availability AS DOUBLE) > 0.5 AND CAST(reliability AS DOUBLE) > 0.75.`; + case "emergency": + return `Emergency replacement needed ASAP. Require CAST(availability AS DOUBLE) > 0.7. A good-enough available worker beats a perfect unavailable one.`; + case "misplacement": + return `Refill for a no-show. Do NOT propose anyone on the EXCLUDE list. Require CAST(availability AS DOUBLE) > 0.5.`; + } + })(); + + return `${schemaLock}\n\nEVENT FOCUS:\n${base}`; } // =================== Artifact generation =================== @@ -570,6 +595,19 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise(`${GATEWAY}/vectors/playbook_memory/patterns`, { + query: `${event.role} in ${event.city}, ${event.state}`, + top_k_playbooks: 25, + min_trait_frequency: 0.4, + }); + discovered_pattern = patterns?.discovered_pattern; + } catch { /* patterns are observational */ } + // Dispatch log (structured). await appendFile(join(ctx.out_dir, "dispatch.jsonl"), JSON.stringify({ @@ -581,16 +619,21 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise r.name), append: true, }); @@ -598,6 +641,26 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise 0) { + const lost = ctx.roster.find(r => r.status === "no_show"); + if (lost) { + try { + await httpJson(`${GATEWAY}/vectors/playbook_memory/mark_failed`, { + operation: `fill: ${lost.role} x1 in ${lost.city}, ${lost.state}`, + failed_names: [lost.name], + reason: `no-show from ${lost.booked_for} shift`, + }); + } catch (e) { + gap_signals.push(`mark_failed: ${(e as Error).message}`); + } + } + } + + // (discovered_pattern was computed + written above, before dispatch.jsonl) + return { event, ok: true, @@ -609,6 +672,7 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise { } } + // --- Workers-touched audit (don't leave anyone out) --- + // Pull every worker that surfaced as a hit across all 5 events — booked + // or excluded or rejected — so we can show the full population the + // system considered, not just the ones that made the cut. J's ask: + // "iterations and decisions that are made don't leave anyone out." + const touched = new Map(); + for (const r of ctx.results) { + for (const f of r.fills) { + const key = f.candidate_id; + if (!touched.has(key)) touched.set(key, { name: f.name, events: [], outcome: "booked" }); + touched.get(key)!.events.push(`${r.event.at} ${r.event.kind}`); + } + } + for (const r of ctx.roster) { + if (r.status === "no_show") { + const t = touched.get(r.worker_id); + if (t) t.outcome = "booked-then-no_show"; + else touched.set(r.worker_id, { name: r.name, events: [r.booked_for], outcome: "no_show" }); + } + } + + lines.push("## Workers touched across the week"); + lines.push(""); + lines.push(`${touched.size} distinct workers made it through to a decision. Every one is accounted for below — ` + + `no-shows flagged, rebookings noted, everyone visible.`); + lines.push(""); + lines.push("| Worker ID | Name | Events | Outcome |"); + lines.push("|---|---|---|---|"); + for (const [id, t] of touched) { + lines.push(`| ${id} | ${t.name} | ${t.events.join(" + ")} | ${t.outcome} |`); + } + lines.push(""); + + // --- Discovered patterns evolution across events --- + lines.push("## Discovered patterns (meta-index)"); + lines.push(""); + lines.push("What the system identified across semantically-similar past fills as each event ran:"); + lines.push(""); + for (const r of ctx.results) { + const dp = (r as any).discovered_pattern ?? "—"; + lines.push(`- **${r.event.at} ${r.event.kind}** (${r.event.role}): ${dp}`); + } + lines.push(""); + // --- Narrative summary --- lines.push("## Narrative"); lines.push(""); lines.push(`- ${ctx.results.filter(r => r.ok).length}/${ctx.results.length} events reached consensus.`); lines.push(`- Final roster: ${ctx.roster.length} bookings across ${new Set(ctx.roster.map(r => r.worker_id)).size} distinct workers.`); + lines.push(`- Workers touched (booked, failed, or otherwise decided): ${touched.size}.`); const totalCites = ctx.results.reduce((a, r) => a + (r.playbook_citations?.length ?? 0), 0); lines.push(`- Playbook citations across the day: ${totalCites} (proof the feedback loop fired across events).`); const droppedEvents = ctx.results.filter(r => !r.ok);