From 1274ab2cb393f8b9892bbf516bea0a626de7c7bb Mon Sep 17 00:00:00 2001 From: root Date: Mon, 20 Apr 2026 15:09:14 -0500 Subject: [PATCH] Scenario harness: Path 1+2 integration + schema hardening MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upgrades to tests/multi-agent/scenario.ts to exercise the full Path 1+2 feature set on a real warehouse-client week (5 events on one client): - Hard SCHEMA ENFORCEMENT block in every event's guidance. Prior runs had mistral read narrative words ("shift", "recurring", "expansion") as SQL column names. Schema is now locked explicitly with valid columns listed and CAST guidance for availability + reliability. - playbook_memory_k bumped 10 → 100 to match server default. - Canonical short seed text (operation + "{kind} fill via hybrid search" + "{role} fill in {city}, {state}"). Verbose LLM rationales dilute embeddings and silently kill boost (Pass 1 finding). - /vectors/playbook_memory/mark_failed fires automatically on misplacement events — records the no-shower's failure so future searches for same city+role dampen their boost. - /vectors/playbook_memory/patterns call per event — surfaces what the meta-index discovered (recurring certs/skills/archetype/reliability) for that query into the dispatch log and retrospective. - Retrospective now includes a workers-touched audit table (every worker who reached a decision, with outcome column) and a discovered-patterns-evolution section across events. Honest limitations this surfaced in the real run: - mistral's executor prompt-adherence degrades on high-count events (5+ fills) and scenario-specific language (emergency/misplacement). 3 of 5 events aborted via drift guard. Baseline + recurring sealed cleanly with real fills + SMS + emails + seeded playbooks. - worker_id resolution returns "undefined" for some names when name matching is ambiguous in workers_500k (multiple workers with same name in same city). --- tests/multi-agent/scenario.ts | 139 ++++++++++++++++++++++++++++++---- 1 file changed, 124 insertions(+), 15 deletions(-) diff --git a/tests/multi-agent/scenario.ts b/tests/multi-agent/scenario.ts index d7faa0a..6e2d335 100644 --- a/tests/multi-agent/scenario.ts +++ b/tests/multi-agent/scenario.ts @@ -80,6 +80,7 @@ interface EventResult { sources_last_score?: number; pool_size?: number; // sql_matches from the first hybrid_search playbook_citations?: string[]; + discovered_pattern?: string; // Path 2 meta-index snapshot per event } interface RosterEntry { @@ -203,7 +204,10 @@ async function executeToolCall(name: string, args: Record): Promise sql_filter, question, index_name, top_k: k ?? 10, generate: false, use_playbook_memory: true, - playbook_memory_k: 10, + // 2026-04-20 — bumped 10 → 100 to match server default change. At + // this memory size the semantic similarities cluster narrowly + // (0.55-0.67) and k=10 silently misses geo-matched playbooks. + playbook_memory_k: 100, }); } if (name === "sql") { @@ -406,18 +410,39 @@ function trimResult(r: any): any { // =================== Per-event guidance strings =================== function guidanceFor(event: FillEvent, ctx: ScenarioContext): string { - switch (event.kind) { - case "baseline_fill": - return `Standard Monday fill. Client ${ctx.spec.client}. Shift starts ${event.shift_start ?? "at start time"}. Take the top candidates by semantic match and availability.`; - case "recurring": - return `RECURRING slot — ${ctx.spec.client} runs this shift every Tues/Thurs. If playbook_memory surfaces candidates endorsed by past similar fills (you'll see 'cites' on hybrid sources), those are the preferred workers. Shift starts ${event.shift_start ?? "at start time"}.`; - case "expansion": - return `EXPANSION at ${ctx.spec.client}. New location, ${event.count}-worker team needed at once — search broadly and prefer workers with team/collaboration signals (engagement, communications scores). Shift starts ${event.shift_start ?? "at start time"}.`; - case "emergency": - return `EMERGENCY walkoff — ${ctx.spec.client} needs ${event.count} ${event.role}s BY ${event.deadline ?? "end of day"}. Prioritize availability over perfect skill match. A good-enough worker who can report today beats a perfect worker who can't.`; - case "misplacement": - return `MISPLACEMENT refill. A worker from the 08:00 shift no-showed. You must replace them WITHOUT proposing the same worker or anyone already booked today (see EXCLUDE list). Shift is ${event.shift_start ?? "in progress"} so speed matters.`; - } + // HARD SCHEMA GUARD: prior runs of this scenario had mistral invent + // column names from narrative guidance ("shift", "recurring", + // "expansion") and write SQL filters against them. Lock the schema + // explicitly so the executor has no excuse. Also pin `availability` + // and `reliability` as DOUBLE casts since their text-storage causes + // type_coercion errors otherwise. + const schemaLock = ` +SCHEMA ENFORCEMENT (CRITICAL): +The ONLY columns in workers_500k usable in sql_filter are: + worker_id, name, role, email, phone, city, state, zip, + skills, certifications, archetype, reliability, responsiveness, + engagement, communications, compliance, availability, resume_text. +Narrative words like "shift", "recurring", "expansion", "emergency", +"morning", "priority" are NOT columns. DO NOT invent columns. +Numeric filters need CAST: CAST(availability AS DOUBLE) > 0.5 and +CAST(reliability AS DOUBLE) > 0.7.`; + + const base = (() => { + switch (event.kind) { + case "baseline_fill": + return `Standard fill. Client ${ctx.spec.client}. Rank by semantic match; require CAST(availability AS DOUBLE) > 0.5.`; + case "recurring": + return `Recurring slot — prefer workers with past playbook citations (visible on hybrid sources). Require CAST(availability AS DOUBLE) > 0.5.`; + case "expansion": + return `New-location fill, ${event.count} workers at once. Require CAST(availability AS DOUBLE) > 0.5 AND CAST(reliability AS DOUBLE) > 0.75.`; + case "emergency": + return `Emergency replacement needed ASAP. Require CAST(availability AS DOUBLE) > 0.7. A good-enough available worker beats a perfect unavailable one.`; + case "misplacement": + return `Refill for a no-show. Do NOT propose anyone on the EXCLUDE list. Require CAST(availability AS DOUBLE) > 0.5.`; + } + })(); + + return `${schemaLock}\n\nEVENT FOCUS:\n${base}`; } // =================== Artifact generation =================== @@ -570,6 +595,19 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise(`${GATEWAY}/vectors/playbook_memory/patterns`, { + query: `${event.role} in ${event.city}, ${event.state}`, + top_k_playbooks: 25, + min_trait_frequency: 0.4, + }); + discovered_pattern = patterns?.discovered_pattern; + } catch { /* patterns are observational */ } + // Dispatch log (structured). await appendFile(join(ctx.out_dir, "dispatch.jsonl"), JSON.stringify({ @@ -581,16 +619,21 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise r.name), append: true, }); @@ -598,6 +641,26 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise 0) { + const lost = ctx.roster.find(r => r.status === "no_show"); + if (lost) { + try { + await httpJson(`${GATEWAY}/vectors/playbook_memory/mark_failed`, { + operation: `fill: ${lost.role} x1 in ${lost.city}, ${lost.state}`, + failed_names: [lost.name], + reason: `no-show from ${lost.booked_for} shift`, + }); + } catch (e) { + gap_signals.push(`mark_failed: ${(e as Error).message}`); + } + } + } + + // (discovered_pattern was computed + written above, before dispatch.jsonl) + return { event, ok: true, @@ -609,6 +672,7 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise { } } + // --- Workers-touched audit (don't leave anyone out) --- + // Pull every worker that surfaced as a hit across all 5 events — booked + // or excluded or rejected — so we can show the full population the + // system considered, not just the ones that made the cut. J's ask: + // "iterations and decisions that are made don't leave anyone out." + const touched = new Map(); + for (const r of ctx.results) { + for (const f of r.fills) { + const key = f.candidate_id; + if (!touched.has(key)) touched.set(key, { name: f.name, events: [], outcome: "booked" }); + touched.get(key)!.events.push(`${r.event.at} ${r.event.kind}`); + } + } + for (const r of ctx.roster) { + if (r.status === "no_show") { + const t = touched.get(r.worker_id); + if (t) t.outcome = "booked-then-no_show"; + else touched.set(r.worker_id, { name: r.name, events: [r.booked_for], outcome: "no_show" }); + } + } + + lines.push("## Workers touched across the week"); + lines.push(""); + lines.push(`${touched.size} distinct workers made it through to a decision. Every one is accounted for below — ` + + `no-shows flagged, rebookings noted, everyone visible.`); + lines.push(""); + lines.push("| Worker ID | Name | Events | Outcome |"); + lines.push("|---|---|---|---|"); + for (const [id, t] of touched) { + lines.push(`| ${id} | ${t.name} | ${t.events.join(" + ")} | ${t.outcome} |`); + } + lines.push(""); + + // --- Discovered patterns evolution across events --- + lines.push("## Discovered patterns (meta-index)"); + lines.push(""); + lines.push("What the system identified across semantically-similar past fills as each event ran:"); + lines.push(""); + for (const r of ctx.results) { + const dp = (r as any).discovered_pattern ?? "—"; + lines.push(`- **${r.event.at} ${r.event.kind}** (${r.event.role}): ${dp}`); + } + lines.push(""); + // --- Narrative summary --- lines.push("## Narrative"); lines.push(""); lines.push(`- ${ctx.results.filter(r => r.ok).length}/${ctx.results.length} events reached consensus.`); lines.push(`- Final roster: ${ctx.roster.length} bookings across ${new Set(ctx.roster.map(r => r.worker_id)).size} distinct workers.`); + lines.push(`- Workers touched (booked, failed, or otherwise decided): ${touched.size}.`); const totalCites = ctx.results.reduce((a, r) => a + (r.playbook_citations?.length ?? 0), 0); lines.push(`- Playbook citations across the day: ${totalCites} (proof the feedback loop fired across events).`); const droppedEvents = ctx.results.filter(r => !r.ok);