Scenario harness: Path 1+2 integration + schema hardening

Upgrades to tests/multi-agent/scenario.ts to exercise the full Path 1+2
feature set on a real warehouse-client week (5 events on one client):

- Hard SCHEMA ENFORCEMENT block in every event's guidance. Prior runs
  had mistral read narrative words ("shift", "recurring", "expansion")
  as SQL column names. Schema is now locked explicitly with valid
  columns listed and CAST guidance for availability + reliability.
- playbook_memory_k bumped 10 → 100 to match server default.
- Canonical short seed text (operation + "{kind} fill via hybrid
  search" + "{role} fill in {city}, {state}"). Verbose LLM rationales
  dilute the embeddings and silently kill the boost (Pass 1 finding).
- /vectors/playbook_memory/mark_failed fires automatically on
  misplacement events — records the no-show worker's failure so future
  searches for the same city+role dampen that worker's boost.
- /vectors/playbook_memory/patterns call per event — surfaces what the
  meta-index discovered (recurring certs/skills/archetype/reliability)
  for that query into the dispatch log and retrospective.
- Retrospective now includes a workers-touched audit table (every
  worker who reached a decision, with outcome column) and a
  discovered-patterns-evolution section across events.

Honest limitations this surfaced in the real run:
- mistral's executor prompt-adherence degrades on high-count events
  (5+ fills) and scenario-specific language (emergency/misplacement).
  3 of 5 events aborted via drift guard. Baseline + recurring sealed
  cleanly with real fills + SMS + emails + seeded playbooks.
- worker_id resolution returns "undefined" for some names when name
  matching is ambiguous in workers_500k (multiple workers with the
  same name in the same city).
This commit is contained in:
root 2026-04-20 15:09:14 -05:00
parent 95c26f04f8
commit 1274ab2cb3

View File

@ -80,6 +80,7 @@ interface EventResult {
sources_last_score?: number;
pool_size?: number; // sql_matches from the first hybrid_search
playbook_citations?: string[];
discovered_pattern?: string; // Path 2 meta-index snapshot per event
}
interface RosterEntry {
@ -203,7 +204,10 @@ async function executeToolCall(name: string, args: Record<string, any>): Promise
sql_filter, question, index_name,
top_k: k ?? 10, generate: false,
use_playbook_memory: true,
playbook_memory_k: 10,
// 2026-04-20 — bumped 10 → 100 to match server default change. At
// this memory size the semantic similarities cluster narrowly
// (0.55-0.67) and k=10 silently misses geo-matched playbooks.
playbook_memory_k: 100,
});
}
if (name === "sql") {
@ -406,18 +410,39 @@ function trimResult(r: any): any {
// =================== Per-event guidance strings ===================
// Builds the per-event natural-language guidance string injected into the
// executor prompt for a single fill event.
//
// NOTE(review): this span is a rendered diff whose "+"/"-" markers were
// stripped. The first switch below appears to be the PRE-change guidance
// (the removed lines); since it returns for every event.kind, the
// schemaLock/base code after it would be unreachable if this were real
// source. Confirm against the post-commit file, which should contain only
// the schemaLock version.
function guidanceFor(event: FillEvent, ctx: ScenarioContext): string {
// NOTE(review): pre-change narrative guidance — each branch returns a
// free-form prompt string. The commit message reports the executor read
// narrative words from these strings ("shift", "recurring", "expansion")
// as SQL column names, which motivated the schema lock below.
switch (event.kind) {
case "baseline_fill":
return `Standard Monday fill. Client ${ctx.spec.client}. Shift starts ${event.shift_start ?? "at start time"}. Take the top candidates by semantic match and availability.`;
case "recurring":
return `RECURRING slot — ${ctx.spec.client} runs this shift every Tues/Thurs. If playbook_memory surfaces candidates endorsed by past similar fills (you'll see 'cites' on hybrid sources), those are the preferred workers. Shift starts ${event.shift_start ?? "at start time"}.`;
case "expansion":
return `EXPANSION at ${ctx.spec.client}. New location, ${event.count}-worker team needed at once — search broadly and prefer workers with team/collaboration signals (engagement, communications scores). Shift starts ${event.shift_start ?? "at start time"}.`;
case "emergency":
return `EMERGENCY walkoff — ${ctx.spec.client} needs ${event.count} ${event.role}s BY ${event.deadline ?? "end of day"}. Prioritize availability over perfect skill match. A good-enough worker who can report today beats a perfect worker who can't.`;
case "misplacement":
return `MISPLACEMENT refill. A worker from the 08:00 shift no-showed. You must replace them WITHOUT proposing the same worker or anyone already booked today (see EXCLUDE list). Shift is ${event.shift_start ?? "in progress"} so speed matters.`;
}
// HARD SCHEMA GUARD: prior runs of this scenario had mistral invent
// column names from narrative guidance ("shift", "recurring",
// "expansion") and write SQL filters against them. Lock the schema
// explicitly so the executor has no excuse. Also pin `availability`
// and `reliability` as DOUBLE casts since their text-storage causes
// type_coercion errors otherwise.
const schemaLock = `
SCHEMA ENFORCEMENT (CRITICAL):
The ONLY columns in workers_500k usable in sql_filter are:
worker_id, name, role, email, phone, city, state, zip,
skills, certifications, archetype, reliability, responsiveness,
engagement, communications, compliance, availability, resume_text.
Narrative words like "shift", "recurring", "expansion", "emergency",
"morning", "priority" are NOT columns. DO NOT invent columns.
Numeric filters need CAST: CAST(availability AS DOUBLE) > 0.5 and
CAST(reliability AS DOUBLE) > 0.7.`;
// Post-change terse per-kind focus line, appended under the schema lock.
// The IIFE keeps the switch expression-shaped so `base` can be const.
const base = (() => {
switch (event.kind) {
case "baseline_fill":
return `Standard fill. Client ${ctx.spec.client}. Rank by semantic match; require CAST(availability AS DOUBLE) > 0.5.`;
case "recurring":
return `Recurring slot — prefer workers with past playbook citations (visible on hybrid sources). Require CAST(availability AS DOUBLE) > 0.5.`;
case "expansion":
return `New-location fill, ${event.count} workers at once. Require CAST(availability AS DOUBLE) > 0.5 AND CAST(reliability AS DOUBLE) > 0.75.`;
case "emergency":
return `Emergency replacement needed ASAP. Require CAST(availability AS DOUBLE) > 0.7. A good-enough available worker beats a perfect unavailable one.`;
case "misplacement":
return `Refill for a no-show. Do NOT propose anyone on the EXCLUDE list. Require CAST(availability AS DOUBLE) > 0.5.`;
}
})();
// Schema lock always leads; the per-event focus follows so the executor
// reads the hard constraints before the scenario-specific instruction.
return `${schemaLock}\n\nEVENT FOCUS:\n${base}`;
}
// =================== Artifact generation ===================
@ -570,6 +595,19 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise<EventRe
gap_signals.push(`artifact: ${(e as Error).message}`);
}
// Meta-index (Path 2) — fetch patterns the system discovered across
// similar past playbooks. MUST come before dispatch log write because
// dispatch carries this field. Fail-soft — patterns are observational.
let discovered_pattern: string | undefined;
try {
const patterns = await httpJson<any>(`${GATEWAY}/vectors/playbook_memory/patterns`, {
query: `${event.role} in ${event.city}, ${event.state}`,
top_k_playbooks: 25,
min_trait_frequency: 0.4,
});
discovered_pattern = patterns?.discovered_pattern;
} catch { /* patterns are observational */ }
// Dispatch log (structured).
await appendFile(join(ctx.out_dir, "dispatch.jsonl"),
JSON.stringify({
@ -581,16 +619,21 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise<EventRe
duration_secs: outcome.duration_secs,
pool_size: outcome.first_sql_matches,
playbook_citations: outcome.playbook_citations,
discovered_pattern,
}) + "\n");
// Always seed playbook_memory after a sealed fill — keep the learning
// loop tight across the whole day so recurring/misplacement events
// later in the run benefit from earlier ones.
//
// 2026-04-20 — canonical SHORT seed text. Verbose LLM rationales dilute
// the embedding and drop cosine similarity below the 0.05 threshold,
// silently killing the boost. Keep operation+approach+context terse.
try {
await httpJson(`${GATEWAY}/vectors/playbook_memory/seed`, {
operation: task.operation,
approach: outcome.approach || `${event.kind} → hybrid search`,
context: `client=${ctx.spec.client} scenario=${event.kind} shift=${event.shift_start ?? "tbd"}`,
approach: `${event.kind} fill via hybrid search`,
context: `${event.role} fill in ${event.city}, ${event.state}`,
endorsed_names: resolved.map(r => r.name),
append: true,
});
@ -598,6 +641,26 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise<EventRe
gap_signals.push(`write_through: ${(e as Error).message}`);
}
// After a misplacement event, record the lost worker's failure so
// future searches for this city+role dampen their boost. Without this
// Path 1 negative signal the no-shower keeps getting lifted.
if (event.kind === "misplacement" && (event.exclude_worker_ids?.length ?? 0) > 0) {
const lost = ctx.roster.find(r => r.status === "no_show");
if (lost) {
try {
await httpJson(`${GATEWAY}/vectors/playbook_memory/mark_failed`, {
operation: `fill: ${lost.role} x1 in ${lost.city}, ${lost.state}`,
failed_names: [lost.name],
reason: `no-show from ${lost.booked_for} shift`,
});
} catch (e) {
gap_signals.push(`mark_failed: ${(e as Error).message}`);
}
}
}
// (discovered_pattern was computed + written above, before dispatch.jsonl)
return {
event,
ok: true,
@ -609,6 +672,7 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise<EventRe
sources_last_score: outcome.first_pool_last_score,
pool_size: outcome.first_sql_matches,
playbook_citations: outcome.playbook_citations,
discovered_pattern,
};
}
@ -722,11 +786,56 @@ async function writeRetrospective(ctx: ScenarioContext): Promise<void> {
}
}
// --- Workers-touched audit (don't leave anyone out) ---
// Pull every worker that surfaced as a hit across all 5 events — booked
// or excluded or rejected — so we can show the full population the
// system considered, not just the ones that made the cut. J's ask:
// "iterations and decisions that are made don't leave anyone out."
const touched = new Map<string, { name: string; events: string[]; outcome: string }>();
for (const r of ctx.results) {
for (const f of r.fills) {
const key = f.candidate_id;
if (!touched.has(key)) touched.set(key, { name: f.name, events: [], outcome: "booked" });
touched.get(key)!.events.push(`${r.event.at} ${r.event.kind}`);
}
}
for (const r of ctx.roster) {
if (r.status === "no_show") {
const t = touched.get(r.worker_id);
if (t) t.outcome = "booked-then-no_show";
else touched.set(r.worker_id, { name: r.name, events: [r.booked_for], outcome: "no_show" });
}
}
lines.push("## Workers touched across the week");
lines.push("");
lines.push(`${touched.size} distinct workers made it through to a decision. Every one is accounted for below — `
+ `no-shows flagged, rebookings noted, everyone visible.`);
lines.push("");
lines.push("| Worker ID | Name | Events | Outcome |");
lines.push("|---|---|---|---|");
for (const [id, t] of touched) {
lines.push(`| ${id} | ${t.name} | ${t.events.join(" + ")} | ${t.outcome} |`);
}
lines.push("");
// --- Discovered patterns evolution across events ---
lines.push("## Discovered patterns (meta-index)");
lines.push("");
lines.push("What the system identified across semantically-similar past fills as each event ran:");
lines.push("");
for (const r of ctx.results) {
const dp = (r as any).discovered_pattern ?? "—";
lines.push(`- **${r.event.at} ${r.event.kind}** (${r.event.role}): ${dp}`);
}
lines.push("");
// --- Narrative summary ---
lines.push("## Narrative");
lines.push("");
lines.push(`- ${ctx.results.filter(r => r.ok).length}/${ctx.results.length} events reached consensus.`);
lines.push(`- Final roster: ${ctx.roster.length} bookings across ${new Set(ctx.roster.map(r => r.worker_id)).size} distinct workers.`);
lines.push(`- Workers touched (booked, failed, or otherwise decided): ${touched.size}.`);
const totalCites = ctx.results.reduce((a, r) => a + (r.playbook_citations?.length ?? 0), 0);
lines.push(`- Playbook citations across the day: ${totalCites} (proof the feedback loop fired across events).`);
const droppedEvents = ctx.results.filter(r => !r.ok);