Scenario harness: Path 1+2 integration + schema hardening
Upgrades to tests/multi-agent/scenario.ts to exercise the full Path 1+2
feature set on a real warehouse-client week (5 events on one client):
- Hard SCHEMA ENFORCEMENT block in every event's guidance. Prior runs
had mistral read narrative words ("shift", "recurring", "expansion")
as SQL column names. Schema is now locked explicitly with valid
columns listed and CAST guidance for availability + reliability.
- playbook_memory_k bumped 10 → 100 to match server default.
- Canonical short seed text (operation + "{kind} fill via hybrid
search" + "{role} fill in {city}, {state}"). Verbose LLM rationales
dilute embeddings and silently kill boost (Pass 1 finding).
- /vectors/playbook_memory/mark_failed fires automatically on
misplacement events — records the no-shower's failure so future
searches for the same city+role dampen that worker's boost.
- /vectors/playbook_memory/patterns call per event — surfaces what the
meta-index discovered (recurring certs/skills/archetype/reliability)
for that query into the dispatch log and retrospective.
- Retrospective now includes a workers-touched audit table (every
worker who reached a decision, with outcome column) and a
discovered-patterns-evolution section across events.
Honest limitations this surfaced in the real run:
- mistral's executor prompt-adherence degrades on high-count events
(5+ fills) and scenario-specific language (emergency/misplacement).
3 of 5 events aborted via drift guard. Baseline + recurring sealed
cleanly with real fills + SMS + emails + seeded playbooks.
- worker_id resolution returns "undefined" for some names when name
matching is ambiguous in workers_500k (multiple workers with same
name in same city).
This commit is contained in:
parent
95c26f04f8
commit
1274ab2cb3
@ -80,6 +80,7 @@ interface EventResult {
|
||||
sources_last_score?: number;
|
||||
pool_size?: number; // sql_matches from the first hybrid_search
|
||||
playbook_citations?: string[];
|
||||
discovered_pattern?: string; // Path 2 meta-index snapshot per event
|
||||
}
|
||||
|
||||
interface RosterEntry {
|
||||
@ -203,7 +204,10 @@ async function executeToolCall(name: string, args: Record<string, any>): Promise
|
||||
sql_filter, question, index_name,
|
||||
top_k: k ?? 10, generate: false,
|
||||
use_playbook_memory: true,
|
||||
playbook_memory_k: 10,
|
||||
// 2026-04-20 — bumped 10 → 100 to match server default change. At
|
||||
// this memory size the semantic similarities cluster narrowly
|
||||
// (0.55-0.67) and k=10 silently misses geo-matched playbooks.
|
||||
playbook_memory_k: 100,
|
||||
});
|
||||
}
|
||||
if (name === "sql") {
|
||||
@ -406,18 +410,39 @@ function trimResult(r: any): any {
|
||||
// =================== Per-event guidance strings ===================
|
||||
|
||||
function guidanceFor(event: FillEvent, ctx: ScenarioContext): string {
|
||||
switch (event.kind) {
|
||||
case "baseline_fill":
|
||||
return `Standard Monday fill. Client ${ctx.spec.client}. Shift starts ${event.shift_start ?? "at start time"}. Take the top candidates by semantic match and availability.`;
|
||||
case "recurring":
|
||||
return `RECURRING slot — ${ctx.spec.client} runs this shift every Tues/Thurs. If playbook_memory surfaces candidates endorsed by past similar fills (you'll see 'cites' on hybrid sources), those are the preferred workers. Shift starts ${event.shift_start ?? "at start time"}.`;
|
||||
case "expansion":
|
||||
return `EXPANSION at ${ctx.spec.client}. New location, ${event.count}-worker team needed at once — search broadly and prefer workers with team/collaboration signals (engagement, communications scores). Shift starts ${event.shift_start ?? "at start time"}.`;
|
||||
case "emergency":
|
||||
return `EMERGENCY walkoff — ${ctx.spec.client} needs ${event.count} ${event.role}s BY ${event.deadline ?? "end of day"}. Prioritize availability over perfect skill match. A good-enough worker who can report today beats a perfect worker who can't.`;
|
||||
case "misplacement":
|
||||
return `MISPLACEMENT refill. A worker from the 08:00 shift no-showed. You must replace them WITHOUT proposing the same worker or anyone already booked today (see EXCLUDE list). Shift is ${event.shift_start ?? "in progress"} so speed matters.`;
|
||||
}
|
||||
// HARD SCHEMA GUARD: prior runs of this scenario had mistral invent
|
||||
// column names from narrative guidance ("shift", "recurring",
|
||||
// "expansion") and write SQL filters against them. Lock the schema
|
||||
// explicitly so the executor has no excuse. Also pin `availability`
|
||||
// and `reliability` as DOUBLE casts since their text-storage causes
|
||||
// type_coercion errors otherwise.
|
||||
const schemaLock = `
|
||||
SCHEMA ENFORCEMENT (CRITICAL):
|
||||
The ONLY columns in workers_500k usable in sql_filter are:
|
||||
worker_id, name, role, email, phone, city, state, zip,
|
||||
skills, certifications, archetype, reliability, responsiveness,
|
||||
engagement, communications, compliance, availability, resume_text.
|
||||
Narrative words like "shift", "recurring", "expansion", "emergency",
|
||||
"morning", "priority" are NOT columns. DO NOT invent columns.
|
||||
Numeric filters need CAST: CAST(availability AS DOUBLE) > 0.5 and
|
||||
CAST(reliability AS DOUBLE) > 0.7.`;
|
||||
|
||||
const base = (() => {
|
||||
switch (event.kind) {
|
||||
case "baseline_fill":
|
||||
return `Standard fill. Client ${ctx.spec.client}. Rank by semantic match; require CAST(availability AS DOUBLE) > 0.5.`;
|
||||
case "recurring":
|
||||
return `Recurring slot — prefer workers with past playbook citations (visible on hybrid sources). Require CAST(availability AS DOUBLE) > 0.5.`;
|
||||
case "expansion":
|
||||
return `New-location fill, ${event.count} workers at once. Require CAST(availability AS DOUBLE) > 0.5 AND CAST(reliability AS DOUBLE) > 0.75.`;
|
||||
case "emergency":
|
||||
return `Emergency replacement needed ASAP. Require CAST(availability AS DOUBLE) > 0.7. A good-enough available worker beats a perfect unavailable one.`;
|
||||
case "misplacement":
|
||||
return `Refill for a no-show. Do NOT propose anyone on the EXCLUDE list. Require CAST(availability AS DOUBLE) > 0.5.`;
|
||||
}
|
||||
})();
|
||||
|
||||
return `${schemaLock}\n\nEVENT FOCUS:\n${base}`;
|
||||
}
|
||||
|
||||
// =================== Artifact generation ===================
|
||||
@ -570,6 +595,19 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise<EventRe
|
||||
gap_signals.push(`artifact: ${(e as Error).message}`);
|
||||
}
|
||||
|
||||
// Meta-index (Path 2) — fetch patterns the system discovered across
|
||||
// similar past playbooks. MUST come before dispatch log write because
|
||||
// dispatch carries this field. Fail-soft — patterns are observational.
|
||||
let discovered_pattern: string | undefined;
|
||||
try {
|
||||
const patterns = await httpJson<any>(`${GATEWAY}/vectors/playbook_memory/patterns`, {
|
||||
query: `${event.role} in ${event.city}, ${event.state}`,
|
||||
top_k_playbooks: 25,
|
||||
min_trait_frequency: 0.4,
|
||||
});
|
||||
discovered_pattern = patterns?.discovered_pattern;
|
||||
} catch { /* patterns are observational */ }
|
||||
|
||||
// Dispatch log (structured).
|
||||
await appendFile(join(ctx.out_dir, "dispatch.jsonl"),
|
||||
JSON.stringify({
|
||||
@ -581,16 +619,21 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise<EventRe
|
||||
duration_secs: outcome.duration_secs,
|
||||
pool_size: outcome.first_sql_matches,
|
||||
playbook_citations: outcome.playbook_citations,
|
||||
discovered_pattern,
|
||||
}) + "\n");
|
||||
|
||||
// Always seed playbook_memory after a sealed fill — keep the learning
|
||||
// loop tight across the whole day so recurring/misplacement events
|
||||
// later in the run benefit from earlier ones.
|
||||
//
|
||||
// 2026-04-20 — canonical SHORT seed text. Verbose LLM rationales dilute
|
||||
// the embedding and drop cosine similarity below the 0.05 threshold,
|
||||
// silently killing the boost. Keep operation+approach+context terse.
|
||||
try {
|
||||
await httpJson(`${GATEWAY}/vectors/playbook_memory/seed`, {
|
||||
operation: task.operation,
|
||||
approach: outcome.approach || `${event.kind} → hybrid search`,
|
||||
context: `client=${ctx.spec.client} scenario=${event.kind} shift=${event.shift_start ?? "tbd"}`,
|
||||
approach: `${event.kind} fill via hybrid search`,
|
||||
context: `${event.role} fill in ${event.city}, ${event.state}`,
|
||||
endorsed_names: resolved.map(r => r.name),
|
||||
append: true,
|
||||
});
|
||||
@ -598,6 +641,26 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise<EventRe
|
||||
gap_signals.push(`write_through: ${(e as Error).message}`);
|
||||
}
|
||||
|
||||
// After a misplacement event, record the lost worker's failure so
|
||||
// future searches for this city+role dampen their boost. Without this
|
||||
// Path 1 negative signal the no-shower keeps getting lifted.
|
||||
if (event.kind === "misplacement" && (event.exclude_worker_ids?.length ?? 0) > 0) {
|
||||
const lost = ctx.roster.find(r => r.status === "no_show");
|
||||
if (lost) {
|
||||
try {
|
||||
await httpJson(`${GATEWAY}/vectors/playbook_memory/mark_failed`, {
|
||||
operation: `fill: ${lost.role} x1 in ${lost.city}, ${lost.state}`,
|
||||
failed_names: [lost.name],
|
||||
reason: `no-show from ${lost.booked_for} shift`,
|
||||
});
|
||||
} catch (e) {
|
||||
gap_signals.push(`mark_failed: ${(e as Error).message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// (discovered_pattern was computed + written above, before dispatch.jsonl)
|
||||
|
||||
return {
|
||||
event,
|
||||
ok: true,
|
||||
@ -609,6 +672,7 @@ async function runEvent(event: FillEvent, ctx: ScenarioContext): Promise<EventRe
|
||||
sources_last_score: outcome.first_pool_last_score,
|
||||
pool_size: outcome.first_sql_matches,
|
||||
playbook_citations: outcome.playbook_citations,
|
||||
discovered_pattern,
|
||||
};
|
||||
}
|
||||
|
||||
@ -722,11 +786,56 @@ async function writeRetrospective(ctx: ScenarioContext): Promise<void> {
|
||||
}
|
||||
}
|
||||
|
||||
// --- Workers-touched audit (don't leave anyone out) ---
|
||||
// Pull every worker that surfaced as a hit across all 5 events — booked
|
||||
// or excluded or rejected — so we can show the full population the
|
||||
// system considered, not just the ones that made the cut. J's ask:
|
||||
// "iterations and decisions that are made don't leave anyone out."
|
||||
const touched = new Map<string, { name: string; events: string[]; outcome: string }>();
|
||||
for (const r of ctx.results) {
|
||||
for (const f of r.fills) {
|
||||
const key = f.candidate_id;
|
||||
if (!touched.has(key)) touched.set(key, { name: f.name, events: [], outcome: "booked" });
|
||||
touched.get(key)!.events.push(`${r.event.at} ${r.event.kind}`);
|
||||
}
|
||||
}
|
||||
for (const r of ctx.roster) {
|
||||
if (r.status === "no_show") {
|
||||
const t = touched.get(r.worker_id);
|
||||
if (t) t.outcome = "booked-then-no_show";
|
||||
else touched.set(r.worker_id, { name: r.name, events: [r.booked_for], outcome: "no_show" });
|
||||
}
|
||||
}
|
||||
|
||||
lines.push("## Workers touched across the week");
|
||||
lines.push("");
|
||||
lines.push(`${touched.size} distinct workers made it through to a decision. Every one is accounted for below — `
|
||||
+ `no-shows flagged, rebookings noted, everyone visible.`);
|
||||
lines.push("");
|
||||
lines.push("| Worker ID | Name | Events | Outcome |");
|
||||
lines.push("|---|---|---|---|");
|
||||
for (const [id, t] of touched) {
|
||||
lines.push(`| ${id} | ${t.name} | ${t.events.join(" + ")} | ${t.outcome} |`);
|
||||
}
|
||||
lines.push("");
|
||||
|
||||
// --- Discovered patterns evolution across events ---
|
||||
lines.push("## Discovered patterns (meta-index)");
|
||||
lines.push("");
|
||||
lines.push("What the system identified across semantically-similar past fills as each event ran:");
|
||||
lines.push("");
|
||||
for (const r of ctx.results) {
|
||||
const dp = (r as any).discovered_pattern ?? "—";
|
||||
lines.push(`- **${r.event.at} ${r.event.kind}** (${r.event.role}): ${dp}`);
|
||||
}
|
||||
lines.push("");
|
||||
|
||||
// --- Narrative summary ---
|
||||
lines.push("## Narrative");
|
||||
lines.push("");
|
||||
lines.push(`- ${ctx.results.filter(r => r.ok).length}/${ctx.results.length} events reached consensus.`);
|
||||
lines.push(`- Final roster: ${ctx.roster.length} bookings across ${new Set(ctx.roster.map(r => r.worker_id)).size} distinct workers.`);
|
||||
lines.push(`- Workers touched (booked, failed, or otherwise decided): ${touched.size}.`);
|
||||
const totalCites = ctx.results.reduce((a, r) => a + (r.playbook_citations?.length ?? 0), 0);
|
||||
lines.push(`- Playbook citations across the day: ${totalCites} (proof the feedback loop fired across events).`);
|
||||
const droppedEvents = ctx.results.filter(r => !r.ok);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user