From 330cb90f9915aba7562f9045cc5f8a5644f9507a Mon Sep 17 00:00:00 2001 From: root Date: Mon, 20 Apr 2026 20:31:34 -0500 Subject: [PATCH] Lift k cap, drop ornamental `reason` field, scenario generator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ITEM 1 — k CAP + REASON FIELD The hybrid_search default k was hard-coded to 10. For multi-fill events (5× expansion, 4× emergency) that's pool=10 → propose 5-of-10, half the candidates become the answer with no room for rejection. Executor prompt now instructs k to scale with target_count: k = max(count*5, 20), cap 80. Default helper bumped 10 → 20. Fill.reason dropped from required to optional. Nothing downstream ever consumed it — resolveWorkerIds, sealSale, retrospective all use candidate_id and name. Models loved to write 100-150 char justifications per fill; on 4+ fills that blew the JSON budget before the structure closed. Test 1 run result after this change: FIRST EVER 5/5 on the Riverfront Steel scenario, 13 total turns across 5 events. The event that failed last run (emergency 4×Loader with truncated reason-field continuation) now clears in 2 turns. Progression: mistral baseline: 0/5 qwen3.5 + continuation + think:false: 4/5 qwen3.5 + k=20 + no-reason: 5/5 ✓ ITEM 2 — SCENARIO GENERATOR (NOT YET TESTED E2E) tests/multi-agent/gen_scenarios.ts emits N deterministic ScenarioSpecs with varied clients (15 companies), cities (20 Midwest cities known to exist in workers_500k), role mixes (14 industrial staffing roles, weighted realistic), and event sequences. Each gets a unique sig_hash so the KB populates with distinct neighbor signatures. scripts/run_kb_batch.sh runs all generated specs sequentially against scenario.ts, logs per-scenario outcomes, and reports KB state at the end. Each run takes ~2-4min; 20-30 scenarios = 1-2hr unattended. 
Next: test the generator+batch on a small N (3-5) to verify KB populates correctly and pathway recommendations start getting neighbor signal instead of cold-starts. Then item 3 (Rust re-weighting of hybrid_search by playbook_memory success). --- scripts/run_kb_batch.sh | 43 +++++++ tests/multi-agent/agent.ts | 21 +++- tests/multi-agent/gen_scenarios.ts | 187 +++++++++++++++++++++++++++++ 3 files changed, 246 insertions(+), 5 deletions(-) create mode 100755 scripts/run_kb_batch.sh create mode 100644 tests/multi-agent/gen_scenarios.ts diff --git a/scripts/run_kb_batch.sh b/scripts/run_kb_batch.sh new file mode 100755 index 0000000..bfb4aa5 --- /dev/null +++ b/scripts/run_kb_batch.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash +# Run all generated scenarios sequentially to populate the KB. +# Reads tests/multi-agent/scenarios/manifest.json and feeds each file +# to scenario.ts. Each scenario indexes into data/_kb/ automatically +# via the end-of-run hook. Exit code: 0 if all scenarios completed +# (event failures are NOT failures for the batch — we want the KB to +# record both successes AND failures). + +set -e +cd "$(dirname "$0")/.." + +export OLLAMA_CLOUD_KEY="$(python3 -c "import json; print(json.load(open('/root/llm_team_config.json'))['providers']['ollama_cloud']['api_key'])" 2>/dev/null || echo '')" + +MANIFEST="tests/multi-agent/scenarios/manifest.json" +if [ ! 
-f "$MANIFEST" ]; then + echo "✗ no manifest at $MANIFEST — run: bun tests/multi-agent/gen_scenarios.ts " + exit 1 +fi + +START_TS=$(date -Iseconds) +LOG_DIR="/tmp/lakehouse_kb_batch_$(date +%s)" +mkdir -p "$LOG_DIR" +echo "▶ KB batch start: $START_TS, logs → $LOG_DIR" + +python3 -c " +import json +m = json.load(open('$MANIFEST')) +for s in m['scenarios']: + print(s['file']) +" | while read -r SCEN; do + SPEC="tests/multi-agent/scenarios/$SCEN" + BASE=$(basename "$SPEC" .json) + LOG="$LOG_DIR/${BASE}.log" + echo " ▶ $SCEN" + bun tests/multi-agent/scenario.ts "$SPEC" > "$LOG" 2>&1 || true + OK=$(grep -oP '\d+/\d+ events succeeded' "$LOG" | tail -1 || echo "no-result") + SIG=$(grep -oP 'KB indexed: sig=\K[a-f0-9]+' "$LOG" | tail -1 || echo "-") + echo " → $OK; sig=$SIG" +done + +echo "▶ KB batch done: $(date -Iseconds)" +echo "▶ KB state:" +wc -l data/_kb/*.jsonl 2>/dev/null || true diff --git a/tests/multi-agent/agent.ts b/tests/multi-agent/agent.ts index d55dcc9..6ee7449 100644 --- a/tests/multi-agent/agent.ts +++ b/tests/multi-agent/agent.ts @@ -318,7 +318,11 @@ export type Action = export interface Fill { candidate_id: string; name: string; - reason: string; + reason?: string; // optional — the schema used to require it; nothing + // downstream consumed it, and qwen3.5 would emit + // 100-150 char justifications per fill that blew + // the JSON budget on 4+ fills. Kept optional so + // models that still emit it don't break parsing. } // --- HTTP helpers (fail-fast) --- @@ -348,7 +352,10 @@ export async function callTool(tool: string, args: Record): Promise }); } -export async function hybridSearch(sql_filter: string, question: string, k = 10): Promise { +// Default k=20 is a floor, not a ceiling — executor prompt instructs +// models to scale k to 5× target_count (cap 80) so multi-fill events +// get a meaningfully deep pool to rank within. 
+export async function hybridSearch(sql_filter: string, question: string, k = 20): Promise { return http("POST", `${GATEWAY}/vectors/hybrid`, { sql_filter, question, k }); } @@ -448,7 +455,11 @@ Available tools (each takes a JSON "args" object): {"index_name":"workers_500k_v1", "sql_filter":"role = 'Forklift Operator' AND city = 'Toledo' AND state = 'OH' AND CAST(availability AS DOUBLE) > 0.5", "question":"reliable forklift operator Toledo", - "k":10} + "k":40} + → k should scale with target_count: roughly 5× the number of fills + needed, floor 20, cap 80. For 1-2 fills use k=20. For 5 fills use + k=40. A deep pool lets the ranker discriminate across a larger + candidate set; k=10 was too tight for multi-fill events. - sql(query: string) → Raw read-only SELECT. Use for verification (confirm a worker exists, @@ -580,8 +591,8 @@ Your next action MUST be a JSON object matching one of these shapes: — use on turn 1 to outline your approach. Steps must be concrete. {"kind":"tool_call","tool":"...","args":{...},"rationale":"why"} — call a tool and see its result next turn. -{"kind":"propose_done","fills":[{"candidate_id":"...","name":"First Last","reason":"why them"}],"rationale":"..."} - — propose you've met the target. fills MUST have EXACTLY ${task.target_count} entries — count twice before emitting. +{"kind":"propose_done","fills":[{"candidate_id":"...","name":"First Last"}],"rationale":"..."} + — propose you've met the target. fills MUST have EXACTLY ${task.target_count} entries — count twice before emitting. Each fill is ONLY {candidate_id, name} — no reason field, no scores, no commentary. Strategy tip: once "CANDIDATES SURFACED SO FAR" has ≥ ${task.target_count} entries in ${task.target_city}, ${task.target_state} matching ${task.target_role}, verify ONE via the sql tool (to satisfy the reviewer's SQL-verification criterion) and then propose_done with the top ${task.target_count}. Don't keep re-searching. 
// Original patch header for this file (kept for provenance):
//   diff --git a/tests/multi-agent/gen_scenarios.ts b/tests/multi-agent/gen_scenarios.ts
//   new file mode 100644
//   index 0000000..032c96e
//   --- /dev/null
//   +++ b/tests/multi-agent/gen_scenarios.ts
//   @@ -0,0 +1,187 @@
//
// Scenario generator for Phase 22 KB corpus-building.
//
// Emits N unique ScenarioSpec JSON files under
// tests/multi-agent/scenarios/ covering:
//   - different clients (so sig varies even when events match)
//   - different city/state combos actually present in workers_500k
//   - varied event sequences (baseline/recurring/expansion/emergency/misplacement)
//   - varied role mixes from the industrial staffing taxonomy
//
// Each scenario spec is written as scen_NNN_CLIENT_CITY.json and can be
// fed to scenario.ts as argv[2]. scripts/run_kb_batch.sh runs them all
// sequentially so the KB populates overnight.
//
// Usage: bun tests/multi-agent/gen_scenarios.ts [count=20] [seed=42]
//
// Determinism: every random choice is drawn from a PRNG seeded by argv[3]
// (default 42), so repeat invocations with the same count and seed produce
// identical clients, cities and events. The only run-dependent field is
// `date`, which is offset from the day the generator is executed.

import { mkdir, writeFile } from "node:fs/promises";
import { basename, join } from "node:path";

// Deterministic PRNG — mulberry32, same as many test harnesses. Stable
// across bun versions; not cryptographic. Returns a function yielding
// floats in [0, 1).
function mulberry32(seed: number): () => number {
  let state = seed >>> 0; // coerce to uint32 so any numeric seed is usable
  return () => {
    state = (state + 0x6D2B79F5) >>> 0;
    let t = state;
    t = Math.imul(t ^ (t >>> 15), t | 1);
    t ^= t + Math.imul(t ^ (t >>> 7), t | 61);
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}

// Cities known to exist in workers_500k, chosen to avoid false-empty
// searches. All Midwest because that's the target persona's geography.
const CITIES: Array<{ city: string; state: string }> = [
  { city: "Toledo", state: "OH" },
  { city: "Cleveland", state: "OH" },
  { city: "Columbus", state: "OH" },
  { city: "Cincinnati", state: "OH" },
  { city: "Akron", state: "OH" },
  { city: "Detroit", state: "MI" },
  { city: "Grand Rapids", state: "MI" },
  { city: "Flint", state: "MI" },
  { city: "Indianapolis", state: "IN" },
  { city: "Fort Wayne", state: "IN" },
  { city: "Gary", state: "IN" },
  { city: "Chicago", state: "IL" },
  { city: "Joliet", state: "IL" },
  { city: "Rockford", state: "IL" },
  { city: "Milwaukee", state: "WI" },
  { city: "Madison", state: "WI" },
  { city: "Louisville", state: "KY" },
  { city: "Lexington", state: "KY" },
  { city: "Kansas City", state: "MO" },
  { city: "St. Louis", state: "MO" },
];

// Industrial staffing role taxonomy. Weighted so common roles appear
// more often (realistic distribution).
const ROLES: Array<{ role: string; weight: number }> = [
  { role: "Warehouse Associate", weight: 5 },
  { role: "Machine Operator", weight: 4 },
  { role: "Forklift Operator", weight: 4 },
  { role: "Loader", weight: 3 },
  { role: "Material Handler", weight: 3 },
  { role: "Assembler", weight: 3 },
  { role: "Quality Tech", weight: 2 },
  { role: "Picker", weight: 3 },
  { role: "Packer", weight: 3 },
  { role: "Shipping Clerk", weight: 2 },
  { role: "Receiving Clerk", weight: 2 },
  { role: "Welder", weight: 2 },
  { role: "CNC Operator", weight: 2 },
  { role: "Maintenance Tech", weight: 1 },
];

const CLIENTS = [
  "Riverfront Steel", "Northland Logistics", "Great Lakes Mfg",
  "Midway Distribution", "Pioneer Assembly", "Cornerstone Fabrication",
  "Horizon Supply", "Keystone Plastics", "Apex Warehouse",
  "Heritage Foods", "Summit Industrial", "Vanguard Components",
  "Centennial Packaging", "Parallel Machining", "Beacon Freight",
];

// Picks one item with probability proportional to its `weight`.
// Consumes exactly one draw from `rng`. Throws if `items` is empty.
function pickWeighted<T extends { weight: number }>(rng: () => number, items: T[]): T {
  const total = items.reduce((sum, item) => sum + item.weight, 0);
  let remaining = rng() * total;
  let last: T | undefined;
  for (const item of items) {
    remaining -= item.weight;
    if (remaining <= 0) return item;
    last = item;
  }
  // Only reachable through floating-point drift (or an empty list).
  if (last === undefined) throw new Error("pickWeighted: items must not be empty");
  return last;
}

// Picks one item uniformly. Consumes exactly one draw from `rng`.
// Throws if `items` is empty.
function pick<T>(rng: () => number, items: T[]): T {
  const chosen = items[Math.floor(rng() * items.length)];
  if (chosen === undefined) throw new Error("pick: items must not be empty");
  return chosen;
}

// Event shape templates. Each scenario contains 1-5 events: a baseline
// fill always, plus recurring (70%), expansion (50%), emergency (40%)
// and misplacement (40%), each decided independently.
// Multi-fill counts skew low to make the harness quicker; 5+ fill
// events are the hardest and should be rarer in a corpus run.
type EventKind = "baseline_fill" | "recurring" | "expansion" | "emergency" | "misplacement";

// One generated event as serialized into the scenario JSON.
interface GeneratedEvent {
  kind: EventKind;
  at: string;           // 24-hour "HH:MM"
  role: string;
  count: number;
  city: string;
  state: string;
  shift_start: string;  // 12-hour "HH:MM AM|PM" rendering of `at`
  replaces_event?: string;
}

// One generated scenario; `events` always starts with the baseline fill.
interface GeneratedSpec {
  client: string;
  date: string;         // "YYYY-MM-DD" (UTC)
  events: [GeneratedEvent, ...GeneratedEvent[]];
}

// Zero-pads a small non-negative integer to two digits.
function pad2(value: number): string {
  return String(value).padStart(2, "0");
}

// Number of workers requested for an event of the given kind.
// Misplacement consumes no draw; every other kind consumes one.
function fillCount(rng: () => number, kind: EventKind): number {
  switch (kind) {
    case "misplacement": return 1;
    case "expansion": return 2 + Math.floor(rng() * 4);     // 2-5
    case "baseline_fill": return 1 + Math.floor(rng() * 3); // 1-3
    case "recurring": return 1 + Math.floor(rng() * 2);     // 1-2
    case "emergency": return 2 + Math.floor(rng() * 3);     // 2-4
    default: {
      const unreachable: never = kind;
      throw new Error(`unknown event kind: ${String(unreachable)}`);
    }
  }
}

// Builds one event of `kind` for the given city/state.
//
// Draw order from `rng` is role, count, hour, minute. The minute used to
// come from Math.random(), which made output differ between runs with the
// same seed; it is now drawn from `rng` like everything else.
//
// NOTE(review): `at` (the nominal schedule slot passed by genSpec) is not
// used — the emitted `at` is a random half-hour slot between 08:00 and
// 17:30. Confirm whether the slot was meant to be honoured.
function makeEvent(
  rng: () => number,
  kind: EventKind,
  at: string,
  city: string,
  state: string,
): GeneratedEvent {
  const { role } = pickWeighted(rng, ROLES);
  const count = fillCount(rng, kind);
  const hour = 8 + Math.floor(rng() * 10); // 8..17
  const minute = rng() > 0.5 ? 0 : 30;
  const atReal = `${pad2(hour)}:${pad2(minute)}`;
  // 12-hour rendering: previously " AM" was appended unconditionally,
  // producing values such as "14:30 AM" for afternoon shifts.
  const meridiem = hour < 12 ? "AM" : "PM";
  const hour12 = hour % 12 === 0 ? 12 : hour % 12;
  return {
    kind,
    at: atReal,
    role,
    count,
    city,
    state,
    shift_start: `${pad2(hour12)}:${pad2(minute)} ${meridiem}`,
  };
}

// Builds scenario number `id`: one client, one city, and 1-5 events.
// `date` is `baseDate` + `id` days, formatted as the UTC calendar day.
// `baseDate` defaults to the current time; pass a fixed value for fully
// reproducible output.
function genSpec(rng: () => number, id: number, baseDate: Date = new Date()): GeneratedSpec {
  const client = pick(rng, CLIENTS);
  const { city, state } = pick(rng, CITIES);
  const date = new Date(baseDate.getTime() + id * 86400000)
    .toISOString()
    .slice(0, 10);

  // Scenario shape mix — emergency and misplacement are each included
  // independently with probability 0.4, so ~36% of scenarios are pure
  // fill (baseline+recurring+expansion) and ~64% contain at least one.
  const includeEmergency = rng() > 0.6;
  const includeMisplacement = rng() > 0.6;

  // always at least one baseline
  const baseline = makeEvent(rng, "baseline_fill", "08:00", city, state);
  const events: [GeneratedEvent, ...GeneratedEvent[]] = [baseline];
  if (rng() > 0.3) events.push(makeEvent(rng, "recurring", "10:30", city, state));
  if (rng() > 0.5) events.push(makeEvent(rng, "expansion", "12:15", city, state));
  if (includeEmergency) events.push(makeEvent(rng, "emergency", "14:00", city, state));
  if (includeMisplacement) {
    // A misplacement always refers back to the baseline event's time.
    events.push({
      ...makeEvent(rng, "misplacement", "15:45", city, state),
      replaces_event: baseline.at,
    });
  }
  return { client, date, events };
}

// CLI entry point: writes `count` scenario files plus manifest.json into
// tests/multi-agent/scenarios/ and prints a short summary.
// Throws on a count that is not a non-negative integer or a seed that is
// not a finite number (previously NaN silently produced 0 files / seed 0).
async function main(): Promise<void> {
  const n = Number(process.argv[2] ?? 20);
  const seed = Number(process.argv[3] ?? 42);
  if (!Number.isInteger(n) || n < 0) {
    throw new Error(`count must be a non-negative integer, got "${process.argv[2]}"`);
  }
  if (!Number.isFinite(seed)) {
    throw new Error(`seed must be a number, got "${process.argv[3]}"`);
  }

  const rng = mulberry32(seed);
  const baseDate = new Date(); // one anchor for the whole run
  const outDir = "tests/multi-agent/scenarios";
  await mkdir(outDir, { recursive: true });

  const manifest: Array<{ file: string; client: string; city: string; events: number }> = [];
  for (let i = 0; i < n; i++) {
    const spec = genSpec(rng, i, baseDate);
    const city = spec.events[0].city;
    const cityLabel = city.replace(/\s+/g, "_");
    const clientLabel = spec.client.replace(/\s+/g, "_");
    const fname = `scen_${String(i).padStart(3, "0")}_${clientLabel}_${cityLabel}.json`;
    await writeFile(join(outDir, fname), JSON.stringify(spec, null, 2));
    manifest.push({
      file: fname,
      client: spec.client,
      city,
      events: spec.events.length,
    });
  }
  await writeFile(
    join(outDir, "manifest.json"),
    JSON.stringify({ count: n, seed, scenarios: manifest }, null, 2),
  );
  console.log(`✓ generated ${n} scenarios → ${outDir}/ (seed=${seed})`);
  for (const m of manifest.slice(0, 5)) {
    console.log(` ${m.file} — ${m.client} (${m.city}), ${m.events} events`);
  }
  if (manifest.length > 5) console.log(` ... +${manifest.length - 5} more`);
}

// Run only when this file is the script being executed (e.g.
// `bun tests/multi-agent/gen_scenarios.ts 20 42`), so importing the
// helpers from a test does not write scenario files as a side effect.
if (basename(process.argv[1] ?? "").startsWith("gen_scenarios")) {
  main().catch((e: unknown) => {
    console.error("gen_scenarios failed:", e instanceof Error ? e.message : e);
    process.exit(1);
  });
}