// Two-agent orchestrator. The executor and reviewer take alternating turns
// inside a single async loop, coordinated through a shared in-memory log:
// one turn of executor, then one turn of reviewer, repeated until
// consensus_done, drift-cycle blown, or hard turn cap. On success writes a
// playbook JSON; on failure exits non-zero with the full log for inspection.
//
// Fail-fast: action-parse failures are appended to the log AND rethrown, so
// the orchestrator top-level catches, dumps, and exits with code 1. Tool
// execution errors are the one soft case: they are logged as a tool_result
// and only abort once the consecutive-drift cap is hit. The test harness
// reads the exit code to decide if the substrate is healthy.

import {
  type LogEntry,
  type TaskSpec,
  type Action,
  type Fill,
  callTool,
  hybridSearch,
  sqlQuery,
  generate,
  parseAction,
  executorPrompt,
  reviewerPrompt,
  GATEWAY,
} from "./agent.ts";
import { readFileSync } from "node:fs";
import { mkdir, writeFile } from "node:fs/promises";
import { join } from "node:path";

const EXECUTOR_MODEL = "qwen3.5:latest";
const REVIEWER_MODEL = "qwen3:latest";

const MAX_TURNS = 12; // executor turns; reviewer gets one per
const MAX_CONSECUTIVE_DRIFTS = 3; // drift-cycle blown → give up

// Directory that receives both successful playbooks and FAILED log dumps.
const PLAYBOOK_DIR = "./tests/multi-agent/playbooks";

// Number of attempts for the playbook_memory seed request.
const MAX_SEED_ATTEMPTS = 3;

// Default task. Override via argv[2] if you want something else; see
// `parseTaskFromArg`. Picked from the real-world staffing pattern but
// not in the existing successful_playbooks list — this is a fresh fill.
// Default task lifted from the production pattern in successful_playbooks.
// Toledo, OH has 342 welders in workers_500k so supply is ample — the test
// is about collaboration and drift correction, not needle-in-haystack.
const DEFAULT_TASK: TaskSpec = {
  id: `task-${Date.now()}`,
  operation: "fill: Welder x2 in Toledo, OH",
  target_role: "Welder",
  target_count: 2,
  target_city: "Toledo",
  target_state: "OH",
  approach_hint:
    "hybrid search against workers_500k_v1, narrow by role+city+state+availability, rank semantically",
};

/** Extracts a printable message from anything that can be thrown. */
function errorMessage(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/**
 * Builds the task from argv[2].
 *
 * - no argument      → DEFAULT_TASK
 * - path ending .json → file contents, parsed as a TaskSpec
 * - otherwise         → "role:Welder count:2 city:Columbus state:OH" style
 *   key:value tokens (optional `hint:`), for ad-hoc tasks without standing
 *   up a JSON file. Anything more complex, feed it a JSON path.
 *
 * Tokens are split on whitespace, so values cannot contain spaces.
 *
 * @throws Error when the key:value form is missing role/city/state or
 *         `count` is not a positive integer.
 */
function parseTaskFromArg(): TaskSpec {
  const arg = process.argv[2];
  if (!arg) return DEFAULT_TASK;

  if (arg.endsWith(".json")) {
    // NOTE(review): the parsed JSON is trusted to match TaskSpec; nothing
    // here validates its shape.
    return JSON.parse(readFileSync(arg, "utf-8")) as TaskSpec;
  }

  const kv: Record<string, string> = {};
  for (const token of arg.split(/\s+/)) {
    if (!token) continue; // leading/trailing whitespace yields empty tokens
    const [k = "", ...v] = token.split(":");
    kv[k] = v.join(":"); // re-join so values may themselves contain ":"
  }

  const { role, count, city, state, hint } = kv;
  const targetCount = Number(count);
  // A NaN/zero count can never satisfy the consensus length check, so the
  // run would burn every turn before failing. Reject it up front instead.
  if (!role || !city || !state || !Number.isInteger(targetCount) || targetCount < 1) {
    throw new Error(
      `task arg needs role:<role> count:<n> city:<city> state:<state>, got ${JSON.stringify(arg)}`,
    );
  }

  return {
    id: `task-${Date.now()}`,
    operation: `fill: ${role} x${count} in ${city}, ${state}`,
    target_role: role,
    target_count: targetCount,
    target_city: city,
    target_state: state,
    approach_hint: hint ?? "hybrid search",
  };
}

// Helper: pretty one-line print for each log entry so the human watching
// stdout can follow without pulling the JSONL file.
// Defensive one-line formatter. Models sometimes omit optional fields
// (rationale, notes), so every access is guarded.
function fmt(e: LogEntry): string {
  const tag = `[t${e.turn.toString().padStart(2, "0")} ${e.role.padEnd(8)} ${e.kind.padEnd(14)}]`;
  const c = e.content ?? {};
  const trim = (s: any, n: number) => String(s ?? "").slice(0, n);

  if (e.kind === "tool_call") {
    return `${tag} ${c.tool}(${JSON.stringify(c.args ?? {}).slice(0, 80)}) — ${trim(c.rationale, 60)}`;
  }
  if (e.kind === "tool_result") {
    // Prefer a row count; otherwise fall back to a truncated JSON preview.
    const rows = c?.rows?.length ?? c?.sources?.length;
    return `${tag} ${rows !== undefined ? `rows=${rows}` : JSON.stringify(c).slice(0, 80)}`;
  }
  if (e.kind === "critique") {
    return `${tag} verdict=${c.verdict} — ${trim(c.notes, 80)}`;
  }
  if (e.kind === "propose_done") {
    return `${tag} ${c.fills?.length ?? 0} fills: ${(c.fills ?? []).map((f: Fill) => f.name).join(", ")}`;
  }
  if (e.kind === "consensus_done") {
    return `${tag} ✓`;
  }
  if (e.kind === "plan") {
    return `${tag} ${c.steps?.length ?? 0} steps: ${(c.steps ?? []).slice(0, 2).join(" / ")}`;
  }
  if (e.kind === "error") {
    // Stringify the whole content when there is no message, rather than
    // letting an object interpolate as "[object Object]".
    return `${tag} ${c.message ?? JSON.stringify(c)}`;
  }
  return `${tag} ${JSON.stringify(c).slice(0, 100)}`;
}

// Execute one tool call. The tool catalog in the prompt lists both the
// registered Phase 12 tools AND a pseudo-tool "hybrid_search" for the
// /vectors/hybrid endpoint — unify here so the executor doesn't need to
// know which surface a capability lives on.
//
// Throws when required arguments are missing or when the `sql` tool is
// given anything other than a SELECT; the caller turns those into
// tool_result errors for the executor to read.
async function executeToolCall(name: string, args: Record<string, any>): Promise<any> {
  if (name === "hybrid_search") {
    const { sql_filter, question, index_name, k } = args;
    if (!sql_filter || !question || !index_name) {
      throw new Error(`hybrid_search needs sql_filter + question + index_name, got ${JSON.stringify(args)}`);
    }
    // Pass through to /vectors/hybrid. id_column defaults to worker_id
    // server-side, which is what workers_500k uses.
    const body: any = { sql_filter, question, index_name, top_k: k ?? 10, generate: false };
    // NOTE(review): host is hard-coded here while the playbook_memory seed
    // call uses GATEWAY — confirm whether these are the same service.
    const res = await fetch("http://localhost:3100/vectors/hybrid", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(body),
    });
    return await res.json();
  }

  if (name === "sql") {
    const { query } = args;
    if (!query || typeof query !== "string") {
      throw new Error(`sql needs query (string), got ${JSON.stringify(args)}`);
    }
    // \b so that e.g. "SELECTFOO" is not mistaken for a SELECT statement.
    if (!/^\s*SELECT\b/i.test(query)) {
      throw new Error(`sql tool allows SELECT only: ${query}`);
    }
    return sqlQuery(query);
  }

  // Fall through to Phase 12 registry for any other named tool.
  return callTool(name, args);
}

// Phase 19.5: write-through to playbook_memory. The sealed fills are
// the endorsement; next semantically-similar query will surface them
// higher. /seed bypasses the successful_playbooks ingest round-trip
// — when that ingest path ships, this block should switch to append
// + rebuild instead.
//
// Best-effort: every failure is reported with console.warn and swallowed,
// so a seeding problem never fails an otherwise successful run.
async function seedPlaybookMemory(
  task: TaskSpec,
  sealed: { fills: Fill[]; approach: string },
): Promise<void> {
  try {
    // Seed context is what the embedding model actually sees alongside
    // the operation — so it has to carry task-semantic content (role,
    // city, scenario) rather than orchestrator bookkeeping. We stash
    // the bookkeeping in the full playbook JSON instead, where operators
    // can grep it without it polluting the ranking signal.
    const seedContext =
      task.approach_hint ?? `${task.target_role} fill in ${task.target_city}, ${task.target_state}`;

    // The payload is identical across attempts, so build it once.
    const payload = JSON.stringify({
      operation: task.operation,
      approach: sealed.approach || "multi-agent → hybrid search",
      context: seedContext,
      endorsed_names: sealed.fills.map((f) => f.name),
      append: true,
    });

    // Retry network-level failures with linear backoff (1s, 2s); the last
    // failure is rethrown into the outer catch.
    let seedRes: Response | undefined;
    for (let attempt = 0; attempt < MAX_SEED_ATTEMPTS && !seedRes; attempt++) {
      try {
        seedRes = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: payload,
        });
      } catch (e) {
        if (attempt === MAX_SEED_ATTEMPTS - 1) throw e;
        await Bun.sleep(1000 * (attempt + 1));
      }
    }
    if (!seedRes) throw new Error("no response from playbook_memory seed");

    if (seedRes.ok) {
      const j = (await seedRes.json()) as any;
      console.log(` ↳ playbook_memory seeded: id=${j.outcome?.playbook_id ?? j.playbook_id} entries=${j.entries_after}`);
    } else {
      console.warn(` ↳ playbook_memory seed failed: ${seedRes.status} ${await seedRes.text()}`);
    }
  } catch (e) {
    console.warn(` ↳ playbook_memory seed errored: ${errorMessage(e)}`);
  }
}

// Entry point: runs the executor/reviewer loop for one task, then either
// writes the playbook and exits 0, or dumps the log to a -FAILED.json file
// and exits 1.
async function main(): Promise<void> {
  const task = parseTaskFromArg();
  const log: LogEntry[] = [];
  let turn = 0;
  let consecutiveDrifts = 0;
  let sealed: { fills: Fill[]; approach: string } | null = null;

  // Timestamp an entry, add it to the shared log, and echo it to stdout.
  const append = (e: Omit<LogEntry, "at">): LogEntry => {
    const full: LogEntry = { ...e, at: new Date().toISOString() };
    log.push(full);
    console.log(fmt(full));
    return full;
  };

  console.log(`▶ task: ${task.operation}`);
  console.log(`▶ executor=${EXECUTOR_MODEL} reviewer=${REVIEWER_MODEL}`);
  console.log();

  try {
    while (turn < MAX_TURNS && !sealed) {
      turn += 1;

      // --- EXECUTOR TURN ---
      const execRaw = await generate(EXECUTOR_MODEL, executorPrompt(task, log), {
        temperature: 0.2,
        max_tokens: 1200,
        think: false,
      });
      let execAction: Action;
      try {
        execAction = parseAction(execRaw, "executor");
      } catch (e) {
        append({
          turn,
          role: "executor",
          model: EXECUTOR_MODEL,
          kind: "error",
          content: { message: errorMessage(e), raw: execRaw.slice(0, 400) },
        });
        throw e;
      }
      // The action's kind is logged as the entry kind; the cast is carried
      // over unchanged since the two kind types are declared in ./agent.ts.
      append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: execAction.kind as any, content: execAction });

      // If tool_call, execute and feed result back into the log. Tool
      // validation / server errors come back as a tool_result with an
      // `error` field — the executor reads its own error on the next turn
      // and self-corrects (e.g. "oh, I forgot the `question` argument").
      // This is softer than hard-failing the orchestrator: the whole
      // point of two-agent collaboration is letting agents learn from
      // immediate feedback instead of crashing the run.
      if (execAction.kind === "tool_call") {
        try {
          const result = await executeToolCall(execAction.tool, execAction.args);
          const trimmed = trimResult(result);
          append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "tool_result", content: trimmed });
        } catch (e) {
          append({
            turn,
            role: "executor",
            model: EXECUTOR_MODEL,
            kind: "tool_result",
            content: { error: errorMessage(e), tool: execAction.tool, args: execAction.args },
          });
          // Count as a soft drift — if the executor keeps throwing tool
          // errors, consecutiveDrifts still trips the abort.
          consecutiveDrifts += 1;
          if (consecutiveDrifts >= MAX_CONSECUTIVE_DRIFTS) {
            throw new Error(`aborting — ${MAX_CONSECUTIVE_DRIFTS} consecutive tool errors, executor can't self-correct`);
          }
        }
      }

      // --- REVIEWER TURN ---
      const revRaw = await generate(REVIEWER_MODEL, reviewerPrompt(task, log), {
        temperature: 0.1,
        max_tokens: 1000,
        think: false,
      });
      let revAction: Action;
      try {
        revAction = parseAction(revRaw, "reviewer");
      } catch (e) {
        append({
          turn,
          role: "reviewer",
          model: REVIEWER_MODEL,
          kind: "error",
          content: { message: errorMessage(e), raw: revRaw.slice(0, 400) },
        });
        throw e;
      }
      // Logged before the kind check so a malformed reviewer action still
      // shows up in the dumped log.
      append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "critique", content: revAction });
      if (revAction.kind !== "critique") {
        throw new Error(`reviewer emitted non-critique: ${revAction.kind}`);
      }

      if (revAction.verdict === "drift") {
        consecutiveDrifts += 1;
        if (consecutiveDrifts >= MAX_CONSECUTIVE_DRIFTS) {
          throw new Error(`aborting — ${MAX_CONSECUTIVE_DRIFTS} consecutive drift flags, executor can't self-correct`);
        }
      } else {
        // Any non-drift verdict clears the streak, tool errors included.
        consecutiveDrifts = 0;
      }

      // Consensus: executor proposed done AND reviewer approved.
      if (execAction.kind === "propose_done" && revAction.verdict === "approve_done") {
        if (execAction.fills.length !== task.target_count) {
          throw new Error(`consensus malformed — ${execAction.fills.length} fills vs target ${task.target_count}`);
        }
        append({
          turn,
          role: "reviewer",
          model: REVIEWER_MODEL,
          kind: "consensus_done",
          content: { fills: execAction.fills },
        });
        sealed = { fills: execAction.fills, approach: execAction.rationale };
      }
    }

    if (!sealed) throw new Error(`no consensus after ${MAX_TURNS} turns — task incomplete`);

    // Write playbook entry matching the successful_playbooks schema.
    const playbook = {
      timestamp: new Date().toISOString(),
      operation: task.operation,
      approach: sealed.approach,
      result: `${sealed.fills.length}/${task.target_count} filled → ${sealed.fills.map((f) => f.name).join(", ")}`,
      context: `executor=${EXECUTOR_MODEL} reviewer=${REVIEWER_MODEL} turns=${turn}`,
      task,
      fills: sealed.fills,
      log,
    };
    await mkdir(PLAYBOOK_DIR, { recursive: true });
    const path = join(PLAYBOOK_DIR, `${task.id}.json`);
    await writeFile(path, JSON.stringify(playbook, null, 2));
    console.log(`\n✓ playbook written: ${path}`);
    console.log(` ${playbook.result}`);

    await seedPlaybookMemory(task, sealed);

    process.exit(0);
  } catch (e) {
    console.error(`\n✗ ${errorMessage(e)}`);
    // Still persist the log for inspection. A failure while dumping must
    // not prevent the non-zero exit the test harness relies on.
    try {
      await mkdir(PLAYBOOK_DIR, { recursive: true });
      const path = join(PLAYBOOK_DIR, `${task.id}-FAILED.json`);
      await writeFile(path, JSON.stringify({ task, error: errorMessage(e), log }, null, 2));
      console.error(` log dumped: ${path}`);
    } catch (dumpErr) {
      console.error(` log dump failed: ${errorMessage(dumpErr)}`);
    }
    process.exit(1);
  }
}

// Caps a tool result's `rows` array at 20 entries before it goes into the
// log, recording how many rows were dropped in `_trimmed`. Results without
// a `rows` array are returned untouched.
function trimResult(r: any): any {
  if (r && Array.isArray(r.rows)) {
    return {
      ...r,
      rows: r.rows.slice(0, 20),
      _trimmed: r.rows.length > 20 ? `${r.rows.length - 20} more rows` : undefined,
    };
  }
  return r;
}

// Errors thrown before main's own try block (e.g. a bad task argument)
// land here; report them and keep the non-zero exit contract.
main().catch((e: unknown) => {
  console.error(`\n✗ ${errorMessage(e)}`);
  process.exit(1);
});