Three coupled fixes that together turned the Riverfront Steel scenario
from 0/5 (mistral) to 4/5 (qwen3.5) with T3 flagging real staffing
concerns rather than linter advice.
MODEL SWAP
- Executor: mistral → qwen3.5:latest (9.7B, 262K ctx, thinking).
mistral's decoder emitted malformed JSON on complex SQL filters
regardless of prompt; J called it — stop using mistral.
- Reviewer: qwen2.5 → qwen3:latest (40K ctx)
- Applied to scenario.ts, orchestrator.ts, network_proving.ts,
run_e2e_rated.ts
CONTINUATION PRIMITIVE (agent.ts)
- generateContinuable(): empty-response → geometric backoff retry;
truncated-JSON → continue from partial as scratchpad; bounded by
budget cap + max_continuations. No more "bump max_tokens until it
stops truncating" tourniquet.
- generateTreeSplit(): map-reduce for oversized input corpora with
running scratchpad digest, reduce pass for final synthesis.
- Empty text no longer throws — it's a signal to continuable that
thinking ate the budget.
think:false FOR HOT PATH
- qwen3.5 burned ~650 tokens of hidden thinking for trivial JSON
emission. For executor/reviewer/draft: think:false. For T3/T4/T5
overseers: thinking stays on (that's the point).
- Sidecar generate endpoint accepts `think` bool, passes through to
Ollama's /api/generate.
VERIFIED OUTCOMES
Riverfront Steel 2026-04-21, qwen3.5+continuable+think:false:
08:00 baseline_fill 3/3 4 turns
10:30 recurring 2/2 3 turns (1 playbook citation)
12:15 expansion 0/5 drift-aborted (5-fill orchestration
problem, separate work)
14:00 emergency 4/4 3 turns (1 citation)
15:45 misplacement 1/1 3 turns
→ T3 caught Patrick Ross double-booking across events
→ T3 flagged forklift cert drift on the event that failed
→ Cross-day lesson proposed "maintain buffer of ≥3 emergency
candidates, pre-fetch certs for expansion, booking system
cross-check" — real staffing advice, not generic linter output
PRD PHASE 21 rewritten to reflect the actual primitive shape (two-
call map-reduce with scratchpad glue) instead of the tourniquet
approach originally documented. Rust port queued for next sprint.
scripts/ab_t3_test.sh: A/B harness that chains B→C→D runs and emits
tests/multi-agent/playbooks/ab_scorecard.json.
303 lines
13 KiB
TypeScript
// Two-agent orchestrator. Both agents run as concurrent async loops
|
|
// coordinated through a shared in-memory log; one turn of executor then
|
|
// one turn of reviewer, interleaved until consensus_done, drift-cycle
|
|
// blown, or hard turn cap. On success writes a playbook JSON; on failure
|
|
// exits non-zero with the full log for inspection.
|
|
//
|
|
// Fail-fast: every caught error is appended to the log AND rethrown, so
|
|
// the orchestrator top-level catches, dumps, and exits with code 1. The
|
|
// test harness reads the exit code to decide if the substrate is healthy.
|
|
|
|
import { mkdir, writeFile } from "node:fs/promises";
import { readFileSync } from "node:fs";
import { join } from "node:path";

import {
  type LogEntry,
  type TaskSpec,
  type Action,
  type Fill,
  callTool,
  hybridSearch,
  sqlQuery,
  generate,
  parseAction,
  executorPrompt,
  reviewerPrompt,
  GATEWAY,
} from "./agent.ts";
// Model assignments — per the model-swap notes at the top of this change:
// executor moved to qwen3.5 after mistral kept emitting malformed JSON on
// complex SQL filters; reviewer moved to qwen3.
const EXECUTOR_MODEL = "qwen3.5:latest";
const REVIEWER_MODEL = "qwen3:latest";
// Loop bounds for main(): the run is bounded even if the agents never
// converge or never stop drifting.
const MAX_TURNS = 12; // executor turns; reviewer gets one per
const MAX_CONSECUTIVE_DRIFTS = 3; // drift-cycle blown → give up
// Default task. Override via argv[2] if you want something else; see
// `parseTaskFromArg`. Modeled on the production staffing pattern but not
// yet present in the successful_playbooks list — this is a fresh fill.
// Toledo, OH has 342 welders in workers_500k so supply is ample: the test
// is about collaboration and drift correction, not needle-in-haystack.
const DEFAULT_TASK: TaskSpec = {
  id: `task-${Date.now()}`, // unique per process start; used as the playbook filename
  operation: "fill: Welder x2 in Toledo, OH",
  target_role: "Welder",
  target_count: 2,
  target_city: "Toledo",
  target_state: "OH",
  // Free-text steer for the agents; also reused as the playbook_memory
  // seed context in main(), so keep it task-semantic (role/city/index).
  approach_hint: "hybrid search against workers_500k_v1, narrow by role+city+state+availability, rank semantically",
};
function parseTaskFromArg(): TaskSpec {
|
|
const arg = process.argv[2];
|
|
if (!arg) return DEFAULT_TASK;
|
|
// Accept "role:Welder count:2 city:Columbus state:OH" style for ad-hoc
|
|
// tasks without standing up a JSON file. Anything more complex, feed
|
|
// it a JSON path.
|
|
if (arg.endsWith(".json")) {
|
|
return JSON.parse(require("node:fs").readFileSync(arg, "utf-8"));
|
|
}
|
|
const kv: Record<string, string> = {};
|
|
for (const token of arg.split(/\s+/)) {
|
|
const [k, ...v] = token.split(":");
|
|
kv[k] = v.join(":");
|
|
}
|
|
return {
|
|
id: `task-${Date.now()}`,
|
|
operation: `fill: ${kv.role} x${kv.count} in ${kv.city}, ${kv.state}`,
|
|
target_role: kv.role,
|
|
target_count: Number(kv.count),
|
|
target_city: kv.city,
|
|
target_state: kv.state,
|
|
approach_hint: kv.hint ?? "hybrid search",
|
|
};
|
|
}
|
|
|
|
// Defensive one-line formatter for each log entry, so the human watching
// stdout can follow the run without pulling the JSONL file. Models
// sometimes omit optional fields (rationale, notes), so every access is
// guarded.
function fmt(e: LogEntry): string {
|
|
const tag = `[t${e.turn.toString().padStart(2, "0")} ${e.role.padEnd(8)} ${e.kind.padEnd(14)}]`;
|
|
const c = e.content ?? {};
|
|
const trim = (s: any, n: number) => String(s ?? "").slice(0, n);
|
|
if (e.kind === "tool_call")
|
|
return `${tag} ${c.tool}(${JSON.stringify(c.args ?? {}).slice(0, 80)}) — ${trim(c.rationale, 60)}`;
|
|
if (e.kind === "tool_result") {
|
|
const rows = c?.rows?.length ?? c?.sources?.length ?? undefined;
|
|
return `${tag} ${rows !== undefined ? `rows=${rows}` : JSON.stringify(c).slice(0, 80)}`;
|
|
}
|
|
if (e.kind === "critique") return `${tag} verdict=${c.verdict} — ${trim(c.notes, 80)}`;
|
|
if (e.kind === "propose_done")
|
|
return `${tag} ${c.fills?.length ?? 0} fills: ${(c.fills ?? []).map((f: Fill) => f.name).join(", ")}`;
|
|
if (e.kind === "consensus_done") return `${tag} ✓`;
|
|
if (e.kind === "plan") return `${tag} ${c.steps?.length ?? 0} steps: ${(c.steps ?? []).slice(0, 2).join(" / ")}`;
|
|
if (e.kind === "error") return `${tag} ${c.message ?? c}`;
|
|
return `${tag} ${JSON.stringify(c).slice(0, 100)}`;
|
|
}
|
|
|
|
// Execute one tool call. The tool catalog in the prompt lists both the
|
|
// registered Phase 12 tools AND a pseudo-tool "hybrid_search" for the
|
|
// /vectors/hybrid endpoint — unify here so the executor doesn't need to
|
|
// know which surface a capability lives on.
|
|
async function executeToolCall(name: string, args: Record<string, any>): Promise<any> {
|
|
if (name === "hybrid_search") {
|
|
const { sql_filter, question, index_name, k } = args;
|
|
if (!sql_filter || !question || !index_name) {
|
|
throw new Error(`hybrid_search needs sql_filter + question + index_name, got ${JSON.stringify(args)}`);
|
|
}
|
|
// Pass through to /vectors/hybrid. id_column defaults to worker_id
|
|
// server-side, which is what workers_500k uses.
|
|
const body: any = { sql_filter, question, index_name, top_k: k ?? 10, generate: false };
|
|
return (await (await fetch("http://localhost:3100/vectors/hybrid", {
|
|
method: "POST",
|
|
headers: { "Content-Type": "application/json" },
|
|
body: JSON.stringify(body),
|
|
})).json());
|
|
}
|
|
if (name === "sql") {
|
|
const { query } = args;
|
|
if (!query || typeof query !== "string") throw new Error(`sql needs query (string), got ${JSON.stringify(args)}`);
|
|
if (!/^\s*SELECT/i.test(query)) throw new Error(`sql tool allows SELECT only: ${query}`);
|
|
return sqlQuery(query);
|
|
}
|
|
// Fall through to Phase 12 registry for any other named tool.
|
|
return callTool(name, args);
|
|
}
|
|
|
|
// Top-level orchestration loop. Alternates one executor turn and one
// reviewer turn against the shared in-memory log until consensus_done,
// the drift-cycle is blown, or the hard turn cap is hit. On success,
// writes the playbook JSON and seeds playbook_memory, then exits 0; on
// any failure, dumps the full log and exits 1 (fail-fast contract in the
// file header).
async function main() {
  const task = parseTaskFromArg();
  const log: LogEntry[] = [];
  let turn = 0;
  let consecutiveDrifts = 0; // reset on any non-drift reviewer verdict
  // Non-null only after consensus — doubles as the loop's success flag.
  let sealed: { fills: Fill[]; approach: string } | null = null;

  // Append an entry to the shared log with a wall-clock timestamp and
  // echo the one-line formatted version to stdout for the human watching.
  const append = (e: Omit<LogEntry, "at">): LogEntry => {
    const full: LogEntry = { ...e, at: new Date().toISOString() };
    log.push(full);
    console.log(fmt(full));
    return full;
  };

  console.log(`▶ task: ${task.operation}`);
  console.log(`▶ executor=${EXECUTOR_MODEL} reviewer=${REVIEWER_MODEL}`);
  console.log();

  try {
    while (turn < MAX_TURNS && !sealed) {
      turn += 1;

      // --- EXECUTOR TURN ---
      const execRaw = await generate(EXECUTOR_MODEL, executorPrompt(task, log), {
        temperature: 0.2,
        max_tokens: 1200,
      });
      let execAction: Action;
      try {
        execAction = parseAction(execRaw, "executor");
      } catch (e) {
        // Unparseable model output is fatal (fail-fast), but log it first
        // with a 400-char sample of the raw text for offline debugging.
        append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "error",
          content: { message: (e as Error).message, raw: execRaw.slice(0, 400) } });
        throw e;
      }

      append({ turn, role: "executor", model: EXECUTOR_MODEL,
        kind: execAction.kind as any, content: execAction });

      // If tool_call, execute and feed result back into the log. Tool
      // validation / server errors come back as a tool_result with an
      // `error` field — the executor reads its own error on the next turn
      // and self-corrects (e.g. "oh, I forgot the `question` argument").
      // This is softer than hard-failing the orchestrator: the whole
      // point of two-agent collaboration is letting agents learn from
      // immediate feedback instead of crashing the run.
      if (execAction.kind === "tool_call") {
        try {
          const result = await executeToolCall(execAction.tool, execAction.args);
          const trimmed = trimResult(result); // cap rows so the log stays promptable
          append({ turn, role: "executor", model: EXECUTOR_MODEL,
            kind: "tool_result", content: trimmed });
        } catch (e) {
          append({ turn, role: "executor", model: EXECUTOR_MODEL,
            kind: "tool_result",
            content: { error: (e as Error).message, tool: execAction.tool, args: execAction.args } });
          // Count as a soft drift — if the executor keeps throwing tool
          // errors, consecutiveDrifts still trips the abort.
          // NOTE(review): a later non-drift reviewer verdict resets this
          // counter (see below), so alternating tool-error/clean turns
          // never abort — confirm that is intended.
          consecutiveDrifts += 1;
          if (consecutiveDrifts >= MAX_CONSECUTIVE_DRIFTS) {
            throw new Error(`aborting — ${MAX_CONSECUTIVE_DRIFTS} consecutive tool errors, executor can't self-correct`);
          }
        }
      }

      // --- REVIEWER TURN ---
      const revRaw = await generate(REVIEWER_MODEL, reviewerPrompt(task, log), {
        temperature: 0.1,
        max_tokens: 1000,
      });
      let revAction: Action;
      try {
        revAction = parseAction(revRaw, "reviewer");
      } catch (e) {
        append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "error",
          content: { message: (e as Error).message, raw: revRaw.slice(0, 400) } });
        throw e;
      }
      append({ turn, role: "reviewer", model: REVIEWER_MODEL,
        kind: "critique", content: revAction });

      // The reviewer's contract is critique-only; anything else is a
      // substrate bug, not a recoverable drift.
      if (revAction.kind !== "critique") throw new Error(`reviewer emitted non-critique: ${revAction.kind}`);

      if (revAction.verdict === "drift") {
        consecutiveDrifts += 1;
        if (consecutiveDrifts >= MAX_CONSECUTIVE_DRIFTS) {
          throw new Error(`aborting — ${MAX_CONSECUTIVE_DRIFTS} consecutive drift flags, executor can't self-correct`);
        }
      } else {
        consecutiveDrifts = 0;
      }

      // Consensus: executor proposed done AND reviewer approved.
      if (execAction.kind === "propose_done" && revAction.verdict === "approve_done") {
        // Both agents agreeing on a wrong-sized fill set is malformed
        // consensus — a substrate failure, not something to retry.
        if (execAction.fills.length !== task.target_count) {
          throw new Error(`consensus malformed — ${execAction.fills.length} fills vs target ${task.target_count}`);
        }
        append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "consensus_done",
          content: { fills: execAction.fills } });
        // NOTE(review): rationale appears optional on Action — the seed
        // call below guards with `|| "multi-agent → hybrid search"`, but
        // playbook.approach does not; confirm against the Action type.
        sealed = { fills: execAction.fills, approach: execAction.rationale };
      }
    }

    if (!sealed) throw new Error(`no consensus after ${MAX_TURNS} turns — task incomplete`);

    // Write playbook entry matching the successful_playbooks schema.
    const playbook = {
      timestamp: new Date().toISOString(),
      operation: task.operation,
      approach: sealed.approach,
      result: `${sealed.fills.length}/${task.target_count} filled → ${sealed.fills.map(f => f.name).join(", ")}`,
      context: `executor=${EXECUTOR_MODEL} reviewer=${REVIEWER_MODEL} turns=${turn}`,
      task,
      fills: sealed.fills,
      log,
    };
    await mkdir("./tests/multi-agent/playbooks", { recursive: true });
    const path = join("./tests/multi-agent/playbooks", `${task.id}.json`);
    await writeFile(path, JSON.stringify(playbook, null, 2));
    console.log(`\n✓ playbook written: ${path}`);
    console.log(`  ${playbook.result}`);

    // Phase 19.5: write-through to playbook_memory. The sealed fills are
    // the endorsement; next semantically-similar query will surface them
    // higher. /seed bypasses the successful_playbooks ingest round-trip
    // — when that ingest path ships, this block should switch to append
    // + rebuild instead.
    try {
      // Seed context is what the embedding model actually sees alongside
      // the operation — so it has to carry task-semantic content (role,
      // city, scenario) rather than orchestrator bookkeeping. We stash
      // the bookkeeping in the full playbook JSON instead (see playbook
      // object above) where operators can grep it without it polluting
      // the ranking signal.
      const seedContext = task.approach_hint
        ?? `${task.target_role} fill in ${task.target_city}, ${task.target_state}`;
      const seedRes = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
          operation: task.operation,
          approach: sealed.approach || "multi-agent → hybrid search",
          context: seedContext,
          endorsed_names: sealed.fills.map(f => f.name),
          append: true,
        }),
      });
      if (seedRes.ok) {
        const j = await seedRes.json() as any;
        console.log(` ↳ playbook_memory seeded: id=${j.playbook_id} entries=${j.entries_after}`);
      } else {
        console.warn(` ↳ playbook_memory seed failed: ${seedRes.status} ${await seedRes.text()}`);
      }
    } catch (e) {
      // Seeding is best-effort: a memory-service outage must not fail a
      // run that already produced a valid playbook.
      console.warn(` ↳ playbook_memory seed errored: ${(e as Error).message}`);
    }

    process.exit(0);
  } catch (e) {
    console.error(`\n✗ ${(e as Error).message}`);
    // Still persist the log for inspection.
    await mkdir("./tests/multi-agent/playbooks", { recursive: true });
    const path = join("./tests/multi-agent/playbooks", `${task.id}-FAILED.json`);
    await writeFile(path, JSON.stringify({ task, error: (e as Error).message, log }, null, 2));
    console.error(` log dumped: ${path}`);
    process.exit(1);
  }
}
function trimResult(r: any): any {
|
|
if (r && Array.isArray(r.rows)) {
|
|
return { ...r, rows: r.rows.slice(0, 20), _trimmed: r.rows.length > 20 ? `${r.rows.length - 20} more rows` : undefined };
|
|
}
|
|
return r;
|
|
}
|
|
|
|
// Entry point. main() handles all errors itself and sets the exit code.
main();