// Network proving: continuous build → verify → repeat with hot-swap profile.
//
// J's framing: "have them guide each other, when the test is complete we have
// a successful playbook, then spin up another agent that tests the viability
// of our network with the playbook and the hot-swap profile. Keep spinning up
// agents and testing — pass theory, real-world execution, not isolated unit
// tests."
//
// Each round = TWO phases:
//
//   1. BUILD phase. Two agents (executor + reviewer — see the model constants
//      below) work on a real staffing fill task. They guide each other via the
//      critique loop. On consensus → seal a playbook with CANONICAL short seed
//      text (the Pass 1 lesson — verbose seeds silently kill boost). Real
//      Ollama, real workers_500k, real /vectors/hybrid path.
//
//   2. VERIFY phase. A FRESH verifier agent spins up, activates the
//      staffing-recruiter profile (Phase 17 hot-swap), runs a probe query
//      against the same network, and judges from the live response whether
//      prior rounds' playbooks actually surface relevant workers higher.
//      The verifier writes a verdict: did the network learn?
//
// Three rounds, progressively harder:
//   R0: Welder x2 in Toledo, OH    — baseline
//   R1: Welder x2 in Cleveland, OH — same role, different city → tests geo
//       discrimination (Toledo workers MUST NOT bleed into Cleveland boost)
//   R2: Welder x3 in Toledo, OH    — re-fill same city, bigger count → tests
//       compounding (R0's endorsements should still rank up here)
//
// Run: bun run tests/multi-agent/network_proving.ts
//
// Fail-fast: any HTTP error or model crash bubbles to top-level, exits 1.
import { type LogEntry, type TaskSpec, type Action, type Fill, GATEWAY, generate, parseAction, executorPrompt, reviewerPrompt, sqlQuery, callTool, } from "./agent.ts"; const EXECUTOR_MODEL = "qwen3.5:latest"; const REVIEWER_MODEL = "qwen3:latest"; const VERIFIER_MODEL = "qwen2.5:latest"; const PROFILE_ID = "staffing-recruiter"; const INDEX_NAME = "workers_500k_v1"; const MAX_TURNS = 12; const MAX_TOOL_ERRORS = 3; const MAX_DRIFTS = 3; const TASK_DECK: TaskSpec[] = [ { id: "R0", operation: "fill: Welder x2 in Toledo, OH", target_role: "Welder", target_count: 2, target_city: "Toledo", target_state: "OH", approach_hint: "hybrid_search workers_500k_v1 with sql_filter role+state+city, then sql verify", }, { id: "R1", operation: "fill: Welder x2 in Cleveland, OH", target_role: "Welder", target_count: 2, target_city: "Cleveland", target_state: "OH", approach_hint: "hybrid_search workers_500k_v1 with sql_filter role+state+city, then sql verify", }, { id: "R2", operation: "fill: Welder x3 in Toledo, OH", target_role: "Welder", target_count: 3, target_city: "Toledo", target_state: "OH", approach_hint: "hybrid_search workers_500k_v1 with sql_filter role+state+city, then sql verify", }, ]; interface BuildResult { ok: boolean; task: TaskSpec; fills: Fill[]; turns: number; duration_secs: number; playbook_id?: string; entries_after_seed?: number; error?: string; } interface VerifyResult { profile_activated: boolean; warmed_indexes: number; probe_boost_total: number; // sum of playbook_boost across top-K probe_boosted_hits: number; // how many hits had boost > 0 probe_top_citations: string[]; // playbook_ids cited geo_discrimination_ok: boolean; // when prior playbook is in different city, boost should NOT bleed verdict: string; // qwen2.5's natural-language judgment confidence: number; // 0-100 self-rated duration_secs: number; } interface RoundLedger { round: number; task: TaskSpec; build: BuildResult; verify: VerifyResult; score: number; // /10 per round notes: string[]; } // 
─────────────────────── BUILD phase (two-agent loop) ─────────────────────── async function executeToolCall(name: string, args: Record): Promise { if (name === "hybrid_search") { const { sql_filter, question, index_name, k } = args; if (!sql_filter || !question || !index_name) { throw new Error(`hybrid_search needs sql_filter+question+index_name, got keys=${Object.keys(args).join(",")}`); } const r = await fetch(`${GATEWAY}/vectors/hybrid`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ sql_filter, question, index_name, filter_dataset: "workers_500k", id_column: "worker_id", top_k: k ?? 10, generate: false, use_playbook_memory: true, }), }); if (!r.ok) throw new Error(`hybrid → ${r.status}: ${await r.text()}`); return r.json(); } if (name === "sql") { if (!args.query) throw new Error("sql needs query"); if (!/^\s*SELECT/i.test(args.query)) throw new Error("sql allows SELECT only"); return sqlQuery(args.query); } return callTool(name, args); } function trim(r: any) { if (r && Array.isArray(r.rows)) return { ...r, rows: r.rows.slice(0, 20) }; if (r && Array.isArray(r.sources)) return { ...r, sources: r.sources.slice(0, 12) }; return r; } function fmtTurn(prefix: string, e: Omit): string { const c: any = e.content ?? {}; const head = `[${prefix} t${e.turn.toString().padStart(2, "0")} ${e.role.padEnd(8)} ${e.kind.padEnd(14)}]`; if (e.kind === "tool_call") return `${head} ${c.tool}(${JSON.stringify(c.args ?? {}).slice(0, 70)})`; if (e.kind === "tool_result") { if (c.error) return `${head} error: ${c.error}`; if (Array.isArray(c.sources)) return `${head} hybrid sql=${c.sql_matches} reranked=${c.vector_reranked}`; if (Array.isArray(c.rows)) return `${head} sql ${c.rows.length} rows`; return `${head} ${JSON.stringify(c).slice(0, 70)}`; } if (e.kind === "critique") return `${head} verdict=${c.verdict} ${(c.notes ?? "").slice(0, 50)}`; if (e.kind === "propose_done") return `${head} ${(c.fills ?? []).length} fills: ${(c.fills ?? 
[]).map((f: Fill) => f.name).join(", ")}`; if (e.kind === "consensus_done") return `${head} ✓`; if (e.kind === "plan") return `${head} ${(c.steps ?? []).length} steps`; return `${head} ${JSON.stringify(c).slice(0, 60)}`; } async function buildPhase(task: TaskSpec, prefix: string): Promise { const t0 = Date.now(); const log: LogEntry[] = []; let turn = 0, sealed: { fills: Fill[]; approach: string } | null = null; let toolErrors = 0, drifts = 0; const append = (e: Omit): LogEntry => { const full: LogEntry = { ...e, at: new Date().toISOString() }; log.push(full); console.log(fmtTurn(prefix, e)); return full; }; try { while (turn < MAX_TURNS && !sealed) { turn += 1; const execRaw = await generate(EXECUTOR_MODEL, executorPrompt(task, log), { temperature: 0.2, max_tokens: 1200 }); const execAction = parseAction(execRaw, "executor"); append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: execAction.kind as any, content: execAction }); if (execAction.kind === "tool_call") { try { const r = await executeToolCall(execAction.tool, execAction.args); append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "tool_result", content: trim(r) }); toolErrors = 0; } catch (e) { append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "tool_result", content: { error: (e as Error).message, tool: execAction.tool, args: execAction.args } }); toolErrors += 1; if (toolErrors >= MAX_TOOL_ERRORS) throw new Error(`${MAX_TOOL_ERRORS} consecutive tool errors`); } } const revRaw = await generate(REVIEWER_MODEL, reviewerPrompt(task, log), { temperature: 0.1, max_tokens: 1000 }); const revAction = parseAction(revRaw, "reviewer"); append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "critique", content: revAction }); if (revAction.kind !== "critique") throw new Error(`reviewer non-critique`); if (revAction.verdict === "drift") { drifts += 1; if (drifts >= MAX_DRIFTS) throw new Error(`${MAX_DRIFTS} consecutive drifts`); } else drifts = 0; if (execAction.kind === 
"propose_done" && revAction.verdict === "approve_done") { if (execAction.fills.length !== task.target_count) { throw new Error(`fills=${execAction.fills.length} target=${task.target_count}`); } append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "consensus_done", content: { fills: execAction.fills } }); sealed = { fills: execAction.fills, approach: (execAction as any).rationale ?? "multi-agent" }; } } if (!sealed) throw new Error(`no consensus after ${MAX_TURNS} turns`); // Phase 19 seed — CANONICAL short text (Pass 1 lesson). The verbose // executor rationale stays out of the embedding; we keep a separate // human-readable record in the playbook log. const canonicalApproach = `${task.target_role.toLowerCase()} fill via hybrid search`; const canonicalContext = `${task.target_role} fill in ${task.target_city}, ${task.target_state}`; let playbook_id: string | undefined; let entries_after_seed: number | undefined; try { const sr = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ operation: task.operation, approach: canonicalApproach, context: canonicalContext, endorsed_names: sealed.fills.map(f => f.name), append: true, }), }); if (sr.ok) { const j = await sr.json() as any; playbook_id = j.playbook_id; entries_after_seed = j.entries_after; console.log(`[${prefix}] ↳ seeded id=${playbook_id} entries=${entries_after_seed}`); } else { console.warn(`[${prefix}] ↳ seed failed: ${sr.status} ${await sr.text()}`); } } catch (e) { console.warn(`[${prefix}] ↳ seed errored: ${(e as Error).message}`); } return { ok: true, task, fills: sealed.fills, turns: turn, duration_secs: Math.round((Date.now() - t0) / 1000), playbook_id, entries_after_seed, }; } catch (e) { return { ok: false, task, fills: [], turns: turn, duration_secs: Math.round((Date.now() - t0) / 1000), error: (e as Error).message, }; } } // ─────────────────────── VERIFY phase (fresh single agent) 
─────────────────────── async function activateProfile(): Promise<{ ok: boolean; warmed: number; ms: number }> { const t0 = Date.now(); const r = await fetch(`${GATEWAY}/vectors/profile/${PROFILE_ID}/activate`, { method: "POST" }); const ms = Date.now() - t0; if (!r.ok) { console.warn(`profile activation failed: ${r.status} ${await r.text()}`); return { ok: false, warmed: 0, ms }; } const j = await r.json() as any; return { ok: true, warmed: (j.warmed_indexes ?? []).length, ms }; } async function probeWithBoost(task: TaskSpec) { const sql_filter = `role = '${task.target_role.replace(/'/g, "''")}' ` + `AND state = '${task.target_state}' ` + `AND city = '${task.target_city.replace(/'/g, "''")}'`; const r = await fetch(`${GATEWAY}/vectors/hybrid`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ index_name: INDEX_NAME, filter_dataset: "workers_500k", id_column: "worker_id", sql_filter, question: `${task.target_role} in ${task.target_city}, ${task.target_state}`, top_k: 10, generate: false, use_playbook_memory: true, playbook_memory_k: 15, }), }); if (!r.ok) throw new Error(`probe → ${r.status}: ${await r.text()}`); const j = (await r.json()) as any; const sources: any[] = j.sources ?? []; const boostedHits = sources.filter(s => (s.playbook_boost ?? 0) > 0).length; const totalBoost = sources.reduce((s, x) => s + (x.playbook_boost ?? 0), 0); const cites = Array.from(new Set(sources.flatMap(s => s.playbook_citations ?? []))).slice(0, 5); const topNames = sources.slice(0, 5).map(s => { const t = String(s.chunk_text ?? ""); return t.split("—")[0]?.trim() ?? s.doc_id; }); return { sources, boostedHits, totalBoost, cites, topNames }; } // Verifier prompt — fresh agent, no shared log with the build pair. It // gets the round's task, the prior rounds' sealed playbooks, and the live // probe result, and renders a human-readable verdict with a confidence. 
function verifierPrompt(task: TaskSpec, priorPlaybooks: Array<{op: string; fills: string[]}>, probe: { boostedHits: number; totalBoost: number; cites: string[]; topNames: string[] } ): string { const priorBlock = priorPlaybooks.length === 0 ? "(no prior playbooks — this is the first round)" : priorPlaybooks.map((p, i) => ` ${i+1}. ${p.op} → endorsed [${p.fills.join(", ")}]`).join("\n"); return `You are the VERIFIER agent. A fresh round just sealed a playbook on a real staffing substrate. Your job: judge whether the system learned from prior rounds. CURRENT ROUND: task: ${task.operation} in city: ${task.target_city}, ${task.target_state} PRIOR PLAYBOOKS (in playbook_memory): ${priorBlock} I activated the staffing-recruiter profile and ran a hybrid query for this exact task with use_playbook_memory=true. Live result from the substrate: - top-5 surfaced workers: ${probe.topNames.join(", ")} - hits with non-zero playbook_boost: ${probe.boostedHits} / 10 - total boost across top-10: ${probe.totalBoost.toFixed(3)} - playbook citations: [${probe.cites.join(", ")}] JUDGE: 1. If a prior playbook covered this same city + role, the boost should fire on the workers it endorsed (boostedHits > 0, citations non-empty). 2. If no prior playbook covers this combo, boost should be ~0 — that means the system is correctly NOT bleeding endorsements across geos. 3. Anything in between (e.g. some boost but for the wrong reason) is a partial pass. Respond with ONE JSON object only: {"learned": true|false, "verdict": "", "confidence": 0-100} learned=true means the network behaved as expected for this round (whether that's "boost fired because it should" or "boost stayed zero because it should"). learned=false means the system either failed to learn from a relevant prior playbook OR bled an irrelevant one. 
confidence is how sure you are.`; } async function verifyPhase(task: TaskSpec, ledger: RoundLedger[]): Promise { const t0 = Date.now(); const act = await activateProfile(); const probe = await probeWithBoost(task); // Decide what counts as geo-correct based on prior playbooks const priorMatchesThisGeo = ledger.some(r => r.build.ok && r.task.target_city === task.target_city && r.task.target_state === task.target_state && r.task.target_role === task.target_role ); const priorOtherGeo = ledger.some(r => r.build.ok && r.task.target_role === task.target_role && !(r.task.target_city === task.target_city && r.task.target_state === task.target_state) ); let geo_discrimination_ok: boolean; if (priorMatchesThisGeo) { geo_discrimination_ok = probe.boostedHits > 0; // expected lift } else if (priorOtherGeo) { geo_discrimination_ok = probe.boostedHits === 0; // must NOT bleed } else { geo_discrimination_ok = true; // no signal expected either way } // Spin up the fresh verifier agent const priorPlaybooks = ledger.filter(r => r.build.ok).map(r => ({ op: r.task.operation, fills: r.build.fills.map(f => f.name), })); let verdict = "verifier failed to respond"; let confidence = 0; try { const raw = await generate(VERIFIER_MODEL, verifierPrompt(task, priorPlaybooks, probe), { temperature: 0.1, max_tokens: 250, }); const start = raw.indexOf("{"), end = raw.lastIndexOf("}"); if (start >= 0 && end > start) { const j = JSON.parse(raw.slice(start, end + 1)); verdict = j.verdict ?? 
verdict; confidence = Number(j.confidence) || 0; } } catch (e) { verdict = `verifier parse error: ${(e as Error).message}`; } return { profile_activated: act.ok, warmed_indexes: act.warmed, probe_boost_total: probe.totalBoost, probe_boosted_hits: probe.boostedHits, probe_top_citations: probe.cites, geo_discrimination_ok, verdict, confidence, duration_secs: Math.round((Date.now() - t0) / 1000), }; } // ─────────────────────── round scoring ─────────────────────── function scoreRound(r: RoundLedger): { score: number; notes: string[] } { const notes: string[] = []; let s = 0; if (r.build.ok) { s += 3; notes.push(`✓ build sealed (${r.build.fills.map(f => f.name).join(", ")})`); } else { notes.push(`✗ build failed: ${r.build.error}`); } if (r.build.playbook_id) { s += 1; notes.push(`✓ seeded id=${r.build.playbook_id}`); } if (r.verify.profile_activated) { s += 1; notes.push(`✓ profile activated (warmed=${r.verify.warmed_indexes})`); } if (r.verify.geo_discrimination_ok) { s += 3; notes.push(`✓ geo discrimination correct (boostedHits=${r.verify.probe_boosted_hits})`); } else { notes.push(`✗ geo discrimination failed (boostedHits=${r.verify.probe_boosted_hits})`); } if (r.verify.confidence >= 60) { s += 2; notes.push(`✓ verifier confident (${r.verify.confidence}%): ${r.verify.verdict}`); } else { notes.push(`◑ verifier confidence ${r.verify.confidence}%: ${r.verify.verdict}`); } return { score: s, notes }; } // ─────────────────────── main loop ─────────────────────── async function main() { console.log(`▶ Network proving — ${TASK_DECK.length} rounds, profile=${PROFILE_ID}`); console.log(`▶ build pair: ${EXECUTOR_MODEL} + ${REVIEWER_MODEL}; verifier: ${VERIFIER_MODEL}\n`); const ledger: RoundLedger[] = []; for (let i = 0; i < TASK_DECK.length; i++) { const task = TASK_DECK[i]; console.log(`\n══════════ Round ${i} — ${task.operation} ══════════`); console.log(`\n[${task.id}] BUILD phase (two agents collaborating)`); const build = await buildPhase(task, task.id); 
console.log(`\n[${task.id}] VERIFY phase (fresh agent + hot-swap profile)`); const verify = await verifyPhase(task, ledger); console.log(` profile=${verify.profile_activated ? "ok" : "fail"} warmed=${verify.warmed_indexes} ` + `boosted=${verify.probe_boosted_hits}/10 totalBoost=${verify.probe_boost_total.toFixed(3)} ` + `cites=${verify.probe_top_citations.length} confidence=${verify.confidence}%`); console.log(` verdict: ${verify.verdict}`); const round: RoundLedger = { round: i, task, build, verify, score: 0, notes: [] }; const sc = scoreRound(round); round.score = sc.score; round.notes = sc.notes; ledger.push(round); console.log(`\n Round ${i} score: ${round.score}/10`); for (const n of round.notes) console.log(` ${n}`); } console.log(`\n══════════ Network viability summary ══════════`); const total = ledger.reduce((s, r) => s + r.score, 0); const max = ledger.length * 10; const avg = total / ledger.length; for (const r of ledger) console.log(` R${r.round} ${r.task.target_city.padEnd(10)} ${r.task.target_role.padEnd(20)} ${r.score}/10`); console.log(`\n TOTAL: ${total}/${max} AVG: ${avg.toFixed(1)}/10`); // Hard gate: at least 2/3 rounds must show the verifier is confident enough // AND build phase succeeded const passed = ledger.filter(r => r.build.ok && r.score >= 6).length; if (passed < Math.ceil(ledger.length * 2 / 3)) { throw new Error(`network proving gate failed — only ${passed}/${ledger.length} rounds passed (need ≥${Math.ceil(ledger.length * 2 / 3)})`); } console.log(`\n✓ Network proven over ${passed}/${ledger.length} rounds`); process.exit(0); } main().catch(e => { console.error(`\n✗ ${(e as Error).message}`); if ((e as any).stack) console.error((e as any).stack); process.exit(1); });