lakehouse/tests/multi-agent/network_proving.ts
root 25b7e6c3a7 Phase 19 wiring + Path 1/2 work + chain integrity fixes
Backend:
- crates/vectord/src/playbook_memory.rs (new): Phase 19 in-memory boost
  store with seed/rebuild/snapshot, plus temporal decay (e^-age/30 per
  playbook), persist_to_sql endpoint backing successful_playbooks_live,
  and discover_patterns endpoint for meta-index pattern aggregation
  (recurring certs/skills/archetype/reliability across similar past fills).
- DEFAULT_TOP_K_PLAYBOOKS bumped 5 → 25; old default silently missed
  most boosts when memory had > 25 entries.
- service.rs: new routes /vectors/playbook_memory/{seed,rebuild,stats,
  persist_sql,patterns}.

Bun staffing co-pilot (mcp-server/):
- /search, /match, /verify, /proof, /simulation/run, MCP tools all
  forward use_playbook_memory:true and playbook_memory_k:25 to the
  hybrid endpoint. Boost was previously dark across the entire app.
- /log no longer POSTs to /ingest/file — that endpoint REPLACES the
  dataset's object list, so single-row CSV writes were wiping all prior
  rows in successful_playbooks (sp_rows went 33→1 in one /log call).
  /log now seeds playbook_memory with canonical short text and calls
  /persist_sql to keep successful_playbooks_live in sync.
- /simulation/run cumulative end-of-week CSV write removed for the same
  reason. Per-day per-contract /seed (added in this session) is the
  accumulating feedback path now.
- search.html addWorkerInsight renders a green "Endorsed · N playbooks"
  chip with playbook citations when boost > 0.

Internal Dioxus UI (crates/ui/):
- Dashboard phase list rewritten through Phase 19 (was stuck at "Phase
  16: File Watcher" / "Phase 17: DB Connector" — both wrong).
- Removed fabricated "27ms" stat label.
- Ask tab examples + SQL default replaced with real staffing prompts
  against candidates/clients/job_orders (was referencing nonexistent
  employees/products/events).
- New Playbook tab exposes /vectors/playbook_memory/{stats,rebuild} and
  side-by-side hybrid search (boost OFF vs ON) with citations.

Tests (tests/multi-agent/):
- run_e2e_rated.ts: parallel two-agent (mistral + qwen2.5) build phase
  + verifier rating (geo, auth, persist, boost, speed → /10).
- network_proving.ts: continuous build → verify → repeat with
  staffing-recruiter profile hot-swap; geo-discrimination check.
- chain_of_custody.ts: single recruiter operation traced through every
  layer (Bun /search, direct /vectors/hybrid parity, /log, SQL,
  playbook_memory growth, profile activation, post-op boost lift).
2026-04-20 06:21:13 -05:00

470 lines
20 KiB
TypeScript

// Network proving: continuous build → verify → repeat with hot-swap profile.
//
// J's framing: "have them guide each other, when the test is complete we have
// a successful playbook, then spin up another agent that tests the viability
// of our network with the playbook and the hot-swap profile. Keep spinning up
// agents and testing — pass theory, real-world execution, not isolated unit
// tests."
//
// Each round = TWO phases:
//
// 1. BUILD phase. Two agents (mistral executor + qwen2.5 reviewer) work
// on a real staffing fill task. They guide each other via the critique
// loop. On consensus → seal a playbook with CANONICAL short seed text
// (the Pass 1 lesson — verbose seeds silently kill boost). Real Ollama,
// real workers_500k, real /vectors/hybrid path.
//
// 2. VERIFY phase. A FRESH qwen2.5 agent spins up, activates the
// staffing-recruiter profile (Phase 17 hot-swap), runs a probe query
// against the same network, and judges from the live response whether
// prior rounds' playbooks actually surface relevant workers higher.
// The verifier writes a verdict: did the network learn?
//
// Three rounds, progressively harder:
//   R0: Welder x2 in Toledo, OH    — baseline
//   R1: Welder x2 in Cleveland, OH — same role, different city → tests
//       geo discrimination (Toledo workers MUST NOT bleed into the
//       Cleveland boost)
//   R2: Welder x3 in Toledo, OH    — re-fill same city, bigger count →
//       tests compounding (R0's endorsements should still rank up here)
//
// Run: bun run tests/multi-agent/network_proving.ts
//
// Fail-fast: any HTTP error or model crash bubbles to top-level, exits 1.
import {
type LogEntry,
type TaskSpec,
type Action,
type Fill,
GATEWAY,
generate,
parseAction,
executorPrompt,
reviewerPrompt,
sqlQuery,
callTool,
} from "./agent.ts";
// Build pair: mistral executes, qwen2.5 reviews. A separate, fresh
// qwen2.5 instance acts as the per-round verifier (no shared transcript).
const EXECUTOR_MODEL = "mistral:latest";
const REVIEWER_MODEL = "qwen2.5:latest";
const VERIFIER_MODEL = "qwen2.5:latest";
const PROFILE_ID = "staffing-recruiter"; // Phase 17 hot-swap profile activated in VERIFY
const INDEX_NAME = "workers_500k_v1"; // vector index probed in VERIFY
// Build-loop hard limits: total executor/reviewer turns, consecutive
// tool-call failures, and consecutive reviewer "drift" verdicts.
const MAX_TURNS = 12;
const MAX_TOOL_ERRORS = 3;
const MAX_DRIFTS = 3;
// Round deck, progressively harder (see header comment): R0 is the
// baseline, R1 moves the same role to a different city (geo
// discrimination), R2 re-fills R0's city with a bigger count
// (compounding). Round order is load-bearing for the VERIFY logic.
const TASK_DECK: TaskSpec[] = [
{
id: "R0", operation: "fill: Welder x2 in Toledo, OH",
target_role: "Welder", target_count: 2, target_city: "Toledo", target_state: "OH",
approach_hint: "hybrid_search workers_500k_v1 with sql_filter role+state+city, then sql verify",
},
{
id: "R1", operation: "fill: Welder x2 in Cleveland, OH",
target_role: "Welder", target_count: 2, target_city: "Cleveland", target_state: "OH",
approach_hint: "hybrid_search workers_500k_v1 with sql_filter role+state+city, then sql verify",
},
{
id: "R2", operation: "fill: Welder x3 in Toledo, OH",
target_role: "Welder", target_count: 3, target_city: "Toledo", target_state: "OH",
approach_hint: "hybrid_search workers_500k_v1 with sql_filter role+state+city, then sql verify",
},
];
// Outcome of one BUILD phase. ok:false results still carry turns and
// duration, plus the error string that stopped the loop.
interface BuildResult {
ok: boolean;
task: TaskSpec;
fills: Fill[]; // sealed fills (empty when ok=false)
turns: number;
duration_secs: number;
playbook_id?: string; // set only when the Phase 19 /seed call succeeded
entries_after_seed?: number; // playbook_memory size reported by /seed
error?: string; // present only when ok=false
}
// Outcome of one VERIFY phase (fresh agent + hot-swap profile + probe).
interface VerifyResult {
profile_activated: boolean;
warmed_indexes: number;
probe_boost_total: number; // sum of playbook_boost across top-K
probe_boosted_hits: number; // how many hits had boost > 0
probe_top_citations: string[]; // playbook_ids cited
geo_discrimination_ok: boolean; // when prior playbook is in different city, boost should NOT bleed
verdict: string; // qwen2.5's natural-language judgment
confidence: number; // 0-100 self-rated
duration_secs: number;
}
// One row of the run ledger: the task plus both phase results and the
// round score/notes filled in by scoreRound().
interface RoundLedger {
round: number;
task: TaskSpec;
build: BuildResult;
verify: VerifyResult;
score: number; // /10 per round
notes: string[];
}
// ─────────────────────── BUILD phase (two-agent loop) ───────────────────────
// Dispatch one model-requested tool call. `hybrid_search` and `sql` get
// dedicated handling (argument validation, SELECT-only guard); any other
// tool name falls through to the generic MCP bridge.
async function executeToolCall(name: string, args: Record<string, any>): Promise<any> {
  if (name === "sql") {
    if (!args.query) throw new Error("sql needs query");
    if (!/^\s*SELECT/i.test(args.query)) throw new Error("sql allows SELECT only");
    return sqlQuery(args.query);
  }
  if (name !== "hybrid_search") return callTool(name, args);

  const { sql_filter, question, index_name, k } = args;
  if (!sql_filter || !question || !index_name) {
    throw new Error(`hybrid_search needs sql_filter+question+index_name, got keys=${Object.keys(args).join(",")}`);
  }
  // Playbook memory is always ON for build-phase searches; k defaults to 10.
  const payload = {
    sql_filter, question, index_name,
    filter_dataset: "workers_500k", id_column: "worker_id",
    top_k: k ?? 10, generate: false, use_playbook_memory: true,
  };
  const resp = await fetch(`${GATEWAY}/vectors/hybrid`, {
    method: "POST", headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });
  if (!resp.ok) throw new Error(`hybrid → ${resp.status}: ${await resp.text()}`);
  return resp.json();
}
// Clamp bulky tool results before they enter the shared transcript:
// SQL rows to 20, hybrid sources to 12; anything else passes through.
function trim(r: any) {
  if (r == null) return r;
  if (Array.isArray(r.rows)) return { ...r, rows: r.rows.slice(0, 20) };
  if (Array.isArray(r.sources)) return { ...r, sources: r.sources.slice(0, 12) };
  return r;
}
// Render one transcript entry as a single console line: a fixed-width
// header, then a kind-specific, truncated summary of the content.
function fmtTurn(prefix: string, e: Omit<LogEntry, "at">): string {
  const body: any = e.content ?? {};
  const head = `[${prefix} t${e.turn.toString().padStart(2, "0")} ${e.role.padEnd(8)} ${e.kind.padEnd(14)}]`;
  switch (e.kind) {
    case "tool_call":
      return `${head} ${body.tool}(${JSON.stringify(body.args ?? {}).slice(0, 70)})`;
    case "tool_result":
      if (body.error) return `${head} error: ${body.error}`;
      if (Array.isArray(body.sources)) return `${head} hybrid sql=${body.sql_matches} reranked=${body.vector_reranked}`;
      if (Array.isArray(body.rows)) return `${head} sql ${body.rows.length} rows`;
      return `${head} ${JSON.stringify(body).slice(0, 70)}`;
    case "critique":
      return `${head} verdict=${body.verdict} ${(body.notes ?? "").slice(0, 50)}`;
    case "propose_done":
      return `${head} ${(body.fills ?? []).length} fills: ${(body.fills ?? []).map((f: Fill) => f.name).join(", ")}`;
    case "consensus_done":
      return `${head}`;
    case "plan":
      return `${head} ${(body.steps ?? []).length} steps`;
    default:
      return `${head} ${JSON.stringify(body).slice(0, 60)}`;
  }
}
// BUILD phase: run the two-agent critique loop (executor proposes,
// reviewer critiques) on one fill task until consensus or a hard limit,
// then seed playbook_memory with CANONICAL short text.
//
// Never throws: failures come back as ok:false with the error string,
// so main() can still run VERIFY and score the round.
async function buildPhase(task: TaskSpec, prefix: string): Promise<BuildResult> {
const t0 = Date.now();
// Shared transcript: both agents' prompts are rebuilt from this log
// every turn, so append ORDER is load-bearing.
const log: LogEntry[] = [];
let turn = 0, sealed: { fills: Fill[]; approach: string } | null = null;
let toolErrors = 0, drifts = 0;
// Stamp, append to the transcript, and echo one line to the console.
const append = (e: Omit<LogEntry, "at">): LogEntry => {
const full: LogEntry = { ...e, at: new Date().toISOString() };
log.push(full);
console.log(fmtTurn(prefix, e));
return full;
};
try {
while (turn < MAX_TURNS && !sealed) {
turn += 1;
// Executor acts first each turn (tool call / plan / propose_done).
const execRaw = await generate(EXECUTOR_MODEL, executorPrompt(task, log), { temperature: 0.2, max_tokens: 600 });
const execAction = parseAction(execRaw, "executor");
append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: execAction.kind as any, content: execAction });
if (execAction.kind === "tool_call") {
try {
const r = await executeToolCall(execAction.tool, execAction.args);
append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "tool_result", content: trim(r) });
toolErrors = 0; // counter tracks CONSECUTIVE failures only
} catch (e) {
// Tool failures go back into the transcript so the executor can
// self-correct next turn; only a streak of them aborts the build.
append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "tool_result",
content: { error: (e as Error).message, tool: execAction.tool, args: execAction.args } });
toolErrors += 1;
if (toolErrors >= MAX_TOOL_ERRORS) throw new Error(`${MAX_TOOL_ERRORS} consecutive tool errors`);
}
}
// Reviewer then critiques the transcript including this turn's action.
const revRaw = await generate(REVIEWER_MODEL, reviewerPrompt(task, log), { temperature: 0.1, max_tokens: 400 });
const revAction = parseAction(revRaw, "reviewer");
append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "critique", content: revAction });
if (revAction.kind !== "critique") throw new Error(`reviewer non-critique`);
if (revAction.verdict === "drift") {
drifts += 1;
if (drifts >= MAX_DRIFTS) throw new Error(`${MAX_DRIFTS} consecutive drifts`);
} else drifts = 0;
// Consensus = executor proposed done AND reviewer approved, AND the
// fill count matches the task target exactly.
if (execAction.kind === "propose_done" && revAction.verdict === "approve_done") {
if (execAction.fills.length !== task.target_count) {
throw new Error(`fills=${execAction.fills.length} target=${task.target_count}`);
}
append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "consensus_done", content: { fills: execAction.fills } });
sealed = { fills: execAction.fills, approach: (execAction as any).rationale ?? "multi-agent" };
}
}
if (!sealed) throw new Error(`no consensus after ${MAX_TURNS} turns`);
// Phase 19 seed — CANONICAL short text (Pass 1 lesson). The verbose
// executor rationale stays out of the embedding; we keep a separate
// human-readable record in the playbook log.
const canonicalApproach = `${task.target_role.toLowerCase()} fill via hybrid search`;
const canonicalContext = `${task.target_role} fill in ${task.target_city}, ${task.target_state}`;
let playbook_id: string | undefined;
let entries_after_seed: number | undefined;
// Seeding is best-effort: a failed seed costs the round its "seeded"
// score point but does not fail the build.
try {
const sr = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
method: "POST", headers: { "Content-Type": "application/json" },
body: JSON.stringify({
operation: task.operation,
approach: canonicalApproach,
context: canonicalContext,
endorsed_names: sealed.fills.map(f => f.name),
append: true,
}),
});
if (sr.ok) {
const j = await sr.json() as any;
playbook_id = j.playbook_id;
entries_after_seed = j.entries_after;
console.log(`[${prefix}] ↳ seeded id=${playbook_id} entries=${entries_after_seed}`);
} else {
console.warn(`[${prefix}] ↳ seed failed: ${sr.status} ${await sr.text()}`);
}
} catch (e) {
console.warn(`[${prefix}] ↳ seed errored: ${(e as Error).message}`);
}
return {
ok: true, task, fills: sealed.fills, turns: turn,
duration_secs: Math.round((Date.now() - t0) / 1000),
playbook_id, entries_after_seed,
};
} catch (e) {
// Any limit breach (tool errors, drifts, no consensus) lands here and
// is reported in the result rather than thrown to the caller.
return {
ok: false, task, fills: [], turns: turn,
duration_secs: Math.round((Date.now() - t0) / 1000),
error: (e as Error).message,
};
}
}
// ─────────────────────── VERIFY phase (fresh single agent) ───────────────────────
// Hot-swap the staffing-recruiter profile (Phase 17) and report how many
// indexes it warmed. Failure is non-fatal: the caller records ok=false
// and the round simply loses its activation point.
async function activateProfile(): Promise<{ ok: boolean; warmed: number; ms: number }> {
  const started = Date.now();
  const resp = await fetch(`${GATEWAY}/vectors/profile/${PROFILE_ID}/activate`, { method: "POST" });
  const ms = Date.now() - started;
  if (!resp.ok) {
    console.warn(`profile activation failed: ${resp.status} ${await resp.text()}`);
    return { ok: false, warmed: 0, ms };
  }
  const body = await resp.json() as any;
  const warmed = (body.warmed_indexes ?? []).length;
  return { ok: true, warmed, ms };
}
// Probe the live network for this round's exact task with playbook
// memory ON, and distill the boost signal the verifier will judge.
//
// Returns:
//   sources     — raw top-10 hybrid hits
//   boostedHits — how many hits carried playbook_boost > 0
//   totalBoost  — sum of playbook_boost across the hits
//   cites       — up to 5 distinct playbook_ids cited by the hits
//   topNames    — display names of the top-5 hits (chunk text before
//                 the "—" separator, falling back to doc_id)
async function probeWithBoost(task: TaskSpec) {
  // FIX: escape single quotes in EVERY interpolated value. Previously
  // target_state was interpolated raw — harmless for 2-letter state
  // codes, but inconsistent with the role/city handling.
  const esc = (s: string) => s.replace(/'/g, "''");
  const sql_filter = `role = '${esc(task.target_role)}' `
    + `AND state = '${esc(task.target_state)}' `
    + `AND city = '${esc(task.target_city)}'`;
  const r = await fetch(`${GATEWAY}/vectors/hybrid`, {
    method: "POST", headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      index_name: INDEX_NAME, filter_dataset: "workers_500k", id_column: "worker_id",
      sql_filter, question: `${task.target_role} in ${task.target_city}, ${task.target_state}`,
      top_k: 10, generate: false, use_playbook_memory: true, playbook_memory_k: 15,
    }),
  });
  if (!r.ok) throw new Error(`probe → ${r.status}: ${await r.text()}`);
  const j = (await r.json()) as any;
  const sources: any[] = j.sources ?? [];
  const boostedHits = sources.filter(s => (s.playbook_boost ?? 0) > 0).length;
  const totalBoost = sources.reduce((s, x) => s + (x.playbook_boost ?? 0), 0);
  const cites = Array.from(new Set(sources.flatMap(s => s.playbook_citations ?? []))).slice(0, 5);
  const topNames = sources.slice(0, 5).map(s => {
    const t = String(s.chunk_text ?? "");
    return t.split("—")[0]?.trim() ?? s.doc_id;
  });
  return { sources, boostedHits, totalBoost, cites, topNames };
}
// Verifier prompt — fresh agent, no shared log with the build pair. It
// gets the round's task, the prior rounds' sealed playbooks, and the live
// probe result, and renders a human-readable verdict with a confidence.
// Build the fresh verifier's prompt from the round task, the prior
// rounds' sealed playbooks, and the live probe result.
function verifierPrompt(task: TaskSpec, priorPlaybooks: Array<{op: string; fills: string[]}>,
probe: { boostedHits: number; totalBoost: number; cites: string[]; topNames: string[] }
): string {
// One numbered line per prior playbook, or an explicit "none" marker on
// the first round.
const priorBlock = priorPlaybooks.length === 0
? "(no prior playbooks — this is the first round)"
: priorPlaybooks.map((p, i) => ` ${i+1}. ${p.op} → endorsed [${p.fills.join(", ")}]`).join("\n");
// NOTE: everything below is a runtime prompt string — the judging rules
// the verifier follows live in this text, not in code. Keep byte-exact.
return `You are the VERIFIER agent. A fresh round just sealed a playbook on a real staffing
substrate. Your job: judge whether the system learned from prior rounds.
CURRENT ROUND:
task: ${task.operation}
in city: ${task.target_city}, ${task.target_state}
PRIOR PLAYBOOKS (in playbook_memory):
${priorBlock}
I activated the staffing-recruiter profile and ran a hybrid query for this exact task with
use_playbook_memory=true. Live result from the substrate:
- top-5 surfaced workers: ${probe.topNames.join(", ")}
- hits with non-zero playbook_boost: ${probe.boostedHits} / 10
- total boost across top-10: ${probe.totalBoost.toFixed(3)}
- playbook citations: [${probe.cites.join(", ")}]
JUDGE:
1. If a prior playbook covered this same city + role, the boost should fire on the workers
it endorsed (boostedHits > 0, citations non-empty).
2. If no prior playbook covers this combo, boost should be ~0 — that means the system is
correctly NOT bleeding endorsements across geos.
3. Anything in between (e.g. some boost but for the wrong reason) is a partial pass.
Respond with ONE JSON object only:
{"learned": true|false, "verdict": "<one sentence>", "confidence": 0-100}
learned=true means the network behaved as expected for this round (whether that's "boost fired
because it should" or "boost stayed zero because it should"). learned=false means the system
either failed to learn from a relevant prior playbook OR bled an irrelevant one. confidence is
how sure you are.`;
}
// VERIFY phase: activate the hot-swap profile, probe the live network,
// decide what a geo-correct boost pattern looks like given PRIOR rounds,
// then let a fresh verifier agent render a natural-language verdict.
//
// NOTE(review): `ledger` excludes the current round, but buildPhase has
// already seeded this round's playbook before verify runs — presumably
// that seed can itself boost this probe, which would trip the "must NOT
// bleed" branch on the first visit to a new city. Confirm against
// playbook_memory's matching semantics.
async function verifyPhase(task: TaskSpec, ledger: RoundLedger[]): Promise<VerifyResult> {
  const started = Date.now();
  const activation = await activateProfile();
  const probe = await probeWithBoost(task);

  // Classify prior SUCCESSFUL rounds relative to this task's geo/role.
  const successfulPrior = ledger.filter(r => r.build.ok);
  const sameGeo = (r: RoundLedger) =>
    r.task.target_city === task.target_city && r.task.target_state === task.target_state;
  const sameRole = (r: RoundLedger) => r.task.target_role === task.target_role;
  const priorMatchesThisGeo = successfulPrior.some(r => sameRole(r) && sameGeo(r));
  const priorOtherGeo = successfulPrior.some(r => sameRole(r) && !sameGeo(r));

  // Expected pattern: lift when a prior playbook covers this exact
  // combo; strictly zero when only other-geo playbooks exist; anything
  // goes when there is no prior signal at all.
  let geo_discrimination_ok: boolean;
  if (priorMatchesThisGeo) geo_discrimination_ok = probe.boostedHits > 0;
  else if (priorOtherGeo) geo_discrimination_ok = probe.boostedHits === 0;
  else geo_discrimination_ok = true;

  // Fresh verifier agent — no shared transcript with the build pair.
  const priorPlaybooks = successfulPrior.map(r => ({
    op: r.task.operation, fills: r.build.fills.map(f => f.name),
  }));
  let verdict = "verifier failed to respond";
  let confidence = 0;
  try {
    const raw = await generate(VERIFIER_MODEL, verifierPrompt(task, priorPlaybooks, probe), {
      temperature: 0.1, max_tokens: 250,
    });
    // Tolerate chatter around the JSON: take the outermost {...} span.
    const open = raw.indexOf("{");
    const close = raw.lastIndexOf("}");
    if (open >= 0 && close > open) {
      const parsed = JSON.parse(raw.slice(open, close + 1));
      verdict = parsed.verdict ?? verdict;
      confidence = Number(parsed.confidence) || 0;
    }
  } catch (e) {
    verdict = `verifier parse error: ${(e as Error).message}`;
  }
  return {
    profile_activated: activation.ok,
    warmed_indexes: activation.warmed,
    probe_boost_total: probe.totalBoost,
    probe_boosted_hits: probe.boostedHits,
    probe_top_citations: probe.cites,
    geo_discrimination_ok,
    verdict, confidence,
    duration_secs: Math.round((Date.now() - started) / 1000),
  };
}
// ─────────────────────── round scoring ───────────────────────
// Score one round out of 10:
//   build sealed ......... 3    playbook seeded ...... 1
//   profile activated .... 1    geo discrimination ... 3
//   verifier ≥60% conf ... 2
function scoreRound(r: RoundLedger): { score: number; notes: string[] } {
  const notes: string[] = [];
  let total = 0;
  const award = (points: number, note: string) => { total += points; notes.push(note); };

  if (r.build.ok) award(3, `✓ build sealed (${r.build.fills.map(f => f.name).join(", ")})`);
  else award(0, `✗ build failed: ${r.build.error}`);
  if (r.build.playbook_id) award(1, `✓ seeded id=${r.build.playbook_id}`);
  if (r.verify.profile_activated) award(1, `✓ profile activated (warmed=${r.verify.warmed_indexes})`);
  if (r.verify.geo_discrimination_ok) award(3, `✓ geo discrimination correct (boostedHits=${r.verify.probe_boosted_hits})`);
  else award(0, `✗ geo discrimination failed (boostedHits=${r.verify.probe_boosted_hits})`);
  if (r.verify.confidence >= 60) award(2, `✓ verifier confident (${r.verify.confidence}%): ${r.verify.verdict}`);
  else award(0, `◑ verifier confidence ${r.verify.confidence}%: ${r.verify.verdict}`);
  return { score: total, notes };
}
// ─────────────────────── main loop ───────────────────────
// Main loop: BUILD → VERIFY → score for each deck task, then apply the
// hard viability gate. Exits 0 on success; throws (→ top-level exit 1)
// when the gate fails.
async function main() {
  console.log(`▶ Network proving — ${TASK_DECK.length} rounds, profile=${PROFILE_ID}`);
  console.log(`▶ build pair: ${EXECUTOR_MODEL} + ${REVIEWER_MODEL}; verifier: ${VERIFIER_MODEL}\n`);
  const ledger: RoundLedger[] = [];
  for (let i = 0; i < TASK_DECK.length; i++) {
    const task = TASK_DECK[i];
    // FIX: banner previously ran the round number straight into the task
    // text ("Round 0fill: Welder x2 …") — missing separator.
    console.log(`\n══════════ Round ${i} — ${task.operation} ══════════`);
    console.log(`\n[${task.id}] BUILD phase (two agents collaborating)`);
    const build = await buildPhase(task, task.id);
    console.log(`\n[${task.id}] VERIFY phase (fresh agent + hot-swap profile)`);
    // Verifier sees only PRIOR rounds' ledger; the current round is
    // scored below and pushed afterwards.
    const verify = await verifyPhase(task, ledger);
    console.log(` profile=${verify.profile_activated ? "ok" : "fail"} warmed=${verify.warmed_indexes} `
      + `boosted=${verify.probe_boosted_hits}/10 totalBoost=${verify.probe_boost_total.toFixed(3)} `
      + `cites=${verify.probe_top_citations.length} confidence=${verify.confidence}%`);
    console.log(` verdict: ${verify.verdict}`);
    const round: RoundLedger = { round: i, task, build, verify, score: 0, notes: [] };
    const sc = scoreRound(round);
    round.score = sc.score; round.notes = sc.notes;
    ledger.push(round);
    console.log(`\n Round ${i} score: ${round.score}/10`);
    for (const n of round.notes) console.log(` ${n}`);
  }
  console.log(`\n══════════ Network viability summary ══════════`);
  const total = ledger.reduce((s, r) => s + r.score, 0);
  const max = ledger.length * 10;
  const avg = total / ledger.length;
  for (const r of ledger) console.log(` R${r.round} ${r.task.target_city.padEnd(10)} ${r.task.target_role.padEnd(20)} ${r.score}/10`);
  console.log(`\n TOTAL: ${total}/${max} AVG: ${avg.toFixed(1)}/10`);
  // Hard gate: at least 2/3 of rounds must have a SUCCESSFUL build AND a
  // round score of ≥ 6/10. (Comment previously claimed the gate was on
  // verifier confidence alone — the check is on the composite score.)
  const passed = ledger.filter(r => r.build.ok && r.score >= 6).length;
  if (passed < Math.ceil(ledger.length * 2 / 3)) {
    throw new Error(`network proving gate failed — only ${passed}/${ledger.length} rounds passed (need ≥${Math.ceil(ledger.length * 2 / 3)})`);
  }
  console.log(`\n✓ Network proven over ${passed}/${ledger.length} rounds`);
  process.exit(0);
}
// Fail-fast top level: any uncaught error prints (with stack when
// available) and exits non-zero so CI treats the run as failed.
main().catch((err: unknown) => {
  const e = err as Error & { stack?: string };
  console.error(`\n✗ ${e.message}`);
  if (e.stack) console.error(e.stack);
  process.exit(1);
});