lakehouse/tests/multi-agent/network_proving.ts
root 25b7e6c3a7 Phase 19 wiring + Path 1/2 work + chain integrity fixes
Backend:
- crates/vectord/src/playbook_memory.rs (new): Phase 19 in-memory boost
  store with seed/rebuild/snapshot, plus temporal decay (e^-age/30 per
  playbook), persist_to_sql endpoint backing successful_playbooks_live,
  and discover_patterns endpoint for meta-index pattern aggregation
  (recurring certs/skills/archetype/reliability across similar past fills).
- DEFAULT_TOP_K_PLAYBOOKS bumped 5 → 25; old default silently missed
  most boosts when memory had > 25 entries.
- service.rs: new routes /vectors/playbook_memory/{seed,rebuild,stats,
  persist_sql,patterns}.

Bun staffing co-pilot (mcp-server/):
- /search, /match, /verify, /proof, /simulation/run, MCP tools all
  forward use_playbook_memory:true and playbook_memory_k:25 to the
  hybrid endpoint. Boost was previously dark across the entire app.
- /log no longer POSTs to /ingest/file — that endpoint REPLACES the
  dataset's object list, so single-row CSV writes were wiping all prior
  rows in successful_playbooks (sp_rows went 33→1 in one /log call).
  /log now seeds playbook_memory with canonical short text and calls
  /persist_sql to keep successful_playbooks_live in sync.
- /simulation/run cumulative end-of-week CSV write removed for the same
  reason. Per-day per-contract /seed (added in this session) is the
  accumulating feedback path now.
- search.html addWorkerInsight renders a green "Endorsed · N playbooks"
  chip with playbook citations when boost > 0.

Internal Dioxus UI (crates/ui/):
- Dashboard phase list rewritten through Phase 19 (was stuck at "Phase
  16: File Watcher" / "Phase 17: DB Connector" — both wrong).
- Removed fabricated "27ms" stat label.
- Ask tab examples + SQL default replaced with real staffing prompts
  against candidates/clients/job_orders (was referencing nonexistent
  employees/products/events).
- New Playbook tab exposes /vectors/playbook_memory/{stats,rebuild} and
  side-by-side hybrid search (boost OFF vs ON) with citations.

Tests (tests/multi-agent/):
- run_e2e_rated.ts: parallel two-agent (mistral + qwen2.5) build phase
  + verifier rating (geo, auth, persist, boost, speed → /10).
- network_proving.ts: continuous build → verify → repeat with
  staffing-recruiter profile hot-swap; geo-discrimination check.
- chain_of_custody.ts: single recruiter operation traced through every
  layer (Bun /search, direct /vectors/hybrid parity, /log, SQL,
  playbook_memory growth, profile activation, post-op boost lift).
2026-04-20 06:21:13 -05:00

470 lines
20 KiB
TypeScript

// Network proving: continuous build → verify → repeat with hot-swap profile.
//
// J's framing: "have them guide each other, when the test is complete we have
// a successful playbook, then spin up another agent that tests the viability
// of our network with the playbook and the hot-swap profile. Keep spinning up
// agents and testing — pass theory, real-world execution, not isolated unit
// tests."
//
// Each round = TWO phases:
//
// 1. BUILD phase. Two agents (mistral executor + qwen2.5 reviewer) work
// on a real staffing fill task. They guide each other via the critique
// loop. On consensus → seal a playbook with CANONICAL short seed text
// (the Pass 1 lesson — verbose seeds silently kill boost). Real Ollama,
// real workers_500k, real /vectors/hybrid path.
//
// 2. VERIFY phase. A FRESH qwen2.5 agent spins up, activates the
// staffing-recruiter profile (Phase 17 hot-swap), runs a probe query
// against the same network, and judges from the live response whether
// prior rounds' playbooks actually surface relevant workers higher.
// The verifier writes a verdict: did the network learn?
//
// Three rounds, progressively harder:
//   R0: Welder x2 in Toledo, OH    — baseline
//   R1: Welder x2 in Cleveland, OH — same role, different city → tests
//       geo discrimination (Toledo workers MUST NOT bleed into the
//       Cleveland boost)
//   R2: Welder x3 in Toledo, OH    — re-fill same city, bigger count →
//       tests compounding (R0's endorsements should still rank up here)
//
// Run: bun run tests/multi-agent/network_proving.ts
//
// Fail-fast: any HTTP error or model crash bubbles to top-level, exits 1.
import {
type LogEntry,
type TaskSpec,
type Action,
type Fill,
GATEWAY,
generate,
parseAction,
executorPrompt,
reviewerPrompt,
sqlQuery,
callTool,
} from "./agent.ts";
// Build pair: mistral executes, qwen2.5 reviews. A separate, fresh
// qwen2.5 instance acts as the per-round verifier (no shared transcript).
const EXECUTOR_MODEL = "mistral:latest";
const REVIEWER_MODEL = "qwen2.5:latest";
const VERIFIER_MODEL = "qwen2.5:latest";
const PROFILE_ID = "staffing-recruiter"; // Phase 17 hot-swap profile activated in VERIFY
const INDEX_NAME = "workers_500k_v1"; // vector index probed in VERIFY
// Build-loop hard limits: total executor/reviewer turns, consecutive
// tool-call failures, and consecutive reviewer "drift" verdicts.
const MAX_TURNS = 12;
const MAX_TOOL_ERRORS = 3;
const MAX_DRIFTS = 3;
// Round deck, progressively harder (see header comment): R0 is the
// baseline, R1 moves the same role to a different city (geo
// discrimination), R2 re-fills R0's city with a bigger count
// (compounding). Round order is load-bearing for the VERIFY logic.
const TASK_DECK: TaskSpec[] = [
{
id: "R0", operation: "fill: Welder x2 in Toledo, OH",
target_role: "Welder", target_count: 2, target_city: "Toledo", target_state: "OH",
approach_hint: "hybrid_search workers_500k_v1 with sql_filter role+state+city, then sql verify",
},
{
id: "R1", operation: "fill: Welder x2 in Cleveland, OH",
target_role: "Welder", target_count: 2, target_city: "Cleveland", target_state: "OH",
approach_hint: "hybrid_search workers_500k_v1 with sql_filter role+state+city, then sql verify",
},
{
id: "R2", operation: "fill: Welder x3 in Toledo, OH",
target_role: "Welder", target_count: 3, target_city: "Toledo", target_state: "OH",
approach_hint: "hybrid_search workers_500k_v1 with sql_filter role+state+city, then sql verify",
},
];
// Outcome of one BUILD phase. ok:false results still carry turns and
// duration, plus the error string that stopped the loop.
interface BuildResult {
ok: boolean;
task: TaskSpec;
fills: Fill[]; // sealed fills (empty when ok=false)
turns: number;
duration_secs: number;
playbook_id?: string; // set only when the Phase 19 /seed call succeeded
entries_after_seed?: number; // playbook_memory size reported by /seed
error?: string; // present only when ok=false
}
// Outcome of one VERIFY phase (fresh agent + hot-swap profile + probe).
interface VerifyResult {
profile_activated: boolean;
warmed_indexes: number;
probe_boost_total: number; // sum of playbook_boost across top-K
probe_boosted_hits: number; // how many hits had boost > 0
probe_top_citations: string[]; // playbook_ids cited
geo_discrimination_ok: boolean; // when prior playbook is in different city, boost should NOT bleed
verdict: string; // qwen2.5's natural-language judgment
confidence: number; // 0-100 self-rated
duration_secs: number;
}
// One row of the run ledger: the task plus both phase results and the
// round score/notes filled in by scoreRound().
interface RoundLedger {
round: number;
task: TaskSpec;
build: BuildResult;
verify: VerifyResult;
score: number; // /10 per round
notes: string[];
}
// ─────────────────────── BUILD phase (two-agent loop) ───────────────────────
// Dispatch one model-requested tool call. `hybrid_search` and `sql` get
// dedicated handling (argument validation, SELECT-only guard); any other
// tool name falls through to the generic MCP bridge.
async function executeToolCall(name: string, args: Record<string, any>): Promise<any> {
  if (name === "sql") {
    if (!args.query) throw new Error("sql needs query");
    if (!/^\s*SELECT/i.test(args.query)) throw new Error("sql allows SELECT only");
    return sqlQuery(args.query);
  }
  if (name !== "hybrid_search") return callTool(name, args);

  const { sql_filter, question, index_name, k } = args;
  if (!sql_filter || !question || !index_name) {
    throw new Error(`hybrid_search needs sql_filter+question+index_name, got keys=${Object.keys(args).join(",")}`);
  }
  // Playbook memory is always ON for build-phase searches; k defaults to 10.
  const payload = {
    sql_filter, question, index_name,
    filter_dataset: "workers_500k", id_column: "worker_id",
    top_k: k ?? 10, generate: false, use_playbook_memory: true,
  };
  const resp = await fetch(`${GATEWAY}/vectors/hybrid`, {
    method: "POST", headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });
  if (!resp.ok) throw new Error(`hybrid → ${resp.status}: ${await resp.text()}`);
  return resp.json();
}
// Clamp bulky tool results before they enter the shared transcript:
// SQL rows to 20, hybrid sources to 12; anything else passes through.
function trim(r: any) {
  if (r == null) return r;
  if (Array.isArray(r.rows)) return { ...r, rows: r.rows.slice(0, 20) };
  if (Array.isArray(r.sources)) return { ...r, sources: r.sources.slice(0, 12) };
  return r;
}
// Render one transcript entry as a single console line: a fixed-width
// header, then a kind-specific, truncated summary of the content.
function fmtTurn(prefix: string, e: Omit<LogEntry, "at">): string {
  const body: any = e.content ?? {};
  const head = `[${prefix} t${e.turn.toString().padStart(2, "0")} ${e.role.padEnd(8)} ${e.kind.padEnd(14)}]`;
  switch (e.kind) {
    case "tool_call":
      return `${head} ${body.tool}(${JSON.stringify(body.args ?? {}).slice(0, 70)})`;
    case "tool_result":
      if (body.error) return `${head} error: ${body.error}`;
      if (Array.isArray(body.sources)) return `${head} hybrid sql=${body.sql_matches} reranked=${body.vector_reranked}`;
      if (Array.isArray(body.rows)) return `${head} sql ${body.rows.length} rows`;
      return `${head} ${JSON.stringify(body).slice(0, 70)}`;
    case "critique":
      return `${head} verdict=${body.verdict} ${(body.notes ?? "").slice(0, 50)}`;
    case "propose_done":
      return `${head} ${(body.fills ?? []).length} fills: ${(body.fills ?? []).map((f: Fill) => f.name).join(", ")}`;
    case "consensus_done":
      return `${head}`;
    case "plan":
      return `${head} ${(body.steps ?? []).length} steps`;
    default:
      return `${head} ${JSON.stringify(body).slice(0, 60)}`;
  }
}
// BUILD phase: run the two-agent critique loop (executor proposes,
// reviewer critiques) on one fill task until consensus or a hard limit,
// then seed playbook_memory with CANONICAL short text.
//
// Never throws: failures come back as ok:false with the error string,
// so main() can still run VERIFY and score the round.
async function buildPhase(task: TaskSpec, prefix: string): Promise<BuildResult> {
const t0 = Date.now();
// Shared transcript: both agents' prompts are rebuilt from this log
// every turn, so append ORDER is load-bearing.
const log: LogEntry[] = [];
let turn = 0, sealed: { fills: Fill[]; approach: string } | null = null;
let toolErrors = 0, drifts = 0;
// Stamp, append to the transcript, and echo one line to the console.
const append = (e: Omit<LogEntry, "at">): LogEntry => {
const full: LogEntry = { ...e, at: new Date().toISOString() };
log.push(full);
console.log(fmtTurn(prefix, e));
return full;
};
try {
while (turn < MAX_TURNS && !sealed) {
turn += 1;
// Executor acts first each turn (tool call / plan / propose_done).
const execRaw = await generate(EXECUTOR_MODEL, executorPrompt(task, log), { temperature: 0.2, max_tokens: 600 });
const execAction = parseAction(execRaw, "executor");
append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: execAction.kind as any, content: execAction });
if (execAction.kind === "tool_call") {
try {
const r = await executeToolCall(execAction.tool, execAction.args);
append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "tool_result", content: trim(r) });
toolErrors = 0; // counter tracks CONSECUTIVE failures only
} catch (e) {
// Tool failures go back into the transcript so the executor can
// self-correct next turn; only a streak of them aborts the build.
append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "tool_result",
content: { error: (e as Error).message, tool: execAction.tool, args: execAction.args } });
toolErrors += 1;
if (toolErrors >= MAX_TOOL_ERRORS) throw new Error(`${MAX_TOOL_ERRORS} consecutive tool errors`);
}
}
// Reviewer then critiques the transcript including this turn's action.
const revRaw = await generate(REVIEWER_MODEL, reviewerPrompt(task, log), { temperature: 0.1, max_tokens: 400 });
const revAction = parseAction(revRaw, "reviewer");
append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "critique", content: revAction });
if (revAction.kind !== "critique") throw new Error(`reviewer non-critique`);
if (revAction.verdict === "drift") {
drifts += 1;
if (drifts >= MAX_DRIFTS) throw new Error(`${MAX_DRIFTS} consecutive drifts`);
} else drifts = 0;
// Consensus = executor proposed done AND reviewer approved, AND the
// fill count matches the task target exactly.
if (execAction.kind === "propose_done" && revAction.verdict === "approve_done") {
if (execAction.fills.length !== task.target_count) {
throw new Error(`fills=${execAction.fills.length} target=${task.target_count}`);
}
append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "consensus_done", content: { fills: execAction.fills } });
sealed = { fills: execAction.fills, approach: (execAction as any).rationale ?? "multi-agent" };
}
}
if (!sealed) throw new Error(`no consensus after ${MAX_TURNS} turns`);
// Phase 19 seed — CANONICAL short text (Pass 1 lesson). The verbose
// executor rationale stays out of the embedding; we keep a separate
// human-readable record in the playbook log.
const canonicalApproach = `${task.target_role.toLowerCase()} fill via hybrid search`;
const canonicalContext = `${task.target_role} fill in ${task.target_city}, ${task.target_state}`;
let playbook_id: string | undefined;
let entries_after_seed: number | undefined;
// Seeding is best-effort: a failed seed costs the round its "seeded"
// score point but does not fail the build.
try {
const sr = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
method: "POST", headers: { "Content-Type": "application/json" },
body: JSON.stringify({
operation: task.operation,
approach: canonicalApproach,
context: canonicalContext,
endorsed_names: sealed.fills.map(f => f.name),
append: true,
}),
});
if (sr.ok) {
const j = await sr.json() as any;
playbook_id = j.playbook_id;
entries_after_seed = j.entries_after;
console.log(`[${prefix}] ↳ seeded id=${playbook_id} entries=${entries_after_seed}`);
} else {
console.warn(`[${prefix}] ↳ seed failed: ${sr.status} ${await sr.text()}`);
}
} catch (e) {
console.warn(`[${prefix}] ↳ seed errored: ${(e as Error).message}`);
}
return {
ok: true, task, fills: sealed.fills, turns: turn,
duration_secs: Math.round((Date.now() - t0) / 1000),
playbook_id, entries_after_seed,
};
} catch (e) {
// Any limit breach (tool errors, drifts, no consensus) lands here and
// is reported in the result rather than thrown to the caller.
return {
ok: false, task, fills: [], turns: turn,
duration_secs: Math.round((Date.now() - t0) / 1000),
error: (e as Error).message,
};
}
}
// ─────────────────────── VERIFY phase (fresh single agent) ───────────────────────
// Hot-swap the staffing-recruiter profile (Phase 17) and report how many
// indexes it warmed. Failure is non-fatal: the caller records ok=false
// and the round simply loses its activation point.
async function activateProfile(): Promise<{ ok: boolean; warmed: number; ms: number }> {
  const started = Date.now();
  const resp = await fetch(`${GATEWAY}/vectors/profile/${PROFILE_ID}/activate`, { method: "POST" });
  const ms = Date.now() - started;
  if (!resp.ok) {
    console.warn(`profile activation failed: ${resp.status} ${await resp.text()}`);
    return { ok: false, warmed: 0, ms };
  }
  const body = await resp.json() as any;
  const warmed = (body.warmed_indexes ?? []).length;
  return { ok: true, warmed, ms };
}
// Probe the live network for this round's exact task with playbook
// memory ON, and distill the boost signal the verifier will judge.
//
// Returns:
//   sources     — raw top-10 hybrid hits
//   boostedHits — how many hits carried playbook_boost > 0
//   totalBoost  — sum of playbook_boost across the hits
//   cites       — up to 5 distinct playbook_ids cited by the hits
//   topNames    — display names of the top-5 hits (chunk text before
//                 the "—" separator, falling back to doc_id)
async function probeWithBoost(task: TaskSpec) {
  // FIX: escape single quotes in EVERY interpolated value. Previously
  // target_state was interpolated raw — harmless for 2-letter state
  // codes, but inconsistent with the role/city handling.
  const esc = (s: string) => s.replace(/'/g, "''");
  const sql_filter = `role = '${esc(task.target_role)}' `
    + `AND state = '${esc(task.target_state)}' `
    + `AND city = '${esc(task.target_city)}'`;
  const r = await fetch(`${GATEWAY}/vectors/hybrid`, {
    method: "POST", headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      index_name: INDEX_NAME, filter_dataset: "workers_500k", id_column: "worker_id",
      sql_filter, question: `${task.target_role} in ${task.target_city}, ${task.target_state}`,
      top_k: 10, generate: false, use_playbook_memory: true, playbook_memory_k: 15,
    }),
  });
  if (!r.ok) throw new Error(`probe → ${r.status}: ${await r.text()}`);
  const j = (await r.json()) as any;
  const sources: any[] = j.sources ?? [];
  const boostedHits = sources.filter(s => (s.playbook_boost ?? 0) > 0).length;
  const totalBoost = sources.reduce((s, x) => s + (x.playbook_boost ?? 0), 0);
  const cites = Array.from(new Set(sources.flatMap(s => s.playbook_citations ?? []))).slice(0, 5);
  const topNames = sources.slice(0, 5).map(s => {
    const t = String(s.chunk_text ?? "");
    return t.split("—")[0]?.trim() ?? s.doc_id;
  });
  return { sources, boostedHits, totalBoost, cites, topNames };
}
// Verifier prompt — fresh agent, no shared log with the build pair. It
// gets the round's task, the prior rounds' sealed playbooks, and the live
// probe result, and renders a human-readable verdict with a confidence.
// Build the fresh verifier's prompt from the round task, the prior
// rounds' sealed playbooks, and the live probe result.
function verifierPrompt(task: TaskSpec, priorPlaybooks: Array<{op: string; fills: string[]}>,
probe: { boostedHits: number; totalBoost: number; cites: string[]; topNames: string[] }
): string {
// One numbered line per prior playbook, or an explicit "none" marker on
// the first round.
const priorBlock = priorPlaybooks.length === 0
? "(no prior playbooks — this is the first round)"
: priorPlaybooks.map((p, i) => ` ${i+1}. ${p.op} → endorsed [${p.fills.join(", ")}]`).join("\n");
// NOTE: everything below is a runtime prompt string — the judging rules
// the verifier follows live in this text, not in code. Keep byte-exact.
return `You are the VERIFIER agent. A fresh round just sealed a playbook on a real staffing
substrate. Your job: judge whether the system learned from prior rounds.
CURRENT ROUND:
task: ${task.operation}
in city: ${task.target_city}, ${task.target_state}
PRIOR PLAYBOOKS (in playbook_memory):
${priorBlock}
I activated the staffing-recruiter profile and ran a hybrid query for this exact task with
use_playbook_memory=true. Live result from the substrate:
- top-5 surfaced workers: ${probe.topNames.join(", ")}
- hits with non-zero playbook_boost: ${probe.boostedHits} / 10
- total boost across top-10: ${probe.totalBoost.toFixed(3)}
- playbook citations: [${probe.cites.join(", ")}]
JUDGE:
1. If a prior playbook covered this same city + role, the boost should fire on the workers
it endorsed (boostedHits > 0, citations non-empty).
2. If no prior playbook covers this combo, boost should be ~0 — that means the system is
correctly NOT bleeding endorsements across geos.
3. Anything in between (e.g. some boost but for the wrong reason) is a partial pass.
Respond with ONE JSON object only:
{"learned": true|false, "verdict": "<one sentence>", "confidence": 0-100}
learned=true means the network behaved as expected for this round (whether that's "boost fired
because it should" or "boost stayed zero because it should"). learned=false means the system
either failed to learn from a relevant prior playbook OR bled an irrelevant one. confidence is
how sure you are.`;
}
// VERIFY phase: activate the hot-swap profile, probe the live network,
// decide what a geo-correct boost pattern looks like given PRIOR rounds,
// then let a fresh verifier agent render a natural-language verdict.
//
// NOTE(review): `ledger` excludes the current round, but buildPhase has
// already seeded this round's playbook before verify runs — presumably
// that seed can itself boost this probe, which would trip the "must NOT
// bleed" branch on the first visit to a new city. Confirm against
// playbook_memory's matching semantics.
async function verifyPhase(task: TaskSpec, ledger: RoundLedger[]): Promise<VerifyResult> {
  const started = Date.now();
  const activation = await activateProfile();
  const probe = await probeWithBoost(task);

  // Classify prior SUCCESSFUL rounds relative to this task's geo/role.
  const successfulPrior = ledger.filter(r => r.build.ok);
  const sameGeo = (r: RoundLedger) =>
    r.task.target_city === task.target_city && r.task.target_state === task.target_state;
  const sameRole = (r: RoundLedger) => r.task.target_role === task.target_role;
  const priorMatchesThisGeo = successfulPrior.some(r => sameRole(r) && sameGeo(r));
  const priorOtherGeo = successfulPrior.some(r => sameRole(r) && !sameGeo(r));

  // Expected pattern: lift when a prior playbook covers this exact
  // combo; strictly zero when only other-geo playbooks exist; anything
  // goes when there is no prior signal at all.
  let geo_discrimination_ok: boolean;
  if (priorMatchesThisGeo) geo_discrimination_ok = probe.boostedHits > 0;
  else if (priorOtherGeo) geo_discrimination_ok = probe.boostedHits === 0;
  else geo_discrimination_ok = true;

  // Fresh verifier agent — no shared transcript with the build pair.
  const priorPlaybooks = successfulPrior.map(r => ({
    op: r.task.operation, fills: r.build.fills.map(f => f.name),
  }));
  let verdict = "verifier failed to respond";
  let confidence = 0;
  try {
    const raw = await generate(VERIFIER_MODEL, verifierPrompt(task, priorPlaybooks, probe), {
      temperature: 0.1, max_tokens: 250,
    });
    // Tolerate chatter around the JSON: take the outermost {...} span.
    const open = raw.indexOf("{");
    const close = raw.lastIndexOf("}");
    if (open >= 0 && close > open) {
      const parsed = JSON.parse(raw.slice(open, close + 1));
      verdict = parsed.verdict ?? verdict;
      confidence = Number(parsed.confidence) || 0;
    }
  } catch (e) {
    verdict = `verifier parse error: ${(e as Error).message}`;
  }
  return {
    profile_activated: activation.ok,
    warmed_indexes: activation.warmed,
    probe_boost_total: probe.totalBoost,
    probe_boosted_hits: probe.boostedHits,
    probe_top_citations: probe.cites,
    geo_discrimination_ok,
    verdict, confidence,
    duration_secs: Math.round((Date.now() - started) / 1000),
  };
}
// ─────────────────────── round scoring ───────────────────────
// Score one round out of 10:
//   build sealed ......... 3    playbook seeded ...... 1
//   profile activated .... 1    geo discrimination ... 3
//   verifier ≥60% conf ... 2
function scoreRound(r: RoundLedger): { score: number; notes: string[] } {
  const notes: string[] = [];
  let total = 0;
  const award = (points: number, note: string) => { total += points; notes.push(note); };

  if (r.build.ok) award(3, `✓ build sealed (${r.build.fills.map(f => f.name).join(", ")})`);
  else award(0, `✗ build failed: ${r.build.error}`);
  if (r.build.playbook_id) award(1, `✓ seeded id=${r.build.playbook_id}`);
  if (r.verify.profile_activated) award(1, `✓ profile activated (warmed=${r.verify.warmed_indexes})`);
  if (r.verify.geo_discrimination_ok) award(3, `✓ geo discrimination correct (boostedHits=${r.verify.probe_boosted_hits})`);
  else award(0, `✗ geo discrimination failed (boostedHits=${r.verify.probe_boosted_hits})`);
  if (r.verify.confidence >= 60) award(2, `✓ verifier confident (${r.verify.confidence}%): ${r.verify.verdict}`);
  else award(0, `◑ verifier confidence ${r.verify.confidence}%: ${r.verify.verdict}`);
  return { score: total, notes };
}
// ─────────────────────── main loop ───────────────────────
// Main loop: BUILD → VERIFY → score for each deck task, then apply the
// hard viability gate. Exits 0 on success; throws (→ top-level exit 1)
// when the gate fails.
async function main() {
  console.log(`▶ Network proving — ${TASK_DECK.length} rounds, profile=${PROFILE_ID}`);
  console.log(`▶ build pair: ${EXECUTOR_MODEL} + ${REVIEWER_MODEL}; verifier: ${VERIFIER_MODEL}\n`);
  const ledger: RoundLedger[] = [];
  for (let i = 0; i < TASK_DECK.length; i++) {
    const task = TASK_DECK[i];
    // FIX: banner previously ran the round number straight into the task
    // text ("Round 0fill: Welder x2 …") — missing separator.
    console.log(`\n══════════ Round ${i} — ${task.operation} ══════════`);
    console.log(`\n[${task.id}] BUILD phase (two agents collaborating)`);
    const build = await buildPhase(task, task.id);
    console.log(`\n[${task.id}] VERIFY phase (fresh agent + hot-swap profile)`);
    // Verifier sees only PRIOR rounds' ledger; the current round is
    // scored below and pushed afterwards.
    const verify = await verifyPhase(task, ledger);
    console.log(` profile=${verify.profile_activated ? "ok" : "fail"} warmed=${verify.warmed_indexes} `
      + `boosted=${verify.probe_boosted_hits}/10 totalBoost=${verify.probe_boost_total.toFixed(3)} `
      + `cites=${verify.probe_top_citations.length} confidence=${verify.confidence}%`);
    console.log(` verdict: ${verify.verdict}`);
    const round: RoundLedger = { round: i, task, build, verify, score: 0, notes: [] };
    const sc = scoreRound(round);
    round.score = sc.score; round.notes = sc.notes;
    ledger.push(round);
    console.log(`\n Round ${i} score: ${round.score}/10`);
    for (const n of round.notes) console.log(` ${n}`);
  }
  console.log(`\n══════════ Network viability summary ══════════`);
  const total = ledger.reduce((s, r) => s + r.score, 0);
  const max = ledger.length * 10;
  const avg = total / ledger.length;
  for (const r of ledger) console.log(` R${r.round} ${r.task.target_city.padEnd(10)} ${r.task.target_role.padEnd(20)} ${r.score}/10`);
  console.log(`\n TOTAL: ${total}/${max} AVG: ${avg.toFixed(1)}/10`);
  // Hard gate: at least 2/3 of rounds must have a SUCCESSFUL build AND a
  // round score of ≥ 6/10. (Comment previously claimed the gate was on
  // verifier confidence alone — the check is on the composite score.)
  const passed = ledger.filter(r => r.build.ok && r.score >= 6).length;
  if (passed < Math.ceil(ledger.length * 2 / 3)) {
    throw new Error(`network proving gate failed — only ${passed}/${ledger.length} rounds passed (need ≥${Math.ceil(ledger.length * 2 / 3)})`);
  }
  console.log(`\n✓ Network proven over ${passed}/${ledger.length} rounds`);
  process.exit(0);
}
// Fail-fast top level: any uncaught error prints (with stack when
// available) and exits non-zero so CI treats the run as failed.
main().catch((err: unknown) => {
  const e = err as Error & { stack?: string };
  console.error(`\n✗ ${e.message}`);
  if (e.stack) console.error(e.stack);
  process.exit(1);
});