root 6e7ca1830e Phase 21 foundation — context stability + chunking pipeline
PRD: add Phase 20 (model matrix, wired) and Phase 21 (context stability,
partial). Phase 21 exists because LLM Team hit this exact wall — running
multi-model ranking on large context silently truncated, rankings
degraded, no pipeline caught it. The stable answer: every agent call
goes through a budget check against the model's declared context_window
minus safety_margin, with a declared overflow_policy when the check
fails.

config/models.json:
- context_window + context_budget per tier
- overflow_policies block: summarize_oldest_tool_results_via_t3,
  chunk_lessons_via_cosine_topk, two_pass_map_reduce,
  escalate_to_kimi_k2_1t_or_split_decision
- chunking_cache spec (data/_chunk_cache/, corpus-hash keyed)

agent.ts:
- estimateTokens() chars/4 biased safe ~15%
- CONTEXT_WINDOWS table (fallback; prod reads models.json)
- assertContextBudget() — throws on overflow with exact numbers, can
  bypass with bypass_budget:true for callers with their own policy
- Wired into generate() and generateCloud() so EVERY call is checked

scenario.ts:
- T3 lesson archive to data/_playbook_lessons/*.json (the old
  /vectors/playbook_memory/seed path was silently failing with HTTP 400
  because it requires 'fill: Role xN in City, ST' operation shape)
- loadPriorLessons() at scenario start — filters by city/state match,
  date-sorted, takes top-3
- prior_lessons.json archived per-run (honest signal for A/B)
- guidanceFor() injects up to 2 prior lessons (≤500 chars each) into
  the executor's per-event context
- Retrospective shows explicit "Prior lessons loaded: N" line

Verified: mistral correctly rejects a 150K-char prompt (7532 tokens
over), gpt-oss:120b accepts it with 90K headroom. The enforcement is
in-band on every call now, not an afterthought.

Full chunking service (Rust) remains deferred to the sprint this feeds:
crates/aibridge/src/budget.rs + chunk.rs + storaged/chunk_cache.rs
2026-04-20 19:34:44 -05:00

471 lines
20 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Shared runtime for one agent. An agent is a role (executor or reviewer),
// a model name, and a conversation the orchestrator hands it. The agent
// produces ONE structured Action per turn; the orchestrator applies tool
// calls and feeds results back.
//
// Fail-fast: every HTTP error, parse error, and Ollama error throws. The
// orchestrator catches at the top and exits non-zero with the full log.
// Service endpoints. The gateway fronts tool/vector/SQL calls; the
// sidecar fronts local model generation. Both are localhost-only in
// this harness.
export const GATEWAY = "http://localhost:3100";
export const SIDECAR = "http://localhost:3200";
// Ollama Cloud — used for the T3 overview tier when LH_OVERVIEW_CLOUD=1.
// Same /api/generate surface as local Ollama; just needs the bearer key.
// Default base and key are read from env so secrets never land in git.
// An empty OLLAMA_CLOUD_KEY makes generateCloud() throw up front.
export const OLLAMA_CLOUD_URL = process.env.OLLAMA_CLOUD_URL ?? "https://ollama.com";
export const OLLAMA_CLOUD_KEY = process.env.OLLAMA_CLOUD_KEY ?? "";
// Rough token estimator — chars/4 biased safe by ~15%. Swap to a real
// tokenizer (tiktoken or provider endpoint) once Phase 21 lands. Good
// enough to stop the "context silently truncated" failure mode today.
export function estimateTokens(text: string): number {
  const approxTokens = text.length / 4;
  return Math.ceil(approxTokens);
}
// Known context windows — matches crates/../config/models.json. Kept in
// code as a fallback so the test harness doesn't crash if the config is
// missing. Production path should read from models.json.
// Keys are the exact model tags passed to generate()/generateCloud();
// values are the total context length in tokens. Unknown models fall
// back to DEFAULT_CONTEXT_WINDOW in assertContextBudget().
export const CONTEXT_WINDOWS: Record<string, number> = {
"mistral:latest": 32768,
"qwen2.5:latest": 32768,
"qwen3:latest": 40960,
"gpt-oss:20b": 131072,
"gpt-oss:120b": 131072,
"qwen3.5:397b": 131072,
"kimi-k2-thinking": 200000,
"kimi-k2:1t": 1048576,
"deepseek-v3.1:671b": 131072,
"glm-4.7": 131072,
};
// Fallbacks used when a model is absent from CONTEXT_WINDOWS or the
// caller supplies no explicit safety_margin.
const DEFAULT_CONTEXT_WINDOW = 32768;
const DEFAULT_SAFETY_MARGIN = 2000;
// Guard every generate call against context overflow. Phase 21's goal is
// to kill silent truncation, so an over-budget prompt throws with the
// exact token arithmetic in the message. Callers running their own
// overflow policy (chunking, T5 gatekeeper) pass bypass: true and get
// the numbers back without the throw.
export function assertContextBudget(
  model: string,
  prompt: string,
  opts: { system?: string; max_tokens?: number; safety_margin?: number; bypass?: boolean } = {}
): { estimated: number; window: number; remaining: number } {
  const ctxWindow = CONTEXT_WINDOWS[model] ?? DEFAULT_CONTEXT_WINDOW;
  const safetyMargin = opts.safety_margin ?? DEFAULT_SAFETY_MARGIN;
  // Budget = prompt + system + reserved output tokens (default 800).
  const promptTokens = estimateTokens(prompt);
  const systemTokens = estimateTokens(opts.system ?? "");
  const outputBudget = opts.max_tokens ?? 800;
  const estimated = promptTokens + systemTokens + outputBudget;
  const remaining = ctxWindow - estimated - safetyMargin;
  if (remaining < 0 && !opts.bypass) {
    const message =
      `context overflow: model=${model} est=${estimated}t window=${ctxWindow}t safety=${safetyMargin}t over=${-remaining}t. ` +
      `Chunk the prompt (see config/models.json overflow_policies) or set bypass:true if you know the risk.`;
    throw new Error(message);
  }
  return { estimated, window: ctxWindow, remaining };
}
// --- Shared types ---
// Which seat an agent occupies in the two-agent loop.
export type Role = "executor" | "reviewer";
// One fill task as the orchestrator hands it to the agents.
export interface TaskSpec {
id: string;
operation: string; // "fill: Welder x2 in Columbus, OH"
target_role: string; // "Welder"
target_count: number; // 2
target_city: string; // "Columbus"
target_state: string; // "OH"
approach_hint?: string; // e.g. "hybrid search"; agent is free to ignore
}
// One entry in the shared, orchestrator-owned log. Both prompt builders
// render the recent tail of this log; `kind` doubles as the rendering
// tag in summarizeEntry().
export interface LogEntry {
turn: number;
role: Role;
model: string;
at: string; // timestamp string — presumably ISO-8601, set by the orchestrator (TODO confirm)
kind:
| "plan"
| "tool_call"
| "tool_result"
| "critique"
| "propose_done"
| "consensus_done"
| "error";
content: any; // shape depends on `kind`; see summarizeEntry() for the per-kind fields read
}
// Action = what an agent returns on one turn. Strict shape so we can
// enforce it at parse time rather than prompt-engineer around malformed
// JSON.
export type Action =
| { kind: "tool_call"; tool: string; args: Record<string, any>; rationale: string }
| { kind: "propose_done"; fills: Fill[]; rationale: string }
| { kind: "critique"; verdict: "continue" | "drift" | "approve_done"; notes: string }
| { kind: "plan"; steps: string[] };
// One proposed candidate inside a propose_done action.
export interface Fill {
candidate_id: string;
name: string;
reason: string;
}
// --- HTTP helpers (fail-fast) ---
// Generic JSON-in/JSON-out request. Throws on any non-2xx status with
// method, URL, status, and response body so failures are diagnosable
// from the orchestrator log alone.
// Fix: the error message concatenated URL and status with no separator
// ("POST http://…/sql404: …"), which read as one mangled URL; a space is
// restored between them.
async function http<T>(method: string, url: string, body?: any): Promise<T> {
  const res = await fetch(url, {
    method,
    headers: { "Content-Type": "application/json" },
    body: body ? JSON.stringify(body) : undefined,
  });
  if (!res.ok) {
    const text = await res.text();
    throw new Error(`${method} ${url} ${res.status}: ${text}`);
  }
  return (await res.json()) as T;
}
// Tool calls land in the Phase 12 audit log keyed by this agent name.
// Distinguishable from human-driven calls (agent=="operator" or similar)
// so post-hoc queries can separate multi-agent runs.
export const TOOL_AGENT_ID = "multi-agent-test";
// Invoke a gateway tool by name; `args` becomes the tool's params and
// the call is attributed to TOOL_AGENT_ID in the audit log.
export async function callTool(tool: string, args: Record<string, any>): Promise<any> {
  const payload = {
    params: args,
    agent: TOOL_AGENT_ID,
  };
  return http("POST", `${GATEWAY}/tools/${tool}/call`, payload);
}
// SQL-filtered semantic search against the vector index — the
// production fill path.
export async function hybridSearch(sql_filter: string, question: string, k = 10): Promise<any> {
  const request = { sql_filter, question, k };
  return http("POST", `${GATEWAY}/vectors/hybrid`, request);
}
// Read-only SELECT against the gateway; rows come back as JSON.
export async function sqlQuery(sql: string): Promise<any> {
  const request = { sql, format: "json" };
  return http("POST", `${GATEWAY}/query/sql`, request);
}
// Sidecar generate. Ollama's default keep_alive (5 min) keeps the model
// warm between turns on its own, so we don't need to pass it through.
// Budget-checked on every call: overflow throws in assertContextBudget
// unless the caller sets bypass_budget.
export async function generate(model: string, prompt: string, opts: {
  max_tokens?: number;
  temperature?: number;
  system?: string;
  bypass_budget?: boolean;
} = {}): Promise<string> {
  assertContextBudget(model, prompt, {
    system: opts.system,
    max_tokens: opts.max_tokens,
    bypass: opts.bypass_budget,
  });
  const request: Record<string, any> = {
    model,
    prompt,
    temperature: opts.temperature ?? 0.3,
    max_tokens: opts.max_tokens ?? 800,
  };
  if (opts.system) {
    request.system = opts.system;
  }
  const reply = await http<any>("POST", `${SIDECAR}/generate`, request);
  const text = reply.text ?? "";
  // Fail loudly on a missing/empty completion rather than passing "" on.
  if (typeof text !== "string" || !text) {
    throw new Error(`generate returned empty text from ${model}: ${JSON.stringify(reply).slice(0, 200)}`);
  }
  return text;
}
// Cloud generate — hits Ollama Cloud directly with the bearer key. Same
// /api/generate shape as local Ollama; `thinking` field (for gpt-oss:Nb)
// is discarded, only `response` is returned. Caller should budget
// num_predict ≥ 400 so thinking-model reasoning has room before the
// visible response starts.
export async function generateCloud(model: string, prompt: string, opts: {
  max_tokens?: number;
  temperature?: number;
  system?: string;
  bypass_budget?: boolean;
} = {}): Promise<string> {
  if (!OLLAMA_CLOUD_KEY) {
    throw new Error("OLLAMA_CLOUD_KEY not set; cannot reach Ollama Cloud");
  }
  assertContextBudget(model, prompt, {
    system: opts.system,
    max_tokens: opts.max_tokens,
    bypass: opts.bypass_budget,
  });
  // Floor num_predict at 400 so thinking models have reasoning room.
  const numPredict = Math.max(opts.max_tokens ?? 800, 400);
  const request: Record<string, any> = {
    model,
    prompt,
    stream: false,
    options: {
      temperature: opts.temperature ?? 0.3,
      num_predict: numPredict,
    },
  };
  if (opts.system) {
    request.system = opts.system;
  }
  const cloudRes = await fetch(`${OLLAMA_CLOUD_URL}/api/generate`, {
    method: "POST",
    headers: {
      "Authorization": `Bearer ${OLLAMA_CLOUD_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(request),
  });
  if (!cloudRes.ok) {
    throw new Error(`Ollama Cloud ${cloudRes.status}: ${await cloudRes.text().catch(() => "?")}`);
  }
  const payload: any = await cloudRes.json();
  const answer = payload.response ?? "";
  if (!answer) {
    throw new Error(`Ollama Cloud returned empty response for ${model}: ${JSON.stringify(payload).slice(0, 200)}`);
  }
  return answer;
}
// --- Prompt construction ---
// Tool catalog injected verbatim into the executor prompt. NOTE: this is
// a RUNTIME STRING — every line below the backtick is text the models
// read, so wording changes here change agent behavior. Keep the example
// args and the workers_500k column list in sync with the gateway's
// actual tool schemas.
const TOOL_CATALOG = `
Available tools (each takes a JSON "args" object):
- hybrid_search(sql_filter: string, question: string, index_name: string, k?: number)
→ Narrow workers via SQL WHERE clause, then rank by semantic match.
→ Canonical production tool for fill tasks. Always use this FIRST.
→ Example args:
{"index_name":"workers_500k_v1",
"sql_filter":"role = 'Forklift Operator' AND city = 'Toledo' AND state = 'OH' AND CAST(availability AS DOUBLE) > 0.5",
"question":"reliable forklift operator Toledo",
"k":10}
- sql(query: string)
→ Raw read-only SELECT. Use for verification (confirm a worker exists,
check city/role/availability) after hybrid_search surfaces candidates.
→ Schema of workers_500k: worker_id, name, role, email, phone, city,
state, zip, skills, certifications, archetype, reliability,
responsiveness, engagement, communications, compliance, availability,
resume_text.
→ Example args:
{"query":"SELECT worker_id, name, role, city, state FROM workers_500k WHERE worker_id = 40123"}
Rules:
- hybrid_search returns sources[] each with {doc_id, chunk_text, score,
sql_verified, playbook_boost, playbook_citations}.
- **ID mapping:** vector doc_ids look like "W500K-7995" (prefix + number).
The SQL worker_id is an INTEGER. Use WHERE worker_id = 7995 directly.
- Names are NOT unique. Always identify by worker_id.
- availability and reliability are stored as text; ALWAYS cast as
DOUBLE in filters: CAST(availability AS DOUBLE) > 0.5.
- Narrative words from the guidance ("shift", "recurring", "expansion",
"emergency") are NOT columns. Only use columns listed above.
- Return EXACTLY ONE JSON object per turn. No markdown fences, no prose.
`;
// Smart per-kind summary so agents see the substance of each prior turn
// without a raw-JSON wall of text. hybrid_search results especially need
// this — raw JSON buries sources[] past any reasonable 400-char truncation.
//
// Fix: the tool_call and sql-rows branches concatenated fields with no
// separator ("...)why" and "5 rows{...}"), making the shared log hard to
// read — the pointless inner template `${c.rationale}` betrays a stripped
// separator. " — " and ": " are restored.
function summarizeEntry(e: LogEntry): string {
  const c = e.content ?? {};
  switch (e.kind) {
    case "plan":
      return `PLAN: ${(c.steps ?? []).map((s: string, i: number) => `${i + 1}.${s}`).join(" ")}`;
    case "tool_call":
      // " — " separates the truncated args blob from the rationale.
      return `TOOL_CALL ${c.tool}(${JSON.stringify(c.args ?? {}).slice(0, 250)})${c.rationale ? ` — ${c.rationale}` : ""}`;
    case "tool_result": {
      if (c.error) return `TOOL_RESULT error: ${c.error}`;
      // hybrid_search response: inline the top-5 sources.
      if (Array.isArray(c.sources)) {
        const head = c.sources.slice(0, 5).map((s: any) =>
          `${s.doc_id}${s.sql_verified ? "✓" : ""} score=${(s.score ?? 0).toFixed(2)}: ${String(s.chunk_text ?? "").slice(0, 80)}`
        ).join(" | ");
        return `TOOL_RESULT hybrid: sql_matches=${c.sql_matches} vector_reranked=${c.vector_reranked} sources=[${head}${c.sources.length > 5 ? ` +${c.sources.length - 5} more` : ""}]`;
      }
      // sql response: first 5 rows as JSON, ": " before the row dump.
      if (Array.isArray(c.rows)) {
        const head = c.rows.slice(0, 5).map((r: any) => JSON.stringify(r)).join(" | ");
        return `TOOL_RESULT sql: ${c.rows.length} rows${c.rows.length > 0 ? `: ${head}${c.rows.length > 5 ? ` +${c.rows.length - 5} more` : ""}` : ""}`;
      }
      // fallback for unrecognized tool payloads
      return `TOOL_RESULT ${JSON.stringify(c).slice(0, 250)}`;
    }
    case "critique":
      return `CRITIQUE verdict=${c.verdict} notes: ${String(c.notes ?? "").slice(0, 200)}`;
    case "propose_done":
      return `PROPOSE_DONE fills=[${(c.fills ?? []).map((f: Fill) => `${f.candidate_id}:${f.name}`).join(", ")}] rationale: ${String(c.rationale ?? "").slice(0, 120)}`;
    case "consensus_done":
      return `CONSENSUS ✓`;
    case "error":
      return `ERROR ${c.message ?? JSON.stringify(c)}`;
  }
  // Defensive fallback for a kind outside the declared union.
  return JSON.stringify(c).slice(0, 200);
}
// Render the tail of the shared log (last 12 entries), one line per
// turn, each prefixed with its turn number and role.
function renderLogForPrompt(log: LogEntry[]): string {
  if (log.length === 0) {
    return "(no turns yet)";
  }
  const lines: string[] = [];
  for (const entry of log.slice(-12)) {
    lines.push(`[t${entry.turn} ${entry.role}] ${summarizeEntry(entry)}`);
  }
  return lines.join("\n");
}
// Crawl the log for every hybrid_search tool_result and collect the
// worker names + ids seen so far. LLMs routinely "forget" earlier turns
// once the conversation grows, so we surface a running ledger in the
// prompt as orchestrator-maintained state. The executor doesn't have to
// track this itself — it just reads it.
function candidatesSeen(log: LogEntry[]): Array<{ doc_id: string; name: string; city: string; state: string }> {
  const ledger = new Map<string, { doc_id: string; name: string; city: string; state: string }>();
  // Only tool_result entries carrying a sources[] array are relevant.
  const sourceLists = log
    .filter(e => e.kind === "tool_result")
    .map(e => (e.content as any)?.sources)
    .filter(Array.isArray);
  for (const sources of sourceLists) {
    for (const src of sources) {
      // chunk_text shape "Name — Role in City, ST. …"
      const text = String(src.chunk_text ?? "");
      const [rawName, tail] = text.split("—", 2);
      if (!rawName || !tail) continue;
      const location = tail.split(" in ")[1] ?? "";
      const [rawCity, rawState] = location.split(",", 2);
      // Keep only the leading letters of the state token ("OH." → "OH").
      const state = (rawState ?? "").trim().replace(/[^A-Za-z].*/, "");
      if (!src.doc_id || !rawName.trim() || !rawCity?.trim() || !state) continue;
      // First sighting of a doc_id wins; later duplicates are ignored.
      if (!ledger.has(src.doc_id)) {
        ledger.set(src.doc_id, {
          doc_id: src.doc_id,
          name: rawName.trim(),
          city: rawCity.trim(),
          state,
        });
      }
    }
  }
  return [...ledger.values()];
}
// Build the executor's per-turn prompt: task spec, tool catalog, the
// orchestrator-maintained candidate ledger, the recent log tail, and the
// strict JSON action shapes the model must emit.
export function executorPrompt(task: TaskSpec, log: LogEntry[]): string {
  const recentTurns = renderLogForPrompt(log);
  const ledger = candidatesSeen(log);
  let ledgerBlock: string;
  if (ledger.length === 0) {
    ledgerBlock = "(no candidates surfaced yet — start with hybrid_search)";
  } else {
    ledgerBlock = ledger.map(s => ` - ${s.doc_id} ${s.name} (${s.city}, ${s.state})`).join("\n");
  }
  return `You are the EXECUTOR agent. Your job is to complete this task:
OPERATION: ${task.operation}
TARGET: ${task.target_count} × ${task.target_role} in ${task.target_city}, ${task.target_state}
${task.approach_hint ? `HINT: ${task.approach_hint}` : ""}
The REVIEWER agent is watching every turn. They will flag drift. Stay on target.
${TOOL_CATALOG}
CANDIDATES SURFACED SO FAR (orchestrator-tracked, do not forget these):
${ledgerBlock}
SHARED LOG (recent turns):
${recentTurns}
Your next action MUST be a JSON object matching one of these shapes:
{"kind":"plan","steps":["short step 1","short step 2",...]}
— use on turn 1 to outline your approach. Steps must be concrete.
{"kind":"tool_call","tool":"...","args":{...},"rationale":"why"}
— call a tool and see its result next turn.
{"kind":"propose_done","fills":[{"candidate_id":"...","name":"First Last","reason":"why them"}],"rationale":"..."}
— propose you've met the target. fills MUST have EXACTLY ${task.target_count} entries — count twice before emitting.
Strategy tip: once "CANDIDATES SURFACED SO FAR" has ≥ ${task.target_count} entries in ${task.target_city}, ${task.target_state} matching ${task.target_role}, verify ONE via the sql tool (to satisfy the reviewer's SQL-verification criterion) and then propose_done with the top ${task.target_count}. Don't keep re-searching.
Respond with ONLY the JSON object. No markdown fences, no prose.`;
}
// Build the reviewer's per-turn prompt: task spec, drift checklist, the
// recent log tail, and the three-verdict critique shape.
// Fix: the approval-criteria line read "If 13 all hold" — the criteria
// are numbered 1..3, so the range dash had been stripped; restored as
// "If 1-3 all hold".
export function reviewerPrompt(task: TaskSpec, log: LogEntry[]): string {
  const logStr = renderLogForPrompt(log);
  // If the most recent executor action was propose_done, the reviewer
  // must commit to an up-or-down vote this turn — "continue" would stall
  // the orchestrator forever. The wider prompt still describes all three
  // verdicts, but we add a hard rule at the end that the model must obey.
  const lastExec = [...log].reverse().find(e => e.role === "executor");
  const awaitingApproval = lastExec?.kind === "propose_done";
  return `You are the REVIEWER agent. The EXECUTOR is trying to complete this task:
OPERATION: ${task.operation}
TARGET: ${task.target_count} × ${task.target_role} in ${task.target_city}, ${task.target_state}
Your job: catch drift. Agents often wander from the actual objective. Specifically watch for:
- Proposing candidates who aren't in ${task.target_city}, ${task.target_state}.
- Proposing candidates who don't have ${task.target_role} skill.
- Proposing fewer or more than ${task.target_count} fills.
- Irrelevant tool calls (e.g. revenue_by_client when the task is a fill).
Available tools (for reference, but YOU don't call them):
- hybrid_search(sql_filter, question, index_name, k) — production fill path
- sql(query) — read-only SELECT for verification
SHARED LOG (recent turns):
${logStr}
Your next action MUST be a JSON object:
{"kind":"critique","verdict":"continue" | "drift" | "approve_done","notes":"..."}
- "continue" → executor is on a reasonable path, let them keep going.
- "drift" → executor is off-track; notes MUST tell them how to redirect.
- "approve_done" → executor's propose_done meets the criteria. Seal it.
APPROVAL CRITERIA (use these only for propose_done):
1. Exactly ${task.target_count} fills.
2. Each fill's name appears in a prior tool_result from ${task.target_city}, ${task.target_state} matching role "${task.target_role}".
3. Executor has SQL-verified at least one of the fills (any prior sql tool_result with that worker).
If 1-3 all hold, return approve_done. Do not demand further verification.
${awaitingApproval ? `
HARD RULE: The executor's most recent action was propose_done. On this turn you CANNOT return "continue" — it would stall the task. Choose approve_done (proposal is valid by the 3 criteria above) or drift (it fails one; state which in notes).` : ""}
Respond with ONLY the JSON object.`;
}
// Parse an agent's raw text response into a structured Action, or throw.
//
// Tolerances applied, in order:
//   1. Strip markdown ``` / ```json fences.
//   2. Extract the first {...} span, discarding surrounding prose.
//   3. Repair a stray ")" before a closing brace (qwen2.5 emits this on
//      tool_call) and trailing commas. NOTE: these regex repairs could in
//      principle corrupt string values containing ")}" or ",}" — accepted
//      tradeoff for unattended runs.
//   4. Reviewer verdicts emitted AS the `kind` field (qwen2.5, mistral)
//      are re-wrapped into the canonical critique shape.
// Fix: the tool_call shape check used `typeof obj.args === "object"`,
// which accepts null (typeof null is "object"); null args now fail fast
// here instead of crashing downstream in callTool.
export function parseAction(raw: string, role: Role): Action {
  // Models sometimes wrap JSON in ```json fences; strip them.
  let s = raw.trim();
  if (s.startsWith("```")) {
    s = s.replace(/^```(?:json)?\n?/, "").replace(/```$/, "").trim();
  }
  // Find the first {...} block.
  const start = s.indexOf("{");
  const end = s.lastIndexOf("}");
  if (start < 0 || end <= start) {
    throw new Error(`no JSON object in ${role} response: ${raw.slice(0, 300)}`);
  }
  let json = s.slice(start, end + 1);
  // Soft-tolerate common model mistakes: stray `)` before closing brace,
  // trailing commas, etc. Fix the cheapest ones that are unambiguous.
  json = json.replace(/\)\s*\}/g, "}"); // "...)}" → "...}"
  json = json.replace(/,(\s*[}\]])/g, "$1"); // trailing comma before } or ]
  let obj: any;
  try {
    obj = JSON.parse(json);
  } catch (e) {
    throw new Error(`invalid JSON from ${role}: ${(e as Error).message} | raw: ${json.slice(0, 300)}`);
  }
  if (role === "executor") {
    if (obj.kind === "plan" && Array.isArray(obj.steps)) return obj as Action;
    // Require a non-null args object — typeof null === "object".
    if (obj.kind === "tool_call" && typeof obj.tool === "string" && obj.args !== null && typeof obj.args === "object") return obj as Action;
    if (obj.kind === "propose_done" && Array.isArray(obj.fills)) return obj as Action;
    // Tolerance: some model outputs put a stray closing paren or
    // trailing garbage after the main object. If the kind looks
    // recognizable but shape doesn't match, bubble a cleaner error so
    // the orchestrator's soft-fail path doesn't swallow it.
    throw new Error(`executor returned unexpected shape: ${JSON.stringify(obj).slice(0, 200)}`);
  } else {
    // Normalize: some models (qwen2.5, mistral) emit the verdict AS the
    // `kind` field directly instead of nesting it under a "critique"
    // wrapper. Accept both shapes rather than hard-failing — the
    // semantic content is identical, and rejecting would stall the
    // orchestrator on a cosmetic schema miss.
    if (obj.kind === "critique" && ["continue", "drift", "approve_done"].includes(obj.verdict)) {
      return obj as Action;
    }
    if (["continue", "drift", "approve_done"].includes(obj.kind)) {
      return { kind: "critique", verdict: obj.kind, notes: obj.notes ?? "" } as Action;
    }
    throw new Error(`reviewer returned unexpected shape: ${JSON.stringify(obj).slice(0, 200)}`);
  }
}