T3 overview tier — mid-day checkpoints + cross-day lesson

Hot path (T1/T2) stays mistral + qwen2.5. The new T3 tier runs a
thinking model SPARINGLY — after every misplacement, every N-th event
(default N=3), and once post-scenario for the cross-day lesson.

- agent.ts: generateCloud() for Ollama Cloud (gpt-oss:120b etc). Uses
  the same /api/generate shape; thinking field is discarded.
- scenario.ts: runOverviewCheckpoint + runCrossDayLesson. Outputs land
  in checkpoints.jsonl and lesson.md. Lesson also seeds playbook_memory
  under operation "cross-day-lesson-{date}" — future runs pick it up
  through the existing similarity boost.
- Env knobs: LH_OVERVIEW_CLOUD=1 routes T3 to cloud, LH_OVERVIEW_MODEL
  overrides (default gpt-oss:20b local, gpt-oss:120b cloud),
  LH_T3_CHECKPOINT_EVERY controls cadence, LH_T3_DISABLE=1 turns it off.

Why this shape: prior feedback (feedback_phase19_seed_text.md) warned that verbose
seeds dilute the embedding and silently kill the similarity boost. T3's rich prose
goes to lesson.md; the embedded "approach" + "context" fields stay terse.

Verified end-to-end: local 20b checkpoint 10.9s, lesson 4.0s; cloud
120b lesson 3.7s. Cloud output is both faster AND more specific than
local (sequenced, tactical, logging advice included).
This commit is contained in:
root 2026-04-20 19:21:45 -05:00
parent 0ff091c173
commit e4ae5b646e
2 changed files with 238 additions and 2 deletions

View File

@ -9,6 +9,12 @@
// Local service endpoints used by the agent/scenario code.
export const GATEWAY = "http://localhost:3100";
export const SIDECAR = "http://localhost:3200";
// Ollama Cloud — used for the T3 overview tier when LH_OVERVIEW_CLOUD=1.
// Same /api/generate surface as local Ollama; just needs the bearer key.
// Default base and key are read from env so secrets never land in git.
export const OLLAMA_CLOUD_URL = process.env.OLLAMA_CLOUD_URL ?? "https://ollama.com";
// Empty string means "not configured"; generateCloud() throws in that case.
export const OLLAMA_CLOUD_KEY = process.env.OLLAMA_CLOUD_KEY ?? "";
// --- Shared types ---
// Which side of the agent loop a model call plays.
export type Role = "executor" | "reviewer";
@ -111,6 +117,48 @@ export async function generate(model: string, prompt: string, opts: {
return text;
}
// Cloud generate — hits Ollama Cloud directly with the bearer key. Same
// /api/generate shape as local Ollama; `thinking` field (for gpt-oss:Nb)
// is discarded, only `response` is returned. Caller should budget
// num_predict ≥ 400 so thinking-model reasoning has room before the
// visible response starts.
//
// @param model  cloud model tag, e.g. "gpt-oss:120b"
// @param prompt passed straight through to /api/generate
// @param opts   max_tokens (floored at 400), temperature (default 0.3),
//               optional system prompt
// @returns the model's visible `response` text
// @throws when OLLAMA_CLOUD_KEY is unset, on a non-2xx status, or when
//         the response body has no usable `response` string
export async function generateCloud(model: string, prompt: string, opts: {
  max_tokens?: number;
  temperature?: number;
  system?: string;
} = {}): Promise<string> {
  if (!OLLAMA_CLOUD_KEY) {
    throw new Error("OLLAMA_CLOUD_KEY not set; cannot reach Ollama Cloud");
  }
  // Typed request body instead of Record<string, any> — keeps the
  // payload shape checked and documents the non-streaming contract.
  const body: {
    model: string;
    prompt: string;
    stream: boolean;
    system?: string;
    options: { temperature: number; num_predict: number };
  } = {
    model,
    prompt,
    stream: false,
    options: {
      temperature: opts.temperature ?? 0.3,
      // Never let a caller's small max_tokens starve the hidden
      // reasoning block gpt-oss emits before its visible response.
      num_predict: Math.max(opts.max_tokens ?? 800, 400),
    },
  };
  if (opts.system) body.system = opts.system;
  const resp = await fetch(`${OLLAMA_CLOUD_URL}/api/generate`, {
    method: "POST",
    headers: {
      "Authorization": `Bearer ${OLLAMA_CLOUD_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(body),
  });
  if (!resp.ok) {
    // Best-effort error body; "?" when the body itself cannot be read.
    throw new Error(`Ollama Cloud ${resp.status}: ${await resp.text().catch(() => "?")}`);
  }
  // Narrow the untyped JSON instead of using `any`: only a string
  // `response` counts as a usable completion.
  const data = (await resp.json()) as { response?: unknown };
  const text = typeof data.response === "string" ? data.response : "";
  if (!text) {
    throw new Error(`Ollama Cloud returned empty response for ${model}: ${JSON.stringify(data).slice(0, 200)}`);
  }
  return text;
}
// --- Prompt construction ---
const TOOL_CATALOG = `

View File

@ -28,6 +28,7 @@ import {
hybridSearch,
sqlQuery,
generate,
generateCloud,
parseAction,
executorPrompt,
reviewerPrompt,
@ -46,6 +47,24 @@ import { join } from "node:path";
// Hot-path models (T1/T2) — always local.
const EXECUTOR_MODEL = "mistral:latest";
const REVIEWER_MODEL = "qwen2.5:latest";
const DRAFT_MODEL = "qwen2.5:latest"; // artifact generation; short outputs
// T3 overview tier. Called sparingly — NOT per tool call. Two insertion
// points: (B) mid-scenario checkpoint after every misplacement event and
// every N events, and (A) cross-day lesson after all events complete.
// gpt-oss:20b is a thinking model: it spends tokens in a hidden reasoning
// block before emitting `response`. Budget accordingly — never under 400.
const OVERVIEW_CLOUD = process.env.LH_OVERVIEW_CLOUD === "1";
const OVERVIEW_MODEL = process.env.LH_OVERVIEW_MODEL ?? (OVERVIEW_CLOUD ? "gpt-oss:120b" : "gpt-oss:20b");
// Checkpoint cadence. A malformed LH_T3_CHECKPOINT_EVERY used to become
// NaN and silently disable the every-N cadence; fall back to the default
// 3 instead. An explicit "0" still disables it (the guard is NaN-only).
const T3_CHECKPOINT_EVERY = (() => {
  const n = Number(process.env.LH_T3_CHECKPOINT_EVERY ?? 3);
  return Number.isNaN(n) ? 3 : n;
})();
const T3_DISABLED = process.env.LH_T3_DISABLE === "1";
// Single entry point for every T3 call. Chooses the backend per call:
// Ollama Cloud when LH_OVERVIEW_CLOUD=1, otherwise local generate().
// The T1/T2 hot path never goes through here and always stays local.
async function overviewGenerate(prompt: string, opts: { temperature?: number; max_tokens?: number } = {}): Promise<string> {
  return OVERVIEW_CLOUD
    ? generateCloud(OVERVIEW_MODEL, prompt, opts)
    : generate(OVERVIEW_MODEL, prompt, opts);
}
// NOTE(review): presumably the per-event turn budget for the agent loop — confirm against the event loop.
const MAX_TURNS = 14;
// NOTE(review): looks like a drift circuit-breaker threshold — confirm where it's checked.
const MAX_CONSECUTIVE_DRIFTS = 3;
// Name of the worker-search index used for candidate lookups.
const WORKERS_INDEX = "workers_500k_v1";
@ -723,13 +742,122 @@ async function resolveWorkerIds(fills: Fill[], event: FillEvent): Promise<Fill[]
return resolved;
}
// =================== T3 overview tier ===================
// Called sparingly so reasoning overhead stays amortized.
// (B) Checkpoint — after every misplacement AND every N-th event.
// (A) Cross-day lesson — once at end of scenario.
// Results land in `checkpoints.jsonl` and `lesson.md`, and the lesson
// seeds playbook_memory under operation "cross-day-lesson-{date}" so
// future scenarios can surface it on similar setups.
// One record per T3 checkpoint call, serialized as one line of
// checkpoints.jsonl. `ok` is false when the model's output could not be
// parsed into RISK/HINT, or when the call itself failed.
interface OverviewCheckpoint {
  after_event: string; // event.at label
  event_kind: EventKind;
  ok: boolean;
  model: string; // which T3 model produced (or failed to produce) this
  duration_secs: number; // wall-clock time of the T3 call
  hint: string; // T3's "what to do differently next time"
  risk: string; // T3's named risk flag (the error message on failure)
}
// (B) Mid-day checkpoint. Condenses the just-finished event plus up to
// three prior outcomes into a short prompt and asks the T3 model for ONE
// risk flag and ONE hint aimed at the NEXT event.
//
// @param event  the event that just completed
// @param result the outcome of that event
// @param prior  all earlier results this scenario (last 3 are summarized)
// @returns a checkpoint record — ok=false when the T3 call failed or its
//          output could not be parsed — or null when T3 is disabled.
//          Never throws: a T3 outage must not take down the scenario.
async function runOverviewCheckpoint(
  event: FillEvent,
  result: EventResult,
  prior: EventResult[],
): Promise<OverviewCheckpoint | null> {
  if (T3_DISABLED) return null;
  const start = Date.now();
  // " — " separates the event header from its outcome; previously the
  // count and the fill ratio ran together (e.g. "×53/5 filled").
  const priorSummary = prior.slice(-3).map(p =>
    `- ${p.event.at} ${p.event.kind} ${p.event.role}×${p.event.count} — ${p.ok ? p.fills.length + "/" + p.event.count + " filled" : "FAIL"}; pool=${p.pool_size ?? "?"}; cites=${p.playbook_citations?.length ?? 0}`
  ).join("\n");
  const thisOne = `This event: ${event.at} ${event.kind} ${event.role}×${event.count} in ${event.city}, ${event.state}. `
    + `Outcome: ${result.ok ? "filled " + result.fills.length + "/" + event.count : "FAILED: " + (result.error ?? "unknown")}. `
    + `Pool size: ${result.pool_size ?? "n/a"}. Turns: ${result.turns}. Playbook citations: ${result.playbook_citations?.length ?? 0}. `
    + `Gap signals: ${result.gap_signals.join("; ") || "none"}.`;
  const prompt = `You are the overview reviewer for a staffing coordinator agent system. A mid-day checkpoint has been triggered.
Recent events (most recent last):
${priorSummary || "(no prior events)"}
${thisOne}
Your job: emit ONE risk flag (6 words) and ONE actionable hint (25 words) for the NEXT event. Be concrete: name the role, city, or worker class if relevant. Do not restate what happened. Think step by step, then output strictly as:
RISK: <flag>
HINT: <hint>`;
  let text = "";
  try {
    // 600 tokens leaves headroom for the thinking model's hidden
    // reasoning block before RISK/HINT appear.
    text = await overviewGenerate(prompt, { temperature: 0.2, max_tokens: 600 });
  } catch (e) {
    // Degrade gracefully: record the failure as a checkpoint row so the
    // jsonl stream stays complete, with the error text in `risk`.
    return {
      after_event: event.at,
      event_kind: event.kind,
      ok: false,
      model: OVERVIEW_MODEL,
      duration_secs: (Date.now() - start) / 1000,
      hint: "(T3 unavailable)",
      risk: (e as Error).message.slice(0, 80),
    };
  }
  // Parse leniently: the model may wrap RISK:/HINT: in extra prose.
  const riskMatch = text.match(/RISK:\s*(.+)/i);
  const hintMatch = text.match(/HINT:\s*(.+)/i);
  return {
    after_event: event.at,
    event_kind: event.kind,
    ok: Boolean(riskMatch && hintMatch),
    model: OVERVIEW_MODEL,
    duration_secs: (Date.now() - start) / 1000,
    // Cap lengths so a rambling model can't bloat checkpoints.jsonl.
    risk: (riskMatch?.[1] ?? "(unparsed)").trim().slice(0, 120),
    hint: (hintMatch?.[1] ?? text).trim().slice(0, 400),
  };
}
// (A) Cross-day lesson — one T3 call at end of scenario that distills the
// whole day (event digest + mid-day checkpoints) into a 3-5 sentence
// lesson for future runs.
//
// @param ctx         finished scenario context (spec + per-event results)
// @param checkpoints the mid-day checkpoints gathered during the run
// @returns lesson prose, a "(T3 lesson unavailable: …)" placeholder on
//          failure, or null when T3 is disabled. Never throws.
async function runCrossDayLesson(ctx: ScenarioContext, checkpoints: OverviewCheckpoint[]): Promise<string | null> {
  if (T3_DISABLED) return null;
  // " — " separates the event header from its outcome; previously the
  // state and the fill count ran together (e.g. "TX12 filled").
  const eventDigest = ctx.results.map(r =>
    `- ${r.event.at} ${r.event.kind} ${r.event.role}×${r.event.count} ${r.event.city},${r.event.state} — ${r.ok ? r.fills.length + " filled" : "FAIL"}; pool=${r.pool_size ?? "?"}; turns=${r.turns}; cites=${r.playbook_citations?.length ?? 0}; gaps=${r.gap_signals.length}`
  ).join("\n");
  const checkpointDigest = checkpoints.length > 0
    ? checkpoints.map(c => `- after ${c.after_event} (${c.event_kind}): risk="${c.risk}" hint="${c.hint}"`).join("\n")
    : "(no mid-day checkpoints)";
  const prompt = `You are the end-of-day lesson writer for a staffing coordinator agent system. The day is done. Distill it.
Client: ${ctx.spec.client} Date: ${ctx.spec.date}
Events that ran:
${eventDigest}
Mid-day checkpoints:
${checkpointDigest}
Your job: write ONE actionable lesson for future runs that face similar setups. Target audience: the agent tomorrow. Keep the lesson to 3-5 sentences. No filler, no restating. Think step by step about what pattern repeated, what to pre-fetch, or what to avoid then write the lesson as plain prose.
LESSON:`;
  try {
    // 900 tokens: lesson prose is longer than a checkpoint hint, and the
    // thinking model still needs hidden-reasoning headroom.
    const text = await overviewGenerate(prompt, { temperature: 0.2, max_tokens: 900 });
    // Accept output with or without the LESSON: prefix echoed back.
    const m = text.match(/LESSON:\s*([\s\S]+)/i);
    return (m ? m[1] : text).trim();
  } catch (e) {
    // Never fail the scenario over a missing lesson; surface the error
    // inline so lesson.md (and the log) show what happened.
    return `(T3 lesson unavailable: ${(e as Error).message})`;
  }
}
// =================== EOD gap report ===================
async function writeRetrospective(ctx: ScenarioContext): Promise<void> {
const lines: string[] = [];
lines.push(`# Scenario retrospective — ${ctx.spec.client}, ${ctx.spec.date}`);
lines.push("");
lines.push(`Executor: \`${EXECUTOR_MODEL}\` Reviewer: \`${REVIEWER_MODEL}\` Draft: \`${DRAFT_MODEL}\``);
lines.push(`Executor: \`${EXECUTOR_MODEL}\` Reviewer: \`${REVIEWER_MODEL}\` Draft: \`${DRAFT_MODEL}\` Overview(T3): \`${T3_DISABLED ? "disabled" : OVERVIEW_MODEL}\``);
lines.push("");
// --- Per-event summary ---
@ -882,11 +1010,16 @@ async function main() {
await writeFile(join(out_dir, "sms.md"), `# SMS drafts — ${spec.client}, ${spec.date}\n`);
await writeFile(join(out_dir, "emails.md"), `# Client emails — ${spec.client}, ${spec.date}\n`);
await writeFile(join(out_dir, "dispatch.jsonl"), "");
await writeFile(join(out_dir, "checkpoints.jsonl"), "");
const checkpoints: OverviewCheckpoint[] = [];
console.log(`▶ scenario: ${spec.client}, ${spec.date}, ${spec.events.length} events`);
console.log(`▶ models: exec=${EXECUTOR_MODEL} review=${REVIEWER_MODEL} overview=${T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (OVERVIEW_CLOUD ? " (cloud)" : "")}`);
console.log(`▶ out: ${out_dir}\n`);
for (const event of spec.events) {
for (let i = 0; i < spec.events.length; i++) {
const event = spec.events[i];
// Expand misplacement-style exclusions from the current roster: it
// wants to replace a worker from a prior event, so grab everyone
// booked at that at-label and add as exclusions.
@ -916,6 +1049,19 @@ async function main() {
ctx.gap_signals.push({ event: event.at, category: category.trim(), detail: rest.join(":").trim() });
}
// Option B — T3 checkpoint after every misplacement, and every N-th event.
const isLast = i === spec.events.length - 1;
const nthHit = T3_CHECKPOINT_EVERY > 0 && ((i + 1) % T3_CHECKPOINT_EVERY === 0);
const shouldCheckpoint = !T3_DISABLED && (event.kind === "misplacement" || nthHit || isLast);
if (shouldCheckpoint) {
const cp = await runOverviewCheckpoint(event, result, ctx.results.slice(0, -1));
if (cp) {
checkpoints.push(cp);
await appendFile(join(out_dir, "checkpoints.jsonl"), JSON.stringify(cp) + "\n");
console.log(` T3 checkpoint (${cp.duration_secs.toFixed(1)}s): risk="${cp.risk}" hint="${cp.hint.slice(0, 80)}${cp.hint.length > 80 ? "…" : ""}"`);
}
}
// Small breather to not hammer Ollama on back-to-back runs.
await new Promise(r => setTimeout(r, 500));
}
@ -924,6 +1070,48 @@ async function main() {
await writeFile(join(out_dir, "roster.json"), JSON.stringify(ctx.roster, null, 2));
await writeFile(join(out_dir, "results.json"), JSON.stringify(ctx.results, null, 2));
// Option A — T3 cross-day lesson. One final call distills the whole run.
// Saved to lesson.md and also seeded into playbook_memory so tomorrow's
// agent can retrieve it on similar setups.
if (!T3_DISABLED) {
console.log(`\n▶ T3 cross-day lesson via ${OVERVIEW_MODEL}`);
const tLesson = Date.now();
const lesson = await runCrossDayLesson(ctx, checkpoints);
const lessonSecs = ((Date.now() - tLesson) / 1000).toFixed(1);
if (lesson) {
await writeFile(
join(out_dir, "lesson.md"),
`# Cross-day lesson — ${ctx.spec.client}, ${ctx.spec.date}\n\n`
+ `_Generated by \`${OVERVIEW_MODEL}\` in ${lessonSecs}s. `
+ `Based on ${ctx.results.length} events + ${checkpoints.length} mid-day checkpoints._\n\n`
+ lesson + "\n"
);
console.log(`✓ lesson (${lessonSecs}s) → ${join(out_dir, "lesson.md")}`);
// Seed the lesson into playbook_memory for future retrieval. Keep
// the embedded `approach` + `context` terse per feedback_phase19_seed_text.md;
// the rich prose lives in lesson.md and a separate `rationale` field.
try {
const kinds = [...new Set(ctx.spec.events.map(e => e.kind))].join("+");
const cities = [...new Set(ctx.spec.events.map(e => e.city))].slice(0, 3).join(",");
await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
operation: `cross-day-lesson-${ctx.spec.date}`,
approach: `${kinds} day in ${cities}`,
context: `${ctx.spec.client} ${ctx.spec.date}`,
rationale: lesson.slice(0, 2000),
endorsed_names: [],
append: true,
}),
});
} catch (e) {
console.log(` (lesson seed skipped: ${(e as Error).message})`);
}
}
}
await writeRetrospective(ctx);
const okCount = ctx.results.filter(r => r.ok).length;