From e4ae5b646eff33e3b4578eea2882f0e9b92f8caf Mon Sep 17 00:00:00 2001
From: root
Date: Mon, 20 Apr 2026 19:21:45 -0500
Subject: [PATCH] =?UTF-8?q?T3=20overview=20tier=20=E2=80=94=20mid-day=20ch?=
 =?UTF-8?q?eckpoints=20+=20cross-day=20lesson?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hot path (T1/T2) stays mistral + qwen2.5. The new T3 tier runs a
thinking model SPARINGLY — after every misplacement, after every N-th
event (default N=3), after the final event, and once post-scenario for
the cross-day lesson.

- agent.ts: generateCloud() for Ollama Cloud (gpt-oss:120b etc.). Uses
  the same /api/generate shape; the thinking field is discarded.
- scenario.ts: runOverviewCheckpoint + runCrossDayLesson. Outputs land
  in checkpoints.jsonl and lesson.md. The lesson also seeds
  playbook_memory under operation "cross-day-lesson-{date}" — future
  runs pick it up through the existing similarity boost.
- Env knobs: LH_OVERVIEW_CLOUD=1 routes T3 to cloud, LH_OVERVIEW_MODEL
  overrides the model (default gpt-oss:20b local, gpt-oss:120b cloud),
  LH_T3_CHECKPOINT_EVERY controls the N-th-event cadence (a value of 0
  or less disables only that trigger), LH_T3_DISABLE=1 turns T3 off
  entirely.

Why this shape: prior feedback_phase19_seed_text.md warned that verbose
seeds dilute the embedding and silently kill the boost. T3's rich prose
goes to lesson.md and to the seed's separate "rationale" field
(truncated to 2000 characters); the embedded "approach" + "context"
stay terse.

Verified end-to-end: local 20b checkpoint 10.9s, lesson 4.0s; cloud
120b lesson 3.7s. Cloud output is both faster AND more specific than
local (sequenced, tactical, logging advice included).
--- tests/multi-agent/agent.ts | 48 +++++++++ tests/multi-agent/scenario.ts | 192 +++++++++++++++++++++++++++++++++- 2 files changed, 238 insertions(+), 2 deletions(-) diff --git a/tests/multi-agent/agent.ts b/tests/multi-agent/agent.ts index b52344b..b16ebca 100644 --- a/tests/multi-agent/agent.ts +++ b/tests/multi-agent/agent.ts @@ -9,6 +9,12 @@ export const GATEWAY = "http://localhost:3100"; export const SIDECAR = "http://localhost:3200"; +// Ollama Cloud — used for the T3 overview tier when LH_OVERVIEW_CLOUD=1. +// Same /api/generate surface as local Ollama; just needs the bearer key. +// Default base and key are read from env so secrets never land in git. +export const OLLAMA_CLOUD_URL = process.env.OLLAMA_CLOUD_URL ?? "https://ollama.com"; +export const OLLAMA_CLOUD_KEY = process.env.OLLAMA_CLOUD_KEY ?? ""; + // --- Shared types --- export type Role = "executor" | "reviewer"; @@ -111,6 +117,48 @@ export async function generate(model: string, prompt: string, opts: { return text; } +// Cloud generate — hits Ollama Cloud directly with the bearer key. Same +// /api/generate shape as local Ollama; `thinking` field (for gpt-oss:Nb) +// is discarded, only `response` is returned. Caller should budget +// num_predict ≥ 400 so thinking-model reasoning has room before the +// visible response starts. +export async function generateCloud(model: string, prompt: string, opts: { + max_tokens?: number; + temperature?: number; + system?: string; +} = {}): Promise { + if (!OLLAMA_CLOUD_KEY) { + throw new Error("OLLAMA_CLOUD_KEY not set; cannot reach Ollama Cloud"); + } + const body: Record = { + model, + prompt, + stream: false, + options: { + temperature: opts.temperature ?? 0.3, + num_predict: Math.max(opts.max_tokens ?? 
800, 400), + }, + }; + if (opts.system) body.system = opts.system; + const resp = await fetch(`${OLLAMA_CLOUD_URL}/api/generate`, { + method: "POST", + headers: { + "Authorization": `Bearer ${OLLAMA_CLOUD_KEY}`, + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + }); + if (!resp.ok) { + throw new Error(`Ollama Cloud ${resp.status}: ${await resp.text().catch(() => "?")}`); + } + const data: any = await resp.json(); + const text = data.response ?? ""; + if (!text) { + throw new Error(`Ollama Cloud returned empty response for ${model}: ${JSON.stringify(data).slice(0, 200)}`); + } + return text; +} + // --- Prompt construction --- const TOOL_CATALOG = ` diff --git a/tests/multi-agent/scenario.ts b/tests/multi-agent/scenario.ts index 1e07132..82da501 100644 --- a/tests/multi-agent/scenario.ts +++ b/tests/multi-agent/scenario.ts @@ -28,6 +28,7 @@ import { hybridSearch, sqlQuery, generate, + generateCloud, parseAction, executorPrompt, reviewerPrompt, @@ -46,6 +47,24 @@ import { join } from "node:path"; const EXECUTOR_MODEL = "mistral:latest"; const REVIEWER_MODEL = "qwen2.5:latest"; const DRAFT_MODEL = "qwen2.5:latest"; // artifact generation; short outputs + +// T3 overview tier. Called sparingly — NOT per tool call. Two insertion +// points: (B) mid-scenario checkpoint after every misplacement event and +// every N events, and (A) cross-day lesson after all events complete. +// gpt-oss:20b is a thinking model: it spends tokens in a hidden reasoning +// block before emitting `response`. Budget accordingly — never under 400. +const OVERVIEW_CLOUD = process.env.LH_OVERVIEW_CLOUD === "1"; +const OVERVIEW_MODEL = process.env.LH_OVERVIEW_MODEL ?? (OVERVIEW_CLOUD ? "gpt-oss:120b" : "gpt-oss:20b"); +const T3_CHECKPOINT_EVERY = Number(process.env.LH_T3_CHECKPOINT_EVERY ?? 3); +const T3_DISABLED = process.env.LH_T3_DISABLE === "1"; + +// Dispatcher: route T3 calls to local sidecar or Ollama Cloud depending +// on the LH_OVERVIEW_CLOUD flag. 
Hot-path T1/T2 always stay local. +async function overviewGenerate(prompt: string, opts: { temperature?: number; max_tokens?: number } = {}): Promise { + if (OVERVIEW_CLOUD) return generateCloud(OVERVIEW_MODEL, prompt, opts); + return generate(OVERVIEW_MODEL, prompt, opts); +} + const MAX_TURNS = 14; const MAX_CONSECUTIVE_DRIFTS = 3; const WORKERS_INDEX = "workers_500k_v1"; @@ -723,13 +742,122 @@ async function resolveWorkerIds(fills: Fill[], event: FillEvent): Promise { + if (T3_DISABLED) return null; + const start = Date.now(); + + const priorSummary = prior.slice(-3).map(p => + `- ${p.event.at} ${p.event.kind} ${p.event.role}×${p.event.count} → ${p.ok ? p.fills.length + "/" + p.event.count + " filled" : "FAIL"}; pool=${p.pool_size ?? "?"}; cites=${p.playbook_citations?.length ?? 0}` + ).join("\n"); + + const thisOne = `This event: ${event.at} ${event.kind} ${event.role}×${event.count} in ${event.city}, ${event.state}. ` + + `Outcome: ${result.ok ? "filled " + result.fills.length + "/" + event.count : "FAILED: " + (result.error ?? "unknown")}. ` + + `Pool size: ${result.pool_size ?? "n/a"}. Turns: ${result.turns}. Playbook citations: ${result.playbook_citations?.length ?? 0}. ` + + `Gap signals: ${result.gap_signals.join("; ") || "none"}.`; + + const prompt = `You are the overview reviewer for a staffing coordinator agent system. A mid-day checkpoint has been triggered. + +Recent events (most recent last): +${priorSummary || "(no prior events)"} + +${thisOne} + +Your job: emit ONE risk flag (≤6 words) and ONE actionable hint (≤25 words) for the NEXT event. Be concrete: name the role, city, or worker class if relevant. Do not restate what happened. 
Think step by step, then output strictly as: + +RISK: +HINT: `; + + let text = ""; + try { + text = await overviewGenerate(prompt, { temperature: 0.2, max_tokens: 600 }); + } catch (e) { + return { + after_event: event.at, + event_kind: event.kind, + ok: false, + model: OVERVIEW_MODEL, + duration_secs: (Date.now() - start) / 1000, + hint: "(T3 unavailable)", + risk: (e as Error).message.slice(0, 80), + }; + } + + const riskMatch = text.match(/RISK:\s*(.+)/i); + const hintMatch = text.match(/HINT:\s*(.+)/i); + return { + after_event: event.at, + event_kind: event.kind, + ok: Boolean(riskMatch && hintMatch), + model: OVERVIEW_MODEL, + duration_secs: (Date.now() - start) / 1000, + risk: (riskMatch?.[1] ?? "(unparsed)").trim().slice(0, 120), + hint: (hintMatch?.[1] ?? text).trim().slice(0, 400), + }; +} + +async function runCrossDayLesson(ctx: ScenarioContext, checkpoints: OverviewCheckpoint[]): Promise { + if (T3_DISABLED) return null; + + const eventDigest = ctx.results.map(r => + `- ${r.event.at} ${r.event.kind} ${r.event.role}×${r.event.count} ${r.event.city},${r.event.state} → ${r.ok ? r.fills.length + " filled" : "FAIL"}; pool=${r.pool_size ?? "?"}; turns=${r.turns}; cites=${r.playbook_citations?.length ?? 0}; gaps=${r.gap_signals.length}` + ).join("\n"); + + const checkpointDigest = checkpoints.length > 0 + ? checkpoints.map(c => `- after ${c.after_event} (${c.event_kind}): risk="${c.risk}" hint="${c.hint}"`).join("\n") + : "(no mid-day checkpoints)"; + + const prompt = `You are the end-of-day lesson writer for a staffing coordinator agent system. The day is done. Distill it. + +Client: ${ctx.spec.client} Date: ${ctx.spec.date} + +Events that ran: +${eventDigest} + +Mid-day checkpoints: +${checkpointDigest} + +Your job: write ONE actionable lesson for future runs that face similar setups. Target audience: the agent tomorrow. Keep the lesson to 3-5 sentences. No filler, no restating. 
Think step by step about what pattern repeated, what to pre-fetch, or what to avoid — then write the lesson as plain prose. + +LESSON:`; + + try { + const text = await overviewGenerate(prompt, { temperature: 0.2, max_tokens: 900 }); + const m = text.match(/LESSON:\s*([\s\S]+)/i); + return (m ? m[1] : text).trim(); + } catch (e) { + return `(T3 lesson unavailable: ${(e as Error).message})`; + } +} + // =================== EOD gap report =================== async function writeRetrospective(ctx: ScenarioContext): Promise { const lines: string[] = []; lines.push(`# Scenario retrospective — ${ctx.spec.client}, ${ctx.spec.date}`); lines.push(""); - lines.push(`Executor: \`${EXECUTOR_MODEL}\` Reviewer: \`${REVIEWER_MODEL}\` Draft: \`${DRAFT_MODEL}\``); + lines.push(`Executor: \`${EXECUTOR_MODEL}\` Reviewer: \`${REVIEWER_MODEL}\` Draft: \`${DRAFT_MODEL}\` Overview(T3): \`${T3_DISABLED ? "disabled" : OVERVIEW_MODEL}\``); lines.push(""); // --- Per-event summary --- @@ -882,11 +1010,16 @@ async function main() { await writeFile(join(out_dir, "sms.md"), `# SMS drafts — ${spec.client}, ${spec.date}\n`); await writeFile(join(out_dir, "emails.md"), `# Client emails — ${spec.client}, ${spec.date}\n`); await writeFile(join(out_dir, "dispatch.jsonl"), ""); + await writeFile(join(out_dir, "checkpoints.jsonl"), ""); + + const checkpoints: OverviewCheckpoint[] = []; console.log(`▶ scenario: ${spec.client}, ${spec.date}, ${spec.events.length} events`); + console.log(`▶ models: exec=${EXECUTOR_MODEL} review=${REVIEWER_MODEL} overview=${T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (OVERVIEW_CLOUD ? " (cloud)" : "")}`); console.log(`▶ out: ${out_dir}\n`); - for (const event of spec.events) { + for (let i = 0; i < spec.events.length; i++) { + const event = spec.events[i]; // Expand misplacement-style exclusions from the current roster: it // wants to replace a worker from a prior event, so grab everyone // booked at that at-label and add as exclusions. 
@@ -916,6 +1049,19 @@ async function main() { ctx.gap_signals.push({ event: event.at, category: category.trim(), detail: rest.join(":").trim() }); } + // Option B — T3 checkpoint after every misplacement, and every N-th event. + const isLast = i === spec.events.length - 1; + const nthHit = T3_CHECKPOINT_EVERY > 0 && ((i + 1) % T3_CHECKPOINT_EVERY === 0); + const shouldCheckpoint = !T3_DISABLED && (event.kind === "misplacement" || nthHit || isLast); + if (shouldCheckpoint) { + const cp = await runOverviewCheckpoint(event, result, ctx.results.slice(0, -1)); + if (cp) { + checkpoints.push(cp); + await appendFile(join(out_dir, "checkpoints.jsonl"), JSON.stringify(cp) + "\n"); + console.log(` T3 checkpoint (${cp.duration_secs.toFixed(1)}s): risk="${cp.risk}" hint="${cp.hint.slice(0, 80)}${cp.hint.length > 80 ? "…" : ""}"`); + } + } + // Small breather to not hammer Ollama on back-to-back runs. await new Promise(r => setTimeout(r, 500)); } @@ -924,6 +1070,48 @@ async function main() { await writeFile(join(out_dir, "roster.json"), JSON.stringify(ctx.roster, null, 2)); await writeFile(join(out_dir, "results.json"), JSON.stringify(ctx.results, null, 2)); + // Option A — T3 cross-day lesson. One final call distills the whole run. + // Saved to lesson.md and also seeded into playbook_memory so tomorrow's + // agent can retrieve it on similar setups. + if (!T3_DISABLED) { + console.log(`\n▶ T3 cross-day lesson via ${OVERVIEW_MODEL}…`); + const tLesson = Date.now(); + const lesson = await runCrossDayLesson(ctx, checkpoints); + const lessonSecs = ((Date.now() - tLesson) / 1000).toFixed(1); + if (lesson) { + await writeFile( + join(out_dir, "lesson.md"), + `# Cross-day lesson — ${ctx.spec.client}, ${ctx.spec.date}\n\n` + + `_Generated by \`${OVERVIEW_MODEL}\` in ${lessonSecs}s. 
` + + `Based on ${ctx.results.length} events + ${checkpoints.length} mid-day checkpoints._\n\n` + + lesson + "\n" + ); + console.log(`✓ lesson (${lessonSecs}s) → ${join(out_dir, "lesson.md")}`); + + // Seed the lesson into playbook_memory for future retrieval. Keep + // the embedded `approach` + `context` terse per feedback_phase19_seed_text.md; + // the rich prose lives in lesson.md and a separate `rationale` field. + try { + const kinds = [...new Set(ctx.spec.events.map(e => e.kind))].join("+"); + const cities = [...new Set(ctx.spec.events.map(e => e.city))].slice(0, 3).join(","); + await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + operation: `cross-day-lesson-${ctx.spec.date}`, + approach: `${kinds} day in ${cities}`, + context: `${ctx.spec.client} ${ctx.spec.date}`, + rationale: lesson.slice(0, 2000), + endorsed_names: [], + append: true, + }), + }); + } catch (e) { + console.log(` (lesson seed skipped: ${(e as Error).message})`); + } + } + } + await writeRetrospective(ctx); const okCount = ctx.results.filter(r => r.ok).length;