Phase 23 refinement — per-staffer tool_level variance

Staffer.tool_level now controls which subsystems a specific run gets:

- full — qwen3.5 + qwen3 + cloud T3 + cloud rescue
- local — qwen3.5 + qwen3 + local gpt-oss:20b T3 + rescue
- basic — qwen2.5 + qwen2.5 + local T3, no rescue
- minimal — qwen2.5 + qwen2.5, NO T3, NO rescue. Playbook inheritance only.

applyToolLevel() resets the module-scoped ACTIVE_* slots to the env defaults at the start of each run and then applies the staffer's overrides, so a prior staffer's overrides never leak. Hot-path code reads ACTIVE_EXECUTOR / ACTIVE_REVIEWER / ACTIVE_T3_DISABLED / ACTIVE_OVERVIEW_CLOUD / ACTIVE_RETRY_ON_FAIL instead of the baked constants.

The architectural question this answers: does playbook_memory inheritance carry enough knowledge to let a weakly-tooled coordinator still produce usable outcomes? "Minimal" Alex runs a qwen2.5 executor and reviewer with no T3 overseer and no cloud rescue. If Alex still fills events at a reasonable rate, the playbook system is the real knowledge carrier — the senior stack is nice-to-have, not the sine qua non.

Demo personas mapped:

- Maria (senior, 48mo, full)
- James (mid, 14mo, local)
- Sam (junior, 4mo, basic)
- Alex (trainee, 1mo, minimal)

Same 3 contracts (Nashville downtown, Joliet warehouse, Indianapolis assembly) across all four → 12 runs. KB + kb_staffer_report.py leaderboard already wired; competence_score will now reflect real tool asymmetry instead of LLM sampling variance.
2026-04-20 22:50:05 -05:00 · 2026-04-20 22:50:05 -05:00 · 5e89407939
commit 5e89407939
parent 6b71c8e9b2
14 changed files with 119 additions and 42 deletions
--- a/tests/multi-agent/gen_staffer_demo.ts
+++ b/tests/multi-agent/gen_staffer_demo.ts
@ -8,11 +8,15 @@
 import { mkdir, writeFile } from "node:fs/promises";
 import { join } from "node:path";

+// Per-staffer tool_level mirrors the real-world asymmetry: senior gets
+// the most powerful stack, trainee gets the least. The architectural
+// question is whether the playbook inheritance is strong enough to let
+// the trainee still produce usable outcomes when the big tools are off.
 const STAFFERS = [
-  { id: "S-001", name: "Maria Chen",   tenure_months: 48, role: "senior"  as const },
-  { id: "S-002", name: "James Park",   tenure_months: 14, role: "mid"     as const },
-  { id: "S-003", name: "Sam Torres",   tenure_months: 4,  role: "junior"  as const },
-  { id: "S-004", name: "Alex Rivera",  tenure_months: 1,  role: "trainee" as const },
+  { id: "S-001", name: "Maria Chen",   tenure_months: 48, role: "senior"  as const, tool_level: "full"    as const },
+  { id: "S-002", name: "James Park",   tenure_months: 14, role: "mid"     as const, tool_level: "local"   as const },
+  { id: "S-003", name: "Sam Torres",   tenure_months: 4,  role: "junior"  as const, tool_level: "basic"   as const },
+  { id: "S-004", name: "Alex Rivera",  tenure_months: 1,  role: "trainee" as const, tool_level: "minimal" as const },
 ];

 // Three contract shapes — one downtown assembly, one warehouse ramp,
--- a/tests/multi-agent/scenario.ts
+++ b/tests/multi-agent/scenario.ts
@ -85,6 +85,49 @@ const T3_DISABLED = process.env.LH_T3_DISABLE === "1";
 // LH_RETRY_ON_FAIL=0 to compare baseline outcomes without rescue.
 const RETRY_ON_FAIL = process.env.LH_RETRY_ON_FAIL !== "0";

+// Phase 23 refinement — per-staffer tool_level overrides. Evaluated
+// per run in main() once we know the spec's staffer. These are
+// module-scoped mutable slots intentionally — the constants above
+// are the DEFAULTS; main() flips the slots for the duration of the
+// run based on staffer.tool_level before the event loop starts.
+let ACTIVE_EXECUTOR = EXECUTOR_MODEL;
+let ACTIVE_REVIEWER = REVIEWER_MODEL;
+let ACTIVE_T3_DISABLED = T3_DISABLED;
+let ACTIVE_OVERVIEW_CLOUD = OVERVIEW_CLOUD;
+let ACTIVE_RETRY_ON_FAIL = RETRY_ON_FAIL;
+
+/**
+ * Resets every ACTIVE_* slot to its env-derived default, then applies the
+ * overrides for the given staffer tool level.
+ *
+ * Only the slots named in a case are overridden; everything else keeps the
+ * env default. In particular, "full" and "local" only choose cloud vs local
+ * T3 routing — they do not force T3 or rescue back on if the environment
+ * disabled them, and "basic" leaves ACTIVE_T3_DISABLED at its env default.
+ *
+ * @param level - The staffer's tool level. A falsy value (no staffer, or no
+ *   tool_level on the staffer) leaves the env defaults in place.
+ */
+function applyToolLevel(level: Staffer["tool_level"] | undefined): void {
+  // Start from env defaults each time so a previous staffer's overrides
+  // don't leak.
+  ACTIVE_EXECUTOR = EXECUTOR_MODEL;
+  ACTIVE_REVIEWER = REVIEWER_MODEL;
+  ACTIVE_T3_DISABLED = T3_DISABLED;
+  ACTIVE_OVERVIEW_CLOUD = OVERVIEW_CLOUD;
+  ACTIVE_RETRY_ON_FAIL = RETRY_ON_FAIL;
+  if (!level) return;
+  switch (level) {
+    case "full":
+      ACTIVE_OVERVIEW_CLOUD = true;
+      break;
+    case "local":
+      ACTIVE_OVERVIEW_CLOUD = false;
+      break;
+    case "basic":
+      ACTIVE_EXECUTOR = "qwen2.5:latest";
+      ACTIVE_REVIEWER = "qwen2.5:latest";
+      ACTIVE_OVERVIEW_CLOUD = false;
+      ACTIVE_RETRY_ON_FAIL = false;
+      break;
+    case "minimal":
+      ACTIVE_EXECUTOR = "qwen2.5:latest";
+      ACTIVE_REVIEWER = "qwen2.5:latest";
+      ACTIVE_T3_DISABLED = true;
+      ACTIVE_OVERVIEW_CLOUD = false;
+      ACTIVE_RETRY_ON_FAIL = false;
+      break;
+    default: {
+      // Compile-time exhaustiveness check: adding a new tool level to the
+      // Staffer union without a case here becomes a type error.
+      const unknownLevel: never = level;
+      // Runtime guard: tool_level is set in scenario JSON files, so a typo
+      // can reach this point despite the static type (presumably the spec
+      // is not schema-validated — confirm against the loader). Warn rather
+      // than throw so the run still proceeds on env defaults, as before.
+      console.warn(`▶ unknown tool_level ${JSON.stringify(unknownLevel)} — using env defaults`);
+    }
+  }
+}
+
 // Dispatcher: route T3 calls to local sidecar or Ollama Cloud depending
 // on the LH_OVERVIEW_CLOUD flag. Hot-path T1/T2 always stay local.
 // T3 outputs are free-form prose (lesson/hint), so shape=text — the
@ -95,7 +138,7 @@ async function overviewGenerate(prompt: string, opts: { temperature?: number; ma
    max_tokens: opts.max_tokens ?? 1000,
    shape: "text",
    max_continuations: 2,
-    cloud: OVERVIEW_CLOUD,
+    cloud: ACTIVE_OVERVIEW_CLOUD,
  });
 }

@ -148,6 +191,19 @@ interface Staffer {
  name: string;                                      // "Maria Chen"
  tenure_months: number;
  role: "senior" | "mid" | "junior" | "trainee";
+  // Phase 23 refinement — tool_level controls which subsystems this
+  // staffer's runs get to use. The mechanism always leaves
+  // playbook_memory ON so inherited playbooks drive the outcome even
+  // when T3 / cloud rescue / the bigger executor are disabled.
+  //
+  //   full     — qwen3.5 executor + qwen3 reviewer + cloud T3 + rescue
+  //   local    — qwen3.5 + qwen3 + local gpt-oss:20b T3 + rescue
+  //   basic    — qwen2.5 + qwen2.5 + local T3, no rescue
+  //   minimal  — qwen2.5 + qwen2.5, NO T3, NO rescue. Only playbook
+  //              inheritance to lean on. This is the honest test of
+  //              whether the playbook system carries knowledge on its
+  //              own.
+  tool_level?: "full" | "local" | "basic" | "minimal";
 }

 interface ScenarioSpec {
@ -450,7 +506,7 @@ async function runAgentFill(
    // reasoning. Burning ~650 thinking tokens on a 400-token JSON was
    // exactly the bug we just solved.
    const execRaw = await generateContinuable(
-      EXECUTOR_MODEL,
+      ACTIVE_EXECUTOR,
      withExtras(executorPrompt(task, log)),
      {
        temperature: 0.2,
@ -459,7 +515,7 @@ async function runAgentFill(
        max_continuations: 3,
        think: false,
        on_continuation: (n, len) =>
-          append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "note",
+          append({ turn, role: "executor", model: ACTIVE_EXECUTOR, kind: "note",
            content: { continuation: n, combined_chars: len } }),
      },
    );
@ -467,11 +523,11 @@ async function runAgentFill(
    try {
      execAction = parseAction(execRaw, "executor");
    } catch (e) {
-      append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "error",
+      append({ turn, role: "executor", model: ACTIVE_EXECUTOR, kind: "error",
        content: { message: (e as Error).message, raw: execRaw.slice(0, 300) } });
      throw e;
    }
-    append({ turn, role: "executor", model: EXECUTOR_MODEL,
+    append({ turn, role: "executor", model: ACTIVE_EXECUTOR,
      kind: execAction.kind as any, content: execAction });

    if (execAction.kind === "tool_call") {
@ -490,7 +546,7 @@ async function runAgentFill(
          }
        }
        const trimmed = trimResult(filtered);
-        append({ turn, role: "executor", model: EXECUTOR_MODEL,
+        append({ turn, role: "executor", model: ACTIVE_EXECUTOR,
          kind: "tool_result", content: trimmed });

        // Accumulate playbook citations from any hybrid result that
@ -503,7 +559,7 @@ async function runAgentFill(
          }
        }
      } catch (e) {
-        append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "tool_result",
+        append({ turn, role: "executor", model: ACTIVE_EXECUTOR, kind: "tool_result",
          content: { error: (e as Error).message, tool: execAction.tool } });
        consecutiveDrifts += 1;
        if (consecutiveDrifts >= MAX_CONSECUTIVE_DRIFTS) {
@ -513,7 +569,7 @@ async function runAgentFill(
    }

    const revRaw = await generateContinuable(
-      REVIEWER_MODEL,
+      ACTIVE_REVIEWER,
      withExtras(reviewerPrompt(task, log)),
      {
        temperature: 0.1,
@ -522,7 +578,7 @@ async function runAgentFill(
        max_continuations: 3,
        think: false,
        on_continuation: (n, len) =>
-          append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "note",
+          append({ turn, role: "reviewer", model: ACTIVE_REVIEWER, kind: "note",
            content: { continuation: n, combined_chars: len } }),
      },
    );
@ -530,11 +586,11 @@ async function runAgentFill(
    try {
      revAction = parseAction(revRaw, "reviewer");
    } catch (e) {
-      append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "error",
+      append({ turn, role: "reviewer", model: ACTIVE_REVIEWER, kind: "error",
        content: { message: (e as Error).message, raw: revRaw.slice(0, 300) } });
      throw e;
    }
-    append({ turn, role: "reviewer", model: REVIEWER_MODEL,
+    append({ turn, role: "reviewer", model: ACTIVE_REVIEWER,
      kind: "critique", content: revAction });

    if (revAction.kind !== "critique") throw new Error(`reviewer emitted non-critique: ${revAction.kind}`);
@ -559,7 +615,7 @@ async function runAgentFill(
          throw new Error(`consensus proposed excluded worker ${f.candidate_id}`);
        }
      }
-      append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "consensus_done",
+      append({ turn, role: "reviewer", model: ACTIVE_REVIEWER, kind: "consensus_done",
        content: { fills: execAction.fills } });
      sealed = { fills: execAction.fills, approach: execAction.rationale ?? "multi-agent hybrid" };
    }
@ -1008,7 +1064,7 @@ async function runOverviewCheckpoint(
  prior: EventResult[],
  contract?: ContractTerms,
 ): Promise<OverviewCheckpoint | null> {
-  if (T3_DISABLED) return null;
+  if (ACTIVE_T3_DISABLED) return null;
  const start = Date.now();

  const priorSummary = prior.slice(-3).map(p =>
@ -1084,7 +1140,7 @@ async function requestCloudRemediation(
  result: EventResult,
  contract?: ContractTerms,
 ): Promise<{ remediation: CloudRemediation; duration_secs: number } | null> {
-  if (T3_DISABLED) return null;
+  if (ACTIVE_T3_DISABLED) return null;
  const start = Date.now();
  const diag = extractDiagnostics(result.diagnostic_log);

@ -1144,7 +1200,7 @@ ${contract ? `- CONTRACT AWARENESS: fill_requirement=${contract.fill_requirement
 }

 async function runCrossDayLesson(ctx: ScenarioContext, checkpoints: OverviewCheckpoint[]): Promise<string | null> {
-  if (T3_DISABLED) return null;
+  if (ACTIVE_T3_DISABLED) return null;

  const eventDigest = ctx.results.map(r => {
    const diag = extractDiagnostics(r.diagnostic_log);
@ -1188,7 +1244,7 @@ async function writeRetrospective(ctx: ScenarioContext): Promise<void> {
  const lines: string[] = [];
  lines.push(`# Scenario retrospective — ${ctx.spec.client}, ${ctx.spec.date}`);
  lines.push("");
-  lines.push(`Executor: \`${EXECUTOR_MODEL}\`   Reviewer: \`${REVIEWER_MODEL}\`   Draft: \`${DRAFT_MODEL}\`   Overview(T3): \`${T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (OVERVIEW_CLOUD ? " (cloud)" : "")}\``);
+  lines.push(`Executor: \`${ACTIVE_EXECUTOR}\`   Reviewer: \`${ACTIVE_REVIEWER}\`   Draft: \`${DRAFT_MODEL}\`   Overview(T3): \`${ACTIVE_T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (ACTIVE_OVERVIEW_CLOUD ? " (cloud)" : "")}\``);
  lines.push(`Prior lessons loaded into executor context: **${ctx.prior_lessons.length}**${ctx.prior_lessons.length > 0 ? " (from " + ctx.prior_lessons.map(p => p.date).join(", ") + ")" : " (baseline — no prior T3 history)"}`);
  lines.push("");

@ -1376,15 +1432,20 @@ async function main() {

  const checkpoints: OverviewCheckpoint[] = [];

+  // Phase 23 refinement — per-staffer tool_level override. Fires once
+  // per run. If no staffer or no tool_level, defaults hold.
+  applyToolLevel(spec.staffer?.tool_level);
+
  console.log(`▶ scenario: ${spec.client}, ${spec.date}, ${spec.events.length} events`);
  if (spec.staffer) {
-    console.log(`▶ staffer: ${spec.staffer.id} ${spec.staffer.name} (${spec.staffer.role}, ${spec.staffer.tenure_months}mo)`);
+    const level = spec.staffer.tool_level ?? "(default)";
+    console.log(`▶ staffer: ${spec.staffer.id} ${spec.staffer.name} (${spec.staffer.role}, ${spec.staffer.tenure_months}mo, tools=${level})`);
  }
  if (spec.contract) {
    const c = spec.contract;
    console.log(`▶ contract: deadline=${c.deadline} fill=${c.fill_requirement ?? "preferred"}${c.budget_per_hour_max ? ` budget=$${c.budget_per_hour_max}/hr` : ""}${c.local_bonus_radius_mi ? ` local_radius=${c.local_bonus_radius_mi}mi+$${c.local_bonus_per_hour ?? 0}` : ""}`);
  }
-  console.log(`▶ models: exec=${EXECUTOR_MODEL} review=${REVIEWER_MODEL} overview=${T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (OVERVIEW_CLOUD ? " (cloud)" : "")}`);
+  console.log(`▶ models: exec=${ACTIVE_EXECUTOR} review=${ACTIVE_REVIEWER} overview=${ACTIVE_T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (ACTIVE_OVERVIEW_CLOUD ? " (cloud)" : "")}`);
  console.log(`▶ out: ${out_dir}\n`);

  for (let i = 0; i < spec.events.length; i++) {
@ -1419,7 +1480,7 @@ async function main() {
    // (city, role, count). Capped at 1 retry per event to keep the
    // budget bounded and avoid infinite loops on genuinely-impossible
    // scenarios.
-    if (!result.ok && RETRY_ON_FAIL && !T3_DISABLED) {
+    if (!result.ok && ACTIVE_RETRY_ON_FAIL && !ACTIVE_T3_DISABLED) {
      console.log(`   ▶ cloud rescue requested for ${event.at} ${event.kind}…`);
      const rescue = await requestCloudRemediation(event, result, spec.contract);
      if (rescue && rescue.remediation.retry) {
@ -1506,7 +1567,7 @@ async function main() {
    // Option B — T3 checkpoint after every misplacement, and every N-th event.
    const isLast = i === spec.events.length - 1;
    const nthHit = T3_CHECKPOINT_EVERY > 0 && ((i + 1) % T3_CHECKPOINT_EVERY === 0);
-    const shouldCheckpoint = !T3_DISABLED && (event.kind === "misplacement" || nthHit || isLast);
+    const shouldCheckpoint = !ACTIVE_T3_DISABLED && (event.kind === "misplacement" || nthHit || isLast);
    if (shouldCheckpoint) {
      const cp = await runOverviewCheckpoint(event, result, ctx.results.slice(0, -1), spec.contract);
      if (cp) {
@ -1527,7 +1588,7 @@ async function main() {
  // Option A — T3 cross-day lesson. One final call distills the whole run.
  // Saved to lesson.md and also seeded into playbook_memory so tomorrow's
  // agent can retrieve it on similar setups.
-  if (!T3_DISABLED) {
+  if (!ACTIVE_T3_DISABLED) {
    console.log(`\n▶ T3 cross-day lesson via ${OVERVIEW_MODEL}…`);
    const tLesson = Date.now();
    const lesson = await runCrossDayLesson(ctx, checkpoints);
@ -1564,7 +1625,7 @@ async function main() {
          events_ok: ctx.results.filter(r => r.ok).length,
          checkpoint_count: checkpoints.length,
          model: OVERVIEW_MODEL,
-          cloud: OVERVIEW_CLOUD,
+          cloud: ACTIVE_OVERVIEW_CLOUD,
          lesson: lesson.trim(),
          checkpoints: checkpoints.map(c => ({ after: c.after_event, risk: c.risk, hint: c.hint })),
          created_at: new Date().toISOString(),
@ -1589,17 +1650,17 @@ async function main() {
      out_dir,
      { client: spec.client, date: spec.date, events: spec.events, staffer: spec.staffer },
      {
-        executor: EXECUTOR_MODEL,
-        reviewer: REVIEWER_MODEL,
+        executor: ACTIVE_EXECUTOR,
+        reviewer: ACTIVE_REVIEWER,
        overview: OVERVIEW_MODEL,
-        overview_cloud: OVERVIEW_CLOUD,
+        overview_cloud: ACTIVE_OVERVIEW_CLOUD,
      },
      elapsed,
    );
    console.log(`▶ KB indexed: sig=${sig_hash} (${elapsed.toFixed(1)}s)`);
    const newRec = await recommendFor(spec, {
      overview_model: OVERVIEW_MODEL,
-      cloud: OVERVIEW_CLOUD,
+      cloud: ACTIVE_OVERVIEW_CLOUD,
      k: 5,
    });
    if (newRec) {
--- a/tests/multi-agent/scenarios/staffer_demo/S-001_indianapolis_assembly.json
+++ b/tests/multi-agent/scenarios/staffer_demo/S-001_indianapolis_assembly.json
@ -13,7 +13,8 @@
    "id": "S-001",
    "name": "Maria Chen",
    "tenure_months": 48,
-    "role": "senior"
+    "role": "senior",
+    "tool_level": "full"
  },
  "events": [
    {
--- a/tests/multi-agent/scenarios/staffer_demo/S-001_joliet_warehouse.json
+++ b/tests/multi-agent/scenarios/staffer_demo/S-001_joliet_warehouse.json
@ -13,7 +13,8 @@
    "id": "S-001",
    "name": "Maria Chen",
    "tenure_months": 48,
-    "role": "senior"
+    "role": "senior",
+    "tool_level": "full"
  },
  "events": [
    {
--- a/tests/multi-agent/scenarios/staffer_demo/S-001_nashville_downtown.json
+++ b/tests/multi-agent/scenarios/staffer_demo/S-001_nashville_downtown.json
@ -13,7 +13,8 @@
    "id": "S-001",
    "name": "Maria Chen",
    "tenure_months": 48,
-    "role": "senior"
+    "role": "senior",
+    "tool_level": "full"
  },
  "events": [
    {
--- a/tests/multi-agent/scenarios/staffer_demo/S-002_indianapolis_assembly.json
+++ b/tests/multi-agent/scenarios/staffer_demo/S-002_indianapolis_assembly.json
@ -13,7 +13,8 @@
    "id": "S-002",
    "name": "James Park",
    "tenure_months": 14,
-    "role": "mid"
+    "role": "mid",
+    "tool_level": "local"
  },
  "events": [
    {
--- a/tests/multi-agent/scenarios/staffer_demo/S-002_joliet_warehouse.json
+++ b/tests/multi-agent/scenarios/staffer_demo/S-002_joliet_warehouse.json
@ -13,7 +13,8 @@
    "id": "S-002",
    "name": "James Park",
    "tenure_months": 14,
-    "role": "mid"
+    "role": "mid",
+    "tool_level": "local"
  },
  "events": [
    {
--- a/tests/multi-agent/scenarios/staffer_demo/S-002_nashville_downtown.json
+++ b/tests/multi-agent/scenarios/staffer_demo/S-002_nashville_downtown.json
@ -13,7 +13,8 @@
    "id": "S-002",
    "name": "James Park",
    "tenure_months": 14,
-    "role": "mid"
+    "role": "mid",
+    "tool_level": "local"
  },
  "events": [
    {
--- a/tests/multi-agent/scenarios/staffer_demo/S-003_indianapolis_assembly.json
+++ b/tests/multi-agent/scenarios/staffer_demo/S-003_indianapolis_assembly.json
@ -13,7 +13,8 @@
    "id": "S-003",
    "name": "Sam Torres",
    "tenure_months": 4,
-    "role": "junior"
+    "role": "junior",
+    "tool_level": "basic"
  },
  "events": [
    {
--- a/tests/multi-agent/scenarios/staffer_demo/S-003_joliet_warehouse.json
+++ b/tests/multi-agent/scenarios/staffer_demo/S-003_joliet_warehouse.json
@ -13,7 +13,8 @@
    "id": "S-003",
    "name": "Sam Torres",
    "tenure_months": 4,
-    "role": "junior"
+    "role": "junior",
+    "tool_level": "basic"
  },
  "events": [
    {
--- a/tests/multi-agent/scenarios/staffer_demo/S-003_nashville_downtown.json
+++ b/tests/multi-agent/scenarios/staffer_demo/S-003_nashville_downtown.json
@ -13,7 +13,8 @@
    "id": "S-003",
    "name": "Sam Torres",
    "tenure_months": 4,
-    "role": "junior"
+    "role": "junior",
+    "tool_level": "basic"
  },
  "events": [
    {
--- a/tests/multi-agent/scenarios/staffer_demo/S-004_indianapolis_assembly.json
+++ b/tests/multi-agent/scenarios/staffer_demo/S-004_indianapolis_assembly.json
@ -13,7 +13,8 @@
    "id": "S-004",
    "name": "Alex Rivera",
    "tenure_months": 1,
-    "role": "trainee"
+    "role": "trainee",
+    "tool_level": "minimal"
  },
  "events": [
    {
--- a/tests/multi-agent/scenarios/staffer_demo/S-004_joliet_warehouse.json
+++ b/tests/multi-agent/scenarios/staffer_demo/S-004_joliet_warehouse.json
@ -13,7 +13,8 @@
    "id": "S-004",
    "name": "Alex Rivera",
    "tenure_months": 1,
-    "role": "trainee"
+    "role": "trainee",
+    "tool_level": "minimal"
  },
  "events": [
    {
--- a/tests/multi-agent/scenarios/staffer_demo/S-004_nashville_downtown.json
+++ b/tests/multi-agent/scenarios/staffer_demo/S-004_nashville_downtown.json
@ -13,7 +13,8 @@
    "id": "S-004",
    "name": "Alex Rivera",
    "tenure_months": 1,
-    "role": "trainee"
+    "role": "trainee",
+    "tool_level": "minimal"
  },
  "events": [
    {