Phase 23 refinement — per-staffer tool_level variance

Staffer.tool_level now controls which subsystems a specific run gets:

  full     — qwen3.5 + qwen3 + cloud T3 + cloud rescue
  local    — qwen3.5 + qwen3 + local gpt-oss:20b T3 + rescue
  basic    — qwen2.5 + qwen2.5 + local T3, no rescue
  minimal  — qwen2.5 + qwen2.5, NO T3, NO rescue. Playbook
             inheritance only.

applyToolLevel() resets the module-scoped ACTIVE_* slots to the env
defaults at the start of each run and then applies the staffer's
overrides, so a prior staffer's overrides never leak. Hot-path code
reads ACTIVE_EXECUTOR / ACTIVE_REVIEWER / ACTIVE_T3_DISABLED /
ACTIVE_OVERVIEW_CLOUD / ACTIVE_RETRY_ON_FAIL instead of the baked
constants.

The architectural question this answers: does playbook_memory
inheritance carry enough knowledge to let a weakly-tooled coordinator
still produce usable outcomes? "Minimal" Alex runs a qwen2.5 executor
and reviewer with no T3 overseer and no cloud rescue. If Alex still fills events at a
reasonable rate, the playbook system is the real knowledge carrier —
the senior stack is nice-to-have, not the sine qua non.

Demo personas mapped:
  Maria (senior, 48mo, full)
  James (mid, 14mo, local)
  Sam (junior, 4mo, basic)
  Alex (trainee, 1mo, minimal)

Same 3 contracts (Nashville downtown, Joliet warehouse, Indianapolis
assembly) across all four → 12 runs. KB + kb_staffer_report.py
leaderboard already wired; competence_score will now reflect real tool
asymmetry instead of LLM sampling variance.
This commit is contained in:
root 2026-04-20 22:50:05 -05:00
parent 6b71c8e9b2
commit 5e89407939
14 changed files with 119 additions and 42 deletions

View File

@ -8,11 +8,15 @@
import { mkdir, writeFile } from "node:fs/promises";
import { join } from "node:path";
// Per-staffer tool_level mirrors the real-world asymmetry: senior gets
// the most powerful stack, trainee gets the least. The architectural
// question is whether the playbook inheritance is strong enough to let
// the trainee still produce usable outcomes when the big tools are off.
const STAFFERS = [
{ id: "S-001", name: "Maria Chen", tenure_months: 48, role: "senior" as const },
{ id: "S-002", name: "James Park", tenure_months: 14, role: "mid" as const },
{ id: "S-003", name: "Sam Torres", tenure_months: 4, role: "junior" as const },
{ id: "S-004", name: "Alex Rivera", tenure_months: 1, role: "trainee" as const },
{ id: "S-001", name: "Maria Chen", tenure_months: 48, role: "senior" as const, tool_level: "full" as const },
{ id: "S-002", name: "James Park", tenure_months: 14, role: "mid" as const, tool_level: "local" as const },
{ id: "S-003", name: "Sam Torres", tenure_months: 4, role: "junior" as const, tool_level: "basic" as const },
{ id: "S-004", name: "Alex Rivera", tenure_months: 1, role: "trainee" as const, tool_level: "minimal" as const },
];
// Three contract shapes — one downtown assembly, one warehouse ramp,

View File

@ -85,6 +85,49 @@ const T3_DISABLED = process.env.LH_T3_DISABLE === "1";
// LH_RETRY_ON_FAIL=0 to compare baseline outcomes without rescue.
const RETRY_ON_FAIL = process.env.LH_RETRY_ON_FAIL !== "0";
// Phase 23 refinement — per-staffer tool_level overrides.
//
// These are intentionally module-scoped MUTABLE slots. The constants
// they are initialised from (EXECUTOR_MODEL, REVIEWER_MODEL,
// T3_DISABLED, OVERVIEW_CLOUD, RETRY_ON_FAIL) are the DEFAULTS and are
// never modified. main() calls applyToolLevel(spec.staffer?.tool_level)
// once per run, before the event loop, which resets every slot to its
// default and then applies the overrides for that staffer's tool_level.
// Run-time code must read the ACTIVE_* slot, not the default constant.

// Model passed to generateContinuable() for the executor turn.
let ACTIVE_EXECUTOR = EXECUTOR_MODEL;
// Model passed to generateContinuable() for the reviewer turn.
let ACTIVE_REVIEWER = REVIEWER_MODEL;
// When true, T3 checkpoints, cloud remediation and the cross-day lesson
// all return null / are skipped.
let ACTIVE_T3_DISABLED = T3_DISABLED;
// Forwarded as the `cloud` option of the T3 overview call.
let ACTIVE_OVERVIEW_CLOUD = OVERVIEW_CLOUD;
// Gates the rescue retry after a failed event (also requires T3 to be
// enabled).
let ACTIVE_RETRY_ON_FAIL = RETRY_ON_FAIL;
/**
 * Reset the ACTIVE_* run slots to their defaults, then apply the
 * overrides for the given staffer tool level.
 *
 * Every slot is reset first, so overrides applied for a previous
 * staffer's run never leak into the next one.
 *
 * Overrides per level (anything not listed keeps its default):
 *   - "full":    T3 overview routed to cloud.
 *   - "local":   T3 overview kept local.
 *   - "basic":   qwen2.5 executor + reviewer, local T3, no rescue retry.
 *   - "minimal": qwen2.5 executor + reviewer, T3 disabled, no rescue retry.
 *
 * @param level - The staffer's tool_level. A falsy value (no staffer, or
 *   a staffer without a tool_level) leaves the defaults in place.
 */
function applyToolLevel(level: Staffer["tool_level"] | undefined): void {
  // Start from the defaults each time so a previous staffer's overrides
  // don't leak.
  ACTIVE_EXECUTOR = EXECUTOR_MODEL;
  ACTIVE_REVIEWER = REVIEWER_MODEL;
  ACTIVE_T3_DISABLED = T3_DISABLED;
  ACTIVE_OVERVIEW_CLOUD = OVERVIEW_CLOUD;
  ACTIVE_RETRY_ON_FAIL = RETRY_ON_FAIL;

  if (!level) return;

  switch (level) {
    case "full":
      ACTIVE_OVERVIEW_CLOUD = true;
      break;
    case "local":
      ACTIVE_OVERVIEW_CLOUD = false;
      break;
    case "basic":
      ACTIVE_EXECUTOR = "qwen2.5:latest";
      ACTIVE_REVIEWER = "qwen2.5:latest";
      ACTIVE_OVERVIEW_CLOUD = false;
      ACTIVE_RETRY_ON_FAIL = false;
      break;
    case "minimal":
      ACTIVE_EXECUTOR = "qwen2.5:latest";
      ACTIVE_REVIEWER = "qwen2.5:latest";
      ACTIVE_T3_DISABLED = true;
      ACTIVE_OVERVIEW_CLOUD = false;
      ACTIVE_RETRY_ON_FAIL = false;
      break;
    default:
      // The union type is only a compile-time guarantee.
      // NOTE(review): presumably tool_level arrives from scenario files
      // that are not validated at runtime — confirm against the loader.
      // A typo would otherwise run silently with the default stack and be
      // attributed to the wrong tool level, so say so loudly and keep the
      // defaults.
      console.warn(
        `applyToolLevel: unrecognized tool_level ${JSON.stringify(level)}; using default tool configuration`,
      );
      break;
  }
}
// Dispatcher: route T3 calls to local sidecar or Ollama Cloud depending
// on ACTIVE_OVERVIEW_CLOUD (the LH_OVERVIEW_CLOUD default, possibly
// overridden per staffer tool_level). Hot-path T1/T2 always stay local.
// T3 outputs are free-form prose (lesson/hint), so shape=text — the
@ -95,7 +138,7 @@ async function overviewGenerate(prompt: string, opts: { temperature?: number; ma
max_tokens: opts.max_tokens ?? 1000,
shape: "text",
max_continuations: 2,
cloud: OVERVIEW_CLOUD,
cloud: ACTIVE_OVERVIEW_CLOUD,
});
}
@ -148,6 +191,19 @@ interface Staffer {
name: string; // "Maria Chen"
tenure_months: number;
role: "senior" | "mid" | "junior" | "trainee";
// Phase 23 refinement — tool_level controls which subsystems this
// staffer's runs get to use. The mechanism always leaves
// playbook_memory ON so inherited playbooks drive the outcome even
// when T3 / cloud rescue / the bigger executor are disabled.
//
// full — qwen3.5 executor + qwen3 reviewer + cloud T3 + rescue
// local — qwen3.5 + qwen3 + local gpt-oss:20b T3 + rescue
// basic — qwen2.5 + qwen2.5 + local T3, no rescue
// minimal — qwen2.5 + qwen2.5, NO T3, NO rescue. Only playbook
// inheritance to lean on. This is the honest test of
// whether the playbook system carries knowledge on its
// own.
tool_level?: "full" | "local" | "basic" | "minimal";
}
interface ScenarioSpec {
@ -450,7 +506,7 @@ async function runAgentFill(
// reasoning. Burning ~650 thinking tokens on a 400-token JSON was
// exactly the bug we just solved.
const execRaw = await generateContinuable(
EXECUTOR_MODEL,
ACTIVE_EXECUTOR,
withExtras(executorPrompt(task, log)),
{
temperature: 0.2,
@ -459,7 +515,7 @@ async function runAgentFill(
max_continuations: 3,
think: false,
on_continuation: (n, len) =>
append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "note",
append({ turn, role: "executor", model: ACTIVE_EXECUTOR, kind: "note",
content: { continuation: n, combined_chars: len } }),
},
);
@ -467,11 +523,11 @@ async function runAgentFill(
try {
execAction = parseAction(execRaw, "executor");
} catch (e) {
append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "error",
append({ turn, role: "executor", model: ACTIVE_EXECUTOR, kind: "error",
content: { message: (e as Error).message, raw: execRaw.slice(0, 300) } });
throw e;
}
append({ turn, role: "executor", model: EXECUTOR_MODEL,
append({ turn, role: "executor", model: ACTIVE_EXECUTOR,
kind: execAction.kind as any, content: execAction });
if (execAction.kind === "tool_call") {
@ -490,7 +546,7 @@ async function runAgentFill(
}
}
const trimmed = trimResult(filtered);
append({ turn, role: "executor", model: EXECUTOR_MODEL,
append({ turn, role: "executor", model: ACTIVE_EXECUTOR,
kind: "tool_result", content: trimmed });
// Accumulate playbook citations from any hybrid result that
@ -503,7 +559,7 @@ async function runAgentFill(
}
}
} catch (e) {
append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "tool_result",
append({ turn, role: "executor", model: ACTIVE_EXECUTOR, kind: "tool_result",
content: { error: (e as Error).message, tool: execAction.tool } });
consecutiveDrifts += 1;
if (consecutiveDrifts >= MAX_CONSECUTIVE_DRIFTS) {
@ -513,7 +569,7 @@ async function runAgentFill(
}
const revRaw = await generateContinuable(
REVIEWER_MODEL,
ACTIVE_REVIEWER,
withExtras(reviewerPrompt(task, log)),
{
temperature: 0.1,
@ -522,7 +578,7 @@ async function runAgentFill(
max_continuations: 3,
think: false,
on_continuation: (n, len) =>
append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "note",
append({ turn, role: "reviewer", model: ACTIVE_REVIEWER, kind: "note",
content: { continuation: n, combined_chars: len } }),
},
);
@ -530,11 +586,11 @@ async function runAgentFill(
try {
revAction = parseAction(revRaw, "reviewer");
} catch (e) {
append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "error",
append({ turn, role: "reviewer", model: ACTIVE_REVIEWER, kind: "error",
content: { message: (e as Error).message, raw: revRaw.slice(0, 300) } });
throw e;
}
append({ turn, role: "reviewer", model: REVIEWER_MODEL,
append({ turn, role: "reviewer", model: ACTIVE_REVIEWER,
kind: "critique", content: revAction });
if (revAction.kind !== "critique") throw new Error(`reviewer emitted non-critique: ${revAction.kind}`);
@ -559,7 +615,7 @@ async function runAgentFill(
throw new Error(`consensus proposed excluded worker ${f.candidate_id}`);
}
}
append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "consensus_done",
append({ turn, role: "reviewer", model: ACTIVE_REVIEWER, kind: "consensus_done",
content: { fills: execAction.fills } });
sealed = { fills: execAction.fills, approach: execAction.rationale ?? "multi-agent hybrid" };
}
@ -1008,7 +1064,7 @@ async function runOverviewCheckpoint(
prior: EventResult[],
contract?: ContractTerms,
): Promise<OverviewCheckpoint | null> {
if (T3_DISABLED) return null;
if (ACTIVE_T3_DISABLED) return null;
const start = Date.now();
const priorSummary = prior.slice(-3).map(p =>
@ -1084,7 +1140,7 @@ async function requestCloudRemediation(
result: EventResult,
contract?: ContractTerms,
): Promise<{ remediation: CloudRemediation; duration_secs: number } | null> {
if (T3_DISABLED) return null;
if (ACTIVE_T3_DISABLED) return null;
const start = Date.now();
const diag = extractDiagnostics(result.diagnostic_log);
@ -1144,7 +1200,7 @@ ${contract ? `- CONTRACT AWARENESS: fill_requirement=${contract.fill_requirement
}
async function runCrossDayLesson(ctx: ScenarioContext, checkpoints: OverviewCheckpoint[]): Promise<string | null> {
if (T3_DISABLED) return null;
if (ACTIVE_T3_DISABLED) return null;
const eventDigest = ctx.results.map(r => {
const diag = extractDiagnostics(r.diagnostic_log);
@ -1188,7 +1244,7 @@ async function writeRetrospective(ctx: ScenarioContext): Promise<void> {
const lines: string[] = [];
lines.push(`# Scenario retrospective — ${ctx.spec.client}, ${ctx.spec.date}`);
lines.push("");
lines.push(`Executor: \`${EXECUTOR_MODEL}\` Reviewer: \`${REVIEWER_MODEL}\` Draft: \`${DRAFT_MODEL}\` Overview(T3): \`${T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (OVERVIEW_CLOUD ? " (cloud)" : "")}\``);
lines.push(`Executor: \`${ACTIVE_EXECUTOR}\` Reviewer: \`${ACTIVE_REVIEWER}\` Draft: \`${DRAFT_MODEL}\` Overview(T3): \`${ACTIVE_T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (ACTIVE_OVERVIEW_CLOUD ? " (cloud)" : "")}\``);
lines.push(`Prior lessons loaded into executor context: **${ctx.prior_lessons.length}**${ctx.prior_lessons.length > 0 ? " (from " + ctx.prior_lessons.map(p => p.date).join(", ") + ")" : " (baseline — no prior T3 history)"}`);
lines.push("");
@ -1376,15 +1432,20 @@ async function main() {
const checkpoints: OverviewCheckpoint[] = [];
// Phase 23 refinement — per-staffer tool_level override. Fires once
// per run. If no staffer or no tool_level, defaults hold.
applyToolLevel(spec.staffer?.tool_level);
console.log(`▶ scenario: ${spec.client}, ${spec.date}, ${spec.events.length} events`);
if (spec.staffer) {
console.log(`▶ staffer: ${spec.staffer.id} ${spec.staffer.name} (${spec.staffer.role}, ${spec.staffer.tenure_months}mo)`);
const level = spec.staffer.tool_level ?? "(default)";
console.log(`▶ staffer: ${spec.staffer.id} ${spec.staffer.name} (${spec.staffer.role}, ${spec.staffer.tenure_months}mo, tools=${level})`);
}
if (spec.contract) {
const c = spec.contract;
console.log(`▶ contract: deadline=${c.deadline} fill=${c.fill_requirement ?? "preferred"}${c.budget_per_hour_max ? ` budget=$${c.budget_per_hour_max}/hr` : ""}${c.local_bonus_radius_mi ? ` local_radius=${c.local_bonus_radius_mi}mi+$${c.local_bonus_per_hour ?? 0}` : ""}`);
}
console.log(`▶ models: exec=${EXECUTOR_MODEL} review=${REVIEWER_MODEL} overview=${T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (OVERVIEW_CLOUD ? " (cloud)" : "")}`);
console.log(`▶ models: exec=${ACTIVE_EXECUTOR} review=${ACTIVE_REVIEWER} overview=${ACTIVE_T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (ACTIVE_OVERVIEW_CLOUD ? " (cloud)" : "")}`);
console.log(`▶ out: ${out_dir}\n`);
for (let i = 0; i < spec.events.length; i++) {
@ -1419,7 +1480,7 @@ async function main() {
// (city, role, count). Capped at 1 retry per event to keep the
// budget bounded and avoid infinite loops on genuinely-impossible
// scenarios.
if (!result.ok && RETRY_ON_FAIL && !T3_DISABLED) {
if (!result.ok && ACTIVE_RETRY_ON_FAIL && !ACTIVE_T3_DISABLED) {
console.log(` ▶ cloud rescue requested for ${event.at} ${event.kind}`);
const rescue = await requestCloudRemediation(event, result, spec.contract);
if (rescue && rescue.remediation.retry) {
@ -1506,7 +1567,7 @@ async function main() {
// Option B — T3 checkpoint after every misplacement, and every N-th event.
const isLast = i === spec.events.length - 1;
const nthHit = T3_CHECKPOINT_EVERY > 0 && ((i + 1) % T3_CHECKPOINT_EVERY === 0);
const shouldCheckpoint = !T3_DISABLED && (event.kind === "misplacement" || nthHit || isLast);
const shouldCheckpoint = !ACTIVE_T3_DISABLED && (event.kind === "misplacement" || nthHit || isLast);
if (shouldCheckpoint) {
const cp = await runOverviewCheckpoint(event, result, ctx.results.slice(0, -1), spec.contract);
if (cp) {
@ -1527,7 +1588,7 @@ async function main() {
// Option A — T3 cross-day lesson. One final call distills the whole run.
// Saved to lesson.md and also seeded into playbook_memory so tomorrow's
// agent can retrieve it on similar setups.
if (!T3_DISABLED) {
if (!ACTIVE_T3_DISABLED) {
console.log(`\n▶ T3 cross-day lesson via ${OVERVIEW_MODEL}`);
const tLesson = Date.now();
const lesson = await runCrossDayLesson(ctx, checkpoints);
@ -1564,7 +1625,7 @@ async function main() {
events_ok: ctx.results.filter(r => r.ok).length,
checkpoint_count: checkpoints.length,
model: OVERVIEW_MODEL,
cloud: OVERVIEW_CLOUD,
cloud: ACTIVE_OVERVIEW_CLOUD,
lesson: lesson.trim(),
checkpoints: checkpoints.map(c => ({ after: c.after_event, risk: c.risk, hint: c.hint })),
created_at: new Date().toISOString(),
@ -1589,17 +1650,17 @@ async function main() {
out_dir,
{ client: spec.client, date: spec.date, events: spec.events, staffer: spec.staffer },
{
executor: EXECUTOR_MODEL,
reviewer: REVIEWER_MODEL,
executor: ACTIVE_EXECUTOR,
reviewer: ACTIVE_REVIEWER,
overview: OVERVIEW_MODEL,
overview_cloud: OVERVIEW_CLOUD,
overview_cloud: ACTIVE_OVERVIEW_CLOUD,
},
elapsed,
);
console.log(`▶ KB indexed: sig=${sig_hash} (${elapsed.toFixed(1)}s)`);
const newRec = await recommendFor(spec, {
overview_model: OVERVIEW_MODEL,
cloud: OVERVIEW_CLOUD,
cloud: ACTIVE_OVERVIEW_CLOUD,
k: 5,
});
if (newRec) {

View File

@ -13,7 +13,8 @@
"id": "S-001",
"name": "Maria Chen",
"tenure_months": 48,
"role": "senior"
"role": "senior",
"tool_level": "full"
},
"events": [
{

View File

@ -13,7 +13,8 @@
"id": "S-001",
"name": "Maria Chen",
"tenure_months": 48,
"role": "senior"
"role": "senior",
"tool_level": "full"
},
"events": [
{

View File

@ -13,7 +13,8 @@
"id": "S-001",
"name": "Maria Chen",
"tenure_months": 48,
"role": "senior"
"role": "senior",
"tool_level": "full"
},
"events": [
{

View File

@ -13,7 +13,8 @@
"id": "S-002",
"name": "James Park",
"tenure_months": 14,
"role": "mid"
"role": "mid",
"tool_level": "local"
},
"events": [
{

View File

@ -13,7 +13,8 @@
"id": "S-002",
"name": "James Park",
"tenure_months": 14,
"role": "mid"
"role": "mid",
"tool_level": "local"
},
"events": [
{

View File

@ -13,7 +13,8 @@
"id": "S-002",
"name": "James Park",
"tenure_months": 14,
"role": "mid"
"role": "mid",
"tool_level": "local"
},
"events": [
{

View File

@ -13,7 +13,8 @@
"id": "S-003",
"name": "Sam Torres",
"tenure_months": 4,
"role": "junior"
"role": "junior",
"tool_level": "basic"
},
"events": [
{

View File

@ -13,7 +13,8 @@
"id": "S-003",
"name": "Sam Torres",
"tenure_months": 4,
"role": "junior"
"role": "junior",
"tool_level": "basic"
},
"events": [
{

View File

@ -13,7 +13,8 @@
"id": "S-003",
"name": "Sam Torres",
"tenure_months": 4,
"role": "junior"
"role": "junior",
"tool_level": "basic"
},
"events": [
{

View File

@ -13,7 +13,8 @@
"id": "S-004",
"name": "Alex Rivera",
"tenure_months": 1,
"role": "trainee"
"role": "trainee",
"tool_level": "minimal"
},
"events": [
{

View File

@ -13,7 +13,8 @@
"id": "S-004",
"name": "Alex Rivera",
"tenure_months": 1,
"role": "trainee"
"role": "trainee",
"tool_level": "minimal"
},
"events": [
{

View File

@ -13,7 +13,8 @@
"id": "S-004",
"name": "Alex Rivera",
"tenure_months": 1,
"role": "trainee"
"role": "trainee",
"tool_level": "minimal"
},
"events": [
{