|
|
|
|
@ -85,6 +85,49 @@ const T3_DISABLED = process.env.LH_T3_DISABLE === "1";
|
|
|
|
|
// LH_RETRY_ON_FAIL=0 to compare baseline outcomes without rescue.
|
|
|
|
|
// Enabled unless the env var is exactly the string "0".
const RETRY_ON_FAIL = process.env.LH_RETRY_ON_FAIL !== "0";
|
|
|
|
|
|
|
|
|
|
// Phase 23 refinement — per-staffer tool_level overrides. Evaluated
|
|
|
|
|
// per run in main() once we know the spec's staffer. These are
|
|
|
|
|
// module-scoped mutable slots intentionally — the env-derived constants
|
|
|
|
|
// above are the DEFAULTS; main() flips them (via applyToolLevel()) for
|
|
|
|
|
// the duration of the run based on staffer.tool_level before calling
// anything else.
|
|
|
|
|
// Executor model in effect for this run; starts as EXECUTOR_MODEL.
let ACTIVE_EXECUTOR = EXECUTOR_MODEL;
|
|
|
|
|
// Reviewer model in effect for this run; starts as REVIEWER_MODEL.
let ACTIVE_REVIEWER = REVIEWER_MODEL;
|
|
|
|
|
// When true, T3 is off for this run; starts as T3_DISABLED.
let ACTIVE_T3_DISABLED = T3_DISABLED;
|
|
|
|
|
// Cloud flag for the T3 overview in this run; starts as OVERVIEW_CLOUD.
let ACTIVE_OVERVIEW_CLOUD = OVERVIEW_CLOUD;
|
|
|
|
|
// Retry/rescue-on-failure switch for this run; starts as RETRY_ON_FAIL.
let ACTIVE_RETRY_ON_FAIL = RETRY_ON_FAIL;
|
|
|
|
|
|
|
|
|
|
/**
 * Reset every ACTIVE_* run setting to its env-derived default, then apply the
 * overrides for one staffer tool level.
 *
 * Overrides per level (anything not listed keeps its env default):
 *   full    — ACTIVE_OVERVIEW_CLOUD = true
 *   local   — ACTIVE_OVERVIEW_CLOUD = false
 *   basic   — executor and reviewer both "qwen2.5:latest",
 *             ACTIVE_OVERVIEW_CLOUD = false, ACTIVE_RETRY_ON_FAIL = false
 *   minimal — everything basic does, plus ACTIVE_T3_DISABLED = true
 *
 * No level ever clears ACTIVE_T3_DISABLED or switches ACTIVE_RETRY_ON_FAIL
 * on, so the env defaults T3_DISABLED and RETRY_ON_FAIL still win when they
 * turn those features off.
 *
 * @param level Tool level of the staffer for this run. A falsy value leaves
 *   the env defaults in place. A value outside the known set (possible when
 *   the spec comes from untyped input) also leaves the defaults in place and
 *   emits a warning instead of being ignored silently.
 */
function applyToolLevel(level: Staffer["tool_level"] | undefined): void {
  // Start from env defaults each time so a previous staffer's overrides
  // don't leak into this run.
  ACTIVE_EXECUTOR = EXECUTOR_MODEL;
  ACTIVE_REVIEWER = REVIEWER_MODEL;
  ACTIVE_T3_DISABLED = T3_DISABLED;
  ACTIVE_OVERVIEW_CLOUD = OVERVIEW_CLOUD;
  ACTIVE_RETRY_ON_FAIL = RETRY_ON_FAIL;
  if (!level) return;

  // Single source for the model used by the two reduced tiers.
  const reducedModel = "qwen2.5:latest";

  switch (level) {
    case "full":
      ACTIVE_OVERVIEW_CLOUD = true;
      break;
    case "local":
      ACTIVE_OVERVIEW_CLOUD = false;
      break;
    case "basic":
    case "minimal":
      ACTIVE_EXECUTOR = reducedModel;
      ACTIVE_REVIEWER = reducedModel;
      ACTIVE_OVERVIEW_CLOUD = false;
      ACTIVE_RETRY_ON_FAIL = false;
      // The only difference between the two tiers: minimal also drops T3.
      if (level === "minimal") ACTIVE_T3_DISABLED = true;
      break;
    default:
      // Unreachable for well-typed callers. Reached only when a runtime value
      // falls outside the declared union; keep defaults but say so, because
      // the caller may still report this value as the applied tier.
      console.warn(
        `applyToolLevel: unrecognized tool_level ${JSON.stringify(level)}; keeping env defaults`,
      );
  }
}
|
|
|
|
|
|
|
|
|
|
// Dispatcher: route T3 calls to local sidecar or Ollama Cloud depending
|
|
|
|
|
// on the LH_OVERVIEW_CLOUD flag. Hot-path T1/T2 always stay local.
|
|
|
|
|
// T3 outputs are free-form prose (lesson/hint), so shape=text — the
|
|
|
|
|
@ -95,7 +138,7 @@ async function overviewGenerate(prompt: string, opts: { temperature?: number; ma
|
|
|
|
|
max_tokens: opts.max_tokens ?? 1000,
|
|
|
|
|
shape: "text",
|
|
|
|
|
max_continuations: 2,
|
|
|
|
|
cloud: OVERVIEW_CLOUD,
|
|
|
|
|
cloud: ACTIVE_OVERVIEW_CLOUD,
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@ -148,6 +191,19 @@ interface Staffer {
|
|
|
|
|
name: string; // "Maria Chen"
|
|
|
|
|
tenure_months: number;
|
|
|
|
|
role: "senior" | "mid" | "junior" | "trainee";
|
|
|
|
|
// Phase 23 refinement — tool_level controls which subsystems this
|
|
|
|
|
// staffer's runs get to use. The mechanism always leaves
|
|
|
|
|
// playbook_memory ON so inherited playbooks drive the outcome even
|
|
|
|
|
// when T3 / cloud rescue / the bigger executor are disabled.
|
|
|
|
|
//
|
|
|
|
|
// full — qwen3.5 executor + qwen3 reviewer + cloud T3 + rescue
|
|
|
|
|
// local — qwen3.5 + qwen3 + local gpt-oss:20b T3 + rescue
|
|
|
|
|
// basic — qwen2.5 + qwen2.5 + local T3, no rescue
|
|
|
|
|
// minimal — qwen2.5 + qwen2.5, NO T3, NO rescue. Only playbook
|
|
|
|
|
// inheritance to lean on. This is the honest test of
|
|
|
|
|
// whether the playbook system carries knowledge on its
|
|
|
|
|
// own.
|
|
|
|
|
tool_level?: "full" | "local" | "basic" | "minimal";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
interface ScenarioSpec {
|
|
|
|
|
@ -450,7 +506,7 @@ async function runAgentFill(
|
|
|
|
|
// reasoning. Burning ~650 thinking tokens on a 400-token JSON was
|
|
|
|
|
// exactly the bug we just solved.
|
|
|
|
|
const execRaw = await generateContinuable(
|
|
|
|
|
EXECUTOR_MODEL,
|
|
|
|
|
ACTIVE_EXECUTOR,
|
|
|
|
|
withExtras(executorPrompt(task, log)),
|
|
|
|
|
{
|
|
|
|
|
temperature: 0.2,
|
|
|
|
|
@ -459,7 +515,7 @@ async function runAgentFill(
|
|
|
|
|
max_continuations: 3,
|
|
|
|
|
think: false,
|
|
|
|
|
on_continuation: (n, len) =>
|
|
|
|
|
append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "note",
|
|
|
|
|
append({ turn, role: "executor", model: ACTIVE_EXECUTOR, kind: "note",
|
|
|
|
|
content: { continuation: n, combined_chars: len } }),
|
|
|
|
|
},
|
|
|
|
|
);
|
|
|
|
|
@ -467,11 +523,11 @@ async function runAgentFill(
|
|
|
|
|
try {
|
|
|
|
|
execAction = parseAction(execRaw, "executor");
|
|
|
|
|
} catch (e) {
|
|
|
|
|
append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "error",
|
|
|
|
|
append({ turn, role: "executor", model: ACTIVE_EXECUTOR, kind: "error",
|
|
|
|
|
content: { message: (e as Error).message, raw: execRaw.slice(0, 300) } });
|
|
|
|
|
throw e;
|
|
|
|
|
}
|
|
|
|
|
append({ turn, role: "executor", model: EXECUTOR_MODEL,
|
|
|
|
|
append({ turn, role: "executor", model: ACTIVE_EXECUTOR,
|
|
|
|
|
kind: execAction.kind as any, content: execAction });
|
|
|
|
|
|
|
|
|
|
if (execAction.kind === "tool_call") {
|
|
|
|
|
@ -490,7 +546,7 @@ async function runAgentFill(
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
const trimmed = trimResult(filtered);
|
|
|
|
|
append({ turn, role: "executor", model: EXECUTOR_MODEL,
|
|
|
|
|
append({ turn, role: "executor", model: ACTIVE_EXECUTOR,
|
|
|
|
|
kind: "tool_result", content: trimmed });
|
|
|
|
|
|
|
|
|
|
// Accumulate playbook citations from any hybrid result that
|
|
|
|
|
@ -503,7 +559,7 @@ async function runAgentFill(
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} catch (e) {
|
|
|
|
|
append({ turn, role: "executor", model: EXECUTOR_MODEL, kind: "tool_result",
|
|
|
|
|
append({ turn, role: "executor", model: ACTIVE_EXECUTOR, kind: "tool_result",
|
|
|
|
|
content: { error: (e as Error).message, tool: execAction.tool } });
|
|
|
|
|
consecutiveDrifts += 1;
|
|
|
|
|
if (consecutiveDrifts >= MAX_CONSECUTIVE_DRIFTS) {
|
|
|
|
|
@ -513,7 +569,7 @@ async function runAgentFill(
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const revRaw = await generateContinuable(
|
|
|
|
|
REVIEWER_MODEL,
|
|
|
|
|
ACTIVE_REVIEWER,
|
|
|
|
|
withExtras(reviewerPrompt(task, log)),
|
|
|
|
|
{
|
|
|
|
|
temperature: 0.1,
|
|
|
|
|
@ -522,7 +578,7 @@ async function runAgentFill(
|
|
|
|
|
max_continuations: 3,
|
|
|
|
|
think: false,
|
|
|
|
|
on_continuation: (n, len) =>
|
|
|
|
|
append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "note",
|
|
|
|
|
append({ turn, role: "reviewer", model: ACTIVE_REVIEWER, kind: "note",
|
|
|
|
|
content: { continuation: n, combined_chars: len } }),
|
|
|
|
|
},
|
|
|
|
|
);
|
|
|
|
|
@ -530,11 +586,11 @@ async function runAgentFill(
|
|
|
|
|
try {
|
|
|
|
|
revAction = parseAction(revRaw, "reviewer");
|
|
|
|
|
} catch (e) {
|
|
|
|
|
append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "error",
|
|
|
|
|
append({ turn, role: "reviewer", model: ACTIVE_REVIEWER, kind: "error",
|
|
|
|
|
content: { message: (e as Error).message, raw: revRaw.slice(0, 300) } });
|
|
|
|
|
throw e;
|
|
|
|
|
}
|
|
|
|
|
append({ turn, role: "reviewer", model: REVIEWER_MODEL,
|
|
|
|
|
append({ turn, role: "reviewer", model: ACTIVE_REVIEWER,
|
|
|
|
|
kind: "critique", content: revAction });
|
|
|
|
|
|
|
|
|
|
if (revAction.kind !== "critique") throw new Error(`reviewer emitted non-critique: ${revAction.kind}`);
|
|
|
|
|
@ -559,7 +615,7 @@ async function runAgentFill(
|
|
|
|
|
throw new Error(`consensus proposed excluded worker ${f.candidate_id}`);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
append({ turn, role: "reviewer", model: REVIEWER_MODEL, kind: "consensus_done",
|
|
|
|
|
append({ turn, role: "reviewer", model: ACTIVE_REVIEWER, kind: "consensus_done",
|
|
|
|
|
content: { fills: execAction.fills } });
|
|
|
|
|
sealed = { fills: execAction.fills, approach: execAction.rationale ?? "multi-agent hybrid" };
|
|
|
|
|
}
|
|
|
|
|
@ -1008,7 +1064,7 @@ async function runOverviewCheckpoint(
|
|
|
|
|
prior: EventResult[],
|
|
|
|
|
contract?: ContractTerms,
|
|
|
|
|
): Promise<OverviewCheckpoint | null> {
|
|
|
|
|
if (T3_DISABLED) return null;
|
|
|
|
|
if (ACTIVE_T3_DISABLED) return null;
|
|
|
|
|
const start = Date.now();
|
|
|
|
|
|
|
|
|
|
const priorSummary = prior.slice(-3).map(p =>
|
|
|
|
|
@ -1084,7 +1140,7 @@ async function requestCloudRemediation(
|
|
|
|
|
result: EventResult,
|
|
|
|
|
contract?: ContractTerms,
|
|
|
|
|
): Promise<{ remediation: CloudRemediation; duration_secs: number } | null> {
|
|
|
|
|
if (T3_DISABLED) return null;
|
|
|
|
|
if (ACTIVE_T3_DISABLED) return null;
|
|
|
|
|
const start = Date.now();
|
|
|
|
|
const diag = extractDiagnostics(result.diagnostic_log);
|
|
|
|
|
|
|
|
|
|
@ -1144,7 +1200,7 @@ ${contract ? `- CONTRACT AWARENESS: fill_requirement=${contract.fill_requirement
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async function runCrossDayLesson(ctx: ScenarioContext, checkpoints: OverviewCheckpoint[]): Promise<string | null> {
|
|
|
|
|
if (T3_DISABLED) return null;
|
|
|
|
|
if (ACTIVE_T3_DISABLED) return null;
|
|
|
|
|
|
|
|
|
|
const eventDigest = ctx.results.map(r => {
|
|
|
|
|
const diag = extractDiagnostics(r.diagnostic_log);
|
|
|
|
|
@ -1188,7 +1244,7 @@ async function writeRetrospective(ctx: ScenarioContext): Promise<void> {
|
|
|
|
|
const lines: string[] = [];
|
|
|
|
|
lines.push(`# Scenario retrospective — ${ctx.spec.client}, ${ctx.spec.date}`);
|
|
|
|
|
lines.push("");
|
|
|
|
|
lines.push(`Executor: \`${EXECUTOR_MODEL}\` Reviewer: \`${REVIEWER_MODEL}\` Draft: \`${DRAFT_MODEL}\` Overview(T3): \`${T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (OVERVIEW_CLOUD ? " (cloud)" : "")}\``);
|
|
|
|
|
lines.push(`Executor: \`${ACTIVE_EXECUTOR}\` Reviewer: \`${ACTIVE_REVIEWER}\` Draft: \`${DRAFT_MODEL}\` Overview(T3): \`${ACTIVE_T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (ACTIVE_OVERVIEW_CLOUD ? " (cloud)" : "")}\``);
|
|
|
|
|
lines.push(`Prior lessons loaded into executor context: **${ctx.prior_lessons.length}**${ctx.prior_lessons.length > 0 ? " (from " + ctx.prior_lessons.map(p => p.date).join(", ") + ")" : " (baseline — no prior T3 history)"}`);
|
|
|
|
|
lines.push("");
|
|
|
|
|
|
|
|
|
|
@ -1376,15 +1432,20 @@ async function main() {
|
|
|
|
|
|
|
|
|
|
const checkpoints: OverviewCheckpoint[] = [];
|
|
|
|
|
|
|
|
|
|
// Phase 23 refinement — per-staffer tool_level override. Fires once
|
|
|
|
|
// per run. If no staffer or no tool_level, defaults hold.
|
|
|
|
|
applyToolLevel(spec.staffer?.tool_level);
|
|
|
|
|
|
|
|
|
|
console.log(`▶ scenario: ${spec.client}, ${spec.date}, ${spec.events.length} events`);
|
|
|
|
|
if (spec.staffer) {
|
|
|
|
|
console.log(`▶ staffer: ${spec.staffer.id} ${spec.staffer.name} (${spec.staffer.role}, ${spec.staffer.tenure_months}mo)`);
|
|
|
|
|
const level = spec.staffer.tool_level ?? "(default)";
|
|
|
|
|
console.log(`▶ staffer: ${spec.staffer.id} ${spec.staffer.name} (${spec.staffer.role}, ${spec.staffer.tenure_months}mo, tools=${level})`);
|
|
|
|
|
}
|
|
|
|
|
if (spec.contract) {
|
|
|
|
|
const c = spec.contract;
|
|
|
|
|
console.log(`▶ contract: deadline=${c.deadline} fill=${c.fill_requirement ?? "preferred"}${c.budget_per_hour_max ? ` budget=$${c.budget_per_hour_max}/hr` : ""}${c.local_bonus_radius_mi ? ` local_radius=${c.local_bonus_radius_mi}mi+$${c.local_bonus_per_hour ?? 0}` : ""}`);
|
|
|
|
|
}
|
|
|
|
|
console.log(`▶ models: exec=${EXECUTOR_MODEL} review=${REVIEWER_MODEL} overview=${T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (OVERVIEW_CLOUD ? " (cloud)" : "")}`);
|
|
|
|
|
console.log(`▶ models: exec=${ACTIVE_EXECUTOR} review=${ACTIVE_REVIEWER} overview=${ACTIVE_T3_DISABLED ? "disabled" : OVERVIEW_MODEL + (ACTIVE_OVERVIEW_CLOUD ? " (cloud)" : "")}`);
|
|
|
|
|
console.log(`▶ out: ${out_dir}\n`);
|
|
|
|
|
|
|
|
|
|
for (let i = 0; i < spec.events.length; i++) {
|
|
|
|
|
@ -1419,7 +1480,7 @@ async function main() {
|
|
|
|
|
// (city, role, count). Capped at 1 retry per event to keep the
|
|
|
|
|
// budget bounded and avoid infinite loops on genuinely-impossible
|
|
|
|
|
// scenarios.
|
|
|
|
|
if (!result.ok && RETRY_ON_FAIL && !T3_DISABLED) {
|
|
|
|
|
if (!result.ok && ACTIVE_RETRY_ON_FAIL && !ACTIVE_T3_DISABLED) {
|
|
|
|
|
console.log(` ▶ cloud rescue requested for ${event.at} ${event.kind}…`);
|
|
|
|
|
const rescue = await requestCloudRemediation(event, result, spec.contract);
|
|
|
|
|
if (rescue && rescue.remediation.retry) {
|
|
|
|
|
@ -1506,7 +1567,7 @@ async function main() {
|
|
|
|
|
// Option B — T3 checkpoint after every misplacement, and every N-th event.
|
|
|
|
|
const isLast = i === spec.events.length - 1;
|
|
|
|
|
const nthHit = T3_CHECKPOINT_EVERY > 0 && ((i + 1) % T3_CHECKPOINT_EVERY === 0);
|
|
|
|
|
const shouldCheckpoint = !T3_DISABLED && (event.kind === "misplacement" || nthHit || isLast);
|
|
|
|
|
const shouldCheckpoint = !ACTIVE_T3_DISABLED && (event.kind === "misplacement" || nthHit || isLast);
|
|
|
|
|
if (shouldCheckpoint) {
|
|
|
|
|
const cp = await runOverviewCheckpoint(event, result, ctx.results.slice(0, -1), spec.contract);
|
|
|
|
|
if (cp) {
|
|
|
|
|
@ -1527,7 +1588,7 @@ async function main() {
|
|
|
|
|
// Option A — T3 cross-day lesson. One final call distills the whole run.
|
|
|
|
|
// Saved to lesson.md and also seeded into playbook_memory so tomorrow's
|
|
|
|
|
// agent can retrieve it on similar setups.
|
|
|
|
|
if (!T3_DISABLED) {
|
|
|
|
|
if (!ACTIVE_T3_DISABLED) {
|
|
|
|
|
console.log(`\n▶ T3 cross-day lesson via ${OVERVIEW_MODEL}…`);
|
|
|
|
|
const tLesson = Date.now();
|
|
|
|
|
const lesson = await runCrossDayLesson(ctx, checkpoints);
|
|
|
|
|
@ -1564,7 +1625,7 @@ async function main() {
|
|
|
|
|
events_ok: ctx.results.filter(r => r.ok).length,
|
|
|
|
|
checkpoint_count: checkpoints.length,
|
|
|
|
|
model: OVERVIEW_MODEL,
|
|
|
|
|
cloud: OVERVIEW_CLOUD,
|
|
|
|
|
cloud: ACTIVE_OVERVIEW_CLOUD,
|
|
|
|
|
lesson: lesson.trim(),
|
|
|
|
|
checkpoints: checkpoints.map(c => ({ after: c.after_event, risk: c.risk, hint: c.hint })),
|
|
|
|
|
created_at: new Date().toISOString(),
|
|
|
|
|
@ -1589,17 +1650,17 @@ async function main() {
|
|
|
|
|
out_dir,
|
|
|
|
|
{ client: spec.client, date: spec.date, events: spec.events, staffer: spec.staffer },
|
|
|
|
|
{
|
|
|
|
|
executor: EXECUTOR_MODEL,
|
|
|
|
|
reviewer: REVIEWER_MODEL,
|
|
|
|
|
executor: ACTIVE_EXECUTOR,
|
|
|
|
|
reviewer: ACTIVE_REVIEWER,
|
|
|
|
|
overview: OVERVIEW_MODEL,
|
|
|
|
|
overview_cloud: OVERVIEW_CLOUD,
|
|
|
|
|
overview_cloud: ACTIVE_OVERVIEW_CLOUD,
|
|
|
|
|
},
|
|
|
|
|
elapsed,
|
|
|
|
|
);
|
|
|
|
|
console.log(`▶ KB indexed: sig=${sig_hash} (${elapsed.toFixed(1)}s)`);
|
|
|
|
|
const newRec = await recommendFor(spec, {
|
|
|
|
|
overview_model: OVERVIEW_MODEL,
|
|
|
|
|
cloud: OVERVIEW_CLOUD,
|
|
|
|
|
cloud: ACTIVE_OVERVIEW_CLOUD,
|
|
|
|
|
k: 5,
|
|
|
|
|
});
|
|
|
|
|
if (newRec) {
|
|
|
|
|
|