diff --git a/tests/real-world/hard_task_escalation.ts b/tests/real-world/hard_task_escalation.ts
new file mode 100644
index 0000000..7f69f1b
--- /dev/null
+++ b/tests/real-world/hard_task_escalation.ts
@@ -0,0 +1,267 @@
+// Hard-task escalation test. The task is deliberately constructed so
+// that a local 7B model (qwen3.5:latest) will miss at least one of the
+// validation rules. Watch the escalation ladder:
+//   1. qwen3.5:latest        (local 7B)   — likely fails
+//   2. qwen3:latest          (local 7B)   — likely fails differently
+//   3. gpt-oss:20b           (local 20B)  — may fail
+//   4. gpt-oss:120b          (cloud 120B) — should succeed
+//   5. devstral-2:123b       (cloud 123B, coding specialist)
+//   6. mistral-large-3:675b  (cloud 675B) — last rung; if it fails too, the run exits 1
+//
+// Each attempt:
+//   - Calls the model via /v1/chat
+//   - Validates the output against a strict rubric
+//   - On fail: records the specific rubric violations (the partial
+//     output is only written to disk) and injects the violations into
+//     every later attempt's prompt as "fix these exact issues"
+//   - On success: exit loop
+//
+// Run: bun run tests/real-world/hard_task_escalation.ts
+
+import { writeFile, mkdir } from "node:fs/promises";
+
+const GATEWAY = "http://localhost:3100"; // base URL; chat() POSTs to `${GATEWAY}/v1/chat`
+const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/hard_task_${Date.now().toString(36)}`; // per-run artifact dir; suffix is the load time in base 36
+const MAX_ATTEMPTS = 6; // upper bound on the escalation loop in main()
+
+// The hard task: a code-generation prompt with strict structural
+// constraints (not a knowledge question), specific enough that a
+// small model is expected to miss at least one rule.
+const TASK = `Write a complete Rust async function with the EXACT signature:
+
+    pub async fn check_drift_batched(refs: Vec<DocRef>) -> Result<Vec<String>, String>
+
+It must:
+1. Group refs by tool name (case-insensitive — use .to_ascii_lowercase())
+2. Issue parallel HTTP GET requests to http://localhost:3900/docs/{tool}/diff?since={snippet_hash}
+3. Use reqwest and a JoinSet/Semaphore to cap concurrent in-flight requests at 4
+4. On HTTP 5xx, retry with exponential backoff: sleep 250ms, then 500ms, then 1000ms, then give up on that tool
+5. Parse the response JSON: {"drifted": bool, ...}. Return a Vec<String> of tool names where drifted == true
+6. All errors bubble via ? or Result — NO .unwrap(), NO .expect(), NO panic!()
+7. Include rustdoc /// comments on the function and each internal helper
+
+Assume this struct is already imported:
+
+    pub struct DocRef { pub tool: String, pub snippet_hash: Option<String>, pub version_seen: String }
+
+Output ONLY the Rust code. No prose, no markdown fences, no explanation. Start directly with the /// doc comment.`;
+
+// Escalation ladder — small-local → larger-local → cloud → specialist
+// cloud → largest cloud model. Corrected 2026-04-22 per J:
+// gpt-oss:20b is LOCAL (ollama list confirms 13 GB on disk). The
+// final tier was meant to be kimi-k2:1t, but Kimi is not usable on
+// the current Ollama Cloud key — see the NOTE on the last entry.
+const LADDER: Array<{ provider: "ollama" | "ollama_cloud"; model: string; note: string }> = [
+  { provider: "ollama",       model: "qwen3.5:latest", note: "local 7B" },
+  { provider: "ollama",       model: "qwen3:latest",   note: "local 7B (different) " },
+  { provider: "ollama",       model: "gpt-oss:20b",    note: "local 20B" },          // FIXED: local, not cloud
+  { provider: "ollama_cloud", model: "gpt-oss:120b",   note: "cloud 120B" },
+  { provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B (coding specialist)" },
+  // NOTE 2026-04-22 — J wanted Kimi as the last escalation but Kimi
+  // K2.5/K2.6 both return "this model requires a subscription" on our
+  // current Ollama Cloud key. mistral-large-3:675b is the biggest
+  // model actually provisioned on this key (verified via direct curl
+  // to ollama.com/api/generate). Upgrade path: Ollama Cloud Pro →
+  // swap this line to kimi-k2.5 or kimi-k2.6:cloud.
+  { provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B (biggest available on current key; kimi-k2.x needs pro subscription)" },
+];
+
+// Validation rubric — the answer must pass all of these to be accepted.
+interface RubricResult {
+  passed: boolean;          // validate() sets this to true only when violations is empty
+  violations: string[];     // names of the rules that failed
+  passed_rules: string[];   // names of the rules that held
+}
+
+// Scores generated Rust against the rubric. Rules that require something see the
+// full text; forbidden-token rules see comment-stripped text, so a rustdoc line that
+// merely mentions `.unwrap()` is not a violation. `passed` means zero violations.
+function validate(code: string): RubricResult {
+  const violations: string[] = [];
+  const passed: string[] = [];
+  const check = (rule: string, ok: boolean) => { (ok ? passed : violations).push(rule); };
+
+  // Drop /* */ and // comments. A `//` directly after ':' is kept so a URL literal
+  // ("http://...") cannot hide forbidden calls that follow it on the same line.
+  const bare = code.replace(/\/\*[\s\S]*?\*\//g, "").replace(/(^|[^:])\/\/.*$/gm, "$1");
+  // Semaphore::new(NAME) also counts as a bound of 4 when NAME is declared `= 4;`.
+  const semArg = /Semaphore\s*::\s*new\s*\(\s*([A-Za-z_]\w*)\s*\)/.exec(code)?.[1];
+  const namedFour = semArg !== undefined
+    && new RegExp(`\\b(?:const|static|let)\\s+(?:mut\\s+)?${semArg}\\s*(?::\\s*\\w+\\s*)?=\\s*4\\s*;`).test(code);
+
+  check("has pub async fn check_drift_batched signature",
+    /pub\s+async\s+fn\s+check_drift_batched\s*\(/.test(code));
+  check("takes Vec<DocRef> argument",
+    /refs\s*:\s*Vec\s*<\s*DocRef\s*>/.test(code));
+  check("returns Result<Vec<String>, String>",
+    /Result\s*<\s*Vec\s*<\s*String\s*>\s*,\s*String\s*>/.test(code));
+  check("uses reqwest",
+    /\breqwest\b/i.test(code));
+  check("references JoinSet or Semaphore for concurrency",
+    /\bJoinSet\b|\bSemaphore\b/i.test(code));
+  check("bounds concurrency at 4",
+    namedFour || /(?:Semaphore\s*::\s*new\s*\(|(?:permits|limit|max|capacity)\s*:)\s*4\b/.test(code));
+  // Exponential backoff: accept any recognizable doubling idiom seeded at 250ms
+  // (e.g. `retry_delay *= 2`), not only literal 500/1000 sleeps.
+  const hasSeed250 = /millis\s*\(\s*250\b/.test(code); // also covers Duration::from_millis(250)
+  const hasDoublingPattern = /250\s*\*\s*2/.test(code)         // 250 * 2^n literal
+                          || /<<\s*\d+/.test(code)              // bit-shift
+                          || /\.pow\s*\(/.test(code)            // 2u32.pow(attempt)
+                          || /\*=\s*2\b/.test(code)             // delay *= 2
+                          || /\*\s*2\s*;/.test(code)            // delay = delay * 2;
+                          || /saturating_mul\s*\(\s*2\b/.test(code); // saturating doubling
+  check("has 250ms backoff seed", hasSeed250);
+  check("reaches 500ms backoff (literal or doubling from 250)",
+    /millis\s*\(\s*500\b/.test(code) || (hasSeed250 && hasDoublingPattern));
+  check("reaches 1000ms backoff (literal or doubling to 1000)",
+    /millis\s*\(\s*1000\b/.test(code) || (hasSeed250 && hasDoublingPattern));
+  check("case-insensitive tool grouping (to_ascii_lowercase)",
+    /to_ascii_lowercase|to_lowercase/.test(code));
+  check("NO .unwrap() — all errors bubble via ?",
+    !/\.unwrap\s*\(\s*\)/.test(bare));
+  check("NO .expect(...) — all errors bubble via ?",
+    !/\.expect\s*\(/.test(bare));
+  check("NO panic!() / unimplemented!() / todo!()",
+    !/\bpanic!\s*\(|\bunimplemented!\s*\(|\btodo!\s*\(/.test(bare));
+  check("has rustdoc /// comments", /\/\/\//.test(code));
+  check("reasonable length (> 500 chars)",
+    code.length > 500);
+
+  return { passed: violations.length === 0, violations, passed_rules: passed };
+}
+
+function log(msg: string) { console.log("[hard] " + msg); }
+
+// POSTs one single-turn prompt to the gateway. Never rejects: HTTP errors, timeouts,
+// thrown non-Error values and non-string content all come back as { content: "", error }.
+async function chat(opts: {
+  provider: "ollama" | "ollama_cloud",
+  model: string,
+  prompt: string,
+}): Promise<{ content: string; error?: string }> {
+  try {
+    const r = await fetch(`${GATEWAY}/v1/chat`, {
+      method: "POST", headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        provider: opts.provider, model: opts.model,
+        messages: [{ role: "user", content: opts.prompt }],
+        max_tokens: 2500, temperature: 0.2, think: false,
+      }),
+      signal: AbortSignal.timeout(240000),
+    });
+    if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 300)}` };
+    const j: any = await r.json();
+    const content: unknown = j.choices?.[0]?.message?.content ?? "";
+    return typeof content === "string" ? { content } : { content: "", error: `non-string message content (${typeof content})` };
+  } catch (e) {
+    return { content: "", error: e instanceof Error ? e.message : String(e) };
+  }
+}
+
+interface AttemptRecord {
+  n: number;                   // 1-based attempt number
+  provider: string;            // provider of the ladder rung used
+  model: string;               // model of the ladder rung used
+  duration_ms: number;         // wall-clock time spent in chat()
+  content_chars: number;       // length of the extracted code, not of the raw reply
+  error: string | null;        // chat() error, or null when none was reported
+  rubric_violations: string[]; // rule names that failed
+  rubric_passed: string[];     // rule names that held
+  accepted: boolean;           // true when the rubric passed
+}
+
+function extractCode(raw: string): string {
+  // Unwrap the first fenced block. Any info string is accepted (```rs, ```Rust, …) and a
+  // missing closing fence (reply cut off at max_tokens) falls back to end of input.
+  const m = raw.match(/```[^\n`]*\n([\s\S]*?)(?:```|$)/);
+  return (m ? m[1] : raw).trim();
+}
+
+async function main() { // drives the escalation loop, writes per-attempt artifacts and summary.json, exits 0/1
+  await mkdir(OUT_DIR, { recursive: true });
+  log(`output: ${OUT_DIR}`);
+  log(`task: ${TASK.slice(0, 120)}...`);
+  log("");
+
+  const attempts: AttemptRecord[] = [];
+  let acceptedCode: string | null = null; // stays null unless some attempt passes every rule
+
+  for (let i = 0; i < MAX_ATTEMPTS; i++) {
+    const n = i + 1; // 1-based attempt number used in logs and artifact names
+    const rung = LADDER[i] ?? LADDER[LADDER.length - 1]; // past the end of LADDER, reuse the last rung
+
+    // Build the prompt: base task + every prior attempt's violations (and call error, if any)
+    let priorLearning = "";
+    if (attempts.length > 0) {
+      priorLearning = `\n\n═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══\n`;
+      for (const a of attempts) {
+        priorLearning += `Attempt ${a.n} (${a.provider}/${a.model}, ${a.content_chars} chars) violations:\n`;
+        for (const v of a.rubric_violations) priorLearning += `  - ${v}\n`;
+        if (a.error) priorLearning += `  [error: ${a.error.slice(0, 120)}]\n`;
+      }
+      priorLearning += `═══ end prior attempts ═══\n\nDO NOT repeat the above violations. Address each one explicitly.`;
+    }
+
+    log(`attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${priorLearning ? " [w/ learning]" : ""}`);
+    const t0 = Date.now();
+    const r = await chat({ provider: rung.provider, model: rung.model, prompt: TASK + priorLearning });
+    const dur = Date.now() - t0; // wall-clock time of the chat() call only
+
+    const code = extractCode(r.content);
+    const rubric = code ? validate(code) : { passed: false, violations: ["empty response"], passed_rules: [] }; // an empty extraction skips the rubric
+
+    const record: AttemptRecord = {
+      n,
+      provider: rung.provider,
+      model: rung.model,
+      duration_ms: dur,
+      content_chars: code.length,
+      error: r.error ?? null,
+      rubric_violations: rubric.violations,
+      rubric_passed: rubric.passed_rules,
+      accepted: rubric.passed,
+    };
+    attempts.push(record);
+
+    log(`  → ${dur}ms, ${code.length} chars, ${rubric.passed_rules.length} rules passed / ${rubric.violations.length} failed${r.error ? `, err: ${r.error.slice(0, 80)}` : ""}`);
+    for (const v of rubric.violations.slice(0, 5)) log(`       ✗ ${v}`); // console shows at most 5; the JSON record keeps all
+
+    await writeFile(`${OUT_DIR}/attempt_${n}.txt`, code); // the extracted code, not the raw reply
+    await writeFile(`${OUT_DIR}/attempt_${n}.json`, JSON.stringify(record, null, 2));
+
+    if (rubric.passed) {
+      log(`  ✅ ACCEPTED on attempt ${n}`);
+      acceptedCode = code;
+      break; // first passing attempt ends the escalation
+    }
+  }
+
+  const summary = {
+    task: TASK.slice(0, 200),
+    total_attempts: attempts.length,
+    accepted: acceptedCode !== null,
+    accepted_on_attempt: acceptedCode ? attempts.findIndex(a => a.accepted) + 1 : null,
+    escalation_path: attempts.map(a => `${a.provider}/${a.model}`),
+    per_attempt_pass_counts: attempts.map(a => a.rubric_passed.length),
+    per_attempt_violation_counts: attempts.map(a => a.rubric_violations.length),
+    total_duration_ms: attempts.reduce((s, a) => s + a.duration_ms, 0), // sum of chat() times; excludes file writes
+  };
+  await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));
+
+  log("");
+  log(`═══ RESULT ═══`);
+  log(`attempts: ${summary.total_attempts}`);
+  log(`accepted: ${summary.accepted} ${summary.accepted ? `on attempt ${summary.accepted_on_attempt}` : ""}`);
+  log(`escalation path:`);
+  for (const [i, a] of attempts.entries()) {
+    const mark = a.accepted ? "✅" : "❌";
+    log(`  ${mark} attempt ${i + 1}: ${a.provider}/${a.model} — ${a.rubric_passed.length}/${a.rubric_passed.length + a.rubric_violations.length} rules passed, ${a.duration_ms}ms`);
+  }
+  log("");
+  log(`total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
+  log(`artifacts: ${OUT_DIR}/{attempt_1..N.{txt,json}, summary.json}`);
+
+  process.exit(summary.accepted ? 0 : 1); // 0 = an attempt was accepted, 1 = none passed
+}
+
+main().catch(e => { console.error("[hard] fatal:", e); process.exit(2); }); // 2 = harness itself threw (distinct from 1 = no attempt accepted)