Architectural snapshot of the lakehouse codebase at the point where the
full matrix-driven agent loop with Mem0 versioning + deletion was
validated end-to-end.
WHAT THIS REPO IS
A clean single-commit snapshot of the lakehouse code. Heavy test data
(.parquet datasets, vector indexes) excluded — see REPLICATION.md for
regen path. Full lakehouse history at git.agentview.dev/profit/lakehouse.
WHAT WAS PROVEN
- Vector retrieval across the multi-corpus matrix (chicago_permits + entity
  briefs + sec_tickers + distilled procedural + llm_team runs)
- Observer hand-review (cloud + heuristic fallback) gating each candidate
- Local-model agent loop (qwen3.5:latest) with tool use + scratchpad
- Playbook seal on success → next-iter retrieval surfaces it as preamble
- Mem0 versioning + deletion in pathway_memory (op surface sketched below):
* UPSERT: ADD on new workflow, UPDATE bumps replay_count on identical
* REVISE: chains versions, parent.superseded_at + superseded_by stamped
* RETIRE: marks specific trace retired with reason, excluded from retrieval
* HISTORY: walks chain root→tip, cycle-safe
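  Illustrative op surface (TypeScript sketch; type and function names are
  inferred from the list above, NOT verbatim from pathway_memory.rs, which
  is Rust):

    interface PathwayTrace {
      id: string;
      workflow_hash: string;
      replay_count: number;             // bumped by UPSERT on identical workflow
      superseded_at: string | null;     // stamped on the parent by REVISE
      superseded_by: string | null;     // id of the revising version
      retired: { at: string; reason: string } | null;  // set by RETIRE
    }
    // upsert(trace): ADD on new workflow, UPDATE bumps replay_count on identical
    // revise(id, next): chains a new version, stamps parent superseded_*
    // retire(id, reason): marks the trace retired, excluded from retrieval
    // history(id): walks the version chain root→tip, cycle-safe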
KEY DIRECTORIES
- crates/vectord/src/pathway_memory.rs — Mem0 ops live here
- crates/vectord/src/playbook_memory.rs — original Mem0 reference
- tests/agent_test/ — local-model agent harness + PRD + session archives
- scripts/dump_raw_corpus.sh — MinIO bucket dump (raw test corpus)
- scripts/vectorize_raw_corpus.ts — corpus → vector indexes
- scripts/analyze_chicago_contracts.ts — real inference pipeline
- scripts/seal_agent_playbook.ts — Mem0 upsert from agent traces
Replication: see REPLICATION.md for Debian 13 clean install + cloud-only
adaptation (no local Ollama).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
// Hard-task escalation test. The task is deliberately constructed so
// that a local 7B model (qwen3.5:latest) will miss at least one of the
// validation rules. Watch the escalation ladder:
//   1. qwen3.5:latest (local 7B) — likely fails
//   2. qwen3:latest (local 7B) — likely fails differently
//   3. gpt-oss:20b (local 20B) — may fail
//   4. gpt-oss:120b (cloud 120B) — should succeed
//   5. devstral-2:123b (cloud 123B coding specialist) — fallback
//   6. mistral-large-3:675b (cloud 675B) — absolute last ditch
//
// Each attempt:
//   - Calls the model via /v1/chat
//   - Validates the output against a strict rubric
//   - On fail: records the specific rubric violations + the partial
//     output, injects both into the next attempt's prompt as "here's
//     what's wrong, fix it specifically"
//   - On success: exit loop
//
// Run: bun run tests/real-world/hard_task_escalation.ts

import { writeFile, mkdir } from "node:fs/promises";

const GATEWAY = "http://localhost:3100";
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/hard_task_${Date.now().toString(36)}`;
const MAX_ATTEMPTS = 6;

// The hard task. Specific enough that a small model will miss at
// least one rule. Not purely knowledge-based — it's a code-generation
// task with strict structural constraints. (An illustrative passing
// shape is sketched after the task string below.)
const TASK = `Write a complete Rust async function with the EXACT signature:

pub async fn check_drift_batched(refs: Vec<DocRef>) -> Result<Vec<String>, String>

It must:
1. Group refs by tool name (case-insensitive — use .to_ascii_lowercase())
2. Issue parallel HTTP GET requests to http://localhost:3900/docs/{tool}/diff?since={snippet_hash}
3. Use reqwest and a JoinSet/Semaphore to cap concurrent in-flight requests at 4
4. On HTTP 5xx, retry with exponential backoff: sleep 250ms, then 500ms, then 1000ms, then give up on that tool
5. Parse the response JSON: {"drifted": bool, ...}. Return a Vec<String> of tool names where drifted == true
6. All errors bubble via ? or Result — NO .unwrap(), NO .expect(), NO panic!()
7. Include rustdoc /// comments on the function and each internal helper

Assume this struct is already imported:

pub struct DocRef { pub tool: String, pub snippet_hash: Option<String>, pub version_seen: String }

Output ONLY the Rust code. No prose, no markdown fences, no explanation. Start directly with the /// doc comment.`;

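// For orientation, one shape of a passing answer (illustrative sketch in
// comments only, never compiled or executed by this harness; tokio and
// reqwest assumed):
//
//   /// Checks each tool's docs for drift; at most 4 requests in flight.
//   pub async fn check_drift_batched(refs: Vec<DocRef>) -> Result<Vec<String>, String> {
//       let sem = std::sync::Arc::new(tokio::sync::Semaphore::new(4));
//       let mut tasks = tokio::task::JoinSet::new();
//       // group refs by ref.tool.to_ascii_lowercase(), spawn one task per
//       // tool behind a semaphore permit; on HTTP 5xx sleep 250/500/1000 ms
//       // (Duration::from_millis) before retrying, then give up on that
//       // tool; map every error into String via `?`; collect the tool
//       // names whose response JSON has drifted == true
//       ...
//   }
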
// Escalation ladder — small-local → large-local → cloud → specialist
// cloud → trillion-param cloud. Corrected 2026-04-22 per J:
// gpt-oss:20b is LOCAL (ollama list confirms 13 GB on disk), and the
// final escalation tier should be kimi-k2:1t (the biggest model
// we have access to on Ollama Cloud).
const LADDER: Array<{ provider: "ollama" | "ollama_cloud"; model: string; note: string }> = [
  { provider: "ollama", model: "qwen3.5:latest", note: "local 7B" },
  { provider: "ollama", model: "qwen3:latest", note: "local 7B (different)" },
  { provider: "ollama", model: "gpt-oss:20b", note: "local 20B" }, // FIXED: local, not cloud
  { provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B" },
  { provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B (coding specialist)" },
  // NOTE 2026-04-22 — J wanted Kimi as the last escalation but Kimi
  // K2.5/K2.6 both return "this model requires a subscription" on our
  // current Ollama Cloud key. mistral-large-3:675b is the biggest
  // model actually provisioned on this key (verified via direct curl
  // to ollama.com/api/generate). Upgrade path: Ollama Cloud Pro →
  // swap this line to kimi-k2.5 or kimi-k2.6:cloud.
  { provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B (biggest available on current key; kimi-k2.x needs pro subscription)" },
];

// Validation rubric — the answer must pass all of these to be accepted.
interface RubricResult {
  passed: boolean;
  violations: string[];
  passed_rules: string[];
}

function validate(code: string): RubricResult {
  const violations: string[] = [];
  const passed: string[] = [];

  const check = (rule: string, ok: boolean) => { ok ? passed.push(rule) : violations.push(rule); };

  check("has pub async fn check_drift_batched signature",
    /pub\s+async\s+fn\s+check_drift_batched\s*\(/.test(code));
  check("takes Vec<DocRef> argument",
    /refs\s*:\s*Vec\s*<\s*DocRef\s*>/.test(code));
  check("returns Result<Vec<String>, String>",
    /Result\s*<\s*Vec\s*<\s*String\s*>\s*,\s*String\s*>/.test(code));
  check("uses reqwest",
    /\breqwest\b/i.test(code));
  check("references JoinSet or Semaphore for concurrency",
    /\bJoinSet\b|\bSemaphore\b/i.test(code));
  check("bounds concurrency at 4",
    /\b4\b/.test(code) && (/Semaphore\s*::\s*new\s*\(\s*4\b/.test(code) || /permits\s*:\s*4\b/.test(code) || /limit\s*:\s*4\b/.test(code) || /max\s*:\s*4\b/.test(code) || /capacity\s*:\s*4\b/.test(code)));
  // Exponential backoff — models express this several ways. Accept
  // any recognizable doubling pattern starting at 250ms. 2026-04-22:
  // devstral-2:123b wrote `retry_delay *= 2` which my earlier regex
  // rejected even though the code is correct. Broadening rubric to
  // match all idiomatic doubling forms.
  const hasSeed250 = /Duration\s*::\s*from_millis\s*\(\s*250\b/.test(code)
    || /millis\s*\(\s*250\b/.test(code);
  const hasDoublingPattern = /250\s*\*\s*2/.test(code)        // 250 * 2^n literal
    || /<<\s*\d+/.test(code)                                  // bit-shift
    || /\.pow\s*\(/.test(code)                                // 2u32.pow(attempt)
    || /\*=\s*2\b/.test(code)                                 // delay *= 2  ← was missing
    || /\*\s*2\s*;/.test(code)                                // delay = delay * 2;
    || /saturating_mul\s*\(\s*2\b/.test(code);                // saturating doubling
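  // Spot-check of the broadened doubling patterns (illustrative strings,
  // not from any real model output):
  //   "delay *= 2;"                       → /\*=\s*2\b/
  //   "let d = 250 * 2u64.pow(attempt);"  → /250\s*\*\s*2/ and /\.pow\s*\(/
  //   "backoff = backoff * 2;"            → /\*\s*2\s*;/
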
check("has 250ms backoff seed",
|
|
hasSeed250);
|
|
check("reaches 500ms backoff (literal or doubling from 250)",
|
|
/Duration\s*::\s*from_millis\s*\(\s*500\b/.test(code)
|
|
|| /millis\s*\(\s*500\b/.test(code)
|
|
|| (hasSeed250 && hasDoublingPattern));
|
|
check("reaches 1000ms backoff (literal or doubling to 1000)",
|
|
/Duration\s*::\s*from_millis\s*\(\s*1000\b/.test(code)
|
|
|| /millis\s*\(\s*1000\b/.test(code)
|
|
|| (hasSeed250 && hasDoublingPattern));
|
|
check("case-insensitive tool grouping (to_ascii_lowercase)",
|
|
/to_ascii_lowercase|to_lowercase/.test(code));
|
|
check("NO .unwrap() — all errors bubble via ?",
|
|
!/\.unwrap\s*\(\s*\)/.test(code));
|
|
check("NO .expect(...) — all errors bubble via ?",
|
|
!/\.expect\s*\(/.test(code));
|
|
check("NO panic!() / unimplemented!() / todo!()",
|
|
!/\bpanic!\s*\(|\bunimplemented!\s*\(|\btodo!\s*\(/.test(code));
|
|
check("has rustdoc /// comments",
|
|
/\/\/\//.test(code));
|
|
check("reasonable length (> 500 chars)",
|
|
code.length > 500);
|
|
|
|
return { passed: violations.length === 0, violations, passed_rules: passed };
|
|
}
|
|
|
|
function log(msg: string) { console.log(`[hard] ${msg}`); }

async function chat(opts: {
  provider: "ollama" | "ollama_cloud",
  model: string,
  prompt: string,
}): Promise<{ content: string; error?: string }> {
  try {
    const r = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST", headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: opts.provider,
        model: opts.model,
        messages: [{ role: "user", content: opts.prompt }],
        max_tokens: 2500,
        temperature: 0.2,
        think: false,
      }),
      signal: AbortSignal.timeout(240000),
    });
    if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 300)}` };
    const j: any = await r.json();
    return { content: j.choices?.[0]?.message?.content ?? "" };
  } catch (e) {
    return { content: "", error: (e as Error).message };
  }
}

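// Response contract assumed by the parsing above: an OpenAI-style body,
// { choices: [{ message: { content } }] }. Anything else yields content === "",
// which the loop in main() scores as the single violation "empty response".
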
interface AttemptRecord {
  n: number;
  provider: string;
  model: string;
  duration_ms: number;
  content_chars: number;
  error: string | null;
  rubric_violations: string[];
  rubric_passed: string[];
  accepted: boolean;
}

function extractCode(raw: string): string {
  // Strip common fence wrappers
  const m = raw.match(/```(?:rust)?\s*\n([\s\S]*?)```/);
  if (m) return m[1].trim();
  return raw.trim();
}

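// e.g. extractCode("```rust\nfn main() {}\n```") → "fn main() {}"; unfenced
// output passes through trimmed. (Illustrative input, not from a real run.)
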
async function main() {
  await mkdir(OUT_DIR, { recursive: true });
  log(`output: ${OUT_DIR}`);
  log(`task: ${TASK.slice(0, 120)}...`);
  log("");

  const attempts: AttemptRecord[] = [];
  let acceptedCode: string | null = null;

  for (let i = 0; i < MAX_ATTEMPTS; i++) {
    const n = i + 1;
    const rung = LADDER[i] ?? LADDER[LADDER.length - 1];

    // Build the prompt: base task + prior failures' learning blocks
    let priorLearning = "";
    if (attempts.length > 0) {
      priorLearning = `\n\n═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══\n`;
      for (const a of attempts) {
        priorLearning += `Attempt ${a.n} (${a.provider}/${a.model}, ${a.content_chars} chars) violations:\n`;
        for (const v of a.rubric_violations) priorLearning += ` - ${v}\n`;
        if (a.error) priorLearning += ` [error: ${a.error.slice(0, 120)}]\n`;
      }
      priorLearning += `═══ end prior attempts ═══\n\nDO NOT repeat the above violations. Address each one explicitly.`;
    }
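
    // Rendered shape of the learning block after one failed attempt
    // (placeholder values, not from a real run):
    //   ═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══
    //   Attempt 1 (ollama/qwen3.5:latest, N chars) violations:
    //    - NO .unwrap() — all errors bubble via ?
    //   ═══ end prior attempts ═══
    //   DO NOT repeat the above violations. Address each one explicitly.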

    log(`attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${priorLearning ? " [w/ learning]" : ""}`);
    const t0 = Date.now();
    const r = await chat({ provider: rung.provider, model: rung.model, prompt: TASK + priorLearning });
    const dur = Date.now() - t0;

    const code = extractCode(r.content);
    const rubric = code ? validate(code) : { passed: false, violations: ["empty response"], passed_rules: [] };

    const record: AttemptRecord = {
      n,
      provider: rung.provider,
      model: rung.model,
      duration_ms: dur,
      content_chars: code.length,
      error: r.error ?? null,
      rubric_violations: rubric.violations,
      rubric_passed: rubric.passed_rules,
      accepted: rubric.passed,
    };
    attempts.push(record);

    log(` → ${dur}ms, ${code.length} chars, ${rubric.passed_rules.length} rules passed / ${rubric.violations.length} failed${r.error ? `, err: ${r.error.slice(0, 80)}` : ""}`);
    for (const v of rubric.violations.slice(0, 5)) log(` ✗ ${v}`);

    await writeFile(`${OUT_DIR}/attempt_${n}.txt`, code);
    await writeFile(`${OUT_DIR}/attempt_${n}.json`, JSON.stringify(record, null, 2));

    if (rubric.passed) {
      log(` ✅ ACCEPTED on attempt ${n}`);
      acceptedCode = code;
      break;
    }
  }

  const summary = {
    task: TASK.slice(0, 200),
    total_attempts: attempts.length,
    accepted: acceptedCode !== null,
    accepted_on_attempt: acceptedCode ? attempts.findIndex(a => a.accepted) + 1 : null,
    escalation_path: attempts.map(a => `${a.provider}/${a.model}`),
    per_attempt_pass_counts: attempts.map(a => a.rubric_passed.length),
    per_attempt_violation_counts: attempts.map(a => a.rubric_violations.length),
    total_duration_ms: attempts.reduce((s, a) => s + a.duration_ms, 0),
  };
  await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));

  log("");
  log(`═══ RESULT ═══`);
  log(`attempts: ${summary.total_attempts}`);
  log(`accepted: ${summary.accepted} ${summary.accepted ? `on attempt ${summary.accepted_on_attempt}` : ""}`);
  log(`escalation path:`);
  for (const [i, a] of attempts.entries()) {
    const mark = a.accepted ? "✅" : "❌";
    log(` ${mark} attempt ${i + 1}: ${a.provider}/${a.model} — ${a.rubric_passed.length}/${a.rubric_passed.length + a.rubric_violations.length} rules passed, ${a.duration_ms}ms`);
  }
  log("");
  log(`total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
  log(`artifacts: ${OUT_DIR}/{attempt_1..N.{txt,json}, summary.json}`);

  process.exit(summary.accepted ? 0 : 1);
}

main().catch(e => { console.error("[hard] fatal:", e); process.exit(2); });