// Hard-task escalation test. The task is deliberately constructed so
// that a local 7B model (qwen3.5:latest) will miss at least one of the
// validation rules. Watch the escalation ladder (see LADDER below):
//   1. qwen3.5:latest        (local 7B)   — likely fails
//   2. qwen3:latest          (local 7B)   — likely fails differently
//   3. gpt-oss:20b           (local 20B)
//   4. gpt-oss:120b          (cloud 120B)
//   5. devstral-2:123b       (cloud 123B, coding specialist)
//   6. mistral-large-3:675b  (cloud 675B, biggest model on the current key)
//
// Each attempt:
//   - Calls the model via /v1/chat
//   - Validates the output against a strict rubric
//   - On fail: writes the extracted output and the attempt record to
//     OUT_DIR, and injects the rubric violations (plus any transport
//     error) of ALL prior attempts into the next attempt's prompt as
//     "here's what's wrong, fix it specifically"
//   - On success: exit loop
//
// Exit code: 0 when an attempt was accepted, 1 when every attempt
// failed the rubric, 2 on an unexpected fatal error.
//
// Run: bun run tests/real-world/hard_task_escalation.ts

import { writeFile, mkdir } from "node:fs/promises";

const GATEWAY = "http://localhost:3100";
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/hard_task_${Date.now().toString(36)}`;
const MAX_ATTEMPTS = 6;

// The hard task. Specific enough that a small model will miss at
// least one rule. Not purely knowledge-based — it's a code-generation
// task with strict structural constraints.
//
// The generic arguments in the signature mirror what validate() checks
// for (Vec<DocRef>, Result<Vec<String>, String>), so the prompt and the
// rubric ask for the same thing.
// NOTE(review): `snippet_hash: Option<String>` — the type argument is
// assumed from its use in the `since=` query parameter; confirm against
// the real DocRef definition.
const TASK = `Write a complete Rust async function with the EXACT signature:

  pub async fn check_drift_batched(refs: Vec<DocRef>) -> Result<Vec<String>, String>

It must:
  1. Group refs by tool name (case-insensitive — use .to_ascii_lowercase())
  2. Issue parallel HTTP GET requests to http://localhost:3900/docs/{tool}/diff?since={snippet_hash}
  3. Use reqwest and a JoinSet/Semaphore to cap concurrent in-flight requests at 4
  4. On HTTP 5xx, retry with exponential backoff: sleep 250ms, then 500ms, then 1000ms, then give up on that tool
  5. Parse the response JSON: {"drifted": bool, ...}. Return a Vec<String> of tool names where drifted == true
  6. All errors bubble via ? or Result — NO .unwrap(), NO .expect(), NO panic!()
  7. Include rustdoc /// comments on the function and each internal helper

Assume this struct is already imported:
  pub struct DocRef { pub tool: String, pub snippet_hash: Option<String>, pub version_seen: String }

Output ONLY the Rust code. No prose, no markdown fences, no explanation. Start directly with the /// doc comment.`;

/** Gateway provider identifiers accepted by /v1/chat in this script. */
type Provider = "ollama" | "ollama_cloud";

// Escalation ladder — small-local → large-local → cloud → specialist
// cloud → biggest available cloud model. Corrected 2026-04-22 per J:
// gpt-oss:20b is LOCAL (ollama list confirms 13 GB on disk), and the
// final escalation tier should be kimi-k2:1t (the biggest model
// we have access to on Ollama Cloud) once the key allows it.
const LADDER: Array<{ provider: Provider; model: string; note: string }> = [
  { provider: "ollama", model: "qwen3.5:latest", note: "local 7B" },
  { provider: "ollama", model: "qwen3:latest", note: "local 7B (different) " },
  { provider: "ollama", model: "gpt-oss:20b", note: "local 20B" }, // FIXED: local, not cloud
  { provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B" },
  { provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B (coding specialist)" },
  // NOTE 2026-04-22 — J wanted Kimi as the last escalation but Kimi
  // K2.5/K2.6 both return "this model requires a subscription" on our
  // current Ollama Cloud key. mistral-large-3:675b is the biggest
  // model actually provisioned on this key (verified via direct curl
  // to ollama.com/api/generate). Upgrade path: Ollama Cloud Pro →
  // swap this line to kimi-k2.5 or kimi-k2.6:cloud.
  {
    provider: "ollama_cloud",
    model: "mistral-large-3:675b",
    note: "cloud 675B (biggest available on current key; kimi-k2.x needs pro subscription)",
  },
];

/** Outcome of running the validation rubric over one model answer. */
interface RubricResult {
  /** True only when no rule was violated. */
  passed: boolean;
  /** Labels of the rules the answer failed. */
  violations: string[];
  /** Labels of the rules the answer satisfied. */
  passed_rules: string[];
}

/**
 * Validation rubric — the answer must pass all of these to be accepted.
 *
 * Every rule is a regex (or length) heuristic over the raw Rust text;
 * nothing is compiled or executed. Rule labels are injected verbatim
 * into follow-up prompts, so they name the exact types expected.
 *
 * @param code Rust source as returned by extractCode().
 * @returns Which rules passed, which failed, and the overall verdict.
 */
function validate(code: string): RubricResult {
  const violations: string[] = [];
  const passedRules: string[] = [];
  const check = (rule: string, ok: boolean): void => {
    if (ok) passedRules.push(rule);
    else violations.push(rule);
  };

  check("has pub async fn check_drift_batched signature", /pub\s+async\s+fn\s+check_drift_batched\s*\(/.test(code));
  check("takes Vec<DocRef> argument", /refs\s*:\s*Vec\s*<\s*DocRef\s*>/.test(code));
  check("returns Result<Vec<String>, String>", /Result\s*<\s*Vec\s*<\s*String\s*>\s*,\s*String\s*>/.test(code));
  check("uses reqwest", /\breqwest\b/i.test(code));
  check("references JoinSet or Semaphore for concurrency", /\bJoinSet\b|\bSemaphore\b/i.test(code));

  // Concurrency bound of 4: either a literal at the construction site
  // or a named binding that is passed to Semaphore::new, e.g.
  //   const MAX_IN_FLIGHT: usize = 4; ... Semaphore::new(MAX_IN_FLIGHT)
  // The identifier is captured as [A-Za-z_]\w*, so it is safe to splice
  // into a RegExp without escaping.
  const boundViaLiteral =
    /Semaphore\s*::\s*new\s*\(\s*4\b/.test(code) ||
    /permits\s*:\s*4\b/.test(code) ||
    /limit\s*:\s*4\b/.test(code) ||
    /max\s*:\s*4\b/.test(code) ||
    /capacity\s*:\s*4\b/.test(code);
  const semaphoreArg = /Semaphore\s*::\s*new\s*\(\s*([A-Za-z_]\w*)\s*\)/.exec(code)?.[1];
  const boundViaNamedValue =
    semaphoreArg !== undefined &&
    new RegExp(`\\b${semaphoreArg}\\s*(?::\\s*[\\w:]+\\s*)?=\\s*4\\b`).test(code);
  check("bounds concurrency at 4", boundViaLiteral || boundViaNamedValue);

  // Exponential backoff — models express this several ways. Accept
  // any recognizable doubling pattern starting at 250ms. 2026-04-22:
  // devstral-2:123b wrote `retry_delay *= 2` which an earlier regex
  // rejected even though the code is correct. Broadened to match all
  // idiomatic doubling forms.
  const hasSeed250 =
    /Duration\s*::\s*from_millis\s*\(\s*250\b/.test(code) || /millis\s*\(\s*250\b/.test(code);
  const hasDoublingPattern =
    /250\s*\*\s*2/.test(code) || // 250 * 2^n literal
    /<<\s*\d+/.test(code) || // bit-shift
    /\.pow\s*\(/.test(code) || // 2u32.pow(attempt)
    /\*=\s*2\b/.test(code) || // delay *= 2
    /\*\s*2\s*;/.test(code) || // delay = delay * 2;
    /saturating_mul\s*\(\s*2\b/.test(code); // saturating doubling
  const doublesFromSeed = hasSeed250 && hasDoublingPattern;

  check("has 250ms backoff seed", hasSeed250);
  check(
    "reaches 500ms backoff (literal or doubling from 250)",
    /Duration\s*::\s*from_millis\s*\(\s*500\b/.test(code) || /millis\s*\(\s*500\b/.test(code) || doublesFromSeed,
  );
  check(
    "reaches 1000ms backoff (literal or doubling to 1000)",
    /Duration\s*::\s*from_millis\s*\(\s*1000\b/.test(code) || /millis\s*\(\s*1000\b/.test(code) || doublesFromSeed,
  );
  check("case-insensitive tool grouping (to_ascii_lowercase)", /to_ascii_lowercase|to_lowercase/.test(code));
  check("NO .unwrap() — all errors bubble via ?", !/\.unwrap\s*\(\s*\)/.test(code));
  check("NO .expect(...) — all errors bubble via ?", !/\.expect\s*\(/.test(code));
  check("NO panic!() / unimplemented!() / todo!()", !/\bpanic!\s*\(|\bunimplemented!\s*\(|\btodo!\s*\(/.test(code));
  check("has rustdoc /// comments", /\/\/\//.test(code));
  check("reasonable length (> 500 chars)", code.length > 500);

  return { passed: violations.length === 0, violations, passed_rules: passedRules };
}

/** Prints one line to stdout with the script's `[hard]` prefix. */
function log(msg: string) {
  console.log(`[hard] ${msg}`);
}

/**
 * Pulls `choices[0].message.content` out of a /v1/chat response body.
 * Anything that is not a string at that path yields "".
 */
function messageContent(payload: unknown): string {
  const shape = payload as { choices?: Array<{ message?: { content?: unknown } }> } | null | undefined;
  const content = shape?.choices?.[0]?.message?.content;
  return typeof content === "string" ? content : "";
}

/**
 * Sends a single-turn chat request to the gateway.
 *
 * Never throws: HTTP failures, timeouts (240 s) and malformed bodies
 * are reported through the `error` field with empty `content`.
 *
 * @param opts.provider Gateway provider to route to.
 * @param opts.model    Model name as known to that provider.
 * @param opts.prompt   Full user prompt.
 * @returns The model's text, or "" plus an error description.
 */
async function chat(opts: {
  provider: Provider;
  model: string;
  prompt: string;
}): Promise<{ content: string; error?: string }> {
  try {
    const res = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: opts.provider,
        model: opts.model,
        messages: [{ role: "user", content: opts.prompt }],
        max_tokens: 2500,
        temperature: 0.2,
        think: false,
      }),
      signal: AbortSignal.timeout(240000),
    });
    if (!res.ok) {
      const body = await res.text();
      return { content: "", error: `/v1/chat ${res.status}: ${body.slice(0, 300)}` };
    }
    const payload: unknown = await res.json();
    return { content: messageContent(payload) };
  } catch (e) {
    // Non-Error throwables are stringified so the failure is never
    // recorded as "no error".
    return { content: "", error: e instanceof Error ? e.message : String(e) };
  }
}

/** Everything recorded about one rung of the ladder; written to attempt_N.json. */
interface AttemptRecord {
  n: number;
  provider: string;
  model: string;
  duration_ms: number;
  content_chars: number;
  error: string | null;
  rubric_violations: string[];
  rubric_passed: string[];
  accepted: boolean;
}

/**
 * Strips a markdown code fence from a model answer, if there is one.
 *
 * Handles, in order:
 *   1. a closed fence with any (or no) language tag — the first fenced
 *      block is returned;
 *   2. an opening fence with no closing fence (answer cut off), as long
 *      as something follows it;
 *   3. anything else — the trimmed input is returned unchanged.
 *
 * @param raw Raw model output.
 * @returns The code with surrounding whitespace removed.
 */
function extractCode(raw: string): string {
  const closed = /```[\w+-]*[ \t]*\r?\n([\s\S]*?)```/.exec(raw)?.[1];
  if (closed !== undefined) return closed.trim();

  const unterminated = /```[\w+-]*[ \t]*\r?\n([\s\S]*)$/.exec(raw)?.[1]?.trim();
  // An empty tail means the fence was a stray closing fence after
  // unfenced code; fall through so that code is not thrown away.
  if (unterminated) return unterminated;

  return raw.trim();
}

/**
 * Builds the block appended to TASK that lists what earlier attempts
 * got wrong. Returns "" when there are no earlier attempts.
 */
function buildPriorLearning(attempts: readonly AttemptRecord[]): string {
  if (attempts.length === 0) return "";

  let block = `\n\n═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══\n`;
  for (const a of attempts) {
    block += `Attempt ${a.n} (${a.provider}/${a.model}, ${a.content_chars} chars) violations:\n`;
    for (const v of a.rubric_violations) block += ` - ${v}\n`;
    if (a.error) block += ` [error: ${a.error.slice(0, 120)}]\n`;
  }
  block += `═══ end prior attempts ═══\n\nDO NOT repeat the above violations. Address each one explicitly.`;
  return block;
}

/**
 * Walks the escalation ladder until one answer passes the rubric or
 * MAX_ATTEMPTS is reached, writing per-attempt artifacts and a
 * summary.json to OUT_DIR, then exits the process (0 accepted, 1 not).
 */
async function main() {
  await mkdir(OUT_DIR, { recursive: true });
  log(`output: ${OUT_DIR}`);
  log(`task: ${TASK.slice(0, 120)}...`);
  log("");

  const attempts: AttemptRecord[] = [];
  let acceptedCode: string | null = null;

  for (let i = 0; i < MAX_ATTEMPTS; i++) {
    const n = i + 1;
    // Past the end of the ladder, keep retrying the last rung.
    const rung = LADDER[i] ?? LADDER[LADDER.length - 1];

    // Build the prompt: base task + prior failures' learning blocks
    const priorLearning = buildPriorLearning(attempts);

    log(`attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${priorLearning ? " [w/ learning]" : ""}`);
    const t0 = Date.now();
    const r = await chat({ provider: rung.provider, model: rung.model, prompt: TASK + priorLearning });
    const dur = Date.now() - t0;

    const code = extractCode(r.content);
    const rubric: RubricResult = code
      ? validate(code)
      : { passed: false, violations: ["empty response"], passed_rules: [] };

    const record: AttemptRecord = {
      n,
      provider: rung.provider,
      model: rung.model,
      duration_ms: dur,
      content_chars: code.length,
      error: r.error ?? null,
      rubric_violations: rubric.violations,
      rubric_passed: rubric.passed_rules,
      accepted: rubric.passed,
    };
    attempts.push(record);

    log(` → ${dur}ms, ${code.length} chars, ${rubric.passed_rules.length} rules passed / ${rubric.violations.length} failed${r.error ? `, err: ${r.error.slice(0, 80)}` : ""}`);
    for (const v of rubric.violations.slice(0, 5)) log(` ✗ ${v}`);

    await writeFile(`${OUT_DIR}/attempt_${n}.txt`, code);
    await writeFile(`${OUT_DIR}/attempt_${n}.json`, JSON.stringify(record, null, 2));

    if (rubric.passed) {
      log(` ✅ ACCEPTED on attempt ${n}`);
      acceptedCode = code;
      break;
    }
  }

  const summary = {
    task: TASK.slice(0, 200),
    total_attempts: attempts.length,
    accepted: acceptedCode !== null,
    accepted_on_attempt: acceptedCode !== null ? attempts.findIndex(a => a.accepted) + 1 : null,
    escalation_path: attempts.map(a => `${a.provider}/${a.model}`),
    per_attempt_pass_counts: attempts.map(a => a.rubric_passed.length),
    per_attempt_violation_counts: attempts.map(a => a.rubric_violations.length),
    total_duration_ms: attempts.reduce((s, a) => s + a.duration_ms, 0),
  };
  await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));

  log("");
  log(`═══ RESULT ═══`);
  log(`attempts: ${summary.total_attempts}`);
  log(`accepted: ${summary.accepted} ${summary.accepted ? `on attempt ${summary.accepted_on_attempt}` : ""}`);
  log(`escalation path:`);
  for (const [i, a] of attempts.entries()) {
    const mark = a.accepted ? "✅" : "❌";
    log(` ${mark} attempt ${i + 1}: ${a.provider}/${a.model} — ${a.rubric_passed.length}/${a.rubric_passed.length + a.rubric_violations.length} rules passed, ${a.duration_ms}ms`);
  }
  log("");
  log(`total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
  log(`artifacts: ${OUT_DIR}/{attempt_1..N.{txt,json}, summary.json}`);

  process.exit(summary.accepted ? 0 : 1);
}

main().catch(e => {
  console.error("[hard] fatal:", e);
  process.exit(2);
});