// Hard-task escalation test. The task is deliberately constructed so
// that a local 7B model (qwen3.5:latest) will miss at least one of the
// validation rules. Watch the escalation ladder (kept in sync with
// LADDER below):
//   1. qwen3.5:latest        (local 7B)   — likely fails
//   2. qwen3:latest          (local 7B)   — likely fails differently
//   3. gpt-oss:20b           (local 20B)  — may fail
//   4. gpt-oss:120b          (cloud 120B) — should succeed
//   5. devstral-2:123b       (cloud 123B, coding specialist)
//   6. mistral-large-3:675b  (cloud 675B, biggest model on the current key)
//
// Each attempt:
//   - Calls the model via /v1/chat
//   - Validates the output against a strict rubric
//   - On fail: records the specific rubric violations (the raw output is
//     written to the run directory) and injects the violations of every
//     prior attempt into the next attempt's prompt as "here's what's
//     wrong, fix it specifically"
//   - On success: exit loop
//
// The process exits 0 when some attempt was accepted, 1 when every
// attempt failed the rubric, and 2 on an unexpected fatal error.
//
// Run: bun run tests/real-world/hard_task_escalation.ts

import { writeFile, mkdir } from "node:fs/promises";

/** Base URL of the gateway that exposes the /v1/chat endpoint. */
const GATEWAY = "http://localhost:3100";
/** Per-run artifact directory; the suffix is the start time in base 36. */
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/hard_task_${Date.now().toString(36)}`;
/** Upper bound on attempts; attempts past the end of LADDER reuse its last rung. */
const MAX_ATTEMPTS = 6;

// The hard task. Specific enough that a small model will miss at
// least one rule. Not purely knowledge-based — it's a code-generation
// task with strict structural constraints.
//
// The generic parameters in the signature below must agree with the
// regexes in validate(): the rubric requires `refs: Vec<DocRef>` and a
// `Result<Vec<String>, String>` return type.
const TASK = `Write a complete Rust async function with the EXACT signature:

    pub async fn check_drift_batched(refs: Vec<DocRef>) -> Result<Vec<String>, String>

It must:
1. Group refs by tool name (case-insensitive — use .to_ascii_lowercase())
2. Issue parallel HTTP GET requests to http://localhost:3900/docs/{tool}/diff?since={snippet_hash}
3. Use reqwest and a JoinSet/Semaphore to cap concurrent in-flight requests at 4
4. On HTTP 5xx, retry with exponential backoff: sleep 250ms, then 500ms, then 1000ms, then give up on that tool
5. Parse the response JSON: {"drifted": bool, ...}. Return a Vec<String> of tool names where drifted == true
6. All errors bubble via ? or Result — NO .unwrap(), NO .expect(), NO panic!()
7. Include rustdoc /// comments on the function and each internal helper

Assume this struct is already imported:

    pub struct DocRef { pub tool: String, pub snippet_hash: Option<String>, pub version_seen: String }

Output ONLY the Rust code. No prose, no markdown fences, no explanation. Start directly with the /// doc comment.`;

/** Backend identifiers passed through to the gateway's `provider` field. */
type Provider = "ollama" | "ollama_cloud";

/** One rung of the escalation ladder. */
interface Rung {
  provider: Provider;
  model: string;
  note: string;
}

// Escalation ladder — small-local → large-local → cloud → specialist
// cloud → largest cloud model available. Corrected 2026-04-22 per J:
// gpt-oss:20b is LOCAL (ollama list confirms 13 GB on disk), and the
// final escalation tier should be kimi-k2:1t (the biggest model
// we have access to on Ollama Cloud).
const LADDER: Rung[] = [
  { provider: "ollama", model: "qwen3.5:latest", note: "local 7B" },
  { provider: "ollama", model: "qwen3:latest", note: "local 7B (different) " },
  { provider: "ollama", model: "gpt-oss:20b", note: "local 20B" }, // FIXED: local, not cloud
  { provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B" },
  { provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B (coding specialist)" },
  // NOTE 2026-04-22 — J wanted Kimi as the last escalation but Kimi
  // K2.5/K2.6 both return "this model requires a subscription" on our
  // current Ollama Cloud key. mistral-large-3:675b is the biggest
  // model actually provisioned on this key (verified via direct curl
  // to ollama.com/api/generate). Upgrade path: Ollama Cloud Pro →
  // swap this line to kimi-k2.5 or kimi-k2.6:cloud.
  { provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B (biggest available on current key; kimi-k2.x needs pro subscription)" },
];

/** Validation rubric outcome — the answer must pass every rule to be accepted. */
interface RubricResult {
  /** True only when `violations` is empty. */
  passed: boolean;
  /** Names of the rules the code failed. */
  violations: string[];
  /** Names of the rules the code satisfied. */
  passed_rules: string[];
}

/**
 * Checks generated Rust source against the task rubric using regex heuristics.
 *
 * The checks are purely textual: they look for the required signature,
 * the concurrency and backoff constructs, and the absence of panicking
 * calls. Nothing is compiled or executed.
 *
 * @param code Candidate Rust source, already stripped of markdown fences.
 * @returns Which rules passed and which were violated.
 */
function validate(code: string): RubricResult {
  const violations: string[] = [];
  const passed: string[] = [];

  // Files the rule name under "passed" or "violations" depending on the outcome.
  const check = (rule: string, ok: boolean): void => {
    if (ok) passed.push(rule);
    else violations.push(rule);
  };

  check("has pub async fn check_drift_batched signature",
    /pub\s+async\s+fn\s+check_drift_batched\s*\(/.test(code));
  check("takes Vec<DocRef> argument",
    /refs\s*:\s*Vec\s*<\s*DocRef\s*>/.test(code));
  check("returns Result<Vec<String>, String>",
    /Result\s*<\s*Vec\s*<\s*String\s*>\s*,\s*String\s*>/.test(code));
  check("uses reqwest",
    /\breqwest\b/i.test(code));
  check("references JoinSet or Semaphore for concurrency",
    /\bJoinSet\b|\bSemaphore\b/i.test(code));
  check("bounds concurrency at 4",
    /\b4\b/.test(code) && (
      /Semaphore\s*::\s*new\s*\(\s*4\b/.test(code)
      || /permits\s*:\s*4\b/.test(code)
      || /limit\s*:\s*4\b/.test(code)
      || /max\s*:\s*4\b/.test(code)
      || /capacity\s*:\s*4\b/.test(code)
    ));

  // Exponential backoff — models express this several ways. Accept
  // any recognizable doubling pattern starting at 250ms. 2026-04-22:
  // devstral-2:123b wrote `retry_delay *= 2`, which an earlier regex
  // rejected even though the code is correct. The rubric was broadened
  // to match all idiomatic doubling forms.
  const hasSeed250 = /Duration\s*::\s*from_millis\s*\(\s*250\b/.test(code)
    || /millis\s*\(\s*250\b/.test(code);
  const hasDoublingPattern = /250\s*\*\s*2/.test(code) // 250 * 2^n literal
    || /<<\s*\d+/.test(code)                           // bit-shift
    || /\.pow\s*\(/.test(code)                         // 2u32.pow(attempt)
    || /\*=\s*2\b/.test(code)                          // delay *= 2
    || /\*\s*2\s*;/.test(code)                         // delay = delay * 2;
    || /saturating_mul\s*\(\s*2\b/.test(code);         // saturating doubling
  // A 250ms seed plus any doubling form is taken as reaching both 500ms and 1000ms.
  const doublesFromSeed = hasSeed250 && hasDoublingPattern;

  check("has 250ms backoff seed",
    hasSeed250);
  check("reaches 500ms backoff (literal or doubling from 250)",
    /Duration\s*::\s*from_millis\s*\(\s*500\b/.test(code)
    || /millis\s*\(\s*500\b/.test(code)
    || doublesFromSeed);
  check("reaches 1000ms backoff (literal or doubling to 1000)",
    /Duration\s*::\s*from_millis\s*\(\s*1000\b/.test(code)
    || /millis\s*\(\s*1000\b/.test(code)
    || doublesFromSeed);
  check("case-insensitive tool grouping (to_ascii_lowercase)",
    /to_ascii_lowercase|to_lowercase/.test(code));
  check("NO .unwrap() — all errors bubble via ?",
    !/\.unwrap\s*\(\s*\)/.test(code));
  check("NO .expect(...) — all errors bubble via ?",
    !/\.expect\s*\(/.test(code));
  check("NO panic!() / unimplemented!() / todo!()",
    !/\bpanic!\s*\(|\bunimplemented!\s*\(|\btodo!\s*\(/.test(code));
  check("has rustdoc /// comments",
    /\/\/\//.test(code));
  check("reasonable length (> 500 chars)",
    code.length > 500);

  return { passed: violations.length === 0, violations, passed_rules: passed };
}

/** Prints a message to stdout with the `[hard]` prefix. */
function log(msg: string) { console.log(`[hard] ${msg}`); }

/**
 * Sends a single-turn chat request to the gateway.
 *
 * Never throws: HTTP failures, timeouts and malformed responses are
 * reported through the `error` field with an empty `content`.
 *
 * @param opts.provider Backend the gateway should route to.
 * @param opts.model Model name to request.
 * @param opts.prompt Full user prompt.
 * @returns The first choice's message content, or `""` plus an `error`.
 */
async function chat(opts: {
  provider: "ollama" | "ollama_cloud",
  model: string,
  prompt: string,
}): Promise<{ content: string; error?: string }> {
  try {
    const r = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: opts.provider,
        model: opts.model,
        messages: [{ role: "user", content: opts.prompt }],
        max_tokens: 2500,
        temperature: 0.2,
        think: false,
      }),
      // Abort the request after 240 s.
      signal: AbortSignal.timeout(240000),
    });
    if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 300)}` };
    // Only the fields read below are described; everything else is ignored.
    const j = (await r.json()) as { choices?: Array<{ message?: { content?: string } }> };
    return { content: j.choices?.[0]?.message?.content ?? "" };
  } catch (e: unknown) {
    return { content: "", error: e instanceof Error ? e.message : String(e) };
  }
}

/** Per-attempt record, written to `attempt_<n>.json` and summarized at the end. */
interface AttemptRecord {
  /** 1-based attempt number. */
  n: number;
  provider: string;
  model: string;
  /** Wall-clock time spent in the chat call. */
  duration_ms: number;
  /** Length of the extracted code (after fence stripping). */
  content_chars: number;
  /** Transport/HTTP error from the chat call, if any. */
  error: string | null;
  rubric_violations: string[];
  rubric_passed: string[];
  accepted: boolean;
}

/**
 * Strips a markdown fence wrapper from a model response, if present.
 *
 * Only fences that begin at the start of a line are treated as wrappers,
 * so a rustdoc example such as `/// ```` inside otherwise unfenced Rust
 * is not mistaken for one. Any language tag is accepted, and a fence that
 * is never closed (e.g. a response truncated by the token limit) yields
 * everything after the opening fence.
 *
 * @param raw Raw model output.
 * @returns The fenced code if a wrapper is found, otherwise `raw`; trimmed either way.
 */
function extractCode(raw: string): string {
  // `m` makes ^ match at line starts; (?![\s\S]) matches only at end of input.
  const fenced = raw.match(/^[ \t]*```[\w+-]*[ \t]*\r?\n([\s\S]*?)(?:^[ \t]*```|(?![\s\S]))/m);
  if (fenced) return fenced[1].trim();
  return raw.trim();
}

/**
 * Runs the escalation loop: one attempt per ladder rung until the rubric
 * passes or MAX_ATTEMPTS is reached, writing per-attempt artifacts and a
 * summary to OUT_DIR, then exits with 0 (accepted) or 1 (not accepted).
 */
async function main() {
  await mkdir(OUT_DIR, { recursive: true });
  log(`output: ${OUT_DIR}`);
  log(`task: ${TASK.slice(0, 120)}...`);
  log("");

  const attempts: AttemptRecord[] = [];
  let acceptedCode: string | null = null;

  for (let i = 0; i < MAX_ATTEMPTS; i++) {
    const n = i + 1;
    // Past the end of the ladder, keep retrying the strongest rung.
    const rung = LADDER[i] ?? LADDER[LADDER.length - 1];

    // Build the prompt: base task + prior failures' learning blocks
    let priorLearning = "";
    if (attempts.length > 0) {
      priorLearning = `\n\n═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══\n`;
      for (const a of attempts) {
        priorLearning += `Attempt ${a.n} (${a.provider}/${a.model}, ${a.content_chars} chars) violations:\n`;
        for (const v of a.rubric_violations) priorLearning += ` - ${v}\n`;
        if (a.error) priorLearning += ` [error: ${a.error.slice(0, 120)}]\n`;
      }
      priorLearning += `═══ end prior attempts ═══\n\nDO NOT repeat the above violations. Address each one explicitly.`;
    }

    log(`attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${priorLearning ? " [w/ learning]" : ""}`);
    const t0 = Date.now();
    const r = await chat({ provider: rung.provider, model: rung.model, prompt: TASK + priorLearning });
    const dur = Date.now() - t0;

    const code = extractCode(r.content);
    // An empty response (including any chat error) is a single synthetic violation.
    const rubric: RubricResult = code
      ? validate(code)
      : { passed: false, violations: ["empty response"], passed_rules: [] };

    const record: AttemptRecord = {
      n,
      provider: rung.provider,
      model: rung.model,
      duration_ms: dur,
      content_chars: code.length,
      error: r.error ?? null,
      rubric_violations: rubric.violations,
      rubric_passed: rubric.passed_rules,
      accepted: rubric.passed,
    };
    attempts.push(record);

    log(` → ${dur}ms, ${code.length} chars, ${rubric.passed_rules.length} rules passed / ${rubric.violations.length} failed${r.error ? `, err: ${r.error.slice(0, 80)}` : ""}`);
    // Only the first five violations are echoed; the full list is in the JSON artifact.
    for (const v of rubric.violations.slice(0, 5)) log(` ✗ ${v}`);

    await writeFile(`${OUT_DIR}/attempt_${n}.txt`, code);
    await writeFile(`${OUT_DIR}/attempt_${n}.json`, JSON.stringify(record, null, 2));

    if (rubric.passed) {
      log(` ✅ ACCEPTED on attempt ${n}`);
      acceptedCode = code;
      break;
    }
  }

  const summary = {
    task: TASK.slice(0, 200),
    total_attempts: attempts.length,
    accepted: acceptedCode !== null,
    accepted_on_attempt: acceptedCode ? attempts.findIndex(a => a.accepted) + 1 : null,
    escalation_path: attempts.map(a => `${a.provider}/${a.model}`),
    per_attempt_pass_counts: attempts.map(a => a.rubric_passed.length),
    per_attempt_violation_counts: attempts.map(a => a.rubric_violations.length),
    total_duration_ms: attempts.reduce((s, a) => s + a.duration_ms, 0),
  };
  await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));

  log("");
  log(`═══ RESULT ═══`);
  log(`attempts: ${summary.total_attempts}`);
  log(`accepted: ${summary.accepted} ${summary.accepted ? `on attempt ${summary.accepted_on_attempt}` : ""}`);
  log(`escalation path:`);
  for (const [i, a] of attempts.entries()) {
    const mark = a.accepted ? "✅" : "❌";
    log(` ${mark} attempt ${i + 1}: ${a.provider}/${a.model} — ${a.rubric_passed.length}/${a.rubric_passed.length + a.rubric_violations.length} rules passed, ${a.duration_ms}ms`);
  }
  log("");
  log(`total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
  log(`artifacts: ${OUT_DIR}/{attempt_1..N.{txt,json}, summary.json}`);

  process.exit(summary.accepted ? 0 : 1);
}

main().catch(e => { console.error("[hard] fatal:", e); process.exit(2); });