lakehouse/tests/real-world/hard_task_escalation.ts
Commit 156dae6732 (author: profit) — Auditor self-test branch: real-world pipelines + cohesion Phase C + KB index (PR #8)
Bundles 12 commits validating the auditor + scrum_master architecture end-to-end:

- enrich_prd_pipeline / hard_task_escalation / scrum_master_pipeline stress tests
- Tree-split + scrum_reviews.jsonl + kb_query surfacing
- Verdict → audit_lessons feedback loop (closed)
- kb_index aggregator with confidence-based severity policy
- 9-run + 5-run empirical tests proved the predictive-compounding property
- Level 1 correction: temp=0 cloud inference for deterministic per-claim verdicts
- audit_one.ts dry-run CLI
- Fixes: static quoted-string guard, empirical-claim classification, symbol-resolver gate, repo-file size cap

See PR #8 for run-by-run commit history.
2026-04-23 03:28:32 +00:00

268 lines
12 KiB
TypeScript

// Hard-task escalation test. The task is deliberately constructed so
// that a local 7B model (qwen3.5:latest) will miss at least one of the
// validation rules. Watch the escalation ladder:
// 1. qwen3.5:latest (local 7B) — likely fails
// 2. qwen3:latest (local 7B) — likely fails differently
// 3. gpt-oss:20b (local 20B) — may fail
// 4. gpt-oss:120b (cloud 120B) — should succeed
// 5. gpt-oss:120b w/ prior-attempt errors injected — retry with context
// 6. absolute last ditch: return best-so-far with failure annotation
//
// Each attempt:
// - Calls the model via /v1/chat
// - Validates the output against a strict rubric
// - On fail: records the specific rubric violations + the partial
// output, injects both into the next attempt's prompt as "here's
// what's wrong, fix it specifically"
// - On success: exit loop
//
// Run: bun run tests/real-world/hard_task_escalation.ts
import { writeFile, mkdir } from "node:fs/promises";
const GATEWAY = "http://localhost:3100";
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/hard_task_${Date.now().toString(36)}`;
const MAX_ATTEMPTS = 6;
// The hard task. Specific enough that a small model will miss at
// least one rule. Not purely knowledge-based — it's a code-generation
// task with strict structural constraints.
const TASK = `Write a complete Rust async function with the EXACT signature:
pub async fn check_drift_batched(refs: Vec<DocRef>) -> Result<Vec<String>, String>
It must:
1. Group refs by tool name (case-insensitive — use .to_ascii_lowercase())
2. Issue parallel HTTP GET requests to http://localhost:3900/docs/{tool}/diff?since={snippet_hash}
3. Use reqwest and a JoinSet/Semaphore to cap concurrent in-flight requests at 4
4. On HTTP 5xx, retry with exponential backoff: sleep 250ms, then 500ms, then 1000ms, then give up on that tool
5. Parse the response JSON: {"drifted": bool, ...}. Return a Vec<String> of tool names where drifted == true
6. All errors bubble via ? or Result — NO .unwrap(), NO .expect(), NO panic!()
7. Include rustdoc /// comments on the function and each internal helper
Assume this struct is already imported:
pub struct DocRef { pub tool: String, pub snippet_hash: Option<String>, pub version_seen: String }
Output ONLY the Rust code. No prose, no markdown fences, no explanation. Start directly with the /// doc comment.`;
// Escalation ladder — small-local → large-local → cloud → specialist
// cloud → trillion-param cloud. Corrected 2026-04-22 per J:
// gpt-oss:20b is LOCAL (ollama list confirms 13 GB on disk), and the
// final escalation tier should be kimi-k2:1t (the biggest model
// we have access to on Ollama Cloud).
const LADDER: Array<{ provider: "ollama" | "ollama_cloud"; model: string; note: string }> = [
{ provider: "ollama", model: "qwen3.5:latest", note: "local 7B" },
{ provider: "ollama", model: "qwen3:latest", note: "local 7B (different) " },
{ provider: "ollama", model: "gpt-oss:20b", note: "local 20B" }, // FIXED: local, not cloud
{ provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B" },
{ provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B (coding specialist)" },
// NOTE 2026-04-22 — J wanted Kimi as the last escalation but Kimi
// K2.5/K2.6 both return "this model requires a subscription" on our
// current Ollama Cloud key. mistral-large-3:675b is the biggest
// model actually provisioned on this key (verified via direct curl
// to ollama.com/api/generate). Upgrade path: Ollama Cloud Pro →
// swap this line to kimi-k2.5 or kimi-k2.6:cloud.
{ provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B (biggest available on current key; kimi-k2.x needs pro subscription)" },
];
// Validation rubric — the answer must pass all of these to be accepted.
interface RubricResult {
passed: boolean;
violations: string[];
passed_rules: string[];
}
function validate(code: string): RubricResult {
const violations: string[] = [];
const passed: string[] = [];
const check = (rule: string, ok: boolean) => { ok ? passed.push(rule) : violations.push(rule); };
check("has pub async fn check_drift_batched signature",
/pub\s+async\s+fn\s+check_drift_batched\s*\(/.test(code));
check("takes Vec<DocRef> argument",
/refs\s*:\s*Vec\s*<\s*DocRef\s*>/.test(code));
check("returns Result<Vec<String>, String>",
/Result\s*<\s*Vec\s*<\s*String\s*>\s*,\s*String\s*>/.test(code));
check("uses reqwest",
/\breqwest\b/i.test(code));
check("references JoinSet or Semaphore for concurrency",
/\bJoinSet\b|\bSemaphore\b/i.test(code));
check("bounds concurrency at 4",
/\b4\b/.test(code) && (/Semaphore\s*::\s*new\s*\(\s*4\b/.test(code) || /permits\s*:\s*4\b/.test(code) || /limit\s*:\s*4\b/.test(code) || /max\s*:\s*4\b/.test(code) || /capacity\s*:\s*4\b/.test(code)));
// Exponential backoff — models express this several ways. Accept
// any recognizable doubling pattern starting at 250ms. 2026-04-22:
// devstral-2:123b wrote `retry_delay *= 2` which my earlier regex
// rejected even though the code is correct. Broadening rubric to
// match all idiomatic doubling forms.
const hasSeed250 = /Duration\s*::\s*from_millis\s*\(\s*250\b/.test(code)
|| /millis\s*\(\s*250\b/.test(code);
const hasDoublingPattern = /250\s*\*\s*2/.test(code) // 250 * 2^n literal
|| /<<\s*\d+/.test(code) // bit-shift
|| /\.pow\s*\(/.test(code) // 2u32.pow(attempt)
|| /\*=\s*2\b/.test(code) // delay *= 2 ← was missing
|| /\*\s*2\s*;/.test(code) // delay = delay * 2;
|| /saturating_mul\s*\(\s*2\b/.test(code); // saturating doubling
check("has 250ms backoff seed",
hasSeed250);
check("reaches 500ms backoff (literal or doubling from 250)",
/Duration\s*::\s*from_millis\s*\(\s*500\b/.test(code)
|| /millis\s*\(\s*500\b/.test(code)
|| (hasSeed250 && hasDoublingPattern));
check("reaches 1000ms backoff (literal or doubling to 1000)",
/Duration\s*::\s*from_millis\s*\(\s*1000\b/.test(code)
|| /millis\s*\(\s*1000\b/.test(code)
|| (hasSeed250 && hasDoublingPattern));
check("case-insensitive tool grouping (to_ascii_lowercase)",
/to_ascii_lowercase|to_lowercase/.test(code));
check("NO .unwrap() — all errors bubble via ?",
!/\.unwrap\s*\(\s*\)/.test(code));
check("NO .expect(...) — all errors bubble via ?",
!/\.expect\s*\(/.test(code));
check("NO panic!() / unimplemented!() / todo!()",
!/\bpanic!\s*\(|\bunimplemented!\s*\(|\btodo!\s*\(/.test(code));
check("has rustdoc /// comments",
/\/\/\//.test(code));
check("reasonable length (> 500 chars)",
code.length > 500);
return { passed: violations.length === 0, violations, passed_rules: passed };
}
function log(msg: string) { console.log(`[hard] ${msg}`); }
async function chat(opts: {
provider: "ollama" | "ollama_cloud",
model: string,
prompt: string,
}): Promise<{ content: string; error?: string }> {
try {
const r = await fetch(`${GATEWAY}/v1/chat`, {
method: "POST", headers: { "content-type": "application/json" },
body: JSON.stringify({
provider: opts.provider,
model: opts.model,
messages: [{ role: "user", content: opts.prompt }],
max_tokens: 2500,
temperature: 0.2,
think: false,
}),
signal: AbortSignal.timeout(240000),
});
if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 300)}` };
const j: any = await r.json();
return { content: j.choices?.[0]?.message?.content ?? "" };
} catch (e) {
return { content: "", error: (e as Error).message };
}
}
interface AttemptRecord {
n: number;
provider: string;
model: string;
duration_ms: number;
content_chars: number;
error: string | null;
rubric_violations: string[];
rubric_passed: string[];
accepted: boolean;
}
function extractCode(raw: string): string {
// Strip common fence wrappers
const m = raw.match(/```(?:rust)?\s*\n([\s\S]*?)```/);
if (m) return m[1].trim();
return raw.trim();
}
async function main() {
await mkdir(OUT_DIR, { recursive: true });
log(`output: ${OUT_DIR}`);
log(`task: ${TASK.slice(0, 120)}...`);
log("");
const attempts: AttemptRecord[] = [];
let acceptedCode: string | null = null;
for (let i = 0; i < MAX_ATTEMPTS; i++) {
const n = i + 1;
const rung = LADDER[i] ?? LADDER[LADDER.length - 1];
// Build the prompt: base task + prior failures' learning blocks
let priorLearning = "";
if (attempts.length > 0) {
priorLearning = `\n\n═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══\n`;
for (const a of attempts) {
priorLearning += `Attempt ${a.n} (${a.provider}/${a.model}, ${a.content_chars} chars) violations:\n`;
for (const v of a.rubric_violations) priorLearning += ` - ${v}\n`;
if (a.error) priorLearning += ` [error: ${a.error.slice(0, 120)}]\n`;
}
priorLearning += `═══ end prior attempts ═══\n\nDO NOT repeat the above violations. Address each one explicitly.`;
}
log(`attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${priorLearning ? " [w/ learning]" : ""}`);
const t0 = Date.now();
const r = await chat({ provider: rung.provider, model: rung.model, prompt: TASK + priorLearning });
const dur = Date.now() - t0;
const code = extractCode(r.content);
const rubric = code ? validate(code) : { passed: false, violations: ["empty response"], passed_rules: [] };
const record: AttemptRecord = {
n,
provider: rung.provider,
model: rung.model,
duration_ms: dur,
content_chars: code.length,
error: r.error ?? null,
rubric_violations: rubric.violations,
rubric_passed: rubric.passed_rules,
accepted: rubric.passed,
};
attempts.push(record);
log(`${dur}ms, ${code.length} chars, ${rubric.passed_rules.length} rules passed / ${rubric.violations.length} failed${r.error ? `, err: ${r.error.slice(0, 80)}` : ""}`);
for (const v of rubric.violations.slice(0, 5)) log(`${v}`);
await writeFile(`${OUT_DIR}/attempt_${n}.txt`, code);
await writeFile(`${OUT_DIR}/attempt_${n}.json`, JSON.stringify(record, null, 2));
if (rubric.passed) {
log(` ✅ ACCEPTED on attempt ${n}`);
acceptedCode = code;
break;
}
}
const summary = {
task: TASK.slice(0, 200),
total_attempts: attempts.length,
accepted: acceptedCode !== null,
accepted_on_attempt: acceptedCode ? attempts.findIndex(a => a.accepted) + 1 : null,
escalation_path: attempts.map(a => `${a.provider}/${a.model}`),
per_attempt_pass_counts: attempts.map(a => a.rubric_passed.length),
per_attempt_violation_counts: attempts.map(a => a.rubric_violations.length),
total_duration_ms: attempts.reduce((s, a) => s + a.duration_ms, 0),
};
await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));
log("");
log(`═══ RESULT ═══`);
log(`attempts: ${summary.total_attempts}`);
log(`accepted: ${summary.accepted} ${summary.accepted ? `on attempt ${summary.accepted_on_attempt}` : ""}`);
log(`escalation path:`);
for (const [i, a] of attempts.entries()) {
const mark = a.accepted ? "✅" : "❌";
log(` ${mark} attempt ${i + 1}: ${a.provider}/${a.model}${a.rubric_passed.length}/${a.rubric_passed.length + a.rubric_violations.length} rules passed, ${a.duration_ms}ms`);
}
log("");
log(`total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
log(`artifacts: ${OUT_DIR}/{attempt_1..N.{txt,json}, summary.json}`);
process.exit(summary.accepted ? 0 : 1);
}
main().catch(e => { console.error("[hard] fatal:", e); process.exit(2); });