Architectural snapshot of the lakehouse codebase at the point where the
full matrix-driven agent loop with Mem0 versioning + deletion was
validated end-to-end.
WHAT THIS REPO IS
A clean single-commit snapshot of the lakehouse code. Heavy test data
(.parquet datasets, vector indexes) excluded — see REPLICATION.md for
regen path. Full lakehouse history at git.agentview.dev/profit/lakehouse.
WHAT WAS PROVEN
- Vector retrieval across the multi-corpus matrix (chicago_permits + entity
  briefs + sec_tickers + distilled procedural + llm_team runs)
- Observer hand-review (cloud + heuristic fallback) gating each candidate
- Local-model agent loop (qwen3.5:latest) with tool use + scratchpad
- Playbook seal on success → next-iter retrieval surfaces it as preamble
- Mem0 versioning + deletion in pathway_memory (op surface sketched below):
* UPSERT: ADD on new workflow, UPDATE bumps replay_count on identical
* REVISE: chains versions, parent.superseded_at + superseded_by stamped
* RETIRE: marks specific trace retired with reason, excluded from retrieval
* HISTORY: walks chain root→tip, cycle-safe
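  Illustrative op surface (TypeScript sketch; type and function names are
  inferred from the list above, NOT verbatim from pathway_memory.rs, which
  is Rust):

    interface PathwayTrace {
      id: string;
      workflow_hash: string;
      replay_count: number;             // bumped by UPSERT on identical workflow
      superseded_at: string | null;     // stamped on the parent by REVISE
      superseded_by: string | null;     // id of the revising version
      retired: { at: string; reason: string } | null;  // set by RETIRE
    }
    // upsert(trace): ADD on new workflow, UPDATE bumps replay_count on identical
    // revise(id, next): chains a new version, stamps parent superseded_*
    // retire(id, reason): marks the trace retired, excluded from retrieval
    // history(id): walks the version chain root→tip, cycle-safe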
KEY DIRECTORIES
- crates/vectord/src/pathway_memory.rs — Mem0 ops live here
- crates/vectord/src/playbook_memory.rs — original Mem0 reference
- tests/agent_test/ — local-model agent harness + PRD + session archives
- scripts/dump_raw_corpus.sh — MinIO bucket dump (raw test corpus)
- scripts/vectorize_raw_corpus.ts — corpus → vector indexes
- scripts/analyze_chicago_contracts.ts — real inference pipeline
- scripts/seal_agent_playbook.ts — Mem0 upsert from agent traces
Replication: see REPLICATION.md for Debian 13 clean install + cloud-only
adaptation (no local Ollama).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
// Hard-task escalation test. The task is deliberately constructed so
// that a local 7B model (qwen3.5:latest) will miss at least one of the
// validation rules. Watch the escalation ladder:
//   1. qwen3.5:latest (local 7B) — likely fails
//   2. qwen3:latest (local 7B) — likely fails differently
//   3. gpt-oss:20b (local 20B) — may fail
//   4. gpt-oss:120b (cloud 120B) — should succeed
//   5. devstral-2:123b (cloud 123B coding specialist) — fallback
//   6. mistral-large-3:675b (cloud 675B) — absolute last ditch
//
// Each attempt:
//   - Calls the model via /v1/chat
//   - Validates the output against a strict rubric
//   - On fail: records the specific rubric violations + the partial
//     output, injects both into the next attempt's prompt as "here's
//     what's wrong, fix it specifically"
//   - On success: exit loop
//
// Run: bun run tests/real-world/hard_task_escalation.ts

import { writeFile, mkdir } from "node:fs/promises";

const GATEWAY = "http://localhost:3100";
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/hard_task_${Date.now().toString(36)}`;
const MAX_ATTEMPTS = 6;

// The hard task. Specific enough that a small model will miss at
// least one rule. Not purely knowledge-based — it's a code-generation
// task with strict structural constraints. (An illustrative passing
// shape is sketched after the task string below.)
const TASK = `Write a complete Rust async function with the EXACT signature:

pub async fn check_drift_batched(refs: Vec<DocRef>) -> Result<Vec<String>, String>

It must:
1. Group refs by tool name (case-insensitive — use .to_ascii_lowercase())
2. Issue parallel HTTP GET requests to http://localhost:3900/docs/{tool}/diff?since={snippet_hash}
3. Use reqwest and a JoinSet/Semaphore to cap concurrent in-flight requests at 4
4. On HTTP 5xx, retry with exponential backoff: sleep 250ms, then 500ms, then 1000ms, then give up on that tool
5. Parse the response JSON: {"drifted": bool, ...}. Return a Vec<String> of tool names where drifted == true
6. All errors bubble via ? or Result — NO .unwrap(), NO .expect(), NO panic!()
7. Include rustdoc /// comments on the function and each internal helper

Assume this struct is already imported:

pub struct DocRef { pub tool: String, pub snippet_hash: Option<String>, pub version_seen: String }

Output ONLY the Rust code. No prose, no markdown fences, no explanation. Start directly with the /// doc comment.`;

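// For orientation, one shape of a passing answer (illustrative sketch in
// comments only, never compiled or executed by this harness; tokio and
// reqwest assumed):
//
//   /// Checks each tool's docs for drift; at most 4 requests in flight.
//   pub async fn check_drift_batched(refs: Vec<DocRef>) -> Result<Vec<String>, String> {
//       let sem = std::sync::Arc::new(tokio::sync::Semaphore::new(4));
//       let mut tasks = tokio::task::JoinSet::new();
//       // group refs by ref.tool.to_ascii_lowercase(), spawn one task per
//       // tool behind a semaphore permit; on HTTP 5xx sleep 250/500/1000 ms
//       // (Duration::from_millis) before retrying, then give up on that
//       // tool; map every error into String via `?`; collect the tool
//       // names whose response JSON has drifted == true
//       ...
//   }
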
// Escalation ladder — small-local → large-local → cloud → specialist
// cloud → trillion-param cloud. Corrected 2026-04-22 per J:
// gpt-oss:20b is LOCAL (ollama list confirms 13 GB on disk), and the
// final escalation tier should be kimi-k2:1t (the biggest model
// we have access to on Ollama Cloud).
const LADDER: Array<{ provider: "ollama" | "ollama_cloud"; model: string; note: string }> = [
  { provider: "ollama", model: "qwen3.5:latest", note: "local 7B" },
  { provider: "ollama", model: "qwen3:latest", note: "local 7B (different)" },
  { provider: "ollama", model: "gpt-oss:20b", note: "local 20B" }, // FIXED: local, not cloud
  { provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B" },
  { provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B (coding specialist)" },
  // NOTE 2026-04-22 — J wanted Kimi as the last escalation but Kimi
  // K2.5/K2.6 both return "this model requires a subscription" on our
  // current Ollama Cloud key. mistral-large-3:675b is the biggest
  // model actually provisioned on this key (verified via direct curl
  // to ollama.com/api/generate). Upgrade path: Ollama Cloud Pro →
  // swap this line to kimi-k2.5 or kimi-k2.6:cloud.
  { provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B (biggest available on current key; kimi-k2.x needs pro subscription)" },
];

// Validation rubric — the answer must pass all of these to be accepted.
interface RubricResult {
  passed: boolean;
  violations: string[];
  passed_rules: string[];
}

function validate(code: string): RubricResult {
  const violations: string[] = [];
  const passed: string[] = [];

  const check = (rule: string, ok: boolean) => { ok ? passed.push(rule) : violations.push(rule); };

  check("has pub async fn check_drift_batched signature",
    /pub\s+async\s+fn\s+check_drift_batched\s*\(/.test(code));
  check("takes Vec<DocRef> argument",
    /refs\s*:\s*Vec\s*<\s*DocRef\s*>/.test(code));
  check("returns Result<Vec<String>, String>",
    /Result\s*<\s*Vec\s*<\s*String\s*>\s*,\s*String\s*>/.test(code));
  check("uses reqwest",
    /\breqwest\b/i.test(code));
  check("references JoinSet or Semaphore for concurrency",
    /\bJoinSet\b|\bSemaphore\b/i.test(code));
  check("bounds concurrency at 4",
    /\b4\b/.test(code) && (/Semaphore\s*::\s*new\s*\(\s*4\b/.test(code) || /permits\s*:\s*4\b/.test(code) || /limit\s*:\s*4\b/.test(code) || /max\s*:\s*4\b/.test(code) || /capacity\s*:\s*4\b/.test(code)));
  // Exponential backoff — models express this several ways. Accept
  // any recognizable doubling pattern starting at 250ms. 2026-04-22:
  // devstral-2:123b wrote `retry_delay *= 2` which my earlier regex
  // rejected even though the code is correct. Broadening rubric to
  // match all idiomatic doubling forms.
  const hasSeed250 = /Duration\s*::\s*from_millis\s*\(\s*250\b/.test(code)
    || /millis\s*\(\s*250\b/.test(code);
  const hasDoublingPattern = /250\s*\*\s*2/.test(code)        // 250 * 2^n literal
    || /<<\s*\d+/.test(code)                                  // bit-shift
    || /\.pow\s*\(/.test(code)                                // 2u32.pow(attempt)
    || /\*=\s*2\b/.test(code)                                 // delay *= 2  ← was missing
    || /\*\s*2\s*;/.test(code)                                // delay = delay * 2;
    || /saturating_mul\s*\(\s*2\b/.test(code);                // saturating doubling
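  // Spot-check of the broadened doubling patterns (illustrative strings,
  // not from any real model output):
  //   "delay *= 2;"                       → /\*=\s*2\b/
  //   "let d = 250 * 2u64.pow(attempt);"  → /250\s*\*\s*2/ and /\.pow\s*\(/
  //   "backoff = backoff * 2;"            → /\*\s*2\s*;/
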
check("has 250ms backoff seed",
|
|
hasSeed250);
|
|
check("reaches 500ms backoff (literal or doubling from 250)",
|
|
/Duration\s*::\s*from_millis\s*\(\s*500\b/.test(code)
|
|
|| /millis\s*\(\s*500\b/.test(code)
|
|
|| (hasSeed250 && hasDoublingPattern));
|
|
check("reaches 1000ms backoff (literal or doubling to 1000)",
|
|
/Duration\s*::\s*from_millis\s*\(\s*1000\b/.test(code)
|
|
|| /millis\s*\(\s*1000\b/.test(code)
|
|
|| (hasSeed250 && hasDoublingPattern));
|
|
check("case-insensitive tool grouping (to_ascii_lowercase)",
|
|
/to_ascii_lowercase|to_lowercase/.test(code));
|
|
check("NO .unwrap() — all errors bubble via ?",
|
|
!/\.unwrap\s*\(\s*\)/.test(code));
|
|
check("NO .expect(...) — all errors bubble via ?",
|
|
!/\.expect\s*\(/.test(code));
|
|
check("NO panic!() / unimplemented!() / todo!()",
|
|
!/\bpanic!\s*\(|\bunimplemented!\s*\(|\btodo!\s*\(/.test(code));
|
|
check("has rustdoc /// comments",
|
|
/\/\/\//.test(code));
|
|
check("reasonable length (> 500 chars)",
|
|
code.length > 500);
|
|
|
|
return { passed: violations.length === 0, violations, passed_rules: passed };
|
|
}
|
|
|
|
function log(msg: string) { console.log(`[hard] ${msg}`); }

async function chat(opts: {
  provider: "ollama" | "ollama_cloud",
  model: string,
  prompt: string,
}): Promise<{ content: string; error?: string }> {
  try {
    const r = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST", headers: { "content-type": "application/json" },
      body: JSON.stringify({
        provider: opts.provider,
        model: opts.model,
        messages: [{ role: "user", content: opts.prompt }],
        max_tokens: 2500,
        temperature: 0.2,
        think: false,
      }),
      signal: AbortSignal.timeout(240000),
    });
    if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 300)}` };
    const j: any = await r.json();
    return { content: j.choices?.[0]?.message?.content ?? "" };
  } catch (e) {
    return { content: "", error: (e as Error).message };
  }
}

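// Response contract assumed by the parsing above: an OpenAI-style body,
// { choices: [{ message: { content } }] }. Anything else yields content === "",
// which the loop in main() scores as the single violation "empty response".
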
interface AttemptRecord {
  n: number;
  provider: string;
  model: string;
  duration_ms: number;
  content_chars: number;
  error: string | null;
  rubric_violations: string[];
  rubric_passed: string[];
  accepted: boolean;
}

function extractCode(raw: string): string {
  // Strip common fence wrappers
  const m = raw.match(/```(?:rust)?\s*\n([\s\S]*?)```/);
  if (m) return m[1].trim();
  return raw.trim();
}

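// e.g. extractCode("```rust\nfn main() {}\n```") → "fn main() {}"; unfenced
// output passes through trimmed. (Illustrative input, not from a real run.)
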
async function main() {
  await mkdir(OUT_DIR, { recursive: true });
  log(`output: ${OUT_DIR}`);
  log(`task: ${TASK.slice(0, 120)}...`);
  log("");

  const attempts: AttemptRecord[] = [];
  let acceptedCode: string | null = null;

  for (let i = 0; i < MAX_ATTEMPTS; i++) {
    const n = i + 1;
    const rung = LADDER[i] ?? LADDER[LADDER.length - 1];

    // Build the prompt: base task + prior failures' learning blocks
    let priorLearning = "";
    if (attempts.length > 0) {
      priorLearning = `\n\n═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══\n`;
      for (const a of attempts) {
        priorLearning += `Attempt ${a.n} (${a.provider}/${a.model}, ${a.content_chars} chars) violations:\n`;
        for (const v of a.rubric_violations) priorLearning += ` - ${v}\n`;
        if (a.error) priorLearning += ` [error: ${a.error.slice(0, 120)}]\n`;
      }
      priorLearning += `═══ end prior attempts ═══\n\nDO NOT repeat the above violations. Address each one explicitly.`;
    }
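
    // Rendered shape of the learning block after one failed attempt
    // (placeholder values, not from a real run):
    //   ═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══
    //   Attempt 1 (ollama/qwen3.5:latest, N chars) violations:
    //    - NO .unwrap() — all errors bubble via ?
    //   ═══ end prior attempts ═══
    //   DO NOT repeat the above violations. Address each one explicitly.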

    log(`attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${priorLearning ? " [w/ learning]" : ""}`);
    const t0 = Date.now();
    const r = await chat({ provider: rung.provider, model: rung.model, prompt: TASK + priorLearning });
    const dur = Date.now() - t0;

    const code = extractCode(r.content);
    const rubric = code ? validate(code) : { passed: false, violations: ["empty response"], passed_rules: [] };

    const record: AttemptRecord = {
      n,
      provider: rung.provider,
      model: rung.model,
      duration_ms: dur,
      content_chars: code.length,
      error: r.error ?? null,
      rubric_violations: rubric.violations,
      rubric_passed: rubric.passed_rules,
      accepted: rubric.passed,
    };
    attempts.push(record);

    log(` → ${dur}ms, ${code.length} chars, ${rubric.passed_rules.length} rules passed / ${rubric.violations.length} failed${r.error ? `, err: ${r.error.slice(0, 80)}` : ""}`);
    for (const v of rubric.violations.slice(0, 5)) log(` ✗ ${v}`);

    await writeFile(`${OUT_DIR}/attempt_${n}.txt`, code);
    await writeFile(`${OUT_DIR}/attempt_${n}.json`, JSON.stringify(record, null, 2));

    if (rubric.passed) {
      log(` ✅ ACCEPTED on attempt ${n}`);
      acceptedCode = code;
      break;
    }
  }

  const summary = {
    task: TASK.slice(0, 200),
    total_attempts: attempts.length,
    accepted: acceptedCode !== null,
    accepted_on_attempt: acceptedCode ? attempts.findIndex(a => a.accepted) + 1 : null,
    escalation_path: attempts.map(a => `${a.provider}/${a.model}`),
    per_attempt_pass_counts: attempts.map(a => a.rubric_passed.length),
    per_attempt_violation_counts: attempts.map(a => a.rubric_violations.length),
    total_duration_ms: attempts.reduce((s, a) => s + a.duration_ms, 0),
  };
  await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));

  log("");
  log(`═══ RESULT ═══`);
  log(`attempts: ${summary.total_attempts}`);
  log(`accepted: ${summary.accepted} ${summary.accepted ? `on attempt ${summary.accepted_on_attempt}` : ""}`);
  log(`escalation path:`);
  for (const [i, a] of attempts.entries()) {
    const mark = a.accepted ? "✅" : "❌";
    log(` ${mark} attempt ${i + 1}: ${a.provider}/${a.model} — ${a.rubric_passed.length}/${a.rubric_passed.length + a.rubric_violations.length} rules passed, ${a.duration_ms}ms`);
  }
  log("");
  log(`total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
  log(`artifacts: ${OUT_DIR}/{attempt_1..N.{txt,json}, summary.json}`);

  process.exit(summary.accepted ? 0 : 1);
}

main().catch(e => { console.error("[hard] fatal:", e); process.exit(2); });