From 6aafd417853bfe03216a5159cad867697b8e18ce Mon Sep 17 00:00:00 2001
From: root
Date: Sun, 3 May 2026 02:05:24 -0500
Subject: [PATCH] remove 5 orphaned dev experiments from tests/real-world/
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per J: "all of our test code ended up in the main." These are 5 one-time
dev experiments that were never wired into any automation and have zero
live consumers in the production code path. Deleting them.

Removed (1419 LOC total):
- enrich_prd_pipeline.ts (528 LOC) — Phase 21 architecture stress test
- nine_consecutive_audits.ts (186 LOC) — empirical study of audit compounding
- hard_task_escalation.ts (267 LOC) — escalation-ladder test (refs retired
  cloud models gpt-oss:20b/120b)
- autonomous_loop.ts (214 LOC) — wrapper experiment around scrum_master
- consensus_reducer_design.ts (224 LOC) — N=3 design consultation; output
  JSON referenced from a pathway_memory.rs comment but the script itself
  has no consumer

Verified: 0 references in package.json / justfile / Makefile / any
production .ts/.rs/.sh file. The single mention from pathway_memory.rs is
a //! doc comment referencing the JSON output
(data/_kb/consensus_reducer_design_*.json), not the script. Build clean
post-delete.

KEPT:
- scrum_master_pipeline.ts — referenced from observer.ts, vectord, scripts
- scrum_applier.ts — referenced from auditor schemas

If you need any of these back, they're in git history. Revert this commit,
or run git checkout HEAD~1 -- 'tests/real-world/*.ts' to recover the
source.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 tests/real-world/autonomous_loop.ts          | 214 --------
 tests/real-world/consensus_reducer_design.ts | 224 --------
 tests/real-world/enrich_prd_pipeline.ts      | 528 -------------------
 tests/real-world/hard_task_escalation.ts     | 267 ----------
 tests/real-world/nine_consecutive_audits.ts  | 186 -------
 5 files changed, 1419 deletions(-)
 delete mode 100644 tests/real-world/autonomous_loop.ts
 delete mode 100644 tests/real-world/consensus_reducer_design.ts
 delete mode 100644 tests/real-world/enrich_prd_pipeline.ts
 delete mode 100644 tests/real-world/hard_task_escalation.ts
 delete mode 100644 tests/real-world/nine_consecutive_audits.ts

diff --git a/tests/real-world/autonomous_loop.ts b/tests/real-world/autonomous_loop.ts
deleted file mode 100644
index ba6ee83..0000000
--- a/tests/real-world/autonomous_loop.ts
+++ /dev/null
@@ -1,214 +0,0 @@
-#!/usr/bin/env bun
-// Autonomous scrum loop — wraps scrum_master_pipeline.ts + scrum_applier.ts
-// in a goal-driven retry loop. Observer is POSTed an iteration summary at
-// every boundary so it can build meta-commentary outside the loop's epistemic
-// scope.
-//
-// Usage:
-//   LOOP_TARGETS="crates/a/src/x.rs,crates/b/src/y.rs" \
-//   LOOP_MAX_ITERS=5 \
-//   LOOP_PUSH=1 \
-//   bun run tests/real-world/autonomous_loop.ts
-//
-// Stop conditions: max_iters reached OR 2 consecutive iters with 0 commits.
-
-import { spawn } from "node:child_process";
-import { appendFile, readFile } from "node:fs/promises";
-import { existsSync } from "node:fs";
-
-const REPO = "/home/profit/lakehouse";
-const OBSERVER = process.env.LOOP_OBSERVER ?? "http://localhost:3800";
-const BRANCH = process.env.LOOP_BRANCH ?? "scrum/auto-apply-19814";
-const MAX_ITERS = Number(process.env.LOOP_MAX_ITERS ?? 3);
-const PUSH = process.env.LOOP_PUSH === "1";
-const MIN_CONF = process.env.LOOP_MIN_CONF ?? "85";
-// Optional override — when unset, let scrum_applier.ts use ITS default
-// (currently x-ai/grok-4.1-fast on openrouter). The prior hardcoded
-// qwen3-coder:480b default was clobbering the applier's own default
-// and forcing every iter to hit the throttled ollama_cloud account.
-const APPLIER_MODEL = process.env.LOOP_APPLIER_MODEL;
-const APPLIER_PROVIDER = process.env.LOOP_APPLIER_PROVIDER;
-const TARGETS = (process.env.LOOP_TARGETS ?? "crates/queryd/src/service.rs,crates/gateway/src/main.rs,crates/gateway/src/v1/mod.rs")
-  .split(",").map(s => s.trim()).filter(Boolean);
-
-const FORENSIC = process.env.LH_SCRUM_FORENSIC ?? `${REPO}/docs/SCRUM_FORENSIC_PROMPT.md`;
-const PROPOSAL = process.env.LH_SCRUM_PROPOSAL ?? `${REPO}/docs/SCRUM_FIX_WAVE.md`;
-
-const LOOP_ID = `loop_${Date.now().toString(36)}`;
-const JOURNAL = `${REPO}/data/_kb/autonomous_loops.jsonl`;
-
-interface IterResult {
-  iter: number;
-  scrum_reviews_added: number;
-  applier_outcomes: Record<string, number>;
-  commits_landed: number;
-  commit_shas: string[];
-  build_status: "green" | "red" | "unknown";
-  duration_ms: number;
-}
-
-function log(msg: string) {
-  const ts = new Date().toISOString().slice(11, 19);
-  console.log(`[loop ${LOOP_ID} ${ts}] ${msg}`);
-}
-
-function runCmd(cmd: string, args: string[], env: Record<string, string> = {}): Promise<{ code: number; stdout: string; stderr: string }> {
-  return new Promise((resolve) => {
-    const child = spawn(cmd, args, { cwd: REPO, env: { ...process.env, ...env } });
-    let stdout = "", stderr = "";
-    child.stdout.on("data", (d) => { stdout += d; process.stdout.write(d); });
-    child.stderr.on("data", (d) => { stderr += d; process.stderr.write(d); });
-    child.on("close", (code) => resolve({ code: code ?? -1, stdout, stderr }));
-  });
-}
-
-async function countLines(path: string): Promise<number> {
-  if (!existsSync(path)) return 0;
-  const text = await readFile(path, "utf8");
-  return text.split("\n").filter(Boolean).length;
-}
-
-async function gitHeadSha(): Promise<string> {
-  const r = await runCmd("git", ["rev-parse", "HEAD"]);
-  return r.stdout.trim();
-}
-
-async function commitsSince(baseSha: string): Promise<string[]> {
-  const r = await runCmd("git", ["log", "--oneline", `${baseSha}..HEAD`]);
-  return r.stdout.trim().split("\n").filter(Boolean);
-}
-
-async function cargoCheckGreen(): Promise<boolean> {
-  log("cargo check --workspace …");
-  const r = await runCmd("cargo", ["check", "--workspace", "--quiet"]);
-  return r.code === 0;
-}
-
-async function postObserver(payload: object) {
-  try {
-    const r = await fetch(`${OBSERVER}/event`, {
-      method: "POST",
-      headers: { "Content-Type": "application/json" },
-      body: JSON.stringify(payload),
-      signal: AbortSignal.timeout(5000),
-    });
-    if (!r.ok) log(`observer POST returned ${r.status}`);
-  } catch (e: any) {
-    log(`observer POST failed: ${e.message}`);
-  }
-}
-
-async function runIter(iter: number, baseSha: string): Promise<IterResult> {
-  const t0 = Date.now();
-  log(`══ iter ${iter} start (base ${baseSha.slice(0, 8)}) targets=${TARGETS.length}`);
-
-  const reviewsBefore = await countLines(`${REPO}/data/_kb/scrum_reviews.jsonl`);
-  const applyBefore = await countLines(`${REPO}/data/_kb/auto_apply.jsonl`);
-
-  log(`scrum_master_pipeline.ts → ${TARGETS.length} files`);
-  await runCmd("bun", ["run", "tests/real-world/scrum_master_pipeline.ts"], {
-    LH_SCRUM_FILES: TARGETS.join(","),
-    LH_SCRUM_FORENSIC: FORENSIC,
-    LH_SCRUM_PROPOSAL: PROPOSAL,
-  });
-
-  log(`scrum_applier.ts COMMIT=1 MIN_CONF=${MIN_CONF} files=${TARGETS.length}`);
-  // Only forward model/provider when explicitly overridden — otherwise
-  // let scrum_applier.ts use its own defaults (Grok 4.1 fast on openrouter).
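  // A hypothetical invocation exercising the override path (values are
  // illustrative, not defaults): pin the applier to a specific cloud model
  // instead of letting it fall back to its own.
  //
  //   LOOP_APPLIER_MODEL=qwen3-coder:480b \
  //   LOOP_APPLIER_PROVIDER=ollama_cloud \
  //   bun run tests/real-world/autonomous_loop.ts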
-  const applierEnv: Record<string, string> = {
-    LH_APPLIER_COMMIT: "1",
-    LH_APPLIER_MIN_CONF: MIN_CONF,
-    LH_APPLIER_MAX_FILES: String(TARGETS.length),
-    LH_APPLIER_BRANCH: BRANCH,
-    // Constrain applier to THIS iter's targets so it patches what we
-    // just reviewed instead of the highest-confidence file from history.
-    LH_APPLIER_FILES: TARGETS.join(","),
-  };
-  if (APPLIER_MODEL) applierEnv.LH_APPLIER_MODEL = APPLIER_MODEL;
-  if (APPLIER_PROVIDER) applierEnv.LH_APPLIER_PROVIDER = APPLIER_PROVIDER;
-  await runCmd("bun", ["run", "tests/real-world/scrum_applier.ts"], applierEnv);
-
-  const reviewsAfter = await countLines(`${REPO}/data/_kb/scrum_reviews.jsonl`);
-  const applyAfterText = existsSync(`${REPO}/data/_kb/auto_apply.jsonl`)
-    ? await readFile(`${REPO}/data/_kb/auto_apply.jsonl`, "utf8")
-    : "";
-  const applyRows = applyAfterText.split("\n").filter(Boolean).slice(applyBefore);
-  const outcomes: Record<string, number> = {};
-  for (const line of applyRows) {
-    try {
-      const o = JSON.parse(line);
-      outcomes[o.action ?? "?"] = (outcomes[o.action ?? "?"] ?? 0) + 1;
-    } catch { /* skip malformed */ }
-  }
-
-  const commitShas = await commitsSince(baseSha);
-  const buildStatus = commitShas.length > 0 ? (await cargoCheckGreen() ? "green" : "red") : "unknown";
-
-  const result: IterResult = {
-    iter,
-    scrum_reviews_added: reviewsAfter - reviewsBefore,
-    applier_outcomes: outcomes,
-    commits_landed: commitShas.length,
-    commit_shas: commitShas.map(s => s.split(" ")[0]),
-    build_status: buildStatus,
-    duration_ms: Date.now() - t0,
-  };
-
-  log(`iter ${iter} done — reviews+${result.scrum_reviews_added} commits=${result.commits_landed} build=${buildStatus} (${(result.duration_ms / 1000).toFixed(1)}s)`);
-
-  await postObserver({
-    source: "autonomous_loop",
-    loop_id: LOOP_ID,
-    event_kind: "iteration_complete",
-    iter,
-    targets: TARGETS,
-    success: buildStatus !== "red",
-    scrum_reviews_added: result.scrum_reviews_added,
-    applier_outcomes: result.applier_outcomes,
-    commits_landed: result.commits_landed,
-    commit_shas: result.commit_shas,
-    build_status: buildStatus,
-    duration_ms: result.duration_ms,
-    ts: new Date().toISOString(),
-  });
-
-  await appendFile(JOURNAL, JSON.stringify({ loop_id: LOOP_ID, ...result, ts: new Date().toISOString() }) + "\n");
-
-  return result;
-}
-
-async function main() {
-  log(`autonomous loop starting · branch=${BRANCH} max_iters=${MAX_ITERS} push=${PUSH}`);
-  log(`targets: ${TARGETS.join(", ")}`);
-
-  const branchR = await runCmd("git", ["branch", "--show-current"]);
-  if (branchR.stdout.trim() !== BRANCH) {
-    log(`ERROR: on branch ${branchR.stdout.trim()}, expected ${BRANCH}. Refusing to run.`);
-    process.exit(1);
-  }
-
-  let consecutiveZero = 0;
-  for (let iter = 1; iter <= MAX_ITERS; iter++) {
-    const baseSha = await gitHeadSha();
-    const result = await runIter(iter, baseSha);
-
-    if (PUSH && result.commits_landed > 0) {
-      log(`git push origin ${BRANCH}`);
-      const pushR = await runCmd("git", ["push", "origin", BRANCH]);
-      if (pushR.code !== 0) log(`push failed (continuing): ${pushR.stderr.slice(0, 200)}`);
-    }
-
-    consecutiveZero = result.commits_landed === 0 ? consecutiveZero + 1 : 0;
-    if (consecutiveZero >= 2) {
-      log(`STOP: 2 consecutive iters with 0 commits. Loop converged or stuck.`);
-      break;
-    }
-  }
-
-  log(`loop ${LOOP_ID} complete.
Journal: ${JOURNAL}`); -} - -main().catch((e) => { - log(`FATAL: ${e.message}`); - process.exit(1); -}); diff --git a/tests/real-world/consensus_reducer_design.ts b/tests/real-world/consensus_reducer_design.ts deleted file mode 100644 index e569235..0000000 --- a/tests/real-world/consensus_reducer_design.ts +++ /dev/null @@ -1,224 +0,0 @@ -// consensus_reducer_design.ts — N=3 design consultation. -// -// J's ask: enhance the tree-split reducer to preserve FULL backtrack-able -// context (endpoints tried, attempt count per model in the ladder, KB -// sources retrieved, context7 bridge hits, MCP observer signals, audit -// verdicts) instead of collapsing to a summary. Then index the full -// context through our existing vectord matrix indexing (HNSW + Lance + -// playbook_memory) so successful pathways become hot-swappable — the -// system asks "what did we try, what worked, in what order" for a -// similar task class and gets a ranked playbook back. -// -// Before building, consult three diverse models and print their proposals -// side-by-side so we can pick the convergent design. - -const GATEWAY = "http://localhost:3100"; - -const DESIGN_BRIEF = ` -# Context — Lakehouse signal→commit loop - -We run 6x scrum-master iterations that audit Rust crates for PRD -alignment, produce findings + confidence, and feed an auto-applier that -lands small mechanical commits through a cargo-green-and-warning-stable -gate. Key components: - -- \`tests/real-world/scrum_master_pipeline.ts\` — orchestrator. 9-rung - model LADDER (kimi-k2:1t → qwen3-coder:480b → deepseek → mistral-large - → gpt-oss:120b → qwen3.5:397b → openrouter free rescues → local - qwen3.5:latest). Each target file retrieves 5 PRD chunks + 5 - proposal-doc chunks via vectord RAG, tree-splits large files into 3.5K - shards, asks each rung in order, accepts first response passing - structural checks. -- \`mcp-server/observer.ts\` — receives scrum \`/event\` emissions - (file, verdict, critical_failures_count, gradient_tier, attempts, - reviewer_model, tree_split_fired). Escalates failure clusters to LLM - Team by POSTing to /v1/chat with qwen3-coder:480b. -- \`context7-bridge\` — external library docs lookup. -- \`auditor/audit.ts\` — independent N=3 consensus re-check of scrum - findings; writes to data/_kb/audit_facts.jsonl via LLM Team - \`/api/run?mode=extract\`. -- \`crates/vectord/src/playbook_memory.rs\` — indexing for proven - playbooks: PlaybookEntry, DocRef, FailureRecord, BoostEntry, - PatternReport. Uses HNSW index + Lance columnar backend + promotion - pipeline. Already battle-tested for workers/staffing queries. -- Tree-split REDUCER: after shards return map-style summaries, they are - concatenated with internal §N§ markers and fed to a reviewer model to - produce ONE file-level review. Currently the reducer sees summaries, - not the full context behind each shard's conclusion. - -# The problem - -The reducer currently TRUNCATES to a short summary. When the auditor or -a future iteration wants to backtrack WHY the reducer concluded what it -did — which attempt succeeded, which failed, what KB chunks were -retrieved, what observer signal classified the file as LOOPING vs -CONVERGING — that context is lost. So: - -1. Auditor can't verify citation provenance beyond the summary line. -2. Applier can't tell a "tried X, failed, qwen fixed it" playbook from a - "tried X and it was easy" playbook — they look identical downstream. -3. 
The matrix indexing is only used for RAG chunks during the scrum - pass, NOT for storing the full end-to-end pathway of a successful - review. - -# The design question - -Propose an enhanced reducer + indexing design that: - -(a) Preserves the FULL backtrack context per reviewed file: - - every ladder attempt (model, ms, accepted_y/n, reject_reason) - - every retrieved KB chunk (source doc, chunk id, cosine score, rank) - - every observer signal (class, priors, prior-iter outcomes) - - every context7 bridge hit (library, version pulled) - - every sub-pipeline call (LLM Team extract results, audit consensus) - -(b) Stores this pathway into vectord's matrix indexing alongside the - review verdict so it becomes retrievable by similarity. When a new - file's fingerprint (task_class + file-path prefix + signal class) - matches a past successful pathway, the system can hot-swap by - replaying or short-circuiting to the model/KB combo that worked. - -(c) Surfaces the matrix-index hit rate as a feedback signal on the - scrum's UI — "this file was solved 3 times before by the same ladder - rung; consider short-circuiting to rung 5." - -(d) Is compatible with the existing playbook_memory.rs primitives - (PlaybookEntry, DocRef, FailureRecord, BoostEntry) — extend don't - replace. The indexing layer is in production for workers/staffing; - we want the reducer pathway to piggyback on proven infrastructure. - -# Constraints -- NO new crate. Extend vectord + scrum_master_pipeline. -- Full context can be LARGE — a reviewed file might have 5 retrievals, - 4 ladder attempts, 8 observer priors. Design the embedding / - fingerprint so similar-but-not-identical pathways cluster. -- The reducer summary is still needed for the reviewer LLM input — - don't remove it, ADD the full-context sidecar. -- Audit trail: every pathway must be replayable deterministically from - what's stored (i.e., enough context to re-run without the original - prompt cache). - -# Required output (STRICT JSON, no prose, no markdown fences): - -{ - "approach": "one-paragraph summary of your proposed design", - "data_model": { - "new_fields_on_playbook_entry": [...], - "new_types": [ {"name": "...", "purpose": "...", "fields": [...]} ] - }, - "storage_strategy": { - "what_to_vectorize": "the text that becomes the embedding", - "fingerprint_key": "the deterministic key for similarity retrieval", - "backend": "HNSW, Lance, playbook_memory — pick" - }, - "reducer_changes": { - "inputs_added": [...], - "outputs_added": [...], - "compatibility_notes": "how existing callers stay working" - }, - "hot_swap_logic": "concrete rule for when to skip the ladder and replay a past pathway", - "ui_signal": "what to surface so J sees whether matrix indexing is earning its keep", - "risks": [...], - "why_this_beats_summarization": "one-paragraph argument" -} -`.trim(); - -interface Probe { - name: string; - provider: "ollama" | "ollama_cloud" | "openrouter"; - model: string; -} - -// Round-3 probe set — 4 probes covering the remaining ladder rungs + -// architecture/provider diversity. J wanted all 4 of the untouched -// options so the aggregated 10-model signal is saturated across the -// usable ladder. 
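// A concrete reference shape for the brief's section (b) fingerprint, as a
// minimal sketch (hypothetical; the probes below are asked to propose the
// real design): a deterministic key so similar-but-not-identical pathways
// cluster under retrieval.
//
//   interface PathwayFingerprint {
//     task_class: string;    // e.g. "scrum_review"
//     path_prefix: string;   // e.g. "crates/queryd/src"
//     signal_class: string;  // observer class, e.g. "LOOPING"
//   }
//   const fingerprintKey = (f: PathwayFingerprint) =>
//     [f.task_class, f.path_prefix, f.signal_class].join("::");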
-const PROBES: Probe[] = [ - { name: "qwen35-397b", provider: "ollama_cloud", model: "qwen3.5:397b" }, - { name: "openrouter-gpt-oss", provider: "openrouter", model: "openai/gpt-oss-120b:free" }, - { name: "openrouter-gemma3", provider: "openrouter", model: "google/gemma-3-27b-it:free" }, - { name: "qwen3-coder-480b-2", provider: "ollama_cloud", model: "qwen3-coder:480b" }, // second probe of the coding specialist — stability check -]; - -async function ask(p: Probe): Promise<{ name: string; raw: string; ms: number; error?: string }> { - const started = Date.now(); - try { - const r = await fetch(`${GATEWAY}/v1/chat`, { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ - provider: p.provider, - model: p.model, - messages: [ - { role: "system", content: "You are a senior architect. Output STRICT JSON only." }, - { role: "user", content: DESIGN_BRIEF }, - ], - max_tokens: 3000, - temperature: 0, - }), - }); - const ms = Date.now() - started; - if (!r.ok) return { name: p.name, raw: "", ms, error: `HTTP ${r.status}: ${(await r.text()).slice(0, 200)}` }; - const j = await r.json(); - const content = j.content ?? j.message?.content ?? j.choices?.[0]?.message?.content ?? ""; - return { name: p.name, raw: String(content), ms }; - } catch (e: any) { - return { name: p.name, raw: "", ms: Date.now() - started, error: String(e).slice(0, 200) }; - } -} - -function extractJson(raw: string): any | null { - let s = raw.trim(); - const fence = s.match(/^```(?:json)?\s*/); - if (fence) s = s.slice(fence[0].length); - if (s.endsWith("```")) s = s.slice(0, -3).trim(); - const first = s.indexOf("{"); - const last = s.lastIndexOf("}"); - if (first < 0 || last <= first) return null; - try { - return JSON.parse(s.slice(first, last + 1)); - } catch { - return null; - } -} - -function summarize(obj: any, max = 240): string { - if (!obj) return "(no JSON parsed)"; - if (typeof obj === "string") return obj.length > max ? obj.slice(0, max) + "…" : obj; - if (Array.isArray(obj)) return obj.map((x) => summarize(x, max)).join("; "); - return Object.entries(obj) - .map(([k, v]) => `${k}=${summarize(v, max)}`) - .join(" | "); -} - -async function main() { - console.log(`\n── N=3 design consensus ──`); - console.log(`models: ${PROBES.map((p) => p.model).join(", ")}\n`); - - const results = await Promise.all(PROBES.map(ask)); - - for (const r of results) { - console.log(`\n── ${r.name} (${r.ms}ms) ──`); - if (r.error) { console.log(` ERROR: ${r.error}`); continue; } - const j = extractJson(r.raw); - if (!j) { - console.log(` raw (no JSON): ${r.raw.slice(0, 600)}…`); - continue; - } - console.log(` approach: ${summarize(j.approach, 400)}`); - console.log(` fingerprint: ${summarize(j.storage_strategy?.fingerprint_key, 200)}`); - console.log(` vectorize: ${summarize(j.storage_strategy?.what_to_vectorize, 200)}`); - console.log(` backend: ${summarize(j.storage_strategy?.backend, 200)}`); - console.log(` hot_swap: ${summarize(j.hot_swap_logic, 300)}`); - console.log(` new_types: ${summarize(j.data_model?.new_types, 400)}`); - console.log(` risks: ${summarize(j.risks, 300)}`); - console.log(` why>summary: ${summarize(j.why_this_beats_summarization, 300)}`); - } - - // Write full JSON to disk so we can inspect later. 
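  // One could also tally convergence before persisting: a minimal sketch
  // (hypothetical, not part of the original consultation) that counts
  // backend votes across the parsed proposals so the convergent design is
  // visible at a glance.
  //
  //   const backendVotes: Record<string, number> = {};
  //   for (const r of results) {
  //     const b = extractJson(r.raw)?.storage_strategy?.backend;
  //     if (typeof b === "string") backendVotes[b] = (backendVotes[b] ?? 0) + 1;
  //   }
  //   console.log(`backend votes: ${JSON.stringify(backendVotes)}`);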
- const outPath = `/home/profit/lakehouse/data/_kb/consensus_reducer_design_${Date.now().toString(36)}.json`; - await Bun.write(outPath, JSON.stringify(results, null, 2)); - console.log(`\nfull responses → ${outPath}`); -} - -await main(); diff --git a/tests/real-world/enrich_prd_pipeline.ts b/tests/real-world/enrich_prd_pipeline.ts deleted file mode 100644 index 134e2d7..0000000 --- a/tests/real-world/enrich_prd_pipeline.ts +++ /dev/null @@ -1,528 +0,0 @@ -// Real-world architecture stress test — 6 iterations of the full pipeline -// against the PRD as a corpus. Goal: prove at scale what Phase 21 -// promised (context continuation + tree-split), plus Phase 19 -// compounding across iterations. -// -// Run: bun run tests/real-world/enrich_prd_pipeline.ts -// -// No mocks. No skipped layers. On any error, the test triggers -// cloud-rescue rather than fail — it's the architecture's job to -// recover. The test FAILS only if we can't complete 6 iterations. - -import { readFile, writeFile, mkdir } from "node:fs/promises"; -import { createHash } from "node:crypto"; - -const PRD_PATH = "/home/profit/lakehouse/docs/PRD.md"; -const CHUNK_SIZE = 800; // chars per chunk — ~200 tokens -const CHUNK_OVERLAP = 120; -const TOP_K_RETRIEVE = 12; // chunks per iteration — pulled up to force overflow -const CONTEXT_BUDGET_CHARS = 4000; // tight budget — forces tree-split on every iter -const INJECT_FAIL_ON_ITER = 3; // force the TASK-retry loop on iter 3 - -// Continuation controls (per-cloud-call) — used for output-overflow. -// Separate from the task-retry loop (per-task) — that handles errors -// across attempts. -const PRIMARY_MAX_TOKENS = 150; // tight — forces truncation -const CONTINUATION_MAX_TOKENS = 300; // each continuation doubles headroom -const MAX_CONTINUATIONS = 6; // max stitch pieces per cloud call - -// Task-level retry loop (J's clarification, 2026-04-22): -// When a TASK errors, retry the whole task up to 6 times. Each -// retry gets prior attempts' failures injected as learning context, -// so attempt N+1 is informed by what N failed at. The loop caps at -// 6 to avoid infinite spinning on genuinely unsolvable tasks. -const MAX_TASK_RETRIES = 6; - -// To FORCE the retry loop on iter INJECT_FAIL_ON_ITER, cycle through -// 5 deliberately-invalid models + 1 valid one. Attempts 1-5 will -// 502/404 from Ollama Cloud; attempt 6 finally succeeds. Proves the -// loop fires all 6 with compounding failure context. -const FORCE_RETRY_MODEL_SEQUENCE = [ - "deliberately-invalid-model-attempt-1", - "deliberately-invalid-model-attempt-2", - "deliberately-invalid-model-attempt-3", - "deliberately-invalid-model-attempt-4", - "deliberately-invalid-model-attempt-5", - "gpt-oss:20b", // 6th attempt succeeds -]; -const GATEWAY = "http://localhost:3100"; -const SIDECAR = "http://localhost:3200"; -const CLOUD_MODEL = "gpt-oss:120b"; -const RESCUE_MODEL = "gpt-oss:20b"; // fallback local cloud model via sidecar -const RUN_NONCE = Date.now().toString(36); -const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/${RUN_NONCE}`; - -// The 6 progressively-compounding questions. #6 explicitly requires -// synthesis across prior 5 answers. 
-const QUESTIONS: string[] = [
-  "Summarize the Lakehouse project's one-paragraph thesis: what problem does it solve, what's the unique approach?",
-  "How does Phase 19 playbook memory turn successful fills into a signal that boosts future rankings?",
-  "Explain the role of Phase 24 observer in the learning loop — what does it see, and what does it feed into?",
-  "What's the VRAM-aware profile swap mechanism in Phase 17, and why does it matter for multi-model serving?",
-  "How do Phase 25 validity windows and Phase 27 playbook versioning interact when a schema drifts?",
-  "Synthesize the prior 5 answers: how do the pieces (playbook memory, observer, profile swap, validity windows, versioning) compose into a system that measurably gets smarter over time? Cite specific prior answers.",
-];
-
-type Chunk = { id: string; text: string; embedding: number[]; offset: number };
-
-interface IterationResult {
-  iteration: number;
-  question: string;
-  retrieval_top_k: number;
-  context_chars_before_budget: number;
-  tree_split_fired: boolean;
-  cloud_calls_total: number;
-  continuation_retries: number;
-  rescue_triggered: boolean;
-  // Task-level retry telemetry
-  task_attempts_made: number; // how many attempts fired (1 = first succeeded)
-  task_retry_history: Array<{ n: number; model: string; error: string }>;
-  playbook_id: string | null;
-  tokens_prompt: number;
-  tokens_completion: number;
-  citations_from_prior_iterations: string[];
-  duration_ms: number;
-  answer_preview: string;
-  errors_recovered: string[];
-}
-
-function log(msg: string) { console.log(`[enrich] ${msg}`); }
-
-function sleep(ms: number) { return new Promise(r => setTimeout(r, ms)); }
-
-function cosine(a: number[], b: number[]): number {
-  let dot = 0, na = 0, nb = 0;
-  for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
-  if (na === 0 || nb === 0) return 0;
-  return dot / (Math.sqrt(na) * Math.sqrt(nb));
-}
-
-function hash(s: string): string { return createHash("sha256").update(s).digest("hex").slice(0, 10); }
-
-async function embedBatch(texts: string[]): Promise<number[][]> {
-  // Sidecar /embed accepts a list. On partial failure, retry individually.
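  // The batch call below just throws on a non-OK response; the individual
  // retry mentioned above is not implemented in this script. A minimal
  // sketch of what it could look like (hypothetical helper, same /embed
  // contract assumed):
  //
  //   async function embedOne(text: string, tries = 3): Promise<number[]> {
  //     for (let i = 1; i <= tries; i++) {
  //       const r = await fetch(`${SIDECAR}/embed`, {
  //         method: "POST", headers: { "content-type": "application/json" },
  //         body: JSON.stringify({ texts: [text] }),
  //       });
  //       if (r.ok) return ((await r.json()) as any).embeddings[0];
  //     }
  //     throw new Error(`embed failed after ${tries} attempts`);
  //   }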
- const r = await fetch(`${SIDECAR}/embed`, { - method: "POST", headers: { "content-type": "application/json" }, - body: JSON.stringify({ texts }), - signal: AbortSignal.timeout(120000), - }); - if (!r.ok) throw new Error(`embed batch ${r.status}: ${await r.text()}`); - const j: any = await r.json(); - return j.embeddings; -} - -function chunkText(text: string): Array<{ text: string; offset: number }> { - const out: Array<{ text: string; offset: number }> = []; - let i = 0; - while (i < text.length) { - const end = Math.min(i + CHUNK_SIZE, text.length); - const slice = text.slice(i, end).trim(); - if (slice.length > 50) out.push({ text: slice, offset: i }); - if (end >= text.length) break; - i = end - CHUNK_OVERLAP; - } - return out; -} - -async function chat(opts: { - provider: "ollama" | "ollama_cloud", - model: string, - messages: Array<{ role: string; content: string }>, - max_tokens: number, - think: boolean, -}): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; finish_reason: string }> { - const r = await fetch(`${GATEWAY}/v1/chat`, { - method: "POST", headers: { "content-type": "application/json" }, - body: JSON.stringify({ ...opts }), - signal: AbortSignal.timeout(180000), - }); - if (!r.ok) throw new Error(`/v1/chat ${r.status}: ${await r.text()}`); - const j: any = await r.json(); - return { - content: j.choices?.[0]?.message?.content ?? "", - prompt_tokens: j.usage?.prompt_tokens ?? 0, - completion_tokens: j.usage?.completion_tokens ?? 0, - finish_reason: j.choices?.[0]?.finish_reason ?? "?", - }; -} - -// ─── Tree-split over oversized chunk set ────────────────────────── -async function treeSplitSummarize( - chunks: Chunk[], - question: string, -): Promise<{ scratchpad: string; cloud_calls: number }> { - // Shard into groups fitting within half the budget each. - const perShard = Math.max(1, Math.floor((CONTEXT_BUDGET_CHARS / 2) / CHUNK_SIZE)); - const shards: Chunk[][] = []; - for (let i = 0; i < chunks.length; i += perShard) { - shards.push(chunks.slice(i, i + perShard)); - } - log(` tree-split: ${chunks.length} chunks → ${shards.length} shards of up to ${perShard}`); - let scratchpad = ""; - let cloud_calls = 0; - for (let si = 0; si < shards.length; si++) { - const shard = shards[si]; - const shardText = shard.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n"); - const userMsg = `Question: ${question}\n\nShard ${si + 1}/${shards.length} of source material:\n\n${shardText}\n\nScratchpad so far:\n${scratchpad || "(empty)"}\n\nUpdate the scratchpad: extract only facts from THIS shard that help answer the question. Be terse. No prose.`; - const r = await chat({ - provider: "ollama_cloud", - model: CLOUD_MODEL, - messages: [ - { role: "system", content: "You maintain a concise factual scratchpad across multiple shards of source text. No prose outside the scratchpad. Each shard, append ≤80 words of relevant facts." }, - { role: "user", content: userMsg }, - ], - max_tokens: 500, - think: false, - }); - cloud_calls += 1; - scratchpad += `\n--- shard ${si + 1} notes ---\n${r.content.trim()}`; - if (scratchpad.length > CONTEXT_BUDGET_CHARS) { - // truncate oldest halves - scratchpad = scratchpad.slice(-CONTEXT_BUDGET_CHARS); - log(` tree-split: scratchpad truncated to ${scratchpad.length} chars`); - } - } - return { scratchpad, cloud_calls }; -} - -// ─── Continuable generate — up to max_continuations stitches ────── -// -// Two failure modes handled: -// A) Empty response — typically thinking model burned the budget -// on hidden reasoning. 
Retry with 2× max_tokens.
-//  B) Truncated response (finish_reason=length) — answer got cut off
-//     mid-sentence. Pass the partial back as scratchpad and ask the
-//     model to continue from where it stopped.
-//
-// Stitching: keep appending content across retries; prompt_tokens and
-// completion_tokens accumulate; finish_reason reflects the LAST call.
-// Loop exits on the first call that finishes cleanly (stop) with
-// non-empty content, OR when retries hit the cap.
-async function generateContinuable(
-  opts: Parameters<typeof chat>[0] & { max_continuations?: number },
-): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string }> {
-  const maxCont = opts.max_continuations ?? 1;
-  let total = await chat(opts);
-  let retries = 0;
-  while (retries < maxCont && (total.content.length === 0 || total.finish_reason === "length")) {
-    retries += 1;
-    const mode = total.content.length === 0 ? "empty" : "truncated";
-    log(`  continuation retry ${retries}/${maxCont} (${mode}: finish=${total.finish_reason}, content=${total.content.length} chars)`);
-    // Continuation prompt — branch on failure mode:
-    //   empty  → retry with 2× tokens, same prompt (thinking budget)
-    //   length → pass the partial as assistant turn, ask to continue
-    const continuationMessages = total.content.length === 0
-      ? opts.messages
-      : [
-          ...opts.messages,
-          { role: "assistant", content: total.content },
-          { role: "user", content: "Continue from exactly where you stopped. Do not repeat. Finish the answer." },
-        ];
-    const continued = await chat({
-      ...opts,
-      max_tokens: CONTINUATION_MAX_TOKENS,
-      messages: continuationMessages,
-    });
-    total = {
-      content: total.content + continued.content,
-      prompt_tokens: total.prompt_tokens + continued.prompt_tokens,
-      completion_tokens: total.completion_tokens + continued.completion_tokens,
-      finish_reason: continued.finish_reason,
-    };
-  }
-  return { ...total, continuation_retries: retries };
-}
-
-// ─── Single iteration: retrieve → budget-check → chat → seed ─────
-async function runIteration(
-  iteration: number,
-  question: string,
-  allChunks: Chunk[],
-  priorPlaybookIds: string[],
-  priorAnswers: string[],
-): Promise<IterationResult> {
-  const started = Date.now();
-  const errorsRecovered: string[] = [];
-  log(`iter ${iteration}: "${question.slice(0, 70)}..."`);
-
-  // 1. Embed the question
-  const qEmb = (await embedBatch([question]))[0];
-
-  // 2. Retrieve top-K chunks by cosine
-  const scored = allChunks
-    .map(c => ({ c, score: cosine(qEmb, c.embedding) }))
-    .sort((a, b) => b.score - a.score)
-    .slice(0, TOP_K_RETRIEVE);
-  const chunks = scored.map(x => x.c);
-  log(`  retrieved top ${chunks.length} chunks (score range ${scored[0].score.toFixed(3)} .. ${scored[scored.length - 1].score.toFixed(3)})`);
-
-  // 3. Context budget check — tree-split if over
-  const contextChars = chunks.map(c => c.text).join("\n\n").length;
-  let contextForPrompt: string;
-  let treeSplit = false;
-  let cloudCallsTotal = 0;
-  if (contextChars > CONTEXT_BUDGET_CHARS) {
-    treeSplit = true;
-    log(`  context ${contextChars} chars > budget ${CONTEXT_BUDGET_CHARS} → tree-split`);
-    const { scratchpad, cloud_calls } = await treeSplitSummarize(chunks, question);
-    contextForPrompt = `Distilled scratchpad from ${chunks.length} source chunks (too large to fit directly):\n${scratchpad}`;
-    cloudCallsTotal += cloud_calls;
-  } else {
-    contextForPrompt = chunks.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n");
-  }
-
-  // 4.
Seed prompt with prior iteration answers (real compounding). - // Not just IDs — the model needs the CONTENT to synthesize. - let citationBlock = ""; - let citationsReceived: string[] = []; - if (priorPlaybookIds.length > 0 && priorAnswers.length > 0) { - const lines = priorAnswers.map((ans, i) => { - const pid = priorPlaybookIds[i]?.slice(0, 12) ?? "unknown"; - // Trim each prior answer to ~400 chars so we don't blow budget - return `[pb:${pid}] iter ${i + 1} answer:\n${ans.slice(0, 400)}\n`; - }); - citationBlock = `\n\n═══ PRIOR ITERATIONS (compounding context) ═══\n${lines.join("\n")}═══ end prior iterations ═══\n\nYour answer MUST cite specific prior iterations using [pb:ID] notation when drawing on them. Synthesis questions require explicit cross-iteration reasoning.`; - citationsReceived = priorPlaybookIds.slice(); - } - - // 5. TASK-LEVEL RETRY LOOP — per J's clarification 2026-04-22. - // Try the task up to MAX_TASK_RETRIES times. Each retry: - // a) Picks a model (normally CLOUD_MODEL; on INJECT_FAIL_ON_ITER, - // cycles through 5 invalid models + 1 valid to force full loop) - // b) Injects prior attempt errors as learning context - // c) If the attempt succeeds (non-empty, >100 chars), loop exits - // d) Otherwise, records failure and tries again with the learning - // - // Cap at 6 so we don't spin forever on unsolvable tasks. - let result: { content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string } | null = null; - let rescueTriggered = false; - const taskAttemptHistory: Array<{ n: number; model: string; error: string }> = []; - const forceRetries = iteration === INJECT_FAIL_ON_ITER; - if (forceRetries) log(` FORCING TASK-RETRY LOOP — iter ${iteration} will cycle through 5 invalid models + 1 valid`); - - for (let attempt = 1; attempt <= MAX_TASK_RETRIES; attempt++) { - const modelForAttempt = forceRetries - ? FORCE_RETRY_MODEL_SEQUENCE[attempt - 1] - : CLOUD_MODEL; - // Compose a prior-attempts learning block for attempts 2+ - const learningBlock = taskAttemptHistory.length > 0 - ? `\n\n═══ PRIOR ATTEMPTS THIS TASK (do NOT repeat these failures; adjust approach) ═══\n${taskAttemptHistory.map(a => `Attempt ${a.n} (model ${a.model}) failed: ${a.error.slice(0, 160)}`).join("\n")}\n═══ end prior attempts ═══\n` - : ""; - log(` task attempt ${attempt}/${MAX_TASK_RETRIES}: model=${modelForAttempt}${learningBlock ? " [with prior-failure context]" : ""}`); - try { - const r = await generateContinuable({ - provider: "ollama_cloud", - model: modelForAttempt, - messages: [ - { role: "system", content: "You answer questions about the Lakehouse PRD using only the provided source material and prior iteration answers. Be specific. Cite chunk offsets OR [pb:ID] markers. Write a detailed 250-word answer." 
}, - { role: "user", content: `Question: ${question}\n\nSource material:\n${contextForPrompt}${citationBlock}${learningBlock}` }, - ], - max_tokens: PRIMARY_MAX_TOKENS, - think: false, - max_continuations: MAX_CONTINUATIONS, - }); - cloudCallsTotal += 1 + r.continuation_retries; - if (r.content && r.content.length > 100) { - // Acceptable answer — exit loop - result = r; - if (attempt > 1) { - log(` task attempt ${attempt} SUCCEEDED (${r.content.length} chars) after ${attempt - 1} prior failures`); - rescueTriggered = true; - } - break; - } - // Thin response — count as failure with learning signal - const err = `thin-answer: ${r.content.length} chars, finish=${r.finish_reason}`; - taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err }); - errorsRecovered.push(`attempt ${attempt}: ${err}`); - } catch (e) { - const err = (e as Error).message; - taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err }); - errorsRecovered.push(`attempt ${attempt}: ${err.slice(0, 120)}`); - cloudCallsTotal += 1; - } - } - - // Last-ditch: if all 6 task attempts failed, try the local fallback - // once more so we at least return SOMETHING. This is the "don't get - // caught in a loop, accept best-so-far" rule J stated explicitly. - if (!result) { - errorsRecovered.push(`all ${MAX_TASK_RETRIES} task attempts failed — local fallback`); - rescueTriggered = true; - try { - result = await generateContinuable({ - provider: "ollama", - model: "qwen3.5:latest", - messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }], - max_tokens: 300, - think: false, - max_continuations: 2, - }); - cloudCallsTotal += 1 + result.continuation_retries; - } catch (e) { - // Absolute last resort — fabricate a skeleton result - result = { - content: `[task failed after ${MAX_TASK_RETRIES} retries + local fallback: ${(e as Error).message}]`, - prompt_tokens: 0, - completion_tokens: 0, - continuation_retries: 0, - finish_reason: "error", - }; - } - } - if (result.content.length === 0) { - errorsRecovered.push("even rescue returned empty — last-ditch local fallback"); - rescueTriggered = true; - result = await generateContinuable({ - provider: "ollama", - model: "qwen3.5:latest", - messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }], - max_tokens: 300, - think: false, - }); - cloudCallsTotal += 1; - } - - // 6. Seed playbook with the answer - let playbook_id: string | null = null; - try { - const ts = new Date().toISOString(); - const seedOp = `TEST: enrich_prd_run_${RUN_NONCE} iter${iteration} in Corpus, PRD`; - const r = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, { - method: "POST", headers: { "content-type": "application/json" }, - body: JSON.stringify({ - operation: seedOp, - approach: `q="${question.slice(0, 80)}" context_chars=${contextChars} tree_split=${treeSplit}`, - context: result.content.slice(0, 600), - endorsed_names: [`iter${iteration}_${RUN_NONCE}`], - append: true, - }), - signal: AbortSignal.timeout(15000), - }); - if (r.ok) { - const j: any = await r.json(); - playbook_id = j.outcome?.playbook_id ?? 
null; - } else { - errorsRecovered.push(`seed ${r.status}: ${(await r.text()).slice(0, 100)}`); - } - } catch (e) { - errorsRecovered.push(`seed exception: ${(e as Error).message}`); - } - - return { - iteration, - question, - retrieval_top_k: chunks.length, - context_chars_before_budget: contextChars, - tree_split_fired: treeSplit, - cloud_calls_total: cloudCallsTotal, - continuation_retries: result.continuation_retries, - rescue_triggered: rescueTriggered, - task_attempts_made: taskAttemptHistory.length + 1, // +1 for the successful attempt - task_retry_history: taskAttemptHistory, - playbook_id, - tokens_prompt: result.prompt_tokens, - tokens_completion: result.completion_tokens, - citations_from_prior_iterations: citationsReceived, - duration_ms: Date.now() - started, - answer_preview: result.content.slice(0, 500), - errors_recovered: errorsRecovered, - }; -} - -async function main() { - await mkdir(OUT_DIR, { recursive: true }); - log(`run nonce: ${RUN_NONCE}`); - log(`output dir: ${OUT_DIR}`); - - // ─── Phase 1: load, chunk, embed the PRD ─────────────────────── - log(`loading PRD from ${PRD_PATH}`); - const prd = await readFile(PRD_PATH, "utf8"); - log(`PRD: ${prd.length} chars, ${prd.split("\n").length} lines`); - - const raw_chunks = chunkText(prd); - log(`chunked into ${raw_chunks.length} pieces (size ${CHUNK_SIZE}, overlap ${CHUNK_OVERLAP})`); - - // Embed in batches of 32 to avoid sidecar overload - const allChunks: Chunk[] = []; - const BATCH = 32; - const t0 = Date.now(); - for (let i = 0; i < raw_chunks.length; i += BATCH) { - const batch = raw_chunks.slice(i, i + BATCH); - const embs = await embedBatch(batch.map(b => b.text)); - for (let j = 0; j < batch.length; j++) { - allChunks.push({ - id: hash(batch[j].text), - text: batch[j].text, - embedding: embs[j].map(x => Number(x)), - offset: batch[j].offset, - }); - } - log(` embedded ${allChunks.length}/${raw_chunks.length}`); - } - log(`embedded all ${allChunks.length} chunks in ${((Date.now() - t0) / 1000).toFixed(1)}s`); - - // ─── Phase 2: 6 iterations ───────────────────────────────────── - const results: IterationResult[] = []; - const priorIds: string[] = []; - const priorAnswers: string[] = []; - for (let i = 1; i <= QUESTIONS.length; i++) { - const q = QUESTIONS[i - 1]; - const r = await runIteration(i, q, allChunks, priorIds, priorAnswers); - results.push(r); - if (r.playbook_id) priorIds.push(r.playbook_id); - priorAnswers.push(r.answer_preview); - log(` → iter ${i}: ${r.errors_recovered.length} errors recovered, ${r.continuation_retries} continuations, tree-split=${r.tree_split_fired}, rescue=${r.rescue_triggered}, ${r.duration_ms}ms`); - await writeFile(`${OUT_DIR}/iter_${i}.json`, JSON.stringify(r, null, 2)); - } - // Check whether iter 6 actually cited prior pb:IDs in its answer. - // Playbook IDs look like `pb-seed-` so the regex needs to allow - // hyphens + letters inside the brackets, not just hex chars. - const iter6 = results[5]; - const citationsHonored = iter6 ? (iter6.answer_preview.match(/\[pb:[\w-]+\]/gi)?.length ?? 
0) : 0; - - // ─── Phase 3: summary ────────────────────────────────────────── - const summary = { - run_nonce: RUN_NONCE, - ran_at: new Date().toISOString(), - prd_chars: prd.length, - prd_chunks: allChunks.length, - iterations: results.length, - total_cloud_calls: results.reduce((s, r) => s + r.cloud_calls_total, 0), - total_continuation_retries: results.reduce((s, r) => s + r.continuation_retries, 0), - total_errors_recovered: results.reduce((s, r) => s + r.errors_recovered.length, 0), - tree_splits_fired: results.filter(r => r.tree_split_fired).length, - rescues_triggered: results.filter(r => r.rescue_triggered).length, - iter6_received_prior_ids: results[5]?.citations_from_prior_iterations.length ?? 0, - iter6_actually_cited_in_answer: citationsHonored, - iter3_task_attempts: results[2]?.task_attempts_made ?? 0, - iter3_task_retries: results[2]?.task_retry_history.length ?? 0, - max_task_attempts_any_iter: Math.max(...results.map(r => r.task_attempts_made)), - total_duration_ms: results.reduce((s, r) => s + r.duration_ms, 0), - overall: results.length === 6 && results.every(r => r.playbook_id !== null) ? "PASS" : "PARTIAL", - }; - await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2)); - - log(""); - log(`══════ SUMMARY ${summary.overall} ══════`); - log(` 6 iterations, ${summary.total_cloud_calls} cloud calls, ${summary.total_errors_recovered} errors recovered`); - log(` tree-splits: ${summary.tree_splits_fired}/6 continuations: ${summary.total_continuation_retries} rescues: ${summary.rescues_triggered}`); - log(` iter 6 received ${summary.iter6_received_prior_ids} prior IDs, cited ${summary.iter6_actually_cited_in_answer} [pb:...] markers in its answer`); - log(` iter 3 task-retry loop: ${summary.iter3_task_attempts} attempts (${summary.iter3_task_retries} prior-failure retries before success)`); - log(` total duration: ${(summary.total_duration_ms / 1000).toFixed(1)}s`); - log(""); - for (const r of results) { - const flags = [ - r.tree_split_fired ? "tree-split" : "", - r.continuation_retries > 0 ? `cont=${r.continuation_retries}` : "", - r.rescue_triggered ? "rescued" : "", - r.errors_recovered.length > 0 ? `err=${r.errors_recovered.length}` : "", - ].filter(Boolean).join(" "); - log(` iter ${r.iteration}: ${r.tokens_prompt}+${r.tokens_completion} tok, ${r.duration_ms}ms ${flags ? `[${flags}]` : ""}`); - } - log(""); - log(`artifacts: ${OUT_DIR}/{iter_1..6.json, summary.json}`); - process.exit(summary.overall === "PASS" ? 0 : 1); -} - -main().catch(e => { console.error("[enrich] fatal:", e); process.exit(2); }); diff --git a/tests/real-world/hard_task_escalation.ts b/tests/real-world/hard_task_escalation.ts deleted file mode 100644 index 7f69f1b..0000000 --- a/tests/real-world/hard_task_escalation.ts +++ /dev/null @@ -1,267 +0,0 @@ -// Hard-task escalation test. The task is deliberately constructed so -// that a local 7B model (qwen3.5:latest) will miss at least one of the -// validation rules. Watch the escalation ladder: -// 1. qwen3.5:latest (local 7B) — likely fails -// 2. qwen3:latest (local 7B) — likely fails differently -// 3. gpt-oss:20b (cloud 20B) — may fail -// 4. gpt-oss:120b (cloud 120B) — should succeed -// 5. gpt-oss:120b w/ prior-attempt errors injected — retry with context -// 6. 
absolute last ditch: return best-so-far with failure annotation
-//
-// Each attempt:
-//  - Calls the model via /v1/chat
-//  - Validates the output against a strict rubric
-//  - On fail: records the specific rubric violations + the partial
-//    output, injects both into the next attempt's prompt as "here's
-//    what's wrong, fix it specifically"
-//  - On success: exit loop
-//
-// Run: bun run tests/real-world/hard_task_escalation.ts

-import { writeFile, mkdir } from "node:fs/promises";
-
-const GATEWAY = "http://localhost:3100";
-const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/hard_task_${Date.now().toString(36)}`;
-const MAX_ATTEMPTS = 6;
-
-// The hard task. Specific enough that a small model will miss at
-// least one rule. Not purely knowledge-based — it's a code-generation
-// task with strict structural constraints.
-const TASK = `Write a complete Rust async function with the EXACT signature:
-
-    pub async fn check_drift_batched(refs: Vec<DocRef>) -> Result<Vec<String>, String>
-
-It must:
-1. Group refs by tool name (case-insensitive — use .to_ascii_lowercase())
-2. Issue parallel HTTP GET requests to http://localhost:3900/docs/{tool}/diff?since={snippet_hash}
-3. Use reqwest and a JoinSet/Semaphore to cap concurrent in-flight requests at 4
-4. On HTTP 5xx, retry with exponential backoff: sleep 250ms, then 500ms, then 1000ms, then give up on that tool
-5. Parse the response JSON: {"drifted": bool, ...}. Return a Vec of tool names where drifted == true
-6. All errors bubble via ? or Result — NO .unwrap(), NO .expect(), NO panic!()
-7. Include rustdoc /// comments on the function and each internal helper
-
-Assume this struct is already imported:
-
-    pub struct DocRef { pub tool: String, pub snippet_hash: Option<String>, pub version_seen: String }
-
-Output ONLY the Rust code. No prose, no markdown fences, no explanation. Start directly with the /// doc comment.`;
-
-// Escalation ladder — small-local → large-local → cloud → specialist
-// cloud → trillion-param cloud. Corrected 2026-04-22 per J:
-// gpt-oss:20b is LOCAL (ollama list confirms 13 GB on disk), and the
-// final escalation tier should be kimi-k2:1t (the biggest model
-// we have access to on Ollama Cloud).
-const LADDER: Array<{ provider: "ollama" | "ollama_cloud"; model: string; note: string }> = [
-  { provider: "ollama", model: "qwen3.5:latest", note: "local 7B" },
-  { provider: "ollama", model: "qwen3:latest", note: "local 7B (different)" },
-  { provider: "ollama", model: "gpt-oss:20b", note: "local 20B" }, // FIXED: local, not cloud
-  { provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B" },
-  { provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B (coding specialist)" },
-  // NOTE 2026-04-22 — J wanted Kimi as the last escalation but Kimi
-  // K2.5/K2.6 both return "this model requires a subscription" on our
-  // current Ollama Cloud key. mistral-large-3:675b is the biggest
-  // model actually provisioned on this key (verified via direct curl
-  // to ollama.com/api/generate). Upgrade path: Ollama Cloud Pro →
-  // swap this line to kimi-k2.5 or kimi-k2.6:cloud.
-  { provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B (biggest available on current key; kimi-k2.x needs pro subscription)" },
-];
-
-// Validation rubric — the answer must pass all of these to be accepted.
-interface RubricResult {
-  passed: boolean;
-  violations: string[];
-  passed_rules: string[];
-}
-
-function validate(code: string): RubricResult {
-  const violations: string[] = [];
-  const passed: string[] = [];
-
-  const check = (rule: string, ok: boolean) => { ok ? passed.push(rule) : violations.push(rule); };
-
-  check("has pub async fn check_drift_batched signature",
-    /pub\s+async\s+fn\s+check_drift_batched\s*\(/.test(code));
-  check("takes Vec<DocRef> argument",
-    /refs\s*:\s*Vec\s*<\s*DocRef\s*>/.test(code));
-  check("returns Result<Vec<String>, String>",
-    /Result\s*<\s*Vec\s*<\s*String\s*>\s*,\s*String\s*>/.test(code));
-  check("uses reqwest",
-    /\breqwest\b/i.test(code));
-  check("references JoinSet or Semaphore for concurrency",
-    /\bJoinSet\b|\bSemaphore\b/i.test(code));
-  check("bounds concurrency at 4",
-    /\b4\b/.test(code) && (/Semaphore\s*::\s*new\s*\(\s*4\b/.test(code) || /permits\s*:\s*4\b/.test(code) || /limit\s*:\s*4\b/.test(code) || /max\s*:\s*4\b/.test(code) || /capacity\s*:\s*4\b/.test(code)));
-  // Exponential backoff — models express this several ways. Accept
-  // any recognizable doubling pattern starting at 250ms. 2026-04-22:
-  // devstral-2:123b wrote `retry_delay *= 2` which my earlier regex
-  // rejected even though the code is correct. Broadening rubric to
-  // match all idiomatic doubling forms.
-  const hasSeed250 = /Duration\s*::\s*from_millis\s*\(\s*250\b/.test(code)
-    || /millis\s*\(\s*250\b/.test(code);
-  const hasDoublingPattern = /250\s*\*\s*2/.test(code)  // 250 * 2^n literal
-    || /<<\s*\d+/.test(code)                            // bit-shift
-    || /\.pow\s*\(/.test(code)                          // 2u32.pow(attempt)
-    || /\*=\s*2\b/.test(code)                           // delay *= 2  ← was missing
-    || /\*\s*2\s*;/.test(code)                          // delay = delay * 2;
-    || /saturating_mul\s*\(\s*2\b/.test(code);          // saturating doubling
-  check("has 250ms backoff seed",
-    hasSeed250);
-  check("reaches 500ms backoff (literal or doubling from 250)",
-    /Duration\s*::\s*from_millis\s*\(\s*500\b/.test(code)
-    || /millis\s*\(\s*500\b/.test(code)
-    || (hasSeed250 && hasDoublingPattern));
-  check("reaches 1000ms backoff (literal or doubling to 1000)",
-    /Duration\s*::\s*from_millis\s*\(\s*1000\b/.test(code)
-    || /millis\s*\(\s*1000\b/.test(code)
-    || (hasSeed250 && hasDoublingPattern));
-  check("case-insensitive tool grouping (to_ascii_lowercase)",
-    /to_ascii_lowercase|to_lowercase/.test(code));
-  check("NO .unwrap() — all errors bubble via ?",
-    !/\.unwrap\s*\(\s*\)/.test(code));
-  check("NO .expect(...)
— all errors bubble via ?", - !/\.expect\s*\(/.test(code)); - check("NO panic!() / unimplemented!() / todo!()", - !/\bpanic!\s*\(|\bunimplemented!\s*\(|\btodo!\s*\(/.test(code)); - check("has rustdoc /// comments", - /\/\/\//.test(code)); - check("reasonable length (> 500 chars)", - code.length > 500); - - return { passed: violations.length === 0, violations, passed_rules: passed }; -} - -function log(msg: string) { console.log(`[hard] ${msg}`); } - -async function chat(opts: { - provider: "ollama" | "ollama_cloud", - model: string, - prompt: string, -}): Promise<{ content: string; error?: string }> { - try { - const r = await fetch(`${GATEWAY}/v1/chat`, { - method: "POST", headers: { "content-type": "application/json" }, - body: JSON.stringify({ - provider: opts.provider, - model: opts.model, - messages: [{ role: "user", content: opts.prompt }], - max_tokens: 2500, - temperature: 0.2, - think: false, - }), - signal: AbortSignal.timeout(240000), - }); - if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 300)}` }; - const j: any = await r.json(); - return { content: j.choices?.[0]?.message?.content ?? "" }; - } catch (e) { - return { content: "", error: (e as Error).message }; - } -} - -interface AttemptRecord { - n: number; - provider: string; - model: string; - duration_ms: number; - content_chars: number; - error: string | null; - rubric_violations: string[]; - rubric_passed: string[]; - accepted: boolean; -} - -function extractCode(raw: string): string { - // Strip common fence wrappers - const m = raw.match(/```(?:rust)?\s*\n([\s\S]*?)```/); - if (m) return m[1].trim(); - return raw.trim(); -} - -async function main() { - await mkdir(OUT_DIR, { recursive: true }); - log(`output: ${OUT_DIR}`); - log(`task: ${TASK.slice(0, 120)}...`); - log(""); - - const attempts: AttemptRecord[] = []; - let acceptedCode: string | null = null; - - for (let i = 0; i < MAX_ATTEMPTS; i++) { - const n = i + 1; - const rung = LADDER[i] ?? LADDER[LADDER.length - 1]; - - // Build the prompt: base task + prior failures' learning blocks - let priorLearning = ""; - if (attempts.length > 0) { - priorLearning = `\n\n═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══\n`; - for (const a of attempts) { - priorLearning += `Attempt ${a.n} (${a.provider}/${a.model}, ${a.content_chars} chars) violations:\n`; - for (const v of a.rubric_violations) priorLearning += ` - ${v}\n`; - if (a.error) priorLearning += ` [error: ${a.error.slice(0, 120)}]\n`; - } - priorLearning += `═══ end prior attempts ═══\n\nDO NOT repeat the above violations. Address each one explicitly.`; - } - - log(`attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${priorLearning ? " [w/ learning]" : ""}`); - const t0 = Date.now(); - const r = await chat({ provider: rung.provider, model: rung.model, prompt: TASK + priorLearning }); - const dur = Date.now() - t0; - - const code = extractCode(r.content); - const rubric = code ? validate(code) : { passed: false, violations: ["empty response"], passed_rules: [] }; - - const record: AttemptRecord = { - n, - provider: rung.provider, - model: rung.model, - duration_ms: dur, - content_chars: code.length, - error: r.error ?? null, - rubric_violations: rubric.violations, - rubric_passed: rubric.passed_rules, - accepted: rubric.passed, - }; - attempts.push(record); - - log(` → ${dur}ms, ${code.length} chars, ${rubric.passed_rules.length} rules passed / ${rubric.violations.length} failed${r.error ? 
`, err: ${r.error.slice(0, 80)}` : ""}`); - for (const v of rubric.violations.slice(0, 5)) log(` ✗ ${v}`); - - await writeFile(`${OUT_DIR}/attempt_${n}.txt`, code); - await writeFile(`${OUT_DIR}/attempt_${n}.json`, JSON.stringify(record, null, 2)); - - if (rubric.passed) { - log(` ✅ ACCEPTED on attempt ${n}`); - acceptedCode = code; - break; - } - } - - const summary = { - task: TASK.slice(0, 200), - total_attempts: attempts.length, - accepted: acceptedCode !== null, - accepted_on_attempt: acceptedCode ? attempts.findIndex(a => a.accepted) + 1 : null, - escalation_path: attempts.map(a => `${a.provider}/${a.model}`), - per_attempt_pass_counts: attempts.map(a => a.rubric_passed.length), - per_attempt_violation_counts: attempts.map(a => a.rubric_violations.length), - total_duration_ms: attempts.reduce((s, a) => s + a.duration_ms, 0), - }; - await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2)); - - log(""); - log(`═══ RESULT ═══`); - log(`attempts: ${summary.total_attempts}`); - log(`accepted: ${summary.accepted} ${summary.accepted ? `on attempt ${summary.accepted_on_attempt}` : ""}`); - log(`escalation path:`); - for (const [i, a] of attempts.entries()) { - const mark = a.accepted ? "✅" : "❌"; - log(` ${mark} attempt ${i + 1}: ${a.provider}/${a.model} — ${a.rubric_passed.length}/${a.rubric_passed.length + a.rubric_violations.length} rules passed, ${a.duration_ms}ms`); - } - log(""); - log(`total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`); - log(`artifacts: ${OUT_DIR}/{attempt_1..N.{txt,json}, summary.json}`); - - process.exit(summary.accepted ? 0 : 1); -} - -main().catch(e => { console.error("[hard] fatal:", e); process.exit(2); }); diff --git a/tests/real-world/nine_consecutive_audits.ts b/tests/real-world/nine_consecutive_audits.ts deleted file mode 100644 index c204c07..0000000 --- a/tests/real-world/nine_consecutive_audits.ts +++ /dev/null @@ -1,186 +0,0 @@ -// Nine-consecutive audit runner — empirical test of the predictive- -// compounding property. Runs the audit pipeline 9 times against the -// same PR (each time with a new diff from Gitea), captures the -// verdict + audit_lessons state after each run, and reports whether -// the KB stabilizes or drifts. -// -// What we expect (favorable compounding): -// - signature_count grows sublinearly (same patterns recur, so -// distinct-signature count stabilizes fast) -// - verdict settles on a stable value after run 2-3 (first audit -// establishes baseline, rest repeat) -// - confidence stays LOW for all signatures (same PR repeatedly) -// - NO new recurring findings fire because confidence < 0.3 on -// same-PR noise (kb_index rating policy) -// -// What would indicate drift (the thing we want to prove DOESN'T happen): -// - signature_count grows linearly — each run produces new signatures -// - verdict oscillates (block → approve → block ...) -// - confidence inflates — kb_index rating escalates on repeated runs -// -// Run: bun run tests/real-world/nine_consecutive_audits.ts - -import { readFile, writeFile } from "node:fs/promises"; -import { join } from "node:path"; -import { aggregate } from "../../auditor/kb_index.ts"; -import { getPrSnapshot } from "../../auditor/gitea.ts"; -import { auditPr } from "../../auditor/audit.ts"; - -const REPO = "/home/profit/lakehouse"; -const AUDIT_LESSONS = `${REPO}/data/_kb/audit_lessons.jsonl`; -const VERDICTS_DIR = `${REPO}/data/_auditor/verdicts`; -const POLL_INTERVAL_MS = 5_000; -const RUNS = Number(process.env.LH_AUDIT_RUNS ?? 
-const TARGET_PR = Number(process.env.LH_AUDIT_PR ?? 8);
-const SKIP_INFERENCE = process.env.LH_AUDITOR_SKIP_INFERENCE !== "0";
-const RESET_KB = process.env.LH_RESET_KB === "1";
-
-async function waitForVerdict(prNum: number, sha: string, deadlineMs: number): Promise<any> {
-  const short = sha.slice(0, 12);
-  const path = join(VERDICTS_DIR, `${prNum}-${short}.json`);
-  const start = Date.now();
-  while (Date.now() - start < deadlineMs) {
-    try {
-      const raw = await readFile(path, "utf8");
-      return JSON.parse(raw);
-    } catch { /* not yet */ }
-    await Bun.sleep(POLL_INTERVAL_MS);
-  }
-  throw new Error(`no verdict file after ${deadlineMs}ms: ${path}`);
-}
-
-async function captureAggState(): Promise<{ sig_count: number; max_count: number; max_confidence: number; recurring_max_count: number; top3: Array<{ sig: string; count: number; conf: number; summary: string }> }> {
-  const agg = await aggregate(AUDIT_LESSONS, {
-    keyFn: (r) => r?.signature,
-    scopeFn: (r) => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
-  });
-  const list = Array.from(agg.values()).sort((a, b) => b.count - a.count);
-  const recurring = list.filter(r => r.count >= 2);
-  const recurringMaxCount = recurring.length > 0 ? Math.max(...recurring.map(a => a.count)) : 0;
-  const recurringMaxConf = recurring.length > 0 ? Math.max(...recurring.map(a => a.confidence)) : 0;
-  return {
-    sig_count: list.length,
-    max_count: list[0]?.count ?? 0,
-    max_confidence: recurringMaxConf,
-    recurring_max_count: recurringMaxCount,
-    top3: list.slice(0, 3).map(a => ({
-      sig: a.signature,
-      count: a.count,
-      conf: a.confidence,
-      summary: a.representative_summary.slice(0, 80),
-    })),
-  };
-}
-
-interface RunRecord {
-  run: number;
-  sha: string;
-  verdict_overall: string;
-  findings_total: number;
-  findings_block: number;
-  findings_warn: number;
-  findings_info: number;
-  audit_duration_ms: number;
-  claims_total: number;
-  claims_empirical: number;
-  kb_sig_count_after: number;
-  kb_max_count_after: number;
-  kb_max_confidence_after: number;
-  kb_recurring_max_count: number;
-}
-
-async function main() {
-  console.log(`[nine] target PR: #${TARGET_PR}`);
-  console.log(`[nine] runs: ${RUNS}`);
-  console.log(`[nine] skip_inference: ${SKIP_INFERENCE}`);
-  console.log(`[nine] reset_kb: ${RESET_KB}`);
-  console.log(`[nine] audit_lessons.jsonl: ${AUDIT_LESSONS}`);
-
-  if (RESET_KB) {
-    console.log("[nine] clearing audit_lessons.jsonl for clean test...");
-    await writeFile(AUDIT_LESSONS, "");
-  }
-  console.log("");
-
-  const pr = await getPrSnapshot(TARGET_PR);
-  console.log(`[nine] PR #${pr.number}: "${pr.title}" (head=${pr.head_sha.slice(0, 12)})`);
-  console.log(`[nine] files in diff: ${pr.files.length}`);
-  console.log("");
-
-  const baseline = await captureAggState();
-  console.log(`[nine] baseline: sig_count=${baseline.sig_count} max_count=${baseline.max_count} max_conf=${baseline.max_confidence.toFixed(2)}`);
-  console.log("");
-
-  const records: RunRecord[] = [];
-  for (let n = 1; n <= RUNS; n++) {
-    const t0 = Date.now();
-    console.log(`─── run ${n}/${RUNS} ───`);
-
-    const verdict = await auditPr(pr, {
-      dry_run: true,
-      skip_dynamic: true,
-      skip_inference: SKIP_INFERENCE,
-    });
-
-    console.log(`  sha ${verdict.head_sha.slice(0, 12)}`);
-    const after = await captureAggState();
-    const rec: RunRecord = {
-      run: n,
-      sha: verdict.head_sha.slice(0, 12),
-      verdict_overall: String(verdict.overall),
-      findings_total: Number(verdict.metrics?.findings_total ?? 0),
-      findings_block: Number(verdict.metrics?.findings_block ?? 0),
-      findings_warn: Number(verdict.metrics?.findings_warn ?? 0),
-      findings_info: Number(verdict.metrics?.findings_info ?? 0),
-      audit_duration_ms: Number(verdict.metrics?.audit_duration_ms ?? 0),
-      claims_total: Number(verdict.metrics?.claims_total ?? 0),
-      claims_empirical: Number(verdict.metrics?.claims_empirical ?? 0),
-      kb_sig_count_after: after.sig_count,
-      kb_max_count_after: after.max_count,
-      kb_max_confidence_after: after.max_confidence,
-      kb_recurring_max_count: after.recurring_max_count,
-    };
-    records.push(rec);
-    console.log(`  verdict=${rec.verdict_overall} findings=${rec.findings_total} (b=${rec.findings_block} w=${rec.findings_warn})`);
-    console.log(`  kb after: sig=${rec.kb_sig_count_after} max_count=${rec.kb_max_count_after} recurring_max=${rec.kb_recurring_max_count} max_conf=${rec.kb_max_confidence_after.toFixed(2)}`);
-    console.log(`  elapsed: ${((Date.now() - t0) / 1000).toFixed(1)}s`);
-    console.log("");
-  }
-
-  console.log("═══ FINAL ═══");
-  console.log("run | verdict | find | block warn info | dur_s | kb_sig max_count max_conf");
-  for (const r of records) {
-    console.log(
-      `  ${String(r.run).padStart(1)} | ${r.verdict_overall.padEnd(16)} | ${String(r.findings_total).padStart(4)} | ${String(r.findings_block).padStart(5)} ${String(r.findings_warn).padStart(5)} ${String(r.findings_info).padStart(5)} | ${(r.audit_duration_ms / 1000).toFixed(1).padStart(5)} | ${String(r.kb_sig_count_after).padStart(6)} ${String(r.kb_max_count_after).padStart(9)} ${r.kb_max_confidence_after.toFixed(2)}`,
-    );
-  }
-
-  console.log("");
-  console.log("═══ COMPOUNDING PROPERTY ═══");
-  const sigDelta = records[records.length - 1].kb_sig_count_after - baseline.sig_count;
-  const maxConf = records[records.length - 1].kb_max_confidence_after;
-  const recurringMax = records[records.length - 1].kb_recurring_max_count;
-  console.log(`  signatures added over ${RUNS} runs: ${sigDelta}`);
-  console.log(`  max recurring count after run ${RUNS}: ${recurringMax} (same-PR recurrences per signature)`);
-  console.log(`  max confidence after run ${RUNS}: ${maxConf.toFixed(2)} (expect LOW — same-PR should not inflate)`);
-
-  const verdictSet = new Set(records.map(r => r.verdict_overall));
-  if (verdictSet.size === 1) {
-    console.log(`  verdict stable: all ${RUNS} runs returned '${[...verdictSet][0]}' ✓`);
-  } else {
-    console.log(`  verdict oscillated across runs: ${[...verdictSet].join(" | ")} ✗`);
-  }
-
-  if (maxConf < 0.6 && recurringMax < 5) {
-    console.log(`  confidence policy holding: same-PR noise stays below escalation threshold ✓`);
-  } else {
-    console.log(`  ⚠ cross-cutting pattern detected (conf=${maxConf.toFixed(2)}, recurring=${recurringMax}) — kb_index policy escalated`);
-  }
-
-  const jsonOut = `${REPO}/tests/real-world/runs/nine_consecutive_${Date.now().toString(36)}.json`;
-  await Bun.write(jsonOut, JSON.stringify({ target_pr: TARGET_PR, baseline, records }, null, 2));
-  console.log("");
-  console.log(`  report: ${jsonOut}`);
-}
-
-main().catch(e => { console.error("[nine] fatal:", e); process.exit(1); });
\ No newline at end of file
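
Reimplementation note: the escalation ladder in hard_task_escalation.ts reduces to a short retry loop in which each rung's prompt carries the prior rungs' rubric violations. A minimal TypeScript sketch follows; the Rung, run, and validate names and shapes are illustrative assumptions, not the deleted file's API:

// Sketch of the escalation-ladder pattern (assumed shapes, not recovered source).
// Each retry appends the earlier attempts' violations so the next rung can fix
// them explicitly instead of rediscovering them.
type Rung = { provider: string; model: string };
type Attempt = { rung: Rung; violations: string[] };

async function escalate(
  task: string,
  ladder: Rung[],
  run: (rung: Rung, prompt: string) => Promise<string>,  // hypothetical chat call
  validate: (code: string) => string[],                  // returns rubric violations
): Promise<string | null> {
  const attempts: Attempt[] = [];
  for (const rung of ladder) {
    // Fold every prior failure into the prompt as an explicit fix list.
    const learning = attempts
      .map((a, i) =>
        `Attempt ${i + 1} (${a.rung.provider}/${a.rung.model}) violated:\n` +
        a.violations.map(v => `  - ${v}`).join("\n"))
      .join("\n");
    const prompt = learning ? `${task}\n\nPRIOR FAILURES, fix each explicitly:\n${learning}` : task;
    const code = await run(rung, prompt);
    const violations = validate(code);
    if (violations.length === 0) return code;  // rubric clean: accept, stop escalating
    attempts.push({ rung, violations });
  }
  return null;  // every rung failed the rubric
}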
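The compounding check in nine_consecutive_audits.ts likewise distills to three predicates over per-run stats: stable verdict, sublinear signature growth, and low max confidence. A minimal sketch with assumed field names (RunStats is illustrative; the 0.6 confidence threshold mirrors the deleted runner's check):

// The KB "compounds" favorably when repeated audits of one PR add few new
// signatures, never flip the verdict, and never inflate confidence.
type RunStats = { verdict: string; sigCount: number; maxConf: number };

function compoundingHolds(baselineSigs: number, runs: RunStats[]): boolean {
  if (runs.length === 0) return false;
  const last = runs[runs.length - 1];
  const verdictStable = new Set(runs.map(r => r.verdict)).size === 1;  // no oscillation
  const sublinear = last.sigCount - baselineSigs < runs.length;        // < 1 new signature per run
  const confidenceLow = last.maxConf < 0.6;                            // same-PR noise stays cheap
  return verdictStable && sublinear && confidenceLow;
}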