remove 5 orphaned dev experiments from tests/real-world/

Per J: "all of our test code ended up in the main." These are 5 one-time
dev experiments that were never wired into any automation and have zero
live consumers in the production code path. Deleting them.

Removed (1418 LOC total):
- enrich_prd_pipeline.ts      (528 LOC) — Phase 21 architecture stress test
- nine_consecutive_audits.ts  (185 LOC) — empirical study of audit compounding
- hard_task_escalation.ts     (267 LOC) — escalation-ladder test (refs retired
                                            cloud models gpt-oss:20b/120b)
- autonomous_loop.ts          (214 LOC) — wrapper experiment around scrum_master
- consensus_reducer_design.ts (224 LOC) — N=3 design consultation; output JSON
                                            referenced from pathway_memory.rs
                                            comment but the script itself has
                                            no consumer

Verified: 0 references in package.json / justfile / Makefile / any
production .ts/.rs/.sh file. The single mention from pathway_memory.rs
is a //! doc comment referencing the JSON output (data/_kb/
consensus_reducer_design_*.json), not the script. Build clean post-delete.
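
To re-verify, any equivalent sweep works, e.g.:

  rg -l 'enrich_prd_pipeline|nine_consecutive_audits|hard_task_escalation|autonomous_loop|consensus_reducer_design'

which should surface only that pathway_memory.rs doc comment.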

KEPT:
- scrum_master_pipeline.ts — referenced from observer.ts, vectord, scripts
- scrum_applier.ts          — referenced from auditor schemas

If you need any of these back, they're in git history. cherry-pick or
git show HEAD~1 -- tests/real-world/<file>.ts will recover the source.
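
For example, to restore one of them into the working tree (assumes this
commit is HEAD, same as the git show above):

  git restore --source=HEAD~1 -- tests/real-world/autonomous_loop.ts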

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
root 2026-05-03 02:05:24 -05:00
parent bb5a3b3f5e
commit 6aafd41785
5 changed files with 0 additions and 1419 deletions

tests/real-world/autonomous_loop.ts
@@ -1,214 +0,0 @@
#!/usr/bin/env bun
// Autonomous scrum loop — wraps scrum_master_pipeline.ts + scrum_applier.ts
// in a goal-driven retry loop. Observer is POSTed an iteration summary at
// every boundary so it can build meta-commentary outside the loop's epistemic
// scope.
//
// Usage:
// LOOP_TARGETS="crates/a/src/x.rs,crates/b/src/y.rs" \
// LOOP_MAX_ITERS=5 \
// LOOP_PUSH=1 \
// bun run tests/real-world/autonomous_loop.ts
//
// Stop conditions: max_iters reached OR 2 consecutive iters with 0 commits.
import { spawn } from "node:child_process";
import { appendFile, readFile } from "node:fs/promises";
import { existsSync } from "node:fs";
const REPO = "/home/profit/lakehouse";
const OBSERVER = process.env.LOOP_OBSERVER ?? "http://localhost:3800";
const BRANCH = process.env.LOOP_BRANCH ?? "scrum/auto-apply-19814";
const MAX_ITERS = Number(process.env.LOOP_MAX_ITERS ?? 3);
const PUSH = process.env.LOOP_PUSH === "1";
const MIN_CONF = process.env.LOOP_MIN_CONF ?? "85";
// Optional override — when unset, let scrum_applier.ts use ITS default
// (currently x-ai/grok-4.1-fast on openrouter). The prior hardcoded
// qwen3-coder:480b default was clobbering the applier's own default
// and forcing every iter to hit the throttled ollama_cloud account.
const APPLIER_MODEL = process.env.LOOP_APPLIER_MODEL;
const APPLIER_PROVIDER = process.env.LOOP_APPLIER_PROVIDER;
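// Example override (values illustrative; normally leave both unset):
//   LOOP_APPLIER_MODEL="qwen3-coder:480b" LOOP_APPLIER_PROVIDER="ollama_cloud" \
//   bun run tests/real-world/autonomous_loop.ts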
const TARGETS = (process.env.LOOP_TARGETS ?? "crates/queryd/src/service.rs,crates/gateway/src/main.rs,crates/gateway/src/v1/mod.rs")
.split(",").map(s => s.trim()).filter(Boolean);
const FORENSIC = process.env.LH_SCRUM_FORENSIC ?? `${REPO}/docs/SCRUM_FORENSIC_PROMPT.md`;
const PROPOSAL = process.env.LH_SCRUM_PROPOSAL ?? `${REPO}/docs/SCRUM_FIX_WAVE.md`;
const LOOP_ID = `loop_${Date.now().toString(36)}`;
const JOURNAL = `${REPO}/data/_kb/autonomous_loops.jsonl`;
interface IterResult {
iter: number;
scrum_reviews_added: number;
applier_outcomes: Record<string, number>;
commits_landed: number;
commit_shas: string[];
build_status: "green" | "red" | "unknown";
duration_ms: number;
}
function log(msg: string) {
const ts = new Date().toISOString().slice(11, 19);
console.log(`[loop ${LOOP_ID} ${ts}] ${msg}`);
}
function runCmd(cmd: string, args: string[], env: Record<string, string> = {}): Promise<{ code: number; stdout: string; stderr: string }> {
return new Promise((resolve) => {
const child = spawn(cmd, args, { cwd: REPO, env: { ...process.env, ...env } });
let stdout = "", stderr = "";
child.stdout.on("data", (d) => { stdout += d; process.stdout.write(d); });
child.stderr.on("data", (d) => { stderr += d; process.stderr.write(d); });
child.on("close", (code) => resolve({ code: code ?? -1, stdout, stderr }));
});
}
async function countLines(path: string): Promise<number> {
if (!existsSync(path)) return 0;
const text = await readFile(path, "utf8");
return text.split("\n").filter(Boolean).length;
}
async function gitHeadSha(): Promise<string> {
const r = await runCmd("git", ["rev-parse", "HEAD"]);
return r.stdout.trim();
}
async function commitsSince(baseSha: string): Promise<string[]> {
const r = await runCmd("git", ["log", "--oneline", `${baseSha}..HEAD`]);
return r.stdout.trim().split("\n").filter(Boolean);
}
async function cargoCheckGreen(): Promise<boolean> {
log("cargo check --workspace …");
const r = await runCmd("cargo", ["check", "--workspace", "--quiet"]);
return r.code === 0;
}
async function postObserver(payload: object) {
try {
const r = await fetch(`${OBSERVER}/event`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify(payload),
signal: AbortSignal.timeout(5000),
});
if (!r.ok) log(`observer POST returned ${r.status}`);
} catch (e: any) {
log(`observer POST failed: ${e.message}`);
}
}
async function runIter(iter: number, baseSha: string): Promise<IterResult> {
const t0 = Date.now();
log(`══ iter ${iter} start (base ${baseSha.slice(0, 8)}) targets=${TARGETS.length}`);
const reviewsBefore = await countLines(`${REPO}/data/_kb/scrum_reviews.jsonl`);
const applyBefore = await countLines(`${REPO}/data/_kb/auto_apply.jsonl`);
log(`scrum_master_pipeline.ts → ${TARGETS.length} files`);
await runCmd("bun", ["run", "tests/real-world/scrum_master_pipeline.ts"], {
LH_SCRUM_FILES: TARGETS.join(","),
LH_SCRUM_FORENSIC: FORENSIC,
LH_SCRUM_PROPOSAL: PROPOSAL,
});
log(`scrum_applier.ts COMMIT=1 MIN_CONF=${MIN_CONF} files=${TARGETS.length}`);
// Only forward model/provider when explicitly overridden — otherwise
// let scrum_applier.ts use its own defaults (Grok 4.1 fast on openrouter).
const applierEnv: Record<string, string> = {
LH_APPLIER_COMMIT: "1",
LH_APPLIER_MIN_CONF: MIN_CONF,
LH_APPLIER_MAX_FILES: String(TARGETS.length),
LH_APPLIER_BRANCH: BRANCH,
// Constrain applier to THIS iter's targets so it patches what we
// just reviewed instead of the highest-confidence file from history.
LH_APPLIER_FILES: TARGETS.join(","),
};
if (APPLIER_MODEL) applierEnv.LH_APPLIER_MODEL = APPLIER_MODEL;
if (APPLIER_PROVIDER) applierEnv.LH_APPLIER_PROVIDER = APPLIER_PROVIDER;
await runCmd("bun", ["run", "tests/real-world/scrum_applier.ts"], applierEnv);
const reviewsAfter = await countLines(`${REPO}/data/_kb/scrum_reviews.jsonl`);
const applyAfterText = existsSync(`${REPO}/data/_kb/auto_apply.jsonl`)
? await readFile(`${REPO}/data/_kb/auto_apply.jsonl`, "utf8")
: "";
const applyRows = applyAfterText.split("\n").filter(Boolean).slice(applyBefore);
const outcomes: Record<string, number> = {};
for (const line of applyRows) {
try {
const o = JSON.parse(line);
outcomes[o.action ?? "?"] = (outcomes[o.action ?? "?"] ?? 0) + 1;
} catch { /* skip malformed */ }
}
const commitShas = await commitsSince(baseSha);
const buildStatus = commitShas.length > 0 ? (await cargoCheckGreen() ? "green" : "red") : "unknown";
const result: IterResult = {
iter,
scrum_reviews_added: reviewsAfter - reviewsBefore,
applier_outcomes: outcomes,
commits_landed: commitShas.length,
commit_shas: commitShas.map(s => s.split(" ")[0]),
build_status: buildStatus,
duration_ms: Date.now() - t0,
};
log(`iter ${iter} done — reviews+${result.scrum_reviews_added} commits=${result.commits_landed} build=${buildStatus} (${(result.duration_ms / 1000).toFixed(1)}s)`);
await postObserver({
source: "autonomous_loop",
loop_id: LOOP_ID,
event_kind: "iteration_complete",
iter,
targets: TARGETS,
success: buildStatus !== "red",
scrum_reviews_added: result.scrum_reviews_added,
applier_outcomes: result.applier_outcomes,
commits_landed: result.commits_landed,
commit_shas: result.commit_shas,
build_status: buildStatus,
duration_ms: result.duration_ms,
ts: new Date().toISOString(),
});
await appendFile(JOURNAL, JSON.stringify({ loop_id: LOOP_ID, ...result, ts: new Date().toISOString() }) + "\n");
return result;
}
async function main() {
log(`autonomous loop starting · branch=${BRANCH} max_iters=${MAX_ITERS} push=${PUSH}`);
log(`targets: ${TARGETS.join(", ")}`);
const branchR = await runCmd("git", ["branch", "--show-current"]);
if (branchR.stdout.trim() !== BRANCH) {
log(`ERROR: on branch ${branchR.stdout.trim()}, expected ${BRANCH}. Refusing to run.`);
process.exit(1);
}
let consecutiveZero = 0;
for (let iter = 1; iter <= MAX_ITERS; iter++) {
const baseSha = await gitHeadSha();
const result = await runIter(iter, baseSha);
if (PUSH && result.commits_landed > 0) {
log(`git push origin ${BRANCH}`);
const pushR = await runCmd("git", ["push", "origin", BRANCH]);
if (pushR.code !== 0) log(`push failed (continuing): ${pushR.stderr.slice(0, 200)}`);
}
consecutiveZero = result.commits_landed === 0 ? consecutiveZero + 1 : 0;
if (consecutiveZero >= 2) {
log(`STOP: 2 consecutive iters with 0 commits. Loop converged or stuck.`);
break;
}
}
log(`loop ${LOOP_ID} complete. Journal: ${JOURNAL}`);
}
main().catch((e) => {
log(`FATAL: ${e.message}`);
process.exit(1);
});

tests/real-world/consensus_reducer_design.ts
@@ -1,224 +0,0 @@
// consensus_reducer_design.ts — N=3 design consultation.
//
// J's ask: enhance the tree-split reducer to preserve FULL backtrack-able
// context (endpoints tried, attempt count per model in the ladder, KB
// sources retrieved, context7 bridge hits, MCP observer signals, audit
// verdicts) instead of collapsing to a summary. Then index the full
// context through our existing vectord matrix indexing (HNSW + Lance +
// playbook_memory) so successful pathways become hot-swappable — the
// system asks "what did we try, what worked, in what order" for a
// similar task class and gets a ranked playbook back.
//
// Before building, consult three diverse models and print their proposals
// side-by-side so we can pick the convergent design.
const GATEWAY = "http://localhost:3100";
const DESIGN_BRIEF = `
# Context: the Lakehouse signal→commit loop
We run 6x scrum-master iterations that audit Rust crates for PRD
alignment, produce findings + confidence, and feed an auto-applier that
lands small mechanical commits through a cargo-green-and-warning-stable
gate. Key components:
- \`tests/real-world/scrum_master_pipeline.ts\` — orchestrator. 9-rung
model LADDER (kimi-k2:1t → qwen3-coder:480b → deepseek → mistral-large →
gpt-oss:120b → qwen3.5:397b → openrouter free rescues → local
qwen3.5:latest). Each target file retrieves 5 PRD chunks + 5
proposal-doc chunks via vectord RAG, tree-splits large files into 3.5K
shards, asks each rung in order, accepts first response passing
structural checks.
- \`mcp-server/observer.ts\` — receives scrum \`/event\` emissions
(file, verdict, critical_failures_count, gradient_tier, attempts,
reviewer_model, tree_split_fired). Escalates failure clusters to LLM
Team by POSTing to /v1/chat with qwen3-coder:480b.
- \`context7-bridge\` — external library docs lookup.
- \`auditor/audit.ts\` — independent N=3 consensus re-check of scrum
findings; writes to data/_kb/audit_facts.jsonl via LLM Team
\`/api/run?mode=extract\`.
- \`crates/vectord/src/playbook_memory.rs\` — indexing for proven
playbooks: PlaybookEntry, DocRef, FailureRecord, BoostEntry,
PatternReport. Uses HNSW index + Lance columnar backend + promotion
pipeline. Already battle-tested for workers/staffing queries.
- Tree-split REDUCER: after shards return map-style summaries, they are
concatenated with internal §N§ markers and fed to a reviewer model to
produce ONE file-level review. Currently the reducer sees summaries,
not the full context behind each shard's conclusion.
# The problem
The reducer currently TRUNCATES to a short summary. When the auditor or
a future iteration wants to backtrack WHY the reducer concluded what it
did (which attempt succeeded, which failed, what KB chunks were
retrieved, what observer signal classified the file as LOOPING vs
CONVERGING), that context is lost. So:
1. Auditor can't verify citation provenance beyond the summary line.
2. Applier can't tell a "tried X, failed, qwen fixed it" playbook from a
"tried X and it was easy" playbook they look identical downstream.
3. The matrix indexing is only used for RAG chunks during the scrum
pass, NOT for storing the full end-to-end pathway of a successful
review.
# The design question
Propose an enhanced reducer + indexing design that:
(a) Preserves the FULL backtrack context per reviewed file:
- every ladder attempt (model, ms, accepted_y/n, reject_reason)
- every retrieved KB chunk (source doc, chunk id, cosine score, rank)
- every observer signal (class, priors, prior-iter outcomes)
- every context7 bridge hit (library, version pulled)
- every sub-pipeline call (LLM Team extract results, audit consensus)
(b) Stores this pathway into vectord's matrix indexing alongside the
review verdict so it becomes retrievable by similarity. When a new
file's fingerprint (task_class + file-path prefix + signal class)
matches a past successful pathway, the system can hot-swap by
replaying or short-circuiting to the model/KB combo that worked.
(c) Surfaces the matrix-index hit rate as a feedback signal on the
scrum's UI "this file was solved 3 times before by the same ladder
rung; consider short-circuiting to rung 5."
(d) Is compatible with the existing playbook_memory.rs primitives
(PlaybookEntry, DocRef, FailureRecord, BoostEntry): extend, don't
replace. The indexing layer is in production for workers/staffing;
we want the reducer pathway to piggyback on proven infrastructure.
# Constraints
- NO new crate. Extend vectord + scrum_master_pipeline.
- Full context can be LARGE: a reviewed file might have 5 retrievals,
4 ladder attempts, 8 observer priors. Design the embedding /
fingerprint so similar-but-not-identical pathways cluster.
- The reducer summary is still needed for the reviewer LLM input;
don't remove it, ADD the full-context sidecar.
- Audit trail: every pathway must be replayable deterministically from
what's stored (i.e., enough context to re-run without the original
prompt cache).
# Required output (STRICT JSON, no prose, no markdown fences):
{
"approach": "one-paragraph summary of your proposed design",
"data_model": {
"new_fields_on_playbook_entry": [...],
"new_types": [ {"name": "...", "purpose": "...", "fields": [...]} ]
},
"storage_strategy": {
"what_to_vectorize": "the text that becomes the embedding",
"fingerprint_key": "the deterministic key for similarity retrieval",
"backend": "HNSW, Lance, playbook_memory — pick"
},
"reducer_changes": {
"inputs_added": [...],
"outputs_added": [...],
"compatibility_notes": "how existing callers stay working"
},
"hot_swap_logic": "concrete rule for when to skip the ladder and replay a past pathway",
"ui_signal": "what to surface so J sees whether matrix indexing is earning its keep",
"risks": [...],
"why_this_beats_summarization": "one-paragraph argument"
}
`.trim();
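// A TS mirror of the "Required output" schema in the brief above, kept
// here purely for readability. Documentary only: the script parses
// responses dynamically and never validates against this type, and the
// array item types are assumptions (the brief leaves them open).
interface DesignProposal {
  approach: string;
  data_model: {
    new_fields_on_playbook_entry: string[];
    new_types: Array<{ name: string; purpose: string; fields: string[] }>;
  };
  storage_strategy: {
    what_to_vectorize: string;
    fingerprint_key: string;
    backend: string;
  };
  reducer_changes: {
    inputs_added: string[];
    outputs_added: string[];
    compatibility_notes: string;
  };
  hot_swap_logic: string;
  ui_signal: string;
  risks: string[];
  why_this_beats_summarization: string;
}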
interface Probe {
name: string;
provider: "ollama" | "ollama_cloud" | "openrouter";
model: string;
}
// Round-3 probe set — 4 probes covering the remaining ladder rungs +
// architecture/provider diversity. J wanted all 4 of the untouched
// options so the aggregated 10-model signal is saturated across the
// usable ladder.
const PROBES: Probe[] = [
{ name: "qwen35-397b", provider: "ollama_cloud", model: "qwen3.5:397b" },
{ name: "openrouter-gpt-oss", provider: "openrouter", model: "openai/gpt-oss-120b:free" },
{ name: "openrouter-gemma3", provider: "openrouter", model: "google/gemma-3-27b-it:free" },
{ name: "qwen3-coder-480b-2", provider: "ollama_cloud", model: "qwen3-coder:480b" }, // second probe of the coding specialist — stability check
];
async function ask(p: Probe): Promise<{ name: string; raw: string; ms: number; error?: string }> {
const started = Date.now();
try {
const r = await fetch(`${GATEWAY}/v1/chat`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({
provider: p.provider,
model: p.model,
messages: [
{ role: "system", content: "You are a senior architect. Output STRICT JSON only." },
{ role: "user", content: DESIGN_BRIEF },
],
max_tokens: 3000,
temperature: 0,
}),
});
const ms = Date.now() - started;
if (!r.ok) return { name: p.name, raw: "", ms, error: `HTTP ${r.status}: ${(await r.text()).slice(0, 200)}` };
const j = await r.json();
const content = j.content ?? j.message?.content ?? j.choices?.[0]?.message?.content ?? "";
return { name: p.name, raw: String(content), ms };
} catch (e: any) {
return { name: p.name, raw: "", ms: Date.now() - started, error: String(e).slice(0, 200) };
}
}
function extractJson(raw: string): any | null {
let s = raw.trim();
const fence = s.match(/^```(?:json)?\s*/);
if (fence) s = s.slice(fence[0].length);
if (s.endsWith("```")) s = s.slice(0, -3).trim();
const first = s.indexOf("{");
const last = s.lastIndexOf("}");
if (first < 0 || last <= first) return null;
try {
return JSON.parse(s.slice(first, last + 1));
} catch {
return null;
}
}
function summarize(obj: any, max = 240): string {
if (!obj) return "(no JSON parsed)";
if (typeof obj === "string") return obj.length > max ? obj.slice(0, max) + "…" : obj;
if (Array.isArray(obj)) return obj.map((x) => summarize(x, max)).join("; ");
return Object.entries(obj)
.map(([k, v]) => `${k}=${summarize(v, max)}`)
.join(" | ");
}
async function main() {
console.log(`\n── N=${PROBES.length} design consensus ──`);
console.log(`models: ${PROBES.map((p) => p.model).join(", ")}\n`);
const results = await Promise.all(PROBES.map(ask));
for (const r of results) {
console.log(`\n── ${r.name} (${r.ms}ms) ──`);
if (r.error) { console.log(` ERROR: ${r.error}`); continue; }
const j = extractJson(r.raw);
if (!j) {
console.log(` raw (no JSON): ${r.raw.slice(0, 600)}`);
continue;
}
console.log(` approach: ${summarize(j.approach, 400)}`);
console.log(` fingerprint: ${summarize(j.storage_strategy?.fingerprint_key, 200)}`);
console.log(` vectorize: ${summarize(j.storage_strategy?.what_to_vectorize, 200)}`);
console.log(` backend: ${summarize(j.storage_strategy?.backend, 200)}`);
console.log(` hot_swap: ${summarize(j.hot_swap_logic, 300)}`);
console.log(` new_types: ${summarize(j.data_model?.new_types, 400)}`);
console.log(` risks: ${summarize(j.risks, 300)}`);
console.log(` why>summary: ${summarize(j.why_this_beats_summarization, 300)}`);
}
// Write full JSON to disk so we can inspect later.
const outPath = `/home/profit/lakehouse/data/_kb/consensus_reducer_design_${Date.now().toString(36)}.json`;
await Bun.write(outPath, JSON.stringify(results, null, 2));
console.log(`\nfull responses → ${outPath}`);
}
await main();

tests/real-world/enrich_prd_pipeline.ts
@@ -1,528 +0,0 @@
// Real-world architecture stress test — 6 iterations of the full pipeline
// against the PRD as a corpus. Goal: prove at scale what Phase 21
// promised (context continuation + tree-split), plus Phase 19
// compounding across iterations.
//
// Run: bun run tests/real-world/enrich_prd_pipeline.ts
//
// No mocks. No skipped layers. On any error, the test triggers
// cloud-rescue rather than fail — it's the architecture's job to
// recover. The test FAILS only if we can't complete 6 iterations.
import { readFile, writeFile, mkdir } from "node:fs/promises";
import { createHash } from "node:crypto";
const PRD_PATH = "/home/profit/lakehouse/docs/PRD.md";
const CHUNK_SIZE = 800; // chars per chunk — ~200 tokens
const CHUNK_OVERLAP = 120;
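// Each step advances CHUNK_SIZE - CHUNK_OVERLAP = 680 chars, so adjacent
// chunks share a 120-char overlap and a sentence cut at one boundary
// still appears whole in a neighbor.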
const TOP_K_RETRIEVE = 12; // chunks per iteration — pulled up to force overflow
const CONTEXT_BUDGET_CHARS = 4000; // tight budget — forces tree-split on every iter
const INJECT_FAIL_ON_ITER = 3; // force the TASK-retry loop on iter 3
// Continuation controls (per-cloud-call) — used for output-overflow.
// Separate from the task-retry loop (per-task) — that handles errors
// across attempts.
const PRIMARY_MAX_TOKENS = 150; // tight — forces truncation
const CONTINUATION_MAX_TOKENS = 300; // each continuation doubles headroom
const MAX_CONTINUATIONS = 6; // max stitch pieces per cloud call
// Task-level retry loop (J's clarification, 2026-04-22):
// When a TASK errors, retry the whole task up to 6 times. Each
// retry gets prior attempts' failures injected as learning context,
// so attempt N+1 is informed by what N failed at. The loop caps at
// 6 to avoid infinite spinning on genuinely unsolvable tasks.
const MAX_TASK_RETRIES = 6;
// To FORCE the retry loop on iter INJECT_FAIL_ON_ITER, cycle through
// 5 deliberately-invalid models + 1 valid one. Attempts 1-5 will
// 502/404 from Ollama Cloud; attempt 6 finally succeeds. Proves the
// loop fires all 6 with compounding failure context.
const FORCE_RETRY_MODEL_SEQUENCE = [
"deliberately-invalid-model-attempt-1",
"deliberately-invalid-model-attempt-2",
"deliberately-invalid-model-attempt-3",
"deliberately-invalid-model-attempt-4",
"deliberately-invalid-model-attempt-5",
"gpt-oss:20b", // 6th attempt succeeds
];
const GATEWAY = "http://localhost:3100";
const SIDECAR = "http://localhost:3200";
const CLOUD_MODEL = "gpt-oss:120b";
const RESCUE_MODEL = "gpt-oss:20b"; // fallback local cloud model via sidecar
const RUN_NONCE = Date.now().toString(36);
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/${RUN_NONCE}`;
// The 6 progressively-compounding questions. #6 explicitly requires
// synthesis across prior 5 answers.
const QUESTIONS: string[] = [
"Summarize the Lakehouse project's one-paragraph thesis: what problem does it solve, what's the unique approach?",
"How does Phase 19 playbook memory turn successful fills into a signal that boosts future rankings?",
"Explain the role of Phase 24 observer in the learning loop — what does it see, and what does it feed into?",
"What's the VRAM-aware profile swap mechanism in Phase 17, and why does it matter for multi-model serving?",
"How do Phase 25 validity windows and Phase 27 playbook versioning interact when a schema drifts?",
"Synthesize the prior 5 answers: how do the pieces (playbook memory, observer, profile swap, validity windows, versioning) compose into a system that measurably gets smarter over time? Cite specific prior answers.",
];
type Chunk = { id: string; text: string; embedding: number[]; offset: number };
interface IterationResult {
iteration: number;
question: string;
retrieval_top_k: number;
context_chars_before_budget: number;
tree_split_fired: boolean;
cloud_calls_total: number;
continuation_retries: number;
rescue_triggered: boolean;
// Task-level retry telemetry
task_attempts_made: number; // how many attempts fired (1 = first succeeded)
task_retry_history: Array<{ n: number; model: string; error: string }>;
playbook_id: string | null;
tokens_prompt: number;
tokens_completion: number;
citations_from_prior_iterations: string[];
duration_ms: number;
answer_preview: string;
errors_recovered: string[];
}
function log(msg: string) { console.log(`[enrich] ${msg}`); }
function sleep(ms: number) { return new Promise(r => setTimeout(r, ms)); }
function cosine(a: number[], b: number[]): number {
let dot = 0, na = 0, nb = 0;
for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
if (na === 0 || nb === 0) return 0;
return dot / (Math.sqrt(na) * Math.sqrt(nb));
}
function hash(s: string): string { return createHash("sha256").update(s).digest("hex").slice(0, 10); }
async function embedBatch(texts: string[]): Promise<number[][]> {
// Sidecar /embed accepts a list; any non-OK response throws.
const r = await fetch(`${SIDECAR}/embed`, {
method: "POST", headers: { "content-type": "application/json" },
body: JSON.stringify({ texts }),
signal: AbortSignal.timeout(120000),
});
if (!r.ok) throw new Error(`embed batch ${r.status}: ${await r.text()}`);
const j: any = await r.json();
return j.embeddings;
}
function chunkText(text: string): Array<{ text: string; offset: number }> {
const out: Array<{ text: string; offset: number }> = [];
let i = 0;
while (i < text.length) {
const end = Math.min(i + CHUNK_SIZE, text.length);
const slice = text.slice(i, end).trim();
if (slice.length > 50) out.push({ text: slice, offset: i });
if (end >= text.length) break;
i = end - CHUNK_OVERLAP;
}
return out;
}
async function chat(opts: {
provider: "ollama" | "ollama_cloud",
model: string,
messages: Array<{ role: string; content: string }>,
max_tokens: number,
think: boolean,
}): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; finish_reason: string }> {
const r = await fetch(`${GATEWAY}/v1/chat`, {
method: "POST", headers: { "content-type": "application/json" },
body: JSON.stringify({ ...opts }),
signal: AbortSignal.timeout(180000),
});
if (!r.ok) throw new Error(`/v1/chat ${r.status}: ${await r.text()}`);
const j: any = await r.json();
return {
content: j.choices?.[0]?.message?.content ?? "",
prompt_tokens: j.usage?.prompt_tokens ?? 0,
completion_tokens: j.usage?.completion_tokens ?? 0,
finish_reason: j.choices?.[0]?.finish_reason ?? "?",
};
}
// ─── Tree-split over oversized chunk set ──────────────────────────
async function treeSplitSummarize(
chunks: Chunk[],
question: string,
): Promise<{ scratchpad: string; cloud_calls: number }> {
// Shard into groups fitting within half the budget each.
const perShard = Math.max(1, Math.floor((CONTEXT_BUDGET_CHARS / 2) / CHUNK_SIZE));
const shards: Chunk[][] = [];
for (let i = 0; i < chunks.length; i += perShard) {
shards.push(chunks.slice(i, i + perShard));
}
log(` tree-split: ${chunks.length} chunks → ${shards.length} shards of up to ${perShard}`);
let scratchpad = "";
let cloud_calls = 0;
for (let si = 0; si < shards.length; si++) {
const shard = shards[si];
const shardText = shard.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n");
const userMsg = `Question: ${question}\n\nShard ${si + 1}/${shards.length} of source material:\n\n${shardText}\n\nScratchpad so far:\n${scratchpad || "(empty)"}\n\nUpdate the scratchpad: extract only facts from THIS shard that help answer the question. Be terse. No prose.`;
const r = await chat({
provider: "ollama_cloud",
model: CLOUD_MODEL,
messages: [
{ role: "system", content: "You maintain a concise factual scratchpad across multiple shards of source text. No prose outside the scratchpad. Each shard, append ≤80 words of relevant facts." },
{ role: "user", content: userMsg },
],
max_tokens: 500,
think: false,
});
cloud_calls += 1;
scratchpad += `\n--- shard ${si + 1} notes ---\n${r.content.trim()}`;
if (scratchpad.length > CONTEXT_BUDGET_CHARS) {
// drop the oldest notes; keep only the newest CONTEXT_BUDGET_CHARS chars
scratchpad = scratchpad.slice(-CONTEXT_BUDGET_CHARS);
log(` tree-split: scratchpad truncated to ${scratchpad.length} chars`);
}
}
return { scratchpad, cloud_calls };
}
// ─── Continuable generate — up to max_continuations stitches ──────
//
// Two failure modes handled:
// A) Empty response — typically thinking model burned the budget
// on hidden reasoning. Retry with 2× max_tokens.
// B) Truncated response (finish_reason=length) — answer got cut off
// mid-sentence. Pass the partial back as scratchpad and ask the
// model to continue from where it stopped.
//
// Stitching: keep appending content across retries; prompt_tokens and
// completion_tokens accumulate; finish_reason reflects the LAST call.
// Loop exits on the first call that finishes cleanly (stop) with
// non-empty content, OR when retries hit the cap.
async function generateContinuable(
opts: Parameters<typeof chat>[0] & { max_continuations?: number },
): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string }> {
const maxCont = opts.max_continuations ?? 1;
let total = await chat(opts);
let retries = 0;
while (retries < maxCont && (total.content.length === 0 || total.finish_reason === "length")) {
retries += 1;
const mode = total.content.length === 0 ? "empty" : "truncated";
log(` continuation retry ${retries}/${maxCont} (${mode}: finish=${total.finish_reason}, content=${total.content.length} chars)`);
// Continuation prompt — branch on failure mode:
// empty → retry with 2× tokens, same prompt (thinking budget)
// length → pass the partial as assistant turn, ask to continue
const continuationMessages = total.content.length === 0
? opts.messages
: [
...opts.messages,
{ role: "assistant", content: total.content },
{ role: "user", content: "Continue from exactly where you stopped. Do not repeat. Finish the answer." },
];
const continued = await chat({
...opts,
max_tokens: CONTINUATION_MAX_TOKENS,
messages: continuationMessages,
});
total = {
content: total.content + continued.content,
prompt_tokens: total.prompt_tokens + continued.prompt_tokens,
completion_tokens: total.completion_tokens + continued.completion_tokens,
finish_reason: continued.finish_reason,
};
}
return { ...total, continuation_retries: retries };
}
// ─── Single iteration: retrieve → budget-check → chat → seed ─────
async function runIteration(
iteration: number,
question: string,
allChunks: Chunk[],
priorPlaybookIds: string[],
priorAnswers: string[],
): Promise<IterationResult> {
const started = Date.now();
const errorsRecovered: string[] = [];
log(`iter ${iteration}: "${question.slice(0, 70)}..."`);
// 1. Embed the question
const qEmb = (await embedBatch([question]))[0];
// 2. Retrieve top-K chunks by cosine
const scored = allChunks
.map(c => ({ c, score: cosine(qEmb, c.embedding) }))
.sort((a, b) => b.score - a.score)
.slice(0, TOP_K_RETRIEVE);
const chunks = scored.map(x => x.c);
log(` retrieved top ${chunks.length} chunks (score range ${scored[0].score.toFixed(3)} .. ${scored[scored.length - 1].score.toFixed(3)})`);
// 3. Context budget check — tree-split if over
const contextChars = chunks.map(c => c.text).join("\n\n").length;
let contextForPrompt: string;
let treeSplit = false;
let cloudCallsTotal = 0;
if (contextChars > CONTEXT_BUDGET_CHARS) {
treeSplit = true;
log(` context ${contextChars} chars > budget ${CONTEXT_BUDGET_CHARS} → tree-split`);
const { scratchpad, cloud_calls } = await treeSplitSummarize(chunks, question);
contextForPrompt = `Distilled scratchpad from ${chunks.length} source chunks (too large to fit directly):\n${scratchpad}`;
cloudCallsTotal += cloud_calls;
} else {
contextForPrompt = chunks.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n");
}
// 4. Seed prompt with prior iteration answers (real compounding).
// Not just IDs — the model needs the CONTENT to synthesize.
let citationBlock = "";
let citationsReceived: string[] = [];
if (priorPlaybookIds.length > 0 && priorAnswers.length > 0) {
const lines = priorAnswers.map((ans, i) => {
const pid = priorPlaybookIds[i]?.slice(0, 12) ?? "unknown";
// Trim each prior answer to ~400 chars so we don't blow budget
return `[pb:${pid}] iter ${i + 1} answer:\n${ans.slice(0, 400)}\n`;
});
citationBlock = `\n\n═══ PRIOR ITERATIONS (compounding context) ═══\n${lines.join("\n")}═══ end prior iterations ═══\n\nYour answer MUST cite specific prior iterations using [pb:ID] notation when drawing on them. Synthesis questions require explicit cross-iteration reasoning.`;
citationsReceived = priorPlaybookIds.slice();
}
// 5. TASK-LEVEL RETRY LOOP — per J's clarification 2026-04-22.
// Try the task up to MAX_TASK_RETRIES times. Each retry:
// a) Picks a model (normally CLOUD_MODEL; on INJECT_FAIL_ON_ITER,
// cycles through 5 invalid models + 1 valid to force full loop)
// b) Injects prior attempt errors as learning context
// c) If the attempt succeeds (non-empty, >100 chars), loop exits
// d) Otherwise, records failure and tries again with the learning
//
// Cap at 6 so we don't spin forever on unsolvable tasks.
let result: { content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string } | null = null;
let rescueTriggered = false;
const taskAttemptHistory: Array<{ n: number; model: string; error: string }> = [];
const forceRetries = iteration === INJECT_FAIL_ON_ITER;
if (forceRetries) log(` FORCING TASK-RETRY LOOP — iter ${iteration} will cycle through 5 invalid models + 1 valid`);
for (let attempt = 1; attempt <= MAX_TASK_RETRIES; attempt++) {
const modelForAttempt = forceRetries
? FORCE_RETRY_MODEL_SEQUENCE[attempt - 1]
: CLOUD_MODEL;
// Compose a prior-attempts learning block for attempts 2+
const learningBlock = taskAttemptHistory.length > 0
? `\n\n═══ PRIOR ATTEMPTS THIS TASK (do NOT repeat these failures; adjust approach) ═══\n${taskAttemptHistory.map(a => `Attempt ${a.n} (model ${a.model}) failed: ${a.error.slice(0, 160)}`).join("\n")}\n═══ end prior attempts ═══\n`
: "";
log(` task attempt ${attempt}/${MAX_TASK_RETRIES}: model=${modelForAttempt}${learningBlock ? " [with prior-failure context]" : ""}`);
try {
const r = await generateContinuable({
provider: "ollama_cloud",
model: modelForAttempt,
messages: [
{ role: "system", content: "You answer questions about the Lakehouse PRD using only the provided source material and prior iteration answers. Be specific. Cite chunk offsets OR [pb:ID] markers. Write a detailed 250-word answer." },
{ role: "user", content: `Question: ${question}\n\nSource material:\n${contextForPrompt}${citationBlock}${learningBlock}` },
],
max_tokens: PRIMARY_MAX_TOKENS,
think: false,
max_continuations: MAX_CONTINUATIONS,
});
cloudCallsTotal += 1 + r.continuation_retries;
if (r.content && r.content.length > 100) {
// Acceptable answer — exit loop
result = r;
if (attempt > 1) {
log(` task attempt ${attempt} SUCCEEDED (${r.content.length} chars) after ${attempt - 1} prior failures`);
rescueTriggered = true;
}
break;
}
// Thin response — count as failure with learning signal
const err = `thin-answer: ${r.content.length} chars, finish=${r.finish_reason}`;
taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
errorsRecovered.push(`attempt ${attempt}: ${err}`);
} catch (e) {
const err = (e as Error).message;
taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
errorsRecovered.push(`attempt ${attempt}: ${err.slice(0, 120)}`);
cloudCallsTotal += 1;
}
}
// Last-ditch: if all 6 task attempts failed, try the local fallback
// once more so we at least return SOMETHING. This is the "don't get
// caught in a loop, accept best-so-far" rule J stated explicitly.
if (!result) {
errorsRecovered.push(`all ${MAX_TASK_RETRIES} task attempts failed — local fallback`);
rescueTriggered = true;
try {
result = await generateContinuable({
provider: "ollama",
model: "qwen3.5:latest",
messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
max_tokens: 300,
think: false,
max_continuations: 2,
});
cloudCallsTotal += 1 + result.continuation_retries;
} catch (e) {
// Absolute last resort — fabricate a skeleton result
result = {
content: `[task failed after ${MAX_TASK_RETRIES} retries + local fallback: ${(e as Error).message}]`,
prompt_tokens: 0,
completion_tokens: 0,
continuation_retries: 0,
finish_reason: "error",
};
}
}
if (result.content.length === 0) {
errorsRecovered.push("even rescue returned empty — last-ditch local fallback");
rescueTriggered = true;
result = await generateContinuable({
provider: "ollama",
model: "qwen3.5:latest",
messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
max_tokens: 300,
think: false,
});
cloudCallsTotal += 1;
}
// 6. Seed playbook with the answer
let playbook_id: string | null = null;
try {
const ts = new Date().toISOString();
const seedOp = `TEST: enrich_prd_run_${RUN_NONCE} iter${iteration} in Corpus, PRD`;
const r = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
method: "POST", headers: { "content-type": "application/json" },
body: JSON.stringify({
operation: seedOp,
approach: `q="${question.slice(0, 80)}" context_chars=${contextChars} tree_split=${treeSplit}`,
context: result.content.slice(0, 600),
endorsed_names: [`iter${iteration}_${RUN_NONCE}`],
append: true,
}),
signal: AbortSignal.timeout(15000),
});
if (r.ok) {
const j: any = await r.json();
playbook_id = j.outcome?.playbook_id ?? null;
} else {
errorsRecovered.push(`seed ${r.status}: ${(await r.text()).slice(0, 100)}`);
}
} catch (e) {
errorsRecovered.push(`seed exception: ${(e as Error).message}`);
}
return {
iteration,
question,
retrieval_top_k: chunks.length,
context_chars_before_budget: contextChars,
tree_split_fired: treeSplit,
cloud_calls_total: cloudCallsTotal,
continuation_retries: result.continuation_retries,
rescue_triggered: rescueTriggered,
task_attempts_made: taskAttemptHistory.length + 1, // +1 for the successful attempt
task_retry_history: taskAttemptHistory,
playbook_id,
tokens_prompt: result.prompt_tokens,
tokens_completion: result.completion_tokens,
citations_from_prior_iterations: citationsReceived,
duration_ms: Date.now() - started,
answer_preview: result.content.slice(0, 500),
errors_recovered: errorsRecovered,
};
}
async function main() {
await mkdir(OUT_DIR, { recursive: true });
log(`run nonce: ${RUN_NONCE}`);
log(`output dir: ${OUT_DIR}`);
// ─── Phase 1: load, chunk, embed the PRD ───────────────────────
log(`loading PRD from ${PRD_PATH}`);
const prd = await readFile(PRD_PATH, "utf8");
log(`PRD: ${prd.length} chars, ${prd.split("\n").length} lines`);
const raw_chunks = chunkText(prd);
log(`chunked into ${raw_chunks.length} pieces (size ${CHUNK_SIZE}, overlap ${CHUNK_OVERLAP})`);
// Embed in batches of 32 to avoid sidecar overload
const allChunks: Chunk[] = [];
const BATCH = 32;
const t0 = Date.now();
for (let i = 0; i < raw_chunks.length; i += BATCH) {
const batch = raw_chunks.slice(i, i + BATCH);
const embs = await embedBatch(batch.map(b => b.text));
for (let j = 0; j < batch.length; j++) {
allChunks.push({
id: hash(batch[j].text),
text: batch[j].text,
embedding: embs[j].map(x => Number(x)),
offset: batch[j].offset,
});
}
log(` embedded ${allChunks.length}/${raw_chunks.length}`);
}
log(`embedded all ${allChunks.length} chunks in ${((Date.now() - t0) / 1000).toFixed(1)}s`);
// ─── Phase 2: 6 iterations ─────────────────────────────────────
const results: IterationResult[] = [];
const priorIds: string[] = [];
const priorAnswers: string[] = [];
for (let i = 1; i <= QUESTIONS.length; i++) {
const q = QUESTIONS[i - 1];
const r = await runIteration(i, q, allChunks, priorIds, priorAnswers);
results.push(r);
if (r.playbook_id) priorIds.push(r.playbook_id);
priorAnswers.push(r.answer_preview);
log(` → iter ${i}: ${r.errors_recovered.length} errors recovered, ${r.continuation_retries} continuations, tree-split=${r.tree_split_fired}, rescue=${r.rescue_triggered}, ${r.duration_ms}ms`);
await writeFile(`${OUT_DIR}/iter_${i}.json`, JSON.stringify(r, null, 2));
}
// Check whether iter 6 actually cited prior pb:IDs in its answer.
// Playbook IDs look like `pb-seed-<hex>` so the regex needs to allow
// hyphens + letters inside the brackets, not just hex chars.
const iter6 = results[5];
const citationsHonored = iter6 ? (iter6.answer_preview.match(/\[pb:[\w-]+\]/gi)?.length ?? 0) : 0;
// ─── Phase 3: summary ──────────────────────────────────────────
const summary = {
run_nonce: RUN_NONCE,
ran_at: new Date().toISOString(),
prd_chars: prd.length,
prd_chunks: allChunks.length,
iterations: results.length,
total_cloud_calls: results.reduce((s, r) => s + r.cloud_calls_total, 0),
total_continuation_retries: results.reduce((s, r) => s + r.continuation_retries, 0),
total_errors_recovered: results.reduce((s, r) => s + r.errors_recovered.length, 0),
tree_splits_fired: results.filter(r => r.tree_split_fired).length,
rescues_triggered: results.filter(r => r.rescue_triggered).length,
iter6_received_prior_ids: results[5]?.citations_from_prior_iterations.length ?? 0,
iter6_actually_cited_in_answer: citationsHonored,
iter3_task_attempts: results[2]?.task_attempts_made ?? 0,
iter3_task_retries: results[2]?.task_retry_history.length ?? 0,
max_task_attempts_any_iter: Math.max(...results.map(r => r.task_attempts_made)),
total_duration_ms: results.reduce((s, r) => s + r.duration_ms, 0),
overall: results.length === 6 && results.every(r => r.playbook_id !== null) ? "PASS" : "PARTIAL",
};
await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));
log("");
log(`══════ SUMMARY ${summary.overall} ══════`);
log(` 6 iterations, ${summary.total_cloud_calls} cloud calls, ${summary.total_errors_recovered} errors recovered`);
log(` tree-splits: ${summary.tree_splits_fired}/6 continuations: ${summary.total_continuation_retries} rescues: ${summary.rescues_triggered}`);
log(` iter 6 received ${summary.iter6_received_prior_ids} prior IDs, cited ${summary.iter6_actually_cited_in_answer} [pb:...] markers in its answer`);
log(` iter 3 task-retry loop: ${summary.iter3_task_attempts} attempts (${summary.iter3_task_retries} prior-failure retries before success)`);
log(` total duration: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
log("");
for (const r of results) {
const flags = [
r.tree_split_fired ? "tree-split" : "",
r.continuation_retries > 0 ? `cont=${r.continuation_retries}` : "",
r.rescue_triggered ? "rescued" : "",
r.errors_recovered.length > 0 ? `err=${r.errors_recovered.length}` : "",
].filter(Boolean).join(" ");
log(` iter ${r.iteration}: ${r.tokens_prompt}+${r.tokens_completion} tok, ${r.duration_ms}ms ${flags ? `[${flags}]` : ""}`);
}
log("");
log(`artifacts: ${OUT_DIR}/{iter_1..6.json, summary.json}`);
process.exit(summary.overall === "PASS" ? 0 : 1);
}
main().catch(e => { console.error("[enrich] fatal:", e); process.exit(2); });

tests/real-world/hard_task_escalation.ts
@@ -1,267 +0,0 @@
// Hard-task escalation test. The task is deliberately constructed so
// that a local 7B model (qwen3.5:latest) will miss at least one of the
// validation rules. Watch the escalation ladder:
// 1. qwen3.5:latest (local 7B) — likely fails
// 2. qwen3:latest (local 7B) — likely fails differently
// 3. gpt-oss:20b (local 20B) — may fail
// 4. gpt-oss:120b (cloud 120B) — should succeed
// 5. devstral-2:123b (cloud 123B coding specialist) w/ prior-attempt
//    errors injected — retry with context
// 6. mistral-large-3:675b (cloud 675B; biggest on the current key, see
//    the LADDER note below); on failure, exit nonzero with every
//    attempt saved to the run artifacts
//
// Each attempt:
// - Calls the model via /v1/chat
// - Validates the output against a strict rubric
// - On fail: records the specific rubric violations + the partial
// output, injects both into the next attempt's prompt as "here's
// what's wrong, fix it specifically"
// - On success: exit loop
//
// Run: bun run tests/real-world/hard_task_escalation.ts
import { writeFile, mkdir } from "node:fs/promises";
const GATEWAY = "http://localhost:3100";
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/hard_task_${Date.now().toString(36)}`;
const MAX_ATTEMPTS = 6;
// The hard task. Specific enough that a small model will miss at
// least one rule. Not purely knowledge-based — it's a code-generation
// task with strict structural constraints.
const TASK = `Write a complete Rust async function with the EXACT signature:
pub async fn check_drift_batched(refs: Vec<DocRef>) -> Result<Vec<String>, String>
It must:
1. Group refs by tool name (case-insensitive; use .to_ascii_lowercase())
2. Issue parallel HTTP GET requests to http://localhost:3900/docs/{tool}/diff?since={snippet_hash}
3. Use reqwest and a JoinSet/Semaphore to cap concurrent in-flight requests at 4
4. On HTTP 5xx, retry with exponential backoff: sleep 250ms, then 500ms, then 1000ms, then give up on that tool
5. Parse the response JSON: {"drifted": bool, ...}. Return a Vec<String> of tool names where drifted == true
6. All errors bubble via ? or Result: NO .unwrap(), NO .expect(), NO panic!()
7. Include rustdoc /// comments on the function and each internal helper
Assume this struct is already imported:
pub struct DocRef { pub tool: String, pub snippet_hash: Option<String>, pub version_seen: String }
Output ONLY the Rust code. No prose, no markdown fences, no explanation. Start directly with the /// doc comment.`;
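// Rules 1-7 above map onto the validate() rubric below, so each
// violation string injected into a retry names the task rule it broke.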
// Escalation ladder — small-local → large-local → cloud → specialist
// cloud → trillion-param cloud. Corrected 2026-04-22 per J:
// gpt-oss:20b is LOCAL (ollama list confirms 13 GB on disk), and the
// final escalation tier should be kimi-k2:1t (the biggest model
// we have access to on Ollama Cloud).
const LADDER: Array<{ provider: "ollama" | "ollama_cloud"; model: string; note: string }> = [
{ provider: "ollama", model: "qwen3.5:latest", note: "local 7B" },
{ provider: "ollama", model: "qwen3:latest", note: "local 7B (different) " },
{ provider: "ollama", model: "gpt-oss:20b", note: "local 20B" }, // FIXED: local, not cloud
{ provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B" },
{ provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B (coding specialist)" },
// NOTE 2026-04-22 — J wanted Kimi as the last escalation but Kimi
// K2.5/K2.6 both return "this model requires a subscription" on our
// current Ollama Cloud key. mistral-large-3:675b is the biggest
// model actually provisioned on this key (verified via direct curl
// to ollama.com/api/generate). Upgrade path: Ollama Cloud Pro →
// swap this line to kimi-k2.5 or kimi-k2.6:cloud.
{ provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B (biggest available on current key; kimi-k2.x needs pro subscription)" },
];
// Validation rubric — the answer must pass all of these to be accepted.
interface RubricResult {
passed: boolean;
violations: string[];
passed_rules: string[];
}
function validate(code: string): RubricResult {
const violations: string[] = [];
const passed: string[] = [];
const check = (rule: string, ok: boolean) => { ok ? passed.push(rule) : violations.push(rule); };
check("has pub async fn check_drift_batched signature",
/pub\s+async\s+fn\s+check_drift_batched\s*\(/.test(code));
check("takes Vec<DocRef> argument",
/refs\s*:\s*Vec\s*<\s*DocRef\s*>/.test(code));
check("returns Result<Vec<String>, String>",
/Result\s*<\s*Vec\s*<\s*String\s*>\s*,\s*String\s*>/.test(code));
check("uses reqwest",
/\breqwest\b/i.test(code));
check("references JoinSet or Semaphore for concurrency",
/\bJoinSet\b|\bSemaphore\b/i.test(code));
check("bounds concurrency at 4",
/\b4\b/.test(code) && (/Semaphore\s*::\s*new\s*\(\s*4\b/.test(code) || /permits\s*:\s*4\b/.test(code) || /limit\s*:\s*4\b/.test(code) || /max\s*:\s*4\b/.test(code) || /capacity\s*:\s*4\b/.test(code)));
// Exponential backoff — models express this several ways. Accept
// any recognizable doubling pattern starting at 250ms. 2026-04-22:
// devstral-2:123b wrote `retry_delay *= 2` which my earlier regex
// rejected even though the code is correct. Broadening rubric to
// match all idiomatic doubling forms.
const hasSeed250 = /Duration\s*::\s*from_millis\s*\(\s*250\b/.test(code)
|| /millis\s*\(\s*250\b/.test(code);
const hasDoublingPattern = /250\s*\*\s*2/.test(code) // 250 * 2^n literal
|| /<<\s*\d+/.test(code) // bit-shift
|| /\.pow\s*\(/.test(code) // 2u32.pow(attempt)
|| /\*=\s*2\b/.test(code) // delay *= 2 ← was missing
|| /\*\s*2\s*;/.test(code) // delay = delay * 2;
|| /saturating_mul\s*\(\s*2\b/.test(code); // saturating doubling
check("has 250ms backoff seed",
hasSeed250);
check("reaches 500ms backoff (literal or doubling from 250)",
/Duration\s*::\s*from_millis\s*\(\s*500\b/.test(code)
|| /millis\s*\(\s*500\b/.test(code)
|| (hasSeed250 && hasDoublingPattern));
check("reaches 1000ms backoff (literal or doubling to 1000)",
/Duration\s*::\s*from_millis\s*\(\s*1000\b/.test(code)
|| /millis\s*\(\s*1000\b/.test(code)
|| (hasSeed250 && hasDoublingPattern));
check("case-insensitive tool grouping (to_ascii_lowercase)",
/to_ascii_lowercase|to_lowercase/.test(code));
check("NO .unwrap() — all errors bubble via ?",
!/\.unwrap\s*\(\s*\)/.test(code));
check("NO .expect(...) — all errors bubble via ?",
!/\.expect\s*\(/.test(code));
check("NO panic!() / unimplemented!() / todo!()",
!/\bpanic!\s*\(|\bunimplemented!\s*\(|\btodo!\s*\(/.test(code));
check("has rustdoc /// comments",
/\/\/\//.test(code));
check("reasonable length (> 500 chars)",
code.length > 500);
return { passed: violations.length === 0, violations, passed_rules: passed };
}
function log(msg: string) { console.log(`[hard] ${msg}`); }
async function chat(opts: {
provider: "ollama" | "ollama_cloud",
model: string,
prompt: string,
}): Promise<{ content: string; error?: string }> {
try {
const r = await fetch(`${GATEWAY}/v1/chat`, {
method: "POST", headers: { "content-type": "application/json" },
body: JSON.stringify({
provider: opts.provider,
model: opts.model,
messages: [{ role: "user", content: opts.prompt }],
max_tokens: 2500,
temperature: 0.2,
think: false,
}),
signal: AbortSignal.timeout(240000),
});
if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 300)}` };
const j: any = await r.json();
return { content: j.choices?.[0]?.message?.content ?? "" };
} catch (e) {
return { content: "", error: (e as Error).message };
}
}
interface AttemptRecord {
n: number;
provider: string;
model: string;
duration_ms: number;
content_chars: number;
error: string | null;
rubric_violations: string[];
rubric_passed: string[];
accepted: boolean;
}
function extractCode(raw: string): string {
// Strip common fence wrappers
const m = raw.match(/```(?:rust)?\s*\n([\s\S]*?)```/);
if (m) return m[1].trim();
return raw.trim();
}
async function main() {
await mkdir(OUT_DIR, { recursive: true });
log(`output: ${OUT_DIR}`);
log(`task: ${TASK.slice(0, 120)}...`);
log("");
const attempts: AttemptRecord[] = [];
let acceptedCode: string | null = null;
for (let i = 0; i < MAX_ATTEMPTS; i++) {
const n = i + 1;
const rung = LADDER[i] ?? LADDER[LADDER.length - 1];
// Build the prompt: base task + prior failures' learning blocks
let priorLearning = "";
if (attempts.length > 0) {
priorLearning = `\n\n═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══\n`;
for (const a of attempts) {
priorLearning += `Attempt ${a.n} (${a.provider}/${a.model}, ${a.content_chars} chars) violations:\n`;
for (const v of a.rubric_violations) priorLearning += ` - ${v}\n`;
if (a.error) priorLearning += ` [error: ${a.error.slice(0, 120)}]\n`;
}
priorLearning += `═══ end prior attempts ═══\n\nDO NOT repeat the above violations. Address each one explicitly.`;
}
log(`attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${priorLearning ? " [w/ learning]" : ""}`);
const t0 = Date.now();
const r = await chat({ provider: rung.provider, model: rung.model, prompt: TASK + priorLearning });
const dur = Date.now() - t0;
const code = extractCode(r.content);
const rubric = code ? validate(code) : { passed: false, violations: ["empty response"], passed_rules: [] };
const record: AttemptRecord = {
n,
provider: rung.provider,
model: rung.model,
duration_ms: dur,
content_chars: code.length,
error: r.error ?? null,
rubric_violations: rubric.violations,
rubric_passed: rubric.passed_rules,
accepted: rubric.passed,
};
attempts.push(record);
log(`${dur}ms, ${code.length} chars, ${rubric.passed_rules.length} rules passed / ${rubric.violations.length} failed${r.error ? `, err: ${r.error.slice(0, 80)}` : ""}`);
for (const v of rubric.violations.slice(0, 5)) log(`${v}`);
await writeFile(`${OUT_DIR}/attempt_${n}.txt`, code);
await writeFile(`${OUT_DIR}/attempt_${n}.json`, JSON.stringify(record, null, 2));
if (rubric.passed) {
log(` ✅ ACCEPTED on attempt ${n}`);
acceptedCode = code;
break;
}
}
const summary = {
task: TASK.slice(0, 200),
total_attempts: attempts.length,
accepted: acceptedCode !== null,
accepted_on_attempt: acceptedCode ? attempts.findIndex(a => a.accepted) + 1 : null,
escalation_path: attempts.map(a => `${a.provider}/${a.model}`),
per_attempt_pass_counts: attempts.map(a => a.rubric_passed.length),
per_attempt_violation_counts: attempts.map(a => a.rubric_violations.length),
total_duration_ms: attempts.reduce((s, a) => s + a.duration_ms, 0),
};
await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));
log("");
log(`═══ RESULT ═══`);
log(`attempts: ${summary.total_attempts}`);
log(`accepted: ${summary.accepted} ${summary.accepted ? `on attempt ${summary.accepted_on_attempt}` : ""}`);
log(`escalation path:`);
for (const [i, a] of attempts.entries()) {
const mark = a.accepted ? "✅" : "❌";
log(` ${mark} attempt ${i + 1}: ${a.provider}/${a.model} · ${a.rubric_passed.length}/${a.rubric_passed.length + a.rubric_violations.length} rules passed, ${a.duration_ms}ms`);
}
log("");
log(`total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
log(`artifacts: ${OUT_DIR}/{attempt_1..N.{txt,json}, summary.json}`);
process.exit(summary.accepted ? 0 : 1);
}
main().catch(e => { console.error("[hard] fatal:", e); process.exit(2); });

tests/real-world/nine_consecutive_audits.ts
@@ -1,186 +0,0 @@
// Nine-consecutive audit runner — empirical test of the predictive-
// compounding property. Runs the audit pipeline 9 times against the
// same PR (each time with a new diff from Gitea), captures the
// verdict + audit_lessons state after each run, and reports whether
// the KB stabilizes or drifts.
//
// What we expect (favorable compounding):
// - signature_count grows sublinearly (same patterns recur, so
// distinct-signature count stabilizes fast)
// - verdict settles on a stable value after run 2-3 (first audit
// establishes baseline, rest repeat)
// - confidence stays LOW for all signatures (same PR repeatedly)
// - NO new recurring findings fire because confidence < 0.3 on
// same-PR noise (kb_index rating policy)
//
// What would indicate drift (the thing we want to prove DOESN'T happen):
// - signature_count grows linearly — each run produces new signatures
// - verdict oscillates (block → approve → block ...)
// - confidence inflates — kb_index rating escalates on repeated runs
//
// Run: bun run tests/real-world/nine_consecutive_audits.ts
import { readFile, writeFile } from "node:fs/promises";
import { join } from "node:path";
import { aggregate } from "../../auditor/kb_index.ts";
import { getPrSnapshot } from "../../auditor/gitea.ts";
import { auditPr } from "../../auditor/audit.ts";
const REPO = "/home/profit/lakehouse";
const AUDIT_LESSONS = `${REPO}/data/_kb/audit_lessons.jsonl`;
const VERDICTS_DIR = `${REPO}/data/_auditor/verdicts`;
const POLL_INTERVAL_MS = 5_000;
const RUNS = Number(process.env.LH_AUDIT_RUNS ?? 9);
const TARGET_PR = Number(process.env.LH_AUDIT_PR ?? 8);
const SKIP_INFERENCE = process.env.LH_AUDITOR_SKIP_INFERENCE !== "0";
const RESET_KB = process.env.LH_RESET_KB === "1";
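// Example: a clean-KB pass against the default PR:
//   LH_RESET_KB=1 bun run tests/real-world/nine_consecutive_audits.ts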
async function waitForVerdict(prNum: number, sha: string, deadlineMs: number): Promise<any> {
const short = sha.slice(0, 12);
const path = join(VERDICTS_DIR, `${prNum}-${short}.json`);
const start = Date.now();
while (Date.now() - start < deadlineMs) {
try {
const raw = await readFile(path, "utf8");
return JSON.parse(raw);
} catch { /* not yet */ }
await Bun.sleep(POLL_INTERVAL_MS);
}
throw new Error(`no verdict file after ${deadlineMs}ms: ${path}`);
}
async function captureAggState(): Promise<{ sig_count: number; max_count: number; max_confidence: number; recurring_max_count: number; top3: Array<{ sig: string; count: number; conf: number; summary: string }> }> {
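// keyFn buckets lessons by recurring-finding signature; scopeFn tags each
// with its PR so the kb_index rating policy can treat same-PR recurrence
// as low-confidence noise (the property this test checks).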
const agg = await aggregate<any>(AUDIT_LESSONS, {
keyFn: (r) => r?.signature,
scopeFn: (r) => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
});
const list = Array.from(agg.values()).sort((a, b) => b.count - a.count);
const recurring = list.filter(r => r.count >= 2);
const recurringMaxCount = recurring.length > 0 ? Math.max(...recurring.map(a => a.count)) : 0;
const recurringMaxConf = recurring.length > 0 ? Math.max(...recurring.map(a => a.confidence)) : 0;
return {
sig_count: list.length,
max_count: list[0]?.count ?? 0,
max_confidence: recurringMaxConf,
recurring_max_count: recurringMaxCount,
top3: list.slice(0, 3).map(a => ({
sig: a.signature,
count: a.count,
conf: a.confidence,
summary: a.representative_summary.slice(0, 80),
})),
};
}
interface RunRecord {
run: number;
sha: string;
verdict_overall: string;
findings_total: number;
findings_block: number;
findings_warn: number;
findings_info: number;
audit_duration_ms: number;
claims_total: number;
claims_empirical: number;
kb_sig_count_after: number;
kb_max_count_after: number;
kb_max_confidence_after: number;
kb_recurring_max_count: number;
}
async function main() {
console.log(`[nine] target PR: #${TARGET_PR}`);
console.log(`[nine] runs: ${RUNS}`);
console.log(`[nine] skip_inference: ${SKIP_INFERENCE}`);
console.log(`[nine] reset_kb: ${RESET_KB}`);
console.log(`[nine] audit_lessons.jsonl: ${AUDIT_LESSONS}`);
if (RESET_KB) {
console.log("[nine] clearing audit_lessons.jsonl for clean test...");
await writeFile(AUDIT_LESSONS, "");
}
console.log("");
const pr = await getPrSnapshot(TARGET_PR);
console.log(`[nine] PR #${pr.number}: "${pr.title}" (head=${pr.head_sha.slice(0, 12)})`);
console.log(`[nine] files in diff: ${pr.files.length}`);
console.log("");
const baseline = await captureAggState();
console.log(`[nine] baseline: sig_count=${baseline.sig_count} max_count=${baseline.max_count} max_conf=${baseline.max_confidence.toFixed(2)}`);
console.log("");
const records: RunRecord[] = [];
for (let n = 1; n <= RUNS; n++) {
const t0 = Date.now();
console.log(`─── run ${n}/${RUNS} ───`);
const verdict = await auditPr(pr, {
dry_run: true,
skip_dynamic: true,
skip_inference: SKIP_INFERENCE,
});
console.log(` sha ${verdict.head_sha.slice(0, 12)}`);
const after = await captureAggState();
const rec: RunRecord = {
run: n,
sha: verdict.head_sha.slice(0, 12),
verdict_overall: String(verdict.overall),
findings_total: Number(verdict.metrics?.findings_total ?? 0),
findings_block: Number(verdict.metrics?.findings_block ?? 0),
findings_warn: Number(verdict.metrics?.findings_warn ?? 0),
findings_info: Number(verdict.metrics?.findings_info ?? 0),
audit_duration_ms: Number(verdict.metrics?.audit_duration_ms ?? 0),
claims_total: Number(verdict.metrics?.claims_total ?? 0),
claims_empirical: Number(verdict.metrics?.claims_empirical ?? 0),
kb_sig_count_after: after.sig_count,
kb_max_count_after: after.max_count,
kb_max_confidence_after: after.max_confidence,
kb_recurring_max_count: after.recurring_max_count,
};
records.push(rec);
console.log(` verdict=${rec.verdict_overall} findings=${rec.findings_total} (b=${rec.findings_block} w=${rec.findings_warn})`);
console.log(` kb after: sig=${rec.kb_sig_count_after} max_count=${rec.kb_max_count_after} recurring_max=${rec.kb_recurring_max_count} max_conf=${rec.kb_max_confidence_after.toFixed(2)}`);
console.log(` elapsed: ${((Date.now() - t0) / 1000).toFixed(1)}s`);
console.log("");
}
console.log("═══ FINAL ═══");
console.log("run | verdict | find | block warn info | dur_s | kb_sig max_count max_conf");
for (const r of records) {
console.log(
` ${String(r.run).padStart(1)} | ${r.verdict_overall.padEnd(16)} | ${String(r.findings_total).padStart(4)} | ${String(r.findings_block).padStart(5)} ${String(r.findings_warn).padStart(5)} ${String(r.findings_info).padStart(5)} | ${(r.audit_duration_ms / 1000).toFixed(1).padStart(5)} | ${String(r.kb_sig_count_after).padStart(6)} ${String(r.kb_max_count_after).padStart(9)} ${r.kb_max_confidence_after.toFixed(2)}`,
);
}
console.log("");
console.log("═══ COMPOUNDING PROPERTY ═══");
const sigDelta = records[records.length - 1].kb_sig_count_after - baseline.sig_count;
const maxConf = records[records.length - 1].kb_max_confidence_after;
const recurringMax = records[records.length - 1].kb_recurring_max_count;
console.log(` signatures added over ${RUNS} runs: ${sigDelta}`);
console.log(` max recurring count after run ${RUNS}: ${recurringMax} (same-PR recurrences per signature)`);
console.log(` max confidence after run ${RUNS}: ${maxConf.toFixed(2)} (expect LOW — same-PR should not inflate)`);
const verdictSet = new Set(records.map(r => r.verdict_overall));
if (verdictSet.size === 1) {
console.log(` verdict stable: all ${RUNS} runs returned '${[...verdictSet][0]}' ✓`);
} else {
console.log(` verdict oscillated across runs: ${[...verdictSet].join(" | ")}`);
}
if (maxConf < 0.6 && recurringMax < 5) {
console.log(` confidence policy holding: same-PR noise stays below escalation threshold ✓`);
} else {
console.log(` ⚠ cross-cutting pattern detected (conf=${maxConf.toFixed(2)}, recurring=${recurringMax}) — kb_index policy escalated`);
}
const jsonOut = `${REPO}/tests/real-world/runs/nine_consecutive_${Date.now().toString(36)}.json`;
await Bun.write(jsonOut, JSON.stringify({ target_pr: TARGET_PR, baseline, records }, null, 2));
console.log("");
console.log(` report: ${jsonOut}`);
}
main().catch(e => { console.error("[nine] fatal:", e); process.exit(1); });