remove 5 orphaned dev experiments from tests/real-world/
Per J: "all of our test code ended up in the main." These are 5 one-time
dev experiments that were never wired into any automation and have zero
live consumers in the production code path. Deleting them.
Removed (1418 LOC total):
- enrich_prd_pipeline.ts (528 LOC) — Phase 21 architecture stress test
- nine_consecutive_audits.ts (185 LOC) — empirical study of audit compounding
- hard_task_escalation.ts (267 LOC) — escalation-ladder test (refs retired
  cloud models gpt-oss:20b/120b)
- autonomous_loop.ts (214 LOC) — wrapper experiment around scrum_master_pipeline + scrum_applier
- consensus_reducer_design.ts (224 LOC) — N=3 design consultation; output JSON
  referenced from a pathway_memory.rs comment, but the script itself has no
  consumer
Verified: 0 references in package.json / justfile / Makefile / any
production .ts/.rs/.sh file. The single mention from pathway_memory.rs
is a //! doc comment referencing the JSON output
(data/_kb/consensus_reducer_design_*.json), not the script. Build clean
post-delete.
KEPT:
- scrum_master_pipeline.ts — referenced from observer.ts, vectord, scripts
- scrum_applier.ts — referenced from auditor schemas
If you need any of these back, they're in git history. A cherry-pick or
git show HEAD~1:tests/real-world/<file>.ts will recover the source.
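For example, to pull one back from the parent commit:

    git show bb5a3b3f5e:tests/real-world/autonomous_loop.ts > autonomous_loop.ts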
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
parent bb5a3b3f5e
commit 6aafd41785
tests/real-world/autonomous_loop.ts
@@ -1,214 +0,0 @@
#!/usr/bin/env bun
// Autonomous scrum loop — wraps scrum_master_pipeline.ts + scrum_applier.ts
// in a goal-driven retry loop. Observer is POSTed an iteration summary at
// every boundary so it can build meta-commentary outside the loop's epistemic
// scope.
//
// Usage:
//   LOOP_TARGETS="crates/a/src/x.rs,crates/b/src/y.rs" \
//   LOOP_MAX_ITERS=5 \
//   LOOP_PUSH=1 \
//   bun run tests/real-world/autonomous_loop.ts
//
// Stop conditions: max_iters reached OR 2 consecutive iters with 0 commits.

import { spawn } from "node:child_process";
import { appendFile, readFile } from "node:fs/promises";
import { existsSync } from "node:fs";

const REPO = "/home/profit/lakehouse";
const OBSERVER = process.env.LOOP_OBSERVER ?? "http://localhost:3800";
const BRANCH = process.env.LOOP_BRANCH ?? "scrum/auto-apply-19814";
const MAX_ITERS = Number(process.env.LOOP_MAX_ITERS ?? 3);
const PUSH = process.env.LOOP_PUSH === "1";
const MIN_CONF = process.env.LOOP_MIN_CONF ?? "85";
// Optional override — when unset, let scrum_applier.ts use ITS default
// (currently x-ai/grok-4.1-fast on openrouter). The prior hardcoded
// qwen3-coder:480b default was clobbering the applier's own default
// and forcing every iter to hit the throttled ollama_cloud account.
const APPLIER_MODEL = process.env.LOOP_APPLIER_MODEL;
const APPLIER_PROVIDER = process.env.LOOP_APPLIER_PROVIDER;
const TARGETS = (process.env.LOOP_TARGETS ?? "crates/queryd/src/service.rs,crates/gateway/src/main.rs,crates/gateway/src/v1/mod.rs")
  .split(",").map(s => s.trim()).filter(Boolean);

const FORENSIC = process.env.LH_SCRUM_FORENSIC ?? `${REPO}/docs/SCRUM_FORENSIC_PROMPT.md`;
const PROPOSAL = process.env.LH_SCRUM_PROPOSAL ?? `${REPO}/docs/SCRUM_FIX_WAVE.md`;

const LOOP_ID = `loop_${Date.now().toString(36)}`;
const JOURNAL = `${REPO}/data/_kb/autonomous_loops.jsonl`;

interface IterResult {
  iter: number;
  scrum_reviews_added: number;
  applier_outcomes: Record<string, number>;
  commits_landed: number;
  commit_shas: string[];
  build_status: "green" | "red" | "unknown";
  duration_ms: number;
}

function log(msg: string) {
  const ts = new Date().toISOString().slice(11, 19);
  console.log(`[loop ${LOOP_ID} ${ts}] ${msg}`);
}

function runCmd(cmd: string, args: string[], env: Record<string, string> = {}): Promise<{ code: number; stdout: string; stderr: string }> {
  return new Promise((resolve) => {
    const child = spawn(cmd, args, { cwd: REPO, env: { ...process.env, ...env } });
    let stdout = "", stderr = "";
    child.stdout.on("data", (d) => { stdout += d; process.stdout.write(d); });
    child.stderr.on("data", (d) => { stderr += d; process.stderr.write(d); });
    child.on("close", (code) => resolve({ code: code ?? -1, stdout, stderr }));
  });
}

async function countLines(path: string): Promise<number> {
  if (!existsSync(path)) return 0;
  const text = await readFile(path, "utf8");
  return text.split("\n").filter(Boolean).length;
}
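// Note: countLines counts non-empty lines only, so trailing newlines in the
// JSONL journals don't skew the before/after row deltas taken in runIter.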

async function gitHeadSha(): Promise<string> {
  const r = await runCmd("git", ["rev-parse", "HEAD"]);
  return r.stdout.trim();
}

async function commitsSince(baseSha: string): Promise<string[]> {
  const r = await runCmd("git", ["log", "--oneline", `${baseSha}..HEAD`]);
  return r.stdout.trim().split("\n").filter(Boolean);
}

async function cargoCheckGreen(): Promise<boolean> {
  log("cargo check --workspace …");
  const r = await runCmd("cargo", ["check", "--workspace", "--quiet"]);
  return r.code === 0;
}

async function postObserver(payload: object) {
  try {
    const r = await fetch(`${OBSERVER}/event`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(payload),
      signal: AbortSignal.timeout(5000),
    });
    if (!r.ok) log(`observer POST returned ${r.status}`);
  } catch (e: any) {
    log(`observer POST failed: ${e.message}`);
  }
}

async function runIter(iter: number, baseSha: string): Promise<IterResult> {
  const t0 = Date.now();
  log(`══ iter ${iter} start (base ${baseSha.slice(0, 8)}) targets=${TARGETS.length}`);

  const reviewsBefore = await countLines(`${REPO}/data/_kb/scrum_reviews.jsonl`);
  const applyBefore = await countLines(`${REPO}/data/_kb/auto_apply.jsonl`);

  log(`scrum_master_pipeline.ts → ${TARGETS.length} files`);
  await runCmd("bun", ["run", "tests/real-world/scrum_master_pipeline.ts"], {
    LH_SCRUM_FILES: TARGETS.join(","),
    LH_SCRUM_FORENSIC: FORENSIC,
    LH_SCRUM_PROPOSAL: PROPOSAL,
  });

  log(`scrum_applier.ts COMMIT=1 MIN_CONF=${MIN_CONF} files=${TARGETS.length}`);
  // Only forward model/provider when explicitly overridden — otherwise
  // let scrum_applier.ts use its own defaults (Grok 4.1 fast on openrouter).
  const applierEnv: Record<string, string> = {
    LH_APPLIER_COMMIT: "1",
    LH_APPLIER_MIN_CONF: MIN_CONF,
    LH_APPLIER_MAX_FILES: String(TARGETS.length),
    LH_APPLIER_BRANCH: BRANCH,
    // Constrain applier to THIS iter's targets so it patches what we
    // just reviewed instead of the highest-confidence file from history.
    LH_APPLIER_FILES: TARGETS.join(","),
  };
  if (APPLIER_MODEL) applierEnv.LH_APPLIER_MODEL = APPLIER_MODEL;
  if (APPLIER_PROVIDER) applierEnv.LH_APPLIER_PROVIDER = APPLIER_PROVIDER;
  await runCmd("bun", ["run", "tests/real-world/scrum_applier.ts"], applierEnv);

  const reviewsAfter = await countLines(`${REPO}/data/_kb/scrum_reviews.jsonl`);
  const applyAfterText = existsSync(`${REPO}/data/_kb/auto_apply.jsonl`)
    ? await readFile(`${REPO}/data/_kb/auto_apply.jsonl`, "utf8")
    : "";
  const applyRows = applyAfterText.split("\n").filter(Boolean).slice(applyBefore);
  const outcomes: Record<string, number> = {};
  for (const line of applyRows) {
    try {
      const o = JSON.parse(line);
      outcomes[o.action ?? "?"] = (outcomes[o.action ?? "?"] ?? 0) + 1;
    } catch { /* skip malformed */ }
  }

  const commitShas = await commitsSince(baseSha);
  const buildStatus = commitShas.length > 0 ? (await cargoCheckGreen() ? "green" : "red") : "unknown";

  const result: IterResult = {
    iter,
    scrum_reviews_added: reviewsAfter - reviewsBefore,
    applier_outcomes: outcomes,
    commits_landed: commitShas.length,
    commit_shas: commitShas.map(s => s.split(" ")[0]),
    build_status: buildStatus,
    duration_ms: Date.now() - t0,
  };

  log(`iter ${iter} done — reviews+${result.scrum_reviews_added} commits=${result.commits_landed} build=${buildStatus} (${(result.duration_ms / 1000).toFixed(1)}s)`);

  await postObserver({
    source: "autonomous_loop",
    loop_id: LOOP_ID,
    event_kind: "iteration_complete",
    iter,
    targets: TARGETS,
    success: buildStatus !== "red",
    scrum_reviews_added: result.scrum_reviews_added,
    applier_outcomes: result.applier_outcomes,
    commits_landed: result.commits_landed,
    commit_shas: result.commit_shas,
    build_status: buildStatus,
    duration_ms: result.duration_ms,
    ts: new Date().toISOString(),
  });

  await appendFile(JOURNAL, JSON.stringify({ loop_id: LOOP_ID, ...result, ts: new Date().toISOString() }) + "\n");
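  // A journal row is one JSON object per line, e.g. (illustrative values):
  // {"loop_id":"loop_m9x","iter":1,"scrum_reviews_added":3,"applier_outcomes":{"committed":2},
  //  "commits_landed":2,"commit_shas":["abc1234","def5678"],"build_status":"green","duration_ms":84210,"ts":"..."}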

  return result;
}

async function main() {
  log(`autonomous loop starting · branch=${BRANCH} max_iters=${MAX_ITERS} push=${PUSH}`);
  log(`targets: ${TARGETS.join(", ")}`);

  const branchR = await runCmd("git", ["branch", "--show-current"]);
  if (branchR.stdout.trim() !== BRANCH) {
    log(`ERROR: on branch ${branchR.stdout.trim()}, expected ${BRANCH}. Refusing to run.`);
    process.exit(1);
  }

  let consecutiveZero = 0;
  for (let iter = 1; iter <= MAX_ITERS; iter++) {
    const baseSha = await gitHeadSha();
    const result = await runIter(iter, baseSha);

    if (PUSH && result.commits_landed > 0) {
      log(`git push origin ${BRANCH}`);
      const pushR = await runCmd("git", ["push", "origin", BRANCH]);
      if (pushR.code !== 0) log(`push failed (continuing): ${pushR.stderr.slice(0, 200)}`);
    }

    consecutiveZero = result.commits_landed === 0 ? consecutiveZero + 1 : 0;
    if (consecutiveZero >= 2) {
      log(`STOP: 2 consecutive iters with 0 commits. Loop converged or stuck.`);
      break;
    }
  }

  log(`loop ${LOOP_ID} complete. Journal: ${JOURNAL}`);
}

main().catch((e) => {
  log(`FATAL: ${e.message}`);
  process.exit(1);
});
tests/real-world/consensus_reducer_design.ts
@@ -1,224 +0,0 @@
// consensus_reducer_design.ts — N=3 design consultation.
//
// J's ask: enhance the tree-split reducer to preserve FULL backtrack-able
// context (endpoints tried, attempt count per model in the ladder, KB
// sources retrieved, context7 bridge hits, MCP observer signals, audit
// verdicts) instead of collapsing to a summary. Then index the full
// context through our existing vectord matrix indexing (HNSW + Lance +
// playbook_memory) so successful pathways become hot-swappable — the
// system asks "what did we try, what worked, in what order" for a
// similar task class and gets a ranked playbook back.
//
// Before building, consult three diverse models and print their proposals
// side-by-side so we can pick the convergent design.

const GATEWAY = "http://localhost:3100";

const DESIGN_BRIEF = `
# Context — Lakehouse signal→commit loop

We run 6x scrum-master iterations that audit Rust crates for PRD
alignment, produce findings + confidence, and feed an auto-applier that
lands small mechanical commits through a cargo-green-and-warning-stable
gate. Key components:

- \`tests/real-world/scrum_master_pipeline.ts\` — orchestrator. 9-rung
  model LADDER (kimi-k2:1t → qwen3-coder:480b → deepseek → mistral-large
  → gpt-oss:120b → qwen3.5:397b → openrouter free rescues → local
  qwen3.5:latest). Each target file retrieves 5 PRD chunks + 5
  proposal-doc chunks via vectord RAG, tree-splits large files into 3.5K
  shards, asks each rung in order, accepts first response passing
  structural checks.
- \`mcp-server/observer.ts\` — receives scrum \`/event\` emissions
  (file, verdict, critical_failures_count, gradient_tier, attempts,
  reviewer_model, tree_split_fired). Escalates failure clusters to LLM
  Team by POSTing to /v1/chat with qwen3-coder:480b.
- \`context7-bridge\` — external library docs lookup.
- \`auditor/audit.ts\` — independent N=3 consensus re-check of scrum
  findings; writes to data/_kb/audit_facts.jsonl via LLM Team
  \`/api/run?mode=extract\`.
- \`crates/vectord/src/playbook_memory.rs\` — indexing for proven
  playbooks: PlaybookEntry, DocRef, FailureRecord, BoostEntry,
  PatternReport. Uses HNSW index + Lance columnar backend + promotion
  pipeline. Already battle-tested for workers/staffing queries.
- Tree-split REDUCER: after shards return map-style summaries, they are
  concatenated with internal §N§ markers and fed to a reviewer model to
  produce ONE file-level review. Currently the reducer sees summaries,
  not the full context behind each shard's conclusion.

# The problem

The reducer currently TRUNCATES to a short summary. When the auditor or
a future iteration wants to backtrack WHY the reducer concluded what it
did — which attempt succeeded, which failed, what KB chunks were
retrieved, what observer signal classified the file as LOOPING vs
CONVERGING — that context is lost. So:

1. Auditor can't verify citation provenance beyond the summary line.
2. Applier can't tell a "tried X, failed, qwen fixed it" playbook from a
   "tried X and it was easy" playbook — they look identical downstream.
3. The matrix indexing is only used for RAG chunks during the scrum
   pass, NOT for storing the full end-to-end pathway of a successful
   review.

# The design question

Propose an enhanced reducer + indexing design that:

(a) Preserves the FULL backtrack context per reviewed file:
    - every ladder attempt (model, ms, accepted_y/n, reject_reason)
    - every retrieved KB chunk (source doc, chunk id, cosine score, rank)
    - every observer signal (class, priors, prior-iter outcomes)
    - every context7 bridge hit (library, version pulled)
    - every sub-pipeline call (LLM Team extract results, audit consensus)

(b) Stores this pathway into vectord's matrix indexing alongside the
    review verdict so it becomes retrievable by similarity. When a new
    file's fingerprint (task_class + file-path prefix + signal class)
    matches a past successful pathway, the system can hot-swap by
    replaying or short-circuiting to the model/KB combo that worked.

(c) Surfaces the matrix-index hit rate as a feedback signal on the
    scrum's UI — "this file was solved 3 times before by the same ladder
    rung; consider short-circuiting to rung 5."

(d) Is compatible with the existing playbook_memory.rs primitives
    (PlaybookEntry, DocRef, FailureRecord, BoostEntry) — extend don't
    replace. The indexing layer is in production for workers/staffing;
    we want the reducer pathway to piggyback on proven infrastructure.

# Constraints
- NO new crate. Extend vectord + scrum_master_pipeline.
- Full context can be LARGE — a reviewed file might have 5 retrievals,
  4 ladder attempts, 8 observer priors. Design the embedding /
  fingerprint so similar-but-not-identical pathways cluster.
- The reducer summary is still needed for the reviewer LLM input —
  don't remove it, ADD the full-context sidecar.
- Audit trail: every pathway must be replayable deterministically from
  what's stored (i.e., enough context to re-run without the original
  prompt cache).

# Required output (STRICT JSON, no prose, no markdown fences):

{
  "approach": "one-paragraph summary of your proposed design",
  "data_model": {
    "new_fields_on_playbook_entry": [...],
    "new_types": [ {"name": "...", "purpose": "...", "fields": [...]} ]
  },
  "storage_strategy": {
    "what_to_vectorize": "the text that becomes the embedding",
    "fingerprint_key": "the deterministic key for similarity retrieval",
    "backend": "HNSW, Lance, playbook_memory — pick"
  },
  "reducer_changes": {
    "inputs_added": [...],
    "outputs_added": [...],
    "compatibility_notes": "how existing callers stay working"
  },
  "hot_swap_logic": "concrete rule for when to skip the ladder and replay a past pathway",
  "ui_signal": "what to surface so J sees whether matrix indexing is earning its keep",
  "risks": [...],
  "why_this_beats_summarization": "one-paragraph argument"
}
`.trim();

interface Probe {
  name: string;
  provider: "ollama" | "ollama_cloud" | "openrouter";
  model: string;
}

// Round-3 probe set — 4 probes covering the remaining ladder rungs +
// architecture/provider diversity. J wanted all 4 of the untouched
// options so the aggregated 10-model signal is saturated across the
// usable ladder.
const PROBES: Probe[] = [
  { name: "qwen35-397b", provider: "ollama_cloud", model: "qwen3.5:397b" },
  { name: "openrouter-gpt-oss", provider: "openrouter", model: "openai/gpt-oss-120b:free" },
  { name: "openrouter-gemma3", provider: "openrouter", model: "google/gemma-3-27b-it:free" },
  { name: "qwen3-coder-480b-2", provider: "ollama_cloud", model: "qwen3-coder:480b" }, // second probe of the coding specialist — stability check
];

async function ask(p: Probe): Promise<{ name: string; raw: string; ms: number; error?: string }> {
  const started = Date.now();
  try {
    const r = await fetch(`${GATEWAY}/v1/chat`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        provider: p.provider,
        model: p.model,
        messages: [
          { role: "system", content: "You are a senior architect. Output STRICT JSON only." },
          { role: "user", content: DESIGN_BRIEF },
        ],
        max_tokens: 3000,
        temperature: 0,
      }),
    });
    const ms = Date.now() - started;
    if (!r.ok) return { name: p.name, raw: "", ms, error: `HTTP ${r.status}: ${(await r.text()).slice(0, 200)}` };
    const j = await r.json();
    const content = j.content ?? j.message?.content ?? j.choices?.[0]?.message?.content ?? "";
    return { name: p.name, raw: String(content), ms };
  } catch (e: any) {
    return { name: p.name, raw: "", ms: Date.now() - started, error: String(e).slice(0, 200) };
  }
}

function extractJson(raw: string): any | null {
  let s = raw.trim();
  const fence = s.match(/^```(?:json)?\s*/);
  if (fence) s = s.slice(fence[0].length);
  if (s.endsWith("```")) s = s.slice(0, -3).trim();
  const first = s.indexOf("{");
  const last = s.lastIndexOf("}");
  if (first < 0 || last <= first) return null;
  try {
    return JSON.parse(s.slice(first, last + 1));
  } catch {
    return null;
  }
}
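// e.g. extractJson("```json\n{\"approach\": \"sidecar\"}\n```") yields
// { approach: "sidecar" }; prose before the first "{" or after the last
// "}" is ignored, and an unparseable body comes back as null.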

function summarize(obj: any, max = 240): string {
  if (obj == null) return "(no JSON parsed)"; // null/undefined only; 0 and "" are real values
  if (typeof obj === "string") return obj.length > max ? obj.slice(0, max) + "…" : obj;
  if (Array.isArray(obj)) return obj.map((x) => summarize(x, max)).join("; ");
  if (typeof obj !== "object") return String(obj); // numbers/booleans would otherwise vanish through Object.entries
  return Object.entries(obj)
    .map(([k, v]) => `${k}=${summarize(v, max)}`)
    .join(" | ");
}
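// e.g. summarize({ approach: "sidecar", risks: ["size", "drift"] })
// renders as: approach=sidecar | risks=size; drift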

async function main() {
  console.log(`\n── N=${PROBES.length} design consensus ──`);
  console.log(`models: ${PROBES.map((p) => p.model).join(", ")}\n`);

  const results = await Promise.all(PROBES.map(ask));

  for (const r of results) {
    console.log(`\n── ${r.name} (${r.ms}ms) ──`);
    if (r.error) { console.log(`  ERROR: ${r.error}`); continue; }
    const j = extractJson(r.raw);
    if (!j) {
      console.log(`  raw (no JSON): ${r.raw.slice(0, 600)}…`);
      continue;
    }
    console.log(`  approach: ${summarize(j.approach, 400)}`);
    console.log(`  fingerprint: ${summarize(j.storage_strategy?.fingerprint_key, 200)}`);
    console.log(`  vectorize: ${summarize(j.storage_strategy?.what_to_vectorize, 200)}`);
    console.log(`  backend: ${summarize(j.storage_strategy?.backend, 200)}`);
    console.log(`  hot_swap: ${summarize(j.hot_swap_logic, 300)}`);
    console.log(`  new_types: ${summarize(j.data_model?.new_types, 400)}`);
    console.log(`  risks: ${summarize(j.risks, 300)}`);
    console.log(`  why>summary: ${summarize(j.why_this_beats_summarization, 300)}`);
  }

  // Write full JSON to disk so we can inspect later.
  const outPath = `/home/profit/lakehouse/data/_kb/consensus_reducer_design_${Date.now().toString(36)}.json`;
  await Bun.write(outPath, JSON.stringify(results, null, 2));
  console.log(`\nfull responses → ${outPath}`);
}

await main();
tests/real-world/enrich_prd_pipeline.ts
@@ -1,528 +0,0 @@
// Real-world architecture stress test — 6 iterations of the full pipeline
// against the PRD as a corpus. Goal: prove at scale what Phase 21
// promised (context continuation + tree-split), plus Phase 19
// compounding across iterations.
//
// Run: bun run tests/real-world/enrich_prd_pipeline.ts
//
// No mocks. No skipped layers. On any error, the test triggers
// cloud-rescue rather than fail — it's the architecture's job to
// recover. The test FAILS only if we can't complete 6 iterations.

import { readFile, writeFile, mkdir } from "node:fs/promises";
import { createHash } from "node:crypto";

const PRD_PATH = "/home/profit/lakehouse/docs/PRD.md";
const CHUNK_SIZE = 800;            // chars per chunk — ~200 tokens
const CHUNK_OVERLAP = 120;
const TOP_K_RETRIEVE = 12;         // chunks per iteration — pulled up to force overflow
const CONTEXT_BUDGET_CHARS = 4000; // tight budget — forces tree-split on every iter
const INJECT_FAIL_ON_ITER = 3;     // force the TASK-retry loop on iter 3

// Continuation controls (per-cloud-call) — used for output-overflow.
// Separate from the task-retry loop (per-task) — that handles errors
// across attempts.
const PRIMARY_MAX_TOKENS = 150;      // tight — forces truncation
const CONTINUATION_MAX_TOKENS = 300; // each continuation doubles headroom
const MAX_CONTINUATIONS = 6;         // max stitch pieces per cloud call

// Task-level retry loop (J's clarification, 2026-04-22):
// When a TASK errors, retry the whole task up to 6 times. Each
// retry gets prior attempts' failures injected as learning context,
// so attempt N+1 is informed by what N failed at. The loop caps at
// 6 to avoid infinite spinning on genuinely unsolvable tasks.
const MAX_TASK_RETRIES = 6;

// To FORCE the retry loop on iter INJECT_FAIL_ON_ITER, cycle through
// 5 deliberately-invalid models + 1 valid one. Attempts 1-5 will
// 502/404 from Ollama Cloud; attempt 6 finally succeeds. Proves the
// loop fires all 6 with compounding failure context.
const FORCE_RETRY_MODEL_SEQUENCE = [
  "deliberately-invalid-model-attempt-1",
  "deliberately-invalid-model-attempt-2",
  "deliberately-invalid-model-attempt-3",
  "deliberately-invalid-model-attempt-4",
  "deliberately-invalid-model-attempt-5",
  "gpt-oss:20b", // 6th attempt succeeds
];
const GATEWAY = "http://localhost:3100";
const SIDECAR = "http://localhost:3200";
const CLOUD_MODEL = "gpt-oss:120b";
const RESCUE_MODEL = "gpt-oss:20b"; // fallback local cloud model via sidecar
const RUN_NONCE = Date.now().toString(36);
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/${RUN_NONCE}`;

// The 6 progressively-compounding questions. #6 explicitly requires
// synthesis across prior 5 answers.
const QUESTIONS: string[] = [
  "Summarize the Lakehouse project's one-paragraph thesis: what problem does it solve, what's the unique approach?",
  "How does Phase 19 playbook memory turn successful fills into a signal that boosts future rankings?",
  "Explain the role of Phase 24 observer in the learning loop — what does it see, and what does it feed into?",
  "What's the VRAM-aware profile swap mechanism in Phase 17, and why does it matter for multi-model serving?",
  "How do Phase 25 validity windows and Phase 27 playbook versioning interact when a schema drifts?",
  "Synthesize the prior 5 answers: how do the pieces (playbook memory, observer, profile swap, validity windows, versioning) compose into a system that measurably gets smarter over time? Cite specific prior answers.",
];

type Chunk = { id: string; text: string; embedding: number[]; offset: number };

interface IterationResult {
  iteration: number;
  question: string;
  retrieval_top_k: number;
  context_chars_before_budget: number;
  tree_split_fired: boolean;
  cloud_calls_total: number;
  continuation_retries: number;
  rescue_triggered: boolean;
  // Task-level retry telemetry
  task_attempts_made: number; // how many attempts fired (1 = first succeeded)
  task_retry_history: Array<{ n: number; model: string; error: string }>;
  playbook_id: string | null;
  tokens_prompt: number;
  tokens_completion: number;
  citations_from_prior_iterations: string[];
  duration_ms: number;
  answer_preview: string;
  errors_recovered: string[];
}

function log(msg: string) { console.log(`[enrich] ${msg}`); }

function sleep(ms: number) { return new Promise(r => setTimeout(r, ms)); }

function cosine(a: number[], b: number[]): number {
  let dot = 0, na = 0, nb = 0;
  for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
  if (na === 0 || nb === 0) return 0;
  return dot / (Math.sqrt(na) * Math.sqrt(nb));
}

function hash(s: string): string { return createHash("sha256").update(s).digest("hex").slice(0, 10); }

async function embedBatch(texts: string[]): Promise<number[][]> {
  // Sidecar /embed accepts a list. On partial failure, retry individually.
  const r = await fetch(`${SIDECAR}/embed`, {
    method: "POST", headers: { "content-type": "application/json" },
    body: JSON.stringify({ texts }),
    signal: AbortSignal.timeout(120000),
  });
  if (!r.ok) throw new Error(`embed batch ${r.status}: ${await r.text()}`);
  const j: any = await r.json();
  return j.embeddings;
}

function chunkText(text: string): Array<{ text: string; offset: number }> {
  const out: Array<{ text: string; offset: number }> = [];
  let i = 0;
  while (i < text.length) {
    const end = Math.min(i + CHUNK_SIZE, text.length);
    const slice = text.slice(i, end).trim();
    if (slice.length > 50) out.push({ text: slice, offset: i });
    if (end >= text.length) break;
    i = end - CHUNK_OVERLAP;
  }
  return out;
}
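// e.g. a 2000-char PRD with CHUNK_SIZE=800 / CHUNK_OVERLAP=120 yields
// chunks at offsets 0, 680, and 1360: each window starts 680 chars after
// the last, so neighboring chunks share a 120-char seam.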

async function chat(opts: {
  provider: "ollama" | "ollama_cloud",
  model: string,
  messages: Array<{ role: string; content: string }>,
  max_tokens: number,
  think: boolean,
}): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; finish_reason: string }> {
  const r = await fetch(`${GATEWAY}/v1/chat`, {
    method: "POST", headers: { "content-type": "application/json" },
    body: JSON.stringify({ ...opts }),
    signal: AbortSignal.timeout(180000),
  });
  if (!r.ok) throw new Error(`/v1/chat ${r.status}: ${await r.text()}`);
  const j: any = await r.json();
  return {
    content: j.choices?.[0]?.message?.content ?? "",
    prompt_tokens: j.usage?.prompt_tokens ?? 0,
    completion_tokens: j.usage?.completion_tokens ?? 0,
    finish_reason: j.choices?.[0]?.finish_reason ?? "?",
  };
}

// ─── Tree-split over oversized chunk set ──────────────────────────
async function treeSplitSummarize(
  chunks: Chunk[],
  question: string,
): Promise<{ scratchpad: string; cloud_calls: number }> {
  // Shard into groups fitting within half the budget each.
  const perShard = Math.max(1, Math.floor((CONTEXT_BUDGET_CHARS / 2) / CHUNK_SIZE));
  const shards: Chunk[][] = [];
  for (let i = 0; i < chunks.length; i += perShard) {
    shards.push(chunks.slice(i, i + perShard));
  }
  log(`  tree-split: ${chunks.length} chunks → ${shards.length} shards of up to ${perShard}`);
  let scratchpad = "";
  let cloud_calls = 0;
  for (let si = 0; si < shards.length; si++) {
    const shard = shards[si];
    const shardText = shard.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n");
    const userMsg = `Question: ${question}\n\nShard ${si + 1}/${shards.length} of source material:\n\n${shardText}\n\nScratchpad so far:\n${scratchpad || "(empty)"}\n\nUpdate the scratchpad: extract only facts from THIS shard that help answer the question. Be terse. No prose.`;
    const r = await chat({
      provider: "ollama_cloud",
      model: CLOUD_MODEL,
      messages: [
        { role: "system", content: "You maintain a concise factual scratchpad across multiple shards of source text. No prose outside the scratchpad. Each shard, append ≤80 words of relevant facts." },
        { role: "user", content: userMsg },
      ],
      max_tokens: 500,
      think: false,
    });
    cloud_calls += 1;
    scratchpad += `\n--- shard ${si + 1} notes ---\n${r.content.trim()}`;
    if (scratchpad.length > CONTEXT_BUDGET_CHARS) {
      // keep the newest tail of the scratchpad; the oldest notes drop first
      scratchpad = scratchpad.slice(-CONTEXT_BUDGET_CHARS);
      log(`  tree-split: scratchpad truncated to ${scratchpad.length} chars`);
    }
  }
  return { scratchpad, cloud_calls };
}
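// With the defaults above, perShard = floor((4000 / 2) / 800) = 2, so the
// 12 retrieved chunks fold into 6 sequential shard calls and the final
// answer prompt sees only the distilled scratchpad, never the raw chunks.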

// ─── Continuable generate — up to max_continuations stitches ──────
//
// Two failure modes handled:
//   A) Empty response — typically thinking model burned the budget
//      on hidden reasoning. Retry the same prompt with the larger
//      CONTINUATION_MAX_TOKENS budget (2× the primary).
//   B) Truncated response (finish_reason=length) — answer got cut off
//      mid-sentence. Pass the partial back as an assistant turn and
//      ask the model to continue from where it stopped.
//
// Stitching: keep appending content across retries; prompt_tokens and
// completion_tokens accumulate; finish_reason reflects the LAST call.
// Loop exits on the first call that finishes cleanly (stop) with
// non-empty content, OR when retries hit the cap.
async function generateContinuable(
  opts: Parameters<typeof chat>[0] & { max_continuations?: number },
): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string }> {
  const maxCont = opts.max_continuations ?? 1;
  let total = await chat(opts);
  let retries = 0;
  while (retries < maxCont && (total.content.length === 0 || total.finish_reason === "length")) {
    retries += 1;
    const mode = total.content.length === 0 ? "empty" : "truncated";
    log(`  continuation retry ${retries}/${maxCont} (${mode}: finish=${total.finish_reason}, content=${total.content.length} chars)`);
    // Continuation prompt — branch on failure mode:
    //   empty  → retry with the bigger continuation budget, same prompt
    //   length → pass the partial as assistant turn, ask to continue
    const continuationMessages = total.content.length === 0
      ? opts.messages
      : [
          ...opts.messages,
          { role: "assistant", content: total.content },
          { role: "user", content: "Continue from exactly where you stopped. Do not repeat. Finish the answer." },
        ];
    const continued = await chat({
      ...opts,
      max_tokens: CONTINUATION_MAX_TOKENS,
      messages: continuationMessages,
    });
    total = {
      content: total.content + continued.content,
      prompt_tokens: total.prompt_tokens + continued.prompt_tokens,
      completion_tokens: total.completion_tokens + continued.completion_tokens,
      finish_reason: continued.finish_reason,
    };
  }
  return { ...total, continuation_retries: retries };
}
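// Worked example (illustrative): with PRIMARY_MAX_TOKENS=150, a call that
// returns 150 tokens with finish_reason="length" is stitched with up to
// MAX_CONTINUATIONS follow-ups of 300 tokens each, so one logical answer
// can span roughly 150 + 6×300 tokens before the cap forces a return.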

// ─── Single iteration: retrieve → budget-check → chat → seed ─────
async function runIteration(
  iteration: number,
  question: string,
  allChunks: Chunk[],
  priorPlaybookIds: string[],
  priorAnswers: string[],
): Promise<IterationResult> {
  const started = Date.now();
  const errorsRecovered: string[] = [];
  log(`iter ${iteration}: "${question.slice(0, 70)}..."`);

  // 1. Embed the question
  const qEmb = (await embedBatch([question]))[0];

  // 2. Retrieve top-K chunks by cosine
  const scored = allChunks
    .map(c => ({ c, score: cosine(qEmb, c.embedding) }))
    .sort((a, b) => b.score - a.score)
    .slice(0, TOP_K_RETRIEVE);
  const chunks = scored.map(x => x.c);
  log(`  retrieved top ${chunks.length} chunks (score range ${scored[0].score.toFixed(3)} .. ${scored[scored.length - 1].score.toFixed(3)})`);

  // 3. Context budget check — tree-split if over
  const contextChars = chunks.map(c => c.text).join("\n\n").length;
  let contextForPrompt: string;
  let treeSplit = false;
  let cloudCallsTotal = 0;
  if (contextChars > CONTEXT_BUDGET_CHARS) {
    treeSplit = true;
    log(`  context ${contextChars} chars > budget ${CONTEXT_BUDGET_CHARS} → tree-split`);
    const { scratchpad, cloud_calls } = await treeSplitSummarize(chunks, question);
    contextForPrompt = `Distilled scratchpad from ${chunks.length} source chunks (too large to fit directly):\n${scratchpad}`;
    cloudCallsTotal += cloud_calls;
  } else {
    contextForPrompt = chunks.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n");
  }

  // 4. Seed prompt with prior iteration answers (real compounding).
  //    Not just IDs — the model needs the CONTENT to synthesize.
  let citationBlock = "";
  let citationsReceived: string[] = [];
  if (priorPlaybookIds.length > 0 && priorAnswers.length > 0) {
    const lines = priorAnswers.map((ans, i) => {
      const pid = priorPlaybookIds[i]?.slice(0, 12) ?? "unknown";
      // Trim each prior answer to ~400 chars so we don't blow budget
      return `[pb:${pid}] iter ${i + 1} answer:\n${ans.slice(0, 400)}\n`;
    });
    citationBlock = `\n\n═══ PRIOR ITERATIONS (compounding context) ═══\n${lines.join("\n")}═══ end prior iterations ═══\n\nYour answer MUST cite specific prior iterations using [pb:ID] notation when drawing on them. Synthesis questions require explicit cross-iteration reasoning.`;
    citationsReceived = priorPlaybookIds.slice();
  }

  // 5. TASK-LEVEL RETRY LOOP — per J's clarification 2026-04-22.
  //    Try the task up to MAX_TASK_RETRIES times. Each retry:
  //      a) Picks a model (normally CLOUD_MODEL; on INJECT_FAIL_ON_ITER,
  //         cycles through 5 invalid models + 1 valid to force full loop)
  //      b) Injects prior attempt errors as learning context
  //      c) If the attempt succeeds (non-empty, >100 chars), loop exits
  //      d) Otherwise, records failure and tries again with the learning
  //
  //    Cap at 6 so we don't spin forever on unsolvable tasks.
  let result: { content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string } | null = null;
  let rescueTriggered = false;
  const taskAttemptHistory: Array<{ n: number; model: string; error: string }> = [];
  const forceRetries = iteration === INJECT_FAIL_ON_ITER;
  if (forceRetries) log(`  FORCING TASK-RETRY LOOP — iter ${iteration} will cycle through 5 invalid models + 1 valid`);

  for (let attempt = 1; attempt <= MAX_TASK_RETRIES; attempt++) {
    const modelForAttempt = forceRetries
      ? FORCE_RETRY_MODEL_SEQUENCE[attempt - 1]
      : CLOUD_MODEL;
    // Compose a prior-attempts learning block for attempts 2+
    const learningBlock = taskAttemptHistory.length > 0
      ? `\n\n═══ PRIOR ATTEMPTS THIS TASK (do NOT repeat these failures; adjust approach) ═══\n${taskAttemptHistory.map(a => `Attempt ${a.n} (model ${a.model}) failed: ${a.error.slice(0, 160)}`).join("\n")}\n═══ end prior attempts ═══\n`
      : "";
    log(`  task attempt ${attempt}/${MAX_TASK_RETRIES}: model=${modelForAttempt}${learningBlock ? " [with prior-failure context]" : ""}`);
    try {
      const r = await generateContinuable({
        provider: "ollama_cloud",
        model: modelForAttempt,
        messages: [
          { role: "system", content: "You answer questions about the Lakehouse PRD using only the provided source material and prior iteration answers. Be specific. Cite chunk offsets OR [pb:ID] markers. Write a detailed 250-word answer." },
          { role: "user", content: `Question: ${question}\n\nSource material:\n${contextForPrompt}${citationBlock}${learningBlock}` },
        ],
        max_tokens: PRIMARY_MAX_TOKENS,
        think: false,
        max_continuations: MAX_CONTINUATIONS,
      });
      cloudCallsTotal += 1 + r.continuation_retries;
      if (r.content && r.content.length > 100) {
        // Acceptable answer — exit loop
        result = r;
        if (attempt > 1) {
          log(`  task attempt ${attempt} SUCCEEDED (${r.content.length} chars) after ${attempt - 1} prior failures`);
          rescueTriggered = true;
        }
        break;
      }
      // Thin response — count as failure with learning signal
      const err = `thin-answer: ${r.content.length} chars, finish=${r.finish_reason}`;
      taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
      errorsRecovered.push(`attempt ${attempt}: ${err}`);
    } catch (e) {
      const err = (e as Error).message;
      taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
      errorsRecovered.push(`attempt ${attempt}: ${err.slice(0, 120)}`);
      cloudCallsTotal += 1;
    }
  }

  // Last-ditch: if all 6 task attempts failed, try the local fallback
  // once more so we at least return SOMETHING. This is the "don't get
  // caught in a loop, accept best-so-far" rule J stated explicitly.
  if (!result) {
    errorsRecovered.push(`all ${MAX_TASK_RETRIES} task attempts failed — local fallback`);
    rescueTriggered = true;
    try {
      result = await generateContinuable({
        provider: "ollama",
        model: "qwen3.5:latest",
        messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
        max_tokens: 300,
        think: false,
        max_continuations: 2,
      });
      cloudCallsTotal += 1 + result.continuation_retries;
    } catch (e) {
      // Absolute last resort — fabricate a skeleton result
      result = {
        content: `[task failed after ${MAX_TASK_RETRIES} retries + local fallback: ${(e as Error).message}]`,
        prompt_tokens: 0,
        completion_tokens: 0,
        continuation_retries: 0,
        finish_reason: "error",
      };
    }
  }
  if (result.content.length === 0) {
    errorsRecovered.push("even rescue returned empty — last-ditch local fallback");
    rescueTriggered = true;
    result = await generateContinuable({
      provider: "ollama",
      model: "qwen3.5:latest",
      messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
      max_tokens: 300,
      think: false,
    });
    cloudCallsTotal += 1;
  }

  // 6. Seed playbook with the answer
  let playbook_id: string | null = null;
  try {
    const ts = new Date().toISOString();
    const seedOp = `TEST: enrich_prd_run_${RUN_NONCE} iter${iteration} in Corpus, PRD`;
    const r = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
      method: "POST", headers: { "content-type": "application/json" },
      body: JSON.stringify({
        operation: seedOp,
        approach: `q="${question.slice(0, 80)}" context_chars=${contextChars} tree_split=${treeSplit}`,
        context: result.content.slice(0, 600),
        endorsed_names: [`iter${iteration}_${RUN_NONCE}`],
        append: true,
      }),
      signal: AbortSignal.timeout(15000),
    });
    if (r.ok) {
      const j: any = await r.json();
      playbook_id = j.outcome?.playbook_id ?? null;
    } else {
      errorsRecovered.push(`seed ${r.status}: ${(await r.text()).slice(0, 100)}`);
    }
  } catch (e) {
    errorsRecovered.push(`seed exception: ${(e as Error).message}`);
  }

  return {
    iteration,
    question,
    retrieval_top_k: chunks.length,
    context_chars_before_budget: contextChars,
    tree_split_fired: treeSplit,
    cloud_calls_total: cloudCallsTotal,
    continuation_retries: result.continuation_retries,
    rescue_triggered: rescueTriggered,
    task_attempts_made: taskAttemptHistory.length + 1, // recorded failures + the attempt that finally produced a result
    task_retry_history: taskAttemptHistory,
    playbook_id,
    tokens_prompt: result.prompt_tokens,
    tokens_completion: result.completion_tokens,
    citations_from_prior_iterations: citationsReceived,
    duration_ms: Date.now() - started,
    answer_preview: result.content.slice(0, 500),
    errors_recovered: errorsRecovered,
  };
}

async function main() {
  await mkdir(OUT_DIR, { recursive: true });
  log(`run nonce: ${RUN_NONCE}`);
  log(`output dir: ${OUT_DIR}`);

  // ─── Phase 1: load, chunk, embed the PRD ───────────────────────
  log(`loading PRD from ${PRD_PATH}`);
  const prd = await readFile(PRD_PATH, "utf8");
  log(`PRD: ${prd.length} chars, ${prd.split("\n").length} lines`);

  const raw_chunks = chunkText(prd);
  log(`chunked into ${raw_chunks.length} pieces (size ${CHUNK_SIZE}, overlap ${CHUNK_OVERLAP})`);

  // Embed in batches of 32 to avoid sidecar overload
  const allChunks: Chunk[] = [];
  const BATCH = 32;
  const t0 = Date.now();
  for (let i = 0; i < raw_chunks.length; i += BATCH) {
    const batch = raw_chunks.slice(i, i + BATCH);
    const embs = await embedBatch(batch.map(b => b.text));
    for (let j = 0; j < batch.length; j++) {
      allChunks.push({
        id: hash(batch[j].text),
        text: batch[j].text,
        embedding: embs[j].map(x => Number(x)),
        offset: batch[j].offset,
      });
    }
    log(`  embedded ${allChunks.length}/${raw_chunks.length}`);
  }
  log(`embedded all ${allChunks.length} chunks in ${((Date.now() - t0) / 1000).toFixed(1)}s`);

  // ─── Phase 2: 6 iterations ─────────────────────────────────────
  const results: IterationResult[] = [];
  const priorIds: string[] = [];
  const priorAnswers: string[] = [];
  for (let i = 1; i <= QUESTIONS.length; i++) {
    const q = QUESTIONS[i - 1];
    const r = await runIteration(i, q, allChunks, priorIds, priorAnswers);
    results.push(r);
    if (r.playbook_id) priorIds.push(r.playbook_id);
    priorAnswers.push(r.answer_preview);
    log(`  → iter ${i}: ${r.errors_recovered.length} errors recovered, ${r.continuation_retries} continuations, tree-split=${r.tree_split_fired}, rescue=${r.rescue_triggered}, ${r.duration_ms}ms`);
    await writeFile(`${OUT_DIR}/iter_${i}.json`, JSON.stringify(r, null, 2));
  }
  // Check whether iter 6 actually cited prior pb:IDs in its answer.
  // Playbook IDs look like `pb-seed-<hex>` so the regex needs to allow
  // hyphens + letters inside the brackets, not just hex chars.
  const iter6 = results[5];
  const citationsHonored = iter6 ? (iter6.answer_preview.match(/\[pb:[\w-]+\]/gi)?.length ?? 0) : 0;

  // ─── Phase 3: summary ──────────────────────────────────────────
  const summary = {
    run_nonce: RUN_NONCE,
    ran_at: new Date().toISOString(),
    prd_chars: prd.length,
    prd_chunks: allChunks.length,
    iterations: results.length,
    total_cloud_calls: results.reduce((s, r) => s + r.cloud_calls_total, 0),
    total_continuation_retries: results.reduce((s, r) => s + r.continuation_retries, 0),
    total_errors_recovered: results.reduce((s, r) => s + r.errors_recovered.length, 0),
    tree_splits_fired: results.filter(r => r.tree_split_fired).length,
    rescues_triggered: results.filter(r => r.rescue_triggered).length,
    iter6_received_prior_ids: results[5]?.citations_from_prior_iterations.length ?? 0,
    iter6_actually_cited_in_answer: citationsHonored,
    iter3_task_attempts: results[2]?.task_attempts_made ?? 0,
    iter3_task_retries: results[2]?.task_retry_history.length ?? 0,
    max_task_attempts_any_iter: Math.max(...results.map(r => r.task_attempts_made)),
    total_duration_ms: results.reduce((s, r) => s + r.duration_ms, 0),
    overall: results.length === 6 && results.every(r => r.playbook_id !== null) ? "PASS" : "PARTIAL",
  };
  await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));

  log("");
  log(`══════ SUMMARY ${summary.overall} ══════`);
  log(`  6 iterations, ${summary.total_cloud_calls} cloud calls, ${summary.total_errors_recovered} errors recovered`);
  log(`  tree-splits: ${summary.tree_splits_fired}/6  continuations: ${summary.total_continuation_retries}  rescues: ${summary.rescues_triggered}`);
  log(`  iter 6 received ${summary.iter6_received_prior_ids} prior IDs, cited ${summary.iter6_actually_cited_in_answer} [pb:...] markers in its answer`);
  log(`  iter 3 task-retry loop: ${summary.iter3_task_attempts} attempts (${summary.iter3_task_retries} prior-failure retries before success)`);
  log(`  total duration: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
  log("");
  for (const r of results) {
    const flags = [
      r.tree_split_fired ? "tree-split" : "",
      r.continuation_retries > 0 ? `cont=${r.continuation_retries}` : "",
      r.rescue_triggered ? "rescued" : "",
      r.errors_recovered.length > 0 ? `err=${r.errors_recovered.length}` : "",
    ].filter(Boolean).join(" ");
    log(`  iter ${r.iteration}: ${r.tokens_prompt}+${r.tokens_completion} tok, ${r.duration_ms}ms ${flags ? `[${flags}]` : ""}`);
  }
  log("");
  log(`artifacts: ${OUT_DIR}/{iter_1..6.json, summary.json}`);
  process.exit(summary.overall === "PASS" ? 0 : 1);
}

main().catch(e => { console.error("[enrich] fatal:", e); process.exit(2); });
tests/real-world/hard_task_escalation.ts
@@ -1,267 +0,0 @@
// Hard-task escalation test. The task is deliberately constructed so
// that a local 7B model (qwen3.5:latest) will miss at least one of the
// validation rules. Watch the escalation ladder (see LADDER below):
//   1. qwen3.5:latest (local 7B) — likely fails
//   2. qwen3:latest (local 7B) — likely fails differently
//   3. gpt-oss:20b (local 20B) — may fail
//   4. gpt-oss:120b (cloud 120B) — should get close
//   5. devstral-2:123b (cloud 123B coding specialist)
//   6. mistral-large-3:675b (cloud 675B): absolute last rung; if it
//      fails too, the run ends with best-so-far artifacts on disk
//
// Each attempt:
//   - Calls the model via /v1/chat
//   - Validates the output against a strict rubric
//   - On fail: records the specific rubric violations + the partial
//     output, injects both into the next attempt's prompt as "here's
//     what's wrong, fix it specifically"
//   - On success: exit loop
//
// Run: bun run tests/real-world/hard_task_escalation.ts

import { writeFile, mkdir } from "node:fs/promises";

const GATEWAY = "http://localhost:3100";
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/hard_task_${Date.now().toString(36)}`;
const MAX_ATTEMPTS = 6;

// The hard task. Specific enough that a small model will miss at
// least one rule. Not purely knowledge-based — it's a code-generation
// task with strict structural constraints.
const TASK = `Write a complete Rust async function with the EXACT signature:

pub async fn check_drift_batched(refs: Vec<DocRef>) -> Result<Vec<String>, String>

It must:
1. Group refs by tool name (case-insensitive — use .to_ascii_lowercase())
2. Issue parallel HTTP GET requests to http://localhost:3900/docs/{tool}/diff?since={snippet_hash}
3. Use reqwest and a JoinSet/Semaphore to cap concurrent in-flight requests at 4
4. On HTTP 5xx, retry with exponential backoff: sleep 250ms, then 500ms, then 1000ms, then give up on that tool
5. Parse the response JSON: {"drifted": bool, ...}. Return a Vec<String> of tool names where drifted == true
6. All errors bubble via ? or Result — NO .unwrap(), NO .expect(), NO panic!()
7. Include rustdoc /// comments on the function and each internal helper

Assume this struct is already imported:

pub struct DocRef { pub tool: String, pub snippet_hash: Option<String>, pub version_seen: String }

Output ONLY the Rust code. No prose, no markdown fences, no explanation. Start directly with the /// doc comment.`;

// Escalation ladder — small-local → large-local → cloud → specialist
// cloud → trillion-param cloud. Corrected 2026-04-22 per J:
// gpt-oss:20b is LOCAL (ollama list confirms 13 GB on disk), and the
// final escalation tier should be kimi-k2:1t (the biggest model
// we have access to on Ollama Cloud).
const LADDER: Array<{ provider: "ollama" | "ollama_cloud"; model: string; note: string }> = [
  { provider: "ollama", model: "qwen3.5:latest", note: "local 7B" },
  { provider: "ollama", model: "qwen3:latest", note: "local 7B (different)" },
  { provider: "ollama", model: "gpt-oss:20b", note: "local 20B" }, // FIXED: local, not cloud
  { provider: "ollama_cloud", model: "gpt-oss:120b", note: "cloud 120B" },
  { provider: "ollama_cloud", model: "devstral-2:123b", note: "cloud 123B (coding specialist)" },
  // NOTE 2026-04-22 — J wanted Kimi as the last escalation but Kimi
  // K2.5/K2.6 both return "this model requires a subscription" on our
  // current Ollama Cloud key. mistral-large-3:675b is the biggest
  // model actually provisioned on this key (verified via direct curl
  // to ollama.com/api/generate). Upgrade path: Ollama Cloud Pro →
  // swap this line to kimi-k2.5 or kimi-k2.6:cloud.
  { provider: "ollama_cloud", model: "mistral-large-3:675b", note: "cloud 675B (biggest available on current key; kimi-k2.x needs pro subscription)" },
];
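// Attempts beyond the ladder's length clamp to the last rung (see
// `LADDER[i] ?? LADDER[LADDER.length - 1]` in main), so with
// MAX_ATTEMPTS=6 and 6 rungs each attempt maps 1:1 onto a rung.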

// Validation rubric — the answer must pass all of these to be accepted.
interface RubricResult {
  passed: boolean;
  violations: string[];
  passed_rules: string[];
}

function validate(code: string): RubricResult {
  const violations: string[] = [];
  const passed: string[] = [];

  const check = (rule: string, ok: boolean) => { ok ? passed.push(rule) : violations.push(rule); };

  check("has pub async fn check_drift_batched signature",
    /pub\s+async\s+fn\s+check_drift_batched\s*\(/.test(code));
  check("takes Vec<DocRef> argument",
    /refs\s*:\s*Vec\s*<\s*DocRef\s*>/.test(code));
  check("returns Result<Vec<String>, String>",
    /Result\s*<\s*Vec\s*<\s*String\s*>\s*,\s*String\s*>/.test(code));
  check("uses reqwest",
    /\breqwest\b/i.test(code));
  check("references JoinSet or Semaphore for concurrency",
    /\bJoinSet\b|\bSemaphore\b/i.test(code));
  check("bounds concurrency at 4",
    /\b4\b/.test(code) && (/Semaphore\s*::\s*new\s*\(\s*4\b/.test(code) || /permits\s*:\s*4\b/.test(code) || /limit\s*:\s*4\b/.test(code) || /max\s*:\s*4\b/.test(code) || /capacity\s*:\s*4\b/.test(code)));
  // Exponential backoff — models express this several ways. Accept
  // any recognizable doubling pattern starting at 250ms. 2026-04-22:
  // devstral-2:123b wrote `retry_delay *= 2` which my earlier regex
  // rejected even though the code is correct. Broadening rubric to
  // match all idiomatic doubling forms.
  const hasSeed250 = /Duration\s*::\s*from_millis\s*\(\s*250\b/.test(code)
    || /millis\s*\(\s*250\b/.test(code);
  const hasDoublingPattern = /250\s*\*\s*2/.test(code)  // 250 * 2^n literal
    || /<<\s*\d+/.test(code)                            // bit-shift
    || /\.pow\s*\(/.test(code)                          // 2u32.pow(attempt)
    || /\*=\s*2\b/.test(code)                           // delay *= 2  ← was missing
    || /\*\s*2\s*;/.test(code)                          // delay = delay * 2;
    || /saturating_mul\s*\(\s*2\b/.test(code);          // saturating doubling
  check("has 250ms backoff seed",
    hasSeed250);
  check("reaches 500ms backoff (literal or doubling from 250)",
    /Duration\s*::\s*from_millis\s*\(\s*500\b/.test(code)
    || /millis\s*\(\s*500\b/.test(code)
    || (hasSeed250 && hasDoublingPattern));
  check("reaches 1000ms backoff (literal or doubling to 1000)",
    /Duration\s*::\s*from_millis\s*\(\s*1000\b/.test(code)
    || /millis\s*\(\s*1000\b/.test(code)
    || (hasSeed250 && hasDoublingPattern));
  check("case-insensitive tool grouping (to_ascii_lowercase)",
    /to_ascii_lowercase|to_lowercase/.test(code));
  check("NO .unwrap() — all errors bubble via ?",
    !/\.unwrap\s*\(\s*\)/.test(code));
  check("NO .expect(...) — all errors bubble via ?",
    !/\.expect\s*\(/.test(code));
  check("NO panic!() / unimplemented!() / todo!()",
    !/\bpanic!\s*\(|\bunimplemented!\s*\(|\btodo!\s*\(/.test(code));
  check("has rustdoc /// comments",
    /\/\/\//.test(code));
  check("reasonable length (> 500 chars)",
    code.length > 500);

  return { passed: violations.length === 0, violations, passed_rules: passed };
}
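// Quick sanity example: a submission containing `resp.text().unwrap()` trips
// the "NO .unwrap()" rule even if every structural rule passes, so validate()
// returns passed: false with that single violation listed.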
|
||||
|
||||
function log(msg: string) { console.log(`[hard] ${msg}`); }
|
||||
|
||||
async function chat(opts: {
|
||||
provider: "ollama" | "ollama_cloud",
|
||||
model: string,
|
||||
prompt: string,
|
||||
}): Promise<{ content: string; error?: string }> {
|
||||
try {
|
||||
const r = await fetch(`${GATEWAY}/v1/chat`, {
|
||||
method: "POST", headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
provider: opts.provider,
|
||||
model: opts.model,
|
||||
messages: [{ role: "user", content: opts.prompt }],
|
||||
max_tokens: 2500,
|
||||
temperature: 0.2,
|
||||
think: false,
|
||||
}),
|
||||
signal: AbortSignal.timeout(240000),
|
||||
});
|
||||
if (!r.ok) return { content: "", error: `/v1/chat ${r.status}: ${(await r.text()).slice(0, 300)}` };
|
||||
const j: any = await r.json();
|
||||
return { content: j.choices?.[0]?.message?.content ?? "" };
|
||||
} catch (e) {
|
||||
return { content: "", error: (e as Error).message };
|
||||
}
|
||||
}

interface AttemptRecord {
  n: number;
  provider: string;
  model: string;
  duration_ms: number;
  content_chars: number;
  error: string | null;
  rubric_violations: string[];
  rubric_passed: string[];
  accepted: boolean;
}

function extractCode(raw: string): string {
  // Strip common fence wrappers
  const m = raw.match(/```(?:rust)?\s*\n([\s\S]*?)```/);
  if (m) return m[1].trim();
  return raw.trim();
}
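// e.g. (hypothetical response): extractCode("Sure:\n```rust\nfn main() {}\n```")
// → "fn main() {}". Responses without a fence pass through trimmed.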

async function main() {
  await mkdir(OUT_DIR, { recursive: true });
  log(`output: ${OUT_DIR}`);
  log(`task: ${TASK.slice(0, 120)}...`);
  log("");

  const attempts: AttemptRecord[] = [];
  let acceptedCode: string | null = null;

  for (let i = 0; i < MAX_ATTEMPTS; i++) {
    const n = i + 1;
    const rung = LADDER[i] ?? LADDER[LADDER.length - 1];

    // Build the prompt: base task + prior failures' learning blocks
    let priorLearning = "";
    if (attempts.length > 0) {
      priorLearning = `\n\n═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══\n`;
      for (const a of attempts) {
        priorLearning += `Attempt ${a.n} (${a.provider}/${a.model}, ${a.content_chars} chars) violations:\n`;
        for (const v of a.rubric_violations) priorLearning += ` - ${v}\n`;
        if (a.error) priorLearning += ` [error: ${a.error.slice(0, 120)}]\n`;
      }
      priorLearning += `═══ end prior attempts ═══\n\nDO NOT repeat the above violations. Address each one explicitly.`;
    }
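    // Rendered learning block (hypothetical values for illustration):
    //   ═══ PRIOR ATTEMPTS FAILED. Fix these exact issues: ═══
    //   Attempt 1 (ollama/some-model:7b, 1843 chars) violations:
    //    - NO .unwrap() — all errors bubble via ?
    //   ═══ end prior attempts ═══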

    log(`attempt ${n}/${MAX_ATTEMPTS}: ${rung.provider}::${rung.model}${priorLearning ? " [w/ learning]" : ""}`);
    const t0 = Date.now();
    const r = await chat({ provider: rung.provider, model: rung.model, prompt: TASK + priorLearning });
    const dur = Date.now() - t0;

    const code = extractCode(r.content);
    const rubric = code ? validate(code) : { passed: false, violations: ["empty response"], passed_rules: [] };

    const record: AttemptRecord = {
      n,
      provider: rung.provider,
      model: rung.model,
      duration_ms: dur,
      content_chars: code.length,
      error: r.error ?? null,
      rubric_violations: rubric.violations,
      rubric_passed: rubric.passed_rules,
      accepted: rubric.passed,
    };
    attempts.push(record);

    log(` → ${dur}ms, ${code.length} chars, ${rubric.passed_rules.length} rules passed / ${rubric.violations.length} failed${r.error ? `, err: ${r.error.slice(0, 80)}` : ""}`);
    for (const v of rubric.violations.slice(0, 5)) log(` ✗ ${v}`);

    await writeFile(`${OUT_DIR}/attempt_${n}.txt`, code);
    await writeFile(`${OUT_DIR}/attempt_${n}.json`, JSON.stringify(record, null, 2));

    if (rubric.passed) {
      log(` ✅ ACCEPTED on attempt ${n}`);
      acceptedCode = code;
      break;
    }
  }

  const summary = {
    task: TASK.slice(0, 200),
    total_attempts: attempts.length,
    accepted: acceptedCode !== null,
    accepted_on_attempt: acceptedCode ? attempts.findIndex(a => a.accepted) + 1 : null,
    escalation_path: attempts.map(a => `${a.provider}/${a.model}`),
    per_attempt_pass_counts: attempts.map(a => a.rubric_passed.length),
    per_attempt_violation_counts: attempts.map(a => a.rubric_violations.length),
    total_duration_ms: attempts.reduce((s, a) => s + a.duration_ms, 0),
  };
  await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));

  log("");
  log(`═══ RESULT ═══`);
  log(`attempts: ${summary.total_attempts}`);
  log(`accepted: ${summary.accepted} ${summary.accepted ? `on attempt ${summary.accepted_on_attempt}` : ""}`);
  log(`escalation path:`);
  for (const [i, a] of attempts.entries()) {
    const mark = a.accepted ? "✅" : "❌";
    log(` ${mark} attempt ${i + 1}: ${a.provider}/${a.model} — ${a.rubric_passed.length}/${a.rubric_passed.length + a.rubric_violations.length} rules passed, ${a.duration_ms}ms`);
  }
  log("");
  log(`total time: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
  log(`artifacts: ${OUT_DIR}/{attempt_1..N.{txt,json}, summary.json}`);

  process.exit(summary.accepted ? 0 : 1);
}

main().catch(e => { console.error("[hard] fatal:", e); process.exit(2); });
tests/real-world/nine_consecutive_audits.ts
@@ -1,186 +0,0 @@
// Nine-consecutive audit runner — empirical test of the predictive-
// compounding property. Runs the audit pipeline 9 times against the
// same PR (each time with a new diff from Gitea), captures the
// verdict + audit_lessons state after each run, and reports whether
// the KB stabilizes or drifts.
//
// What we expect (favorable compounding):
//  - signature_count grows sublinearly (same patterns recur, so
//    distinct-signature count stabilizes fast)
//  - verdict settles on a stable value after run 2-3 (first audit
//    establishes baseline, rest repeat)
//  - confidence stays LOW for all signatures (same PR repeatedly)
//  - NO new recurring findings fire because confidence < 0.3 on
//    same-PR noise (kb_index rating policy)
//
// What would indicate drift (the thing we want to prove DOESN'T happen):
//  - signature_count grows linearly — each run produces new signatures
//  - verdict oscillates (block → approve → block ...)
//  - confidence inflates — kb_index rating escalates on repeated runs
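//
// Illustration only (hypothetical numbers, not from a real run):
//   compounding: sig_count per run ≈ 12, 14, 15, 15, 15, 15 ... (plateaus)
//   drift:       sig_count per run ≈ 12, 24, 36, 48 ...         (linear)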
//
// Run: bun run tests/real-world/nine_consecutive_audits.ts

import { readFile, writeFile } from "node:fs/promises";
import { join } from "node:path";
import { aggregate } from "../../auditor/kb_index.ts";
import { getPrSnapshot } from "../../auditor/gitea.ts";
import { auditPr } from "../../auditor/audit.ts";

const REPO = "/home/profit/lakehouse";
const AUDIT_LESSONS = `${REPO}/data/_kb/audit_lessons.jsonl`;
const VERDICTS_DIR = `${REPO}/data/_auditor/verdicts`;
const POLL_INTERVAL_MS = 5_000;
const RUNS = Number(process.env.LH_AUDIT_RUNS ?? 9);
const TARGET_PR = Number(process.env.LH_AUDIT_PR ?? 8);
const SKIP_INFERENCE = process.env.LH_AUDITOR_SKIP_INFERENCE !== "0";
const RESET_KB = process.env.LH_RESET_KB === "1";

async function waitForVerdict(prNum: number, sha: string, deadlineMs: number): Promise<any> {
  const short = sha.slice(0, 12);
  const path = join(VERDICTS_DIR, `${prNum}-${short}.json`);
  const start = Date.now();
  while (Date.now() - start < deadlineMs) {
    try {
      const raw = await readFile(path, "utf8");
      return JSON.parse(raw);
    } catch { /* not yet */ }
    await Bun.sleep(POLL_INTERVAL_MS);
  }
  throw new Error(`no verdict file after ${deadlineMs}ms: ${path}`);
}

async function captureAggState(): Promise<{
  sig_count: number;
  max_count: number;
  max_confidence: number;
  recurring_max_count: number;
  top3: Array<{ sig: string; count: number; conf: number; summary: string }>;
}> {
  const agg = await aggregate<any>(AUDIT_LESSONS, {
    keyFn: (r) => r?.signature,
    scopeFn: (r) => (r?.pr_number !== undefined ? `pr-${r.pr_number}` : undefined),
  });
  const list = Array.from(agg.values()).sort((a, b) => b.count - a.count);
  const recurring = list.filter(r => r.count >= 2);
  const recurringMaxCount = recurring.length > 0 ? Math.max(...recurring.map(a => a.count)) : 0;
  const recurringMaxConf = recurring.length > 0 ? Math.max(...recurring.map(a => a.confidence)) : 0;
  return {
    sig_count: list.length,
    max_count: list[0]?.count ?? 0,
    max_confidence: recurringMaxConf,
    recurring_max_count: recurringMaxCount,
    top3: list.slice(0, 3).map(a => ({
      sig: a.signature,
      count: a.count,
      conf: a.confidence,
      summary: a.representative_summary.slice(0, 80),
    })),
  };
}
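
// aggregate() is fed audit_lessons.jsonl records assumed to carry at least a
// `signature` field and, optionally, a `pr_number`, e.g. (hypothetical):
//   {"signature":"unchecked-unwrap","pr_number":8,"summary":"..."}
// keyFn groups lessons by signature; scopeFn buckets recurrences per PR.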

interface RunRecord {
  run: number;
  sha: string;
  verdict_overall: string;
  findings_total: number;
  findings_block: number;
  findings_warn: number;
  findings_info: number;
  audit_duration_ms: number;
  claims_total: number;
  claims_empirical: number;
  kb_sig_count_after: number;
  kb_max_count_after: number;
  kb_max_confidence_after: number;
  kb_recurring_max_count: number;
}

async function main() {
  console.log(`[nine] target PR: #${TARGET_PR}`);
  console.log(`[nine] runs: ${RUNS}`);
  console.log(`[nine] skip_inference: ${SKIP_INFERENCE}`);
  console.log(`[nine] reset_kb: ${RESET_KB}`);
  console.log(`[nine] audit_lessons.jsonl: ${AUDIT_LESSONS}`);

  if (RESET_KB) {
    console.log("[nine] clearing audit_lessons.jsonl for clean test...");
    await writeFile(AUDIT_LESSONS, "");
  }
  console.log("");

  const pr = await getPrSnapshot(TARGET_PR);
  console.log(`[nine] PR #${pr.number}: "${pr.title}" (head=${pr.head_sha.slice(0, 12)})`);
  console.log(`[nine] files in diff: ${pr.files.length}`);
  console.log("");

  const baseline = await captureAggState();
  console.log(`[nine] baseline: sig_count=${baseline.sig_count} max_count=${baseline.max_count} max_conf=${baseline.max_confidence.toFixed(2)}`);
  console.log("");

  const records: RunRecord[] = [];
  for (let n = 1; n <= RUNS; n++) {
    const t0 = Date.now();
    console.log(`─── run ${n}/${RUNS} ───`);

    const verdict = await auditPr(pr, {
      dry_run: true,
      skip_dynamic: true,
      skip_inference: SKIP_INFERENCE,
    });

    console.log(` sha ${verdict.head_sha.slice(0, 12)}`);
    const after = await captureAggState();
    const rec: RunRecord = {
      run: n,
      sha: verdict.head_sha.slice(0, 12),
      verdict_overall: String(verdict.overall),
      findings_total: Number(verdict.metrics?.findings_total ?? 0),
      findings_block: Number(verdict.metrics?.findings_block ?? 0),
      findings_warn: Number(verdict.metrics?.findings_warn ?? 0),
      findings_info: Number(verdict.metrics?.findings_info ?? 0),
      audit_duration_ms: Number(verdict.metrics?.audit_duration_ms ?? 0),
      claims_total: Number(verdict.metrics?.claims_total ?? 0),
      claims_empirical: Number(verdict.metrics?.claims_empirical ?? 0),
      kb_sig_count_after: after.sig_count,
      kb_max_count_after: after.max_count,
      kb_max_confidence_after: after.max_confidence,
      kb_recurring_max_count: after.recurring_max_count,
    };
    records.push(rec);
    console.log(` verdict=${rec.verdict_overall} findings=${rec.findings_total} (b=${rec.findings_block} w=${rec.findings_warn})`);
    console.log(` kb after: sig=${rec.kb_sig_count_after} max_count=${rec.kb_max_count_after} recurring_max=${rec.kb_recurring_max_count} max_conf=${rec.kb_max_confidence_after.toFixed(2)}`);
    console.log(` elapsed: ${((Date.now() - t0) / 1000).toFixed(1)}s`);
    console.log("");
  }

  console.log("═══ FINAL ═══");
  console.log("run | verdict | find | block warn info | dur_s | kb_sig max_count max_conf");
  for (const r of records) {
    console.log(
      ` ${String(r.run).padStart(1)} | ${r.verdict_overall.padEnd(16)} | ${String(r.findings_total).padStart(4)} | ${String(r.findings_block).padStart(5)} ${String(r.findings_warn).padStart(5)} ${String(r.findings_info).padStart(5)} | ${(r.audit_duration_ms / 1000).toFixed(1).padStart(5)} | ${String(r.kb_sig_count_after).padStart(6)} ${String(r.kb_max_count_after).padStart(9)} ${r.kb_max_confidence_after.toFixed(2)}`,
    );
  }

  console.log("");
  console.log("═══ COMPOUNDING PROPERTY ═══");
  const sigDelta = records[records.length - 1].kb_sig_count_after - baseline.sig_count;
  const maxConf = records[records.length - 1].kb_max_confidence_after;
  const recurringMax = records[records.length - 1].kb_recurring_max_count;
  console.log(` signatures added over ${RUNS} runs: ${sigDelta}`);
  console.log(` max recurring count after run ${RUNS}: ${recurringMax} (same-PR recurrences per signature)`);
  console.log(` max confidence after run ${RUNS}: ${maxConf.toFixed(2)} (expect LOW — same-PR should not inflate)`);

  const verdictSet = new Set(records.map(r => r.verdict_overall));
  if (verdictSet.size === 1) {
    console.log(` verdict stable: all ${RUNS} runs returned '${[...verdictSet][0]}' ✓`);
  } else {
    console.log(` verdict oscillated across runs: ${[...verdictSet].join(" | ")} ✗`);
  }

  if (maxConf < 0.6 && recurringMax < 5) {
    console.log(` confidence policy holding: same-PR noise stays below escalation threshold ✓`);
  } else {
    console.log(` ⚠ cross-cutting pattern detected (conf=${maxConf.toFixed(2)}, recurring=${recurringMax}) — kb_index policy escalated`);
  }

  const jsonOut = `${REPO}/tests/real-world/runs/nine_consecutive_${Date.now().toString(36)}.json`;
  await Bun.write(jsonOut, JSON.stringify({ target_pr: TARGET_PR, baseline, records }, null, 2));
  console.log("");
  console.log(` report: ${jsonOut}`);
}

main().catch(e => { console.error("[nine] fatal:", e); process.exit(1); });