Bundles 12 commits validating the auditor + scrum_master architecture end-to-end: - enrich_prd_pipeline / hard_task_escalation / scrum_master_pipeline stress tests - Tree-split + scrum_reviews.jsonl + kb_query surfacing - Verdict → audit_lessons feedback loop (closed) - kb_index aggregator with confidence-based severity policy - 9-run + 5-run empirical tests proved the predictive-compounding property - Level 1 correction: temp=0 cloud inference for deterministic per-claim verdicts - audit_one.ts dry-run CLI - Fixes: static quoted-string guard, empirical-claim classification, symbol-resolver gate, repo-file size cap See PR #8 for run-by-run commit history.
529 lines
24 KiB
TypeScript
529 lines
24 KiB
TypeScript
// Real-world architecture stress test — 6 iterations of the full pipeline
|
||
// against the PRD as a corpus. Goal: prove at scale what Phase 21
|
||
// promised (context continuation + tree-split), plus Phase 19
|
||
// compounding across iterations.
|
||
//
|
||
// Run: bun run tests/real-world/enrich_prd_pipeline.ts
|
||
//
|
||
// No mocks. No skipped layers. On any error, the test triggers
|
||
// cloud-rescue rather than fail — it's the architecture's job to
|
||
// recover. The test FAILS only if we can't complete 6 iterations.
|
||
|
||
import { readFile, writeFile, mkdir } from "node:fs/promises";
|
||
import { createHash } from "node:crypto";
|
||
|
||
// ── Corpus + chunking configuration ──
const PRD_PATH = "/home/profit/lakehouse/docs/PRD.md"; // source corpus for all 6 iterations
const CHUNK_SIZE = 800; // chars per chunk — ~200 tokens
const CHUNK_OVERLAP = 120; // chars shared between adjacent chunks so ideas aren't cut mid-sentence
const TOP_K_RETRIEVE = 12; // chunks per iteration — pulled up to force overflow
const CONTEXT_BUDGET_CHARS = 4000; // tight budget — forces tree-split on every iter
const INJECT_FAIL_ON_ITER = 3; // force the TASK-retry loop on iter 3

// Continuation controls (per-cloud-call) — used for output-overflow.
// Separate from the task-retry loop (per-task) — that handles errors
// across attempts.
const PRIMARY_MAX_TOKENS = 150; // tight — forces truncation
const CONTINUATION_MAX_TOKENS = 300; // each continuation doubles headroom
const MAX_CONTINUATIONS = 6; // max stitch pieces per cloud call

// Task-level retry loop (J's clarification, 2026-04-22):
// When a TASK errors, retry the whole task up to 6 times. Each
// retry gets prior attempts' failures injected as learning context,
// so attempt N+1 is informed by what N failed at. The loop caps at
// 6 to avoid infinite spinning on genuinely unsolvable tasks.
const MAX_TASK_RETRIES = 6;

// To FORCE the retry loop on iter INJECT_FAIL_ON_ITER, cycle through
// 5 deliberately-invalid models + 1 valid one. Attempts 1-5 will
// 502/404 from Ollama Cloud; attempt 6 finally succeeds. Proves the
// loop fires all 6 with compounding failure context.
const FORCE_RETRY_MODEL_SEQUENCE = [
  "deliberately-invalid-model-attempt-1",
  "deliberately-invalid-model-attempt-2",
  "deliberately-invalid-model-attempt-3",
  "deliberately-invalid-model-attempt-4",
  "deliberately-invalid-model-attempt-5",
  "gpt-oss:20b", // 6th attempt succeeds
];

// ── Service endpoints + model selection ──
const GATEWAY = "http://localhost:3100"; // /v1/chat + playbook seed API
const SIDECAR = "http://localhost:3200"; // /embed embedding service
const CLOUD_MODEL = "gpt-oss:120b"; // primary model for answers and tree-split shards
// NOTE(review): RESCUE_MODEL is never referenced elsewhere in this file —
// both fallback paths below hard-code "qwen3.5:latest". Confirm which
// model the rescue path is actually meant to use.
const RESCUE_MODEL = "gpt-oss:20b"; // fallback local cloud model via sidecar
const RUN_NONCE = Date.now().toString(36); // unique per-run id (base-36 timestamp)
const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/${RUN_NONCE}`; // per-run artifact dir
|
||
|
||
// The 6 progressively-compounding questions. #6 explicitly requires
// synthesis across prior 5 answers.
// NOTE: runIteration feeds each prior answer (trimmed to ~400 chars)
// back into later prompts, so later questions genuinely build on
// earlier ones rather than being independent.
const QUESTIONS: string[] = [
  "Summarize the Lakehouse project's one-paragraph thesis: what problem does it solve, what's the unique approach?",
  "How does Phase 19 playbook memory turn successful fills into a signal that boosts future rankings?",
  "Explain the role of Phase 24 observer in the learning loop — what does it see, and what does it feed into?",
  "What's the VRAM-aware profile swap mechanism in Phase 17, and why does it matter for multi-model serving?",
  "How do Phase 25 validity windows and Phase 27 playbook versioning interact when a schema drifts?",
  "Synthesize the prior 5 answers: how do the pieces (playbook memory, observer, profile swap, validity windows, versioning) compose into a system that measurably gets smarter over time? Cite specific prior answers.",
];
|
||
|
||
// One embedded slice of the PRD: content-hash id, trimmed text, the
// sidecar's embedding vector, and the char offset into the original doc.
type Chunk = { id: string; text: string; embedding: number[]; offset: number };

// Full telemetry for a single question/iteration; serialized to
// iter_<n>.json per run and aggregated in summary.json.
interface IterationResult {
  iteration: number; // 1-based iteration index
  question: string; // the question asked this iteration
  retrieval_top_k: number; // chunks actually retrieved (≤ TOP_K_RETRIEVE)
  context_chars_before_budget: number; // raw retrieved-context size, pre tree-split
  tree_split_fired: boolean; // true when context exceeded CONTEXT_BUDGET_CHARS
  cloud_calls_total: number; // shards + attempts + continuations combined
  continuation_retries: number; // stitches used by the final successful call
  rescue_triggered: boolean; // true when any retry/fallback path fired
  // Task-level retry telemetry
  task_attempts_made: number; // how many attempts fired (1 = first succeeded)
  task_retry_history: Array<{ n: number; model: string; error: string }>; // one entry per failed attempt
  playbook_id: string | null; // id from the playbook seed endpoint, null when seeding failed
  tokens_prompt: number; // accumulated prompt tokens across stitched calls
  tokens_completion: number; // accumulated completion tokens across stitched calls
  citations_from_prior_iterations: string[]; // prior playbook ids injected into the prompt
  duration_ms: number; // wall-clock time for the whole iteration
  answer_preview: string; // first 500 chars of the final answer
  errors_recovered: string[]; // human-readable log of every recovered failure
}
|
||
|
||
// Tagged logger — prefixes every test log line with [enrich] for grep.
function log(msg: string) {
  console.log(`[enrich] ${msg}`);
}
|
||
|
||
// Promise-based delay helper: resolves after `ms` milliseconds.
function sleep(ms: number) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
|
||
|
||
function cosine(a: number[], b: number[]): number {
|
||
let dot = 0, na = 0, nb = 0;
|
||
for (let i = 0; i < a.length; i++) { dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; }
|
||
if (na === 0 || nb === 0) return 0;
|
||
return dot / (Math.sqrt(na) * Math.sqrt(nb));
|
||
}
|
||
|
||
// Short content fingerprint: first 10 hex chars of the SHA-256 digest.
function hash(s: string): string {
  const digest = createHash("sha256").update(s).digest("hex");
  return digest.slice(0, 10);
}
|
||
|
||
async function embedBatch(texts: string[]): Promise<number[][]> {
|
||
// Sidecar /embed accepts a list. On partial failure, retry individually.
|
||
const r = await fetch(`${SIDECAR}/embed`, {
|
||
method: "POST", headers: { "content-type": "application/json" },
|
||
body: JSON.stringify({ texts }),
|
||
signal: AbortSignal.timeout(120000),
|
||
});
|
||
if (!r.ok) throw new Error(`embed batch ${r.status}: ${await r.text()}`);
|
||
const j: any = await r.json();
|
||
return j.embeddings;
|
||
}
|
||
|
||
function chunkText(text: string): Array<{ text: string; offset: number }> {
|
||
const out: Array<{ text: string; offset: number }> = [];
|
||
let i = 0;
|
||
while (i < text.length) {
|
||
const end = Math.min(i + CHUNK_SIZE, text.length);
|
||
const slice = text.slice(i, end).trim();
|
||
if (slice.length > 50) out.push({ text: slice, offset: i });
|
||
if (end >= text.length) break;
|
||
i = end - CHUNK_OVERLAP;
|
||
}
|
||
return out;
|
||
}
|
||
|
||
async function chat(opts: {
|
||
provider: "ollama" | "ollama_cloud",
|
||
model: string,
|
||
messages: Array<{ role: string; content: string }>,
|
||
max_tokens: number,
|
||
think: boolean,
|
||
}): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; finish_reason: string }> {
|
||
const r = await fetch(`${GATEWAY}/v1/chat`, {
|
||
method: "POST", headers: { "content-type": "application/json" },
|
||
body: JSON.stringify({ ...opts }),
|
||
signal: AbortSignal.timeout(180000),
|
||
});
|
||
if (!r.ok) throw new Error(`/v1/chat ${r.status}: ${await r.text()}`);
|
||
const j: any = await r.json();
|
||
return {
|
||
content: j.choices?.[0]?.message?.content ?? "",
|
||
prompt_tokens: j.usage?.prompt_tokens ?? 0,
|
||
completion_tokens: j.usage?.completion_tokens ?? 0,
|
||
finish_reason: j.choices?.[0]?.finish_reason ?? "?",
|
||
};
|
||
}
|
||
|
||
// ─── Tree-split over oversized chunk set ──────────────────────────
|
||
async function treeSplitSummarize(
|
||
chunks: Chunk[],
|
||
question: string,
|
||
): Promise<{ scratchpad: string; cloud_calls: number }> {
|
||
// Shard into groups fitting within half the budget each.
|
||
const perShard = Math.max(1, Math.floor((CONTEXT_BUDGET_CHARS / 2) / CHUNK_SIZE));
|
||
const shards: Chunk[][] = [];
|
||
for (let i = 0; i < chunks.length; i += perShard) {
|
||
shards.push(chunks.slice(i, i + perShard));
|
||
}
|
||
log(` tree-split: ${chunks.length} chunks → ${shards.length} shards of up to ${perShard}`);
|
||
let scratchpad = "";
|
||
let cloud_calls = 0;
|
||
for (let si = 0; si < shards.length; si++) {
|
||
const shard = shards[si];
|
||
const shardText = shard.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n");
|
||
const userMsg = `Question: ${question}\n\nShard ${si + 1}/${shards.length} of source material:\n\n${shardText}\n\nScratchpad so far:\n${scratchpad || "(empty)"}\n\nUpdate the scratchpad: extract only facts from THIS shard that help answer the question. Be terse. No prose.`;
|
||
const r = await chat({
|
||
provider: "ollama_cloud",
|
||
model: CLOUD_MODEL,
|
||
messages: [
|
||
{ role: "system", content: "You maintain a concise factual scratchpad across multiple shards of source text. No prose outside the scratchpad. Each shard, append ≤80 words of relevant facts." },
|
||
{ role: "user", content: userMsg },
|
||
],
|
||
max_tokens: 500,
|
||
think: false,
|
||
});
|
||
cloud_calls += 1;
|
||
scratchpad += `\n--- shard ${si + 1} notes ---\n${r.content.trim()}`;
|
||
if (scratchpad.length > CONTEXT_BUDGET_CHARS) {
|
||
// truncate oldest halves
|
||
scratchpad = scratchpad.slice(-CONTEXT_BUDGET_CHARS);
|
||
log(` tree-split: scratchpad truncated to ${scratchpad.length} chars`);
|
||
}
|
||
}
|
||
return { scratchpad, cloud_calls };
|
||
}
|
||
|
||
// ─── Continuable generate — up to max_continuations stitches ──────
|
||
//
|
||
// Two failure modes handled:
|
||
// A) Empty response — typically thinking model burned the budget
|
||
// on hidden reasoning. Retry with 2× max_tokens.
|
||
// B) Truncated response (finish_reason=length) — answer got cut off
|
||
// mid-sentence. Pass the partial back as scratchpad and ask the
|
||
// model to continue from where it stopped.
|
||
//
|
||
// Stitching: keep appending content across retries; prompt_tokens and
|
||
// completion_tokens accumulate; finish_reason reflects the LAST call.
|
||
// Loop exits on the first call that finishes cleanly (stop) with
|
||
// non-empty content, OR when retries hit the cap.
|
||
async function generateContinuable(
|
||
opts: Parameters<typeof chat>[0] & { max_continuations?: number },
|
||
): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string }> {
|
||
const maxCont = opts.max_continuations ?? 1;
|
||
let total = await chat(opts);
|
||
let retries = 0;
|
||
while (retries < maxCont && (total.content.length === 0 || total.finish_reason === "length")) {
|
||
retries += 1;
|
||
const mode = total.content.length === 0 ? "empty" : "truncated";
|
||
log(` continuation retry ${retries}/${maxCont} (${mode}: finish=${total.finish_reason}, content=${total.content.length} chars)`);
|
||
// Continuation prompt — branch on failure mode:
|
||
// empty → retry with 2× tokens, same prompt (thinking budget)
|
||
// length → pass the partial as assistant turn, ask to continue
|
||
const continuationMessages = total.content.length === 0
|
||
? opts.messages
|
||
: [
|
||
...opts.messages,
|
||
{ role: "assistant", content: total.content },
|
||
{ role: "user", content: "Continue from exactly where you stopped. Do not repeat. Finish the answer." },
|
||
];
|
||
const continued = await chat({
|
||
...opts,
|
||
max_tokens: CONTINUATION_MAX_TOKENS,
|
||
messages: continuationMessages,
|
||
});
|
||
total = {
|
||
content: total.content + continued.content,
|
||
prompt_tokens: total.prompt_tokens + continued.prompt_tokens,
|
||
completion_tokens: total.completion_tokens + continued.completion_tokens,
|
||
finish_reason: continued.finish_reason,
|
||
};
|
||
}
|
||
return { ...total, continuation_retries: retries };
|
||
}
|
||
|
||
// ─── Single iteration: retrieve → budget-check → chat → seed ─────
|
||
async function runIteration(
|
||
iteration: number,
|
||
question: string,
|
||
allChunks: Chunk[],
|
||
priorPlaybookIds: string[],
|
||
priorAnswers: string[],
|
||
): Promise<IterationResult> {
|
||
const started = Date.now();
|
||
const errorsRecovered: string[] = [];
|
||
log(`iter ${iteration}: "${question.slice(0, 70)}..."`);
|
||
|
||
// 1. Embed the question
|
||
const qEmb = (await embedBatch([question]))[0];
|
||
|
||
// 2. Retrieve top-K chunks by cosine
|
||
const scored = allChunks
|
||
.map(c => ({ c, score: cosine(qEmb, c.embedding) }))
|
||
.sort((a, b) => b.score - a.score)
|
||
.slice(0, TOP_K_RETRIEVE);
|
||
const chunks = scored.map(x => x.c);
|
||
log(` retrieved top ${chunks.length} chunks (score range ${scored[0].score.toFixed(3)} .. ${scored[scored.length - 1].score.toFixed(3)})`);
|
||
|
||
// 3. Context budget check — tree-split if over
|
||
const contextChars = chunks.map(c => c.text).join("\n\n").length;
|
||
let contextForPrompt: string;
|
||
let treeSplit = false;
|
||
let cloudCallsTotal = 0;
|
||
if (contextChars > CONTEXT_BUDGET_CHARS) {
|
||
treeSplit = true;
|
||
log(` context ${contextChars} chars > budget ${CONTEXT_BUDGET_CHARS} → tree-split`);
|
||
const { scratchpad, cloud_calls } = await treeSplitSummarize(chunks, question);
|
||
contextForPrompt = `Distilled scratchpad from ${chunks.length} source chunks (too large to fit directly):\n${scratchpad}`;
|
||
cloudCallsTotal += cloud_calls;
|
||
} else {
|
||
contextForPrompt = chunks.map(c => `[chunk @${c.offset}]\n${c.text}`).join("\n\n");
|
||
}
|
||
|
||
// 4. Seed prompt with prior iteration answers (real compounding).
|
||
// Not just IDs — the model needs the CONTENT to synthesize.
|
||
let citationBlock = "";
|
||
let citationsReceived: string[] = [];
|
||
if (priorPlaybookIds.length > 0 && priorAnswers.length > 0) {
|
||
const lines = priorAnswers.map((ans, i) => {
|
||
const pid = priorPlaybookIds[i]?.slice(0, 12) ?? "unknown";
|
||
// Trim each prior answer to ~400 chars so we don't blow budget
|
||
return `[pb:${pid}] iter ${i + 1} answer:\n${ans.slice(0, 400)}\n`;
|
||
});
|
||
citationBlock = `\n\n═══ PRIOR ITERATIONS (compounding context) ═══\n${lines.join("\n")}═══ end prior iterations ═══\n\nYour answer MUST cite specific prior iterations using [pb:ID] notation when drawing on them. Synthesis questions require explicit cross-iteration reasoning.`;
|
||
citationsReceived = priorPlaybookIds.slice();
|
||
}
|
||
|
||
// 5. TASK-LEVEL RETRY LOOP — per J's clarification 2026-04-22.
|
||
// Try the task up to MAX_TASK_RETRIES times. Each retry:
|
||
// a) Picks a model (normally CLOUD_MODEL; on INJECT_FAIL_ON_ITER,
|
||
// cycles through 5 invalid models + 1 valid to force full loop)
|
||
// b) Injects prior attempt errors as learning context
|
||
// c) If the attempt succeeds (non-empty, >100 chars), loop exits
|
||
// d) Otherwise, records failure and tries again with the learning
|
||
//
|
||
// Cap at 6 so we don't spin forever on unsolvable tasks.
|
||
let result: { content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string } | null = null;
|
||
let rescueTriggered = false;
|
||
const taskAttemptHistory: Array<{ n: number; model: string; error: string }> = [];
|
||
const forceRetries = iteration === INJECT_FAIL_ON_ITER;
|
||
if (forceRetries) log(` FORCING TASK-RETRY LOOP — iter ${iteration} will cycle through 5 invalid models + 1 valid`);
|
||
|
||
for (let attempt = 1; attempt <= MAX_TASK_RETRIES; attempt++) {
|
||
const modelForAttempt = forceRetries
|
||
? FORCE_RETRY_MODEL_SEQUENCE[attempt - 1]
|
||
: CLOUD_MODEL;
|
||
// Compose a prior-attempts learning block for attempts 2+
|
||
const learningBlock = taskAttemptHistory.length > 0
|
||
? `\n\n═══ PRIOR ATTEMPTS THIS TASK (do NOT repeat these failures; adjust approach) ═══\n${taskAttemptHistory.map(a => `Attempt ${a.n} (model ${a.model}) failed: ${a.error.slice(0, 160)}`).join("\n")}\n═══ end prior attempts ═══\n`
|
||
: "";
|
||
log(` task attempt ${attempt}/${MAX_TASK_RETRIES}: model=${modelForAttempt}${learningBlock ? " [with prior-failure context]" : ""}`);
|
||
try {
|
||
const r = await generateContinuable({
|
||
provider: "ollama_cloud",
|
||
model: modelForAttempt,
|
||
messages: [
|
||
{ role: "system", content: "You answer questions about the Lakehouse PRD using only the provided source material and prior iteration answers. Be specific. Cite chunk offsets OR [pb:ID] markers. Write a detailed 250-word answer." },
|
||
{ role: "user", content: `Question: ${question}\n\nSource material:\n${contextForPrompt}${citationBlock}${learningBlock}` },
|
||
],
|
||
max_tokens: PRIMARY_MAX_TOKENS,
|
||
think: false,
|
||
max_continuations: MAX_CONTINUATIONS,
|
||
});
|
||
cloudCallsTotal += 1 + r.continuation_retries;
|
||
if (r.content && r.content.length > 100) {
|
||
// Acceptable answer — exit loop
|
||
result = r;
|
||
if (attempt > 1) {
|
||
log(` task attempt ${attempt} SUCCEEDED (${r.content.length} chars) after ${attempt - 1} prior failures`);
|
||
rescueTriggered = true;
|
||
}
|
||
break;
|
||
}
|
||
// Thin response — count as failure with learning signal
|
||
const err = `thin-answer: ${r.content.length} chars, finish=${r.finish_reason}`;
|
||
taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
|
||
errorsRecovered.push(`attempt ${attempt}: ${err}`);
|
||
} catch (e) {
|
||
const err = (e as Error).message;
|
||
taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
|
||
errorsRecovered.push(`attempt ${attempt}: ${err.slice(0, 120)}`);
|
||
cloudCallsTotal += 1;
|
||
}
|
||
}
|
||
|
||
// Last-ditch: if all 6 task attempts failed, try the local fallback
|
||
// once more so we at least return SOMETHING. This is the "don't get
|
||
// caught in a loop, accept best-so-far" rule J stated explicitly.
|
||
if (!result) {
|
||
errorsRecovered.push(`all ${MAX_TASK_RETRIES} task attempts failed — local fallback`);
|
||
rescueTriggered = true;
|
||
try {
|
||
result = await generateContinuable({
|
||
provider: "ollama",
|
||
model: "qwen3.5:latest",
|
||
messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
|
||
max_tokens: 300,
|
||
think: false,
|
||
max_continuations: 2,
|
||
});
|
||
cloudCallsTotal += 1 + result.continuation_retries;
|
||
} catch (e) {
|
||
// Absolute last resort — fabricate a skeleton result
|
||
result = {
|
||
content: `[task failed after ${MAX_TASK_RETRIES} retries + local fallback: ${(e as Error).message}]`,
|
||
prompt_tokens: 0,
|
||
completion_tokens: 0,
|
||
continuation_retries: 0,
|
||
finish_reason: "error",
|
||
};
|
||
}
|
||
}
|
||
if (result.content.length === 0) {
|
||
errorsRecovered.push("even rescue returned empty — last-ditch local fallback");
|
||
rescueTriggered = true;
|
||
result = await generateContinuable({
|
||
provider: "ollama",
|
||
model: "qwen3.5:latest",
|
||
messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
|
||
max_tokens: 300,
|
||
think: false,
|
||
});
|
||
cloudCallsTotal += 1;
|
||
}
|
||
|
||
// 6. Seed playbook with the answer
|
||
let playbook_id: string | null = null;
|
||
try {
|
||
const ts = new Date().toISOString();
|
||
const seedOp = `TEST: enrich_prd_run_${RUN_NONCE} iter${iteration} in Corpus, PRD`;
|
||
const r = await fetch(`${GATEWAY}/vectors/playbook_memory/seed`, {
|
||
method: "POST", headers: { "content-type": "application/json" },
|
||
body: JSON.stringify({
|
||
operation: seedOp,
|
||
approach: `q="${question.slice(0, 80)}" context_chars=${contextChars} tree_split=${treeSplit}`,
|
||
context: result.content.slice(0, 600),
|
||
endorsed_names: [`iter${iteration}_${RUN_NONCE}`],
|
||
append: true,
|
||
}),
|
||
signal: AbortSignal.timeout(15000),
|
||
});
|
||
if (r.ok) {
|
||
const j: any = await r.json();
|
||
playbook_id = j.outcome?.playbook_id ?? null;
|
||
} else {
|
||
errorsRecovered.push(`seed ${r.status}: ${(await r.text()).slice(0, 100)}`);
|
||
}
|
||
} catch (e) {
|
||
errorsRecovered.push(`seed exception: ${(e as Error).message}`);
|
||
}
|
||
|
||
return {
|
||
iteration,
|
||
question,
|
||
retrieval_top_k: chunks.length,
|
||
context_chars_before_budget: contextChars,
|
||
tree_split_fired: treeSplit,
|
||
cloud_calls_total: cloudCallsTotal,
|
||
continuation_retries: result.continuation_retries,
|
||
rescue_triggered: rescueTriggered,
|
||
task_attempts_made: taskAttemptHistory.length + 1, // +1 for the successful attempt
|
||
task_retry_history: taskAttemptHistory,
|
||
playbook_id,
|
||
tokens_prompt: result.prompt_tokens,
|
||
tokens_completion: result.completion_tokens,
|
||
citations_from_prior_iterations: citationsReceived,
|
||
duration_ms: Date.now() - started,
|
||
answer_preview: result.content.slice(0, 500),
|
||
errors_recovered: errorsRecovered,
|
||
};
|
||
}
|
||
|
||
// Test driver: load + chunk + embed the PRD, run the 6 compounding
// iterations, then write summary.json and exit 0 on PASS / 1 on PARTIAL.
async function main() {
  await mkdir(OUT_DIR, { recursive: true });
  log(`run nonce: ${RUN_NONCE}`);
  log(`output dir: ${OUT_DIR}`);

  // ─── Phase 1: load, chunk, embed the PRD ───────────────────────
  log(`loading PRD from ${PRD_PATH}`);
  const prd = await readFile(PRD_PATH, "utf8");
  log(`PRD: ${prd.length} chars, ${prd.split("\n").length} lines`);

  const raw_chunks = chunkText(prd);
  log(`chunked into ${raw_chunks.length} pieces (size ${CHUNK_SIZE}, overlap ${CHUNK_OVERLAP})`);

  // Embed in batches of 32 to avoid sidecar overload
  // (batches run sequentially on purpose — do not parallelize).
  const allChunks: Chunk[] = [];
  const BATCH = 32;
  const t0 = Date.now();
  for (let i = 0; i < raw_chunks.length; i += BATCH) {
    const batch = raw_chunks.slice(i, i + BATCH);
    const embs = await embedBatch(batch.map(b => b.text));
    for (let j = 0; j < batch.length; j++) {
      allChunks.push({
        id: hash(batch[j].text),
        text: batch[j].text,
        // coerce to number[] in case the sidecar returns strings
        embedding: embs[j].map(x => Number(x)),
        offset: batch[j].offset,
      });
    }
    log(`  embedded ${allChunks.length}/${raw_chunks.length}`);
  }
  log(`embedded all ${allChunks.length} chunks in ${((Date.now() - t0) / 1000).toFixed(1)}s`);

  // ─── Phase 2: 6 iterations ─────────────────────────────────────
  const results: IterationResult[] = [];
  const priorIds: string[] = [];
  const priorAnswers: string[] = [];
  for (let i = 1; i <= QUESTIONS.length; i++) {
    const q = QUESTIONS[i - 1];
    const r = await runIteration(i, q, allChunks, priorIds, priorAnswers);
    results.push(r);
    // NOTE(review): ids are pushed only on seed success while answers
    // are pushed unconditionally, so a failed seed shifts the
    // id↔answer pairing consumed by runIteration — confirm intended.
    if (r.playbook_id) priorIds.push(r.playbook_id);
    priorAnswers.push(r.answer_preview);
    log(`  → iter ${i}: ${r.errors_recovered.length} errors recovered, ${r.continuation_retries} continuations, tree-split=${r.tree_split_fired}, rescue=${r.rescue_triggered}, ${r.duration_ms}ms`);
    await writeFile(`${OUT_DIR}/iter_${i}.json`, JSON.stringify(r, null, 2));
  }
  // Check whether iter 6 actually cited prior pb:IDs in its answer.
  // Playbook IDs look like `pb-seed-<hex>` so the regex needs to allow
  // hyphens + letters inside the brackets, not just hex chars.
  const iter6 = results[5];
  const citationsHonored = iter6 ? (iter6.answer_preview.match(/\[pb:[\w-]+\]/gi)?.length ?? 0) : 0;

  // ─── Phase 3: summary ──────────────────────────────────────────
  const summary = {
    run_nonce: RUN_NONCE,
    ran_at: new Date().toISOString(),
    prd_chars: prd.length,
    prd_chunks: allChunks.length,
    iterations: results.length,
    total_cloud_calls: results.reduce((s, r) => s + r.cloud_calls_total, 0),
    total_continuation_retries: results.reduce((s, r) => s + r.continuation_retries, 0),
    total_errors_recovered: results.reduce((s, r) => s + r.errors_recovered.length, 0),
    tree_splits_fired: results.filter(r => r.tree_split_fired).length,
    rescues_triggered: results.filter(r => r.rescue_triggered).length,
    iter6_received_prior_ids: results[5]?.citations_from_prior_iterations.length ?? 0,
    iter6_actually_cited_in_answer: citationsHonored,
    iter3_task_attempts: results[2]?.task_attempts_made ?? 0,
    iter3_task_retries: results[2]?.task_retry_history.length ?? 0,
    // NOTE(review): Math.max over an empty array yields -Infinity;
    // safe here only because results always has 6 entries on success.
    max_task_attempts_any_iter: Math.max(...results.map(r => r.task_attempts_made)),
    total_duration_ms: results.reduce((s, r) => s + r.duration_ms, 0),
    // PASS requires all 6 iterations AND every playbook seed to succeed.
    overall: results.length === 6 && results.every(r => r.playbook_id !== null) ? "PASS" : "PARTIAL",
  };
  await writeFile(`${OUT_DIR}/summary.json`, JSON.stringify(summary, null, 2));

  log("");
  log(`══════ SUMMARY ${summary.overall} ══════`);
  log(`  6 iterations, ${summary.total_cloud_calls} cloud calls, ${summary.total_errors_recovered} errors recovered`);
  log(`  tree-splits: ${summary.tree_splits_fired}/6   continuations: ${summary.total_continuation_retries}   rescues: ${summary.rescues_triggered}`);
  log(`  iter 6 received ${summary.iter6_received_prior_ids} prior IDs, cited ${summary.iter6_actually_cited_in_answer} [pb:...] markers in its answer`);
  log(`  iter 3 task-retry loop: ${summary.iter3_task_attempts} attempts (${summary.iter3_task_retries} prior-failure retries before success)`);
  log(`  total duration: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
  log("");
  for (const r of results) {
    const flags = [
      r.tree_split_fired ? "tree-split" : "",
      r.continuation_retries > 0 ? `cont=${r.continuation_retries}` : "",
      r.rescue_triggered ? "rescued" : "",
      r.errors_recovered.length > 0 ? `err=${r.errors_recovered.length}` : "",
    ].filter(Boolean).join(" ");
    log(`  iter ${r.iteration}: ${r.tokens_prompt}+${r.tokens_completion} tok, ${r.duration_ms}ms ${flags ? `[${flags}]` : ""}`);
  }
  log("");
  log(`artifacts: ${OUT_DIR}/{iter_1..6.json, summary.json}`);
  process.exit(summary.overall === "PASS" ? 0 : 1);
}
|
||
|
||
// Entry point — any uncaught failure is fatal (exit code 2).
main().catch((e) => {
  console.error("[enrich] fatal:", e);
  process.exit(2);
});
|