2026-04-23 05:29:39 +00:00
1 changed files with 157 additions and 60 deletions
--- a/tests/real-world/enrich_prd_pipeline.ts
+++ b/tests/real-world/enrich_prd_pipeline.ts
@ -17,7 +17,34 @@ const CHUNK_SIZE = 800;            // chars per chunk — ~200 tokens
 const CHUNK_OVERLAP = 120;
 const TOP_K_RETRIEVE = 12;         // chunks per iteration — pulled up to force overflow
 const CONTEXT_BUDGET_CHARS = 4000; // tight budget — forces tree-split on every iter
-const INJECT_FAIL_ON_ITER = 3;     // plant a bad primary-cloud call so rescue fires
+const INJECT_FAIL_ON_ITER = 3;     // force the TASK-retry loop on iter 3
 // Continuation controls (per-cloud-call) — used for output-overflow.
 // Separate from the task-retry loop (per-task) — that handles errors
 // across attempts.
 const PRIMARY_MAX_TOKENS = 150;     // tight — forces truncation
 const CONTINUATION_MAX_TOKENS = 300; // each continuation doubles headroom
 const MAX_CONTINUATIONS = 6;         // max stitch pieces per cloud call
 // Task-level retry loop (J's clarification, 2026-04-22):
 // When a TASK errors, retry the whole task up to 6 times. Each
 // retry gets prior attempts' failures injected as learning context,
 // so attempt N+1 is informed by what N failed at. The loop caps at
 // 6 to avoid infinite spinning on genuinely unsolvable tasks.
 const MAX_TASK_RETRIES = 6;
 // To FORCE the retry loop on iter INJECT_FAIL_ON_ITER, cycle through
 // 5 deliberately-invalid models + 1 valid one. Attempts 1-5 will
 // 502/404 from Ollama Cloud; attempt 6 finally succeeds. Proves the
 // loop fires all 6 with compounding failure context.
 const FORCE_RETRY_MODEL_SEQUENCE = [
  "deliberately-invalid-model-attempt-1",
  "deliberately-invalid-model-attempt-2",
  "deliberately-invalid-model-attempt-3",
  "deliberately-invalid-model-attempt-4",
  "deliberately-invalid-model-attempt-5",
  "gpt-oss:20b", // 6th attempt succeeds
 ];
 const GATEWAY = "http://localhost:3100";
 const SIDECAR = "http://localhost:3200";
 const CLOUD_MODEL = "gpt-oss:120b";
@ -47,6 +74,9 @@ interface IterationResult {
  cloud_calls_total: number;
  continuation_retries: number;
  rescue_triggered: boolean;
  // Task-level retry telemetry
  task_attempts_made: number;       // how many attempts fired (1 = first succeeded)
  task_retry_history: Array<{ n: number; model: string; error: string }>;
  playbook_id: string | null;
  tokens_prompt: number;
  tokens_completion: number;
@ -155,24 +185,43 @@ async function treeSplitSummarize(
  return { scratchpad, cloud_calls };
 }
-// ─── Continuable generate — retries once on empty/truncated ───────
+// ─── Continuable generate — up to max_continuations stitches ──────
 //
 // Two failure modes handled:
 //   A) Empty response — typically thinking model burned the budget
 //      on hidden reasoning. Retry with CONTINUATION_MAX_TOKENS.
 //   B) Truncated response (finish_reason=length) — answer got cut off
 //      mid-sentence. Pass the partial back as scratchpad and ask the
 //      model to continue from where it stopped.
 //
 // Stitching: keep appending content across retries; prompt_tokens and
 // completion_tokens accumulate; finish_reason reflects the LAST call.
 // Loop exits on the first call that finishes cleanly (stop) with
 // non-empty content, OR when retries hit the cap.
 async function generateContinuable(
  opts: Parameters<typeof chat>[0] & { max_continuations?: number },
-): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number }> {
+): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string }> {
  const maxCont = opts.max_continuations ?? 1;
  let total = await chat(opts);
  let retries = 0;
-  if (total.content.length === 0 || total.finish_reason === "length") {
+  while (retries < maxCont && (total.content.length === 0 || total.finish_reason === "length")) {
    for (let i = 0; i < maxCont && (total.content.length === 0 || total.finish_reason === "length"); i++) {
    retries += 1;
-      log(`  continuation retry ${retries} (reason: ${total.finish_reason}, content=${total.content.length})`);
+    const mode = total.content.length === 0 ? "empty" : "truncated";
    log(`  continuation retry ${retries}/${maxCont} (${mode}: finish=${total.finish_reason}, content=${total.content.length} chars)`);
    // Continuation prompt — branch on failure mode:
    //   empty  → retry with CONTINUATION_MAX_TOKENS, same prompt (thinking budget)
    //   length → pass the partial as assistant turn, ask to continue
    const continuationMessages = total.content.length === 0
      ? opts.messages
      : [
          ...opts.messages,
          { role: "assistant", content: total.content },
          { role: "user", content: "Continue from exactly where you stopped. Do not repeat. Finish the answer." },
        ];
    const continued = await chat({
      ...opts,
-        max_tokens: opts.max_tokens * 2,
+      max_tokens: CONTINUATION_MAX_TOKENS,
-        messages: [
+      messages: continuationMessages,
          ...opts.messages,
          ...(total.content ? [{ role: "assistant", content: total.content }, { role: "user", content: "Continue from where you stopped. Complete the answer." }] : []),
        ],
    });
    total = {
      content: total.content + continued.content,
@ -181,7 +230,6 @@ async function generateContinuable(
      finish_reason: continued.finish_reason,
    };
  }
  }
  return { ...total, continuation_retries: retries };
 }
@ -237,47 +285,90 @@ async function runIteration(
    citationsReceived = priorPlaybookIds.slice();
  }
-  // 5. Call cloud. On empty/error → rescue.
+  // 5. TASK-LEVEL RETRY LOOP — per J's clarification 2026-04-22.
-  //    Intentional failure-injection on iter INJECT_FAIL_ON_ITER:
+  //    Try the task up to MAX_TASK_RETRIES times. Each retry:
-  //    a deliberately-invalid model name on the primary call so the
+  //    a) Picks a model (normally CLOUD_MODEL; on INJECT_FAIL_ON_ITER,
-  //    rescue path actually runs. Proves the catch-and-rescue isn't
+  //       cycles through 5 invalid models + 1 valid to force full loop)
-  //    dead code.
+  //    b) Injects prior attempt errors as learning context
-  let result: { content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number };
+  //    c) If the attempt succeeds (non-empty, >100 chars), loop exits
  //    d) Otherwise, records the failure and tries again with the learning context
  //
  //    Cap at 6 so we don't spin forever on unsolvable tasks.
  let result: { content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string } | null = null;
  let rescueTriggered = false;
-  const primaryModel = iteration === INJECT_FAIL_ON_ITER
+  const taskAttemptHistory: Array<{ n: number; model: string; error: string }> = [];
-    ? "deliberately-invalid-model-to-force-rescue"
+  const forceRetries = iteration === INJECT_FAIL_ON_ITER;
  if (forceRetries) log(`  FORCING TASK-RETRY LOOP — iter ${iteration} will cycle through 5 invalid models + 1 valid`);
  for (let attempt = 1; attempt <= MAX_TASK_RETRIES; attempt++) {
    const modelForAttempt = forceRetries
      ? FORCE_RETRY_MODEL_SEQUENCE[attempt - 1]
      : CLOUD_MODEL;
-  if (iteration === INJECT_FAIL_ON_ITER) {
+    // Compose a prior-attempts learning block for attempts 2+
-    log(`  INJECTED FAILURE on primary call — model="${primaryModel}" will 400/500`);
+    const learningBlock = taskAttemptHistory.length > 0
      ? `\n\n═══ PRIOR ATTEMPTS THIS TASK (do NOT repeat these failures; adjust approach) ═══\n${taskAttemptHistory.map(a => `Attempt ${a.n} (model ${a.model}) failed: ${a.error.slice(0, 160)}`).join("\n")}\n═══ end prior attempts ═══\n`
      : "";
    log(`  task attempt ${attempt}/${MAX_TASK_RETRIES}: model=${modelForAttempt}${learningBlock ? " [with prior-failure context]" : ""}`);
    try {
      const r = await generateContinuable({
        provider: "ollama_cloud",
        model: modelForAttempt,
        messages: [
          { role: "system", content: "You answer questions about the Lakehouse PRD using only the provided source material and prior iteration answers. Be specific. Cite chunk offsets OR [pb:ID] markers. Write a detailed 250-word answer." },
          { role: "user", content: `Question: ${question}\n\nSource material:\n${contextForPrompt}${citationBlock}${learningBlock}` },
        ],
        max_tokens: PRIMARY_MAX_TOKENS,
        think: false,
        max_continuations: MAX_CONTINUATIONS,
      });
      cloudCallsTotal += 1 + r.continuation_retries;
      if (r.content && r.content.length > 100) {
        // Acceptable answer — exit loop
        result = r;
        if (attempt > 1) {
          log(`  task attempt ${attempt} SUCCEEDED (${r.content.length} chars) after ${attempt - 1} prior failures`);
          rescueTriggered = true;
        }
        break;
      }
      // Thin response — count as failure with learning signal
      const err = `thin-answer: ${r.content.length} chars, finish=${r.finish_reason}`;
      taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
      errorsRecovered.push(`attempt ${attempt}: ${err}`);
    } catch (e) {
      const err = (e as Error).message;
      taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
      errorsRecovered.push(`attempt ${attempt}: ${err.slice(0, 120)}`);
      cloudCallsTotal += 1;
    }
  }
  // Last-ditch: if all 6 task attempts failed, try the local fallback
  // once more so we at least return SOMETHING. This is the "don't get
  // caught in a loop, accept best-so-far" rule J stated explicitly.
  if (!result) {
    errorsRecovered.push(`all ${MAX_TASK_RETRIES} task attempts failed — local fallback`);
    rescueTriggered = true;
    try {
      result = await generateContinuable({
-      provider: "ollama_cloud",
+        provider: "ollama",
-      model: primaryModel,
+        model: "qwen3.5:latest",
-      messages: [
+        messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
-        { role: "system", content: "You answer questions about the Lakehouse PRD using only the provided source material and prior iteration answers. Be specific. Cite chunk offsets OR [pb:ID] markers. Keep answers under 200 words." },
+        max_tokens: 300,
-        { role: "user", content: `Question: ${question}\n\nSource material:\n${contextForPrompt}${citationBlock}` },
+        think: false,
      ],
      max_tokens: 800,
      think: true,
        max_continuations: 2,
      });
      cloudCallsTotal += 1 + result.continuation_retries;
    } catch (e) {
-    errorsRecovered.push(`primary cloud call: ${(e as Error).message}`);
+      // Absolute last resort — fabricate a skeleton result
-    rescueTriggered = true;
+      result = {
-    log(`  primary failed → rescue with ${RESCUE_MODEL}`);
+        content: `[task failed after ${MAX_TASK_RETRIES} retries + local fallback: ${(e as Error).message}]`,
-    result = await generateContinuable({
+        prompt_tokens: 0,
-      provider: "ollama_cloud",
+        completion_tokens: 0,
-      model: RESCUE_MODEL,
+        continuation_retries: 0,
-      messages: [
+        finish_reason: "error",
-        { role: "system", content: "Answer briefly using the source material." },
+      };
-        { role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 8000)}` },
+    }
      ],
      max_tokens: 500,
      think: false,
    });
    cloudCallsTotal += 1 + result.continuation_retries;
  }
  if (result.content.length === 0) {
    errorsRecovered.push("even rescue returned empty — last-ditch local fallback");
@ -327,6 +418,8 @@ async function runIteration(
    cloud_calls_total: cloudCallsTotal,
    continuation_retries: result.continuation_retries,
    rescue_triggered: rescueTriggered,
    task_attempts_made: taskAttemptHistory.length + 1, // +1 for the successful attempt
    task_retry_history: taskAttemptHistory,
    playbook_id,
    tokens_prompt: result.prompt_tokens,
    tokens_completion: result.completion_tokens,
@ -402,6 +495,9 @@ async function main() {
    rescues_triggered: results.filter(r => r.rescue_triggered).length,
    iter6_received_prior_ids: results[5]?.citations_from_prior_iterations.length ?? 0,
    iter6_actually_cited_in_answer: citationsHonored,
    iter3_task_attempts: results[2]?.task_attempts_made ?? 0,
    iter3_task_retries: results[2]?.task_retry_history.length ?? 0,
    max_task_attempts_any_iter: Math.max(...results.map(r => r.task_attempts_made)),
    total_duration_ms: results.reduce((s, r) => s + r.duration_ms, 0),
    overall: results.length === 6 && results.every(r => r.playbook_id !== null) ? "PASS" : "PARTIAL",
  };
@ -411,7 +507,8 @@ async function main() {
  log(`══════ SUMMARY ${summary.overall} ══════`);
  log(`  6 iterations, ${summary.total_cloud_calls} cloud calls, ${summary.total_errors_recovered} errors recovered`);
  log(`  tree-splits: ${summary.tree_splits_fired}/6   continuations: ${summary.total_continuation_retries}   rescues: ${summary.rescues_triggered}`);
-  log(`  iter 6 cited ${summary.iter6_cited_prior} prior iteration playbooks`);
+  log(`  iter 6 received ${summary.iter6_received_prior_ids} prior IDs, cited ${summary.iter6_actually_cited_in_answer} [pb:...] markers in its answer`);
  log(`  iter 3 task-retry loop: ${summary.iter3_task_attempts} attempts (${summary.iter3_task_retries} prior-failure retries before success)`);
  log(`  total duration: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
  log("");
  for (const r of results) {