Audit pipeline PR #9: determinism + fact extraction + verifier gate + KB stats #9
@ -17,7 +17,34 @@ const CHUNK_SIZE = 800; // chars per chunk — ~200 tokens
|
|||||||
const CHUNK_OVERLAP = 120;
|
const CHUNK_OVERLAP = 120;
|
||||||
const TOP_K_RETRIEVE = 12; // chunks per iteration — pulled up to force overflow
|
const TOP_K_RETRIEVE = 12; // chunks per iteration — pulled up to force overflow
|
||||||
const CONTEXT_BUDGET_CHARS = 4000; // tight budget — forces tree-split on every iter
|
const CONTEXT_BUDGET_CHARS = 4000; // tight budget — forces tree-split on every iter
|
||||||
const INJECT_FAIL_ON_ITER = 3; // plant a bad primary-cloud call so rescue fires
|
const INJECT_FAIL_ON_ITER = 3; // force the TASK-retry loop on iter 3
|
||||||
|
|
||||||
|
// Continuation controls (per-cloud-call) — used for output-overflow.
|
||||||
|
// Separate from the task-retry loop (per-task) — that handles errors
|
||||||
|
// across attempts.
|
||||||
|
const PRIMARY_MAX_TOKENS = 150; // tight — forces truncation
|
||||||
|
const CONTINUATION_MAX_TOKENS = 300; // each continuation doubles headroom
|
||||||
|
const MAX_CONTINUATIONS = 6; // max stitch pieces per cloud call
|
||||||
|
|
||||||
|
// Task-level retry loop (J's clarification, 2026-04-22):
|
||||||
|
// When a TASK errors, re-run the whole task, up to 6 attempts in total. Each
|
||||||
|
// retry gets prior attempts' failures injected as learning context,
|
||||||
|
// so attempt N+1 is informed by what N failed at. The loop caps at
|
||||||
|
// 6 to avoid infinite spinning on genuinely unsolvable tasks.
|
||||||
|
const MAX_TASK_RETRIES = 6;
|
||||||
|
|
||||||
|
// To FORCE the retry loop on iter INJECT_FAIL_ON_ITER, cycle through
|
||||||
|
// 5 deliberately-invalid models + 1 valid one. Attempts 1-5 will
|
||||||
|
// 502/404 from Ollama Cloud; attempt 6 finally succeeds. Proves the
|
||||||
|
// loop fires all 6 with compounding failure context.
|
||||||
|
const FORCE_RETRY_MODEL_SEQUENCE = [
|
||||||
|
"deliberately-invalid-model-attempt-1",
|
||||||
|
"deliberately-invalid-model-attempt-2",
|
||||||
|
"deliberately-invalid-model-attempt-3",
|
||||||
|
"deliberately-invalid-model-attempt-4",
|
||||||
|
"deliberately-invalid-model-attempt-5",
|
||||||
|
"gpt-oss:20b", // 6th attempt succeeds
|
||||||
|
];
|
||||||
const GATEWAY = "http://localhost:3100";
|
const GATEWAY = "http://localhost:3100";
|
||||||
const SIDECAR = "http://localhost:3200";
|
const SIDECAR = "http://localhost:3200";
|
||||||
const CLOUD_MODEL = "gpt-oss:120b";
|
const CLOUD_MODEL = "gpt-oss:120b";
|
||||||
@ -47,6 +74,9 @@ interface IterationResult {
|
|||||||
cloud_calls_total: number;
|
cloud_calls_total: number;
|
||||||
continuation_retries: number;
|
continuation_retries: number;
|
||||||
rescue_triggered: boolean;
|
rescue_triggered: boolean;
|
||||||
|
// Task-level retry telemetry
|
||||||
|
task_attempts_made: number; // how many attempts fired (1 = first succeeded)
|
||||||
|
task_retry_history: Array<{ n: number; model: string; error: string }>;
|
||||||
playbook_id: string | null;
|
playbook_id: string | null;
|
||||||
tokens_prompt: number;
|
tokens_prompt: number;
|
||||||
tokens_completion: number;
|
tokens_completion: number;
|
||||||
@ -155,24 +185,43 @@ async function treeSplitSummarize(
|
|||||||
return { scratchpad, cloud_calls };
|
return { scratchpad, cloud_calls };
|
||||||
}
|
}
|
||||||
|
|
||||||
// ─── Continuable generate — retries once on empty/truncated ───────
|
// ─── Continuable generate — up to max_continuations stitches ──────
|
||||||
|
//
|
||||||
|
// Two failure modes handled:
|
||||||
|
// A) Empty response — typically thinking model burned the budget
|
||||||
|
//      on hidden reasoning. Retry the same prompt with CONTINUATION_MAX_TOKENS.
|
||||||
|
// B) Truncated response (finish_reason=length) — answer got cut off
|
||||||
|
// mid-sentence. Pass the partial back as scratchpad and ask the
|
||||||
|
// model to continue from where it stopped.
|
||||||
|
//
|
||||||
|
// Stitching: keep appending content across retries; prompt_tokens and
|
||||||
|
// completion_tokens accumulate; finish_reason reflects the LAST call.
|
||||||
|
// Loop exits on the first call whose finish_reason is not "length" and that has
|
||||||
|
// non-empty content, OR when retries hit the cap.
|
||||||
async function generateContinuable(
|
async function generateContinuable(
|
||||||
opts: Parameters<typeof chat>[0] & { max_continuations?: number },
|
opts: Parameters<typeof chat>[0] & { max_continuations?: number },
|
||||||
): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number }> {
|
): Promise<{ content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string }> {
|
||||||
const maxCont = opts.max_continuations ?? 1;
|
const maxCont = opts.max_continuations ?? 1;
|
||||||
let total = await chat(opts);
|
let total = await chat(opts);
|
||||||
let retries = 0;
|
let retries = 0;
|
||||||
if (total.content.length === 0 || total.finish_reason === "length") {
|
while (retries < maxCont && (total.content.length === 0 || total.finish_reason === "length")) {
|
||||||
for (let i = 0; i < maxCont && (total.content.length === 0 || total.finish_reason === "length"); i++) {
|
|
||||||
retries += 1;
|
retries += 1;
|
||||||
log(` continuation retry ${retries} (reason: ${total.finish_reason}, content=${total.content.length})`);
|
const mode = total.content.length === 0 ? "empty" : "truncated";
|
||||||
|
log(` continuation retry ${retries}/${maxCont} (${mode}: finish=${total.finish_reason}, content=${total.content.length} chars)`);
|
||||||
|
// Continuation prompt — branch on failure mode:
|
||||||
|
//   empty → retry with CONTINUATION_MAX_TOKENS, same prompt (thinking budget)
|
||||||
|
// length → pass the partial as assistant turn, ask to continue
|
||||||
|
const continuationMessages = total.content.length === 0
|
||||||
|
? opts.messages
|
||||||
|
: [
|
||||||
|
...opts.messages,
|
||||||
|
{ role: "assistant", content: total.content },
|
||||||
|
{ role: "user", content: "Continue from exactly where you stopped. Do not repeat. Finish the answer." },
|
||||||
|
];
|
||||||
const continued = await chat({
|
const continued = await chat({
|
||||||
...opts,
|
...opts,
|
||||||
max_tokens: opts.max_tokens * 2,
|
max_tokens: CONTINUATION_MAX_TOKENS,
|
||||||
messages: [
|
messages: continuationMessages,
|
||||||
...opts.messages,
|
|
||||||
...(total.content ? [{ role: "assistant", content: total.content }, { role: "user", content: "Continue from where you stopped. Complete the answer." }] : []),
|
|
||||||
],
|
|
||||||
});
|
});
|
||||||
total = {
|
total = {
|
||||||
content: total.content + continued.content,
|
content: total.content + continued.content,
|
||||||
@ -181,7 +230,6 @@ async function generateContinuable(
|
|||||||
finish_reason: continued.finish_reason,
|
finish_reason: continued.finish_reason,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
|
||||||
return { ...total, continuation_retries: retries };
|
return { ...total, continuation_retries: retries };
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -237,47 +285,90 @@ async function runIteration(
|
|||||||
citationsReceived = priorPlaybookIds.slice();
|
citationsReceived = priorPlaybookIds.slice();
|
||||||
}
|
}
|
||||||
|
|
||||||
// 5. Call cloud. On empty/error → rescue.
|
// 5. TASK-LEVEL RETRY LOOP — per J's clarification 2026-04-22.
|
||||||
// Intentional failure-injection on iter INJECT_FAIL_ON_ITER:
|
// Try the task up to MAX_TASK_RETRIES times. Each retry:
|
||||||
// a deliberately-invalid model name on the primary call so the
|
// a) Picks a model (normally CLOUD_MODEL; on INJECT_FAIL_ON_ITER,
|
||||||
// rescue path actually runs. Proves the catch-and-rescue isn't
|
// cycles through 5 invalid models + 1 valid to force full loop)
|
||||||
// dead code.
|
// b) Injects prior attempt errors as learning context
|
||||||
let result: { content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number };
|
// c) If the attempt succeeds (non-empty, >100 chars), loop exits
|
||||||
|
// d) Otherwise, records failure and tries again with the learning
|
||||||
|
//
|
||||||
|
// Cap at 6 so we don't spin forever on unsolvable tasks.
|
||||||
|
let result: { content: string; prompt_tokens: number; completion_tokens: number; continuation_retries: number; finish_reason: string } | null = null;
|
||||||
let rescueTriggered = false;
|
let rescueTriggered = false;
|
||||||
const primaryModel = iteration === INJECT_FAIL_ON_ITER
|
const taskAttemptHistory: Array<{ n: number; model: string; error: string }> = [];
|
||||||
? "deliberately-invalid-model-to-force-rescue"
|
const forceRetries = iteration === INJECT_FAIL_ON_ITER;
|
||||||
|
if (forceRetries) log(` FORCING TASK-RETRY LOOP — iter ${iteration} will cycle through 5 invalid models + 1 valid`);
|
||||||
|
|
||||||
|
for (let attempt = 1; attempt <= MAX_TASK_RETRIES; attempt++) {
|
||||||
|
const modelForAttempt = forceRetries
|
||||||
|
? FORCE_RETRY_MODEL_SEQUENCE[attempt - 1]
|
||||||
: CLOUD_MODEL;
|
: CLOUD_MODEL;
|
||||||
if (iteration === INJECT_FAIL_ON_ITER) {
|
// Compose a prior-attempts learning block for attempts 2+
|
||||||
log(` INJECTED FAILURE on primary call — model="${primaryModel}" will 400/500`);
|
const learningBlock = taskAttemptHistory.length > 0
|
||||||
|
? `\n\n═══ PRIOR ATTEMPTS THIS TASK (do NOT repeat these failures; adjust approach) ═══\n${taskAttemptHistory.map(a => `Attempt ${a.n} (model ${a.model}) failed: ${a.error.slice(0, 160)}`).join("\n")}\n═══ end prior attempts ═══\n`
|
||||||
|
: "";
|
||||||
|
log(` task attempt ${attempt}/${MAX_TASK_RETRIES}: model=${modelForAttempt}${learningBlock ? " [with prior-failure context]" : ""}`);
|
||||||
|
try {
|
||||||
|
const r = await generateContinuable({
|
||||||
|
provider: "ollama_cloud",
|
||||||
|
model: modelForAttempt,
|
||||||
|
messages: [
|
||||||
|
{ role: "system", content: "You answer questions about the Lakehouse PRD using only the provided source material and prior iteration answers. Be specific. Cite chunk offsets OR [pb:ID] markers. Write a detailed 250-word answer." },
|
||||||
|
{ role: "user", content: `Question: ${question}\n\nSource material:\n${contextForPrompt}${citationBlock}${learningBlock}` },
|
||||||
|
],
|
||||||
|
max_tokens: PRIMARY_MAX_TOKENS,
|
||||||
|
think: false,
|
||||||
|
max_continuations: MAX_CONTINUATIONS,
|
||||||
|
});
|
||||||
|
cloudCallsTotal += 1 + r.continuation_retries;
|
||||||
|
if (r.content && r.content.length > 100) {
|
||||||
|
// Acceptable answer — exit loop
|
||||||
|
result = r;
|
||||||
|
if (attempt > 1) {
|
||||||
|
log(` task attempt ${attempt} SUCCEEDED (${r.content.length} chars) after ${attempt - 1} prior failures`);
|
||||||
|
rescueTriggered = true;
|
||||||
}
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// Thin response — count as failure with learning signal
|
||||||
|
const err = `thin-answer: ${r.content.length} chars, finish=${r.finish_reason}`;
|
||||||
|
taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
|
||||||
|
errorsRecovered.push(`attempt ${attempt}: ${err}`);
|
||||||
|
} catch (e) {
|
||||||
|
const err = (e as Error).message;
|
||||||
|
taskAttemptHistory.push({ n: attempt, model: modelForAttempt, error: err });
|
||||||
|
errorsRecovered.push(`attempt ${attempt}: ${err.slice(0, 120)}`);
|
||||||
|
cloudCallsTotal += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Last-ditch: if all 6 task attempts failed, try the local fallback
|
||||||
|
// once more so we at least return SOMETHING. This is the "don't get
|
||||||
|
// caught in a loop, accept best-so-far" rule J stated explicitly.
|
||||||
|
if (!result) {
|
||||||
|
errorsRecovered.push(`all ${MAX_TASK_RETRIES} task attempts failed — local fallback`);
|
||||||
|
rescueTriggered = true;
|
||||||
try {
|
try {
|
||||||
result = await generateContinuable({
|
result = await generateContinuable({
|
||||||
provider: "ollama_cloud",
|
provider: "ollama",
|
||||||
model: primaryModel,
|
model: "qwen3.5:latest",
|
||||||
messages: [
|
messages: [{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 4000)}` }],
|
||||||
{ role: "system", content: "You answer questions about the Lakehouse PRD using only the provided source material and prior iteration answers. Be specific. Cite chunk offsets OR [pb:ID] markers. Keep answers under 200 words." },
|
max_tokens: 300,
|
||||||
{ role: "user", content: `Question: ${question}\n\nSource material:\n${contextForPrompt}${citationBlock}` },
|
think: false,
|
||||||
],
|
|
||||||
max_tokens: 800,
|
|
||||||
think: true,
|
|
||||||
max_continuations: 2,
|
max_continuations: 2,
|
||||||
});
|
});
|
||||||
cloudCallsTotal += 1 + result.continuation_retries;
|
cloudCallsTotal += 1 + result.continuation_retries;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
errorsRecovered.push(`primary cloud call: ${(e as Error).message}`);
|
// Absolute last resort — fabricate a skeleton result
|
||||||
rescueTriggered = true;
|
result = {
|
||||||
log(` primary failed → rescue with ${RESCUE_MODEL}`);
|
content: `[task failed after ${MAX_TASK_RETRIES} retries + local fallback: ${(e as Error).message}]`,
|
||||||
result = await generateContinuable({
|
prompt_tokens: 0,
|
||||||
provider: "ollama_cloud",
|
completion_tokens: 0,
|
||||||
model: RESCUE_MODEL,
|
continuation_retries: 0,
|
||||||
messages: [
|
finish_reason: "error",
|
||||||
{ role: "system", content: "Answer briefly using the source material." },
|
};
|
||||||
{ role: "user", content: `Q: ${question}\n\n${contextForPrompt.slice(0, 8000)}` },
|
}
|
||||||
],
|
|
||||||
max_tokens: 500,
|
|
||||||
think: false,
|
|
||||||
});
|
|
||||||
cloudCallsTotal += 1 + result.continuation_retries;
|
|
||||||
}
|
}
|
||||||
if (result.content.length === 0) {
|
if (result.content.length === 0) {
|
||||||
errorsRecovered.push("even rescue returned empty — last-ditch local fallback");
|
errorsRecovered.push("even rescue returned empty — last-ditch local fallback");
|
||||||
@ -327,6 +418,8 @@ async function runIteration(
|
|||||||
cloud_calls_total: cloudCallsTotal,
|
cloud_calls_total: cloudCallsTotal,
|
||||||
continuation_retries: result.continuation_retries,
|
continuation_retries: result.continuation_retries,
|
||||||
rescue_triggered: rescueTriggered,
|
rescue_triggered: rescueTriggered,
|
||||||
|
task_attempts_made: taskAttemptHistory.length + 1, // +1 for the successful attempt
|
||||||
|
task_retry_history: taskAttemptHistory,
|
||||||
playbook_id,
|
playbook_id,
|
||||||
tokens_prompt: result.prompt_tokens,
|
tokens_prompt: result.prompt_tokens,
|
||||||
tokens_completion: result.completion_tokens,
|
tokens_completion: result.completion_tokens,
|
||||||
@ -402,6 +495,9 @@ async function main() {
|
|||||||
rescues_triggered: results.filter(r => r.rescue_triggered).length,
|
rescues_triggered: results.filter(r => r.rescue_triggered).length,
|
||||||
iter6_received_prior_ids: results[5]?.citations_from_prior_iterations.length ?? 0,
|
iter6_received_prior_ids: results[5]?.citations_from_prior_iterations.length ?? 0,
|
||||||
iter6_actually_cited_in_answer: citationsHonored,
|
iter6_actually_cited_in_answer: citationsHonored,
|
||||||
|
iter3_task_attempts: results[2]?.task_attempts_made ?? 0,
|
||||||
|
iter3_task_retries: results[2]?.task_retry_history.length ?? 0,
|
||||||
|
max_task_attempts_any_iter: Math.max(...results.map(r => r.task_attempts_made)),
|
||||||
total_duration_ms: results.reduce((s, r) => s + r.duration_ms, 0),
|
total_duration_ms: results.reduce((s, r) => s + r.duration_ms, 0),
|
||||||
overall: results.length === 6 && results.every(r => r.playbook_id !== null) ? "PASS" : "PARTIAL",
|
overall: results.length === 6 && results.every(r => r.playbook_id !== null) ? "PASS" : "PARTIAL",
|
||||||
};
|
};
|
||||||
@ -411,7 +507,8 @@ async function main() {
|
|||||||
log(`══════ SUMMARY ${summary.overall} ══════`);
|
log(`══════ SUMMARY ${summary.overall} ══════`);
|
||||||
log(` 6 iterations, ${summary.total_cloud_calls} cloud calls, ${summary.total_errors_recovered} errors recovered`);
|
log(` 6 iterations, ${summary.total_cloud_calls} cloud calls, ${summary.total_errors_recovered} errors recovered`);
|
||||||
log(` tree-splits: ${summary.tree_splits_fired}/6 continuations: ${summary.total_continuation_retries} rescues: ${summary.rescues_triggered}`);
|
log(` tree-splits: ${summary.tree_splits_fired}/6 continuations: ${summary.total_continuation_retries} rescues: ${summary.rescues_triggered}`);
|
||||||
log(` iter 6 cited ${summary.iter6_cited_prior} prior iteration playbooks`);
|
log(` iter 6 received ${summary.iter6_received_prior_ids} prior IDs, cited ${summary.iter6_actually_cited_in_answer} [pb:...] markers in its answer`);
|
||||||
|
log(` iter 3 task-retry loop: ${summary.iter3_task_attempts} attempts (${summary.iter3_task_retries} prior-failure retries before success)`);
|
||||||
log(` total duration: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
|
log(` total duration: ${(summary.total_duration_ms / 1000).toFixed(1)}s`);
|
||||||
log("");
|
log("");
|
||||||
for (const r of results) {
|
for (const r of results) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user