diff --git a/tests/real-world/scrum_master_pipeline.ts b/tests/real-world/scrum_master_pipeline.ts
index f70a9a6..9323da7 100644
--- a/tests/real-world/scrum_master_pipeline.ts
+++ b/tests/real-world/scrum_master_pipeline.ts
@@ -27,6 +27,14 @@ const CHUNK_SIZE = 800;
 const CHUNK_OVERLAP = 120;
 const TOP_K_CONTEXT = 5;
 const MAX_ATTEMPTS = 6;
+// Files larger than this get tree-split instead of truncated. Fixes the
+// 6KB false-positive class (model claiming a field is "missing" when
+// it exists past the context cutoff).
+const FILE_TREE_SPLIT_THRESHOLD = 6000;
+const FILE_SHARD_SIZE = 3500;
+// Appended jsonl so auditor's kb_query can surface scrum findings for
+// files touched by a PR under review. Part of cohesion plan Phase C.
+const SCRUM_REVIEWS_JSONL = "/home/profit/lakehouse/data/_kb/scrum_reviews.jsonl";
 const OUT_DIR = `/home/profit/lakehouse/tests/real-world/runs/scrum_${Date.now().toString(36)}`;
 const PRD_PATH = "/home/profit/lakehouse/docs/PRD.md";
 
@@ -61,6 +69,8 @@ type Chunk = { id: string; text: string; embedding: number[]; origin: string; of
 interface FileReview {
   file: string;
   file_bytes: number;
+  tree_split_fired: boolean;
+  shards_summarized: number;
   top_prd_chunks: Array<{ origin: string; offset: number; score: number }>;
   top_proposal_chunks: Array<{ origin: string; offset: number; score: number }>;
   attempts_made: number;
@@ -153,6 +163,47 @@ function retrieveTopK(query_emb: number[], pool: Chunk[], k: number): Chunk[] {
     .map(x => ({ ...x.c, _score: x.score } as any));
 }
 
+// Tree-split a large file: shard it, summarize each shard against
+// the review question, merge into a scratchpad. Uses cloud because
+// the summarization step needs quality > speed. Returns the
+// scratchpad (full-file distillation) and the cloud-call count.
+async function treeSplitFile(
+  filePath: string,
+  content: string,
+): Promise<{ scratchpad: string; shards: number; cloud_calls: number }> {
+  const shards: Array<{ from: number; to: number; text: string }> = [];
+  for (let i = 0; i < content.length; i += FILE_SHARD_SIZE) {
+    const end = Math.min(i + FILE_SHARD_SIZE, content.length);
+    shards.push({ from: i, to: end, text: content.slice(i, end) });
+  }
+  let scratchpad = "";
+  let cloud_calls = 0;
+  log(`  tree-split: ${content.length} chars → ${shards.length} shards of ${FILE_SHARD_SIZE}`);
+  for (const [si, shard] of shards.entries()) {
+    const prompt = `You are summarizing ONE SHARD of a source file as part of a multi-shard review. File: ${filePath}. Shard ${si + 1}/${shards.length} (bytes ${shard.from}..${shard.to}).
+
+─────── shard source ───────
+${shard.text}
+─────── end shard ───────
+
+Scratchpad of prior shards (if empty, this is shard 1):
+${scratchpad || "(empty)"}
+
+Extract ONLY facts useful for reviewing this file against its PRD: function names + purposes, struct fields + types, invariants, edge cases, TODO markers, error-handling style. Under 150 words. No prose outside the extracted facts.`;
+    const r = await chat({
+      provider: "ollama_cloud",
+      model: "gpt-oss:120b",
+      prompt,
+      max_tokens: 400,
+    });
+    cloud_calls += 1;
+    if (r.content) {
+      scratchpad += `\n--- shard ${si + 1} (bytes ${shard.from}..${shard.to}) ---\n${r.content.trim()}`;
+    }
+  }
+  return { scratchpad, shards: shards.length, cloud_calls };
+}
+
 async function reviewFile(
   filePath: string,
   prd_chunks: Chunk[],
@@ -180,11 +231,39 @@
     ...topPlan.map(c => `[PLAN @${c.offset}]\n${c.text.slice(0, 600)}`),
   ].join("\n\n");
 
+  // Files bigger than FILE_TREE_SPLIT_THRESHOLD get tree-split.
+  // Summarize each shard to a scratchpad, then review against the
+  // scratchpad instead of the truncated first chunk. Prevents the
+  // false-positive pattern where the model claims a field is
+  // "missing" because it's past the context cutoff.
+  let sourceForPrompt: string;
+  let treeSplitFired = false;
+  let shardsSummarized = 0;
+  let extraCloudCalls = 0;
+  if (content.length > FILE_TREE_SPLIT_THRESHOLD) {
+    treeSplitFired = true;
+    const ts = await treeSplitFile(rel, content);
+    shardsSummarized = ts.shards;
+    extraCloudCalls = ts.cloud_calls;
+    sourceForPrompt = `[FULL-FILE SCRATCHPAD — distilled from ${ts.shards} shards via tree-split]\n${ts.scratchpad}`;
+  } else {
+    sourceForPrompt = content;
+  }
+
+  // Prompt — when tree-split fired, include an explicit instruction
+  // not to claim a field/function is "missing" because the scratchpad
+  // is a distillation not the full file. Attacks the rubric-tuning
+  // concern J called out.
+  const truncationWarning = treeSplitFired
+    ? `\nIMPORTANT: the "source" below is a multi-shard distillation (tree-split across ${shardsSummarized} shards), NOT the full raw file. DO NOT claim any field, function, or feature is "missing" based on its absence from this distillation — the distillation may have elided it. Only call out gaps that appear DIRECTLY contradicted by the PRD excerpts.\n`
+    : "";
+
   const baseTask = `You are reviewing one source file against the Lakehouse PRD and an active cohesion-integration plan.
 
-FILE: ${rel}
+FILE: ${rel} (${content.length} bytes${treeSplitFired ? `, tree-split into ${shardsSummarized} shards` : ""})
+${truncationWarning}
 ─────── source ───────
-${content.slice(0, 6000)}${content.length > 6000 ? "\n[... truncated after 6KB ...]" : ""}
+${sourceForPrompt}
 ─────── end source ───────
 
 ${contextBlock}
@@ -234,9 +313,11 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
     break;
   }
 
-  return {
+  const review: FileReview = {
     file: rel,
     file_bytes: content.length,
+    tree_split_fired: treeSplitFired,
+    shards_summarized: shardsSummarized,
     top_prd_chunks: topPrd.map(c => ({ origin: c.origin, offset: c.offset, score: (c as any)._score })),
     top_proposal_chunks: topPlan.map(c => ({ origin: c.origin, offset: c.offset, score: (c as any)._score })),
     attempts_made: history.length,
@@ -246,6 +327,31 @@ Respond with markdown. Be specific, not generic. Cite file-region + PRD-chunk-of
     suggestions: accepted ?? "[no acceptable answer after escalation ladder exhausted]",
     duration_ms: Date.now() - t0,
   };
+
+  // Append to the shared scrum-reviews jsonl so the auditor's
+  // kb_query check can surface relevant reviews for files in a
+  // PR diff. Cohesion plan Phase C wire.
+  if (accepted) {
+    const { appendFile, mkdir } = await import("node:fs/promises");
+    const { dirname } = await import("node:path");
+    await mkdir(dirname(SCRUM_REVIEWS_JSONL), { recursive: true });
+    const row = {
+      file: rel,
+      reviewed_at: new Date().toISOString(),
+      accepted_model: acceptedModel,
+      accepted_on_attempt: acceptedOn,
+      attempts_made: history.length,
+      tree_split_fired: treeSplitFired,
+      suggestions_preview: accepted.slice(0, 2000),
+    };
+    try {
+      await appendFile(SCRUM_REVIEWS_JSONL, JSON.stringify(row) + "\n");
+    } catch (e) {
+      console.error(`[scrum] failed to append scrum_reviews.jsonl: ${(e as Error).message}`);
+    }
+  }
+
+  return review;
 }
 
 async function loadAndChunk(path: string, origin_tag: string): Promise {