From 52bb216c2d93a5cf3814b7e14e8ec5dc21e06c6c Mon Sep 17 00:00:00 2001 From: root Date: Sun, 26 Apr 2026 01:44:21 -0500 Subject: [PATCH] mode_compare: grounding check + control flag + emoji-tolerant section detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes after the playbook_only confabulation surfaced in 2026-04-26 experiment (8 'findings' on a 333-line file all citing lines 378-945 — fully fabricated from pathway-memory pattern names). (1) Aggregator regex bug — section detection failed on emoji-prefixed markdown headers like `## 🔎 Ranked Findings`. The original regex required word chars right after #{1,3}\s+, so the patches table header `## 🛠️ Concrete Patch Suggestions` was never recognized as a stop boundary, double-counting every finding. Fix: allow non-letter chars (emoji/space) between # and the keyword. (2) Grounding check — for each finding row in the response, extract backtick-quoted symbols + cited line numbers; verify symbols exist in the actual focus file and lines fall within EOF. Computes grounded/total ratio per mode. Surfaces 'OOB' (out-of-bounds) count explicitly so confabulation is visible at a glance. Confirms what hand-grading found: codereview_playbook_only's 8 findings on service.rs were 1/8 grounded with 7 OOB. (3) Control mode tagging — codereview_null and codereview_playbook_only are designed as falsifiers (baseline / lossy ceiling) and their numerical wins should never be read as recommendations. Output marks them with ⚗ glyph + warning footer. Per-mode aggregate now reports groundedness (grounded count + grnd %) as the primary metric, not raw count. Per-mode-vs-lakehouse comparison uses grounded findings, not raw — so confabulation can no longer score a "win". Updated SCRUM_MASTER_SPEC.md with refactor timeline pointing at the 2026-04-25/26 commits (observer fix, relevance filter, retire wire, mode router, enrichment runner, parameterized experiment). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/SCRUM_MASTER_SPEC.md | 43 +++++++- scripts/mode_compare.ts | 203 ++++++++++++++++++++++++++++++++------ 2 files changed, 214 insertions(+), 32 deletions(-) diff --git a/docs/SCRUM_MASTER_SPEC.md b/docs/SCRUM_MASTER_SPEC.md index 2de95f5..06a2f88 100644 --- a/docs/SCRUM_MASTER_SPEC.md +++ b/docs/SCRUM_MASTER_SPEC.md @@ -1,8 +1,7 @@ # Scrum Master Pipeline โ€” Spec + Current State **Status:** Active iteration on branch `scrum/auto-apply-19814` โ†’ PR #11 at git.agentview.dev/profit/lakehouse -**Last iter:** 9 (2026-04-24) -**Branch commit head:** `f4cff66` (ADR-021 Phase D fix) +**Branch commit head:** see `git log --oneline -1 scrum/auto-apply-19814` (auto-stale; check it) > **2026-04-25 โ€” see also `docs/MATRIX_AGENT_HANDOVER.md`** for the > standalone `matrix-agent-validated` repo split + the Ansible playbook @@ -10,7 +9,45 @@ > (partial deploy); the real destination is the `matrix-test` Incus > container at 10.111.129.50. -This doc is the single handoff artifact for the scrum-master + auto-apply + pathway-memory loop built during 2026-04-24 sessions. A fresh Claude Code session reading this + `docs/DECISIONS.md` (ADR-020 and ADR-021 specifically) + `MEMORY.md` should have the same context as the session that wrote it. +This doc is the single handoff artifact for the scrum-master + auto-apply + pathway-memory loop. A fresh Claude Code session reading this + `docs/DECISIONS.md` (ADR-020 and ADR-021) + `docs/MATRIX_AGENT_HANDOVER.md` + `MEMORY.md` should have the same context as the session that wrote it. + +## โ–ถ Refactor timeline (read in order) + +The pipeline has been refactored substantially since the 2026-04-24 +baseline below. 
Read the changes top-down to understand current shape: + +### 2026-04-23 โ†’ 24 (foundation, captured in ยง1-ยง12 below) +- 9-rung cloud ladder + tree-split + adversarial prompt +- Pathway memory base + ADR-021 semantic-correctness layer +- Hardened auto-applier (5 gates: confidence/size/cargo/warnings/rationale) +- Hand-review wire (commit `3f166a5`) โ€” judgment moved out of inner loop +- Anchor-grounding post-verifier (commit `9cc0ceb` / `9ecc584`) +- Single-model retry with enrichment (commit `d187bcd`) โ€” stop cascading on quality +- Unified matrix retriever pulling from ALL KB corpora (commit `a496ced`) +- Paid OpenRouter ladder + Kimi K2.6 + Gemini 2.5 (commit `4ac5656`) +- Goal-driven autonomous loop harness (commit `e79e51e`) + +### 2026-04-25 โ†’ 26 (architectural through-line โ€” read these next) +- **Observer health-probe TypeConfusion fix** (`54689d5`) โ€” `r.json()` on text/plain `/health` was crash-looping the observer; sealed in pathway_memory as `TypeConfusion:fetch-health-json`. +- **Adjacency-pollution relevance filter** (`0115a60`) โ€” observer `/relevance` endpoint + scrum wiring (`LH_RELEVANCE_FILTER` / `LH_RELEVANCE_THRESHOLD`). Drops chunks about symbols the focus file IMPORTS but doesn't define. +- **Audit-consensus โ†’ retire wire** (`626f18d`) โ€” when observer rejects a hot-swap-recommended attempt, immediately call `/vectors/pathway/retire` on the trace. `HotSwapCandidate` gained `trace_uid` for single-trace precision. Confidence โ‰ฅ0.7 gate avoids retiring on heuristic-fallback verdicts. +- **`/v1/mode` router phase 1** (`d277efb`) โ€” task_class โ†’ mode/model decision endpoint with `config/modes.toml`. Decision-only; doesn't execute. +- **Native enrichment runner** (`86f63a0`) โ€” `codereview_lakehouse` mode that COMPOSES every primitive (focus file + bug fingerprints + relevance-filtered matrix + adversarial framing) into ONE prompt for one-shot success. `POST /v1/mode/execute`. 
Modes-as-prompt-molders, not model-pickers โ€” see โ˜… Insight from session 2026-04-26. +- **Parameterized runner + 5 experiment modes** (`7c47734`) โ€” `codereview_lakehouse|null|isolation|matrix_only|playbook_only`. Each isolates one architectural axis. `scripts/mode_experiment.ts` sweeps files ร— modes; `scripts/mode_compare.ts` aggregates with grounding check (catches confabulation by comparing cited symbols to real file content). +- **Scrum mode-runner fast path** (`7c47734`) โ€” gated by `LH_USE_MODE_RUNNER=1`, scrum tries `/v1/mode/execute` BEFORE the 9-rung ladder. Falls through to ladder if response < `LH_MODE_MIN_CHARS` or anything errors. Off by default until A/B-validated. + +### Verified architectural insights (2026-04-26 experiment) +- `codereview_lakehouse` produces 100% grounded findings, beats every challenger. +- `codereview_playbook_only` (pathway-only, no file content) confabulates ~50% of findings โ€” keep as control, NEVER as recommendation. +- `codereview_null` (no enrichment, generic prompt) produces 0 ranked findings โ€” adversarial framing is load-bearing. +- Matrix corpus contributes ~2 grounded findings vs isolation. Small but real. + +### Where to read what +- **Loop architecture (this doc, ยง1-ยง12):** original 2026-04-24 design. +- **Modes-as-enrichment vision:** `crates/gateway/src/v1/mode.rs` doc comment + `config/modes.toml`. +- **Mode experiment results:** `data/_kb/mode_experiments.jsonl` + `bun run scripts/mode_compare.ts`. +- **Pathway memory mechanics:** `crates/vectord/src/pathway_memory.rs` + ADR-021 in `docs/DECISIONS.md`. +- **Handover to fresh box:** `docs/MATRIX_AGENT_HANDOVER.md`. ## 1. 
What the loop is diff --git a/scripts/mode_compare.ts b/scripts/mode_compare.ts index 8560675..ddc0877 100644 --- a/scripts/mode_compare.ts +++ b/scripts/mode_compare.ts @@ -76,10 +76,31 @@ function loadRows(path: string, since: string | null): Row[] { } function countFindings(md: string): number { - // Markdown table rows that look like findings: `| | ...` or `| **N** | ...` - // Heuristic โ€” adversarial framing produces ranked tables. - const matches = md.match(/^\|\s*\*?\*?\d+\*?\*?\s*\|/gm); - return matches ? matches.length : 0; + // Adversarial framing produces a "Ranked Findings" table early in + // the output. The original regex `^\|\s*\*?\*?\d+\*?\*?\s*\|` matched + // ANY numbered table row โ€” including the patch table that follows + // the findings table, double-counting every finding. + // + // Fix: only count rows under a "Ranked Findings" / "Findings" header + // until we hit the next ## heading or a "Patch" / "Suggestion" header. + // Falls back to the loose count if no findings header is detected + // (some modes use different framing). + const findingsSectionRe = /(?:^|\n)#{1,3}[^\na-zA-Z]*(?:Ranked\s+)?Findings?[^\n]*\n/i; + const m = md.match(findingsSectionRe); + if (m && m.index !== undefined) { + const after = md.slice(m.index + m[0].length); + // Stop at the next ## heading or Patch/Suggestion header. + // Allow non-letter chars (emoji/space) between # and the keyword + // so headers like `## ๐Ÿ› ๏ธ Concrete Patch Suggestions` get caught. + const stopRe = /\n#{1,3}[^\na-zA-Z]*(?:Patch|Suggestion|Reference|Summary|Concrete)/i; + const stop = after.search(stopRe); + const section = stop >= 0 ? after.slice(0, stop) : after; + const rows = section.match(/^\|\s*\*?\*?\d+\*?\*?\s*\|/gm); + return rows ? rows.length : 0; + } + // Fallback for outputs without a labeled findings section. + const all = md.match(/^\|\s*\*?\*?\d+\*?\*?\s*\|/gm); + return all ? 
all.length : 0; } function countPathwayCitations(md: string): number { @@ -88,12 +109,116 @@ function countPathwayCitations(md: string): number { return (md.match(re) ?? []).length; } +// ─── Grounding check ─── +// A finding is "grounded" if the symbols it cites actually exist in +// the focus file AND any cited line numbers fall within the file's +// real line count. Anti-pollution measure surfaced 2026-04-26 after +// codereview_playbook_only produced 8 findings citing lines 378-945 +// in a 332-line file (all hallucinated from pathway-memory preamble +// since the mode doesn't pass file content to the model). + +interface GroundingResult { + total: number; + grounded: number; + partial: number; + hallucinated: number; + out_of_bounds_lines: number; // findings citing lines past EOF + details: { row: string; verdict: string }[]; +} + +function extractFindings(md: string): { full: string; symbols: string[]; lines: number[] }[] { + // Pull each finding row from the Findings section (uses the same + // emoji-tolerant section-detection logic as countFindings). + const findingsSectionRe = /(?:^|\n)#{1,3}[^\na-zA-Z]*(?:Ranked\s+)?Findings?[^\n]*\n/i; + const m = md.match(findingsSectionRe); + let section = md; + if (m && m.index !== undefined) { + const after = md.slice(m.index + m[0].length); + const stopRe = /\n#{1,3}[^\na-zA-Z]*(?:Patch|Suggestion|Reference|Summary|Concrete)/i; + const stop = after.search(stopRe); + section = stop >= 0 ? 
after.slice(0, stop) : after; + } + const rows = section.split("\n").filter(l => /^\|\s*\*?\*?\d+\*?\*?\s*\|/.test(l)); + return rows.map(row => { + // Symbols: backtick-quoted identifiers, also bare snake_case_words + const sym = new Set<string>(); + for (const t of row.matchAll(/`([A-Za-z_][A-Za-z0-9_:]*)`/g)) sym.add(t[1]); + for (const t of row.matchAll(/\b([a-z][a-z0-9_]{4,})\b/g)) { + const w = t[1]; + // Filter common words that aren't symbols + if (!["score", "level", "match", "table", "value", "where", "found", "would", "after", "before", "calls", "needs", "patch", "should", "missing", "violation", "evidence", "checks", "either", "audit", "later", "field", "rules", "stage", "early", "always", "later", "could", "leaks", "memory"].includes(w)) { + sym.add(w); + } + } + // Line numbers from `path:NNN` or `:NNN-NNN` patterns + const lineNums: number[] = []; + for (const t of row.matchAll(/[:‑\-](\d{2,5})(?:[‑\-](\d{2,5}))?/g)) { + lineNums.push(parseInt(t[1])); + if (t[2]) lineNums.push(parseInt(t[2])); + } + return { full: row, symbols: [...sym], lines: lineNums }; + }); +} + +function checkGrounding(md: string, fileContent: string | null): GroundingResult { + const findings = extractFindings(md); + if (!fileContent) { + return { total: findings.length, grounded: 0, partial: 0, hallucinated: 0, out_of_bounds_lines: 0, details: [] }; + } + const fileLines = fileContent.split("\n").length; + const result: GroundingResult = { + total: findings.length, grounded: 0, partial: 0, hallucinated: 0, out_of_bounds_lines: 0, details: [], + }; + for (const f of findings) { + const sym_hits = f.symbols.filter(s => fileContent.includes(s)); + const symbol_grounded = f.symbols.length === 0 ? 
false : sym_hits.length > 0; + const line_oob = f.lines.length > 0 && f.lines.some(l => l > fileLines); + if (line_oob) result.out_of_bounds_lines++; + let verdict: string; + if (sym_hits.length > 0 && !line_oob) { + result.grounded++; + verdict = `grounded (${sym_hits.length}/${f.symbols.length} syms hit)`; + } else if (sym_hits.length > 0 && line_oob) { + result.partial++; + verdict = `partial (real syms but lines >${fileLines} EOF)`; + } else if (symbol_grounded) { + result.partial++; + verdict = "partial"; + } else { + result.hallucinated++; + verdict = `hallucinated (0/${f.symbols.length} syms hit${line_oob ? `, lines>${fileLines}` : ''})`; + } + result.details.push({ row: f.full.slice(0, 80), verdict }); + } + return result; +} + +function readFileSafe(path: string): string | null { + try { + return readFileSync(path, "utf8"); + } catch { + return null; + } +} + function pad(s: string | number, n: number, right = false): string { const str = String(s); if (str.length >= n) return str.slice(0, n); return right ? " ".repeat(n - str.length) + str : str + " ".repeat(n - str.length); } +// Modes intentionally lossy or designed to expose architectural axes โ€” +// their numerical wins should NOT be read as recommendations. Tagged +// in the output so a glance at the matrix doesn't mislead. +const CONTROL_MODES = new Set([ + "codereview_null", // baseline โ€” no enrichment, generic framing + "codereview_playbook_only", // lossy โ€” pathway only, NO file content +]); + +function modeLabel(mode: string): string { + return CONTROL_MODES.has(mode) ? 
`${mode} โš—` : mode; +} + function main() { const { jsonl, since } = parseArgs(); const rows = loadRows(jsonl, since); @@ -113,52 +238,71 @@ function main() { const modesSorted = [...allModes].sort(); // Per-file matrix - console.log("\nโ•โ•โ• PER-FILE COMPARISON โ•โ•โ•\n"); + console.log("\nโ•โ•โ• PER-FILE COMPARISON โ•โ•โ•"); + console.log("(โš— = control/lossy mode โ€” wins should not be read as recommendations)\n"); for (const file of Object.keys(byFile).sort()) { console.log(`๐Ÿ“„ ${file}`); + const fileContent = readFileSafe(file); + const fileLines = fileContent ? fileContent.split("\n").length : 0; + console.log(` (file: ${fileLines} lines${fileContent === null ? ", NOT READABLE โ€” grounding skipped" : ""})`); console.log( - ` ${pad("mode", 28)} ${pad("resp", 6, true)} ${pad("findings", 8, true)} ${pad("path_cit", 8, true)} ${pad("ms", 7, true)} ${pad("mtx k/d", 9, true)} ${pad("bug_fp", 6, true)}` + ` ${pad("mode", 30)} ${pad("resp", 6, true)} ${pad("find", 5, true)} ${pad("ground", 9, true)} ${pad("hallu", 6, true)} ${pad("OOB", 4, true)} ${pad("path", 5, true)} ${pad("ms", 7, true)} ${pad("bug_fp", 6, true)}` ); - console.log(` ${"โ”€".repeat(28)} ${"โ”€".repeat(6)} ${"โ”€".repeat(8)} ${"โ”€".repeat(8)} ${"โ”€".repeat(7)} ${"โ”€".repeat(9)} ${"โ”€".repeat(6)}`); + console.log(` ${"โ”€".repeat(30)} ${"โ”€".repeat(6)} ${"โ”€".repeat(5)} ${"โ”€".repeat(9)} ${"โ”€".repeat(6)} ${"โ”€".repeat(4)} ${"โ”€".repeat(5)} ${"โ”€".repeat(7)} ${"โ”€".repeat(6)}`); for (const mode of modesSorted) { const r = byFile[file][mode]; if (!r) { - console.log(` ${pad(mode, 28)} ${pad("โ€”", 6, true)}`); + console.log(` ${pad(modeLabel(mode), 30)} ${pad("โ€”", 6, true)}`); continue; } const findings = countFindings(r.response); const cits = countPathwayCitations(r.response); - const mk = r.sources.matrix_chunks_kept ?? 0; - const md = r.sources.matrix_chunks_dropped ?? 0; const bf = r.sources.bug_fingerprints_count ?? 
0; + const grounding = checkGrounding(r.response, fileContent); + const groundedStr = grounding.total === 0 ? "—" : `${grounding.grounded}/${grounding.total}`; console.log( - ` ${pad(mode, 28)} ${pad(r.response_chars, 6, true)} ${pad(findings, 8, true)} ${pad(cits, 8, true)} ${pad(r.latency_ms, 7, true)} ${pad(`${mk}/${mk + md}`, 9, true)} ${pad(bf, 6, true)}` + ` ${pad(modeLabel(mode), 30)} ${pad(r.response_chars, 6, true)} ${pad(findings, 5, true)} ${pad(groundedStr, 9, true)} ${pad(grounding.hallucinated, 6, true)} ${pad(grounding.out_of_bounds_lines, 4, true)} ${pad(cits, 5, true)} ${pad(r.latency_ms, 7, true)} ${pad(bf, 6, true)}` ); } console.log(""); } - // Per-mode averages + // Per-mode averages — grounded findings is now the primary metric. + // avg_groundedness is the rate at which findings cite real symbols + // within file bounds. Modes with low groundedness are confabulating. console.log("═══ PER-MODE AGGREGATE ═══\n"); - console.log(` ${pad("mode", 28)} ${pad("n", 4, true)} ${pad("avg resp", 9, true)} ${pad("avg find", 9, true)} ${pad("avg cit", 8, true)} ${pad("avg ms", 8, true)}`); - console.log(` ${"─".repeat(28)} ${"─".repeat(4)} ${"─".repeat(9)} ${"─".repeat(9)} ${"─".repeat(8)} ${"─".repeat(8)}`); + console.log(` ${pad("mode", 30)} ${pad("n", 3, true)} ${pad("avg find", 9, true)} ${pad("avg grnd", 9, true)} ${pad("grnd %", 7, true)} ${pad("avg hallu", 10, true)} ${pad("avg ms", 7, true)}`); + console.log(` ${"─".repeat(30)} ${"─".repeat(3)} ${"─".repeat(9)} ${"─".repeat(9)} ${"─".repeat(7)} ${"─".repeat(10)} ${"─".repeat(7)}`); + const fileCache: Record<string, string | null> = {}; for (const mode of modesSorted) { const modeRows = rows.filter(r => r.mode === mode); if (modeRows.length === 0) continue; const n = modeRows.length; - const avgResp = Math.round(modeRows.reduce((s, r) => s + r.response_chars, 0) / n); - const avgFind = Math.round(10 * modeRows.reduce((s, r) => s + countFindings(r.response), 0) / n) / 10; - const 
avgCit = Math.round(10 * modeRows.reduce((s, r) => s + countPathwayCitations(r.response), 0) / n) / 10; + let totFind = 0, totGround = 0, totHallu = 0; + for (const r of modeRows) { + const fc = fileCache[r.file_path] ??= readFileSafe(r.file_path); + const g = checkGrounding(r.response, fc); + totFind += g.total; + totGround += g.grounded; + totHallu += g.hallucinated; + } + const avgFind = (totFind / n).toFixed(1); + const avgGround = (totGround / n).toFixed(1); + const grndPct = totFind > 0 ? `${Math.round(100 * totGround / totFind)}%` : "โ€”"; + const avgHallu = (totHallu / n).toFixed(1); const avgMs = Math.round(modeRows.reduce((s, r) => s + r.latency_ms, 0) / n); console.log( - ` ${pad(mode, 28)} ${pad(n, 4, true)} ${pad(avgResp, 9, true)} ${pad(avgFind, 9, true)} ${pad(avgCit, 8, true)} ${pad(avgMs, 8, true)}` + ` ${pad(modeLabel(mode), 30)} ${pad(n, 3, true)} ${pad(avgFind, 9, true)} ${pad(avgGround, 9, true)} ${pad(grndPct, 7, true)} ${pad(avgHallu, 10, true)} ${pad(avgMs, 7, true)}` ); } - // Mode-relative: how often does each mode produce MORE findings than lakehouse? - console.log("\nโ•โ•โ• MODE vs codereview_lakehouse (per file) โ•โ•โ•\n"); - console.log(` ${pad("mode", 28)} ${pad("wins", 5, true)} ${pad("losses", 7, true)} ${pad("ties", 5, true)} ${pad("ฮ” avg findings", 16, true)}`); - console.log(` ${"โ”€".repeat(28)} ${"โ”€".repeat(5)} ${"โ”€".repeat(7)} ${"โ”€".repeat(5)} ${"โ”€".repeat(16)}`); + // Mode-relative: GROUNDED findings vs lakehouse. The earlier raw + // finding-count comparison rewarded confabulation (more rows = more + // wins). Comparing grounded findings instead corrects for modes + // that produce convincing-but-fake output. 
+ console.log("\nโ•โ•โ• MODE vs codereview_lakehouse (grounded findings, per file) โ•โ•โ•\n"); + console.log(` ${pad("mode", 30)} ${pad("wins", 5, true)} ${pad("losses", 7, true)} ${pad("ties", 5, true)} ${pad("ฮ” avg grounded", 16, true)}`); + console.log(` ${"โ”€".repeat(30)} ${"โ”€".repeat(5)} ${"โ”€".repeat(7)} ${"โ”€".repeat(5)} ${"โ”€".repeat(16)}`); for (const mode of modesSorted) { if (mode === "codereview_lakehouse") continue; let wins = 0, losses = 0, ties = 0, totalDelta = 0, n = 0; @@ -166,21 +310,22 @@ function main() { const baseline = byFile[file]["codereview_lakehouse"]; const challenger = byFile[file][mode]; if (!baseline || !challenger) continue; - const bf = countFindings(baseline.response); - const cf = countFindings(challenger.response); - if (cf > bf) wins++; - else if (cf < bf) losses++; + const fc = fileCache[file] ??= readFileSafe(file); + const bg = checkGrounding(baseline.response, fc).grounded; + const cg = checkGrounding(challenger.response, fc).grounded; + if (cg > bg) wins++; + else if (cg < bg) losses++; else ties++; - totalDelta += cf - bf; + totalDelta += cg - bg; n++; } if (n === 0) continue; const avgDelta = (totalDelta / n).toFixed(1); console.log( - ` ${pad(mode, 28)} ${pad(wins, 5, true)} ${pad(losses, 7, true)} ${pad(ties, 5, true)} ${pad(avgDelta, 16, true)}` + ` ${pad(modeLabel(mode), 30)} ${pad(wins, 5, true)} ${pad(losses, 7, true)} ${pad(ties, 5, true)} ${pad(avgDelta, 16, true)}` ); } - console.log("\n[compare] done\n"); + console.log("\n[compare] done โ€” โš— marks lossy/control modes, exclude from recommendations\n"); } main();