LAKEHOUSE/scripts/scrum_review.sh
root e8cf113af8 gauntlet 2026-05-02: smoke chain + per-component scrum + parity probe
Production-readiness gauntlet exploiting the dual Rust/Go
implementation as a measurement instrument.

## Phase 1 — Full smoke chain
21/21 PASS in ~60s. Substrate intact across the full service surface.

## Phase 2 — Per-component scrum (token-volume fix)
Prior wave (165KB diff): Kimi 62 tokens out, Qwen 297 → no useful
analysis. This wave splits today's commits into 4 focused bundles
(36-71KB each):
  c1 validatord (46KB) → 0 convergent / 11 distinct
  c2 vectord substrate (36KB) → 0 convergent / 10 distinct
  c3 materializer (71KB) → 0 convergent / 6 distinct (Opus emitted
                           a BLOCK then self-retracted in same response)
  c4 replay (45KB) → 0 convergent / 10 distinct

Reviewer engagement vs prior wave: Kimi went 62 → ~250 tokens out
once bundles dropped below 60KB.

scripts/scrum_review.sh hardening:
  * Diff-size guard (warn >60KB, hard-fail >100KB,
    SCRUM_FORCE_OVERSIZE=1 override)
  * Tightened prompt — file path must appear EXACTLY as in diff
    so post-processor can grep WHERE: lines reliably
  * Auto-tally step dedupes by (reviewer, location); convergence
    counts distinct lineages (closes the prior `opus+opus+opus`
    false-convergence bug)
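
The (reviewer, location) dedupe can be sketched in a few lines of awk
(illustrative finding lines, not real verdict data — the real tally
also tracks the highest severity per location):

```bash
# Three findings: opus flags cmd/foo/main.go:42 twice (self-repeat),
# kimi flags it once, qwen flags a different file. Dedupe by
# (location, reviewer), then count distinct reviewers per location.
printf '%s\n' \
  'opus|BLOCK|cmd/foo/main.go:42' \
  'opus|WARN|cmd/foo/main.go:42' \
  'kimi|WARN|cmd/foo/main.go:42' \
  'qwen|INFO|pkg/bar/bar.go:7' \
| awk -F'|' '{
    if (!(($3, $1) in seen)) { seen[$3, $1] = 1; hits[$3]++ }
  }
  END { for (k in hits) print k, hits[k] }' \
| sort
# cmd/foo/main.go:42 2   <- convergent (opus+kimi; opus self-repeat dropped)
# pkg/bar/bar.go:7 1     <- single-lineage catch
```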

## Phase 3 — Cross-runtime validator parity probe (the headline finding)
scripts/cutover/parity/validator_parity.sh sends 6 identical
/v1/validate cases to Rust :3100 AND Go :4110, compares status+body.

Result: **6/6 status codes match · 5/6 body shapes diverge.**

Rust returns serde-tagged enum:   {"Schema":{"field":"x","reason":"y"}}
Go returns flat exported-fields:  {"Kind":"schema","Field":"x","Reason":"y"}
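
The drift is visible without the daemons running. A minimal sketch in
the spirit of the probe (bodies inlined here for illustration; this is
not the probe's actual comparison logic):

```bash
# Same logical validation error, two wire shapes. Top-level key sets
# alone expose the drift (jq sorts keys alphabetically).
rust_body='{"Schema":{"field":"x","reason":"y"}}'
go_body='{"Kind":"schema","Field":"x","Reason":"y"}'
echo "$rust_body" | jq -c 'keys'   # ["Schema"]
echo "$go_body"   | jq -c 'keys'   # ["Field","Kind","Reason"]
[ "$(echo "$rust_body" | jq -c 'keys')" = "$(echo "$go_body" | jq -c 'keys')" ] \
  || echo "shape divergence"
```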

Both round-trip inside their own runtime; a caller swapping one for
the other would break parsing silently. Captured as new _open_ row
in docs/ARCHITECTURE_COMPARISON.md decisions tracker.

This is the "use the dual-implementation as a measurement instrument"
return — single-repo scrums can't catch this class of cross-runtime
drift.

## Phase 4 — Production assessment
Verdict: ship-with-known-gap. The validator wire-format gap is
documented, not a regression. The future fix is ~50 LOC on the Go side
(a custom MarshalJSON on ValidationError to match Rust's serde shape).
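
The shape mapping that fix must implement fits in one jq filter — a
sketch of a hypothetical caller-side shim, not the Go fix itself:

```bash
# Re-tag Go's flat error into Rust's externally-tagged serde shape:
# the capitalized Kind discriminant becomes the variant key, and the
# exported field names become lowercase payload keys.
echo '{"Kind":"schema","Field":"x","Reason":"y"}' \
| jq -c '{((.Kind[0:1] | ascii_upcase) + .Kind[1:]):
          {field: .Field, reason: .Reason}}'
# {"Schema":{"field":"x","reason":"y"}}
```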

Persistent stack config (/tmp/lakehouse-persistent.toml) gains
validatord on :3221 + persistent-validatord binary so operators
bringing up the persistent stack get the new daemon automatically.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 04:05:18 -05:00

240 lines · 9.5 KiB · Bash · executable

#!/usr/bin/env bash
# Cross-lineage scrum review driver.
#
# Per feedback_cross_lineage_review.md: Opus + Kimi + Qwen3-coder
# review the same diff via /v1/chat. Convergent findings (≥2
# reviewers) = high-signal real bugs; single-reviewer = lineage
# catch.
#
# Usage:
# ./scripts/scrum_review.sh <bundle.diff> <bundle_label>
#
# Outputs verbatim verdicts to:
# reports/scrum/_evidence/<DATE>/verdicts/<bundle>_<reviewer>.md
set -euo pipefail
cd "$(dirname "$0")/.."
DIFF_FILE="${1:-}"
BUNDLE_LABEL="${2:-}"
DATE="${SCRUM_DATE:-$(date +%Y-%m-%d)}"
GATEWAY="${LH_GATEWAY:-http://127.0.0.1:3110}"
OUT_DIR="reports/scrum/_evidence/${DATE}/verdicts"
if [ -z "$DIFF_FILE" ] || [ ! -f "$DIFF_FILE" ]; then
echo "usage: $0 <diff_file> <bundle_label>"
echo "example: $0 reports/scrum/_evidence/2026-04-30/diffs/bundle_A_config_refactor.diff config_refactor"
exit 1
fi
mkdir -p "$OUT_DIR"
DIFF_BYTES=$(wc -c < "$DIFF_FILE")
DIFF_LINES=$(wc -l < "$DIFF_FILE")
echo "[scrum] $BUNDLE_LABEL$DIFF_LINES lines · $DIFF_BYTES bytes · 3 reviewers"
# Diff-size guard. Per the 2026-05-02 disposition: a 165KB bundle
# produced 0 convergent findings + 3 confabulated BLOCKs because Kimi
# and Qwen gave up at <300 output tokens (input-token spent on
# scanning, not analysis). Sweet spot per per-component runs is
# ≤60KB. SCRUM_FORCE_OVERSIZE=1 lets operators override for cases
# where splitting isn't possible.
if [ "$DIFF_BYTES" -gt 100000 ] && [ "${SCRUM_FORCE_OVERSIZE:-0}" != "1" ]; then
echo "[scrum] ABORT: diff is ${DIFF_BYTES} bytes (>100KB)."
echo " Big diffs make Kimi/Qwen give up early — split into"
echo " per-component bundles ≤60KB each, then re-run."
echo " Override (NOT recommended): SCRUM_FORCE_OVERSIZE=1"
exit 2
fi
if [ "$DIFF_BYTES" -gt 60000 ]; then
echo "[scrum] WARN: diff is ${DIFF_BYTES} bytes (>60KB) — non-Opus"
echo " lineages may produce thin output. Per-component split"
echo " is preferred. Continuing."
fi
# System prompt — same shape as the Rust auditor's review template,
# tightened per feedback_cross_lineage_review.md (lead with verdict).
SYSTEM='You are a senior code reviewer in a 3-lineage cross-review.
Your verdict feeds a convergent-finding gate (≥2 reviewers = real
bug). Be terse, evidence-based, and lead with the verdict.
For each finding, output one block. The format is STRICT — a
post-processor greps WHERE: lines across all 3 reviewers to find
convergent findings, so the file path must appear EXACTLY as it
does in the diff (e.g. `cmd/foo/main.go:42`, not `foo/main.go:42`).
SEVERITY: BLOCK | WARN | INFO
WHERE: <relative/path/from/repo/root>:<line_or_symbol>
WHAT: one-sentence description
WHY: one-sentence rationale grounded in the diff
Severity ladder:
- BLOCK = ship-blocker. Correctness bug, security flaw, broken
contract, lost test coverage, panic on real input, secret
leak. Worth blocking the PR.
- WARN = real but non-blocking. Race condition under specific
load, missing edge case, weak naming making future bugs
likely, regression risk in adjacent code.
- INFO = nit / style / better-name suggestion / dead-code remnant.
Skip the analysis preamble. Lead with the first BLOCK/WARN/INFO
block. End with a final "VERDICT:" line of "ship | ship-with-fixes
| hold" + a ≤15-word summary.
Never invent line numbers — only cite lines the diff shows.
Never repeat a file:line in two findings — combine them.'
REVIEWERS=(
"opus|opencode/claude-opus-4-7"
"kimi|openrouter/moonshotai/kimi-k2-0905"
"qwen|openrouter/qwen/qwen3-coder"
)
DIFF_CONTENT=$(cat "$DIFF_FILE")
run_review() {
local short="$1" model="$2"
local out="$OUT_DIR/${BUNDLE_LABEL}_${short}.md"
local user="Review the following diff. Bundle: $BUNDLE_LABEL.
\`\`\`diff
$DIFF_CONTENT
\`\`\`"
printf " %-6s %s ... " "$short" "$model"
local t0=$SECONDS
local status
# Build the body via temp files — both jq's --arg AND curl's
# --data run into the kernel's argv limit (~128KB) when the diff
# is large. Voice-ai full bundle was 156K and hit it twice.
# Piping through files (and using --rawfile for jq) sidesteps both.
local body_file user_file sys_file
body_file=$(mktemp); user_file=$(mktemp); sys_file=$(mktemp)
printf '%s' "$user" > "$user_file"
printf '%s' "$SYSTEM" > "$sys_file"
jq -n --arg model "$model" --rawfile sys "$sys_file" --rawfile user "$user_file" \
'{model:$model, max_tokens:4096, messages:[{role:"system",content:$sys},{role:"user",content:$user}]}' \
> "$body_file"
status=$(curl -sS -o /tmp/scrum_resp.json -w '%{http_code}' --max-time 240 \
-X POST "$GATEWAY/v1/chat" \
-H 'Content-Type: application/json' \
--data-binary "@$body_file")
rm -f "$body_file" "$user_file" "$sys_file"
local elapsed=$((SECONDS - t0))
if [ "$status" != "200" ]; then
printf "✗ HTTP %s (%ds)\n" "$status" "$elapsed"
head -c 300 /tmp/scrum_resp.json
echo
return 1
fi
local content latency tokens_in tokens_out
content=$(jq -r '.content' /tmp/scrum_resp.json)
latency=$(jq -r '.latency_ms' /tmp/scrum_resp.json)
tokens_in=$(jq -r '.input_tokens // 0' /tmp/scrum_resp.json)
tokens_out=$(jq -r '.output_tokens // 0' /tmp/scrum_resp.json)
{
echo "# Scrum review — $BUNDLE_LABEL$short ($model)"
echo
echo "**Latency:** ${latency}ms · **Tokens:** ${tokens_in} in / ${tokens_out} out · **Date:** ${DATE}"
echo
echo "---"
echo
echo "$content"
} > "$out"
printf "✓ %dms · %dt-out → %s\n" "$latency" "$tokens_out" "$out"
}
for r in "${REVIEWERS[@]}"; do
short="${r%%|*}"
model="${r#*|}"
run_review "$short" "$model" || true
done
# ─── Convergence tally ────────────────────────────────────────────
# Walk the 3 verdicts, extract WHERE: lines + their SEVERITY, dedupe
# across reviewers. Output a tally file showing what ≥2 reviewers
# flagged (real-bug signal) vs 1-reviewer (lineage catch / possibly
# confabulation).
TALLY="$OUT_DIR/${BUNDLE_LABEL}_tally.md"
{
echo "# Convergence tally — $BUNDLE_LABEL"
echo
echo "**Date:** ${DATE} · **Diff:** ${DIFF_LINES} lines / ${DIFF_BYTES} bytes"
echo
echo "## Findings by location"
echo
echo "| Reviewers | Severity | Where | Hits |"
echo "|---|---|---|---:|"
for v in "$OUT_DIR/${BUNDLE_LABEL}"_{opus,kimi,qwen}.md; do
[ -f "$v" ] || continue
short=$(basename "$v" .md | sed "s|.*${BUNDLE_LABEL}_||")
grep -E "^(SEVERITY|WHERE):" "$v" 2>/dev/null \
| awk -v r="$short" '
/^SEVERITY:/ { sev = $2; next }
/^WHERE:/ {
sub(/^WHERE: */, "")
# Drop trailing parenthetical ("(or <symbol>)") if it crept in.
# (Literal space-star, not \s — \s is a gawk extension, not POSIX awk.)
sub(/ *\(.*$/, "")
print r "|" sev "|" $0
}'
done | sort -u -t'|' -k1,1 -k3,3 \
| sort -t'|' -k3 \
| awk -F'|' '
# Aggregate by location. Dedup reviewers within a location
# (multiple findings from the same lineage at the same WHERE
# collapse to a single entry — that is reviewer self-repeat,
# not convergence). Track distinct reviewers + their highest
# severity across that location.
function rank(s) { return s == "BLOCK" ? 3 : s == "WARN" ? 2 : 1 }
function sevname(r) { return r == 3 ? "BLOCK" : r == 2 ? "WARN" : "INFO" }
{
key=$3
if (!(key in seen)) { seen[key]=""; sev_rank[key]=0 }
# split seen[key] on ";" and check if reviewer already present
present=0
n=split(seen[key], a, ";")
for (i=1;i<=n;i++) if (a[i]==$1) { present=1; break }
if (!present) {
seen[key] = seen[key] == "" ? $1 : seen[key] ";" $1
distinct_n[key]++
}
r = rank($2)
if (r > sev_rank[key]) { sev_rank[key]=r; sev_max[key]=$2 }
}
END {
for (k in distinct_n) {
# Reviewers column shows distinct lineages joined by "+"
gsub(";", "+", seen[k])
printf "%s|%s|%s|%d\n", seen[k], sev_max[k], k, distinct_n[k]
}
}
' \
| sort -t'|' -k4nr -k1 \
| awk -F'|' '{ printf "| %s | %s | `%s` | %d |\n", $1, $2, $3, $4 }'
echo
echo "(Convergent rows above are those whose Reviewers column contains a '+' — i.e. ≥2 lineages flagged the same location.)"
echo
echo "## Verdict line per reviewer"
echo
for v in "$OUT_DIR/${BUNDLE_LABEL}"_{opus,kimi,qwen}.md; do
[ -f "$v" ] || continue
short=$(basename "$v" .md | sed "s|.*${BUNDLE_LABEL}_||")
line=$(grep -E "^VERDICT:" "$v" 2>/dev/null | head -1)
echo "- **${short}**: ${line:-_no VERDICT line emitted_}"
done
} > "$TALLY"
echo "[scrum] tally → $TALLY"
# Convergent count from the tally body — count rows where the Hits
# column is ≥2 (distinct-reviewer count, after the awk dedup above).
CONV=$(awk -F'|' '$5 ~ /^ [0-9]+ $/ && ($5 + 0) >= 2 {n++} END {print n+0}' "$TALLY")
TOTAL=$(awk -F'|' '$5 ~ /^ [0-9]+ $/ {n++} END {print n+0}' "$TALLY")
# (The above scans rows of the tally table where the Hits column —
# cell 5 in `| reviewers | sev | where | hits |` — parses as int.)
# Fall back to a simpler check if the table parsing finds nothing.
if [ "$TOTAL" = "0" ]; then
TOTAL=$(grep -c "^| " "$TALLY" | awk '{print $1 - 1}') # subtract header row
CONV=$(awk -F'|' '/^\|/ && $5 ~ /[0-9]/ && ($5 + 0) >= 2 {n++} END {print n+0}' "$TALLY")
fi
echo "[scrum] $BUNDLE_LABEL: $CONV convergent / $TOTAL distinct findings"
echo "[scrum] $BUNDLE_LABEL complete"