lakehouse/scrum

#!/usr/bin/env bash
# scrum — find gaps in current work-in-progress, push to KB.
#
# Usage:
#   ./scrum                    auto-bundle current diff vs origin/main, auto-label
#   ./scrum my_label           same, with explicit label
#   ./scrum --staged           bundle staged-only diff (pre-commit check)
#   ./scrum --since=COMMIT     bundle from a specific commit
#
# Output:
#   findings  → data/_kb/scrum_findings.jsonl  (one row per scrum run, KB-queryable)
#   verdicts  → reports/scrum/_evidence/$(date +%Y-%m-%d)/verdicts/<label>_*.md
#
# This is a TOOL J runs to find gaps. Findings flow to the KB. The KB
# informs how WE work on the code. Findings DO NOT auto-flow into
# architecture or design docs — that's J's call after reading them.
#
# Cloud models are used here BY DESIGN (3-lineage cross-review needs
# distinct training corpora). This is dev tooling, not the runtime
# hot path — PRD line 70 applies to customer requests, not to J's
# dev tools.
# Strict mode: abort on unhandled errors, unset variables, and failures
# anywhere in a pipeline.
set -euo pipefail
# Work from the repository root when inside a git checkout; otherwise stay
# in the current directory.
cd "$(git rev-parse --show-toplevel 2>/dev/null || pwd)"

LABEL=""
DIFF_MODE="branch"
SINCE_COMMIT=""

#######################################
# Parse the command line into the script-wide settings.
# Globals:   LABEL, DIFF_MODE, SINCE_COMMIT (written)
# Arguments: the script's arguments ("$@")
# Outputs:   header text on stdout for -h/--help; diagnostics on stderr
# Returns:   exits 0 after --help, exits 2 on invalid arguments
#######################################
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --staged)
        DIFF_MODE="staged"
        ;;
      --since=*)
        DIFF_MODE="since"
        SINCE_COMMIT="${1#--since=}"
        # An empty commit would silently turn into `git diff ...HEAD`.
        if [ -z "$SINCE_COMMIT" ]; then
          echo "[scrum] --since= requires a commit (e.g. --since=abc1234)" >&2
          exit 2
        fi
        ;;
      -h|--help)
        # Print the first 18 lines of this script (its header comment).
        sed -n '1,18p' "$0"
        exit 0
        ;;
      -*)
        # A mistyped flag must not be mistaken for a label.
        echo "[scrum] unknown option: $1 (see --help)" >&2
        exit 2
        ;;
      */*)
        # The label becomes part of file names (temp diff, tally), so a
        # path separator cannot work; fail here with a clear message.
        echo "[scrum] label must not contain '/': $1" >&2
        exit 2
        ;;
      *)
        LABEL="$1"
        ;;
    esac
    shift
  done

  # No explicit label: generate a timestamped one.
  if [ -z "$LABEL" ]; then
    LABEL="scrum_$(date +%Y%m%d_%H%M%S)"
  fi
}

parse_args "$@"

# Build the diff into a private temp file.
# The EXIT trap removes the file on every exit path of this shell, including
# aborts triggered by `set -e` (e.g. a failing `git diff`), so a failed run
# does not leave the bundle behind in the temp directory. The single quotes
# defer expansion of $DIFF_FILE until the trap actually fires.
DIFF_FILE=$(mktemp -t "scrum_${LABEL}.XXXXXX.diff")
trap 'rm -f -- "$DIFF_FILE"' EXIT
case "$DIFF_MODE" in
  # Prefer the remote-tracking main; fall back to a local main branch.
  branch) git diff origin/main...HEAD > "$DIFF_FILE" 2>/dev/null || git diff main...HEAD > "$DIFF_FILE" ;;
  # Only what is currently in the index.
  staged) git diff --staged > "$DIFF_FILE" ;;
  # Three-dot form, relative to the commit given via --since=.
  since)  git diff "$SINCE_COMMIT"...HEAD > "$DIFF_FILE" ;;
esac

# Measure the bundle: line count for the progress message, byte count for
# the size gates below.
DIFF_LINES=$(wc -l < "$DIFF_FILE")
DIFF_SIZE=$(wc -c < "$DIFF_FILE")

if (( DIFF_SIZE < 50 )); then
  # Under 50 bytes is treated as "nothing to review"; this is not an error.
  printf '[scrum] no diff to review (mode=%s)\n' "$DIFF_MODE"
  rm -f "$DIFF_FILE"
  exit 0
elif (( DIFF_SIZE > 100000 )); then
  # Over the 100000-byte cap: refuse and tell the user to split the bundle.
  cat <<EOF
[scrum] diff is ${DIFF_SIZE} bytes (>100KB) — kimi/qwen will give up
        split into smaller bundles by file/component, then re-run
        per-component diffs are typically <60KB
EOF
  rm -f "$DIFF_FILE"
  exit 1
fi

printf '[scrum] %s — %s lines / %s bytes — running 3 reviewers\n' \
  "$LABEL" "$DIFF_LINES" "$DIFF_SIZE"

# Find the scrum_review.sh driver (it lives in golangLAKEHOUSE). Candidates
# are probed in order and the first existing regular file wins:
#   1. the fixed checkout under /home/profit
#   2. scripts/ inside the current git repository
#   3. a sibling golangLAKEHOUSE checkout next to the working directory
SCRUM_REVIEW=""
repo_root=$(git rev-parse --show-toplevel) || repo_root=""
candidates=(
  "/home/profit/golangLAKEHOUSE/scripts/scrum_review.sh"
  "${repo_root}/scripts/scrum_review.sh"
  "../golangLAKEHOUSE/scripts/scrum_review.sh"
)
for candidate in "${candidates[@]}"; do
  [ -f "$candidate" ] || continue
  SCRUM_REVIEW="$candidate"
  break
done

# Nothing found: explain where the script was expected, clean up, and stop.
if [ -z "$SCRUM_REVIEW" ]; then
  printf '%s\n' \
    "[scrum] could not locate scrum_review.sh" \
    "        expected at /home/profit/golangLAKEHOUSE/scripts/scrum_review.sh"
  rm -f "$DIFF_FILE"
  exit 1
fi

# Gateway selection: an LH_GATEWAY already set (and non-empty) in the
# environment is kept; otherwise default to the live Go gateway on :4110
# (chatd-routed multi-provider). Use LH_GATEWAY=http://127.0.0.1:3100 to
# target the Rust gateway instead. Exported so child processes see it.
: "${LH_GATEWAY:=http://127.0.0.1:4110}"
export LH_GATEWAY

# Run from the scrum_review.sh directory so its `cd "$(dirname $0)/.."`
# works correctly (lands in golangLAKEHOUSE root, where reports/ lives).
SCRUM_DIR=$(dirname "$SCRUM_REVIEW")
SCRUM_REPO=$(dirname "$SCRUM_DIR")

# Remember the calendar date before the reviewers start. The evidence
# directory is keyed by date, and a run that crosses midnight would
# otherwise be looked up under a different day than it started on.
RUN_STARTED_ON=$(date +%Y-%m-%d)

# The subshell keeps the `cd` from leaking into the rest of this script.
( cd "$SCRUM_REPO" && bash "$SCRUM_REVIEW" "$DIFF_FILE" "$LABEL" ) || {
  echo "[scrum] reviewers failed — bailing"
  rm -f "$DIFF_FILE"
  exit 1
}

# Locate the tally file the review just wrote. Look under today's date
# first; if it is not there, fall back to the date the run started on.
# NOTE(review): which date scrum_review.sh uses for its evidence directory
# is not visible from this script, so both candidates are checked.
TALLY="$SCRUM_REPO/reports/scrum/_evidence/$(date +%Y-%m-%d)/verdicts/${LABEL}_tally.md"
TALLY_AT_START="$SCRUM_REPO/reports/scrum/_evidence/${RUN_STARTED_ON}/verdicts/${LABEL}_tally.md"
if [ ! -f "$TALLY" ] && [ -f "$TALLY_AT_START" ]; then
  TALLY="$TALLY_AT_START"
fi
if [ ! -f "$TALLY" ]; then
  echo "[scrum] no tally found at $TALLY (review may have failed silently)"
  rm -f "$DIFF_FILE"
  exit 1
fi

# KB destination: findings are appended as JSONL (one row per scrum run,
# queryable via DuckDB or jq) under data/_kb in the current git repository.
KB_DIR="$(git rev-parse --show-toplevel)/data/_kb"
KB_FILE="$KB_DIR/scrum_findings.jsonl"
mkdir -p "$KB_DIR"

# Count rows in the tally's markdown table.
# Row shape: `| <reviewers> | <severity> | <where> | <hits> |`
# Splitting on '|' leaves $1 empty (text before the leading pipe), so $2 is
# the Reviewers cell.
#   FINDINGS   - rows whose Reviewers cell is neither blank/dashes-only nor
#                the "Reviewers" header.
#   CONVERGENT - rows whose Reviewers cell contains a "+".
FINDINGS=$(
  awk -F'|' '
    /^\| / && NF>=5 && $2 !~ /^[ -]*$/ && $2 !~ /Reviewers/ { rows++ }
    END { print rows+0 }
  ' "$TALLY" 2>/dev/null
)
CONVERGENT=$(
  awk -F'|' '
    /^\| / && NF>=5 && $2 ~ /\+/ { rows++ }
    END { print rows+0 }
  ' "$TALLY" 2>/dev/null
)

# Fall back to zero if either count came back empty.
FINDINGS="${FINDINGS:-0}"
CONVERGENT="${CONVERGENT:-0}"

#######################################
# Build a single JSONL row (schema scrum_finding.v1) describing this run.
#
# Values are handed to Python through environment variables and the program
# is a quoted heredoc, so nothing from the shell is spliced into Python
# source: a quote, backslash or newline in the label, tally path or branch
# name ends up as data in the JSON instead of breaking (or injecting into)
# the Python program.
#
# Globals:   TALLY, LABEL, DIFF_MODE, DIFF_SIZE, DIFF_LINES, FINDINGS,
#            CONVERGENT (read)
# Outputs:   one JSON object on a single line, on stdout
# Returns:   non-zero if python3 fails (e.g. the tally cannot be read)
#######################################
emit_kb_row() {
  local branch head_sha
  # Outside a usable git checkout both fields degrade to "unknown".
  branch=$(git rev-parse --abbrev-ref HEAD 2>/dev/null) || branch="unknown"
  head_sha=$(git rev-parse HEAD 2>/dev/null) || head_sha="unknown"

  SCRUM_TALLY="$TALLY" \
  SCRUM_LABEL="$LABEL" \
  SCRUM_DIFF_MODE="$DIFF_MODE" \
  SCRUM_DIFF_BYTES="$DIFF_SIZE" \
  SCRUM_DIFF_LINES="$DIFF_LINES" \
  SCRUM_FINDINGS="$FINDINGS" \
  SCRUM_CONVERGENT="$CONVERGENT" \
  SCRUM_BRANCH="$branch" \
  SCRUM_HEAD_SHA="$head_sha" \
  python3 - <<'PYEOF'
import datetime
import json
import os
from pathlib import Path

env = os.environ
tally_path = env["SCRUM_TALLY"]
tally = Path(tally_path).read_text()
# Same "naive UTC + Z" timestamp format as before, without the deprecated
# datetime.utcnow().
now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
row = {
  "schema": "scrum_finding.v1",
  "ts": now_utc.isoformat() + "Z",
  "label": env["SCRUM_LABEL"],
  "diff_mode": env["SCRUM_DIFF_MODE"],
  "diff_bytes": int(env["SCRUM_DIFF_BYTES"]),
  "diff_lines": int(env["SCRUM_DIFF_LINES"]),
  "findings_total": int(env["SCRUM_FINDINGS"]),
  "findings_convergent": int(env["SCRUM_CONVERGENT"]),
  "tally_path": tally_path,
  "tally_excerpt": tally[:2000],
  "branch": env["SCRUM_BRANCH"],
  "head_sha": env["SCRUM_HEAD_SHA"],
}
print(json.dumps(row))
PYEOF
}

# Append the row to the KB file.
emit_kb_row >> "$KB_FILE"

# The diff bundle is no longer needed once the KB row has been written.
rm -f "$DIFF_FILE"

#######################################
# Print the end-of-run summary with pointers to the artifacts.
# The verdicts directory is taken from the tally's own location rather than
# rebuilt from today's date, so it always names the directory that actually
# contains $TALLY (the two could disagree if the date rolled over mid-run).
# Globals:   LABEL, FINDINGS, CONVERGENT, TALLY, KB_FILE (read)
# Outputs:   summary lines on stdout
#######################################
print_summary() {
  local verdicts_dir
  verdicts_dir=$(dirname "$TALLY")

  echo ""
  echo "──────────────────────────────────────────────────────────────────"
  echo "[scrum] $LABEL: $FINDINGS findings ($CONVERGENT convergent)"
  echo "[scrum] tally:    $TALLY"
  echo "[scrum] verdicts: ${verdicts_dir}/"
  echo "[scrum] KB:       $KB_FILE"
  echo ""
  # Copy-pasteable follow-up commands.
  echo "[scrum] read tally:  cat '$TALLY'"
  echo "[scrum] query KB:    jq -c 'select(.label == \"$LABEL\")' '$KB_FILE'"
}

print_summary