llm-team-ui/server/triage.sh
root 2bb910b72c Add triage, backup, and disaster recovery system
- brain-backup: daily borg + pg_dump, 7d/4w/3m retention, cron at 3AM
- brain-triage: full system health check (services, ports, firewall,
  headers, kernel, app, DB, disk, backups, security scan)
- brain-recover: restore from backup (full/db/configs/app) + emergency
  lockdown mode that blocks all external access except LAN SSH

All accessible via /usr/local/bin/brain-{backup,triage,recover}

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-25 04:52:48 -05:00

219 lines
6.4 KiB
Bash
Executable File

#!/bin/bash
# =============================================================================
# brain server triage — quick system health check
# Run after incident, reboot, or anytime something feels off.
# Exits 0 if healthy, 1 if issues found.
# =============================================================================
# --- Output helpers ---
# ANSI colour sequences, kept as backslash escapes and expanded at print time
# by printf's %b.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m'

# Number of failed checks so far; the summary at the end of the script turns
# a non-zero count into exit status 1.
ISSUES=0

# pass MSG — report a successful check.
# The message is printed with %s so backslashes in text that came from
# external commands are shown literally instead of being interpreted.
pass() { printf ' %b %s\n' "${GREEN}✓${NC}" "$1"; }

# fail MSG — report a failed check and count it in ISSUES.
fail() {
  printf ' %b %s\n' "${RED}✗${NC}" "$1"
  ISSUES=$((ISSUES + 1))
}

# warn MSG — report something worth a look; does not affect ISSUES.
warn() { printf ' %b %s\n' "${YELLOW}!${NC}" "$1"; }

# section TITLE — print a blank line followed by a coloured "[TITLE]" heading.
section() { printf '\n%b[%s]%b\n' "$CYAN" "$1" "$NC"; }

printf '%b=== brain server triage — %s ===%b\n' "$CYAN" "$(date)" "$NC"
# --- Services ---
# Every unit in this list must be reported active by systemd; each one that
# is not is recorded as an issue.
section "Services"
required_units=(llm-team-ui nginx ollama postgresql minio vault fail2ban ufw)
for unit in "${required_units[@]}"; do
  if ! systemctl is-active --quiet "$unit" 2>/dev/null; then
    fail "$unit NOT running"
    continue
  fi
  pass "$unit running"
done
# --- Ports ---
section "Ports"

#######################################
# Verify that a TCP listener exists on a port, optionally on one specific
# local address.
# Arguments:
#   $1 - TCP port number
#   $2 - human-readable service name used in the report line
#   $3 - expected local address (e.g. 127.0.0.1); empty means any address
# Outputs:
#   One report line through pass() or fail().
# Notes:
#   Only the first matching listener printed by ss is inspected, so an
#   additional listener on another address for the same port is not reported.
#######################################
check_port() {
  local port=$1 name=$2 bind=$3
  local actual_bind actual_addr

  # Single ss snapshot: the "is it listening" and "where is it bound" answers
  # come from the same data. The port is matched as the exact ":PORT" suffix
  # of the local-address column.
  actual_bind=$(ss -tlnp | awk -v suffix=":${port}" '
    length($4) > length(suffix) &&
      substr($4, length($4) - length(suffix) + 1) == suffix { print $4; exit }')

  if [ -z "$actual_bind" ]; then
    fail "$name NOT listening on port $port"
    return
  fi

  # Reduce "ADDR%iface:PORT" to "ADDR" and compare it exactly; a substring
  # or regex match would accept 127.0.0.10 when 127.0.0.1 is expected.
  actual_addr=${actual_bind%:*}
  actual_addr=${actual_addr%%'%'*}

  if [ -n "$bind" ] && [ "$actual_addr" != "$bind" ]; then
    fail "$name on $port — bound to $actual_bind (expected $bind)"
  else
    pass "$name listening on $actual_bind"
  fi
}
# Expected listeners, one "port:name:bind" entry each. An empty bind field
# means the listener may be bound to any address.
expected_listeners=(
  "5000:Flask app:127.0.0.1"
  "80:Nginx HTTP:"
  "11434:Ollama:"
  "5432:PostgreSQL:127.0.0.1"
  "9000:MinIO:"
)
for entry in "${expected_listeners[@]}"; do
  IFS=: read -r want_port want_name want_bind <<<"$entry"
  check_port "$want_port" "$want_name" "$want_bind"
done
# --- Firewall ---
# UFW must be active, and once it is, its default incoming policy must be
# "deny".
section "Firewall"
ufw_summary=$(ufw status)
if ! grep -q "Status: active" <<<"$ufw_summary"; then
  fail "UFW is NOT active"
else
  pass "UFW active"
  # The default policy is only shown in the verbose report.
  ufw_details=$(ufw status verbose)
  if grep -q "Default: deny (incoming)" <<<"$ufw_details"; then
    pass "Default deny incoming"
  else
    fail "Default is NOT deny incoming"
  fi
fi
# --- Fail2ban ---
section "Fail2ban"

#######################################
# Check that fail2ban has enough jails configured and mention any IPs it is
# currently banning on the sshd jail.
# Arguments:
#   $1 - minimum number of jails required (optional, default 3)
# Outputs:
#   Report lines through pass(), fail() and warn().
#######################################
check_fail2ban() {
  local min_jails=${1:-3}
  local jail_count banned

  # The jail count is the last field of the "Number of jail" status line.
  jail_count=$(fail2ban-client status 2>/dev/null \
    | awk '/Number of jail/ { print $NF; exit }')

  # An unreachable or missing fail2ban yields no number at all; report that
  # explicitly rather than feeding an empty string to a numeric test.
  if [[ ! "$jail_count" =~ ^[0-9]+$ ]]; then
    fail "Could not read fail2ban jail count (is fail2ban running?)"
  elif [ "$jail_count" -ge "$min_jails" ]; then
    pass "$jail_count jails active"
  else
    fail "Only $jail_count fail2ban jails (expected >= $min_jails)"
  fi

  # Currently banned IPs are informational only, so this is a warning.
  banned=$(fail2ban-client status sshd 2>/dev/null \
    | awk '/Currently banned/ { print $NF; exit }')
  if [[ "$banned" =~ ^[0-9]+$ ]] && [ "$banned" -gt 0 ]; then
    warn "$banned IPs currently banned on SSH"
  fi
}
check_fail2ban
# --- SSH ---
section "SSH"

#######################################
# Test whether the global part of an sshd configuration file sets a keyword
# to exactly the given value.
# Arguments:
#   $1 - keyword (compared case-insensitively)
#   $2 - required value (compared case-insensitively, whole word)
#   $3 - config file (optional, default /etc/ssh/sshd_config)
# Returns:
#   0 if the first occurrence of the keyword before any "Match" line has
#   exactly that value; non-zero otherwise, or if the file cannot be read.
# Notes:
#   Only the given file is read; files pulled in through Include directives
#   are not followed.
#######################################
sshd_config_has() {
  local key=$1 want=$2 file=${3:-/etc/ssh/sshd_config}
  awk -v key="$key" -v want="$want" '
    BEGIN { key = tolower(key); want = tolower(want); found = 0 }
    {
      sub(/#.*/, "")    # drop comments
      gsub(/=/, " ")    # accept the "Keyword=value" spelling
      if (NF == 0) next
      word = tolower($1)
      if (word == "match") exit   # settings after Match are conditional
      if (word == key) {
        # Whole-value comparison: "MaxAuthTries 30" must not satisfy "3".
        found = (NF == 2 && tolower($2) == want)
        exit
      }
    }
    END { exit !found }
  ' "$file" 2>/dev/null
}

if sshd_config_has PermitRootLogin no; then
  pass "Root login disabled"
else
  fail "Root login NOT disabled"
fi

# Password logins are only a warning: keys may not have been deployed yet.
if sshd_config_has PasswordAuthentication no; then
  pass "Password auth disabled"
else
  warn "Password auth still enabled (SSH keys not yet set up)"
fi

if sshd_config_has MaxAuthTries 3; then
  pass "Max auth tries = 3"
else
  fail "Max auth tries not set to 3"
fi
# --- Nginx headers ---
# Fetch the response headers from the local web server once, then look for
# each expected security header name (case-insensitively) in that response.
section "Nginx Security Headers"
response_headers=$(curl -sI http://127.0.0.1 2>/dev/null)
required_headers=(
  "X-Frame-Options"
  "X-Content-Type-Options"
  "Referrer-Policy"
  "X-XSS-Protection"
)
for header in "${required_headers[@]}"; do
  if ! grep -qi "$header" <<<"$response_headers"; then
    fail "$header MISSING"
    continue
  fi
  pass "$header present"
done
# --- Kernel hardening ---
# Each sysctl must hold exactly the expected value; anything else, including
# a key that cannot be read, is reported as a failure.
section "Kernel"

rp_filter=$(sysctl -n net.ipv4.conf.all.rp_filter 2>/dev/null)
case "$rp_filter" in
  1) pass "Reverse path filtering enabled" ;;
  *) fail "Reverse path filtering disabled" ;;
esac

send_redirects=$(sysctl -n net.ipv4.conf.all.send_redirects 2>/dev/null)
case "$send_redirects" in
  0) pass "ICMP redirects disabled" ;;
  *) fail "ICMP redirects NOT disabled" ;;
esac
# --- App health ---
# Request the site root through the local web server; only HTTP 200 or 302
# counts as a healthy answer.
section "Application"
status_code=$(curl -so /dev/null -w '%{http_code}' http://127.0.0.1/ 2>/dev/null)
case "$status_code" in
  200 | 302) pass "App responding (HTTP $status_code)" ;;
  *) fail "App NOT responding (HTTP $status_code)" ;;
esac
# --- Database ---
# A trivial query proves the knowledge_base database accepts connections;
# when it does, the number of tables in the public schema is reported too.
section "Database"
if ! sudo -u postgres psql -d knowledge_base -c "SELECT 1;" >/dev/null 2>&1; then
  fail "Cannot connect to knowledge_base"
else
  table_count_sql="SELECT count(*) FROM information_schema.tables WHERE table_schema='public';"
  # -t prints the bare value; tr removes the padding psql puts around it.
  table_count=$(sudo -u postgres psql -d knowledge_base -t -c "$table_count_sql" 2>/dev/null | tr -d ' ')
  pass "knowledge_base reachable ($table_count tables)"
fi
# --- Disk ---
# Usage of the root filesystem as a bare integer percentage: below the first
# threshold is fine, below the second is a warning, anything else is a failure.
section "Disk"
disk_warn_at=80
disk_crit_at=95
root_pct=$(df --output=pcent / | tail -n 1 | tr -d ' %')
if [ "$root_pct" -lt "$disk_warn_at" ]; then
  pass "Disk usage: ${root_pct}%"
elif [ "$root_pct" -lt "$disk_crit_at" ]; then
  warn "Disk usage: ${root_pct}% (getting full)"
else
  fail "Disk usage: ${root_pct}% (critical!)"
fi
# --- Backups ---
section "Backups"

#######################################
# Report the state of the borg backup repository.
# Arguments:
#   $1 - repository directory (optional, default /var/backups/brain/borg-repo)
# Outputs:
#   One report line through pass(), warn() or fail().
#######################################
check_borg_repo() {
  local repo=${1:-/var/backups/brain/borg-repo}
  local archives borg_rc last_backup

  if [ ! -d "$repo" ]; then
    fail "No borg repo at $repo"
    return
  fi

  # Keep borg's own exit status so that a listing failure is not mistaken
  # for an empty repository.
  archives=$(borg list "$repo" 2>/dev/null)
  borg_rc=$?
  last_backup=$(printf '%s\n' "$archives" | tail -n 1)

  if [ -n "$last_backup" ]; then
    pass "Borg repo exists, last: $last_backup"
  elif [ "$borg_rc" -eq 0 ]; then
    warn "Borg repo exists but empty — run backup.sh"
  else
    warn "Borg repo exists but could not be listed (borg exit $borg_rc)"
  fi
}
check_borg_repo
# Newest knowledge_base dump by modification time; a dump whose age rounds
# down to more than one whole day is reported as stale.
newest_dump=$(ls -t /var/backups/brain/pg-dumps/knowledge_base_*.dump 2>/dev/null | head -n 1)
if [ -z "$newest_dump" ]; then
  warn "No PostgreSQL dumps found"
else
  now_epoch=$(date +%s)
  dump_epoch=$(stat -c %Y "$newest_dump")
  # Age in whole days (86400 seconds each), rounded down.
  age_days=$(( ($now_epoch - $dump_epoch) / 86400 ))
  dump_name=$(basename "$newest_dump")
  if [ "$age_days" -le 1 ]; then
    pass "Latest pg dump: $dump_name"
  else
    warn "Latest pg dump is ${age_days} days old: $dump_name"
  fi
fi
# --- Suspicious activity ---
section "Security Scan"

# Any set-uid regular file under /usr/local is unexpected: report how many
# were found and list their paths.
mapfile -t suid_files < <(find /usr/local -perm -4000 -type f 2>/dev/null)
suid_total=${#suid_files[@]}
if [ "$suid_total" -ne 0 ]; then
  fail "$suid_total SUID binaries found in /usr/local — investigate"
  printf '%s\n' "${suid_files[@]}"
else
  pass "No unexpected SUID binaries in /usr/local"
fi
# Check for unauthorized cron jobs

#######################################
# List the users other than root that own a crontab file.
# Arguments:
#   $1 - crontab spool directory (optional, default /var/spool/cron/crontabs)
# Outputs:
#   The user names on one line, separated by spaces; nothing if there are
#   none or the directory cannot be read.
#######################################
non_root_crontab_users() {
  local dir=${1:-/var/spool/cron/crontabs}
  local path name
  local IFS=' '
  local -a names=()

  for path in "$dir"/*; do
    [ -e "$path" ] || continue   # unmatched glob or unreadable directory
    name=${path##*/}
    # Exact comparison: names that merely contain "root" (rootkit, notroot)
    # must still be reported.
    [ "$name" = "root" ] || names+=("$name")
  done
  printf '%s' "${names[*]}"
}

cron_users=$(non_root_crontab_users)
if [ -z "$cron_users" ]; then
  pass "No non-root user crontabs"
else
  warn "Crontabs found for: $cron_users"
fi
# Check for recent failed SSH logins
# Count "Failed password" lines logged for the sshd unit during the last
# hour. grep -c exits non-zero on a zero count, hence the "|| true".
recent_failures=$(journalctl -u sshd --since "1 hour ago" --no-pager 2>/dev/null | grep -c "Failed password" || true)
if [ "$recent_failures" -gt 10 ]; then
  warn "$recent_failures failed SSH logins in last hour"
else
  if [ "$recent_failures" -gt 0 ]; then
    pass "$recent_failures failed SSH logins in last hour (normal)"
  else
    pass "No failed SSH logins in last hour"
  fi
fi
# --- Summary ---
# Print the verdict and exit with 0 when no check failed, 1 otherwise.
echo ""
if [ "$ISSUES" -eq 0 ]; then
  verdict="${GREEN}=== ALL CHECKS PASSED ===${NC}"
  exit_status=0
else
  verdict="${RED}=== $ISSUES ISSUE(S) FOUND ===${NC}"
  exit_status=1
fi
echo -e "$verdict"
exit "$exit_status"