- brain-backup: daily borg + pg_dump, 7d/4w/3m retention, cron at 3AM
- brain-triage: full system health check (services, ports, firewall,
headers, kernel, app, DB, disk, backups, security scan)
- brain-recover: restore from backup (full/db/configs/app) + emergency
lockdown mode that blocks all external access except LAN SSH
All accessible via /usr/local/bin/brain-{backup,triage,recover}
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
219 lines
6.4 KiB
Bash
Executable File
219 lines
6.4 KiB
Bash
Executable File
#!/bin/bash
# =============================================================================
# brain server triage — quick system health check
# Run after incident, reboot, or anytime something feels off.
# Exits 0 if healthy, 1 if issues found.
# =============================================================================

# ANSI colour sequences, stored as literal backslash escapes; they are only
# turned into real escape characters when printed (echo -e / printf %b).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
NC='\033[0m'

# Running count of failed checks; incremented by fail() and used for the
# final exit status.
ISSUES=0
|
|
|
|
# Print a green check mark followed by the message in $1.
# Colour variables are expanded with %b (escape sequences interpreted), the
# message with %s so backslashes inside it are printed verbatim.
pass() { printf ' %b✓%b %s\n' "${GREEN}" "${NC}" "$1"; }
|
|
# Print a red cross followed by the message in $1 and count one more issue
# in the global ISSUES counter.
# The message is printed with %s so backslashes inside it stay literal; only
# the colour variables go through %b escape interpretation.
fail() {
  printf ' %b✗%b %s\n' "${RED}" "${NC}" "$1"
  ISSUES=$((ISSUES + 1))
}
|
|
# Print a yellow exclamation mark followed by the message in $1.
# Does not touch the ISSUES counter. The message is printed with %s so
# backslashes inside it stay literal.
warn() { printf ' %b!%b %s\n' "${YELLOW}" "${NC}" "$1"; }
|
|
# Print a blank line, then the section title in $1 wrapped in cyan brackets.
# The title is printed with %s so it is never escape-interpreted.
section() { printf '\n%b[%s]%b\n' "${CYAN}" "$1" "${NC}"; }
|
|
|
|
# Report banner with the current date/time.
echo -e "${CYAN}=== brain server triage — $(date) ===${NC}"

# --- Services ---
# Each listed systemd unit must be active; every inactive unit is counted
# as an issue. systemctl's own error output is suppressed.
section "Services"
for svc in llm-team-ui nginx ollama postgresql minio vault fail2ban ufw; do
  if systemctl is-active --quiet "$svc" 2>/dev/null; then
    pass "$svc running"
  else
    fail "$svc NOT running"
  fi
done

# --- Ports ---
section "Ports"
|
|
#######################################
# Verify that something is listening on a TCP port and, optionally, that the
# first listener reported by ss is bound to the expected address.
# Arguments:
#   $1 - TCP port number
#   $2 - human-readable service name used in the report line
#   $3 - expected bind address (plain substring of the local address);
#        empty string accepts any address
# Outputs:
#   One pass/fail line via pass() / fail().
#######################################
check_port() {
  local port=$1 name=$2 bind=$3
  local listeners actual_bind

  # Take a single snapshot of the listening sockets so the existence test and
  # the address extraction cannot disagree.
  listeners=$(ss -tlnp | grep -F -- ":${port} ")

  if [ -z "$listeners" ]; then
    fail "$name NOT listening on port $port"
  else
    # Column 4 of `ss -tln` output is the local address:port; only the first
    # matching listener is inspected.
    actual_bind=$(printf '%s\n' "$listeners" | awk 'NR == 1 { print $4 }')
    # Literal substring comparison: the dots in an address such as 127.0.0.1
    # must not act as regex wildcards.
    if [ -n "$bind" ] && [[ "$actual_bind" != *"$bind"* ]]; then
      fail "$name on $port — bound to $actual_bind (expected $bind)"
    else
      pass "$name listening on $actual_bind"
    fi
  fi
}
|
|
# The Flask app and PostgreSQL are required to be bound to 127.0.0.1; the
# other services pass an empty expectation, so any bind address is accepted.
check_port 5000 "Flask app" "127.0.0.1"
check_port 80 "Nginx HTTP" ""
check_port 11434 "Ollama" ""
check_port 5432 "PostgreSQL" "127.0.0.1"
check_port 9000 "MinIO" ""
|
|
|
|
# --- Firewall ---
section "Firewall"
# Query ufw once: the verbose report carries both the "Status:" line and the
# "Default:" policy line, so both checks look at the same snapshot.
ufw_report=$(ufw status verbose)
if grep -q "Status: active" <<<"$ufw_report"; then
  pass "UFW active"
  # Check default deny
  if grep -q "Default: deny (incoming)" <<<"$ufw_report"; then
    pass "Default deny incoming"
  else
    fail "Default is NOT deny incoming"
  fi
else
  fail "UFW is NOT active"
fi
|
|
|
|
# --- Fail2ban ---
section "Fail2ban"
# The last field of the "Number of jail" line is the jail count.
jail_count=$(fail2ban-client status 2>/dev/null | awk '/Number of jail/ { print $NF }')
# If fail2ban-client produced nothing usable, report zero jails instead of an
# empty value (which would print "Only  fail2ban jails" and break the test).
[[ "$jail_count" =~ ^[0-9]+$ ]] || jail_count=0
if [ "$jail_count" -ge 3 ]; then
  pass "$jail_count jails active"
else
  fail "Only $jail_count fail2ban jails (expected >= 3)"
fi

# Check for banned IPs (informational only; never counted as an issue)
banned=$(fail2ban-client status sshd 2>/dev/null | awk '/Currently banned/ { print $NF }')
if [[ "$banned" =~ ^[0-9]+$ ]] && [ "$banned" -gt 0 ]; then
  warn "$banned IPs currently banned on SSH"
fi
|
|
|
|
# --- SSH ---
#######################################
# Test whether an sshd_config-style file contains an active (uncommented)
# line setting a keyword to exactly the given value.
# Matching is case-insensitive, tolerates leading whitespace and any amount
# of whitespace between keyword and value, and anchors the end of the value
# so that e.g. "MaxAuthTries 3" does not match "MaxAuthTries 30".
# NOTE(review): only the single file is inspected; Include'd drop-ins and
# Match blocks are not evaluated — `sshd -T` would give the effective config.
# Arguments:
#   $1 - keyword, $2 - expected value,
#   $3 - config file (defaults to /etc/ssh/sshd_config)
# Returns: 0 if such a line exists, non-zero otherwise.
#######################################
sshd_config_has() {
  local key=$1 value=$2 file=${3:-/etc/ssh/sshd_config}
  grep -Eqi -- "^[[:space:]]*${key}[[:space:]]+${value}[[:space:]]*(#.*)?\$" "$file"
}

section "SSH"
if sshd_config_has PermitRootLogin no; then
  pass "Root login disabled"
else
  fail "Root login NOT disabled"
fi

# Password authentication still being on is only a warning, not an issue.
if sshd_config_has PasswordAuthentication no; then
  pass "Password auth disabled"
else
  warn "Password auth still enabled (SSH keys not yet set up)"
fi

if sshd_config_has MaxAuthTries 3; then
  pass "Max auth tries = 3"
else
  fail "Max auth tries not set to 3"
fi
|
|
|
|
# --- Nginx headers ---
section "Nginx Security Headers"
# HEAD request against the local web server; the timeout keeps triage from
# hanging when the server accepts the connection but never answers.
headers=$(curl -sI --max-time 10 http://127.0.0.1 2>/dev/null)
for h in "X-Frame-Options" "X-Content-Type-Options" "Referrer-Policy" "X-XSS-Protection"; do
  # Anchor on "Name:" at the start of a line so the name appearing inside
  # another header's value does not count as present.
  if grep -qi "^${h}:" <<<"$headers"; then
    pass "$h present"
  else
    fail "$h MISSING"
  fi
done
|
|
|
|
# --- Kernel hardening ---
section "Kernel"
# net.ipv4.conf.all.rp_filter must read exactly 1.
if [ "$(sysctl -n net.ipv4.conf.all.rp_filter 2>/dev/null)" = "1" ]; then
  pass "Reverse path filtering enabled"
else
  fail "Reverse path filtering disabled"
fi
# net.ipv4.conf.all.send_redirects must read exactly 0.
if [ "$(sysctl -n net.ipv4.conf.all.send_redirects 2>/dev/null)" = "0" ]; then
  pass "ICMP redirects disabled"
else
  fail "ICMP redirects NOT disabled"
fi
|
|
|
|
# --- App health ---
section "Application"
# Fetch only the status code of the front page; the timeout keeps triage
# from hanging on an unresponsive app. 200 and 302 both count as healthy.
http_code=$(curl -so /dev/null --max-time 10 -w '%{http_code}' http://127.0.0.1/ 2>/dev/null)
if [ "$http_code" = "200" ] || [ "$http_code" = "302" ]; then
  pass "App responding (HTTP $http_code)"
else
  fail "App NOT responding (HTTP $http_code)"
fi
|
|
|
|
# --- Database ---
section "Database"
# A trivial query as the postgres OS user proves the knowledge_base database
# accepts connections.
if sudo -u postgres psql -d knowledge_base -c "SELECT 1;" >/dev/null 2>&1; then
  # -A (unaligned) with -t (tuples only) yields the bare number without padding.
  tables=$(sudo -u postgres psql -d knowledge_base -At -c "SELECT count(*) FROM information_schema.tables WHERE table_schema='public';" 2>/dev/null)
  pass "knowledge_base reachable ($tables tables)"
else
  fail "Cannot connect to knowledge_base"
fi
|
|
|
|
# --- Disk ---
section "Disk"
# Percentage used on the root filesystem, stripped of padding and the % sign.
usage=$(df / --output=pcent | tail -1 | tr -d ' %')
if ! [[ "$usage" =~ ^[0-9]+$ ]]; then
  # df produced nothing numeric; report that instead of a bogus "critical".
  fail "Disk usage: could not be determined"
elif [ "$usage" -lt 80 ]; then
  pass "Disk usage: ${usage}%"
elif [ "$usage" -lt 95 ]; then
  warn "Disk usage: ${usage}% (getting full)"
else
  fail "Disk usage: ${usage}% (critical!)"
fi
|
|
|
|
# --- Backups ---
section "Backups"
if [ -d /var/backups/brain/borg-repo ]; then
  # The last line of `borg list` is reported as the most recent archive.
  last_backup=$(borg list /var/backups/brain/borg-repo 2>/dev/null | tail -1)
  if [ -n "$last_backup" ]; then
    pass "Borg repo exists, last: $last_backup"
  else
    warn "Borg repo exists but empty — run backup.sh"
  fi
else
  fail "No borg repo at /var/backups/brain/borg-repo"
fi

# Find the newest dump by modification time using a glob and -nt rather than
# parsing `ls` output.
last_dump=""
for dump in /var/backups/brain/pg-dumps/knowledge_base_*.dump; do
  [ -e "$dump" ] || continue # unmatched glob stays literal; skip it
  if [ -z "$last_dump" ] || [ "$dump" -nt "$last_dump" ]; then
    last_dump=$dump
  fi
done

if [ -n "$last_dump" ]; then
  # Age in whole days (integer division), so anything under 48h passes.
  dump_age=$(( ($(date +%s) - $(stat -c %Y "$last_dump")) / 86400 ))
  if [ "$dump_age" -le 1 ]; then
    pass "Latest pg dump: $(basename "$last_dump")"
  else
    warn "Latest pg dump is ${dump_age} days old: $(basename "$last_dump")"
  fi
else
  warn "No PostgreSQL dumps found"
fi
|
|
|
|
# --- Suspicious activity ---
section "Security Scan"
# Check for unexpected SUID binaries
# Scan once and reuse the list for both the count and the listing.
suid_files=$(find /usr/local -perm -4000 -type f 2>/dev/null)
if [ -z "$suid_files" ]; then
  pass "No unexpected SUID binaries in /usr/local"
else
  suid_count=$(printf '%s\n' "$suid_files" | wc -l)
  fail "$suid_count SUID binaries found in /usr/local — investigate"
  printf '%s\n' "$suid_files"
fi

# Check for unauthorized cron jobs
# -x excludes only the entry named exactly "root"; a plain -v would also hide
# any user whose name merely contains "root".
cron_users=$(ls /var/spool/cron/crontabs/ 2>/dev/null | grep -vx root)
if [ -z "$cron_users" ]; then
  pass "No non-root user crontabs"
else
  warn "Crontabs found for: $cron_users"
fi

# Check for recent failed SSH logins
# NOTE(review): the unit is named ssh on Debian/Ubuntu and sshd elsewhere;
# both are queried so the count is not silently zero — confirm on the target.
# grep -c exits non-zero when the count is 0, hence the `|| true`.
failed_ssh=$(journalctl -u ssh -u sshd --since "1 hour ago" --no-pager 2>/dev/null | grep -c "Failed password" || true)
if [ "$failed_ssh" -gt 10 ]; then
  warn "$failed_ssh failed SSH logins in last hour"
elif [ "$failed_ssh" -gt 0 ]; then
  pass "$failed_ssh failed SSH logins in last hour (normal)"
else
  pass "No failed SSH logins in last hour"
fi
|
|
|
|
# --- Summary ---
# Exit status mirrors the result: 0 when no check called fail(), 1 otherwise.
echo ""
if [ "$ISSUES" -eq 0 ]; then
  echo -e "${GREEN}=== ALL CHECKS PASSED ===${NC}"
  exit 0
else
  echo -e "${RED}=== $ISSUES ISSUE(S) FOUND ===${NC}"
  exit 1
fi
|