#!/bin/bash
# =============================================================================
# brain server triage — quick system health check
# Run after incident, reboot, or anytime something feels off.
# Exits 0 if healthy, 1 if issues found.
#
# Most checks need root (ufw, fail2ban-client, sudo -u postgres, cron spool).
#
# NOTE: `set -e` / `pipefail` are intentionally NOT enabled. Every probe below
# is allowed to fail; a failure must be counted and reported, not abort the run.
# =============================================================================

# ANSI colours, stored as real escape bytes so they can be printed with %s.
readonly RED=$'\033[0;31m'
readonly GREEN=$'\033[0;32m'
readonly YELLOW=$'\033[1;33m'
readonly CYAN=$'\033[0;36m'
readonly NC=$'\033[0m'

readonly SSHD_CONFIG=/etc/ssh/sshd_config
readonly BORG_REPO=/var/backups/brain/borg-repo
readonly PG_DUMP_DIR=/var/backups/brain/pg-dumps
readonly CRON_DIR=/var/spool/cron/crontabs
# Upper bound (seconds) for each HTTP probe so a wedged backend cannot hang triage.
readonly CURL_TIMEOUT=10

# Number of failed checks; drives the exit status. Only fail() increments it,
# so fail() must never be called from a subshell or pipeline stage.
ISSUES=0

# Reporting helpers. $1 is the message; it is printed verbatim (no backslash
# interpretation), which matters for text taken from external commands.
pass() { printf ' %s✓%s %s\n' "$GREEN" "$NC" "$1"; }
fail() {
  printf ' %s✗%s %s\n' "$RED" "$NC" "$1"
  ISSUES=$((ISSUES + 1))
}
warn() { printf ' %s!%s %s\n' "$YELLOW" "$NC" "$1"; }
section() { printf '\n%s[%s]%s\n' "$CYAN" "$1" "$NC"; }

printf '%s=== brain server triage — %s ===%s\n' "$CYAN" "$(date)" "$NC"

# --- Services ---
section "Services"
for svc in llm-team-ui nginx ollama postgresql minio vault fail2ban ufw; do
  if systemctl is-active --quiet "$svc" 2>/dev/null; then
    pass "$svc running"
  else
    fail "$svc NOT running"
  fi
done

# --- Ports ---
section "Ports"

#######################################
# Report whether a TCP port has a listener and, optionally, whether the first
# listener found is bound to the expected address.
# Arguments:
#   $1 - port number
#   $2 - human-readable service name used in the report
#   $3 - expected bind address; empty string skips the bind check
# Outputs:  one pass/fail line on stdout
# Globals:  ISSUES (incremented through fail)
#######################################
check_port() {
  local port=$1 name=$2 bind=$3
  local actual_bind

  # Field 4 of `ss -tln` is "Local Address:Port". Compare the ":port" suffix
  # literally so that e.g. port 80 never matches 8080.
  actual_bind=$(ss -tln | awk -v p=":${port}" '
    NR > 1 && length($4) >= length(p) &&
    substr($4, length($4) - length(p) + 1) == p { print $4; exit }')

  if [[ -z "$actual_bind" ]]; then
    fail "$name NOT listening on port $port"
  elif [[ -n "$bind" && "$actual_bind" != *"$bind"* ]]; then
    # Literal substring test: the dots in an IP are not regex wildcards here.
    fail "$name on $port — bound to $actual_bind (expected $bind)"
  else
    pass "$name listening on $actual_bind"
  fi
}

check_port 5000 "Flask app" "127.0.0.1"
check_port 80 "Nginx HTTP" ""
check_port 11434 "Ollama" ""
check_port 5432 "PostgreSQL" "127.0.0.1"
check_port 9000 "MinIO" ""

# --- Firewall ---
section "Firewall"
# One verbose call provides both the status line and the default policy.
ufw_status=$(ufw status verbose)
if grep -q "Status: active" <<<"$ufw_status"; then
  pass "UFW active"
  # Check default deny
  if grep -q "Default: deny (incoming)" <<<"$ufw_status"; then
    pass "Default deny incoming"
  else
    fail "Default is NOT deny incoming"
  fi
else
  fail "UFW is NOT active"
fi

# --- Fail2ban ---
section "Fail2ban"
jail_count=$(fail2ban-client status 2>/dev/null | awk '/Number of jail/ {print $NF}')
# Validate before comparing: the value is empty when fail2ban-client fails.
if [[ "$jail_count" =~ ^[0-9]+$ ]] && [ "$jail_count" -ge 3 ]; then
  pass "$jail_count jails active"
else
  fail "Only ${jail_count:-0} fail2ban jails (expected >= 3)"
fi

# Check for banned IPs
banned=$(fail2ban-client status sshd 2>/dev/null | awk '/Currently banned/ {print $NF}')
if [[ "$banned" =~ ^[0-9]+$ ]] && [ "$banned" -gt 0 ]; then
  warn "$banned IPs currently banned on SSH"
fi

# --- SSH ---
section "SSH"

#######################################
# Test whether the main sshd config sets a keyword to exactly one value.
# Only $SSHD_CONFIG itself is inspected; included drop-in files and Match
# blocks are not evaluated.
# Arguments: $1 - keyword, $2 - required value
# Returns:   0 if an uncommented "keyword value" line exists, non-zero otherwise
#######################################
sshd_directive_is() {
  # Anchor the whole value so "3" does not also accept "30"; allow any
  # run of blanks/tabs between keyword and value.
  grep -Eq "^${1}[[:space:]]+${2}[[:space:]]*\$" "$SSHD_CONFIG"
}

if sshd_directive_is PermitRootLogin no; then
  pass "Root login disabled"
else
  fail "Root login NOT disabled"
fi

if sshd_directive_is PasswordAuthentication no; then
  pass "Password auth disabled"
else
  warn "Password auth still enabled (SSH keys not yet set up)"
fi

if sshd_directive_is MaxAuthTries 3; then
  pass "Max auth tries = 3"
else
  fail "Max auth tries not set to 3"
fi

# --- Nginx headers ---
section "Nginx Security Headers"
headers=$(curl -sI --max-time "$CURL_TIMEOUT" http://127.0.0.1 2>/dev/null)
for h in "X-Frame-Options" "X-Content-Type-Options" "Referrer-Policy" "X-XSS-Protection"; do
  # Match the header *name* at line start, not a mention inside another value.
  if grep -qi "^${h}:" <<<"$headers"; then
    pass "$h present"
  else
    fail "$h MISSING"
  fi
done

# --- Kernel hardening ---
section "Kernel"
if [[ "$(sysctl -n net.ipv4.conf.all.rp_filter 2>/dev/null)" == "1" ]]; then
  pass "Reverse path filtering enabled"
else
  fail "Reverse path filtering disabled"
fi

if [[ "$(sysctl -n net.ipv4.conf.all.send_redirects 2>/dev/null)" == "0" ]]; then
  pass "ICMP redirects disabled"
else
  fail "ICMP redirects NOT disabled"
fi

# --- App health ---
section "Application"
http_code=$(curl -so /dev/null --max-time "$CURL_TIMEOUT" -w '%{http_code}' http://127.0.0.1/ 2>/dev/null)
if [[ "$http_code" == "200" || "$http_code" == "302" ]]; then
  pass "App responding (HTTP $http_code)"
else
  fail "App NOT responding (HTTP $http_code)"
fi

# --- Database ---
section "Database"
if sudo -u postgres psql -d knowledge_base -c "SELECT 1;" >/dev/null 2>&1; then
  tables=$(sudo -u postgres psql -d knowledge_base -t \
    -c "SELECT count(*) FROM information_schema.tables WHERE table_schema='public';" \
    2>/dev/null | tr -d '[:space:]')
  pass "knowledge_base reachable ($tables tables)"
else
  fail "Cannot connect to knowledge_base"
fi

# --- Disk ---
section "Disk"
usage=$(df --output=pcent / | tail -1 | tr -d ' %')
if [[ ! "$usage" =~ ^[0-9]+$ ]]; then
  # df failed or printed something unexpected; do not feed it to -lt.
  fail "Could not determine disk usage for /"
elif [ "$usage" -lt 80 ]; then
  pass "Disk usage: ${usage}%"
elif [ "$usage" -lt 95 ]; then
  warn "Disk usage: ${usage}% (getting full)"
else
  fail "Disk usage: ${usage}% (critical!)"
fi

# --- Backups ---
section "Backups"
if [[ -d "$BORG_REPO" ]]; then
  last_backup=$(borg list "$BORG_REPO" 2>/dev/null | tail -1)
  if [[ -n "$last_backup" ]]; then
    pass "Borg repo exists, last: $last_backup"
  else
    warn "Borg repo exists but empty — run backup.sh"
  fi
else
  fail "No borg repo at $BORG_REPO"
fi

# Pick the most recently modified dump with a glob + -nt instead of parsing ls.
last_dump=""
for dump in "$PG_DUMP_DIR"/knowledge_base_*.dump; do
  [[ -e "$dump" ]] || continue # unmatched glob stays literal
  if [[ -z "$last_dump" || "$dump" -nt "$last_dump" ]]; then
    last_dump=$dump
  fi
done

if [[ -n "$last_dump" ]]; then
  dump_age=$((($(date +%s) - $(stat -c %Y "$last_dump")) / 86400))
  if [ "$dump_age" -le 1 ]; then
    pass "Latest pg dump: $(basename "$last_dump")"
  else
    warn "Latest pg dump is ${dump_age} days old: $(basename "$last_dump")"
  fi
else
  warn "No PostgreSQL dumps found"
fi

# --- Suspicious activity ---
section "Security Scan"

# Check for unexpected SUID binaries (scan once, reuse the list for the report)
mapfile -t suid_files < <(find /usr/local -perm -4000 -type f 2>/dev/null)
if [ "${#suid_files[@]}" -eq 0 ]; then
  pass "No unexpected SUID binaries in /usr/local"
else
  fail "${#suid_files[@]} SUID binaries found in /usr/local — investigate"
  printf '%s\n' "${suid_files[@]}"
fi

# Check for unauthorized cron jobs
if [[ -d "$CRON_DIR" && ! -r "$CRON_DIR" ]]; then
  # Without read access an empty listing would look like a clean result.
  warn "Cannot read $CRON_DIR — re-run as root to check user crontabs"
else
  cron_users=()
  for tab in "$CRON_DIR"/*; do
    [[ -e "$tab" ]] || continue
    owner=${tab##*/}
    # Exact comparison: names that merely contain "root" must still be reported.
    [[ "$owner" == "root" ]] || cron_users+=("$owner")
  done
  if [ "${#cron_users[@]}" -eq 0 ]; then
    pass "No non-root user crontabs"
  else
    warn "Crontabs found for: ${cron_users[*]}"
  fi
fi

# Check for recent failed SSH logins.
# NOTE(review): the OpenSSH unit is commonly "ssh" on Debian-family systems and
# "sshd" elsewhere; both are queried so either naming is covered — confirm
# against the target host.
failed_ssh=$(journalctl -u ssh -u sshd --since "1 hour ago" --no-pager 2>/dev/null \
  | grep -c "Failed password")
[[ "$failed_ssh" =~ ^[0-9]+$ ]] || failed_ssh=0
if [ "$failed_ssh" -gt 10 ]; then
  warn "$failed_ssh failed SSH logins in last hour"
elif [ "$failed_ssh" -gt 0 ]; then
  pass "$failed_ssh failed SSH logins in last hour (normal)"
else
  pass "No failed SSH logins in last hour"
fi

# --- Summary ---
echo ""
if [ "$ISSUES" -eq 0 ]; then
  printf '%s=== ALL CHECKS PASSED ===%s\n' "$GREEN" "$NC"
  exit 0
else
  printf '%s=== %s ISSUE(S) FOUND ===%s\n' "$RED" "$ISSUES" "$NC"
  exit 1
fi