diff --git a/server/backup.sh b/server/backup.sh
new file mode 100755
index 0000000..8702d60
--- /dev/null
+++ b/server/backup.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+# =============================================================================
+# brain server backup — runs daily via cron
+# Backs up: app, configs, database dumps, nginx, systemd units, sshd, fail2ban,
+#           sysctl (the exact paths are the ones passed to `borg create` below)
+# NOTE(review): no TLS/SSL certificate path is in that list — add one if
+#               certificates must be part of the backup.
+# Storage: /var/backups/brain/ (borg repo + pg dumps)
+# Retention: 7 daily, 4 weekly, 3 monthly borg archives; newest 14 pg dumps
+# =============================================================================
+
+set -euo pipefail
+
+# Dumps hold the whole database; keep newly created files and dirs private.
+umask 077
+
+BACKUP_DIR="/var/backups/brain"
+BORG_REPO="$BACKUP_DIR/borg-repo"
+PG_DIR="$BACKUP_DIR/pg-dumps"
+LOG="/var/log/brain-backup.log"
+TIMESTAMP=$(date +%Y-%m-%d_%H%M)
+PG_DUMP="$PG_DIR/knowledge_base_${TIMESTAMP}.dump"
+
+# Use a passphrase supplied via the environment; otherwise export an empty one.
+export BORG_PASSPHRASE="${BORG_PASSPHRASE:-}"
+
+# Print a timestamped message to stdout and append it to $LOG.
+#   $1 - message text
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }
+
+log "=== Backup starting ==="
+
+# --- PostgreSQL dump ---
+mkdir -p "$PG_DIR"
+log "Dumping PostgreSQL knowledge_base..."
+# Write to a .partial file and rename only on success, so a failed or
+# interrupted pg_dump never leaves a truncated *.dump behind as the newest
+# dump on disk. The EXIT trap removes the .partial file on any early exit.
+trap 'rm -f -- "$PG_DUMP.partial"' EXIT
+if ! sudo -u postgres pg_dump -Fc knowledge_base > "$PG_DUMP.partial" 2>> "$LOG"; then
+    log " pg_dump FAILED — see $LOG"
+    exit 1
+fi
+mv -- "$PG_DUMP.partial" "$PG_DUMP"
+trap - EXIT
+# Keep last 14 dumps
+ls -t "$PG_DIR"/knowledge_base_*.dump 2>/dev/null | tail -n +15 | xargs -r rm --
+log " pg_dump OK ($(du -sh "$PG_DUMP" | cut -f1))"
+
+# --- Borg backup ---
+log "Running borg backup..."
+# borg exits 1 (100-127 with BORG_EXIT_CODES=modern) for warnings such as a
+# listed path that does not exist; the archive is still written. Capture the
+# status so `set -e` does not abort before the prune step.
+borg_rc=0
+borg create \
+    --stats \
+    --compression zstd,3 \
+    --exclude '*.pyc' \
+    --exclude '__pycache__' \
+    --exclude '.git' \
+    --exclude 'node_modules' \
+    "$BORG_REPO::${TIMESTAMP}" \
+    /root/llm_team_ui.py \
+    /root/llm_team_config.json \
+    /home/profit/.env \
+    /etc/nginx/sites-available/ \
+    /etc/nginx/sites-enabled/ \
+    /etc/nginx/nginx.conf \
+    /etc/fail2ban/jail.local \
+    /etc/fail2ban/jail.d/ \
+    /etc/ssh/sshd_config \
+    /etc/sysctl.d/99-security.conf \
+    /etc/systemd/system/llm-team-ui.service \
+    /etc/systemd/system/goaccess.service \
+    /etc/systemd/system/ollama.service \
+    /etc/systemd/system/minio.service \
+    /etc/systemd/system/vault.service \
+    "$PG_DIR/" \
+    2>> "$LOG" || borg_rc=$?
+
+if [ "$borg_rc" -eq 0 ]; then
+    log " borg OK"
+elif [ "$borg_rc" -eq 1 ] || { [ "$borg_rc" -ge 100 ] && [ "$borg_rc" -le 127 ]; }; then
+    log " borg finished with warnings (exit $borg_rc) — see $LOG"
+else
+    log " borg create FAILED (exit $borg_rc) — see $LOG"
+    exit "$borg_rc"
+fi
+
+# --- Prune old backups ---
+log "Pruning old backups..."
+borg prune \
+    --keep-daily=7 \
+    --keep-weekly=4 \
+    --keep-monthly=3 \
+    "$BORG_REPO" 2>> "$LOG"
+
+borg compact "$BORG_REPO" 2>> "$LOG"
+log " prune OK"
+
+# --- Report ---
+REPO_SIZE=$(du -sh "$BORG_REPO" | cut -f1)
+log "=== Backup complete. Repo size: $REPO_SIZE ==="
diff --git a/server/recover.sh b/server/recover.sh
new file mode 100755
index 0000000..2296cb9
--- /dev/null
+++ b/server/recover.sh
@@ -0,0 +1,286 @@
+#!/bin/bash
+# =============================================================================
+# brain server recovery — restore from backup or lock down after compromise
+#
+# Usage:
+#   ./recover.sh status             Show available backups
+#   ./recover.sh restore [DATE]     Restore configs + DB from backup (latest or DATE)
+#   ./recover.sh lockdown           Emergency lockdown — block all external access
+#   ./recover.sh unlock             Undo lockdown — restore normal firewall rules
+#   ./recover.sh db [DATE]          Restore only the database
+#   ./recover.sh configs [DATE]     Restore only configs (nginx, ssh, fail2ban, etc.)
+#   ./recover.sh app [DATE]         Restore only the app file + config
+#
+# Environment:
+#   BORG_PASSPHRASE   passphrase for the borg repo (default: empty)
+#   LAN_CIDR          network treated as the LAN in firewall rules
+#                     (default: 192.168.1.0/24)
+# =============================================================================
+
+set -euo pipefail
+
+BORG_REPO="/var/backups/brain/borg-repo"
+PG_DIR="/var/backups/brain/pg-dumps"
+LOCKDOWN_FLAG="/var/backups/brain/.lockdown-active"
+LOG="/var/log/brain-recovery.log"
+LAN_CIDR="${LAN_CIDR:-192.168.1.0/24}"
+
+# Use a passphrase supplied via the environment; otherwise export an empty one.
+export BORG_PASSPHRASE="${BORG_PASSPHRASE:-}"
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+CYAN='\033[0;36m'
+NC='\033[0m'
+
+# Scratch directory of the restore in progress ("" when there is none).
+WORKDIR=""
+
+# Remove the scratch directory on every exit path; the extracted tree holds
+# copies of the server configs and the app's .env file.
+cleanup() {
+    if [ -n "$WORKDIR" ]; then
+        rm -rf -- "$WORKDIR"
+    fi
+}
+trap cleanup EXIT
+
+# Print a timestamped message (escape sequences interpreted) to stdout and
+# append it to $LOG.
+#   $1 - message text
+log() { echo -e "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }
+
+# Print the command summary and exit with status 1.
+usage() {
+    echo "Usage: $0 {status|restore|lockdown|unlock|db|configs|app} [DATE]"
+    echo ""
+    echo "Commands:"
+    echo " status List available backups"
+    echo " restore [DATE] Full restore (configs + DB + app)"
+    echo " lockdown Emergency: block all external traffic"
+    echo " unlock Undo lockdown, restore normal firewall"
+    echo " db [DATE] Restore database only"
+    echo " configs [DATE] Restore server configs only"
+    echo " app [DATE] Restore app + config file only"
+    exit 1
+}
+
+# Ask a yes/no question and read the answer from stdin.
+#   $1 - prompt text (printed in yellow)
+# Returns 0 only when the answer is exactly "y"; EOF on stdin counts as "no".
+confirm() {
+    local reply=""
+    echo -e "${YELLOW}$1${NC}"
+    read -r reply || true
+    [ "$reply" = "y" ]
+}
+
+# Print the name of the newest borg archive, or of the newest archive whose
+# `borg list` line contains DATE (matched literally).
+#   $1 - optional DATE substring
+# Prints nothing and still returns 0 when no archive matches, so callers can
+# report that themselves; a grep-based filter would fail the pipeline under
+# `pipefail` and kill the script before their message.
+get_archive() {
+    local date="${1:-}"
+    borg list "$BORG_REPO" \
+        | awk -v want="$date" '
+            want == "" || index($0, want) { name = $1 }
+            END { if (name != "") print name }'
+}
+
+# Extract a borg archive into a directory without changing this shell's
+# working directory (the subshell keeps the `cd` local).
+#   $1 - archive name
+#   $2 - destination directory
+# The pg dumps stored in the archive (var/backups/...) are skipped: no restore
+# step reads them from the extracted tree.
+extract_archive() {
+    local archive="$1" dest="$2"
+    ( cd "$dest" && borg extract --exclude 'var/backups' "$BORG_REPO::${archive}" )
+}
+
+# Copy one path from the extracted tree in $WORKDIR to the live system.
+#   $1 - path relative to the extraction root
+#   $2 - destination file or directory
+# Best effort: a path missing from the archive or a failed copy is logged and
+# the restore continues.
+restore_path() {
+    local src="$WORKDIR/$1" dest="$2"
+    if [ ! -e "$src" ]; then
+        log "${YELLOW}Skipped $1 (not in archive)${NC}"
+    elif ! cp -v -- "$src" "$dest"; then
+        log "${RED}Could not restore $1${NC}"
+    fi
+}
+
+# --- STATUS ---
+# List up to 20 borg archives and the 10 newest pg dumps, and report whether
+# the lockdown flag file exists.
+cmd_status() {
+    local repo_size
+
+    echo -e "${CYAN}=== Available Backups ===${NC}"
+
+    echo -e "\n${CYAN}Borg archives:${NC}"
+    if borg list "$BORG_REPO" 2>/dev/null | head -20; then
+        repo_size=$(du -sh "$BORG_REPO" | cut -f1)
+        echo -e " Repo size: $repo_size"
+    else
+        echo " No borg archives found"
+    fi
+
+    echo -e "\n${CYAN}PostgreSQL dumps:${NC}"
+    if ! ls -lht "$PG_DIR"/knowledge_base_*.dump 2>/dev/null | head -10; then
+        echo " No dumps found"
+    fi
+
+    if [ -f "$LOCKDOWN_FLAG" ]; then
+        echo -e "\n${RED}*** LOCKDOWN IS ACTIVE ***${NC}"
+        echo " Run '$0 unlock' to restore normal access"
+    fi
+}
+
+# --- LOCKDOWN ---
+# Replace the firewall rules with deny-by-default in both directions, stop
+# nginx, create $LOCKDOWN_FLAG, and record current connections and auth.log.
+cmd_lockdown() {
+    log "${RED}=== EMERGENCY LOCKDOWN ===${NC}"
+    log "Blocking all external access except SSH from LAN..."
+
+    # Save current rules
+    ufw status verbose > /var/backups/brain/ufw-pre-lockdown.txt 2>/dev/null || true
+
+    # Reset and lock down
+    ufw --force reset >/dev/null 2>&1
+    ufw default deny incoming
+    ufw default deny outgoing
+
+    # Only allow SSH from LAN
+    ufw allow from "$LAN_CIDR" to any port 22
+    # Allow DNS out (needed for recovery)
+    ufw allow out 53
+    # Allow apt out (needed for fixes)
+    ufw allow out 80/tcp
+    ufw allow out 443/tcp
+
+    ufw --force enable >/dev/null 2>&1
+
+    # Stop public-facing services
+    systemctl stop nginx 2>/dev/null || true
+
+    touch "$LOCKDOWN_FLAG"
+    log "${RED}LOCKDOWN ACTIVE — only LAN SSH allowed${NC}"
+    log "Services stopped: nginx"
+    log "Run '$0 unlock' when safe"
+
+    # Log who's connected right now
+    log "Current connections:"
+    ss -tnp | tee -a "$LOG"
+
+    # Snapshot auth log
+    log "Saving auth log snapshot..."
+    cp /var/log/auth.log "/var/backups/brain/auth-lockdown-$(date +%Y%m%d_%H%M%S).log" 2>/dev/null || true
+}
+
+# --- UNLOCK ---
+# Undo cmd_lockdown: re-create the normal rule set below, restart the
+# services and remove $LOCKDOWN_FLAG. Does nothing when the flag is absent.
+# NOTE(review): the rules are the fixed list below, not the ufw status text
+# that cmd_lockdown saved.
+cmd_unlock() {
+    if [ ! -f "$LOCKDOWN_FLAG" ]; then
+        echo "Lockdown is not active."
+        exit 0
+    fi
+
+    log "${GREEN}=== RESTORING NORMAL ACCESS ===${NC}"
+
+    # Restore firewall
+    ufw --force reset >/dev/null 2>&1
+    ufw default deny incoming
+    ufw default allow outgoing
+
+    ufw allow 22/tcp
+    ufw allow 80/tcp comment "HTTP web server"
+    ufw allow 443/tcp comment "HTTPS web server"
+    ufw allow 3030/tcp
+    ufw allow from "$LAN_CIDR" to any port 139,445 proto tcp
+    ufw allow from "$LAN_CIDR" to any port 137,138 proto udp
+    ufw allow from "$LAN_CIDR" to any port 5000 comment "LLM Team UI"
+    ufw allow from "$LAN_CIDR" to any port 9000 comment "MinIO LAN only"
+    ufw deny 9000 comment "Block MinIO external"
+    ufw allow from "$LAN_CIDR" to any port 11434 comment "Ollama internal"
+    ufw allow from "$LAN_CIDR" to any port 18789 comment "OpenClaw brain"
+
+    ufw --force enable >/dev/null 2>&1
+
+    # Restart services
+    systemctl start nginx
+    systemctl restart llm-team-ui
+    systemctl restart fail2ban
+
+    rm -f "$LOCKDOWN_FLAG"
+    log "${GREEN}Normal access restored. All services restarted.${NC}"
+}
+
+# --- RESTORE DB ---
+# Drop knowledge_base and re-create it from the newest dump in $PG_DIR.
+#   $1 - optional DATE; only dumps named knowledge_base_DATE*.dump are considered
+# Exits 1 when no dump matches, 0 when the user declines, and with the failing
+# command's status when the restore itself fails.
+cmd_db() {
+    local date="${1:-}"
+    local dump rc=0
+
+    # Newest dump by mtime. `|| true`: with pipefail an unmatched glob makes
+    # `ls` fail the pipeline, which would end the script before the message
+    # below is printed.
+    dump=$(ls -t -- "$PG_DIR"/knowledge_base_"${date}"*.dump 2>/dev/null | head -n 1) || true
+
+    if [ -z "$dump" ]; then
+        echo -e "${RED}No matching database dump found${NC}"
+        exit 1
+    fi
+
+    log "Restoring database from: $(basename "$dump")"
+    confirm "This will DROP and recreate knowledge_base. Continue? [y/N]" || exit 0
+
+    # Stop app to release connections
+    systemctl stop llm-team-ui 2>/dev/null || true
+
+    # Run the three steps as one unit and remember the first failure, so the
+    # app is started again below even when the restore goes wrong.
+    {
+        sudo -u postgres dropdb --if-exists knowledge_base &&
+            sudo -u postgres createdb -O kbuser knowledge_base &&
+            sudo -u postgres pg_restore -d knowledge_base "$dump" 2>&1 | tee -a "$LOG"
+    } || rc=$?
+
+    systemctl start llm-team-ui
+
+    if [ "$rc" -ne 0 ]; then
+        log "${RED}Database restore from $(basename "$dump") failed (exit $rc)${NC}"
+        exit "$rc"
+    fi
+    log "${GREEN}Database restored from $(basename "$dump")${NC}"
+}
+
+# --- RESTORE CONFIGS ---
+# Copy nginx, fail2ban, sshd, sysctl and systemd unit files from a borg
+# archive over the live ones, then reload the affected services.
+#   $1 - optional DATE used to pick the archive (see get_archive)
+# NOTE(review): backup.sh also archives sites-enabled/, jail.d/ and the
+# ollama/minio/vault units; they are not restored here.
+cmd_configs() {
+    local date="${1:-}"
+    local archive site
+    archive=$(get_archive "$date")
+
+    if [ -z "$archive" ]; then
+        echo -e "${RED}No matching borg archive found${NC}"
+        exit 1
+    fi
+
+    log "Restoring configs from archive: $archive"
+    confirm "This will overwrite current server configs. Continue? [y/N]" || exit 0
+
+    WORKDIR=$(mktemp -d)
+    extract_archive "$archive" "$WORKDIR"
+
+    # Restore each config
+    for site in "$WORKDIR"/etc/nginx/sites-available/*; do
+        # Skips the unexpanded pattern (no match) and anything that is not a file.
+        [ -f "$site" ] || continue
+        restore_path "etc/nginx/sites-available/${site##*/}" /etc/nginx/sites-available/
+    done
+    restore_path etc/nginx/nginx.conf /etc/nginx/nginx.conf
+    restore_path etc/fail2ban/jail.local /etc/fail2ban/jail.local
+    restore_path etc/ssh/sshd_config /etc/ssh/sshd_config
+    restore_path etc/sysctl.d/99-security.conf /etc/sysctl.d/99-security.conf
+    restore_path etc/systemd/system/llm-team-ui.service /etc/systemd/system/
+    restore_path etc/systemd/system/goaccess.service /etc/systemd/system/
+
+    # Reload everything. Unit files first, so systemd sees the restored ones.
+    systemctl daemon-reload
+    # try-reload-or-restart does nothing for a unit that is not running, so
+    # nginx stays down while cmd_lockdown has it stopped; a plain `reload`
+    # fails on a stopped unit and would abort the restore half-way.
+    if nginx -t; then
+        systemctl try-reload-or-restart nginx
+    else
+        log "${RED}nginx config test failed — nginx not reloaded${NC}"
+    fi
+    if sshd -t; then
+        systemctl try-reload-or-restart sshd
+    else
+        log "${RED}sshd config test failed — sshd not reloaded${NC}"
+    fi
+    systemctl restart fail2ban
+    sysctl --system >/dev/null 2>&1 || log "${YELLOW}sysctl --system reported errors${NC}"
+
+    rm -rf -- "$WORKDIR"
+    WORKDIR=""
+    log "${GREEN}Configs restored from $archive and services reloaded${NC}"
+}
+
+# --- RESTORE APP ---
+# Copy the app file, its JSON config and (when archived) the .env file from a
+# borg archive back into place and restart llm-team-ui. No confirmation prompt.
+#   $1 - optional DATE used to pick the archive (see get_archive)
+cmd_app() {
+    local date="${1:-}"
+    local archive
+    archive=$(get_archive "$date")
+
+    if [ -z "$archive" ]; then
+        echo -e "${RED}No matching borg archive found${NC}"
+        exit 1
+    fi
+
+    log "Restoring app from archive: $archive"
+
+    WORKDIR=$(mktemp -d)
+    extract_archive "$archive" "$WORKDIR"
+
+    # The two app files are mandatory (a failed copy aborts); .env is best effort.
+    cp -v -- "$WORKDIR/root/llm_team_ui.py" /root/llm_team_ui.py
+    cp -v -- "$WORKDIR/root/llm_team_config.json" /root/llm_team_config.json
+    restore_path home/profit/.env /home/profit/.env
+
+    systemctl restart llm-team-ui
+
+    rm -rf -- "$WORKDIR"
+    WORKDIR=""
+    log "${GREEN}App restored from $archive and restarted${NC}"
+}
+
+# --- FULL RESTORE ---
+# Run the configs, app and database restores in that order.
+#   $1 - optional DATE passed to each step
+# cmd_configs and cmd_db ask for their own confirmation again; any answer
+# other than "y" there ends the whole script with status 0.
+cmd_restore() {
+    local date="${1:-}"
+    log "${CYAN}=== FULL RESTORE ===${NC}"
+    confirm "This will restore configs, app, and database. Continue? [y/N]" || exit 0
+
+    cmd_configs "$date"
+    cmd_app "$date"
+    cmd_db "$date"
+
+    log "${GREEN}=== FULL RESTORE COMPLETE ===${NC}"
+    log "Run ./triage.sh to verify system health"
+}
+
+# --- Main ---
+[ $# -lt 1 ] && usage
+
+case "$1" in
+    status) cmd_status ;;
+    restore) cmd_restore "${2:-}" ;;
+    lockdown) cmd_lockdown ;;
+    unlock) cmd_unlock ;;
+    db) cmd_db "${2:-}" ;;
+    configs) cmd_configs "${2:-}" ;;
+    app) cmd_app "${2:-}" ;;
+    *) usage ;;
+esac
diff --git a/server/triage.sh b/server/triage.sh
new file mode 100755
index 0000000..629e15f
--- /dev/null
+++ b/server/triage.sh
@@ -0,0 +1,218 @@
+#!/bin/bash
+# =============================================================================
+# brain server triage — quick system health check
+# Run after incident, reboot, or anytime something feels off.
+# Exits 0 if healthy, 1 if issues found.
+# =============================================================================
+
+# No `set -e` here: a failing check is counted in ISSUES and the remaining
+# checks still run.
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+CYAN='\033[0;36m'
+NC='\033[0m'
+
+ISSUES=0
+
+# Result reporters, each taking the message as $1. Only fail() increments
+# ISSUES, which decides the exit status at the end.
+pass() { echo -e " ${GREEN}✓${NC} $1"; }
+fail() { echo -e " ${RED}✗${NC} $1"; ISSUES=$((ISSUES+1)); }
+warn() { echo -e " ${YELLOW}!${NC} $1"; }
+section() { echo -e "\n${CYAN}[$1]${NC}"; }
+
+echo -e "${CYAN}=== brain server triage — $(date) ===${NC}"
+
+# --- Services ---
+section "Services"
+for svc in llm-team-ui nginx ollama postgresql minio vault fail2ban ufw; do
+    if systemctl is-active --quiet "$svc" 2>/dev/null; then
+        pass "$svc running"
+    else
+        fail "$svc NOT running"
+    fi
+done
+
+# --- Ports ---
+section "Ports"
+# Check that something listens on a TCP port and, optionally, where.
+#   $1 - port number
+#   $2 - display name
+#   $3 - text the local address must contain ("" = any address)
+# Only the first matching listener reported by `ss` is compared against $3.
+check_port() {
+    local port=$1 name=$2 bind=$3
+    local actual_bind
+    # Column 4 of `ss -tln` is the local address:port of the listener.
+    actual_bind=$(ss -tlnp | grep ":${port} " | awk '{print $4}' | head -1)
+    if [ -z "$actual_bind" ]; then
+        fail "$name NOT listening on port $port"
+    elif [ -n "$bind" ] && [[ "$actual_bind" != *"$bind"* ]]; then
+        # Literal substring test: the dots in an IP must not act as wildcards.
+        fail "$name on $port — bound to $actual_bind (expected $bind)"
+    else
+        pass "$name listening on $actual_bind"
+    fi
+}
+check_port 5000 "Flask app" "127.0.0.1"
+check_port 80 "Nginx HTTP" ""
+check_port 11434 "Ollama" ""
+check_port 5432 "PostgreSQL" "127.0.0.1"
+check_port 9000 "MinIO" ""
+
+# --- Firewall ---
+section "Firewall"
+if ufw status | grep -q "Status: active"; then
+    pass "UFW active"
+    # Check default deny
+    if ufw status verbose | grep -q "Default: deny (incoming)"; then
+        pass "Default deny incoming"
+    else
+        fail "Default is NOT deny incoming"
+    fi
+else
+    fail "UFW is NOT active"
+fi
+
+# --- Fail2ban ---
+section "Fail2ban"
+jail_count=$(fail2ban-client status 2>/dev/null | grep "Number of jail" | awk '{print $NF}')
+# No output (fail2ban down or not installed) counts as zero jails, so the
+# failure message below always carries a number.
+jail_count=${jail_count:-0}
+if [ "$jail_count" -ge 3 ]; then
+    pass "$jail_count jails active"
+else
+    fail "Only $jail_count fail2ban jails (expected >= 3)"
+fi
+
+# Check for banned IPs
+banned=$(fail2ban-client status sshd 2>/dev/null | grep "Currently banned" | awk '{print $NF}')
+if [ "$banned" -gt 0 ] 2>/dev/null; then
+    warn "$banned IPs currently banned on SSH"
+fi
+
+# --- SSH ---
+section "SSH"
+if grep -q "^PermitRootLogin no" /etc/ssh/sshd_config; then
+    pass "Root login disabled"
+else
+    fail "Root login NOT disabled"
+fi
+
+if grep -q "^PasswordAuthentication no" /etc/ssh/sshd_config; then
+    pass "Password auth disabled"
+else
+    warn "Password auth still enabled (SSH keys not yet set up)"
+fi
+
+if grep -q "^MaxAuthTries 3" /etc/ssh/sshd_config; then
+    pass "Max auth tries = 3"
+else
+    fail "Max auth tries not set to 3"
+fi
+
+# --- Nginx headers ---
+section "Nginx Security Headers"
+headers=$(curl -sI http://127.0.0.1 2>/dev/null)
+for h in "X-Frame-Options" "X-Content-Type-Options" "Referrer-Policy" "X-XSS-Protection"; do
+    # Anchor on "Name:" so the name is matched as a header field, not as text
+    # inside some other header's value.
+    if echo "$headers" | grep -qi "^$h:"; then
+        pass "$h present"
+    else
+        fail "$h MISSING"
+    fi
+done
+
+# --- Kernel hardening ---
+section "Kernel"
+if [ "$(sysctl -n net.ipv4.conf.all.rp_filter 2>/dev/null)" = "1" ]; then
+    pass "Reverse path filtering enabled"
+else
+    fail "Reverse path filtering disabled"
+fi
+if [ "$(sysctl -n net.ipv4.conf.all.send_redirects 2>/dev/null)" = "0" ]; then
+    pass "ICMP redirects disabled"
+else
+    fail "ICMP redirects NOT disabled"
+fi
+
+# --- App health ---
+section "Application"
+http_code=$(curl -so /dev/null -w '%{http_code}' http://127.0.0.1/ 2>/dev/null)
+if [ "$http_code" = "200" ] || [ "$http_code" = "302" ]; then
+    pass "App responding (HTTP $http_code)"
+else
+    fail "App NOT responding (HTTP $http_code)"
+fi
+
+# --- Database ---
+section "Database"
+if sudo -u postgres psql -d knowledge_base -c "SELECT 1;" >/dev/null 2>&1; then
+    tables=$(sudo -u postgres psql -d knowledge_base -t -c "SELECT count(*) FROM information_schema.tables WHERE table_schema='public';" 2>/dev/null | tr -d ' ')
+    pass "knowledge_base reachable ($tables tables)"
+else
+    fail "Cannot connect to knowledge_base"
+fi
+
+# --- Disk ---
+section "Disk"
+# Use% of the root filesystem as a bare number.
+usage=$(df / --output=pcent | tail -1 | tr -d ' %')
+if [ "$usage" -lt 80 ]; then
+    pass "Disk usage: ${usage}%"
+elif [ "$usage" -lt 95 ]; then
+    warn "Disk usage: ${usage}% (getting full)"
+else
+    fail "Disk usage: ${usage}% (critical!)"
+fi
+
+# --- Backups ---
+section "Backups"
+if [ -d /var/backups/brain/borg-repo ]; then
+    last_backup=$(borg list /var/backups/brain/borg-repo 2>/dev/null | tail -1)
+    if [ -n "$last_backup" ]; then
+        pass "Borg repo exists, last: $last_backup"
+    else
+        warn "Borg repo exists but empty — run backup.sh"
+    fi
+else
+    fail "No borg repo at /var/backups/brain/borg-repo"
+fi
+
+last_dump=$(ls -t /var/backups/brain/pg-dumps/knowledge_base_*.dump 2>/dev/null | head -1)
+if [ -n "$last_dump" ]; then
+    # Age in whole days, from the dump file's modification time.
+    dump_age=$(( ($(date +%s) - $(stat -c %Y "$last_dump")) / 86400 ))
+    if [ "$dump_age" -le 1 ]; then
+        pass "Latest pg dump: $(basename "$last_dump")"
+    else
+        warn "Latest pg dump is ${dump_age} days old: $(basename "$last_dump")"
+    fi
+else
+    warn "No PostgreSQL dumps found"
+fi
+
+# --- Suspicious activity ---
+section "Security Scan"
+# Check for unexpected SUID binaries (one scan; the list is reused for the
+# count and for the report).
+suid_files=$(find /usr/local -perm -4000 -type f 2>/dev/null)
+if [ -z "$suid_files" ]; then
+    pass "No unexpected SUID binaries in /usr/local"
+else
+    suid_count=$(printf '%s\n' "$suid_files" | wc -l)
+    fail "$suid_count SUID binaries found in /usr/local — investigate"
+    printf '%s\n' "$suid_files"
+fi
+
+# Check for unauthorized cron jobs. -x drops only the entry named exactly
+# "root", so a user whose name merely contains "root" is still reported.
+cron_users=$(ls /var/spool/cron/crontabs/ 2>/dev/null | grep -vx root)
+if [ -z "$cron_users" ]; then
+    pass "No non-root user crontabs"
+else
+    warn "Crontabs found for: $cron_users"
+fi
+
+# Check for recent failed SSH logins. The OpenSSH unit is ssh.service on
+# Debian/Ubuntu and sshd.service elsewhere, so both names are queried.
+failed_ssh=$(journalctl -u ssh -u sshd --since "1 hour ago" --no-pager 2>/dev/null | grep -c "Failed password" || true)
+if [ "$failed_ssh" -gt 10 ]; then
+    warn "$failed_ssh failed SSH logins in last hour"
+elif [ "$failed_ssh" -gt 0 ]; then
+    pass "$failed_ssh failed SSH logins in last hour (normal)"
+else
+    pass "No failed SSH logins in last hour"
+fi
+
+# --- Summary ---
+echo ""
+if [ "$ISSUES" -eq 0 ]; then
+    echo -e "${GREEN}=== ALL CHECKS PASSED ===${NC}"
+    exit 0
+else
+    echo -e "${RED}=== $ISSUES ISSUE(S) FOUND ===${NC}"
+    exit 1
+fi