Add triage, backup, and disaster recovery system

- brain-backup: daily borg + pg_dump, 7d/4w/3m retention, cron at 3AM
- brain-triage: full system health check (services, ports, firewall,
  headers, kernel, app, DB, disk, backups, security scan)
- brain-recover: restore from backup (full/db/configs/app) + emergency
  lockdown mode that blocks all external access except LAN SSH

All accessible via /usr/local/bin/brain-{backup,triage,recover}

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-03-25 04:52:48 -05:00
parent 6ea457d01d
commit 2bb910b72c
3 changed files with 577 additions and 0 deletions

73
server/backup.sh Executable file
View File

@ -0,0 +1,73 @@
#!/bin/bash
# =============================================================================
# brain server backup — runs daily via cron
# Backs up: app, app config/.env, database dumps, nginx, systemd units, sshd_config, fail2ban, sysctl
# Storage: /var/backups/brain/ (borg repo + pg dumps)
# Retention: 7 daily, 4 weekly, 3 monthly
# =============================================================================
set -euo pipefail

# Storage layout: the borg repository and the PostgreSQL dumps share one root.
BACKUP_DIR="/var/backups/brain"
PG_DIR="${BACKUP_DIR}/pg-dumps"
BORG_REPO="${BACKUP_DIR}/borg-repo"
LOG="/var/log/brain-backup.log"

# Minute-resolution stamp of this run.
TIMESTAMP="$(date +%Y-%m-%d_%H%M)"

# Borg reads its passphrase from the environment; this script always sets an
# empty one.
export BORG_PASSPHRASE=""

#######################################
# Print a message prefixed with the current date/time and append the same
# line to the log file.
# Globals:   LOG (read)
# Arguments: $1 - message text
# Outputs:   the stamped line on stdout and in $LOG
#######################################
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '%s\n' "[${stamp}] $1" | tee -a "$LOG"
}
log "=== Backup starting ==="
# --- PostgreSQL dump ---
# The dump is written under a ".partial" name and renamed only after pg_dump
# exits successfully. A failed or interrupted dump therefore never leaves a
# truncated knowledge_base_*.dump behind that a later restore could mistake
# for the latest good dump.
mkdir -p "$PG_DIR"
log "Dumping PostgreSQL knowledge_base..."
PG_DUMP_FILE="$PG_DIR/knowledge_base_${TIMESTAMP}.dump"
PG_DUMP_TMP="${PG_DUMP_FILE}.partial"
if ! sudo -u postgres pg_dump -Fc knowledge_base > "$PG_DUMP_TMP" 2>> "$LOG"; then
  rm -f -- "$PG_DUMP_TMP"
  log " pg_dump FAILED — see $LOG"
  exit 1
fi
mv -f -- "$PG_DUMP_TMP" "$PG_DUMP_FILE"
# Keep last 14 dumps. Parsing `ls` is acceptable here only because every file
# name matched by the glob is generated by this script (no whitespace).
ls -t "$PG_DIR"/knowledge_base_*.dump 2>/dev/null | tail -n +15 | xargs -r rm --
log " pg_dump OK ($(du -sh "$PG_DUMP_FILE" | cut -f1))"
# --- Borg backup ---
# Borg exit codes: 0 = success, 1 = warning (the archive was still written,
# e.g. a listed path does not exist or a file changed while being read),
# 2 or higher = error. A warning must not abort the script under `set -e`,
# otherwise the prune/compact steps further down never run.
log "Running borg backup..."
borg_rc=0
borg create \
  --stats \
  --compression zstd,3 \
  --exclude '*.pyc' \
  --exclude '__pycache__' \
  --exclude '.git' \
  --exclude 'node_modules' \
  "$BORG_REPO::${TIMESTAMP}" \
  /root/llm_team_ui.py \
  /root/llm_team_config.json \
  /home/profit/.env \
  /etc/nginx/sites-available/ \
  /etc/nginx/sites-enabled/ \
  /etc/nginx/nginx.conf \
  /etc/fail2ban/jail.local \
  /etc/fail2ban/jail.d/ \
  /etc/ssh/sshd_config \
  /etc/sysctl.d/99-security.conf \
  /etc/systemd/system/llm-team-ui.service \
  /etc/systemd/system/goaccess.service \
  /etc/systemd/system/ollama.service \
  /etc/systemd/system/minio.service \
  /etc/systemd/system/vault.service \
  /var/backups/brain/pg-dumps/ \
  2>> "$LOG" || borg_rc=$?
case "$borg_rc" in
  0)
    log " borg OK"
    ;;
  1)
    log " borg finished with warnings — see $LOG"
    ;;
  *)
    # Real failure: record it explicitly instead of dying silently.
    log " borg FAILED (exit code $borg_rc) — see $LOG"
    exit "$borg_rc"
    ;;
esac
# --- Prune old backups ---
# Retention policy: 7 daily, 4 weekly and 3 monthly archives are kept.
log "Pruning old backups..."
retention=(--keep-daily=7 --keep-weekly=4 --keep-monthly=3)
borg prune "${retention[@]}" "$BORG_REPO" 2>> "$LOG"
borg compact "$BORG_REPO" 2>> "$LOG"
log " prune OK"

# --- Report ---
# Final log line carries the on-disk size of the repository.
REPO_SIZE="$(du -sh "$BORG_REPO" | cut -f1)"
log "=== Backup complete. Repo size: $REPO_SIZE ==="

286
server/recover.sh Executable file
View File

@ -0,0 +1,286 @@
#!/bin/bash
# =============================================================================
# brain server recovery — restore from backup or lock down after compromise
#
# Usage:
# ./recover.sh status Show available backups
# ./recover.sh restore [DATE] Full restore: configs + app + DB from backup (latest or DATE)
# ./recover.sh lockdown Emergency lockdown — block all external access
# ./recover.sh unlock Undo lockdown — restore normal firewall rules
# ./recover.sh db [DATE] Restore only the database
# ./recover.sh configs [DATE] Restore only configs (nginx, ssh, fail2ban, etc.)
# ./recover.sh app [DATE] Restore only the app file + config
# =============================================================================
set -euo pipefail

# Backup locations, lockdown marker and log file used by every sub-command.
BORG_REPO="/var/backups/brain/borg-repo"
PG_DIR="/var/backups/brain/pg-dumps"
LOCKDOWN_FLAG="/var/backups/brain/.lockdown-active"
LOG="/var/log/brain-recovery.log"

# Borg reads its passphrase from the environment; this script always sets an
# empty one.
export BORG_PASSPHRASE=""

# ANSI colour escapes, stored unexpanded; they are interpreted only when a
# message is printed with backslash-escape processing.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'

#######################################
# Print a message (backslash escapes interpreted) prefixed with the current
# date/time and append the same line to the log file.
# Globals:   LOG (read)
# Arguments: $1 - message text, may contain the colour escapes above
# Outputs:   the stamped line on stdout and in $LOG
#######################################
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '%b\n' "[${stamp}] $1" | tee -a "$LOG"
}
#######################################
# Print the command summary and terminate the script.
# Outputs: help text on stdout
# Returns: never returns; exits with status 1
#######################################
usage() {
  cat <<EOF
Usage: $0 {status|restore|lockdown|unlock|db|configs|app} [DATE]

Commands:
 status List available backups
 restore [DATE] Full restore (configs + DB + app)
 lockdown Emergency: block all external traffic
 unlock Undo lockdown, restore normal firewall
 db [DATE] Restore database only
 configs [DATE] Restore server configs only
 app [DATE] Restore app + config file only
EOF
  exit 1
}
#######################################
# Print the name of the last archive listed by `borg list`, optionally
# restricted to archives whose NAME contains DATE (plain substring, not a
# regular expression).
# Globals:   BORG_REPO (read)
# Arguments: $1 - optional substring of the archive name, e.g. 2026-03-25
# Outputs:   the archive name on stdout; nothing when no archive matches
# Returns:   0 even when nothing matches, so callers running under
#            `set -euo pipefail` reach their own "not found" handling instead
#            of being killed by a non-matching grep; non-zero only when
#            `borg list` itself fails (and pipefail is in effect).
#######################################
get_archive() {
  local date="${1:-}"
  # awk keeps the last matching name; an empty DATE matches every archive.
  borg list "$BORG_REPO" \
    | awk -v want="$date" '
        want == "" || index($1, want) { name = $1 }
        END { if (name != "") print name }
      '
}
# --- STATUS ---
#######################################
# Show what can be restored: up to 20 borg archives with the repository
# size, up to 10 PostgreSQL dumps, and whether a lockdown is active.
# Globals:   BORG_REPO, PG_DIR, LOCKDOWN_FLAG, CYAN, RED, NC (read)
# Outputs:   human-readable report on stdout
#######################################
cmd_status() {
  local archives dumps repo_size
  echo -e "${CYAN}=== Available Backups ===${NC}"
  echo -e "\n${CYAN}Borg archives:${NC}"
  # The listing is captured first and trimmed afterwards. Piping straight
  # into `head` under `pipefail` can turn an early-closed pipe into a bogus
  # "No borg archives found".
  if archives=$(borg list "$BORG_REPO" 2>/dev/null); then
    if [ -n "$archives" ]; then
      head -20 <<<"$archives"
    fi
    repo_size=$(du -sh "$BORG_REPO" | cut -f1)
    echo -e " Repo size: $repo_size"
  else
    echo " No borg archives found"
  fi
  echo -e "\n${CYAN}PostgreSQL dumps:${NC}"
  # `ls` is used for display only; it fails when the glob matches nothing.
  if dumps=$(ls -lht "$PG_DIR"/knowledge_base_*.dump 2>/dev/null); then
    head -10 <<<"$dumps"
  else
    echo " No dumps found"
  fi
  if [ -f "$LOCKDOWN_FLAG" ]; then
    echo -e "\n${RED}*** LOCKDOWN IS ACTIVE ***${NC}"
    echo " Run '$0 unlock' to restore normal access"
  fi
}
# --- LOCKDOWN ---
#######################################
# Emergency lockdown: snapshot evidence, then replace the firewall rules
# with deny-all (in and out) except SSH from the LAN plus outbound
# DNS/HTTP/HTTPS, and stop nginx.
# Globals:   LOG, LOCKDOWN_FLAG, RED, NC (read)
# Outputs:   progress via log(); connection list appended to $LOG;
#            auth-log copy and ufw status saved under /var/backups/brain
#######################################
cmd_lockdown() {
  log "${RED}=== EMERGENCY LOCKDOWN ===${NC}"

  # Capture evidence BEFORE anything is changed: once nginx is stopped, its
  # client connections are gone from the socket table. Both steps are
  # best-effort (|| true) so a missing tool or file cannot block the lockdown.
  log "Current connections:"
  ss -tnp | tee -a "$LOG" || true
  log "Saving auth log snapshot..."
  cp /var/log/auth.log "/var/backups/brain/auth-lockdown-$(date +%Y%m%d_%H%M%S).log" 2>/dev/null || true

  # Set the marker before the first firewall change. If a ufw command below
  # fails and `set -e` aborts the script, `unlock` (which refuses to run
  # without the marker) can still rebuild the normal rule set.
  touch "$LOCKDOWN_FLAG"

  log "Blocking all external access except SSH from LAN..."
  # Save current rules
  ufw status verbose > /var/backups/brain/ufw-pre-lockdown.txt 2>/dev/null || true
  # Reset and lock down
  ufw --force reset >/dev/null 2>&1
  ufw default deny incoming
  ufw default deny outgoing
  # Only allow SSH from LAN
  ufw allow from 192.168.1.0/24 to any port 22
  # Allow DNS out (needed for recovery)
  ufw allow out 53
  # Allow apt out (needed for fixes)
  ufw allow out 80/tcp
  ufw allow out 443/tcp
  ufw --force enable >/dev/null 2>&1
  # Stop public-facing services
  systemctl stop nginx 2>/dev/null || true

  log "${RED}LOCKDOWN ACTIVE — only LAN SSH allowed${NC}"
  log "Services stopped: nginx"
  log "Run '$0 unlock' when safe"
}
# --- UNLOCK ---
#######################################
# Undo a lockdown: rebuild the normal firewall rule set, bring the services
# back and remove the lockdown marker.
# Globals:   LOCKDOWN_FLAG, GREEN, NC (read)
# Returns:   exits 0 immediately when no lockdown marker exists
#######################################
cmd_unlock() {
  # Nothing to undo unless a lockdown left its marker behind.
  [[ -f "$LOCKDOWN_FLAG" ]] || { echo "Lockdown is not active."; exit 0; }

  log "${GREEN}=== RESTORING NORMAL ACCESS ===${NC}"

  local lan="192.168.1.0/24"

  # Firewall: wipe the lockdown rules, then re-create the regular ones.
  ufw --force reset >/dev/null 2>&1
  ufw default deny incoming
  ufw default allow outgoing
  # Reachable from anywhere
  ufw allow 22/tcp
  ufw allow 80/tcp comment "HTTP web server"
  ufw allow 443/tcp comment "HTTPS web server"
  ufw allow 3030/tcp
  # Reachable from the LAN only
  ufw allow from "$lan" to any port 139,445 proto tcp
  ufw allow from "$lan" to any port 137,138 proto udp
  ufw allow from "$lan" to any port 5000 comment "LLM Team UI"
  ufw allow from "$lan" to any port 9000 comment "MinIO LAN only"
  ufw deny 9000 comment "Block MinIO external"
  ufw allow from "$lan" to any port 11434 comment "Ollama internal"
  ufw allow from "$lan" to any port 18789 comment "OpenClaw brain"
  ufw --force enable >/dev/null 2>&1

  # Services: nginx was stopped by the lockdown, the others are restarted.
  systemctl start nginx
  local svc
  for svc in llm-team-ui fail2ban; do
    systemctl restart "$svc"
  done

  rm -f "$LOCKDOWN_FLAG"
  log "${GREEN}Normal access restored. All services restarted.${NC}"
}
# --- RESTORE DB ---
#######################################
# Drop and recreate the knowledge_base database from a pg_dump file.
# Globals:   PG_DIR, LOG, RED, GREEN, YELLOW, NC (read)
# Arguments: $1 - optional prefix of the dump timestamp (e.g. 2026-03-25);
#                 when empty the most recently modified dump is used
# Inputs:    reads a y/N confirmation from stdin
# Returns:   exits 1 when no dump matches, exits 0 when not confirmed
#######################################
cmd_db() {
  local date="${1:-}"
  local dump confirm
  # `ls` exits non-zero when the glob matches nothing. Under
  # `set -euo pipefail` that would kill the script right here, before the
  # explicit error message below, hence `|| true`. An empty $date collapses
  # the pattern to knowledge_base_*.dump; quoting it prevents word splitting.
  dump=$(ls -t -- "$PG_DIR"/knowledge_base_"${date}"*.dump 2>/dev/null | head -1) || true
  if [ -z "$dump" ]; then
    echo -e "${RED}No matching database dump found${NC}"
    exit 1
  fi
  log "Restoring database from: $(basename "$dump")"
  echo -e "${YELLOW}This will DROP and recreate knowledge_base. Continue? [y/N]${NC}"
  read -r confirm
  [ "$confirm" = "y" ] || exit 0
  # Stop app to release connections
  systemctl stop llm-team-ui 2>/dev/null || true
  sudo -u postgres dropdb --if-exists knowledge_base
  sudo -u postgres createdb -O kbuser knowledge_base
  sudo -u postgres pg_restore -d knowledge_base "$dump" 2>&1 | tee -a "$LOG"
  systemctl start llm-team-ui
  log "${GREEN}Database restored from $(basename "$dump")${NC}"
}
# --- RESTORE CONFIGS ---
#######################################
# Restore server configuration files (nginx, fail2ban, sshd, sysctl and two
# systemd units) from a borg archive and reload the affected services.
# Globals:   BORG_REPO, RED, GREEN, YELLOW, NC (read)
# Arguments: $1 - optional archive date filter passed to get_archive
# Inputs:    reads a y/N confirmation from stdin
# Returns:   exits 1 when no archive matches or extraction fails,
#            exits 0 when not confirmed
#######################################
cmd_configs() {
  local date="${1:-}"
  local archive confirm tmpdir src
  local borg_rc=0
  # `|| true`: a failed or empty lookup is reported by the check below
  # instead of silently killing the script under `set -e`.
  archive=$(get_archive "$date") || true
  if [ -z "$archive" ]; then
    echo -e "${RED}No matching borg archive found${NC}"
    exit 1
  fi
  log "Restoring configs from archive: $archive"
  echo -e "${YELLOW}This will overwrite current server configs. Continue? [y/N]${NC}"
  read -r confirm
  [ "$confirm" = "y" ] || exit 0

  tmpdir=$(mktemp -d)
  # borg extracts into the current directory, so the cd is confined to a
  # subshell and the script's own working directory never changes. Only the
  # etc/ subtree is extracted; the archive also contains the database dumps,
  # which are not needed here.
  (
    cd "$tmpdir" || exit 2
    borg extract "$BORG_REPO::${archive}" etc
  ) || borg_rc=$?
  # borg exit code 1 is a warning (extraction completed); 2 or higher is an error.
  if [ "$borg_rc" -gt 1 ]; then
    rm -rf -- "$tmpdir"
    log "${RED}borg extract failed (exit code $borg_rc); no configs were changed${NC}"
    exit 1
  fi

  # Restore each config (best-effort: a file missing from the archive is skipped)
  src="$tmpdir/etc"
  cp -v "$src"/nginx/sites-available/* /etc/nginx/sites-available/ 2>/dev/null || true
  cp -v "$src/nginx/nginx.conf" /etc/nginx/nginx.conf 2>/dev/null || true
  cp -v "$src/fail2ban/jail.local" /etc/fail2ban/jail.local 2>/dev/null || true
  cp -v "$src/ssh/sshd_config" /etc/ssh/sshd_config 2>/dev/null || true
  cp -v "$src/sysctl.d/99-security.conf" /etc/sysctl.d/99-security.conf 2>/dev/null || true
  cp -v "$src/systemd/system/llm-team-ui.service" /etc/systemd/system/ 2>/dev/null || true
  cp -v "$src/systemd/system/goaccess.service" /etc/systemd/system/ 2>/dev/null || true
  # Clean up before the reloads so a failing reload cannot leak the temp dir.
  rm -rf -- "$tmpdir"

  # Reload everything (nginx/sshd only when their config test passes)
  nginx -t && systemctl reload nginx
  sshd -t && systemctl reload sshd
  systemctl restart fail2ban
  sysctl --system >/dev/null 2>&1
  systemctl daemon-reload
  log "${GREEN}Configs restored from $archive and services reloaded${NC}"
}
# --- RESTORE APP ---
#######################################
# Restore the application file, its JSON config and (if present in the
# archive) the .env file from a borg archive, then restart the app service.
# Globals:   BORG_REPO, RED, GREEN, NC (read)
# Arguments: $1 - optional archive date filter passed to get_archive
# Returns:   exits 1 when no archive matches, extraction fails, or one of
#            the two required files cannot be restored
#######################################
cmd_app() {
  local date="${1:-}"
  local archive tmpdir
  local borg_rc=0
  # `|| true`: a failed or empty lookup is reported by the check below
  # instead of silently killing the script under `set -e`.
  archive=$(get_archive "$date") || true
  if [ -z "$archive" ]; then
    echo -e "${RED}No matching borg archive found${NC}"
    exit 1
  fi
  log "Restoring app from archive: $archive"

  tmpdir=$(mktemp -d)
  # borg extracts into the current directory, so the cd is confined to a
  # subshell. Only the three files restored below are extracted, not the
  # whole archive (which also holds the database dumps).
  (
    cd "$tmpdir" || exit 2
    borg extract "$BORG_REPO::${archive}" \
      root/llm_team_ui.py \
      root/llm_team_config.json \
      home/profit/.env
  ) || borg_rc=$?
  # borg exit code 1 is a warning (e.g. the optional .env is not in this
  # archive); 2 or higher is an error.
  if [ "$borg_rc" -gt 1 ]; then
    rm -rf -- "$tmpdir"
    log "${RED}borg extract failed (exit code $borg_rc); app was not changed${NC}"
    exit 1
  fi

  # The two app files are mandatory; the temp dir is removed on failure too.
  if ! cp -v "$tmpdir/root/llm_team_ui.py" /root/llm_team_ui.py \
    || ! cp -v "$tmpdir/root/llm_team_config.json" /root/llm_team_config.json; then
    rm -rf -- "$tmpdir"
    log "${RED}App restore from $archive failed: required file missing or not writable${NC}"
    exit 1
  fi
  # .env is optional
  cp -v "$tmpdir/home/profit/.env" /home/profit/.env 2>/dev/null || true
  rm -rf -- "$tmpdir"

  systemctl restart llm-team-ui
  log "${GREEN}App restored from $archive and restarted${NC}"
}
# --- FULL RESTORE ---
#######################################
# Run the three partial restores back to back (configs, then app, then
# database) after a single up-front confirmation.
# Globals:   CYAN, YELLOW, GREEN, NC (read)
# Arguments: $1 - optional date, handed unchanged to every step
# Inputs:    reads a y/N confirmation from stdin
# Returns:   exits 0 when not confirmed
#######################################
cmd_restore() {
  local date="${1:-}"
  local step

  log "${CYAN}=== FULL RESTORE ===${NC}"
  printf '%b\n' "${YELLOW}This will restore configs, app, and database. Continue? [y/N]${NC}"
  read -r confirm
  if [[ "$confirm" != "y" ]]; then
    exit 0
  fi

  for step in cmd_configs cmd_app cmd_db; do
    "$step" "$date"
  done

  log "${GREEN}=== FULL RESTORE COMPLETE ===${NC}"
  log "Run ./triage.sh to verify system health"
}
# --- Main ---
# Dispatch on the first argument; the optional second argument (DATE) is
# forwarded to the restore-type commands. Anything else prints usage.
if [ $# -lt 1 ]; then
  usage
fi
case "$1" in
  status | lockdown | unlock)
    "cmd_$1"
    ;;
  restore | db | configs | app)
    "cmd_$1" "${2:-}"
    ;;
  *)
    usage
    ;;
esac

218
server/triage.sh Executable file
View File

@ -0,0 +1,218 @@
#!/bin/bash
# =============================================================================
# brain server triage — quick system health check
# Run after incident, reboot, or anytime something feels off.
# Exits 0 if healthy, 1 if issues found.
# =============================================================================
# ANSI colour escapes, interpreted by `echo -e` in the helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
NC='\033[0m'
# Number of failed checks; decides the script's exit status.
ISSUES=0
# Result helpers. Each prints one line with a textual marker, so results stay
# distinguishable when colour is unavailable (log files, pipes, serial
# consoles). Only fail() counts as an issue.
pass() { echo -e " ${GREEN}OK${NC} $1"; }
fail() { echo -e " ${RED}FAIL${NC} $1"; ISSUES=$((ISSUES+1)); }
warn() { echo -e " ${YELLOW}!${NC} $1"; }
# section NAME — print a blank line followed by a coloured "[NAME]" heading.
section() { echo -e "\n${CYAN}[$1]${NC}"; }
# Banner with the wall-clock time of this run.
echo -e "${CYAN}=== brain server triage — $(date) ===${NC}"

# --- Services ---
# Every unit below must be reported active by systemd; each inactive one is
# recorded through fail().
section "Services"
monitored_units=(llm-team-ui nginx ollama postgresql minio vault fail2ban ufw)
for unit in "${monitored_units[@]}"; do
  if ! systemctl is-active --quiet "$unit" 2>/dev/null; then
    fail "$unit NOT running"
    continue
  fi
  pass "$unit running"
done

# --- Ports ---
section "Ports"
#######################################
# Check that something is listening on a TCP port and, optionally, that the
# first listener found is bound to an expected address.
# Arguments: $1 - port number
#            $2 - human-readable service name used in the report
#            $3 - expected bind address as a literal substring
#                 (empty or omitted = any address is accepted)
# Outputs:   exactly one pass()/fail() line
# Returns:   0
#######################################
check_port() {
  local port=$1 name=$2 bind=${3:-}
  local actual_bind
  # One `ss` call only, so the "is it listening" and "where is it bound"
  # answers cannot disagree. Field 4 is the local address:port column; the
  # first entry whose column ends in ":PORT" is used.
  actual_bind=$(ss -tlnp | awk -v suffix=":${port}" '
    length($4) >= length(suffix) &&
      substr($4, length($4) - length(suffix) + 1) == suffix { print $4; exit }
  ')
  if [ -z "$actual_bind" ]; then
    fail "$name NOT listening on port $port"
    return 0
  fi
  # Literal substring comparison: the dots of an IP address must not act as
  # regex wildcards.
  if [ -n "$bind" ] && [[ "$actual_bind" != *"$bind"* ]]; then
    fail "$name on $port — bound to $actual_bind (expected $bind)"
  else
    pass "$name listening on $actual_bind"
  fi
}
# Expected listeners, one per line as PORT|LABEL|REQUIRED_BIND (an empty
# third field means any bind address is accepted). The table is read on
# fd 3 so that commands run inside the loop keep their normal stdin.
while IFS='|' read -r -u 3 port label expected_bind; do
  check_port "$port" "$label" "$expected_bind"
done 3<<'PORTS'
5000|Flask app|127.0.0.1
80|Nginx HTTP|
11434|Ollama|
5432|PostgreSQL|127.0.0.1
9000|MinIO|
PORTS

# --- Firewall ---
# UFW must be active, and its default policy for incoming traffic must be deny.
section "Firewall"
if ! ufw status | grep -q "Status: active"; then
  fail "UFW is NOT active"
else
  pass "UFW active"
  # Check default deny
  if ! ufw status verbose | grep -q "Default: deny (incoming)"; then
    fail "Default is NOT deny incoming"
  else
    pass "Default deny incoming"
  fi
fi
# --- Fail2ban ---
# At least three jails are expected; the last field of the "Number of jail"
# status line is taken as the count.
section "Fail2ban"
jail_count="$(fail2ban-client status 2>/dev/null | awk '/Number of jail/ {print $NF}')"
if [ -z "$jail_count" ] || ! [ "$jail_count" -ge 3 ]; then
  fail "Only $jail_count fail2ban jails (expected >= 3)"
else
  pass "$jail_count jails active"
fi

# Check for banned IPs (informational only; a non-numeric or missing value is
# silently ignored by discarding the test's error output).
banned="$(fail2ban-client status sshd 2>/dev/null | awk '/Currently banned/ {print $NF}')"
if [ "$banned" -gt 0 ] 2>/dev/null; then
  warn "$banned IPs currently banned on SSH"
fi
# --- SSH ---
# Looks for three hardening directives at the start of a line in the main
# sshd configuration file.
section "SSH"
sshd_cfg=/etc/ssh/sshd_config

if ! grep -q "^PermitRootLogin no" "$sshd_cfg"; then
  fail "Root login NOT disabled"
else
  pass "Root login disabled"
fi

# Password authentication still being enabled is only a warning.
if ! grep -q "^PasswordAuthentication no" "$sshd_cfg"; then
  warn "Password auth still enabled (SSH keys not yet set up)"
else
  pass "Password auth disabled"
fi

if ! grep -q "^MaxAuthTries 3" "$sshd_cfg"; then
  fail "Max auth tries not set to 3"
else
  pass "Max auth tries = 3"
fi
# --- Nginx headers ---
# Fetch the response headers once from the local web server, then look for
# each required security header (case-insensitive).
section "Nginx Security Headers"
headers="$(curl -sI http://127.0.0.1 2>/dev/null)"
required_headers=("X-Frame-Options" "X-Content-Type-Options" "Referrer-Policy" "X-XSS-Protection")
for h in "${required_headers[@]}"; do
  if ! grep -qi -- "$h" <<<"$headers"; then
    fail "$h MISSING"
  else
    pass "$h present"
  fi
done

# --- Kernel hardening ---
# rp_filter must read 1 and send_redirects must read 0.
section "Kernel"
rp_filter="$(sysctl -n net.ipv4.conf.all.rp_filter 2>/dev/null)"
case "$rp_filter" in
  1) pass "Reverse path filtering enabled" ;;
  *) fail "Reverse path filtering disabled" ;;
esac

send_redirects="$(sysctl -n net.ipv4.conf.all.send_redirects 2>/dev/null)"
case "$send_redirects" in
  0) pass "ICMP redirects disabled" ;;
  *) fail "ICMP redirects NOT disabled" ;;
esac
# --- App health ---
# A 200 or 302 from the local web server counts as "responding".
section "Application"
http_code="$(curl -so /dev/null -w '%{http_code}' http://127.0.0.1/ 2>/dev/null)"
case "$http_code" in
  200 | 302) pass "App responding (HTTP $http_code)" ;;
  *) fail "App NOT responding (HTTP $http_code)" ;;
esac

# --- Database ---
# Connectivity probe first; the table count of the public schema is only
# queried when the probe succeeds.
section "Database"
if ! sudo -u postgres psql -d knowledge_base -c "SELECT 1;" >/dev/null 2>&1; then
  fail "Cannot connect to knowledge_base"
else
  table_sql="SELECT count(*) FROM information_schema.tables WHERE table_schema='public';"
  tables="$(sudo -u postgres psql -d knowledge_base -t -c "$table_sql" 2>/dev/null | tr -d ' ')"
  pass "knowledge_base reachable ($tables tables)"
fi

# --- Disk ---
# Root filesystem usage: below 80% is fine, below 95% is a warning, anything
# else is a failure.
section "Disk"
usage="$(df / --output=pcent | tail -n 1 | tr -d ' %')"
if [ "$usage" -lt 80 ]; then
  pass "Disk usage: ${usage}%"
elif [ "$usage" -lt 95 ]; then
  warn "Disk usage: ${usage}% (getting full)"
else
  fail "Disk usage: ${usage}% (critical!)"
fi
# --- Backups ---
# Verifies that the borg repository can actually be listed and that the
# newest PostgreSQL dump is at most one day old.
section "Backups"
if [ -d /var/backups/brain/borg-repo ]; then
  # Pass BORG_PASSPHRASE explicitly (the caller's value if set, otherwise
  # empty) so borg never falls back to an interactive passphrase prompt.
  # NOTE(review): the empty default assumes the repo is unencrypted or uses an
  # empty passphrase — confirm against how the repository was initialised.
  if borg_listing=$(BORG_PASSPHRASE="${BORG_PASSPHRASE-}" borg list /var/backups/brain/borg-repo 2>/dev/null); then
    last_backup=$(tail -n 1 <<<"$borg_listing")
    if [ -n "$last_backup" ]; then
      pass "Borg repo exists, last: $last_backup"
    else
      warn "Borg repo exists but empty — run backup.sh"
    fi
  else
    # A repository that cannot be listed is not the same as an empty one.
    fail "Borg repo exists but cannot be listed — check passphrase, lock, or repo integrity"
  fi
else
  fail "No borg repo at /var/backups/brain/borg-repo"
fi
last_dump=$(ls -t /var/backups/brain/pg-dumps/knowledge_base_*.dump 2>/dev/null | head -1)
if [ -n "$last_dump" ]; then
  # Age in whole days, from the dump file's modification time.
  dump_age=$(( ($(date +%s) - $(stat -c %Y "$last_dump")) / 86400 ))
  if [ "$dump_age" -le 1 ]; then
    pass "Latest pg dump: $(basename "$last_dump")"
  else
    warn "Latest pg dump is ${dump_age} days old: $(basename "$last_dump")"
  fi
else
  warn "No PostgreSQL dumps found"
fi
# --- Suspicious activity ---
section "Security Scan"
# Check for unexpected SUID binaries. The tree is scanned once and the same
# list is both counted and printed, so the two can never disagree.
suid_files=$(find /usr/local -perm -4000 -type f 2>/dev/null)
if [ -z "$suid_files" ]; then
  pass "No unexpected SUID binaries in /usr/local"
else
  suid_count=$(grep -c '' <<<"$suid_files")
  fail "$suid_count SUID binaries found in /usr/local — investigate"
  printf '%s\n' "$suid_files"
fi
# Check for unauthorized cron jobs. `-x` excludes only the exact name "root";
# a substring match would also hide any crontab owner whose name merely
# contains "root".
cron_users=$(ls /var/spool/cron/crontabs/ 2>/dev/null | grep -vx root)
if [ -z "$cron_users" ]; then
  pass "No non-root user crontabs"
else
  warn "Crontabs found for: $cron_users"
fi
# Check for recent failed SSH logins. Both unit names are queried because
# the OpenSSH service is called ssh.service on Debian/Ubuntu and sshd.service
# on other distributions. `|| true` keeps the "0" that grep -c prints when
# nothing matches.
failed_ssh=$(journalctl -u ssh -u sshd --since "1 hour ago" --no-pager 2>/dev/null | grep -c "Failed password" || true)
if [ "$failed_ssh" -gt 10 ]; then
  warn "$failed_ssh failed SSH logins in last hour"
elif [ "$failed_ssh" -gt 0 ]; then
  pass "$failed_ssh failed SSH logins in last hour (normal)"
else
  pass "No failed SSH logins in last hour"
fi
# --- Summary ---
# Exit status mirrors the result: 0 when no check failed, 1 otherwise.
printf '\n'
if ! [ "$ISSUES" -eq 0 ]; then
  printf '%b\n' "${RED}=== $ISSUES ISSUE(S) FOUND ===${NC}"
  exit 1
fi
printf '%b\n' "${GREEN}=== ALL CHECKS PASSED ===${NC}"
exit 0