llm-team-ui/server/recover.sh
root 2bb910b72c Add triage, backup, and disaster recovery system
- brain-backup: daily borg + pg_dump, 7d/4w/3m retention, cron at 3AM
- brain-triage: full system health check (services, ports, firewall,
  headers, kernel, app, DB, disk, backups, security scan)
- brain-recover: restore from backup (full/db/configs/app) + emergency
  lockdown mode that blocks all external access except LAN SSH

All accessible via /usr/local/bin/brain-{backup,triage,recover}

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-25 04:52:48 -05:00

287 lines
8.6 KiB
Bash
Executable File

#!/bin/bash
# =============================================================================
# brain server recovery — restore from backup or lock down after compromise
#
# Usage:
#   ./recover.sh status           Show available backups
#   ./recover.sh restore [DATE]   Full restore: configs + app + DB (latest or DATE)
#   ./recover.sh lockdown         Emergency lockdown — block all external access
#   ./recover.sh unlock           Undo lockdown — restore normal firewall rules
#   ./recover.sh db [DATE]        Restore only the database
#   ./recover.sh configs [DATE]   Restore only configs (nginx, ssh, fail2ban, etc.)
#   ./recover.sh app [DATE]       Restore only the app file + config
# =============================================================================
# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# --- Backup locations and state files ---
BORG_REPO="/var/backups/brain/borg-repo"             # borg repository
PG_DIR="/var/backups/brain/pg-dumps"                 # knowledge_base_*.dump files
LOCKDOWN_FLAG="/var/backups/brain/.lockdown-active"  # present while locked down
LOG="/var/log/brain-recovery.log"                    # appended to by log()

# borg reads the passphrase from the environment; this script always exports an
# empty one.
# NOTE(review): presumably the repo is unencrypted or uses an empty passphrase
# — confirm against the backup script.
BORG_PASSPHRASE=""
export BORG_PASSPHRASE

# --- Colours ---
# Kept as literal backslash sequences (single quotes); they are turned into
# real escape codes by `echo -e` at print time.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
# Print a timestamped message to stdout and append the same line to $LOG.
#   $1 - message text; backslash escapes in it (e.g. the colour variables)
#        are expanded by `echo -e`.
log() {
  local message="$1"
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  echo -e "[${stamp}] ${message}" | tee -a "$LOG"
}
# Print the command summary to stdout and terminate the script with status 1.
usage() {
  cat <<EOF
Usage: $0 {status|restore|lockdown|unlock|db|configs|app} [DATE]

Commands:
 status List available backups
 restore [DATE] Full restore (configs + DB + app)
 lockdown Emergency: block all external traffic
 unlock Undo lockdown, restore normal firewall
 db [DATE] Restore database only
 configs [DATE] Restore server configs only
 app [DATE] Restore app + config file only
EOF
  exit 1
}
# Print the name of the borg archive to restore from.
#   $1 - optional date fragment. When given, the LAST line of `borg list`
#        containing it (as a literal substring, anywhere on the line) is used;
#        when empty, the last line of the listing is used.
# Outputs: first whitespace-separated field of the chosen line, or nothing
#          when no line matches.
# Returns: 0 even when nothing matches, so that callers running under
#          `set -e -o pipefail` reach their own "no archive found" message
#          instead of dying inside the command substitution. A failing
#          `borg list` still propagates as a non-zero status via pipefail.
get_archive() {
  local date="${1:-}"
  # awk replaces grep|tail|awk: grep exits 1 on "no match", which pipefail
  # would turn into a fatal error for the caller. index() also keeps the
  # fragment from being interpreted as a regex or as an option.
  borg list "$BORG_REPO" \
    | awk -v want="$date" '
        want == "" || index($0, want) { name = $1 }
        END { if (name != "") print name }
      '
}
# --- STATUS ---
# Show what can be restored: the first 20 lines of `borg list`, the repo's
# on-disk size, the 10 most recent PostgreSQL dumps, and a warning banner when
# the lockdown flag file exists. Read-only; takes no arguments.
cmd_status() {
  # Kept local so the status command does not leak a global variable.
  local repo_size

  echo -e "${CYAN}=== Available Backups ===${NC}"

  echo -e "\n${CYAN}Borg archives:${NC}"
  # With the script's `pipefail`, a failing `borg list` selects the else branch.
  if borg list "$BORG_REPO" 2>/dev/null | head -20; then
    repo_size=$(du -sh "$BORG_REPO" | cut -f1)
    echo -e " Repo size: $repo_size"
  else
    echo " No borg archives found"
  fi

  echo -e "\n${CYAN}PostgreSQL dumps:${NC}"
  # ls is used for human-readable display only; it fails when the glob
  # matches nothing, which (again via pipefail) triggers the message.
  if ! ls -lht "$PG_DIR"/knowledge_base_*.dump 2>/dev/null | head -10; then
    echo " No dumps found"
  fi

  if [ -f "$LOCKDOWN_FLAG" ]; then
    echo -e "\n${RED}*** LOCKDOWN IS ACTIVE ***${NC}"
    echo " Run '$0 unlock' to restore normal access"
  fi
}
# --- LOCKDOWN ---
# Emergency lockdown: replace the ufw ruleset with deny-by-default in both
# directions, stop nginx, create the lockdown flag, and record evidence
# (current TCP connections and a copy of the auth log).
#
# Rules installed by this function:
#   in : port 22 from 192.168.1.0/24 only
#   out: port 53, 80/tcp and 443/tcp
#
# Snapshots are written to the directory that holds $LOCKDOWN_FLAG
# (/var/backups/brain with the default value).
cmd_lockdown() {
  local backup_dir
  backup_dir=$(dirname "$LOCKDOWN_FLAG")

  log "${RED}=== EMERGENCY LOCKDOWN ===${NC}"
  log "Blocking all external access except SSH from LAN..."

  # Save current rules — but only when lockdown is not already active.
  # Re-running lockdown would otherwise overwrite the genuine pre-lockdown
  # snapshot with the locked-down ruleset. Best-effort: failure is ignored.
  if [ ! -f "$LOCKDOWN_FLAG" ]; then
    ufw status verbose > "$backup_dir/ufw-pre-lockdown.txt" 2>/dev/null || true
  fi

  # Reset and lock down
  ufw --force reset >/dev/null 2>&1
  ufw default deny incoming
  ufw default deny outgoing
  # Only allow SSH from LAN
  ufw allow from 192.168.1.0/24 to any port 22
  # Allow DNS out (needed for recovery)
  ufw allow out 53
  # Allow apt out (needed for fixes)
  ufw allow out 80/tcp
  ufw allow out 443/tcp
  ufw --force enable >/dev/null 2>&1

  # Stop public-facing services (ignored if nginx is absent or already down)
  systemctl stop nginx 2>/dev/null || true

  touch "$LOCKDOWN_FLAG"
  log "${RED}LOCKDOWN ACTIVE — only LAN SSH allowed${NC}"
  log "Services stopped: nginx"
  log "Run '$0 unlock' when safe"

  # Log who's connected right now
  log "Current connections:"
  ss -tnp | tee -a "$LOG"

  # Snapshot auth log (best-effort; the file may not exist on this system)
  log "Saving auth log snapshot..."
  cp /var/log/auth.log "$backup_dir/auth-lockdown-$(date +%Y%m%d_%H%M%S).log" 2>/dev/null || true
}
# --- UNLOCK ---
# Leave lockdown: rebuild the normal ufw ruleset, bring services back and
# remove the lockdown flag. Exits 0 straight away when no lockdown is active.
#
# The ruleset is hard-coded in this function; nothing is read back from a
# saved snapshot.
# NOTE(review): if a service fails to (re)start, `set -e` stops the script
# after the firewall has been reopened but before the flag is removed.
cmd_unlock() {
  local lan="192.168.1.0/24"
  local svc

  if ! [ -f "$LOCKDOWN_FLAG" ]; then
    echo "Lockdown is not active."
    exit 0
  fi

  log "${GREEN}=== RESTORING NORMAL ACCESS ===${NC}"

  # --- Firewall: start from a clean slate ---
  ufw --force reset >/dev/null 2>&1
  ufw default deny incoming
  ufw default allow outgoing

  # Open to everyone
  ufw allow 22/tcp
  ufw allow 80/tcp comment "HTTP web server"
  ufw allow 443/tcp comment "HTTPS web server"
  ufw allow 3030/tcp

  # Reachable from the LAN only
  ufw allow from "$lan" to any port 139,445 proto tcp
  ufw allow from "$lan" to any port 137,138 proto udp
  ufw allow from "$lan" to any port 5000 comment "LLM Team UI"
  ufw allow from "$lan" to any port 9000 comment "MinIO LAN only"
  ufw deny 9000 comment "Block MinIO external"
  ufw allow from "$lan" to any port 11434 comment "Ollama internal"
  ufw allow from "$lan" to any port 18789 comment "OpenClaw brain"

  ufw --force enable >/dev/null 2>&1

  # --- Services ---
  systemctl start nginx
  for svc in llm-team-ui fail2ban; do
    systemctl restart "$svc"
  done

  rm -f "$LOCKDOWN_FLAG"
  log "${GREEN}Normal access restored. All services restarted.${NC}"
}
# --- RESTORE DB ---
# Restore the knowledge_base database from a pg_dump file in $PG_DIR.
#   $1 - optional date prefix; selects knowledge_base_<DATE>*.dump. When empty,
#        all knowledge_base_*.dump files are candidates. The most recently
#        modified candidate is used.
# Asks for confirmation on stdin, then stops llm-team-ui, drops and recreates
# the database (owner kbuser), runs pg_restore, and starts llm-team-ui again.
# Exits: 1 when no dump matches, 0 when the user declines, and with the
#        failing command's status when drop/create/restore fails.
cmd_db() {
  local date="${1:-}"
  local dump="" candidate confirm
  local rc=0

  # Pick the newest matching dump with a glob + -nt comparison. The previous
  # `ls | head` pipeline failed under pipefail when nothing matched, which
  # killed the script before the message below could be shown. Quoting
  # ${date} keeps it from being word-split or globbed.
  for candidate in "$PG_DIR"/knowledge_base_"${date}"*.dump; do
    [ -f "$candidate" ] || continue # unmatched glob stays literal
    if [ -z "$dump" ] || [ "$candidate" -nt "$dump" ]; then
      dump=$candidate
    fi
  done

  if [ -z "$dump" ]; then
    echo -e "${RED}No matching database dump found${NC}"
    exit 1
  fi

  log "Restoring database from: $(basename "$dump")"
  echo -e "${YELLOW}This will DROP and recreate knowledge_base. Continue? [y/N]${NC}"
  read -r confirm
  [ "$confirm" = "y" ] || exit 0

  # Stop app to release connections
  systemctl stop llm-team-ui 2>/dev/null || true

  # Run the three steps as one guarded chain so that a failure (pg_restore
  # also exits non-zero on errors it merely reports) does not abort the
  # script with the application still stopped. The pipeline status relies on
  # the script-level `set -o pipefail`.
  {
    sudo -u postgres dropdb --if-exists knowledge_base &&
      sudo -u postgres createdb -O kbuser knowledge_base &&
      sudo -u postgres pg_restore -d knowledge_base "$dump" 2>&1 | tee -a "$LOG"
  } || rc=$?

  # Always bring the application back, whatever happened above.
  systemctl start llm-team-ui

  if [ "$rc" -ne 0 ]; then
    log "${RED}Database restore from $(basename "$dump") failed (exit $rc) — check the output above${NC}"
    exit "$rc"
  fi
  log "${GREEN}Database restored from $(basename "$dump")${NC}"
}
# --- RESTORE CONFIGS ---
# Restore server configuration files from a borg archive and reload the
# affected services.
#   $1 - optional date fragment passed to get_archive (empty = latest archive)
# Asks for confirmation on stdin. The archive is extracted into a private
# temp directory, from which nginx, fail2ban, sshd, sysctl and systemd unit
# files are copied over the live ones. Files missing from the archive are
# skipped silently.
# Exits: 1 when no archive matches, 0 when the user declines, and with the
#        failing status when extraction fails.
cmd_configs() {
  local date="${1:-}"
  local archive confirm tmpdir
  local rc=0

  archive=$(get_archive "$date")
  if [ -z "$archive" ]; then
    echo -e "${RED}No matching borg archive found${NC}"
    exit 1
  fi

  log "Restoring configs from archive: $archive"
  echo -e "${YELLOW}This will overwrite current server configs. Continue? [y/N]${NC}"
  read -r confirm
  [ "$confirm" = "y" ] || exit 0

  tmpdir=$(mktemp -d)

  # Work inside a subshell: borg extracts into the current directory, and the
  # subshell keeps that `cd` from leaking into the rest of the script (the
  # directory is deleted below). `set -e` is inactive on the left of `||`,
  # hence the explicit `|| exit` on the steps that must succeed.
  (
    cd "$tmpdir" || exit
    borg extract "$BORG_REPO::${archive}" || exit

    # Restore each config (best-effort: a file absent from the archive is skipped)
    cp -v etc/nginx/sites-available/* /etc/nginx/sites-available/ 2>/dev/null || true
    cp -v etc/nginx/nginx.conf /etc/nginx/nginx.conf 2>/dev/null || true
    cp -v etc/fail2ban/jail.local /etc/fail2ban/jail.local 2>/dev/null || true
    cp -v etc/ssh/sshd_config /etc/ssh/sshd_config 2>/dev/null || true
    cp -v etc/sysctl.d/99-security.conf /etc/sysctl.d/99-security.conf 2>/dev/null || true
    cp -v etc/systemd/system/llm-team-ui.service /etc/systemd/system/ 2>/dev/null || true
    cp -v etc/systemd/system/goaccess.service /etc/systemd/system/ 2>/dev/null || true
  ) || rc=$?

  # Remove the extracted copy on every path, including failure, before any
  # service command gets a chance to abort the script.
  rm -rf -- "${tmpdir:?}"

  if [ "$rc" -ne 0 ]; then
    log "${RED}Could not extract $archive (exit $rc) — no configs were changed${NC}"
    exit "$rc"
  fi

  # Reload everything; nginx and sshd are reloaded only if their config test passes
  nginx -t && systemctl reload nginx
  sshd -t && systemctl reload sshd
  systemctl restart fail2ban
  sysctl --system >/dev/null 2>&1
  systemctl daemon-reload

  log "${GREEN}Configs restored from $archive and services reloaded${NC}"
}
# --- RESTORE APP ---
# Restore the application file and its config from a borg archive, then
# restart the llm-team-ui service.
#   $1 - optional date fragment passed to get_archive (empty = latest archive)
# Copies root/llm_team_ui.py and root/llm_team_config.json (both required) and
# home/profit/.env (optional) from the archive to their absolute locations.
# Exits: 1 when no archive matches, or with the failing status when extraction
#        or a required copy fails (the service is then NOT restarted).
# NOTE(review): unlike the db and configs commands this one does not ask for
# confirmation — confirm that is intended.
cmd_app() {
  local date="${1:-}"
  local archive tmpdir
  local rc=0

  archive=$(get_archive "$date")
  if [ -z "$archive" ]; then
    echo -e "${RED}No matching borg archive found${NC}"
    exit 1
  fi

  log "Restoring app from archive: $archive"

  tmpdir=$(mktemp -d)

  # Extract and copy inside a subshell so the `cd` does not change the
  # script's own working directory (which is deleted right after). `set -e`
  # is inactive on the left of `||`, so required steps use `|| exit`.
  (
    cd "$tmpdir" || exit
    borg extract "$BORG_REPO::${archive}" || exit
    cp -v root/llm_team_ui.py /root/llm_team_ui.py || exit
    cp -v root/llm_team_config.json /root/llm_team_config.json || exit
    # The .env file is optional
    cp -v home/profit/.env /home/profit/.env 2>/dev/null || true
  ) || rc=$?

  # Clean up on success and on failure alike.
  rm -rf -- "${tmpdir:?}"

  if [ "$rc" -ne 0 ]; then
    log "${RED}App restore from $archive failed (exit $rc) — service not restarted${NC}"
    exit "$rc"
  fi

  systemctl restart llm-team-ui
  log "${GREEN}App restored from $archive and restarted${NC}"
}
# --- FULL RESTORE ---
# Full restore: after one confirmation, run the configs, app and database
# restores in that order, all for the same date.
#   $1 - optional date, handed unchanged to each sub-command (empty = latest)
# Exits 0 without doing anything unless the answer is exactly "y".
cmd_restore() {
  local date="${1:-}"
  local step

  log "${CYAN}=== FULL RESTORE ===${NC}"
  echo -e "${YELLOW}This will restore configs, app, and database. Continue? [y/N]${NC}"
  read -r confirm
  if [ "$confirm" != "y" ]; then
    exit 0
  fi

  # Order: configs, then app, then database.
  for step in cmd_configs cmd_app cmd_db; do
    "$step" "$date"
  done

  log "${GREEN}=== FULL RESTORE COMPLETE ===${NC}"
  log "Run ./triage.sh to verify system health"
}
# --- Main ---
# Entry point: dispatch the first argument to its cmd_* handler. The optional
# second argument is the DATE for the restore-type commands. A missing or
# unknown command prints the usage text (which exits 1).
main() {
  if [ $# -lt 1 ]; then
    usage
  fi

  local action="$1"
  local when="${2:-}"

  case "$action" in
    status)
      cmd_status
      ;;
    restore)
      cmd_restore "$when"
      ;;
    lockdown)
      cmd_lockdown
      ;;
    unlock)
      cmd_unlock
      ;;
    db)
      cmd_db "$when"
      ;;
    configs)
      cmd_configs "$when"
      ;;
    app)
      cmd_app "$when"
      ;;
    *)
      usage
      ;;
  esac
}

main "$@"