diff --git a/llm_team_ui.py b/llm_team_ui.py index 6cefebc..8d46e2a 100644 --- a/llm_team_ui.py +++ b/llm_team_ui.py @@ -7635,7 +7635,10 @@ def admin_security_data(): sort_by = request.args.get("sort", "hits") result = [] for ip, d in ips.items(): - if ip.startswith("192.168."): + # 2026-04-30: was a startswith("192.168.") prefix check — replaced with the + # canonical allowlist so 10.x, IPv6 ::1, and operator-added + # entries also stay out of the threat panel. + if is_allowlisted(ip): continue result.append({ "ip": ip, "hits": d["hits"], "exploit_scans": d["exploit_scans"], @@ -7685,19 +7688,41 @@ def _nginx_ban(ip): sec_log.info("NGINX_BAN_BLOCKED ip=%s — allowlisted, refused to write deny rule", ip) return import subprocess + line = f"deny {ip};\n" + # Each step has its own try/except so we know WHICH step failed. + # Pre-2026-04-30 a single blanket `except Exception: pass` swallowed + # every error, including PermissionError on the conf file write, and + # the exit status of `systemctl reload nginx` was never checked. Sentinel and + # auto-escalate logged "BAN" but the ban never actually landed in nginx. + # Now each failure mode hits sec_log so the operator sees why. 
try: - line = f"deny {ip};\n" try: with open(_NGINX_BAN_FILE) as f: if line in f.read(): return except FileNotFoundError: pass + except PermissionError as e: + sec_log.warning("NGINX_BAN_READ_DENIED file=%s err=%s — won't dedup, attempting append anyway", _NGINX_BAN_FILE, e) + try: with open(_NGINX_BAN_FILE, "a") as f: f.write(line) - subprocess.run(["systemctl", "reload", "nginx"], capture_output=True, timeout=5) - except Exception: - pass + except PermissionError as e: + sec_log.error("NGINX_BAN_WRITE_DENIED ip=%s file=%s err=%s — ban NOT effective at nginx layer", ip, _NGINX_BAN_FILE, e) + return + except Exception as e: + sec_log.error("NGINX_BAN_WRITE_ERROR ip=%s err=%s", ip, e) + return + try: + result = subprocess.run(["systemctl", "reload", "nginx"], capture_output=True, text=True, timeout=5) + if result.returncode != 0: + sec_log.error("NGINX_RELOAD_FAILED ip=%s rc=%d stderr=%s", ip, result.returncode, result.stderr.strip()) + except subprocess.TimeoutExpired: + sec_log.error("NGINX_RELOAD_TIMEOUT ip=%s — systemctl reload nginx didn't finish in 5s", ip) + except FileNotFoundError: + sec_log.error("NGINX_RELOAD_NO_SYSTEMCTL ip=%s — systemctl not in PATH for service user", ip) + except Exception as e: + sec_log.error("NGINX_RELOAD_ERROR ip=%s err=%s", ip, e) def _nginx_unban(ip): """Remove IP from nginx deny list and reload.""" @@ -7991,7 +8016,10 @@ def admin_mass_ban(): results = {"success": 0, "failed": 0, "skipped": 0} for ip in ip_list: ip = ip.strip() - if not ip or ip.startswith("192.168."): + # 2026-04-30: startswith("192.168.") prefix check → is_allowlisted so all + # trusted networks (LAN gateways, IPv6 loopback, custom + # entries) are skipped, not just one /16. 
+ if not ip or is_allowlisted(ip): results["skipped"] += 1 continue try: @@ -12713,7 +12741,17 @@ SENTINEL_MODEL = "qwen2.5:latest" SENTINEL_INTERVAL = 300 # 5 minutes _sentinel_last_pos = 0 _sentinel_results = [] # last 50 analyses -_sentinel_stats = {"scans": 0, "bans": 0, "last_run": None, "last_error": None, "next_scan_ts": 0} +_sentinel_stats = { + "scans": 0, "bans": 0, "last_run": None, "last_error": None, "next_scan_ts": 0, + # 2026-04-30 J: track consecutive AI-query failures so we can + # fire a callback (email alert) when Ollama is persistently busy + # or unreachable. Pre-fix, a model-busy state preserved the log + # position and skipped the scan with no operator notification. + "consecutive_ai_failures": 0, + "ai_busy_alerted": False, # one alert per outage; clears on first success +} +SENTINEL_AI_FAILURE_ALERT_THRESHOLD = 3 # consecutive failures before email +SENTINEL_AI_RETRY_DELAY_SECS = 30 # wait before retry inside same scan def _sentinel_log_entry(msg): """Write to sentinel log file.""" @@ -12761,7 +12799,10 @@ def _sentinel_scan(): if token.startswith("ip="): ip = token[3:] break - if ip and not ip.startswith("192.168."): + # 2026-04-30: was a startswith("192.168.") prefix check — sentinel now skips + # ALL allowlisted IPs from analysis (saves tokens and prevents + # the AI judge from getting confused by legitimate admin traffic). 
+ if ip and not is_allowlisted(ip): ip_activity[ip].append(line) if not ip_activity: @@ -12826,21 +12867,85 @@ def _sentinel_scan(): for ip, summary, _ in analysis_items[:15]: # max 15 IPs per scan prompt += summary + "\n" - # Query local AI - try: - cfg = load_config() - base = cfg["providers"]["ollama"].get("base_url", "http://localhost:11434") - resp = requests.post(f"{base}/api/generate", json={ - "model": SENTINEL_MODEL, "prompt": prompt, "stream": False, - "options": {"num_ctx": 4096, "temperature": 0.1} - }, timeout=60) - resp.raise_for_status() - ai_response = resp.json()["response"] - except Exception as e: - _sentinel_stats["last_error"] = f"AI query failed: {e}" - _sentinel_log_entry(f"AI_ERROR error={e}") + # Query local AI. 2026-04-30 J fix: retry once on model-busy / + # connection / timeout, and fire an operator callback when the + # AI is sustainedly unreachable. Pre-fix a single Ollama hiccup + # silently dropped the scan with no notification — operator only + # discovered the gap by checking sentinel-status manually. + cfg = load_config() + base = cfg["providers"]["ollama"].get("base_url", "http://localhost:11434") + body = { + "model": SENTINEL_MODEL, "prompt": prompt, "stream": False, + "options": {"num_ctx": 4096, "temperature": 0.1}, + } + ai_response = None + last_err = None + for attempt in range(2): # original try + 1 retry + try: + resp = requests.post(f"{base}/api/generate", json=body, timeout=60) + resp.raise_for_status() + ai_response = resp.json()["response"] + break + except (requests.exceptions.ConnectionError, + requests.exceptions.Timeout, + requests.exceptions.ReadTimeout) as e: + last_err = f"connection/timeout: {e}" + if attempt == 0: + _sentinel_log_entry(f"AI_BUSY_RETRY attempt=1 err={str(e)[:80]} sleeping={SENTINEL_AI_RETRY_DELAY_SECS}s") + time.sleep(SENTINEL_AI_RETRY_DELAY_SECS) + continue + except requests.exceptions.HTTPError as e: + # 503 Service Unavailable + 429 Too Many = busy; retry. 
+ # Other HTTP errors (404 model missing, 400 bad prompt) won't + # recover from a retry, so fail fast. + sc = getattr(e.response, "status_code", 0) + last_err = f"HTTP {sc}: {e}" + if sc in (429, 503) and attempt == 0: + _sentinel_log_entry(f"AI_BUSY_RETRY attempt=1 status={sc} sleeping={SENTINEL_AI_RETRY_DELAY_SECS}s") + time.sleep(SENTINEL_AI_RETRY_DELAY_SECS) + continue + break + except Exception as e: + last_err = f"unexpected: {e}" + break + + if ai_response is None: + _sentinel_stats["consecutive_ai_failures"] += 1 + _sentinel_stats["last_error"] = f"AI query failed: {last_err}" + _sentinel_log_entry( + f"AI_ERROR error={last_err} consecutive={_sentinel_stats['consecutive_ai_failures']}" + ) + # Operator callback: fire a security alert email when the AI + # has been down for ≥N consecutive scans. One alert per outage — + # cleared on next successful scan so a flapping AI doesn't + # spam the inbox. + if (_sentinel_stats["consecutive_ai_failures"] >= SENTINEL_AI_FAILURE_ALERT_THRESHOLD + and not _sentinel_stats["ai_busy_alerted"]): + _sentinel_stats["ai_busy_alerted"] = True + try: + send_security_alert( + f"Sentinel AI unreachable ({_sentinel_stats['consecutive_ai_failures']} consecutive failures)", + f"The sentinel auto-scanner has been unable to reach the LLM judge for " + f"{_sentinel_stats['consecutive_ai_failures']} consecutive scans.\n\n" + f"Last error: {last_err}\n" + f"Model: {SENTINEL_MODEL}\n" + f"Endpoint: {base}\n\n" + f"Threats are being logged and surfaced in the threat-intel UI but " + f"NOT auto-banned during this outage. Manual review recommended.", + ) + except Exception as alert_err: + sec_log.error("SENTINEL_ALERT_SEND_FAILED err=%s", alert_err) return + # AI succeeded. Reset the failure counter + clear the alerted flag + # so the next outage gets its own notification. 
+ if _sentinel_stats["consecutive_ai_failures"] > 0: + _sentinel_log_entry( + f"AI_RECOVERED after_failures={_sentinel_stats['consecutive_ai_failures']}" + ) + _sentinel_stats["consecutive_ai_failures"] = 0 + _sentinel_stats["ai_busy_alerted"] = False + # Parse AI response try: # Extract JSON from response (handle markdown code blocks) @@ -12866,9 +12971,41 @@ def _sentinel_scan(): ban_futures = [] def _execute_ban(ip, threat, reason, attack_type): - """Execute a single ban — fail2ban + nginx + kill connections.""" - subprocess.run(["fail2ban-client", "set", "llm-team-exploit", "banip", ip], - capture_output=True, text=True, timeout=5) + """Execute a single ban — fail2ban + nginx + kill connections. + + 2026-04-30 J fix: actually examine the fail2ban-client result. + Pre-fix capture_output=True was set but the result thrown away, + so a non-zero exit (jail not configured, IP already banned, IPv6 + format quirk) silently said "AI_BAN" in the log while the + attacker walked through unimpeded. Now logs returncode + stderr + on failure so the operator sees WHY the ban didn't stick.""" + try: + result = subprocess.run( + ["fail2ban-client", "set", "llm-team-exploit", "banip", ip], + capture_output=True, text=True, timeout=5, + ) + if result.returncode != 0: + sec_log.error( + "FAIL2BAN_BAN_FAILED ip=%s rc=%d stdout=%s stderr=%s", + ip, result.returncode, + result.stdout.strip()[:200], + result.stderr.strip()[:200], + ) + _sentinel_log_entry( + f"FAIL2BAN_FAILED ip={ip} rc={result.returncode} " + f"err={result.stderr.strip()[:120]}" + ) + # Continue anyway — nginx layer is independent and may + # still take effect. 
+ except subprocess.TimeoutExpired: + sec_log.error("FAIL2BAN_TIMEOUT ip=%s — client didn't return in 5s", ip) + _sentinel_log_entry(f"FAIL2BAN_TIMEOUT ip={ip}") + except FileNotFoundError: + sec_log.error("FAIL2BAN_NOT_INSTALLED ip=%s — fail2ban-client not in PATH", ip) + _sentinel_log_entry(f"FAIL2BAN_NOT_INSTALLED ip={ip}") + except Exception as e: + sec_log.error("FAIL2BAN_ERROR ip=%s err=%s", ip, e) + _sentinel_log_entry(f"FAIL2BAN_ERROR ip={ip} err={e}") _nginx_ban(ip) _kill_connections(ip) sec_log.warning("AI_BAN ip=%s threat=%s reason=%s attack=%s", ip, threat, reason, attack_type) @@ -12892,7 +13029,11 @@ def _sentinel_scan(): if len(_sentinel_results) > 50: _sentinel_results.pop(0) - if action == "ban" and ip and not ip.startswith("192.168."): + # 2026-04-30: was a startswith("192.168.") prefix check — replaced with + # canonical is_allowlisted so the sentinel's AI verdict + # can't accidentally ban any allowlisted IP that slipped + # past the analysis filter (defense in depth). + if action == "ban" and ip and not is_allowlisted(ip): ban_futures.append(executor.submit(_execute_ban, ip, threat, reason, attack_type)) else: _sentinel_log_entry(f"AI_VERDICT ip={ip} threat={threat} action={action} reason={reason} attack_type={attack_type}")