#!/usr/bin/env python3
"""
Health Manager
==============
Production health check infrastructure for monitoring system dependencies.

Monitors:
- HashiCorp Vault (authentication, policies)
- DragonflyDB (state management)
- SQLite Ledger (audit trail)
- Agent processes

Part of Phase 8: Production Hardening.
"""

import json
import logging
import sqlite3
import subprocess
import time
from contextlib import closing
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Optional, Callable

try:
    import redis
except ImportError:
    # The Vault and ledger checks do not need the client library, so keep the
    # module importable and let the DragonflyDB check report the problem.
    redis = None

logger = logging.getLogger(__name__)

# Icon shown by the CLI for each HealthStatus value.
_STATUS_ICONS = {"healthy": "✅", "degraded": "⚠️", "unhealthy": "❌"}


def _elapsed_ms(start: float) -> float:
    """Return the milliseconds elapsed since *start* (a perf_counter value)."""
    return (time.perf_counter() - start) * 1000


class HealthStatus(str, Enum):
    """Health check status"""
    HEALTHY = "healthy"
    DEGRADED = "degraded"
    UNHEALTHY = "unhealthy"
    UNKNOWN = "unknown"


class DependencyType(str, Enum):
    """Types of system dependencies"""
    VAULT = "vault"
    DRAGONFLY = "dragonfly"
    LEDGER = "ledger"
    AGENT = "agent"
    NETWORK = "network"


@dataclass
class HealthCheckResult:
    """Result of a health check"""
    # Which dependency was probed.
    dependency: DependencyType
    # Outcome of the probe.
    status: HealthStatus
    # Time the probe took, in milliseconds.
    latency_ms: float
    # Human-readable summary of the outcome.
    message: str
    # ISO-8601 UTC timestamp; filled in automatically when left empty.
    checked_at: str = ""
    # Extra, check-specific information (must be JSON-serializable).
    details: dict = field(default_factory=dict)

    def __post_init__(self):
        if not self.checked_at:
            self.checked_at = datetime.now(timezone.utc).isoformat()

    def to_dict(self) -> dict:
        """Return a JSON-serializable representation of this result."""
        return {
            "dependency": self.dependency.value,
            "status": self.status.value,
            "latency_ms": self.latency_ms,
            "message": self.message,
            "checked_at": self.checked_at,
            "details": self.details,
        }


@dataclass
class SystemHealth:
    """Overall system health status"""
    # Aggregate status derived from the individual checks.
    status: HealthStatus
    # The individual results, in the order the checks were run.
    checks: list[HealthCheckResult]
    healthy_count: int
    degraded_count: int
    unhealthy_count: int
    # ISO-8601 UTC timestamp; filled in automatically when left empty.
    checked_at: str = ""

    def __post_init__(self):
        if not self.checked_at:
            self.checked_at = datetime.now(timezone.utc).isoformat()

    def to_dict(self) -> dict:
        """Return a JSON-serializable representation of this snapshot."""
        return {
            "status": self.status.value,
            "healthy": self.healthy_count,
            "degraded": self.degraded_count,
            "unhealthy": self.unhealthy_count,
            "checked_at": self.checked_at,
            "checks": [c.to_dict() for c in self.checks],
        }


class HealthManager:
    """
    Manages health checks for all system dependencies.

    Features:
    - Individual dependency health checks
    - Aggregate system health
    - Configurable thresholds
    - Health history tracking
    """

    # Default configuration
    VAULT_ADDR = "https://127.0.0.1:8200"
    VAULT_TOKEN_FILE = "/opt/vault/init-keys.json"
    REDIS_HOST = "127.0.0.1"
    REDIS_PORT = 6379
    LEDGER_PATH = "/opt/agent-governance/ledger/governance.db"

    # Thresholds (ms)
    LATENCY_WARN_THRESHOLD = 500
    LATENCY_CRITICAL_THRESHOLD = 2000
    CHECK_TIMEOUT = 5

    # Tables the ledger check expects to find.
    REQUIRED_LEDGER_TABLES = (
        "agent_metrics",
        "violations",
        "promotions",
        "orchestration_log",
    )

    # Number of snapshots kept in memory and in the "health:history" list.
    HISTORY_LIMIT = 100

    def __init__(self, base_path: str = "/opt/agent-governance"):
        self.base_path = Path(base_path)
        self._vault_token: Optional[str] = None
        self._redis: "Optional[redis.Redis]" = None
        self._history: list[SystemHealth] = []

    def _now(self) -> str:
        """Return the current UTC time as an ISO-8601 string."""
        return datetime.now(timezone.utc).isoformat()

    def _latency_status(self, latency_ms: float) -> HealthStatus:
        """Map a latency of an otherwise successful check to a status.

        Anything above LATENCY_WARN_THRESHOLD is DEGRADED. The original code
        also tested LATENCY_CRITICAL_THRESHOLD but assigned DEGRADED in that
        branch too, so this mapping is kept as it was.
        NOTE(review): confirm whether exceeding the critical threshold was
        meant to yield UNHEALTHY.
        """
        if latency_ms > self.LATENCY_WARN_THRESHOLD:
            return HealthStatus.DEGRADED
        return HealthStatus.HEALTHY

    def _get_vault_token(self) -> str:
        """Get Vault root token.

        Reads the ``root_token`` field of the JSON file VAULT_TOKEN_FILE and
        caches it on the instance.

        Returns:
            The token, or "" when the file cannot be read or parsed or does
            not contain a non-empty string token.
        """
        if self._vault_token:
            return self._vault_token
        try:
            with open(self.VAULT_TOKEN_FILE, encoding="utf-8") as f:
                token = json.load(f)["root_token"]
        except (OSError, ValueError, KeyError, TypeError):
            return ""
        if not isinstance(token, str) or not token:
            return ""
        self._vault_token = token
        return token

    def _vault_request(self, path: str, token: str) -> dict:
        """GET ``VAULT_ADDR + path`` through curl and return the decoded JSON.

        Raises:
            subprocess.TimeoutExpired: curl ran longer than CHECK_TIMEOUT.
            RuntimeError: curl exited with a non-zero status.
            ValueError: the response body is not valid JSON.
        """
        # NOTE(review): the token is passed on curl's command line, where it
        # is visible in the process list, and -k turns off TLS certificate
        # verification. Both are kept as in the original implementation.
        result = subprocess.run(
            [
                "curl", "-sk",
                "-H", f"X-Vault-Token: {token}",
                f"{self.VAULT_ADDR}{path}",
            ],
            capture_output=True,
            text=True,
            timeout=self.CHECK_TIMEOUT,
        )
        if result.returncode != 0:
            # Without this, an unreachable server surfaces as a JSON decode
            # error on an empty body, which hides the real cause.
            raise RuntimeError(f"curl exited with status {result.returncode}")
        return json.loads(result.stdout)

    def _get_redis(self) -> "Optional[redis.Redis]":
        """Get Redis/DragonflyDB client.

        Connection details are fetched from Vault on first use and the client
        is cached on the instance.

        Returns:
            The client, or None when the redis library is missing, the Vault
            token is unavailable, or the credentials cannot be fetched.
        """
        if self._redis is not None:
            return self._redis
        if redis is None:
            return None

        token = self._get_vault_token()
        if not token:
            # Nothing to authenticate with, so do not shell out to curl.
            return None

        try:
            # Get credentials from Vault
            secret = self._vault_request(
                "/v1/secret/data/services/dragonfly", token
            )
            creds = secret["data"]["data"]
            self._redis = redis.Redis(
                host=creds["host"],
                port=int(creds["port"]),
                password=creds["password"],
                decode_responses=True,
                socket_timeout=self.CHECK_TIMEOUT,
            )
        except Exception as exc:
            # Log the exception type only: the text of a TimeoutExpired
            # contains the curl command line and therefore the Vault token.
            logger.debug(
                "Could not obtain DragonflyDB credentials from Vault (%s)",
                type(exc).__name__,
            )
            return None
        return self._redis

    def check_vault(self) -> HealthCheckResult:
        """Check Vault health.

        Returns UNHEALTHY when the token cannot be read, the request fails or
        times out, or the response reports the server as sealed or not
        initialized. Otherwise the status depends on the request latency.
        """
        start = time.perf_counter()
        try:
            token = self._get_vault_token()
            if not token:
                return HealthCheckResult(
                    dependency=DependencyType.VAULT,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=0,
                    message="Cannot read Vault token",
                )

            data = self._vault_request("/v1/sys/health", token)
            latency = _elapsed_ms(start)

            if data.get("sealed"):
                return HealthCheckResult(
                    dependency=DependencyType.VAULT,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=latency,
                    message="Vault is sealed",
                    details=data,
                )

            if not data.get("initialized"):
                return HealthCheckResult(
                    dependency=DependencyType.VAULT,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=latency,
                    message="Vault is not initialized",
                    details=data,
                )

            return HealthCheckResult(
                dependency=DependencyType.VAULT,
                status=self._latency_status(latency),
                latency_ms=latency,
                message="Vault is healthy",
                details={"initialized": True, "sealed": False},
            )
        except subprocess.TimeoutExpired:
            # Handled separately with a fixed message: str() of this
            # exception would include the curl arguments (and the token).
            return HealthCheckResult(
                dependency=DependencyType.VAULT,
                status=HealthStatus.UNHEALTHY,
                latency_ms=self.CHECK_TIMEOUT * 1000,
                message="Vault health check timed out",
            )
        except Exception as e:
            return HealthCheckResult(
                dependency=DependencyType.VAULT,
                status=HealthStatus.UNHEALTHY,
                latency_ms=_elapsed_ms(start),
                message=f"Vault check failed: {str(e)}",
            )

    def check_dragonfly(self) -> HealthCheckResult:
        """Check DragonflyDB health.

        Pings the server and reads its "server" info section. The reported
        latency covers obtaining the client and the ping.
        """
        if redis is None:
            return HealthCheckResult(
                dependency=DependencyType.DRAGONFLY,
                status=HealthStatus.UNHEALTHY,
                latency_ms=0,
                message="redis client library is not installed",
            )

        start = time.perf_counter()
        try:
            client = self._get_redis()
            if client is None:
                return HealthCheckResult(
                    dependency=DependencyType.DRAGONFLY,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=0,
                    message="Cannot connect to DragonflyDB",
                )

            # Ping test
            client.ping()
            latency = _elapsed_ms(start)

            # Get info
            info = client.info("server")

            return HealthCheckResult(
                dependency=DependencyType.DRAGONFLY,
                status=self._latency_status(latency),
                latency_ms=latency,
                message="DragonflyDB is healthy",
                details={
                    "version": info.get("redis_version", "unknown"),
                    "uptime_seconds": info.get("uptime_in_seconds", 0),
                },
            )
        except redis.ConnectionError:
            return HealthCheckResult(
                dependency=DependencyType.DRAGONFLY,
                status=HealthStatus.UNHEALTHY,
                latency_ms=_elapsed_ms(start),
                message="DragonflyDB connection refused",
            )
        except Exception as e:
            return HealthCheckResult(
                dependency=DependencyType.DRAGONFLY,
                status=HealthStatus.UNHEALTHY,
                latency_ms=_elapsed_ms(start),
                message=f"DragonflyDB check failed: {str(e)}",
            )

    def check_ledger(self) -> HealthCheckResult:
        """Check SQLite Ledger health.

        UNHEALTHY when the database file is missing or cannot be queried,
        DEGRADED when a required table is missing or the query was slow,
        HEALTHY otherwise.
        """
        start = time.perf_counter()
        try:
            # sqlite3.connect() would create a missing file, so test first.
            if not Path(self.LEDGER_PATH).exists():
                return HealthCheckResult(
                    dependency=DependencyType.LEDGER,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=0,
                    message="Ledger database file not found",
                )

            # closing() guarantees the connection is released even when the
            # query raises; a sqlite3 connection used as a context manager
            # only manages the transaction and does not close.
            with closing(
                sqlite3.connect(self.LEDGER_PATH, timeout=self.CHECK_TIMEOUT)
            ) as conn:
                # Check tables exist
                rows = conn.execute(
                    "SELECT name FROM sqlite_master WHERE type='table'"
                ).fetchall()

            tables = [row[0] for row in rows]
            present = set(tables)
            missing = [
                t for t in self.REQUIRED_LEDGER_TABLES if t not in present
            ]
            latency = _elapsed_ms(start)

            if missing:
                return HealthCheckResult(
                    dependency=DependencyType.LEDGER,
                    status=HealthStatus.DEGRADED,
                    latency_ms=latency,
                    message=f"Missing tables: {', '.join(missing)}",
                    details={"tables": tables, "missing": missing},
                )

            return HealthCheckResult(
                dependency=DependencyType.LEDGER,
                status=self._latency_status(latency),
                latency_ms=latency,
                message="Ledger is healthy",
                details={"tables": tables},
            )
        except sqlite3.OperationalError as e:
            return HealthCheckResult(
                dependency=DependencyType.LEDGER,
                status=HealthStatus.UNHEALTHY,
                latency_ms=_elapsed_ms(start),
                message=f"Ledger error: {str(e)}",
            )
        except Exception as e:
            return HealthCheckResult(
                dependency=DependencyType.LEDGER,
                status=HealthStatus.UNHEALTHY,
                latency_ms=_elapsed_ms(start),
                message=f"Ledger check failed: {str(e)}",
            )

    def check_all(self) -> SystemHealth:
        """Run all health checks and return aggregate status.

        The snapshot is appended to the in-memory history (capped at
        HISTORY_LIMIT entries) and, on a best-effort basis, written to
        DragonflyDB under "health:latest" and "health:history".
        """
        checks = [
            self.check_vault(),
            self.check_dragonfly(),
            self.check_ledger(),
        ]

        healthy = sum(1 for c in checks if c.status == HealthStatus.HEALTHY)
        degraded = sum(1 for c in checks if c.status == HealthStatus.DEGRADED)
        unhealthy = sum(1 for c in checks if c.status == HealthStatus.UNHEALTHY)

        # Determine overall status: the worst individual status wins.
        if unhealthy > 0:
            overall = HealthStatus.UNHEALTHY
        elif degraded > 0:
            overall = HealthStatus.DEGRADED
        else:
            overall = HealthStatus.HEALTHY

        health = SystemHealth(
            status=overall,
            checks=checks,
            healthy_count=healthy,
            degraded_count=degraded,
            unhealthy_count=unhealthy,
        )

        # Track history
        self._history.append(health)
        if len(self._history) > self.HISTORY_LIMIT:
            self._history = self._history[-self.HISTORY_LIMIT:]

        # Store in DragonflyDB. This stays best-effort (a failure must not
        # fail the health check itself) but is no longer silent.
        try:
            client = self._get_redis()
            if client is not None:
                payload = json.dumps(health.to_dict())
                client.set("health:latest", payload)
                client.lpush("health:history", payload)
                client.ltrim("health:history", 0, self.HISTORY_LIMIT - 1)
        except Exception as exc:
            logger.warning(
                "Could not store health snapshot in DragonflyDB: %s", exc
            )

        return health

    def get_status(self) -> dict:
        """Get current health status as dict (runs all checks)."""
        return self.check_all().to_dict()

    def is_healthy(self) -> bool:
        """Quick check if system is healthy (runs all checks)."""
        return self.check_all().status == HealthStatus.HEALTHY


# =============================================================================
# CLI Interface
# =============================================================================

def main():
    """Command-line entry point: ``check``, ``status`` or ``watch``."""
    import argparse

    parser = argparse.ArgumentParser(description="Health Manager CLI")
    parser.add_argument("command", choices=["check", "status", "watch"],
                        help="Command to run")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--interval", type=int, default=5,
                        help="Watch interval in seconds")

    args = parser.parse_args()

    # time.sleep() raises ValueError for negative values; reject them here
    # instead of failing after the first round of checks.
    if args.command == "watch" and args.interval < 0:
        parser.error("--interval must not be negative")

    manager = HealthManager()

    if args.command == "check":
        health = manager.check_all()

        if args.json:
            print(json.dumps(health.to_dict(), indent=2))
        else:
            print(f"\nSystem Health: {health.status.value.upper()}")
            print(f"Healthy: {health.healthy_count} | Degraded: {health.degraded_count} | Unhealthy: {health.unhealthy_count}")
            print()

            for check in health.checks:
                icon = _STATUS_ICONS.get(check.status.value, "?")
                print(f"  {icon} {check.dependency.value}: {check.message} ({check.latency_ms:.1f}ms)")

    elif args.command == "status":
        health = manager.check_all()
        print(health.status.value.upper())

    elif args.command == "watch":
        print("Watching health (Ctrl+C to stop)...")
        try:
            while True:
                health = manager.check_all()
                timestamp = datetime.now().strftime("%H:%M:%S")
                status_icon = _STATUS_ICONS.get(health.status.value, "?")
                # Look latencies up by dependency rather than list position.
                latency = {c.dependency: c.latency_ms for c in health.checks}
                print(f"[{timestamp}] {status_icon} {health.status.value} - V:{latency[DependencyType.VAULT]:.0f}ms D:{latency[DependencyType.DRAGONFLY]:.0f}ms L:{latency[DependencyType.LEDGER]:.0f}ms")
                time.sleep(args.interval)
        except KeyboardInterrupt:
            print("\nStopped.")


if __name__ == "__main__":
    main()