Phase 8 Production Hardening with complete governance infrastructure: - Vault integration with tiered policies (T0-T4) - DragonflyDB state management - SQLite audit ledger - Pipeline DSL and templates - Promotion/revocation engine - Checkpoint system for session persistence - Health manager and circuit breaker for fault tolerance - GitHub/Slack integrations - Architectural test pipeline with bug watcher, suggestion engine, council review - Multi-agent chaos testing framework Test Results: - Governance tests: 68/68 passing - E2E workflow: 16/16 passing - Phase 2 Vault: 14/14 passing - Integration tests: 27/27 passing Coverage: 57.6% average across 12 phases Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
449 lines
15 KiB
Python
449 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Health Manager
|
|
==============
|
|
Production health check infrastructure for monitoring system dependencies.
|
|
|
|
Monitors:
|
|
- HashiCorp Vault (authentication, policies)
|
|
- DragonflyDB (state management)
|
|
- SQLite Ledger (audit trail)
|
|
- Agent processes
|
|
|
|
Part of Phase 8: Production Hardening.
|
|
"""
|
|
|
|
import json
|
|
import sqlite3
|
|
import subprocess
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from enum import Enum
|
|
from pathlib import Path
|
|
from typing import Optional, Callable
|
|
import redis
|
|
|
|
|
|
class HealthStatus(str, Enum):
    """Health check status.

    Subclasses ``str`` so values serialize directly to JSON and compare
    equal to their plain-string form.
    """
    HEALTHY = "healthy"      # dependency fully operational
    DEGRADED = "degraded"    # operational but impaired (e.g. high latency, missing tables)
    UNHEALTHY = "unhealthy"  # dependency unreachable, sealed, or failing
    UNKNOWN = "unknown"      # status could not be determined
|
|
|
|
|
|
class DependencyType(str, Enum):
    """Types of system dependencies monitored by the health manager."""
    VAULT = "vault"          # HashiCorp Vault (authentication, policies)
    DRAGONFLY = "dragonfly"  # DragonflyDB (Redis-compatible state management)
    LEDGER = "ledger"        # SQLite audit ledger
    AGENT = "agent"          # agent processes
    NETWORK = "network"      # network connectivity
|
|
|
|
|
|
@dataclass
class HealthCheckResult:
    """Outcome of a single dependency health check."""
    dependency: DependencyType
    status: HealthStatus
    latency_ms: float
    message: str
    checked_at: str = ""
    details: dict = field(default_factory=dict)

    def __post_init__(self):
        # Stamp the check time lazily so callers may omit it.
        self.checked_at = self.checked_at or datetime.now(timezone.utc).isoformat()

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict (enum members flattened to strings)."""
        payload = dict(
            dependency=self.dependency.value,
            status=self.status.value,
            latency_ms=self.latency_ms,
            message=self.message,
            checked_at=self.checked_at,
        )
        payload["details"] = self.details
        return payload
|
|
|
|
|
|
@dataclass
class SystemHealth:
    """Aggregate health snapshot across all dependency checks."""
    status: HealthStatus
    checks: list[HealthCheckResult]
    healthy_count: int
    degraded_count: int
    unhealthy_count: int
    checked_at: str = ""

    def __post_init__(self):
        # Stamp the snapshot time lazily so callers may omit it.
        self.checked_at = self.checked_at or datetime.now(timezone.utc).isoformat()

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict, flattening nested check results."""
        snapshot = {
            "status": self.status.value,
            "healthy": self.healthy_count,
            "degraded": self.degraded_count,
            "unhealthy": self.unhealthy_count,
            "checked_at": self.checked_at,
        }
        snapshot["checks"] = [check.to_dict() for check in self.checks]
        return snapshot
|
|
|
|
|
|
class HealthManager:
    """
    Manages health checks for all system dependencies.

    Features:
    - Individual dependency health checks (Vault, DragonflyDB, SQLite ledger)
    - Aggregate system health with worst-status roll-up
    - Configurable latency thresholds
    - Health history tracking (in memory and mirrored to DragonflyDB)
    """

    # Default configuration
    VAULT_ADDR = "https://127.0.0.1:8200"
    VAULT_TOKEN_FILE = "/opt/vault/init-keys.json"
    REDIS_HOST = "127.0.0.1"
    REDIS_PORT = 6379
    LEDGER_PATH = "/opt/agent-governance/ledger/governance.db"

    # Thresholds (ms)
    LATENCY_WARN_THRESHOLD = 500
    LATENCY_CRITICAL_THRESHOLD = 2000
    CHECK_TIMEOUT = 5  # seconds, applied to every external call

    def __init__(self, base_path: str = "/opt/agent-governance"):
        self.base_path = Path(base_path)
        self._vault_token: Optional[str] = None
        self._redis: Optional[redis.Redis] = None
        self._history: list[SystemHealth] = []

    def _now(self) -> str:
        """Current UTC timestamp in ISO-8601 form."""
        return datetime.now(timezone.utc).isoformat()

    def _status_for_latency(self, latency_ms: float) -> HealthStatus:
        """Map a measured latency to a health status.

        Fix: the original code mapped both the warn and critical
        thresholds to DEGRADED, leaving the critical threshold with no
        effect. Latency above the critical threshold now reports
        UNHEALTHY, as the two-tier thresholds imply.
        """
        if latency_ms > self.LATENCY_CRITICAL_THRESHOLD:
            return HealthStatus.UNHEALTHY
        if latency_ms > self.LATENCY_WARN_THRESHOLD:
            return HealthStatus.DEGRADED
        return HealthStatus.HEALTHY

    def _get_vault_token(self) -> str:
        """Read and cache the Vault root token; return '' if unavailable."""
        if self._vault_token:
            return self._vault_token
        try:
            with open(self.VAULT_TOKEN_FILE) as f:
                self._vault_token = json.load(f)["root_token"]
            return self._vault_token
        except Exception:
            # Best-effort: callers treat '' as "token unavailable".
            return ""

    def _get_redis(self) -> Optional[redis.Redis]:
        """Build and cache a DragonflyDB client using credentials from Vault.

        Returns None when credentials cannot be fetched or parsed.
        """
        if self._redis:
            return self._redis
        try:
            # Get credentials from Vault (-k: local Vault uses a
            # self-signed cert — TODO confirm).
            token = self._get_vault_token()
            result = subprocess.run([
                "curl", "-sk",
                "-H", f"X-Vault-Token: {token}",
                f"{self.VAULT_ADDR}/v1/secret/data/services/dragonfly"
            ], capture_output=True, text=True, timeout=self.CHECK_TIMEOUT)

            creds = json.loads(result.stdout)["data"]["data"]
            self._redis = redis.Redis(
                host=creds["host"],
                port=int(creds["port"]),
                password=creds["password"],
                decode_responses=True,
                socket_timeout=self.CHECK_TIMEOUT
            )
            return self._redis
        except Exception:
            return None

    def check_vault(self) -> HealthCheckResult:
        """Check Vault health: reachable, initialized, and unsealed."""
        start = time.time()
        try:
            token = self._get_vault_token()
            if not token:
                return HealthCheckResult(
                    dependency=DependencyType.VAULT,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=0,
                    message="Cannot read Vault token"
                )

            result = subprocess.run([
                "curl", "-sk",
                "-H", f"X-Vault-Token: {token}",
                f"{self.VAULT_ADDR}/v1/sys/health"
            ], capture_output=True, text=True, timeout=self.CHECK_TIMEOUT)

            latency = (time.time() - start) * 1000
            data = json.loads(result.stdout)

            # A sealed Vault rejects all secret operations.
            if data.get("sealed"):
                return HealthCheckResult(
                    dependency=DependencyType.VAULT,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=latency,
                    message="Vault is sealed",
                    details=data
                )

            if not data.get("initialized"):
                return HealthCheckResult(
                    dependency=DependencyType.VAULT,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=latency,
                    message="Vault is not initialized",
                    details=data
                )

            return HealthCheckResult(
                dependency=DependencyType.VAULT,
                status=self._status_for_latency(latency),
                latency_ms=latency,
                message="Vault is healthy",
                details={"initialized": True, "sealed": False}
            )

        except subprocess.TimeoutExpired:
            return HealthCheckResult(
                dependency=DependencyType.VAULT,
                status=HealthStatus.UNHEALTHY,
                latency_ms=self.CHECK_TIMEOUT * 1000,
                message="Vault health check timed out"
            )
        except Exception as e:
            return HealthCheckResult(
                dependency=DependencyType.VAULT,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.time() - start) * 1000,
                message=f"Vault check failed: {str(e)}"
            )

    def check_dragonfly(self) -> HealthCheckResult:
        """Check DragonflyDB health via PING and server INFO."""
        start = time.time()
        try:
            r = self._get_redis()
            if not r:
                return HealthCheckResult(
                    dependency=DependencyType.DRAGONFLY,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=0,
                    message="Cannot connect to DragonflyDB"
                )

            # Ping test — latency is measured to this round-trip.
            r.ping()
            latency = (time.time() - start) * 1000

            # Get info (Dragonfly speaks the Redis protocol, hence the
            # redis_* keys).
            info = r.info("server")

            return HealthCheckResult(
                dependency=DependencyType.DRAGONFLY,
                status=self._status_for_latency(latency),
                latency_ms=latency,
                message="DragonflyDB is healthy",
                details={
                    "version": info.get("redis_version", "unknown"),
                    "uptime_seconds": info.get("uptime_in_seconds", 0)
                }
            )

        except redis.ConnectionError:
            return HealthCheckResult(
                dependency=DependencyType.DRAGONFLY,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.time() - start) * 1000,
                message="DragonflyDB connection refused"
            )
        except Exception as e:
            return HealthCheckResult(
                dependency=DependencyType.DRAGONFLY,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.time() - start) * 1000,
                message=f"DragonflyDB check failed: {str(e)}"
            )

    def check_ledger(self) -> HealthCheckResult:
        """Check SQLite ledger health: file exists and required tables present."""
        start = time.time()
        try:
            if not Path(self.LEDGER_PATH).exists():
                return HealthCheckResult(
                    dependency=DependencyType.LEDGER,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=0,
                    message="Ledger database file not found"
                )

            conn = sqlite3.connect(self.LEDGER_PATH, timeout=self.CHECK_TIMEOUT)
            try:
                cursor = conn.cursor()
                # Check tables exist
                cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
                tables = [row[0] for row in cursor.fetchall()]
            finally:
                # Fix: the original leaked the connection when a query
                # raised (the except handlers never closed it).
                conn.close()

            required_tables = ["agent_metrics", "violations", "promotions", "orchestration_log"]
            missing = [t for t in required_tables if t not in tables]
            latency = (time.time() - start) * 1000

            if missing:
                return HealthCheckResult(
                    dependency=DependencyType.LEDGER,
                    status=HealthStatus.DEGRADED,
                    latency_ms=latency,
                    message=f"Missing tables: {', '.join(missing)}",
                    details={"tables": tables, "missing": missing}
                )

            return HealthCheckResult(
                dependency=DependencyType.LEDGER,
                status=self._status_for_latency(latency),
                latency_ms=latency,
                message="Ledger is healthy",
                details={"tables": tables}
            )

        except sqlite3.OperationalError as e:
            return HealthCheckResult(
                dependency=DependencyType.LEDGER,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.time() - start) * 1000,
                message=f"Ledger error: {str(e)}"
            )
        except Exception as e:
            return HealthCheckResult(
                dependency=DependencyType.LEDGER,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.time() - start) * 1000,
                message=f"Ledger check failed: {str(e)}"
            )

    def check_all(self) -> SystemHealth:
        """Run all health checks and return the aggregate status.

        Also appends the snapshot to in-memory history (bounded to 100
        entries) and mirrors it to DragonflyDB on a best-effort basis.
        """
        checks = [
            self.check_vault(),
            self.check_dragonfly(),
            self.check_ledger()
        ]

        healthy = sum(1 for c in checks if c.status == HealthStatus.HEALTHY)
        degraded = sum(1 for c in checks if c.status == HealthStatus.DEGRADED)
        unhealthy = sum(1 for c in checks if c.status == HealthStatus.UNHEALTHY)

        # Determine overall status: worst individual status wins.
        if unhealthy > 0:
            overall = HealthStatus.UNHEALTHY
        elif degraded > 0:
            overall = HealthStatus.DEGRADED
        else:
            overall = HealthStatus.HEALTHY

        health = SystemHealth(
            status=overall,
            checks=checks,
            healthy_count=healthy,
            degraded_count=degraded,
            unhealthy_count=unhealthy
        )

        # Track history (bounded)
        self._history.append(health)
        if len(self._history) > 100:
            self._history = self._history[-100:]

        # Store in DragonflyDB. Best-effort: a health check must not
        # fail just because the state store is down.
        try:
            r = self._get_redis()
            if r:
                payload = json.dumps(health.to_dict())  # serialize once, reuse twice
                r.set("health:latest", payload)
                r.lpush("health:history", payload)
                r.ltrim("health:history", 0, 99)
        except Exception:
            pass

        return health

    def get_status(self) -> dict:
        """Get current health status as a dict (runs all checks)."""
        return self.check_all().to_dict()

    def is_healthy(self) -> bool:
        """Quick check: True only if every dependency is fully healthy."""
        return self.check_all().status == HealthStatus.HEALTHY
|
|
|
|
|
|
# =============================================================================
|
|
# CLI Interface
|
|
# =============================================================================
|
|
|
|
def main():
    """CLI entry point: one-shot check, terse status, or continuous watch."""
    import argparse

    parser = argparse.ArgumentParser(description="Health Manager CLI")
    parser.add_argument("command", choices=["check", "status", "watch"],
                        help="Command to run")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--interval", type=int, default=5,
                        help="Watch interval in seconds")

    args = parser.parse_args()
    mgr = HealthManager()
    icons = {"healthy": "✅", "degraded": "⚠️", "unhealthy": "❌"}

    if args.command == "check":
        health = mgr.check_all()
        if args.json:
            print(json.dumps(health.to_dict(), indent=2))
            return
        print(f"\nSystem Health: {health.status.value.upper()}")
        print(f"Healthy: {health.healthy_count} | Degraded: {health.degraded_count} | Unhealthy: {health.unhealthy_count}")
        print()
        for check in health.checks:
            icon = icons.get(check.status.value, "?")
            print(f"  {icon} {check.dependency.value}: {check.message} ({check.latency_ms:.1f}ms)")

    elif args.command == "status":
        health = mgr.check_all()
        if health.status == HealthStatus.HEALTHY:
            print("HEALTHY")
        else:
            print(health.status.value.upper())

    elif args.command == "watch":
        print("Watching health (Ctrl+C to stop)...")
        try:
            while True:
                health = mgr.check_all()
                timestamp = datetime.now().strftime("%H:%M:%S")
                status_icon = icons.get(health.status.value, "?")
                # Checks come back in a fixed order: Vault, Dragonfly, Ledger.
                vault_ms, dfly_ms, ledger_ms = (c.latency_ms for c in health.checks)
                print(f"[{timestamp}] {status_icon} {health.status.value} - V:{vault_ms:.0f}ms D:{dfly_ms:.0f}ms L:{ledger_ms:.0f}ms")
                time.sleep(args.interval)
        except KeyboardInterrupt:
            print("\nStopped.")


if __name__ == "__main__":
    main()
|