agent-governance/runtime/health_manager.py
profit 77655c298c Initial commit: Agent Governance System Phase 8
Phase 8 Production Hardening with complete governance infrastructure:

- Vault integration with tiered policies (T0-T4)
- DragonflyDB state management
- SQLite audit ledger
- Pipeline DSL and templates
- Promotion/revocation engine
- Checkpoint system for session persistence
- Health manager and circuit breaker for fault tolerance
- GitHub/Slack integrations
- Architectural test pipeline with bug watcher, suggestion engine, council review
- Multi-agent chaos testing framework

Test Results:
- Governance tests: 68/68 passing
- E2E workflow: 16/16 passing
- Phase 2 Vault: 14/14 passing
- Integration tests: 27/27 passing

Coverage: 57.6% average across 12 phases

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 22:07:06 -05:00

449 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Health Manager
==============
Production health check infrastructure for monitoring system dependencies.
Monitors:
- HashiCorp Vault (authentication, policies)
- DragonflyDB (state management)
- SQLite Ledger (audit trail)
- Agent processes
Part of Phase 8: Production Hardening.
"""
import json
import sqlite3
import subprocess
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Optional, Callable
import redis
class HealthStatus(str, Enum):
    """Health check status.

    Members are also ``str`` instances, so each one compares equal to its
    raw value (for example ``HealthStatus.HEALTHY == "healthy"``).
    """

    # Check passed.
    HEALTHY = "healthy"
    # Dependency answered, but something is off (slow, partially missing).
    DEGRADED = "degraded"
    # Dependency failed its check.
    UNHEALTHY = "unhealthy"
    # No verdict available.
    UNKNOWN = "unknown"
class DependencyType(str, Enum):
    """Types of system dependencies.

    Members are also ``str`` instances; the value is what ends up in the
    ``"dependency"`` field of a serialized health check result.
    """

    VAULT = "vault"
    DRAGONFLY = "dragonfly"
    LEDGER = "ledger"
    AGENT = "agent"
    NETWORK = "network"
@dataclass
class HealthCheckResult:
    """Outcome of probing a single dependency.

    ``checked_at`` is filled with the current UTC time (ISO 8601) when the
    caller leaves it empty.
    """

    dependency: DependencyType
    status: HealthStatus
    latency_ms: float
    message: str
    checked_at: str = ""
    details: dict = field(default_factory=dict)

    def __post_init__(self):
        # A caller-supplied timestamp always wins.
        if self.checked_at:
            return
        self.checked_at = datetime.now(timezone.utc).isoformat()

    def to_dict(self) -> dict:
        """Return a plain dict with the enum fields flattened to their values."""
        payload = {
            "dependency": self.dependency.value,
            "status": self.status.value,
        }
        # The remaining fields are copied through unchanged, in this order.
        for attr in ("latency_ms", "message", "checked_at", "details"):
            payload[attr] = getattr(self, attr)
        return payload
@dataclass
class SystemHealth:
    """Aggregate verdict over a set of individual health check results.

    ``checked_at`` is filled with the current UTC time (ISO 8601) when the
    caller leaves it empty.
    """

    status: HealthStatus
    checks: list[HealthCheckResult]
    healthy_count: int
    degraded_count: int
    unhealthy_count: int
    checked_at: str = ""

    def __post_init__(self):
        # Keep a caller-supplied timestamp; otherwise stamp "now" in UTC.
        self.checked_at = self.checked_at or datetime.now(timezone.utc).isoformat()

    def to_dict(self) -> dict:
        """Return a plain dict; each contained check is serialized via its own ``to_dict``."""
        return dict(
            status=self.status.value,
            healthy=self.healthy_count,
            degraded=self.degraded_count,
            unhealthy=self.unhealthy_count,
            checked_at=self.checked_at,
            checks=[result.to_dict() for result in self.checks],
        )
class HealthManager:
    """
    Manages health checks for all system dependencies.

    Features:
    - Individual dependency health checks (Vault, DragonflyDB, SQLite ledger)
    - Aggregate system health
    - Configurable thresholds (class attributes below)
    - Health history tracking (in memory and, best-effort, in DragonflyDB)
    """

    # Default configuration
    VAULT_ADDR = "https://127.0.0.1:8200"
    VAULT_TOKEN_FILE = "/opt/vault/init-keys.json"
    REDIS_HOST = "127.0.0.1"
    REDIS_PORT = 6379
    LEDGER_PATH = "/opt/agent-governance/ledger/governance.db"

    # Thresholds (ms)
    LATENCY_WARN_THRESHOLD = 500
    LATENCY_CRITICAL_THRESHOLD = 2000
    # Timeout (seconds) applied to curl, the Redis socket and the SQLite lock wait.
    CHECK_TIMEOUT = 5

    # Tables the ledger must contain to be considered complete.
    REQUIRED_LEDGER_TABLES = (
        "agent_metrics",
        "violations",
        "promotions",
        "orchestration_log",
    )
    # Number of aggregate results kept in memory and in "health:history".
    HISTORY_LIMIT = 100

    def __init__(self, base_path: str = "/opt/agent-governance"):
        """
        Args:
            base_path: Root directory of the installation. It is stored on the
                instance; the checks themselves use the class-level paths.
        """
        self.base_path = Path(base_path)
        self._vault_token: Optional[str] = None
        self._redis: Optional[redis.Redis] = None
        self._history: list[SystemHealth] = []

    def _now(self) -> str:
        """Return the current UTC time as an ISO 8601 string."""
        return datetime.now(timezone.utc).isoformat()

    @staticmethod
    def _elapsed_ms(start: float) -> float:
        """Milliseconds elapsed since ``start`` (a ``time.perf_counter()`` reading)."""
        return (time.perf_counter() - start) * 1000

    def _latency_issue(self, latency_ms: float) -> Optional[str]:
        """Describe which latency threshold was exceeded, or return None.

        NOTE(review): exceeding either threshold marks a check DEGRADED, the
        same status the previous duplicated branches produced. Whether the
        critical threshold should escalate to UNHEALTHY is a policy decision
        left to the owners.
        """
        if latency_ms > self.LATENCY_CRITICAL_THRESHOLD:
            return f"latency above critical threshold ({self.LATENCY_CRITICAL_THRESHOLD}ms)"
        if latency_ms > self.LATENCY_WARN_THRESHOLD:
            return f"latency above warning threshold ({self.LATENCY_WARN_THRESHOLD}ms)"
        return None

    def _get_vault_token(self) -> str:
        """Return the Vault root token, or "" when it cannot be read.

        The token is read from ``VAULT_TOKEN_FILE`` (JSON with a
        ``root_token`` key) and cached after the first successful read.
        """
        if self._vault_token:
            return self._vault_token
        try:
            with open(self.VAULT_TOKEN_FILE, encoding="utf-8") as f:
                token = json.load(f)["root_token"]
        except (OSError, ValueError, KeyError, TypeError):
            # Missing/unreadable file, invalid JSON, or unexpected structure.
            return ""
        if not isinstance(token, str):
            return ""
        self._vault_token = token
        return token

    def _vault_get(self, path: str, token: str) -> dict:
        """GET ``VAULT_ADDR + path`` with curl and return the decoded JSON body.

        Raises:
            subprocess.TimeoutExpired: curl ran longer than ``CHECK_TIMEOUT``.
            RuntimeError: curl failed and produced no response body.
            ValueError: the response body is not valid JSON.

        NOTE(review): the token is passed on curl's command line, where it is
        visible in the process list while curl runs, and ``-k`` disables TLS
        certificate verification. Both are kept as-is here; consider feeding
        the header through stdin and trusting the Vault CA instead.
        """
        result = subprocess.run(
            [
                "curl", "-sk",
                "-H", f"X-Vault-Token: {token}",
                f"{self.VAULT_ADDR}{path}",
            ],
            capture_output=True,
            text=True,
            timeout=self.CHECK_TIMEOUT,
        )
        # Without this, a failed curl shows up as an opaque JSON decode error.
        if result.returncode != 0 and not result.stdout.strip():
            raise RuntimeError(f"curl exited with code {result.returncode}")
        return json.loads(result.stdout)

    def _get_redis(self) -> Optional[redis.Redis]:
        """Return a cached Redis/DragonflyDB client, or None if it cannot be built.

        Connection details (host, port, password) are fetched from Vault at
        ``secret/data/services/dragonfly``. Constructing the client does not
        prove the server is reachable; callers still have to issue a command.
        """
        if self._redis is not None:
            return self._redis
        try:
            token = self._get_vault_token()
            creds = self._vault_get("/v1/secret/data/services/dragonfly", token)["data"]["data"]
            self._redis = redis.Redis(
                host=creds["host"],
                port=int(creds["port"]),
                password=creds["password"],
                decode_responses=True,
                socket_timeout=self.CHECK_TIMEOUT,
            )
            return self._redis
        except Exception:
            # Boundary: any failure here means "no client"; callers report it.
            return None

    def check_vault(self) -> HealthCheckResult:
        """Check Vault health via ``/v1/sys/health``.

        Returns UNHEALTHY when the token cannot be read, the request fails or
        times out, or Vault reports itself uninitialized or sealed; DEGRADED
        when the response is slow; HEALTHY otherwise.
        """
        start = time.perf_counter()
        try:
            token = self._get_vault_token()
            if not token:
                return HealthCheckResult(
                    dependency=DependencyType.VAULT,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=0,
                    message="Cannot read Vault token",
                )

            data = self._vault_get("/v1/sys/health", token)
            latency = self._elapsed_ms(start)

            # Test "initialized" first: an uninitialized Vault also reports
            # itself as sealed, which would otherwise mask the real cause.
            if not data.get("initialized"):
                return HealthCheckResult(
                    dependency=DependencyType.VAULT,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=latency,
                    message="Vault is not initialized",
                    details=data,
                )
            if data.get("sealed"):
                return HealthCheckResult(
                    dependency=DependencyType.VAULT,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=latency,
                    message="Vault is sealed",
                    details=data,
                )

            issue = self._latency_issue(latency)
            return HealthCheckResult(
                dependency=DependencyType.VAULT,
                status=HealthStatus.DEGRADED if issue else HealthStatus.HEALTHY,
                latency_ms=latency,
                message=f"Vault is degraded: {issue}" if issue else "Vault is healthy",
                details={"initialized": True, "sealed": False},
            )
        except subprocess.TimeoutExpired:
            return HealthCheckResult(
                dependency=DependencyType.VAULT,
                status=HealthStatus.UNHEALTHY,
                latency_ms=self.CHECK_TIMEOUT * 1000,
                message="Vault health check timed out",
            )
        except Exception as e:
            return HealthCheckResult(
                dependency=DependencyType.VAULT,
                status=HealthStatus.UNHEALTHY,
                latency_ms=self._elapsed_ms(start),
                message=f"Vault check failed: {str(e)}",
            )

    def check_dragonfly(self) -> HealthCheckResult:
        """Check DragonflyDB health with a PING followed by ``INFO server``.

        Returns UNHEALTHY when no client can be built or a command fails,
        DEGRADED when the PING is slow, HEALTHY otherwise.
        """
        start = time.perf_counter()
        try:
            client = self._get_redis()
            if client is None:
                return HealthCheckResult(
                    dependency=DependencyType.DRAGONFLY,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=0,
                    message="Cannot connect to DragonflyDB",
                )

            # Restart the clock so the one-time credential fetch from Vault
            # is not billed to DragonflyDB's latency.
            start = time.perf_counter()
            client.ping()
            latency = self._elapsed_ms(start)

            info = client.info("server")
            issue = self._latency_issue(latency)
            return HealthCheckResult(
                dependency=DependencyType.DRAGONFLY,
                status=HealthStatus.DEGRADED if issue else HealthStatus.HEALTHY,
                latency_ms=latency,
                message=f"DragonflyDB is degraded: {issue}" if issue else "DragonflyDB is healthy",
                details={
                    "version": info.get("redis_version", "unknown"),
                    "uptime_seconds": info.get("uptime_in_seconds", 0),
                },
            )
        except redis.ConnectionError:
            return HealthCheckResult(
                dependency=DependencyType.DRAGONFLY,
                status=HealthStatus.UNHEALTHY,
                latency_ms=self._elapsed_ms(start),
                message="DragonflyDB connection refused",
            )
        except Exception as e:
            return HealthCheckResult(
                dependency=DependencyType.DRAGONFLY,
                status=HealthStatus.UNHEALTHY,
                latency_ms=self._elapsed_ms(start),
                message=f"DragonflyDB check failed: {str(e)}",
            )

    def check_ledger(self) -> HealthCheckResult:
        """Check SQLite ledger health.

        Returns UNHEALTHY when the database file is missing or cannot be
        queried, DEGRADED when required tables are missing or the query is
        slow, HEALTHY otherwise.
        """
        start = time.perf_counter()
        try:
            if not Path(self.LEDGER_PATH).exists():
                return HealthCheckResult(
                    dependency=DependencyType.LEDGER,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=0,
                    message="Ledger database file not found",
                )

            conn = sqlite3.connect(self.LEDGER_PATH, timeout=self.CHECK_TIMEOUT)
            try:
                rows = conn.execute(
                    "SELECT name FROM sqlite_master WHERE type='table'"
                ).fetchall()
                latency = self._elapsed_ms(start)
            finally:
                # Always release the handle, including when the query raises.
                conn.close()

            tables = [row[0] for row in rows]
            missing = [t for t in self.REQUIRED_LEDGER_TABLES if t not in tables]
            if missing:
                return HealthCheckResult(
                    dependency=DependencyType.LEDGER,
                    status=HealthStatus.DEGRADED,
                    latency_ms=latency,
                    message=f"Missing tables: {', '.join(missing)}",
                    details={"tables": tables, "missing": missing},
                )

            issue = self._latency_issue(latency)
            return HealthCheckResult(
                dependency=DependencyType.LEDGER,
                status=HealthStatus.DEGRADED if issue else HealthStatus.HEALTHY,
                latency_ms=latency,
                message=f"Ledger is degraded: {issue}" if issue else "Ledger is healthy",
                details={"tables": tables},
            )
        except sqlite3.OperationalError as e:
            return HealthCheckResult(
                dependency=DependencyType.LEDGER,
                status=HealthStatus.UNHEALTHY,
                latency_ms=self._elapsed_ms(start),
                message=f"Ledger error: {str(e)}",
            )
        except Exception as e:
            return HealthCheckResult(
                dependency=DependencyType.LEDGER,
                status=HealthStatus.UNHEALTHY,
                latency_ms=self._elapsed_ms(start),
                message=f"Ledger check failed: {str(e)}",
            )

    def check_all(self) -> SystemHealth:
        """Run all health checks and return aggregate status.

        The overall status is UNHEALTHY if any check is unhealthy, otherwise
        DEGRADED if any is degraded, otherwise HEALTHY. The result is appended
        to the in-memory history and, best-effort, published to DragonflyDB
        under ``health:latest`` and ``health:history``.
        """
        checks = [
            self.check_vault(),
            self.check_dragonfly(),
            self.check_ledger(),
        ]
        healthy = sum(1 for c in checks if c.status == HealthStatus.HEALTHY)
        degraded = sum(1 for c in checks if c.status == HealthStatus.DEGRADED)
        unhealthy = sum(1 for c in checks if c.status == HealthStatus.UNHEALTHY)

        # Determine overall status: the worst individual status wins.
        if unhealthy > 0:
            overall = HealthStatus.UNHEALTHY
        elif degraded > 0:
            overall = HealthStatus.DEGRADED
        else:
            overall = HealthStatus.HEALTHY

        health = SystemHealth(
            status=overall,
            checks=checks,
            healthy_count=healthy,
            degraded_count=degraded,
            unhealthy_count=unhealthy,
        )

        # Track history, keeping only the most recent entries.
        self._history.append(health)
        if len(self._history) > self.HISTORY_LIMIT:
            self._history = self._history[-self.HISTORY_LIMIT:]

        # Store in DragonflyDB. Deliberately best-effort: a failure to publish
        # must never turn a completed health check into an exception.
        try:
            client = self._get_redis()
            if client is not None:
                payload = json.dumps(health.to_dict())
                client.set("health:latest", payload)
                client.lpush("health:history", payload)
                client.ltrim("health:history", 0, self.HISTORY_LIMIT - 1)
        except Exception:
            pass
        return health

    def get_status(self) -> dict:
        """Run all checks and return the aggregate result as a dict."""
        return self.check_all().to_dict()

    def is_healthy(self) -> bool:
        """Run all checks and return True only if the overall status is HEALTHY."""
        return self.check_all().status == HealthStatus.HEALTHY
# =============================================================================
# CLI Interface
# =============================================================================
def main():
    """Command-line entry point.

    Commands:
        check   Run all checks once and print a report (``--json`` for JSON).
        status  Run all checks once and print the overall status in upper case.
        watch   Re-run all checks every ``--interval`` seconds until Ctrl+C.
    """
    import argparse

    def non_negative_int(raw: str) -> int:
        """argparse type: an integer >= 0 (``time.sleep`` rejects negatives)."""
        try:
            value = int(raw)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid int value: {raw!r}") from None
        if value < 0:
            raise argparse.ArgumentTypeError("interval must be >= 0")
        return value

    parser = argparse.ArgumentParser(description="Health Manager CLI")
    parser.add_argument("command", choices=["check", "status", "watch"],
                        help="Command to run")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--interval", type=non_negative_int, default=5,
                        help="Watch interval in seconds")
    args = parser.parse_args()

    manager = HealthManager()

    # Status value -> marker shown in human-readable output.
    # NOTE(review): the healthy/unhealthy markers are empty strings in this
    # copy of the file; confirm whether glyphs were intended there.
    icons = {"healthy": "", "degraded": "⚠️", "unhealthy": ""}

    if args.command == "check":
        health = manager.check_all()
        if args.json:
            print(json.dumps(health.to_dict(), indent=2))
        else:
            print(f"\nSystem Health: {health.status.value.upper()}")
            print(f"Healthy: {health.healthy_count} | Degraded: {health.degraded_count} | Unhealthy: {health.unhealthy_count}")
            print()
            for check in health.checks:
                icon = icons.get(check.status.value, "?")
                print(f" {icon} {check.dependency.value}: {check.message} ({check.latency_ms:.1f}ms)")
    elif args.command == "status":
        health = manager.check_all()
        print(health.status.value.upper())
    elif args.command == "watch":
        print("Watching health (Ctrl+C to stop)...")
        try:
            while True:
                health = manager.check_all()
                timestamp = datetime.now().strftime("%H:%M:%S")
                status_icon = icons.get(health.status.value, "?")
                # Look latencies up by dependency rather than list position.
                latency = {c.dependency: c.latency_ms for c in health.checks}
                print(f"[{timestamp}] {status_icon} {health.status.value} - V:{latency[DependencyType.VAULT]:.0f}ms D:{latency[DependencyType.DRAGONFLY]:.0f}ms L:{latency[DependencyType.LEDGER]:.0f}ms")
                time.sleep(args.interval)
        except KeyboardInterrupt:
            print("\nStopped.")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()