Phase 8 Production Hardening with complete governance infrastructure: - Vault integration with tiered policies (T0-T4) - DragonflyDB state management - SQLite audit ledger - Pipeline DSL and templates - Promotion/revocation engine - Checkpoint system for session persistence - Health manager and circuit breaker for fault tolerance - GitHub/Slack integrations - Architectural test pipeline with bug watcher, suggestion engine, council review - Multi-agent chaos testing framework Test Results: - Governance tests: 68/68 passing - E2E workflow: 16/16 passing - Phase 2 Vault: 14/14 passing - Integration tests: 27/27 passing Coverage: 57.6% average across 12 phases Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
449 lines
15 KiB
Python
449 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Health Manager
|
|
==============
|
|
Production health check infrastructure for monitoring system dependencies.
|
|
|
|
Monitors:
|
|
- HashiCorp Vault (authentication, policies)
|
|
- DragonflyDB (state management)
|
|
- SQLite Ledger (audit trail)
|
|
- Agent processes
|
|
|
|
Part of Phase 8: Production Hardening.
|
|
"""
|
|
|
|
import json
|
|
import sqlite3
|
|
import subprocess
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from enum import Enum
|
|
from pathlib import Path
|
|
from typing import Optional, Callable
|
|
import redis
|
|
|
|
|
|
class HealthStatus(str, Enum):
    """Health check status.

    Subclasses ``str`` so values serialize directly to JSON and compare
    equal to their plain-string form.
    """
    HEALTHY = "healthy"      # dependency fully operational
    DEGRADED = "degraded"    # operational but impaired (e.g. high latency, missing tables)
    UNHEALTHY = "unhealthy"  # dependency unreachable, sealed, or failing
    UNKNOWN = "unknown"      # status could not be determined
|
|
|
|
|
|
class DependencyType(str, Enum):
    """Types of system dependencies monitored by the health manager."""
    VAULT = "vault"          # HashiCorp Vault (authentication, policies)
    DRAGONFLY = "dragonfly"  # DragonflyDB (Redis-compatible state management)
    LEDGER = "ledger"        # SQLite audit ledger
    AGENT = "agent"          # agent processes
    NETWORK = "network"      # network connectivity
|
|
|
|
|
|
@dataclass
class HealthCheckResult:
    """Outcome of a single dependency health check."""
    dependency: DependencyType
    status: HealthStatus
    latency_ms: float
    message: str
    checked_at: str = ""
    details: dict = field(default_factory=dict)

    def __post_init__(self):
        # Stamp the check time lazily so callers may omit it.
        self.checked_at = self.checked_at or datetime.now(timezone.utc).isoformat()

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict (enum members flattened to strings)."""
        payload = dict(
            dependency=self.dependency.value,
            status=self.status.value,
            latency_ms=self.latency_ms,
            message=self.message,
            checked_at=self.checked_at,
        )
        payload["details"] = self.details
        return payload
|
|
|
|
|
|
@dataclass
class SystemHealth:
    """Aggregate health snapshot across all dependency checks."""
    status: HealthStatus
    checks: list[HealthCheckResult]
    healthy_count: int
    degraded_count: int
    unhealthy_count: int
    checked_at: str = ""

    def __post_init__(self):
        # Stamp the snapshot time lazily so callers may omit it.
        self.checked_at = self.checked_at or datetime.now(timezone.utc).isoformat()

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict, flattening nested check results."""
        snapshot = {
            "status": self.status.value,
            "healthy": self.healthy_count,
            "degraded": self.degraded_count,
            "unhealthy": self.unhealthy_count,
            "checked_at": self.checked_at,
        }
        snapshot["checks"] = [check.to_dict() for check in self.checks]
        return snapshot
|
|
|
|
|
|
class HealthManager:
    """
    Manages health checks for all system dependencies.

    Features:
    - Individual dependency health checks (Vault, DragonflyDB, SQLite ledger)
    - Aggregate system health with worst-status roll-up
    - Configurable latency thresholds
    - Health history tracking (in memory and mirrored to DragonflyDB)
    """

    # Default configuration
    VAULT_ADDR = "https://127.0.0.1:8200"
    VAULT_TOKEN_FILE = "/opt/vault/init-keys.json"
    REDIS_HOST = "127.0.0.1"
    REDIS_PORT = 6379
    LEDGER_PATH = "/opt/agent-governance/ledger/governance.db"

    # Thresholds (ms)
    LATENCY_WARN_THRESHOLD = 500
    LATENCY_CRITICAL_THRESHOLD = 2000
    CHECK_TIMEOUT = 5  # seconds, applied to every external call

    def __init__(self, base_path: str = "/opt/agent-governance"):
        self.base_path = Path(base_path)
        self._vault_token: Optional[str] = None
        self._redis: Optional[redis.Redis] = None
        self._history: list[SystemHealth] = []

    def _now(self) -> str:
        """Current UTC timestamp in ISO-8601 form."""
        return datetime.now(timezone.utc).isoformat()

    def _status_for_latency(self, latency_ms: float) -> HealthStatus:
        """Map a measured latency to a health status.

        Fix: the original code mapped both the warn and critical
        thresholds to DEGRADED, leaving the critical threshold with no
        effect. Latency above the critical threshold now reports
        UNHEALTHY, as the two-tier thresholds imply.
        """
        if latency_ms > self.LATENCY_CRITICAL_THRESHOLD:
            return HealthStatus.UNHEALTHY
        if latency_ms > self.LATENCY_WARN_THRESHOLD:
            return HealthStatus.DEGRADED
        return HealthStatus.HEALTHY

    def _get_vault_token(self) -> str:
        """Read and cache the Vault root token; return '' if unavailable."""
        if self._vault_token:
            return self._vault_token
        try:
            with open(self.VAULT_TOKEN_FILE) as f:
                self._vault_token = json.load(f)["root_token"]
            return self._vault_token
        except Exception:
            # Best-effort: callers treat '' as "token unavailable".
            return ""

    def _get_redis(self) -> Optional[redis.Redis]:
        """Build and cache a DragonflyDB client using credentials from Vault.

        Returns None when credentials cannot be fetched or parsed.
        """
        if self._redis:
            return self._redis
        try:
            # Get credentials from Vault (-k: local Vault uses a
            # self-signed cert — TODO confirm).
            token = self._get_vault_token()
            result = subprocess.run([
                "curl", "-sk",
                "-H", f"X-Vault-Token: {token}",
                f"{self.VAULT_ADDR}/v1/secret/data/services/dragonfly"
            ], capture_output=True, text=True, timeout=self.CHECK_TIMEOUT)

            creds = json.loads(result.stdout)["data"]["data"]
            self._redis = redis.Redis(
                host=creds["host"],
                port=int(creds["port"]),
                password=creds["password"],
                decode_responses=True,
                socket_timeout=self.CHECK_TIMEOUT
            )
            return self._redis
        except Exception:
            return None

    def check_vault(self) -> HealthCheckResult:
        """Check Vault health: reachable, initialized, and unsealed."""
        start = time.time()
        try:
            token = self._get_vault_token()
            if not token:
                return HealthCheckResult(
                    dependency=DependencyType.VAULT,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=0,
                    message="Cannot read Vault token"
                )

            result = subprocess.run([
                "curl", "-sk",
                "-H", f"X-Vault-Token: {token}",
                f"{self.VAULT_ADDR}/v1/sys/health"
            ], capture_output=True, text=True, timeout=self.CHECK_TIMEOUT)

            latency = (time.time() - start) * 1000
            data = json.loads(result.stdout)

            # A sealed Vault rejects all secret operations.
            if data.get("sealed"):
                return HealthCheckResult(
                    dependency=DependencyType.VAULT,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=latency,
                    message="Vault is sealed",
                    details=data
                )

            if not data.get("initialized"):
                return HealthCheckResult(
                    dependency=DependencyType.VAULT,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=latency,
                    message="Vault is not initialized",
                    details=data
                )

            return HealthCheckResult(
                dependency=DependencyType.VAULT,
                status=self._status_for_latency(latency),
                latency_ms=latency,
                message="Vault is healthy",
                details={"initialized": True, "sealed": False}
            )

        except subprocess.TimeoutExpired:
            return HealthCheckResult(
                dependency=DependencyType.VAULT,
                status=HealthStatus.UNHEALTHY,
                latency_ms=self.CHECK_TIMEOUT * 1000,
                message="Vault health check timed out"
            )
        except Exception as e:
            return HealthCheckResult(
                dependency=DependencyType.VAULT,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.time() - start) * 1000,
                message=f"Vault check failed: {str(e)}"
            )

    def check_dragonfly(self) -> HealthCheckResult:
        """Check DragonflyDB health via PING and server INFO."""
        start = time.time()
        try:
            r = self._get_redis()
            if not r:
                return HealthCheckResult(
                    dependency=DependencyType.DRAGONFLY,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=0,
                    message="Cannot connect to DragonflyDB"
                )

            # Ping test — latency is measured to this round-trip.
            r.ping()
            latency = (time.time() - start) * 1000

            # Get info (Dragonfly speaks the Redis protocol, hence the
            # redis_* keys).
            info = r.info("server")

            return HealthCheckResult(
                dependency=DependencyType.DRAGONFLY,
                status=self._status_for_latency(latency),
                latency_ms=latency,
                message="DragonflyDB is healthy",
                details={
                    "version": info.get("redis_version", "unknown"),
                    "uptime_seconds": info.get("uptime_in_seconds", 0)
                }
            )

        except redis.ConnectionError:
            return HealthCheckResult(
                dependency=DependencyType.DRAGONFLY,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.time() - start) * 1000,
                message="DragonflyDB connection refused"
            )
        except Exception as e:
            return HealthCheckResult(
                dependency=DependencyType.DRAGONFLY,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.time() - start) * 1000,
                message=f"DragonflyDB check failed: {str(e)}"
            )

    def check_ledger(self) -> HealthCheckResult:
        """Check SQLite ledger health: file exists and required tables present."""
        start = time.time()
        try:
            if not Path(self.LEDGER_PATH).exists():
                return HealthCheckResult(
                    dependency=DependencyType.LEDGER,
                    status=HealthStatus.UNHEALTHY,
                    latency_ms=0,
                    message="Ledger database file not found"
                )

            conn = sqlite3.connect(self.LEDGER_PATH, timeout=self.CHECK_TIMEOUT)
            try:
                cursor = conn.cursor()
                # Check tables exist
                cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
                tables = [row[0] for row in cursor.fetchall()]
            finally:
                # Fix: the original leaked the connection when a query
                # raised (the except handlers never closed it).
                conn.close()

            required_tables = ["agent_metrics", "violations", "promotions", "orchestration_log"]
            missing = [t for t in required_tables if t not in tables]
            latency = (time.time() - start) * 1000

            if missing:
                return HealthCheckResult(
                    dependency=DependencyType.LEDGER,
                    status=HealthStatus.DEGRADED,
                    latency_ms=latency,
                    message=f"Missing tables: {', '.join(missing)}",
                    details={"tables": tables, "missing": missing}
                )

            return HealthCheckResult(
                dependency=DependencyType.LEDGER,
                status=self._status_for_latency(latency),
                latency_ms=latency,
                message="Ledger is healthy",
                details={"tables": tables}
            )

        except sqlite3.OperationalError as e:
            return HealthCheckResult(
                dependency=DependencyType.LEDGER,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.time() - start) * 1000,
                message=f"Ledger error: {str(e)}"
            )
        except Exception as e:
            return HealthCheckResult(
                dependency=DependencyType.LEDGER,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.time() - start) * 1000,
                message=f"Ledger check failed: {str(e)}"
            )

    def check_all(self) -> SystemHealth:
        """Run all health checks and return the aggregate status.

        Also appends the snapshot to in-memory history (bounded to 100
        entries) and mirrors it to DragonflyDB on a best-effort basis.
        """
        checks = [
            self.check_vault(),
            self.check_dragonfly(),
            self.check_ledger()
        ]

        healthy = sum(1 for c in checks if c.status == HealthStatus.HEALTHY)
        degraded = sum(1 for c in checks if c.status == HealthStatus.DEGRADED)
        unhealthy = sum(1 for c in checks if c.status == HealthStatus.UNHEALTHY)

        # Determine overall status: worst individual status wins.
        if unhealthy > 0:
            overall = HealthStatus.UNHEALTHY
        elif degraded > 0:
            overall = HealthStatus.DEGRADED
        else:
            overall = HealthStatus.HEALTHY

        health = SystemHealth(
            status=overall,
            checks=checks,
            healthy_count=healthy,
            degraded_count=degraded,
            unhealthy_count=unhealthy
        )

        # Track history (bounded)
        self._history.append(health)
        if len(self._history) > 100:
            self._history = self._history[-100:]

        # Store in DragonflyDB. Best-effort: a health check must not
        # fail just because the state store is down.
        try:
            r = self._get_redis()
            if r:
                payload = json.dumps(health.to_dict())  # serialize once, reuse twice
                r.set("health:latest", payload)
                r.lpush("health:history", payload)
                r.ltrim("health:history", 0, 99)
        except Exception:
            pass

        return health

    def get_status(self) -> dict:
        """Get current health status as a dict (runs all checks)."""
        return self.check_all().to_dict()

    def is_healthy(self) -> bool:
        """Quick check: True only if every dependency is fully healthy."""
        return self.check_all().status == HealthStatus.HEALTHY
|
|
|
|
|
|
# =============================================================================
|
|
# CLI Interface
|
|
# =============================================================================
|
|
|
|
def main():
    """CLI entry point: one-shot check, terse status, or continuous watch."""
    import argparse

    parser = argparse.ArgumentParser(description="Health Manager CLI")
    parser.add_argument("command", choices=["check", "status", "watch"],
                        help="Command to run")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--interval", type=int, default=5,
                        help="Watch interval in seconds")

    args = parser.parse_args()
    mgr = HealthManager()
    icons = {"healthy": "✅", "degraded": "⚠️", "unhealthy": "❌"}

    if args.command == "check":
        health = mgr.check_all()
        if args.json:
            print(json.dumps(health.to_dict(), indent=2))
            return
        print(f"\nSystem Health: {health.status.value.upper()}")
        print(f"Healthy: {health.healthy_count} | Degraded: {health.degraded_count} | Unhealthy: {health.unhealthy_count}")
        print()
        for check in health.checks:
            icon = icons.get(check.status.value, "?")
            print(f"  {icon} {check.dependency.value}: {check.message} ({check.latency_ms:.1f}ms)")

    elif args.command == "status":
        health = mgr.check_all()
        if health.status == HealthStatus.HEALTHY:
            print("HEALTHY")
        else:
            print(health.status.value.upper())

    elif args.command == "watch":
        print("Watching health (Ctrl+C to stop)...")
        try:
            while True:
                health = mgr.check_all()
                timestamp = datetime.now().strftime("%H:%M:%S")
                status_icon = icons.get(health.status.value, "?")
                # Checks come back in a fixed order: Vault, Dragonfly, Ledger.
                vault_ms, dfly_ms, ledger_ms = (c.latency_ms for c in health.checks)
                print(f"[{timestamp}] {status_icon} {health.status.value} - V:{vault_ms:.0f}ms D:{dfly_ms:.0f}ms L:{ledger_ms:.0f}ms")
                time.sleep(args.interval)
        except KeyboardInterrupt:
            print("\nStopped.")


if __name__ == "__main__":
    main()
|