#!/usr/bin/env python3
"""
Circuit Breaker
===============
Implements circuit breaker pattern for graceful degradation
when dependencies fail.

States:
- CLOSED: Normal operation, requests flow through
- OPEN: Circuit tripped, requests fail fast
- HALF_OPEN: Testing if dependency recovered

Part of Phase 8: Production Hardening.
"""

import json
import logging
import time
import threading
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from functools import wraps
from typing import Any, Callable, Optional, TypeVar, Generic

try:
    import redis
except ImportError:
    # The redis module is only referenced in a type annotation below; the
    # client object itself is injected by the caller. Keep the breaker usable
    # when the package is not installed.
    redis = None

logger = logging.getLogger(__name__)


class CircuitState(str, Enum):
    """Circuit breaker states"""
    CLOSED = "closed"        # Normal operation
    OPEN = "open"            # Failing fast
    HALF_OPEN = "half_open"  # Testing recovery


class FailureType(str, Enum):
    """Types of failures tracked"""
    TIMEOUT = "timeout"
    CONNECTION_ERROR = "connection_error"
    EXCEPTION = "exception"
    THRESHOLD_EXCEEDED = "threshold_exceeded"


@dataclass
class CircuitStats:
    """Statistics for a circuit breaker"""
    total_calls: int = 0       # Successes + failures + rejections
    successful_calls: int = 0
    failed_calls: int = 0
    rejected_calls: int = 0    # Calls refused without running the function
    last_failure_time: Optional[str] = None  # ISO-8601 UTC timestamp
    last_success_time: Optional[str] = None  # ISO-8601 UTC timestamp
    consecutive_failures: int = 0
    consecutive_successes: int = 0

    def to_dict(self) -> dict:
        """Return all counters plus the derived ``success_rate`` as a dict."""
        return {
            "total_calls": self.total_calls,
            "successful_calls": self.successful_calls,
            "failed_calls": self.failed_calls,
            "rejected_calls": self.rejected_calls,
            "last_failure_time": self.last_failure_time,
            "last_success_time": self.last_success_time,
            "consecutive_failures": self.consecutive_failures,
            "consecutive_successes": self.consecutive_successes,
            "success_rate": self.success_rate
        }

    @property
    def success_rate(self) -> float:
        """Fraction of all recorded calls that succeeded (1.0 when no calls).

        Rejected calls are part of ``total_calls`` and therefore lower the
        rate.
        """
        if self.total_calls == 0:
            return 1.0
        return self.successful_calls / self.total_calls


@dataclass
class CircuitConfig:
    """Configuration for a circuit breaker"""
    name: str
    failure_threshold: int = 5     # Failures before opening
    success_threshold: int = 3     # Successes to close from half-open
    timeout_seconds: float = 30.0  # Time before trying again (open -> half-open)
    # Timeout for individual calls. NOTE: nothing in this module enforces it;
    # callers must apply it to their own requests.
    call_timeout: float = 10.0
    half_open_max_calls: int = 3   # Max concurrent calls in half-open


class CircuitOpenError(Exception):
    """Raised when circuit is open and rejecting calls"""

    def __init__(self, circuit_name: str, retry_after: float):
        self.circuit_name = circuit_name
        self.retry_after = retry_after
        super().__init__(f"Circuit '{circuit_name}' is open. Retry after {retry_after:.1f}s")


T = TypeVar('T')


class CircuitBreaker:
    """
    Circuit breaker implementation for a single dependency.

    Usage:
        breaker = CircuitBreaker(CircuitConfig(name="vault"))

        @breaker
        def call_vault():
            # ... make vault request
            pass

        # Or manual:
        with breaker:
            # ... protected code
    """

    def __init__(self, config: CircuitConfig):
        self.config = config
        self._state = CircuitState.CLOSED
        self._stats = CircuitStats()
        # time.time() of the most recent failure (or force_open), used to
        # decide when OPEN may move to HALF_OPEN.
        self._last_failure_time: Optional[float] = None
        # Probe calls currently in flight while HALF_OPEN.
        self._half_open_calls = 0
        # Successes recorded during the current HALF_OPEN period only.
        self._half_open_successes = 0
        # Re-entrant because locked methods read the `state` property, which
        # takes the lock again.
        self._lock = threading.RLock()

    @property
    def state(self) -> CircuitState:
        """Current state; reading it may move OPEN to HALF_OPEN.

        The transition happens once ``timeout_seconds`` have elapsed since
        the last recorded failure.
        """
        with self._lock:
            # Check if we should transition from OPEN to HALF_OPEN
            if self._state == CircuitState.OPEN and self._last_failure_time is not None:
                elapsed = time.time() - self._last_failure_time
                if elapsed >= self.config.timeout_seconds:
                    self._state = CircuitState.HALF_OPEN
                    # Every half-open period starts with fresh probe counters
                    # so earlier successes cannot close the circuit.
                    self._half_open_calls = 0
                    self._half_open_successes = 0
            return self._state

    @property
    def stats(self) -> CircuitStats:
        """The live statistics object (not a copy)."""
        return self._stats

    def _now(self) -> str:
        """Return the current UTC time as an ISO-8601 string."""
        return datetime.now(timezone.utc).isoformat()

    def _release_half_open_slot(self):
        """Free one half-open probe slot. Caller must hold ``self._lock``."""
        # Clamp at zero: a call admitted while CLOSED may finish while
        # HALF_OPEN without ever having taken a slot.
        if self._half_open_calls > 0:
            self._half_open_calls -= 1

    def _record_success(self):
        """Record a successful call"""
        with self._lock:
            self._stats.total_calls += 1
            self._stats.successful_calls += 1
            self._stats.consecutive_successes += 1
            self._stats.consecutive_failures = 0
            self._stats.last_success_time = self._now()

            if self._state == CircuitState.HALF_OPEN:
                # The probe finished, so its concurrency slot is free again.
                self._release_half_open_slot()
                self._half_open_successes += 1
                if self._half_open_successes >= self.config.success_threshold:
                    self._state = CircuitState.CLOSED
                    self._half_open_calls = 0
                    self._half_open_successes = 0
                    self._stats.consecutive_successes = 0

    def _record_failure(self, failure_type: FailureType = FailureType.EXCEPTION):
        """Record a failed call"""
        with self._lock:
            self._stats.total_calls += 1
            self._stats.failed_calls += 1
            self._stats.consecutive_failures += 1
            self._stats.consecutive_successes = 0
            self._stats.last_failure_time = self._now()
            self._last_failure_time = time.time()

            if self._state == CircuitState.CLOSED:
                if self._stats.consecutive_failures >= self.config.failure_threshold:
                    self._state = CircuitState.OPEN
            elif self._state == CircuitState.HALF_OPEN:
                # Any failure in half-open goes back to open
                self._state = CircuitState.OPEN
                self._half_open_calls = 0
                self._half_open_successes = 0

    def _record_rejection(self):
        """Record a rejected call (circuit open)"""
        with self._lock:
            self._stats.total_calls += 1
            self._stats.rejected_calls += 1

    def _can_execute(self) -> bool:
        """Check if a call can be executed

        In HALF_OPEN an admitted call takes one of ``half_open_max_calls``
        concurrent probe slots; the slot is released when the call's outcome
        is recorded.
        """
        # Decide under the lock so the state check and the slot update are
        # atomic with respect to other threads.
        with self._lock:
            state = self.state  # This may transition OPEN -> HALF_OPEN

            if state == CircuitState.CLOSED:
                return True

            if state == CircuitState.HALF_OPEN:
                if self._half_open_calls < self.config.half_open_max_calls:
                    self._half_open_calls += 1
                    return True

            return False

    def _get_retry_after(self) -> float:
        """Get seconds until circuit might close"""
        if self._last_failure_time is None:
            return 0.0
        elapsed = time.time() - self._last_failure_time
        return max(0.0, self.config.timeout_seconds - elapsed)

    def execute(self, func: Callable[[], T]) -> T:
        """Execute a function with circuit breaker protection

        Returns whatever ``func`` returns. Raises ``CircuitOpenError`` without
        calling ``func`` when the circuit rejects the call; any exception
        raised by ``func`` is recorded as a failure and re-raised unchanged.
        """
        if not self._can_execute():
            self._record_rejection()
            raise CircuitOpenError(self.config.name, self._get_retry_after())

        try:
            result = func()
        except Exception:
            self._record_failure()
            raise
        self._record_success()
        return result

    def __call__(self, func: Callable) -> Callable:
        """Decorator usage"""
        @wraps(func)
        def wrapper(*args, **kwargs):
            return self.execute(lambda: func(*args, **kwargs))
        return wrapper

    def __enter__(self):
        """Context manager entry"""
        if not self._can_execute():
            self._record_rejection()
            raise CircuitOpenError(self.config.name, self._get_retry_after())
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        if exc_type is None:
            self._record_success()
        else:
            self._record_failure()
        return False  # Don't suppress exceptions

    def reset(self):
        """Manually reset the circuit breaker"""
        with self._lock:
            self._state = CircuitState.CLOSED
            self._stats = CircuitStats()
            self._last_failure_time = None
            self._half_open_calls = 0
            self._half_open_successes = 0

    def force_open(self):
        """Manually open the circuit"""
        with self._lock:
            self._state = CircuitState.OPEN
            # Start the open timeout from now.
            self._last_failure_time = time.time()

    def to_dict(self) -> dict:
        """Get circuit breaker state as dict"""
        # Hold the lock so state and stats come from one consistent moment.
        with self._lock:
            return {
                "name": self.config.name,
                "state": self.state.value,
                "stats": self._stats.to_dict(),
                "config": {
                    "failure_threshold": self.config.failure_threshold,
                    "success_threshold": self.config.success_threshold,
                    "timeout_seconds": self.config.timeout_seconds
                }
            }


class CircuitBreakerRegistry:
    """
    Registry for managing multiple circuit breakers.

    Usage:
        registry = CircuitBreakerRegistry()
        registry.register("vault", CircuitConfig(name="vault", failure_threshold=3))

        @registry.protect("vault")
        def call_vault():
            # ...
    """

    def __init__(self, redis_client: "Optional[redis.Redis]" = None):
        self._breakers: dict[str, CircuitBreaker] = {}
        self._redis = redis_client
        self._lock = threading.RLock()

    def register(self, name: str, config: Optional[CircuitConfig] = None) -> CircuitBreaker:
        """Register a circuit breaker

        Returns the existing breaker when ``name`` is already registered; in
        that case ``config`` is ignored.
        """
        with self._lock:
            if name not in self._breakers:
                cfg = config or CircuitConfig(name=name)
                self._breakers[name] = CircuitBreaker(cfg)
            return self._breakers[name]

    def get(self, name: str) -> Optional[CircuitBreaker]:
        """Get a circuit breaker by name"""
        return self._breakers.get(name)

    def protect(self, name: str) -> Callable:
        """Decorator to protect a function with a circuit breaker"""
        def decorator(func: Callable) -> Callable:
            breaker = self.register(name)
            return breaker(func)
        return decorator

    def _snapshot(self) -> list:
        """Return the current (name, breaker) pairs as a list."""
        # Copy under the lock so a concurrent register() cannot change the
        # dict while a caller iterates.
        with self._lock:
            return list(self._breakers.items())

    def get_all_status(self) -> dict:
        """Get status of all circuit breakers"""
        return {name: breaker.to_dict() for name, breaker in self._snapshot()}

    def reset_all(self):
        """Reset all circuit breakers"""
        for _, breaker in self._snapshot():
            breaker.reset()

    def persist(self):
        """Persist circuit breaker states to Redis

        Best effort: does nothing without a client, and a failure is logged
        rather than raised.
        """
        if not self._redis:
            return

        try:
            data = json.dumps(self.get_all_status())
            self._redis.set("circuit_breakers:status", data)
        except Exception:
            logger.warning("Failed to persist circuit breaker status", exc_info=True)


# =============================================================================
# Pre-configured Circuit Breakers for Governance System
# =============================================================================

# Global registry
_registry = CircuitBreakerRegistry()


def get_vault_breaker() -> CircuitBreaker:
    """Get circuit breaker for Vault operations"""
    return _registry.register("vault", CircuitConfig(
        name="vault",
        failure_threshold=3,
        success_threshold=2,
        timeout_seconds=30.0,
        call_timeout=10.0
    ))


def get_dragonfly_breaker() -> CircuitBreaker:
    """Get circuit breaker for DragonflyDB operations"""
    return _registry.register("dragonfly", CircuitConfig(
        name="dragonfly",
        failure_threshold=5,
        success_threshold=3,
        timeout_seconds=15.0,
        call_timeout=5.0
    ))


def get_ledger_breaker() -> CircuitBreaker:
    """Get circuit breaker for Ledger operations"""
    return _registry.register("ledger", CircuitConfig(
        name="ledger",
        failure_threshold=3,
        success_threshold=2,
        timeout_seconds=10.0,
        call_timeout=5.0
    ))


def get_registry() -> CircuitBreakerRegistry:
    """Get the global circuit breaker registry"""
    return _registry


# =============================================================================
# CLI Interface
# =============================================================================

def main():
    """Command-line entry point: ``status``, ``reset`` or ``test``."""
    import argparse

    parser = argparse.ArgumentParser(description="Circuit Breaker CLI")
    parser.add_argument("command", choices=["status", "reset", "test"],
                        help="Command to run")
    parser.add_argument("--name", type=str, help="Circuit breaker name")
    parser.add_argument("--json", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    # Initialize default breakers
    get_vault_breaker()
    get_dragonfly_breaker()
    get_ledger_breaker()

    if args.command == "status":
        status = _registry.get_all_status()

        if args.json:
            print(json.dumps(status, indent=2))
        else:
            print("\nCircuit Breaker Status")
            print("=" * 50)
            for name, data in status.items():
                state_icon = {
                    "closed": "✅",
                    "open": "❌",
                    "half_open": "⚠️"
                }.get(data["state"], "?")

                stats = data["stats"]
                print(f"\n{state_icon} {name}: {data['state'].upper()}")
                print(f" Calls: {stats['total_calls']} total, {stats['successful_calls']} success, {stats['failed_calls']} failed")
                print(f" Success Rate: {stats['success_rate']*100:.1f}%")
                print(f" Consecutive: {stats['consecutive_failures']} failures, {stats['consecutive_successes']} successes")

    elif args.command == "reset":
        if args.name:
            breaker = _registry.get(args.name)
            if breaker:
                breaker.reset()
                print(f"Reset circuit breaker: {args.name}")
            else:
                print(f"Circuit breaker not found: {args.name}")
        else:
            _registry.reset_all()
            print("Reset all circuit breakers")

    elif args.command == "test":
        print("\nTesting circuit breaker behavior...")

        # Create a test breaker
        test_breaker = CircuitBreaker(CircuitConfig(
            name="test",
            failure_threshold=3,
            success_threshold=2,
            timeout_seconds=5.0
        ))

        # Simulate failures
        print("\n1. Simulating 3 failures to trip circuit:")
        for i in range(3):
            try:
                @test_breaker
                def failing_call():
                    raise Exception("Simulated failure")
                failing_call()
            except Exception as e:
                print(f" Call {i+1}: Failed - {e}")

        print(f"\n Circuit state: {test_breaker.state.value}")

        # Try a call when open
        print("\n2. Trying call when circuit is open:")
        try:
            @test_breaker
            def blocked_call():
                return "success"
            blocked_call()
        except CircuitOpenError as e:
            print(f" Rejected: {e}")

        print(f"\n Stats: {test_breaker.stats.to_dict()}")
        print("\nTest complete!")


if __name__ == "__main__":
    main()