Phase 8 Production Hardening with complete governance infrastructure: - Vault integration with tiered policies (T0-T4) - DragonflyDB state management - SQLite audit ledger - Pipeline DSL and templates - Promotion/revocation engine - Checkpoint system for session persistence - Health manager and circuit breaker for fault tolerance - GitHub/Slack integrations - Architectural test pipeline with bug watcher, suggestion engine, council review - Multi-agent chaos testing framework Test Results: - Governance tests: 68/68 passing - E2E workflow: 16/16 passing - Phase 2 Vault: 14/14 passing - Integration tests: 27/27 passing Coverage: 57.6% average across 12 phases Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
466 lines
15 KiB
Python
466 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
Circuit Breaker
===============
Implements the circuit breaker pattern for graceful degradation when dependencies fail.

States:
- CLOSED: Normal operation, requests flow through
- OPEN: Circuit tripped, requests fail fast
- HALF_OPEN: Testing if dependency recovered

Part of Phase 8: Production Hardening.
"""
|
|
|
|
import json
|
|
import time
|
|
import threading
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from enum import Enum
|
|
from functools import wraps
|
|
from typing import Any, Callable, Optional, TypeVar, Generic
|
|
import redis
|
|
|
|
|
|
class CircuitState(str, Enum):
    """
    The three states a circuit breaker can be in.

    Members are also plain strings (the class mixes in ``str``), so they
    compare equal to their values and serialize to JSON directly.
    """

    # Normal operation: calls are passed through to the dependency.
    CLOSED = "closed"
    # Tripped: calls are rejected immediately without touching the dependency.
    OPEN = "open"
    # Recovery probe: a limited number of calls are let through.
    HALF_OPEN = "half_open"
|
|
|
|
|
|
class FailureType(str, Enum):
    """
    Categories of failure a circuit breaker can be told about.

    Members are also plain strings (the class mixes in ``str``).
    """

    TIMEOUT = "timeout"
    CONNECTION_ERROR = "connection_error"
    EXCEPTION = "exception"
    THRESHOLD_EXCEEDED = "threshold_exceeded"
|
|
|
|
|
|
@dataclass
class CircuitStats:
    """
    Running counters for one circuit breaker.

    The ``last_*_time`` fields hold strings (or ``None`` until the first
    event); the remaining fields are integer counters starting at zero.
    """

    total_calls: int = 0
    successful_calls: int = 0
    failed_calls: int = 0
    rejected_calls: int = 0
    last_failure_time: Optional[str] = None
    last_success_time: Optional[str] = None
    consecutive_failures: int = 0
    consecutive_successes: int = 0

    @property
    def success_rate(self) -> float:
        """Fraction of calls that succeeded; 1.0 when no calls were recorded."""
        return self.successful_calls / self.total_calls if self.total_calls else 1.0

    def to_dict(self) -> dict:
        """Return every counter plus the derived ``success_rate`` as a dict."""
        exported = (
            "total_calls",
            "successful_calls",
            "failed_calls",
            "rejected_calls",
            "last_failure_time",
            "last_success_time",
            "consecutive_failures",
            "consecutive_successes",
        )
        snapshot = {key: getattr(self, key) for key in exported}
        snapshot["success_rate"] = self.success_rate
        return snapshot
|
|
|
|
|
|
@dataclass
class CircuitConfig:
    """
    Tunable thresholds and timers for one circuit breaker.

    Only ``name`` is required; every other field has a default.
    """

    # Identifier used in error messages and status output.
    name: str
    # Consecutive failures (while closed) that trip the circuit open.
    failure_threshold: int = 5
    # Consecutive successes (while half-open) that close the circuit again.
    success_threshold: int = 3
    # Seconds an open circuit waits before moving to half-open.
    timeout_seconds: float = 30.0
    # Intended timeout for individual calls.
    # NOTE(review): nothing in this file enforces it - confirm callers apply it.
    call_timeout: float = 10.0
    # Max concurrent calls allowed while half-open.
    half_open_max_calls: int = 3
|
|
|
|
|
|
class CircuitOpenError(Exception):
    """
    Signals that a call was rejected because the circuit is not accepting it.

    Attributes:
        circuit_name: Name of the circuit that rejected the call.
        retry_after: Seconds the caller should wait before retrying.
    """

    def __init__(self, circuit_name: str, retry_after: float):
        message = f"Circuit '{circuit_name}' is open. Retry after {retry_after:.1f}s"
        super().__init__(message)
        self.circuit_name = circuit_name
        self.retry_after = retry_after
|
|
|
|
|
|
# Generic placeholder for the return type of a protected callable.
T = TypeVar("T")
|
|
|
|
|
|
class CircuitBreaker:
    """
    Circuit breaker implementation for a single dependency.

    Lifecycle:
    - CLOSED: calls run; ``config.failure_threshold`` consecutive failures
      open the circuit.
    - OPEN: calls are rejected with ``CircuitOpenError`` until
      ``config.timeout_seconds`` have passed since the last recorded failure
      (or ``force_open()``), at which point the circuit becomes HALF_OPEN.
    - HALF_OPEN: at most ``config.half_open_max_calls`` calls may be in
      flight at once. ``config.success_threshold`` consecutive successes
      close the circuit; any failure re-opens it.

    State and counters are mutated only while holding an internal
    re-entrant lock. ``config.call_timeout`` is not enforced by this class.

    Usage:
        breaker = CircuitBreaker(CircuitConfig(name="vault"))

        @breaker
        def call_vault():
            # ... make vault request
            pass

        # Or manual:
        with breaker:
            # ... protected code
    """

    def __init__(self, config: CircuitConfig):
        self.config = config
        self._state = CircuitState.CLOSED
        self._stats = CircuitStats()
        # Wall-clock timestamp (time.time()) of the last failure / force_open.
        self._last_failure_time: Optional[float] = None
        # Number of probe calls currently admitted while HALF_OPEN.
        self._half_open_calls = 0
        self._lock = threading.RLock()

    @property
    def state(self) -> CircuitState:
        """
        Current state.

        Reading this property performs the OPEN -> HALF_OPEN transition once
        the open timeout has elapsed.
        """
        with self._lock:
            if (
                self._state == CircuitState.OPEN
                and self._last_failure_time is not None
                and time.time() - self._last_failure_time >= self.config.timeout_seconds
            ):
                self._state = CircuitState.HALF_OPEN
                self._half_open_calls = 0
                # Start the recovery streak from zero. After force_open() the
                # counter may still hold successes from before the circuit
                # was opened, which would close it after a single probe.
                self._stats.consecutive_successes = 0
            return self._state

    @property
    def stats(self) -> CircuitStats:
        """The live statistics object (not a copy)."""
        return self._stats

    def _now(self) -> str:
        """Current UTC time as an ISO-8601 string."""
        return datetime.now(timezone.utc).isoformat()

    def _release_half_open_slot(self):
        """Free one half-open probe slot. Caller must hold the lock."""
        if self._half_open_calls > 0:
            self._half_open_calls -= 1

    def _record_success(self):
        """Record a successful call"""
        with self._lock:
            stats = self._stats
            stats.total_calls += 1
            stats.successful_calls += 1
            stats.consecutive_successes += 1
            stats.consecutive_failures = 0
            stats.last_success_time = self._now()

            if self._state == CircuitState.HALF_OPEN:
                # The probe finished, so its slot becomes available again.
                # Without this the circuit gets stuck in HALF_OPEN whenever
                # success_threshold exceeds half_open_max_calls.
                self._release_half_open_slot()
                if stats.consecutive_successes >= self.config.success_threshold:
                    self._state = CircuitState.CLOSED
                    stats.consecutive_successes = 0
                    self._half_open_calls = 0

    def _record_failure(self, failure_type: FailureType = FailureType.EXCEPTION):
        """
        Record a failed call

        ``failure_type`` is accepted for callers that classify failures; it
        does not currently influence the state machine.
        """
        with self._lock:
            stats = self._stats
            stats.total_calls += 1
            stats.failed_calls += 1
            stats.consecutive_failures += 1
            stats.consecutive_successes = 0
            stats.last_failure_time = self._now()
            self._last_failure_time = time.time()

            if self._state == CircuitState.CLOSED:
                if stats.consecutive_failures >= self.config.failure_threshold:
                    self._state = CircuitState.OPEN
            elif self._state == CircuitState.HALF_OPEN:
                # Any failure in half-open goes back to open
                self._state = CircuitState.OPEN
                self._half_open_calls = 0

    def _record_rejection(self):
        """Record a rejected call (circuit open)"""
        with self._lock:
            self._stats.total_calls += 1
            self._stats.rejected_calls += 1

    def _can_execute(self) -> bool:
        """
        Check if a call can be executed

        In HALF_OPEN this also claims a probe slot when it returns True.
        The state read and the slot claim happen under one lock acquisition.
        """
        with self._lock:
            current = self.state  # This may transition OPEN -> HALF_OPEN

            if current == CircuitState.CLOSED:
                return True

            if (
                current == CircuitState.HALF_OPEN
                and self._half_open_calls < self.config.half_open_max_calls
            ):
                self._half_open_calls += 1
                return True

            # OPEN, or HALF_OPEN with every probe slot in use.
            return False

    def _get_retry_after(self) -> float:
        """Get seconds until circuit might close"""
        with self._lock:
            last_failure = self._last_failure_time
        if last_failure is None:
            return 0.0
        remaining = self.config.timeout_seconds - (time.time() - last_failure)
        return max(0.0, float(remaining))

    def execute(self, func: Callable[[], T]) -> T:
        """
        Execute a function with circuit breaker protection

        Args:
            func: Zero-argument callable to run.

        Returns:
            Whatever ``func`` returns.

        Raises:
            CircuitOpenError: The call was rejected without running ``func``.
            Exception: Anything ``func`` raises is recorded as a failure and
                re-raised unchanged.
        """
        if not self._can_execute():
            self._record_rejection()
            raise CircuitOpenError(self.config.name, self._get_retry_after())

        try:
            result = func()
        except Exception:
            self._record_failure()
            raise
        self._record_success()
        return result

    def __call__(self, func: Callable) -> Callable:
        """Decorator usage"""
        @wraps(func)
        def wrapper(*args, **kwargs):
            return self.execute(lambda: func(*args, **kwargs))
        return wrapper

    def __enter__(self):
        """
        Context manager entry

        Raises:
            CircuitOpenError: The circuit is not accepting calls.
        """
        if not self._can_execute():
            self._record_rejection()
            raise CircuitOpenError(self.config.name, self._get_retry_after())
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        if exc_type is None:
            self._record_success()
        else:
            self._record_failure()
        return False  # Don't suppress exceptions

    def reset(self):
        """Manually reset the circuit breaker"""
        with self._lock:
            self._state = CircuitState.CLOSED
            self._stats = CircuitStats()
            self._last_failure_time = None
            self._half_open_calls = 0

    def force_open(self):
        """Manually open the circuit"""
        with self._lock:
            self._state = CircuitState.OPEN
            # Restart the open timer from now.
            self._last_failure_time = time.time()

    def to_dict(self) -> dict:
        """Get circuit breaker state as dict"""
        with self._lock:
            return {
                "name": self.config.name,
                "state": self.state.value,
                "stats": self._stats.to_dict(),
                "config": {
                    "failure_threshold": self.config.failure_threshold,
                    "success_threshold": self.config.success_threshold,
                    "timeout_seconds": self.config.timeout_seconds
                }
            }
|
|
|
|
|
|
class CircuitBreakerRegistry:
    """
    Registry for managing multiple circuit breakers.

    Breakers are keyed by name. Access to the internal mapping is guarded
    by a re-entrant lock.

    Usage:
        registry = CircuitBreakerRegistry()
        registry.register("vault", CircuitConfig(name="vault", failure_threshold=3))

        @registry.protect("vault")
        def call_vault():
            # ...
    """

    def __init__(self, redis_client: Optional[redis.Redis] = None):
        self._breakers: dict[str, CircuitBreaker] = {}
        # Optional client used only by persist(); None disables persistence.
        self._redis = redis_client
        self._lock = threading.RLock()

    def register(self, name: str, config: Optional[CircuitConfig] = None) -> CircuitBreaker:
        """
        Register a circuit breaker

        If ``name`` is already registered the existing breaker is returned
        and ``config`` is ignored. Otherwise a breaker is created from
        ``config``, or from ``CircuitConfig(name=name)`` when none is given.
        """
        with self._lock:
            if name not in self._breakers:
                cfg = config or CircuitConfig(name=name)
                self._breakers[name] = CircuitBreaker(cfg)
            return self._breakers[name]

    def get(self, name: str) -> Optional[CircuitBreaker]:
        """Get a circuit breaker by name"""
        with self._lock:
            return self._breakers.get(name)

    def protect(self, name: str) -> Callable:
        """
        Decorator to protect a function with a circuit breaker

        The breaker is looked up (or registered with the default config)
        when the decorator is applied, not on each call.
        """
        def decorator(func: Callable) -> Callable:
            breaker = self.register(name)
            return breaker(func)
        return decorator

    def _snapshot(self) -> list:
        """Copy the (name, breaker) pairs under the lock for safe iteration."""
        with self._lock:
            return list(self._breakers.items())

    def get_all_status(self) -> dict:
        """Get status of all circuit breakers"""
        # Iterate a snapshot so a concurrent register() cannot resize the
        # dict mid-iteration.
        return {name: breaker.to_dict() for name, breaker in self._snapshot()}

    def reset_all(self):
        """Reset all circuit breakers"""
        for _name, breaker in self._snapshot():
            breaker.reset()

    def persist(self):
        """
        Persist circuit breaker states to Redis

        Does nothing when no Redis client was supplied. Persistence is best
        effort: failures are logged as warnings and never raised.
        """
        if self._redis is None:
            return

        import logging  # local import, matching main()'s style in this file

        try:
            data = json.dumps(self.get_all_status())
            self._redis.set("circuit_breakers:status", data)
        except Exception:
            # Deliberately best-effort, but leave a trace instead of
            # failing silently.
            logging.getLogger(__name__).warning(
                "Failed to persist circuit breaker status to Redis",
                exc_info=True,
            )
|
|
|
|
|
|
# =============================================================================
|
|
# Pre-configured Circuit Breakers for Governance System
|
|
# =============================================================================
|
|
|
|
# Module-wide registry shared by the get_*_breaker() helpers, get_registry()
# and the CLI. It is created without a Redis client.
_registry = CircuitBreakerRegistry()
|
|
|
|
|
|
def get_vault_breaker() -> CircuitBreaker:
    """
    Return the shared "vault" circuit breaker from the global registry.

    Requests registration with: 3 failures to open, 2 successes to close,
    30s open timeout, 10s call timeout.
    """
    vault_config = CircuitConfig(
        name="vault",
        failure_threshold=3,
        success_threshold=2,
        timeout_seconds=30.0,
        call_timeout=10.0,
    )
    return _registry.register("vault", vault_config)
|
|
|
|
|
|
def get_dragonfly_breaker() -> CircuitBreaker:
    """
    Return the shared "dragonfly" circuit breaker from the global registry.

    Requests registration with: 5 failures to open, 3 successes to close,
    15s open timeout, 5s call timeout.
    """
    dragonfly_config = CircuitConfig(
        name="dragonfly",
        failure_threshold=5,
        success_threshold=3,
        timeout_seconds=15.0,
        call_timeout=5.0,
    )
    return _registry.register("dragonfly", dragonfly_config)
|
|
|
|
|
|
def get_ledger_breaker() -> CircuitBreaker:
    """
    Return the shared "ledger" circuit breaker from the global registry.

    Requests registration with: 3 failures to open, 2 successes to close,
    10s open timeout, 5s call timeout.
    """
    ledger_config = CircuitConfig(
        name="ledger",
        failure_threshold=3,
        success_threshold=2,
        timeout_seconds=10.0,
        call_timeout=5.0,
    )
    return _registry.register("ledger", ledger_config)
|
|
|
|
|
|
def get_registry() -> CircuitBreakerRegistry:
    """Return the module-level registry that holds the shared breakers."""
    return _registry
|
|
|
|
|
|
# =============================================================================
|
|
# CLI Interface
|
|
# =============================================================================
|
|
|
|
def _cli_status(as_json):
    """Print the status of every breaker in the global registry."""
    status = _registry.get_all_status()
    if as_json:
        print(json.dumps(status, indent=2))
        return

    icons = {
        "closed": "✅",
        "open": "❌",
        "half_open": "⚠️"
    }
    print("\nCircuit Breaker Status")
    print("=" * 50)
    for name, data in status.items():
        stats = data["stats"]
        state_icon = icons.get(data["state"], "?")
        print(f"\n{state_icon} {name}: {data['state'].upper()}")
        print(f" Calls: {stats['total_calls']} total, {stats['successful_calls']} success, {stats['failed_calls']} failed")
        print(f" Success Rate: {stats['success_rate']*100:.1f}%")
        print(f" Consecutive: {stats['consecutive_failures']} failures, {stats['consecutive_successes']} successes")


def _cli_reset(name):
    """Reset one named breaker, or all of them when no name is given."""
    if not name:
        _registry.reset_all()
        print("Reset all circuit breakers")
        return

    breaker = _registry.get(name)
    if not breaker:
        print(f"Circuit breaker not found: {name}")
        return
    breaker.reset()
    print(f"Reset circuit breaker: {name}")


def _cli_self_test():
    """Trip a throwaway breaker and show that it rejects the next call."""
    print("\nTesting circuit breaker behavior...")

    # Create a test breaker
    test_breaker = CircuitBreaker(CircuitConfig(
        name="test",
        failure_threshold=3,
        success_threshold=2,
        timeout_seconds=5.0
    ))

    @test_breaker
    def failing_call():
        raise Exception("Simulated failure")

    @test_breaker
    def blocked_call():
        return "success"

    # Simulate failures
    print("\n1. Simulating 3 failures to trip circuit:")
    for attempt in range(1, 4):
        try:
            failing_call()
        except Exception as exc:
            print(f" Call {attempt}: Failed - {exc}")

    print(f"\n Circuit state: {test_breaker.state.value}")

    # Try a call when open
    print("\n2. Trying call when circuit is open:")
    try:
        blocked_call()
    except CircuitOpenError as exc:
        print(f" Rejected: {exc}")

    print(f"\n Stats: {test_breaker.stats.to_dict()}")
    print("\nTest complete!")


def main():
    """
    Command-line entry point: ``status``, ``reset`` or ``test``.

    ``status`` and ``reset`` act on the module-level ``_registry`` after the
    three default breakers have been registered in this process; ``test``
    runs a self-contained demonstration on a separate breaker.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Circuit Breaker CLI")
    parser.add_argument("command", choices=["status", "reset", "test"],
                        help="Command to run")
    parser.add_argument("--name", type=str, help="Circuit breaker name")
    parser.add_argument("--json", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    # Initialize default breakers
    get_vault_breaker()
    get_dragonfly_breaker()
    get_ledger_breaker()

    # argparse's ``choices`` guarantees the command is one of these keys.
    handlers = {
        "status": lambda: _cli_status(args.json),
        "reset": lambda: _cli_reset(args.name),
        "test": _cli_self_test,
    }
    handlers[args.command]()
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|