agent-governance/runtime/circuit_breaker.py
profit 77655c298c Initial commit: Agent Governance System Phase 8
Phase 8 Production Hardening with complete governance infrastructure:

- Vault integration with tiered policies (T0-T4)
- DragonflyDB state management
- SQLite audit ledger
- Pipeline DSL and templates
- Promotion/revocation engine
- Checkpoint system for session persistence
- Health manager and circuit breaker for fault tolerance
- GitHub/Slack integrations
- Architectural test pipeline with bug watcher, suggestion engine, council review
- Multi-agent chaos testing framework

Test Results:
- Governance tests: 68/68 passing
- E2E workflow: 16/16 passing
- Phase 2 Vault: 14/14 passing
- Integration tests: 27/27 passing

Coverage: 57.6% average across 12 phases

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 22:07:06 -05:00

466 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Circuit Breaker
===============
Implements circuit breaker pattern for graceful degradation when dependencies fail.
States:
- CLOSED: Normal operation, requests flow through
- OPEN: Circuit tripped, requests fail fast
- HALF_OPEN: Testing if dependency recovered
Part of Phase 8: Production Hardening.
"""
import json
import logging
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from functools import wraps
from typing import Any, Callable, Optional, TypeVar, Generic

import redis
class CircuitState(str, Enum):
    """The three positions a circuit breaker can be in.

    Members are also plain strings, so they compare equal to their values
    and serialize directly to JSON.
    """

    # Calls are passed through to the dependency.
    CLOSED = "closed"
    # Calls are rejected immediately without touching the dependency.
    OPEN = "open"
    # A limited number of probe calls are let through to test recovery.
    HALF_OPEN = "half_open"
class FailureType(str, Enum):
    """Categories a recorded failure can be tagged with.

    Members are also plain strings and compare equal to their values.
    """

    TIMEOUT = "timeout"
    CONNECTION_ERROR = "connection_error"
    EXCEPTION = "exception"
    THRESHOLD_EXCEEDED = "threshold_exceeded"
@dataclass
class CircuitStats:
    """Running counters and timestamps kept for one circuit breaker."""

    # Call counters.
    total_calls: int = 0
    successful_calls: int = 0
    failed_calls: int = 0
    rejected_calls: int = 0
    # Timestamps of the most recent failure / success, stored as strings.
    last_failure_time: Optional[str] = None
    last_success_time: Optional[str] = None
    # Lengths of the current failure / success streaks.
    consecutive_failures: int = 0
    consecutive_successes: int = 0

    @property
    def success_rate(self) -> float:
        """Return successful_calls / total_calls, or 1.0 when no calls are recorded."""
        return self.successful_calls / self.total_calls if self.total_calls else 1.0

    def to_dict(self) -> dict:
        """Return every counter and timestamp plus the derived ``success_rate``."""
        exported = (
            "total_calls",
            "successful_calls",
            "failed_calls",
            "rejected_calls",
            "last_failure_time",
            "last_success_time",
            "consecutive_failures",
            "consecutive_successes",
        )
        snapshot = {key: getattr(self, key) for key in exported}
        snapshot["success_rate"] = self.success_rate
        return snapshot
@dataclass
class CircuitConfig:
    """Tunable thresholds and timeouts for a single circuit breaker."""

    # Identifier of the breaker; used in error messages and status output.
    name: str
    # Consecutive failures in the closed state that trip the circuit open.
    failure_threshold: int = 5
    # Consecutive successes in the half-open state that close the circuit.
    success_threshold: int = 3
    # Seconds to stay open before moving to half-open.
    timeout_seconds: float = 30.0
    # Intended timeout for an individual call, in seconds.
    # NOTE(review): no code visible in this file reads this value - confirm
    # where (or whether) it is enforced.
    call_timeout: float = 10.0
    # Upper bound on calls admitted while half-open.
    half_open_max_calls: int = 3
class CircuitOpenError(Exception):
    """Signals that a call was refused because the circuit is not accepting it.

    Attributes:
        circuit_name: Name of the breaker that refused the call.
        retry_after: Seconds the caller should wait before trying again.
    """

    def __init__(self, circuit_name: str, retry_after: float):
        message = f"Circuit '{circuit_name}' is open. Retry after {retry_after:.1f}s"
        super().__init__(message)
        self.circuit_name = circuit_name
        self.retry_after = retry_after
# Return type of the zero-argument callable handed to CircuitBreaker.execute.
T = TypeVar('T')
class CircuitBreaker:
    """
    Circuit breaker implementation for a single dependency.

    The breaker starts CLOSED. After ``config.failure_threshold`` consecutive
    failures it becomes OPEN and rejects calls with ``CircuitOpenError``. Once
    ``config.timeout_seconds`` have elapsed since the last failure it becomes
    HALF_OPEN and admits at most ``config.half_open_max_calls`` probe calls at
    a time. ``config.success_threshold`` consecutive successful probes close
    the circuit again; any failed probe re-opens it.

    Rejected calls are counted in ``stats.total_calls`` and therefore lower
    ``stats.success_rate``. ``config.call_timeout`` is not enforced by this
    class.

    Usage:
        breaker = CircuitBreaker(CircuitConfig(name="vault"))

        @breaker
        def call_vault():
            # ... make vault request
            pass

        # Or manual:
        with breaker:
            # ... protected code
    """

    def __init__(self, config: CircuitConfig):
        self.config = config
        self._state = CircuitState.CLOSED
        self._stats = CircuitStats()
        # time.time() of the most recent failure (or force_open); drives the
        # OPEN -> HALF_OPEN timeout.
        self._last_failure_time: Optional[float] = None
        # Probe calls currently admitted while HALF_OPEN.
        self._half_open_calls = 0
        # Re-entrant because _can_execute reads the `state` property while
        # already holding the lock.
        self._lock = threading.RLock()

    @property
    def state(self) -> CircuitState:
        """Current state; reading it performs the OPEN -> HALF_OPEN transition when due."""
        with self._lock:
            if self._state == CircuitState.OPEN and self._last_failure_time is not None:
                elapsed = time.time() - self._last_failure_time
                if elapsed >= self.config.timeout_seconds:
                    self._state = CircuitState.HALF_OPEN
                    self._half_open_calls = 0
                    # Start the recovery streak from zero so a streak left over
                    # from before a force_open() cannot close the circuit on
                    # the very first probe.
                    self._stats.consecutive_successes = 0
            return self._state

    @property
    def stats(self) -> CircuitStats:
        """The live statistics object (not a copy)."""
        return self._stats

    def _now(self) -> str:
        """Current UTC time as an ISO-8601 string."""
        return datetime.now(timezone.utc).isoformat()

    def _release_half_open_slot(self):
        """Give back a probe slot taken in _can_execute. Caller must hold the lock."""
        # Clamped at zero: a call admitted before the circuit went half-open
        # may finish during half-open without ever having taken a slot.
        if self._state == CircuitState.HALF_OPEN and self._half_open_calls > 0:
            self._half_open_calls -= 1

    def _record_success(self):
        """Record a successful call and close the circuit if recovery is confirmed."""
        with self._lock:
            self._stats.total_calls += 1
            self._stats.successful_calls += 1
            self._stats.consecutive_successes += 1
            self._stats.consecutive_failures = 0
            self._stats.last_success_time = self._now()
            if self._state == CircuitState.HALF_OPEN:
                # The probe is finished; free its slot so further probes can
                # run. Without this the breaker stays half-open and rejects
                # everything once half_open_max_calls probes have completed
                # without reaching success_threshold.
                self._release_half_open_slot()
                if self._stats.consecutive_successes >= self.config.success_threshold:
                    self._state = CircuitState.CLOSED
                    self._stats.consecutive_successes = 0

    def _record_failure(self, failure_type: FailureType = FailureType.EXCEPTION):
        """Record a failed call and open the circuit when warranted.

        Args:
            failure_type: Category of the failure. Accepted for API
                compatibility; it is not currently stored or inspected.
        """
        with self._lock:
            self._stats.total_calls += 1
            self._stats.failed_calls += 1
            self._stats.consecutive_failures += 1
            self._stats.consecutive_successes = 0
            self._stats.last_failure_time = self._now()
            self._last_failure_time = time.time()
            if self._state == CircuitState.CLOSED:
                if self._stats.consecutive_failures >= self.config.failure_threshold:
                    self._state = CircuitState.OPEN
            elif self._state == CircuitState.HALF_OPEN:
                # Any failure in half-open goes back to open. The slot counter
                # is reset on the next OPEN -> HALF_OPEN transition.
                self._state = CircuitState.OPEN

    def _record_rejection(self):
        """Record a call that was refused without being executed."""
        with self._lock:
            self._stats.total_calls += 1
            self._stats.rejected_calls += 1

    def _can_execute(self) -> bool:
        """Decide whether a call may run now, taking a probe slot when half-open."""
        # One lock acquisition for the state check and the slot reservation,
        # so the state cannot change between the two.
        with self._lock:
            state = self.state  # This may transition OPEN -> HALF_OPEN
            if state == CircuitState.CLOSED:
                return True
            if (
                state == CircuitState.HALF_OPEN
                and self._half_open_calls < self.config.half_open_max_calls
            ):
                self._half_open_calls += 1
                return True
            return False

    def _get_retry_after(self) -> float:
        """Seconds left until the open timeout elapses (0 if it already has)."""
        with self._lock:
            if self._last_failure_time is None:
                return 0.0
            elapsed = time.time() - self._last_failure_time
            return max(0.0, self.config.timeout_seconds - elapsed)

    def execute(self, func: Callable[[], T]) -> T:
        """Run ``func`` under circuit breaker protection.

        Args:
            func: Zero-argument callable to invoke.

        Returns:
            Whatever ``func`` returns.

        Raises:
            CircuitOpenError: The call was rejected without invoking ``func``.
            Exception: Anything ``func`` raises is recorded as a failure and
                re-raised unchanged.
        """
        if not self._can_execute():
            self._record_rejection()
            raise CircuitOpenError(self.config.name, self._get_retry_after())
        try:
            result = func()
        except Exception:
            self._record_failure()
            raise
        except BaseException:
            # KeyboardInterrupt / SystemExit are not dependency failures, but
            # the probe slot must still be returned.
            with self._lock:
                self._release_half_open_slot()
            raise
        self._record_success()
        return result

    def __call__(self, func: Callable) -> Callable:
        """Decorator usage: wrap ``func`` so every call goes through ``execute``."""
        @wraps(func)
        def wrapper(*args, **kwargs):
            return self.execute(lambda: func(*args, **kwargs))
        return wrapper

    def __enter__(self):
        """Context manager entry; raises CircuitOpenError if the call is rejected."""
        if not self._can_execute():
            self._record_rejection()
            raise CircuitOpenError(self.config.name, self._get_retry_after())
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: record success or failure of the guarded block."""
        if exc_type is None:
            self._record_success()
        else:
            self._record_failure()
        return False  # Don't suppress exceptions

    def reset(self):
        """Manually reset the circuit breaker to CLOSED with fresh statistics."""
        with self._lock:
            self._state = CircuitState.CLOSED
            self._stats = CircuitStats()
            self._last_failure_time = None
            self._half_open_calls = 0

    def force_open(self):
        """Manually open the circuit; the open timeout starts counting from now."""
        with self._lock:
            self._state = CircuitState.OPEN
            self._last_failure_time = time.time()

    def to_dict(self) -> dict:
        """Get circuit breaker name, state, statistics and thresholds as a dict."""
        return {
            "name": self.config.name,
            "state": self.state.value,
            "stats": self._stats.to_dict(),
            "config": {
                "failure_threshold": self.config.failure_threshold,
                "success_threshold": self.config.success_threshold,
                "timeout_seconds": self.config.timeout_seconds
            }
        }
class CircuitBreakerRegistry:
    """
    Registry for managing multiple circuit breakers.

    All access to the internal name -> breaker mapping is serialized by a
    lock, so breakers may be registered while another thread is reading or
    resetting the registry.

    Usage:
        registry = CircuitBreakerRegistry()
        registry.register("vault", CircuitConfig(name="vault", failure_threshold=3))

        @registry.protect("vault")
        def call_vault():
            # ...
    """

    def __init__(self, redis_client: Optional[redis.Redis] = None):
        self._breakers: dict[str, CircuitBreaker] = {}
        self._redis = redis_client
        self._lock = threading.RLock()

    def register(self, name: str, config: Optional[CircuitConfig] = None) -> CircuitBreaker:
        """Register a circuit breaker, or return the one already registered.

        Args:
            name: Registry key for the breaker.
            config: Configuration for a newly created breaker; defaults to
                ``CircuitConfig(name=name)``. Ignored when ``name`` is
                already registered.

        Returns:
            The breaker stored under ``name``.
        """
        with self._lock:
            if name not in self._breakers:
                cfg = config or CircuitConfig(name=name)
                self._breakers[name] = CircuitBreaker(cfg)
            return self._breakers[name]

    def get(self, name: str) -> Optional[CircuitBreaker]:
        """Get a circuit breaker by name, or None if it is not registered."""
        with self._lock:
            return self._breakers.get(name)

    def protect(self, name: str) -> Callable:
        """Decorator to protect a function with the circuit breaker called ``name``.

        The breaker is registered with default configuration at decoration
        time if it does not exist yet.
        """
        def decorator(func: Callable) -> Callable:
            breaker = self.register(name)
            return breaker(func)
        return decorator

    def _snapshot(self) -> dict:
        """Copy the name -> breaker mapping under the lock.

        Iterating the copy avoids "dictionary changed size during iteration"
        when another thread registers a breaker concurrently.
        """
        with self._lock:
            return dict(self._breakers)

    def get_all_status(self) -> dict:
        """Get status of all circuit breakers, keyed by name."""
        return {
            name: breaker.to_dict()
            for name, breaker in self._snapshot().items()
        }

    def reset_all(self):
        """Reset all circuit breakers"""
        for breaker in self._snapshot().values():
            breaker.reset()

    def persist(self):
        """Persist circuit breaker states to Redis (best effort).

        Does nothing when the registry has no Redis client. A failure to
        serialize or store the status is logged as a warning and not raised.
        """
        if self._redis is None:
            return
        try:
            data = json.dumps(self.get_all_status())
            self._redis.set("circuit_breakers:status", data)
        except Exception:
            # Persistence must never break the caller, but the failure should
            # be visible rather than silently dropped.
            logging.getLogger(__name__).warning(
                "Failed to persist circuit breaker status to Redis",
                exc_info=True,
            )
# =============================================================================
# Pre-configured Circuit Breakers for Governance System
# =============================================================================
# Global registry shared by the get_*_breaker() helpers, get_registry() and the
# CLI. It is created without a Redis client, so persist() on it returns early.
_registry = CircuitBreakerRegistry()
def get_vault_breaker() -> CircuitBreaker:
    """Return the "vault" breaker from the global registry.

    The breaker is created with the configuration below the first time the
    name is registered; later calls return the already registered breaker.
    """
    vault_config = CircuitConfig(
        name="vault",
        failure_threshold=3,
        success_threshold=2,
        timeout_seconds=30.0,
        call_timeout=10.0,
    )
    return _registry.register("vault", vault_config)
def get_dragonfly_breaker() -> CircuitBreaker:
    """Return the "dragonfly" breaker from the global registry.

    The breaker is created with the configuration below the first time the
    name is registered; later calls return the already registered breaker.
    """
    dragonfly_config = CircuitConfig(
        name="dragonfly",
        failure_threshold=5,
        success_threshold=3,
        timeout_seconds=15.0,
        call_timeout=5.0,
    )
    return _registry.register("dragonfly", dragonfly_config)
def get_ledger_breaker() -> CircuitBreaker:
    """Return the "ledger" breaker from the global registry.

    The breaker is created with the configuration below the first time the
    name is registered; later calls return the already registered breaker.
    """
    ledger_config = CircuitConfig(
        name="ledger",
        failure_threshold=3,
        success_threshold=2,
        timeout_seconds=10.0,
        call_timeout=5.0,
    )
    return _registry.register("ledger", ledger_config)
def get_registry() -> CircuitBreakerRegistry:
    """Return the module-level registry that holds the pre-configured breakers."""
    shared_registry = _registry
    return shared_registry
# =============================================================================
# CLI Interface
# =============================================================================
def main():
    """Command-line entry point.

    Commands:
        status: Print the status of the pre-configured breakers (``--json``
            for machine-readable output).
        reset:  Reset one breaker (``--name``) or all of them.
        test:   Run a short self-contained demonstration on a throwaway breaker.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Circuit Breaker CLI")
    cli.add_argument("command", choices=["status", "reset", "test"],
                     help="Command to run")
    cli.add_argument("--name", type=str, help="Circuit breaker name")
    cli.add_argument("--json", action="store_true", help="Output as JSON")
    args = cli.parse_args()

    # Make sure the default breakers exist in the global registry.
    for ensure_breaker in (get_vault_breaker, get_dragonfly_breaker, get_ledger_breaker):
        ensure_breaker()

    if args.command == "status":
        status = _registry.get_all_status()
        if args.json:
            print(json.dumps(status, indent=2))
            return

        icons = {
            "closed": "",
            "open": "",
            "half_open": "⚠️"
        }
        print("\nCircuit Breaker Status")
        print("=" * 50)
        for name, data in status.items():
            state_icon = icons.get(data["state"], "?")
            stats = data["stats"]
            print(f"\n{state_icon} {name}: {data['state'].upper()}")
            print(f" Calls: {stats['total_calls']} total, {stats['successful_calls']} success, {stats['failed_calls']} failed")
            print(f" Success Rate: {stats['success_rate']*100:.1f}%")
            print(f" Consecutive: {stats['consecutive_failures']} failures, {stats['consecutive_successes']} successes")
        return

    if args.command == "reset":
        if not args.name:
            _registry.reset_all()
            print("Reset all circuit breakers")
            return
        breaker = _registry.get(args.name)
        if breaker:
            breaker.reset()
            print(f"Reset circuit breaker: {args.name}")
        else:
            print(f"Circuit breaker not found: {args.name}")
        return

    if args.command == "test":
        print("\nTesting circuit breaker behavior...")

        # Throwaway breaker, independent of the global registry.
        demo_config = CircuitConfig(
            name="test",
            failure_threshold=3,
            success_threshold=2,
            timeout_seconds=5.0
        )
        test_breaker = CircuitBreaker(demo_config)

        @test_breaker
        def failing_call():
            raise Exception("Simulated failure")

        @test_breaker
        def blocked_call():
            return "success"

        # Three consecutive failures reach failure_threshold and trip the circuit.
        print("\n1. Simulating 3 failures to trip circuit:")
        for attempt in range(1, 4):
            try:
                failing_call()
            except Exception as e:
                print(f" Call {attempt}: Failed - {e}")
        print(f"\n Circuit state: {test_breaker.state.value}")

        # With the circuit open the call is rejected before it runs.
        print("\n2. Trying call when circuit is open:")
        try:
            blocked_call()
        except CircuitOpenError as e:
            print(f" Rejected: {e}")

        print(f"\n Stats: {test_breaker.stats.to_dict()}")
        print("\nTest complete!")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()