#!/usr/bin/env python3
"""
Chaos Testing Framework for Agent Governance

Tests agent resilience under adverse conditions:
- Random failures
- Network issues (simulated)
- Token revocations
- Lock timeouts
- State corruption
- Resource exhaustion
"""

import asyncio
import json
import random
import time
import sqlite3
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import Dict, List, Any, Optional, Callable, Tuple

import redis
import threading

REDIS_HOST = "127.0.0.1"
REDIS_PORT = 6379
# NOTE(review): credential is hard-coded in source; consider loading it from
# the environment or a secrets store.
REDIS_PASSWORD = "governance2026"
LEDGER_PATH = Path("/opt/agent-governance/ledger/governance.db")


def _make_redis_client() -> "redis.Redis":
    """Build a Redis client from the module-level connection constants.

    ``decode_responses=True`` makes every reply a ``str``, which the
    comparisons in this module (e.g. ``lock == agent_id``) rely on.
    """
    return redis.Redis(
        host=REDIS_HOST,
        port=REDIS_PORT,
        password=REDIS_PASSWORD,
        decode_responses=True,
    )


class ChaosType(Enum):
    """Types of chaos that can be injected"""

    REVOKE_TOKEN = "revoke_token"
    EXPIRE_LOCK = "expire_lock"
    CORRUPT_STATE = "corrupt_state"
    DELAY_RESPONSE = "delay_response"
    RANDOM_ERROR = "random_error"
    HEARTBEAT_TIMEOUT = "heartbeat_timeout"
    ERROR_BUDGET_EXCEED = "error_budget_exceed"


@dataclass
class ChaosEvent:
    """A chaos event that was injected"""

    chaos_type: ChaosType
    target_agent: str
    # Naive UTC timestamp taken when the event object was created.
    timestamp: datetime
    # Extra keyword arguments passed to ChaosInjector.inject().
    details: Dict[str, Any]
    # Not updated anywhere in this module; left at their defaults by inject().
    recovered: bool = False
    recovery_time: Optional[float] = None


@dataclass
class ChaosResult:
    """Result of a chaos test"""

    test_name: str
    chaos_type: ChaosType
    target_agent: str
    injected: bool
    detected: bool
    recovered: bool
    # Milliseconds, or None when the check function reported no timing.
    recovery_time_ms: Optional[float]
    passed: bool
    details: Dict[str, Any]


class ChaosInjector:
    """
    Injects chaos conditions into the agent governance system.

    Every injection is a direct write to (or delete of) the
    ``agent:<id>:*`` keys in Redis; successfully injected events are
    recorded in ``self.events``.
    """

    def __init__(self):
        self.redis = _make_redis_client()
        self.events: List[ChaosEvent] = []

    def inject(self, chaos_type: ChaosType, agent_id: str, **kwargs) -> ChaosEvent:
        """Inject a chaos condition.

        Args:
            chaos_type: Which condition to inject.
            agent_id: Agent whose Redis keys are manipulated.
            **kwargs: Stored verbatim in the event's ``details``. For
                ``CORRUPT_STATE`` the ``field`` entry selects the state
                field to overwrite (default ``"status"``).

        Returns:
            The recorded ``ChaosEvent``. It is appended to ``self.events``
            only after the injection itself succeeded.
        """
        # datetime.utcnow() is kept (although deprecated since Python 3.12)
        # so the timestamp stays naive UTC as before.
        event = ChaosEvent(
            chaos_type=chaos_type,
            target_agent=agent_id,
            timestamp=datetime.utcnow(),
            details=kwargs,
        )

        # Handlers that only need the agent id.
        simple_handlers: Dict[ChaosType, Callable[[str], None]] = {
            ChaosType.REVOKE_TOKEN: self._revoke_token,
            ChaosType.EXPIRE_LOCK: self._expire_lock,
            ChaosType.RANDOM_ERROR: self._inject_random_error,
            ChaosType.HEARTBEAT_TIMEOUT: self._timeout_heartbeat,
            ChaosType.ERROR_BUDGET_EXCEED: self._exceed_error_budget,
        }

        if chaos_type == ChaosType.CORRUPT_STATE:
            self._corrupt_state(agent_id, kwargs.get("field", "status"))
        elif chaos_type in simple_handlers:
            simple_handlers[chaos_type](agent_id)
        # ChaosType.DELAY_RESPONSE: delay is applied at response time, so
        # nothing is written here; the event is still recorded.

        self.events.append(event)
        return event

    def _revoke_token(self, agent_id: str):
        """Simulate token revocation"""
        self.redis.set(f"agent:{agent_id}:revoke_signal", "1")
        self.redis.hset(f"agent:{agent_id}:state", "status", "revoked")

    def _expire_lock(self, agent_id: str):
        """Force expire an agent's lock"""
        self.redis.delete(f"agent:{agent_id}:lock")

    def _corrupt_state(self, agent_id: str, field: str):
        """Corrupt a specific state field"""
        # The "CORRUPTED" marker is what ResilienceChecker.check_state_valid
        # looks for.
        garbage = "CORRUPTED_" + str(random.randint(1000, 9999))
        self.redis.hset(f"agent:{agent_id}:state", field, garbage)

    def _inject_random_error(self, agent_id: str):
        """Inject a random error into the error counter"""
        error_types = ["LLM_ERROR", "NETWORK_ERROR", "TIMEOUT_ERROR", "VALIDATION_ERROR"]
        error_type = random.choice(error_types)
        self.redis.hincrby(f"agent:{agent_id}:errors", "total_errors", 1)
        self.redis.hincrby(f"agent:{agent_id}:errors", error_type, 1)

    def _timeout_heartbeat(self, agent_id: str):
        """Remove heartbeat to simulate timeout"""
        self.redis.delete(f"agent:{agent_id}:heartbeat")

    def _exceed_error_budget(self, agent_id: str):
        """Push error count beyond budget"""
        self.redis.hset(f"agent:{agent_id}:errors", mapping={
            "total_errors": "10",
            "procedure_violations": "2",
        })


class ResilienceChecker:
    """
    Checks if agents properly handle chaos conditions.
    """

    def __init__(self):
        self.redis = _make_redis_client()

    @staticmethod
    def _poll_until(condition: Callable[[], bool],
                    timeout_seconds: float,
                    interval_seconds: float = 0.1) -> Tuple[bool, Optional[float]]:
        """Poll ``condition`` until it is true or the timeout elapses.

        Returns:
            ``(True, elapsed_ms)`` as soon as the condition holds, otherwise
            ``(False, None)`` once ``timeout_seconds`` has passed.
        """
        # Monotonic clock: elapsed time must not be affected by wall-clock
        # adjustments.
        start = time.monotonic()
        while time.monotonic() - start < timeout_seconds:
            if condition():
                return True, (time.monotonic() - start) * 1000
            time.sleep(interval_seconds)
        return False, None

    def check_revocation_detected(self, agent_id: str) -> bool:
        """Check if agent detected its revocation"""
        status = self.redis.hget(f"agent:{agent_id}:state", "status")
        return status in ("revoked", "terminated", "error")

    def check_lock_recovery(self, agent_id: str,
                            timeout_seconds: float = 5.0) -> Tuple[bool, Optional[float]]:
        """Check if agent recovered from lock loss.

        Returns:
            ``(True, elapsed_ms)`` once the lock key holds ``agent_id``
            again, or ``(False, None)`` on timeout.
        """
        lock_key = f"agent:{agent_id}:lock"
        return self._poll_until(
            lambda: self.redis.get(lock_key) == agent_id,
            timeout_seconds,
        )

    def check_state_valid(self, agent_id: str) -> bool:
        """Check if agent state is valid (not corrupted)"""
        state = self.redis.hgetall(f"agent:{agent_id}:state")
        return all("CORRUPTED" not in str(value) for value in state.values())

    def check_heartbeat_recovery(self, agent_id: str,
                                 timeout_seconds: float = 5.0) -> Tuple[bool, Optional[float]]:
        """Check if agent restored heartbeat.

        Returns:
            ``(True, elapsed_ms)`` once the heartbeat key has a non-empty
            value, or ``(False, None)`` on timeout.
        """
        heartbeat_key = f"agent:{agent_id}:heartbeat"
        return self._poll_until(
            lambda: bool(self.redis.get(heartbeat_key)),
            timeout_seconds,
        )

    def check_error_budget_response(self, agent_id: str) -> bool:
        """Check if agent responded to exceeded error budget"""
        revoke_signal = self.redis.get(f"agent:{agent_id}:revoke_signal")
        status = self.redis.hget(f"agent:{agent_id}:state", "status")
        # Agent should either be revoked or have recognized the budget issue
        return revoke_signal == "1" or status in ("error_budget_exceeded", "revoked")


class ChaosTestRunner:
    """
    Runs chaos tests against the agent governance system.

    ``self.results`` accumulates every ``ChaosResult`` produced by this
    runner across all calls to ``run_test``.
    """

    def __init__(self):
        self.injector = ChaosInjector()
        self.checker = ResilienceChecker()
        self.results: List[ChaosResult] = []
        self.redis = _make_redis_client()

    def setup_test_agent(self, agent_id: str):
        """Set up a test agent for chaos testing"""
        # Initialize agent state
        self.redis.hset(f"agent:{agent_id}:state", mapping={
            "status": "running",
            "phase": "EXECUTE",
            "step": "1",
            "started_at": datetime.utcnow().isoformat(),
        })

        # Initialize error counters
        self.redis.hset(f"agent:{agent_id}:errors", mapping={
            "total_errors": "0",
            "procedure_violations": "0",
        })

        # Set lock (expires after 300 seconds)
        self.redis.set(f"agent:{agent_id}:lock", agent_id, ex=300)

        # Set heartbeat (expires after 60 seconds)
        self.redis.set(f"agent:{agent_id}:heartbeat", datetime.utcnow().isoformat(), ex=60)

        # Clear revocation signal
        self.redis.delete(f"agent:{agent_id}:revoke_signal")

    def cleanup_test_agent(self, agent_id: str):
        """Clean up test agent data"""
        # SCAN iterates the keyspace incrementally instead of blocking the
        # server the way KEYS does on a large database.
        keys = list(self.redis.scan_iter(match=f"agent:{agent_id}:*"))
        if keys:
            self.redis.delete(*keys)

    def run_test(self, test_name: str, chaos_type: ChaosType, agent_id: str,
                 check_func: Callable, **chaos_kwargs) -> ChaosResult:
        """Run a single chaos test.

        Args:
            test_name: Human-readable name stored in the result.
            chaos_type: Condition to inject.
            agent_id: Test agent to set up, target and clean up.
            check_func: Called with ``agent_id`` after injection. Returns
                either a bool (detected) or a ``(detected, recovery_ms)``
                tuple.
            **chaos_kwargs: Forwarded to ``ChaosInjector.inject``.

        Returns:
            The ``ChaosResult``, which is also appended to ``self.results``.
        """
        try:
            # Setup
            self.setup_test_agent(agent_id)
            time.sleep(0.1)  # Let state stabilize

            # Inject chaos
            event = self.injector.inject(chaos_type, agent_id, **chaos_kwargs)

            # Check detection/recovery
            time.sleep(0.2)  # Allow time for detection
            check_result = check_func(agent_id)

            if isinstance(check_result, tuple):
                detected, recovery_time = check_result
            else:
                detected, recovery_time = check_result, None

            # For most chaos types, detection is success (agent noticed the
            # issue), so pass/fail mirrors `detected`.
            result = ChaosResult(
                test_name=test_name,
                chaos_type=chaos_type,
                target_agent=agent_id,
                injected=True,
                detected=detected,
                recovered=recovery_time is not None,
                recovery_time_ms=recovery_time,
                passed=detected,
                details={"event": str(event.details)},
            )
            self.results.append(result)
            return result
        finally:
            # Always remove the test agent's keys, even when setup, injection
            # or the check raised, so failed runs do not leak state in Redis.
            self.cleanup_test_agent(agent_id)

    def run_all_tests(self) -> Dict[str, Any]:
        """Run the full chaos test suite.

        Returns:
            A summary dict with ``total_tests``, ``passed``, ``failed``,
            ``success_rate`` and a per-test ``results`` list covering the
            tests executed by this call only.
        """
        print("\n" + "=" * 60)
        print("CHAOS TEST SUITE")
        print("=" * 60 + "\n")

        # (name, chaos type, agent id, check function, inject kwargs)
        tests = [
            (
                "Token Revocation Detection",
                ChaosType.REVOKE_TOKEN,
                "chaos-agent-001",
                self.checker.check_revocation_detected,
                {},
            ),
            (
                "Lock Expiration Handling",
                ChaosType.EXPIRE_LOCK,
                "chaos-agent-002",
                lambda a: (self.redis.get(f"agent:{a}:lock") is None, 0),
                {},
            ),
            (
                "State Corruption Detection",
                ChaosType.CORRUPT_STATE,
                "chaos-agent-003",
                lambda a: not self.checker.check_state_valid(a),
                {"field": "phase"},
            ),
            (
                "Heartbeat Timeout",
                ChaosType.HEARTBEAT_TIMEOUT,
                "chaos-agent-004",
                lambda a: self.redis.get(f"agent:{a}:heartbeat") is None,
                {},
            ),
            (
                "Error Budget Exceeded",
                ChaosType.ERROR_BUDGET_EXCEED,
                "chaos-agent-005",
                lambda a: int(self.redis.hget(f"agent:{a}:errors", "total_errors") or 0) >= 10,
                {},
            ),
            (
                "Random Error Injection",
                ChaosType.RANDOM_ERROR,
                "chaos-agent-006",
                lambda a: int(self.redis.hget(f"agent:{a}:errors", "total_errors") or 0) > 0,
                {},
            ),
        ]

        # Results of this invocation only; self.results keeps the full
        # history and would mix in earlier runs.
        run_results: List[ChaosResult] = []

        for test_name, chaos_type, agent_id, check_func, kwargs in tests:
            result = self.run_test(test_name, chaos_type, agent_id, check_func, **kwargs)
            run_results.append(result)

            status = "✓ PASS" if result.passed else "✗ FAIL"
            # Truthiness (not `is not None`) on purpose: the lock test
            # reports a placeholder 0 that should not be printed.
            recovery = (
                f" (recovered in {result.recovery_time_ms:.0f}ms)"
                if result.recovery_time_ms else ""
            )
            print(f" {status}: {test_name}{recovery}")

        passed = sum(1 for r in run_results if r.passed)
        failed = len(run_results) - passed

        # Summary
        print(f"\n{'='*60}")
        print(f"RESULTS: {passed}/{passed+failed} passed")
        print(f"{'='*60}\n")

        return {
            "total_tests": len(tests),
            "passed": passed,
            "failed": failed,
            "success_rate": passed / len(tests) if tests else 0,
            "results": [
                {
                    "test": r.test_name,
                    "chaos_type": r.chaos_type.value,
                    "passed": r.passed,
                    "detected": r.detected,
                    "recovery_ms": r.recovery_time_ms,
                }
                for r in run_results
            ],
        }


class ChaosMonkey:
    """
    Continuous chaos injection for stress testing.

    Randomly injects chaos conditions over a period of time.
    """

    def __init__(self, target_agents: List[str], duration_seconds: int = 60):
        self.target_agents = target_agents
        self.duration = duration_seconds
        self.injector = ChaosInjector()
        self.running = False
        self.events_injected = 0

    def start(self):
        """Start the chaos monkey.

        Blocks until ``duration`` seconds have elapsed or ``stop()`` sets
        ``running`` to False. With no target agents the loop is skipped and
        zero events are injected.
        """
        self.running = True
        self.events_injected = 0

        print(f"\n🐵 Chaos Monkey started (duration: {self.duration}s)")
        print(f" Target agents: {', '.join(self.target_agents)}")

        chaos_types = list(ChaosType)
        deadline = time.monotonic() + self.duration
        try:
            # An empty target list would make random.choice raise, so it
            # ends the run immediately instead.
            while self.target_agents and self.running and time.monotonic() < deadline:
                # Random delay between chaos events
                time.sleep(random.uniform(0.5, 2.0))

                # stop() may have been called during the sleep; do not
                # inject another event after a stop request.
                if not self.running:
                    break

                # Pick random agent and chaos type
                agent = random.choice(self.target_agents)
                chaos_type = random.choice(chaos_types)

                # Inject chaos
                self.injector.inject(chaos_type, agent)
                self.events_injected += 1

                print(f" 🔥 Injected {chaos_type.value} into {agent}")
        finally:
            # Reflect that the monkey is no longer active, whether it ended
            # by timeout, stop() or an error.
            self.running = False

        print(f"\n🐵 Chaos Monkey stopped ({self.events_injected} events injected)")

    def stop(self):
        """Stop the chaos monkey"""
        self.running = False


def main():
    """Run chaos tests and return a process exit code (0 if all passed)."""
    runner = ChaosTestRunner()
    results = runner.run_all_tests()

    print("\nDetailed Results:")
    print("-" * 40)
    for r in results["results"]:
        status = "PASS" if r["passed"] else "FAIL"
        print(f" [{status}] {r['test']}")
        print(f" Type: {r['chaos_type']}, Detected: {r['detected']}")

    return 0 if results["failed"] == 0 else 1


if __name__ == "__main__":
    import sys
    sys.exit(main())