agent-governance/tests/chaos/chaos_test.py
profit 77655c298c Initial commit: Agent Governance System Phase 8
Phase 8 Production Hardening with complete governance infrastructure:

- Vault integration with tiered policies (T0-T4)
- DragonflyDB state management
- SQLite audit ledger
- Pipeline DSL and templates
- Promotion/revocation engine
- Checkpoint system for session persistence
- Health manager and circuit breaker for fault tolerance
- GitHub/Slack integrations
- Architectural test pipeline with bug watcher, suggestion engine, council review
- Multi-agent chaos testing framework

Test Results:
- Governance tests: 68/68 passing
- E2E workflow: 16/16 passing
- Phase 2 Vault: 14/14 passing
- Integration tests: 27/27 passing

Coverage: 57.6% average across 12 phases

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 22:07:06 -05:00

440 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Chaos Testing Framework for Agent Governance
Tests agent resilience under adverse conditions:
- Random failures
- Network issues (simulated)
- Token revocations
- Lock timeouts
- State corruption
- Resource exhaustion
"""
import asyncio
import json
import os
import random
import sqlite3
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import Dict, List, Any, Optional, Callable, Tuple

import redis
# Redis connection settings shared by every client created in this module.
# Each value can be overridden through an environment variable of the same
# name; the defaults are the values that used to be hard-coded here.
REDIS_HOST = os.environ.get("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.environ.get("REDIS_PORT", "6379"))
# NOTE(review): the fallback keeps the previously committed credential so that
# existing setups keep working; prefer supplying REDIS_PASSWORD via the
# environment and rotating this value.
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD", "governance2026")
# Location of the governance ledger database (override with
# GOVERNANCE_LEDGER_PATH).
LEDGER_PATH = Path(
    os.environ.get(
        "GOVERNANCE_LEDGER_PATH",
        "/opt/agent-governance/ledger/governance.db",
    )
)
class ChaosType(Enum):
    """Fault categories that can be injected against an agent.

    The member value is the string used when a result is reported
    (``chaos_type.value``).
    """

    # Raise the agent's revoke signal and mark its state as revoked.
    REVOKE_TOKEN = "revoke_token"
    # Drop the agent's lock key.
    EXPIRE_LOCK = "expire_lock"
    # Overwrite one field of the agent's state hash with a junk marker.
    CORRUPT_STATE = "corrupt_state"
    # Recorded only; the injector performs no action for this type.
    DELAY_RESPONSE = "delay_response"
    # Bump the agent's error counters by one randomly chosen error kind.
    RANDOM_ERROR = "random_error"
    # Drop the agent's heartbeat key.
    HEARTBEAT_TIMEOUT = "heartbeat_timeout"
    # Force the agent's error counters to fixed, elevated values.
    ERROR_BUDGET_EXCEED = "error_budget_exceed"
@dataclass
class ChaosEvent:
    """Record of a single chaos condition injected against one agent."""

    # Category of fault that was injected.
    chaos_type: ChaosType
    # Identifier of the agent the fault was aimed at.
    target_agent: str
    # Moment the event record was created (naive UTC when built by the injector).
    timestamp: datetime
    # Extra keyword arguments supplied with the injection request.
    details: Dict[str, Any]
    # Recovery bookkeeping; the injector leaves both at their defaults.
    recovered: bool = False
    # NOTE(review): unit is not established anywhere in this module - confirm.
    recovery_time: Optional[float] = None
@dataclass
class ChaosResult:
    """Outcome of one chaos test case."""

    # Human-readable name of the test case.
    test_name: str
    # Category of fault the test injected.
    chaos_type: ChaosType
    # Identifier of the agent used for the test.
    target_agent: str
    # Whether the fault was injected.
    injected: bool
    # Verdict returned by the test's check function.
    detected: bool
    # True when the check function also reported a recovery time.
    recovered: bool
    # Recovery time in milliseconds, or None when none was reported.
    recovery_time_ms: Optional[float]
    # Overall verdict for the test case.
    passed: bool
    # Free-form supporting data (e.g. the stringified injection details).
    details: Dict[str, Any]
class ChaosInjector:
    """
    Applies chaos conditions to an agent by rewriting its Redis keys.

    Every injection is recorded as a ``ChaosEvent`` in ``self.events``.
    """

    def __init__(self):
        connection_settings = {
            "host": REDIS_HOST,
            "port": REDIS_PORT,
            "password": REDIS_PASSWORD,
            "decode_responses": True,
        }
        self.redis = redis.Redis(**connection_settings)
        self.events: List[ChaosEvent] = []

    def inject(self, chaos_type: ChaosType, agent_id: str, **kwargs) -> ChaosEvent:
        """Apply ``chaos_type`` to ``agent_id`` and return the recorded event.

        Extra keyword arguments are stored in the event's ``details``; the
        only one consulted here is ``field`` (for ``CORRUPT_STATE``, default
        ``"status"``).
        """
        # The record is created before the fault is applied, so its timestamp
        # precedes the Redis writes.
        record = ChaosEvent(
            chaos_type=chaos_type,
            target_agent=agent_id,
            timestamp=datetime.utcnow(),
            details=kwargs,
        )

        # DELAY_RESPONSE is intentionally absent: the delay is applied at
        # response time, so there is nothing to do at injection time.
        actions = {
            ChaosType.REVOKE_TOKEN: lambda: self._revoke_token(agent_id),
            ChaosType.EXPIRE_LOCK: lambda: self._expire_lock(agent_id),
            ChaosType.CORRUPT_STATE: lambda: self._corrupt_state(
                agent_id, kwargs.get("field", "status")
            ),
            ChaosType.RANDOM_ERROR: lambda: self._inject_random_error(agent_id),
            ChaosType.HEARTBEAT_TIMEOUT: lambda: self._timeout_heartbeat(agent_id),
            ChaosType.ERROR_BUDGET_EXCEED: lambda: self._exceed_error_budget(agent_id),
        }
        action = actions.get(chaos_type)
        if action is not None:
            action()

        self.events.append(record)
        return record

    def _revoke_token(self, agent_id: str):
        """Raise the revoke signal and flag the agent's state as revoked."""
        signal_key = f"agent:{agent_id}:revoke_signal"
        state_key = f"agent:{agent_id}:state"
        self.redis.set(signal_key, "1")
        self.redis.hset(state_key, "status", "revoked")

    def _expire_lock(self, agent_id: str):
        """Delete the agent's lock key."""
        lock_key = f"agent:{agent_id}:lock"
        self.redis.delete(lock_key)

    def _corrupt_state(self, agent_id: str, field: str):
        """Overwrite ``field`` of the agent's state hash with a junk marker."""
        junk = f"CORRUPTED_{random.randint(1000, 9999)}"
        self.redis.hset(f"agent:{agent_id}:state", field, junk)

    def _inject_random_error(self, agent_id: str):
        """Increment the total error count and one randomly picked error kind."""
        errors_key = f"agent:{agent_id}:errors"
        picked = random.choice(
            ["LLM_ERROR", "NETWORK_ERROR", "TIMEOUT_ERROR", "VALIDATION_ERROR"]
        )
        for counter in ("total_errors", picked):
            self.redis.hincrby(errors_key, counter, 1)

    def _timeout_heartbeat(self, agent_id: str):
        """Delete the agent's heartbeat key to simulate a missed heartbeat."""
        heartbeat_key = f"agent:{agent_id}:heartbeat"
        self.redis.delete(heartbeat_key)

    def _exceed_error_budget(self, agent_id: str):
        """Set the agent's error counters to fixed, elevated values."""
        inflated_counters = {
            "total_errors": "10",
            "procedure_violations": "2",
        }
        self.redis.hset(f"agent:{agent_id}:errors", mapping=inflated_counters)
class ResilienceChecker:
    """
    Checks if agents properly handle chaos conditions.

    All checks read the agent's keys from Redis; none of them write.
    """

    # Pause between successive Redis reads while waiting for a recovery.
    _POLL_INTERVAL_SECONDS = 0.1

    def __init__(self):
        self.redis = redis.Redis(
            host=REDIS_HOST,
            port=REDIS_PORT,
            password=REDIS_PASSWORD,
            decode_responses=True
        )

    def _poll_until(self, probe: Callable[[], bool],
                    timeout_seconds: float) -> Tuple[bool, Optional[float]]:
        """Call ``probe`` repeatedly until it is truthy or the timeout elapses.

        Returns ``(True, elapsed_ms)`` on success and ``(False, None)`` on
        timeout. Elapsed time uses the monotonic clock so that wall-clock
        adjustments cannot shorten or stretch the wait.
        """
        start = time.monotonic()
        while time.monotonic() - start < timeout_seconds:
            if probe():
                return True, (time.monotonic() - start) * 1000
            time.sleep(self._POLL_INTERVAL_SECONDS)
        return False, None

    def check_revocation_detected(self, agent_id: str) -> bool:
        """Return True when the agent's status is revoked, terminated or error."""
        state = self.redis.hgetall(f"agent:{agent_id}:state")
        return state.get("status") in ["revoked", "terminated", "error"]

    def check_lock_recovery(self, agent_id: str,
                            timeout_seconds: float = 5.0) -> Tuple[bool, Optional[float]]:
        """Wait for the agent's lock key to hold the agent's own id again.

        Returns ``(True, elapsed_ms)`` once the lock is back, otherwise
        ``(False, None)`` after ``timeout_seconds``.
        """
        lock_key = f"agent:{agent_id}:lock"
        return self._poll_until(
            lambda: self.redis.get(lock_key) == agent_id,
            timeout_seconds,
        )

    def check_state_valid(self, agent_id: str) -> bool:
        """Return True when no state field carries the ``CORRUPTED`` marker."""
        state = self.redis.hgetall(f"agent:{agent_id}:state")
        return not any("CORRUPTED" in str(value) for value in state.values())

    def check_heartbeat_recovery(self, agent_id: str,
                                 timeout_seconds: float = 5.0) -> Tuple[bool, Optional[float]]:
        """Wait for the agent's heartbeat key to hold a non-empty value.

        Returns ``(True, elapsed_ms)`` once a heartbeat is present, otherwise
        ``(False, None)`` after ``timeout_seconds``.
        """
        heartbeat_key = f"agent:{agent_id}:heartbeat"
        return self._poll_until(
            lambda: bool(self.redis.get(heartbeat_key)),
            timeout_seconds,
        )

    def check_error_budget_response(self, agent_id: str) -> bool:
        """Return True when an exceeded error budget has been acted upon.

        That is the case when the revoke signal is raised or the agent's
        status is ``error_budget_exceeded`` or ``revoked``.
        """
        revoke_signal = self.redis.get(f"agent:{agent_id}:revoke_signal")
        state = self.redis.hgetall(f"agent:{agent_id}:state")
        # Agent should either be revoked or have recognized the budget issue
        return revoke_signal == "1" or state.get("status") in ["error_budget_exceeded", "revoked"]
class ChaosTestRunner:
    """
    Runs chaos tests against the agent governance system.

    Each test seeds a throwaway agent in Redis, injects one fault through
    ``ChaosInjector``, evaluates a check function and removes the agent's
    keys again. Results of every ``run_test`` call accumulate in
    ``self.results``.
    """

    def __init__(self):
        self.injector = ChaosInjector()
        self.checker = ResilienceChecker()
        self.results: List[ChaosResult] = []
        self.redis = redis.Redis(
            host=REDIS_HOST,
            port=REDIS_PORT,
            password=REDIS_PASSWORD,
            decode_responses=True
        )

    def setup_test_agent(self, agent_id: str):
        """Seed Redis with a healthy-looking agent: state, counters, lock, heartbeat."""
        # Initialize agent state
        self.redis.hset(f"agent:{agent_id}:state", mapping={
            "status": "running",
            "phase": "EXECUTE",
            "step": "1",
            "started_at": datetime.utcnow().isoformat()
        })
        # Initialize error counters
        self.redis.hset(f"agent:{agent_id}:errors", mapping={
            "total_errors": "0",
            "procedure_violations": "0"
        })
        # Set lock (expires after 300 seconds)
        self.redis.set(f"agent:{agent_id}:lock", agent_id, ex=300)
        # Set heartbeat (expires after 60 seconds)
        self.redis.set(f"agent:{agent_id}:heartbeat", datetime.utcnow().isoformat(), ex=60)
        # Clear revocation signal
        self.redis.delete(f"agent:{agent_id}:revoke_signal")

    def cleanup_test_agent(self, agent_id: str):
        """Delete every ``agent:<agent_id>:*`` key."""
        # SCAN walks the keyspace incrementally; KEYS would block the server
        # for the duration of a full keyspace scan.
        keys = list(self.redis.scan_iter(match=f"agent:{agent_id}:*"))
        if keys:
            self.redis.delete(*keys)

    def run_test(self, test_name: str, chaos_type: ChaosType,
                 agent_id: str, check_func: Callable, **chaos_kwargs) -> ChaosResult:
        """Run a single chaos test and return its result.

        ``check_func`` is called with ``agent_id`` and may return either a
        verdict or a ``(verdict, recovery_time_ms)`` tuple. Extra keyword
        arguments are forwarded to the injector. The test agent's keys are
        removed even when injection or the check raises.
        """
        # Setup
        self.setup_test_agent(agent_id)
        try:
            time.sleep(0.1)  # Let state stabilize

            # Inject chaos
            event = self.injector.inject(chaos_type, agent_id, **chaos_kwargs)

            # Check detection/recovery
            time.sleep(0.2)  # Allow time for detection
            check_result = check_func(agent_id)
            if isinstance(check_result, tuple):
                detected, recovery_time = check_result
            else:
                detected, recovery_time = check_result, None

            # For most chaos types, detection is success (agent noticed the issue)
            passed = detected

            result = ChaosResult(
                test_name=test_name,
                chaos_type=chaos_type,
                target_agent=agent_id,
                injected=True,
                detected=detected,
                recovered=recovery_time is not None,
                recovery_time_ms=recovery_time,
                passed=passed,
                details={"event": str(event.details)}
            )
            self.results.append(result)
            return result
        finally:
            # Cleanup must not be skipped on failure, otherwise the test
            # agent's keys would linger in Redis.
            self.cleanup_test_agent(agent_id)

    def run_all_tests(self) -> Dict[str, Any]:
        """Run the full chaos test suite and return a summary report.

        The report contains ``total_tests``, ``passed``, ``failed``,
        ``success_rate`` and a ``results`` list with one entry per test
        executed by this call.
        """
        print("\n" + "=" * 60)
        print("CHAOS TEST SUITE")
        print("=" * 60 + "\n")

        # (name, chaos type, agent id, check function, injector kwargs)
        tests = [
            (
                "Token Revocation Detection",
                ChaosType.REVOKE_TOKEN,
                "chaos-agent-001",
                self.checker.check_revocation_detected,
                {}
            ),
            (
                "Lock Expiration Handling",
                ChaosType.EXPIRE_LOCK,
                "chaos-agent-002",
                lambda a: (self.redis.get(f"agent:{a}:lock") is None, 0),
                {}
            ),
            (
                "State Corruption Detection",
                ChaosType.CORRUPT_STATE,
                "chaos-agent-003",
                lambda a: not self.checker.check_state_valid(a),
                {"field": "phase"}
            ),
            (
                "Heartbeat Timeout",
                ChaosType.HEARTBEAT_TIMEOUT,
                "chaos-agent-004",
                lambda a: self.redis.get(f"agent:{a}:heartbeat") is None,
                {}
            ),
            (
                "Error Budget Exceeded",
                ChaosType.ERROR_BUDGET_EXCEED,
                "chaos-agent-005",
                lambda a: int(self.redis.hget(f"agent:{a}:errors", "total_errors") or 0) >= 10,
                {}
            ),
            (
                "Random Error Injection",
                ChaosType.RANDOM_ERROR,
                "chaos-agent-006",
                lambda a: int(self.redis.hget(f"agent:{a}:errors", "total_errors") or 0) > 0,
                {}
            ),
        ]

        passed = 0
        failed = 0
        # Results of this invocation only; self.results may already hold
        # entries from earlier runs and would not match the counts below.
        run_results: List[ChaosResult] = []

        for test_name, chaos_type, agent_id, check_func, kwargs in tests:
            result = self.run_test(test_name, chaos_type, agent_id, check_func, **kwargs)
            run_results.append(result)

            status = "✓ PASS" if result.passed else "✗ FAIL"
            # A falsy recovery time (None or 0) omits the suffix.
            recovery = f" (recovered in {result.recovery_time_ms:.0f}ms)" if result.recovery_time_ms else ""
            print(f" {status}: {test_name}{recovery}")

            if result.passed:
                passed += 1
            else:
                failed += 1

        # Summary
        print(f"\n{'='*60}")
        print(f"RESULTS: {passed}/{passed+failed} passed")
        print(f"{'='*60}\n")

        return {
            "total_tests": len(tests),
            "passed": passed,
            "failed": failed,
            "success_rate": passed / len(tests) if tests else 0,
            "results": [
                {
                    "test": r.test_name,
                    "chaos_type": r.chaos_type.value,
                    "passed": r.passed,
                    "detected": r.detected,
                    "recovery_ms": r.recovery_time_ms
                }
                for r in run_results
            ]
        }
class ChaosMonkey:
    """
    Continuous chaos injection for stress testing.
    Randomly injects chaos conditions over a period of time.

    ``start`` blocks the calling thread; call ``stop`` from elsewhere to end
    the run early.
    """

    def __init__(self, target_agents: List[str], duration_seconds: int = 60):
        self.target_agents = target_agents
        self.duration = duration_seconds
        self.injector = ChaosInjector()
        self.running = False
        self.events_injected = 0

    def start(self):
        """Inject random chaos until the duration elapses or ``stop`` is called.

        Each round sleeps 0.5-2.0 seconds, then injects a random chaos type
        into a random target agent. ``running`` is reset to False whenever
        the loop ends, and ``events_injected`` holds the number of injections
        performed. With no target agents nothing is injected.
        """
        self.events_injected = 0

        if not self.target_agents:
            # random.choice() would raise on an empty list after a pointless
            # sleep; report and bail out instead.
            self.running = False
            print("\n🐵 Chaos Monkey not started (no target agents)")
            return

        self.running = True
        print(f"\n🐵 Chaos Monkey started (duration: {self.duration}s)")
        print(f" Target agents: {', '.join(self.target_agents)}")

        # Monotonic clock: the run length must not depend on wall-clock jumps.
        start_time = time.monotonic()
        try:
            while self.running and (time.monotonic() - start_time) < self.duration:
                # Random delay between chaos events
                time.sleep(random.uniform(0.5, 2.0))

                # Honour a stop() issued while we were sleeping.
                if not self.running:
                    break

                # Pick random agent and chaos type
                agent = random.choice(self.target_agents)
                chaos_type = random.choice(list(ChaosType))

                # Inject chaos
                self.injector.inject(chaos_type, agent)
                self.events_injected += 1
                print(f" 🔥 Injected {chaos_type.value} into {agent}")
        finally:
            # The flag must not stay True once the loop is over.
            self.running = False

        print(f"\n🐵 Chaos Monkey stopped ({self.events_injected} events injected)")

    def stop(self):
        """Ask a running ``start`` loop to finish after its current sleep."""
        self.running = False
def main():
    """Execute the chaos suite, print a per-test breakdown, return an exit code.

    Returns 0 when no test failed and 1 otherwise.
    """
    summary = ChaosTestRunner().run_all_tests()

    print("\nDetailed Results:")
    print("-" * 40)
    for entry in summary["results"]:
        label = "PASS" if entry["passed"] else "FAIL"
        print(f" [{label}] {entry['test']}")
        print(f" Type: {entry['chaos_type']}, Detected: {entry['detected']}")

    # Any failure turns into a non-zero exit status.
    if summary["failed"] == 0:
        return 0
    return 1
if __name__ == "__main__":
    # Script entry point: hand the suite's verdict to the shell as the
    # process exit status.
    import sys

    exit_code = main()
    sys.exit(exit_code)