# agent-governance/tests/chaos/chaos_test.py

#!/usr/bin/env python3
"""
Chaos Testing Framework for Agent Governance

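Tests agent resilience under adverse conditions:
- Random failures
- Network issues (simulated)
- Token revocations
- Lock timeouts
- State corruption
- Resource exhaustion

Conditions are injected by writing or deleting the target agent's Redis keys
(``agent:<id>:state``, ``:lock``, ``:heartbeat``, ``:errors`` and
``:revoke_signal``); ``ChaosType`` lists the conditions currently implemented.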
"""

import asyncio
import json
import os
import random
import sqlite3
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import Dict, List, Any, Optional, Callable, Tuple

import redis

# Connection settings for the governance Redis instance. Each one can be
# overridden through the environment so that credentials do not have to live in
# source control; the fallbacks are the values this file has always used.
REDIS_HOST = os.environ.get("GOVERNANCE_REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.environ.get("GOVERNANCE_REDIS_PORT", "6379"))
# NOTE(review): the fallback password is kept only for backward compatibility;
# prefer supplying GOVERNANCE_REDIS_PASSWORD and rotating this value.
REDIS_PASSWORD = os.environ.get("GOVERNANCE_REDIS_PASSWORD", "governance2026")
# Path of the governance ledger database file.
# NOTE(review): nothing in the visible code reads this path — confirm it is
# still needed.
LEDGER_PATH = Path("/opt/agent-governance/ledger/governance.db")


class ChaosType(Enum):
    """Types of chaos that can be injected.

    Each member's value is the string used when a chaos type is printed or
    included in a results summary.
    """
    # Sets the agent's revoke signal and marks its state status as "revoked".
    REVOKE_TOKEN = "revoke_token"
    # Deletes the agent's lock key.
    EXPIRE_LOCK = "expire_lock"
    # Overwrites one field of the agent's state hash with a "CORRUPTED_" marker.
    CORRUPT_STATE = "corrupt_state"
    # Recorded as an event only; the injector performs no action for it.
    DELAY_RESPONSE = "delay_response"
    # Increments the agent's total error counter and one random error-type counter.
    RANDOM_ERROR = "random_error"
    # Deletes the agent's heartbeat key.
    HEARTBEAT_TIMEOUT = "heartbeat_timeout"
    # Overwrites the agent's error counters with fixed high values.
    ERROR_BUDGET_EXCEED = "error_budget_exceed"


@dataclass
class ChaosEvent:
    """A chaos event that was injected.

    Instances are created by ``ChaosInjector.inject`` and kept in its
    ``events`` list.
    """
    # Which kind of chaos was injected.
    chaos_type: ChaosType
    # Identifier of the agent the chaos was aimed at.
    target_agent: str
    # Creation time of the event; the injector uses naive UTC (datetime.utcnow()).
    timestamp: datetime
    # Extra keyword arguments that were passed to the injector.
    details: Dict[str, Any]
    # Recovery bookkeeping. Nothing in the visible code updates these two
    # fields after construction, so they keep their defaults.
    recovered: bool = False
    # NOTE(review): unit is not established by the visible code — confirm.
    recovery_time: Optional[float] = None


@dataclass
class ChaosResult:
    """Result of a chaos test.

    Instances are built by ``ChaosTestRunner.run_test``.
    """
    # Human-readable name of the test case.
    test_name: str
    # Kind of chaos that was injected for this test.
    chaos_type: ChaosType
    # Identifier of the agent the test ran against.
    target_agent: str
    # Whether chaos was injected; run_test always sets this to True.
    injected: bool
    # Outcome reported by the test's check function.
    detected: bool
    # True when the check function also supplied a recovery time.
    recovered: bool
    # Recovery time in milliseconds, or None when the check supplied none.
    recovery_time_ms: Optional[float]
    # Overall verdict; run_test sets it equal to ``detected``.
    passed: bool
    # Free-form extras; run_test stores {"event": str(event.details)}.
    details: Dict[str, Any]


class ChaosInjector:
    """
    Applies chaos conditions to an agent by editing that agent's Redis keys.

    Every injection made through ``inject`` is recorded in ``self.events``.
    """

    def __init__(self):
        connection_settings = dict(
            host=REDIS_HOST,
            port=REDIS_PORT,
            password=REDIS_PASSWORD,
            decode_responses=True,
        )
        self.redis = redis.Redis(**connection_settings)
        self.events: List[ChaosEvent] = []

    def inject(self, chaos_type: ChaosType, agent_id: str, **kwargs) -> ChaosEvent:
        """
        Apply ``chaos_type`` to ``agent_id`` and return the recorded event.

        Keyword arguments are stored verbatim as the event's ``details``. The
        only one consulted here is ``field`` (default ``"status"``), which
        selects the state field to overwrite for ``CORRUPT_STATE``.
        """
        record = ChaosEvent(
            chaos_type=chaos_type,
            target_agent=agent_id,
            timestamp=datetime.utcnow(),
            details=kwargs,
        )

        # Chaos types whose handler needs nothing beyond the agent id.
        agent_only_handlers = (
            (ChaosType.REVOKE_TOKEN, self._revoke_token),
            (ChaosType.EXPIRE_LOCK, self._expire_lock),
            (ChaosType.RANDOM_ERROR, self._inject_random_error),
            (ChaosType.HEARTBEAT_TIMEOUT, self._timeout_heartbeat),
            (ChaosType.ERROR_BUDGET_EXCEED, self._exceed_error_budget),
        )

        if chaos_type == ChaosType.CORRUPT_STATE:
            self._corrupt_state(agent_id, kwargs.get("field", "status"))
        else:
            # DELAY_RESPONSE has no entry on purpose: the delay is applied at
            # response time, so the injector only records the event.
            for kind, handler in agent_only_handlers:
                if chaos_type == kind:
                    handler(agent_id)
                    break

        self.events.append(record)
        return record

    def _revoke_token(self, agent_id: str):
        """Raise the agent's revoke signal and mark its state as revoked."""
        prefix = f"agent:{agent_id}"
        self.redis.set(f"{prefix}:revoke_signal", "1")
        self.redis.hset(f"{prefix}:state", "status", "revoked")

    def _expire_lock(self, agent_id: str):
        """Drop the agent's lock key."""
        lock_key = f"agent:{agent_id}:lock"
        self.redis.delete(lock_key)

    def _corrupt_state(self, agent_id: str, field: str):
        """Overwrite one state field with a ``CORRUPTED_<4 digits>`` marker."""
        marker = f"CORRUPTED_{random.randint(1000, 9999)}"
        self.redis.hset(f"agent:{agent_id}:state", field, marker)

    def _inject_random_error(self, agent_id: str):
        """Bump the total error counter and one randomly chosen error-type counter."""
        chosen = random.choice(
            ["LLM_ERROR", "NETWORK_ERROR", "TIMEOUT_ERROR", "VALIDATION_ERROR"]
        )
        errors_key = f"agent:{agent_id}:errors"
        for counter in ("total_errors", chosen):
            self.redis.hincrby(errors_key, counter, 1)

    def _timeout_heartbeat(self, agent_id: str):
        """Delete the heartbeat key so the agent looks like it stopped reporting."""
        heartbeat_key = f"agent:{agent_id}:heartbeat"
        self.redis.delete(heartbeat_key)

    def _exceed_error_budget(self, agent_id: str):
        """Overwrite the error counters with fixed values (10 errors, 2 violations)."""
        inflated_counters = {
            "total_errors": "10",
            "procedure_violations": "2",
        }
        self.redis.hset(f"agent:{agent_id}:errors", mapping=inflated_counters)


class ResilienceChecker:
    """
    Reads an agent's Redis keys to decide whether a chaos condition was
    noticed or recovered from.
    """

    # State statuses that count as "the revocation is visible".
    _REVOCATION_STATUSES = ("revoked", "terminated", "error")
    # State statuses that count as a reaction to an exceeded error budget.
    _BUDGET_STATUSES = ("error_budget_exceeded", "revoked")
    # Pause between two polls of Redis, in seconds.
    _POLL_INTERVAL_SECONDS = 0.1

    def __init__(self):
        self.redis = redis.Redis(
            host=REDIS_HOST,
            port=REDIS_PORT,
            password=REDIS_PASSWORD,
            decode_responses=True
        )

    def _poll_until(self, probe: Callable[[], bool],
                    timeout_seconds: float) -> Tuple[bool, Optional[float]]:
        """
        Call ``probe`` every poll interval until it is truthy or time runs out.

        Returns ``(True, elapsed_ms)`` on success and ``(False, None)`` on
        timeout. A non-positive timeout returns ``(False, None)`` without
        probing at all.
        """
        # A monotonic clock keeps the timeout and the reported duration immune
        # to wall-clock adjustments (NTP steps, manual changes).
        started = time.monotonic()
        while time.monotonic() - started < timeout_seconds:
            if probe():
                return True, (time.monotonic() - started) * 1000
            time.sleep(self._POLL_INTERVAL_SECONDS)
        return False, None

    def check_revocation_detected(self, agent_id: str) -> bool:
        """Return True when the agent's state status is revoked, terminated or error."""
        state = self.redis.hgetall(f"agent:{agent_id}:state")
        return state.get("status") in self._REVOCATION_STATUSES

    def check_lock_recovery(self, agent_id: str,
                            timeout_seconds: float = 5.0) -> Tuple[bool, Optional[float]]:
        """
        Wait for the agent's lock key to hold the agent's own id again.

        Returns ``(True, elapsed_ms)`` once it does, or ``(False, None)`` if
        that does not happen within ``timeout_seconds``.
        """
        lock_key = f"agent:{agent_id}:lock"
        return self._poll_until(
            lambda: self.redis.get(lock_key) == agent_id,
            timeout_seconds,
        )

    def check_state_valid(self, agent_id: str) -> bool:
        """Return True when no state field contains the ``CORRUPTED`` marker."""
        state = self.redis.hgetall(f"agent:{agent_id}:state")
        return not any("CORRUPTED" in str(value) for value in state.values())

    def check_heartbeat_recovery(self, agent_id: str,
                                 timeout_seconds: float = 5.0) -> Tuple[bool, Optional[float]]:
        """
        Wait for the agent's heartbeat key to exist with a non-empty value.

        Returns ``(True, elapsed_ms)`` once it does, or ``(False, None)`` if
        that does not happen within ``timeout_seconds``.
        """
        heartbeat_key = f"agent:{agent_id}:heartbeat"
        return self._poll_until(
            lambda: bool(self.redis.get(heartbeat_key)),
            timeout_seconds,
        )

    def check_error_budget_response(self, agent_id: str) -> bool:
        """
        Return True when the exceeded error budget shows up in Redis.

        That is the case when the revoke signal is ``"1"`` or the state status
        is ``error_budget_exceeded`` or ``revoked``.
        """
        revoke_signal = self.redis.get(f"agent:{agent_id}:revoke_signal")
        state = self.redis.hgetall(f"agent:{agent_id}:state")
        return revoke_signal == "1" or state.get("status") in self._BUDGET_STATUSES


class ChaosTestRunner:
    """
    Runs chaos tests against the agent governance system.

    Each test seeds a throw-away agent in Redis, injects one chaos condition,
    evaluates a check function and removes the agent's keys again.

    NOTE(review): every check in ``run_all_tests`` confirms that the injected
    condition is visible in Redis; none of them waits for an agent process to
    react to it.
    """

    def __init__(self):
        self.injector = ChaosInjector()
        self.checker = ResilienceChecker()
        # Results of every run_test call made through this runner.
        self.results: List[ChaosResult] = []
        self.redis = redis.Redis(
            host=REDIS_HOST,
            port=REDIS_PORT,
            password=REDIS_PASSWORD,
            decode_responses=True
        )

    def setup_test_agent(self, agent_id: str):
        """Seed Redis with a healthy-looking agent: state, counters, lock, heartbeat."""
        # Initialize agent state
        self.redis.hset(f"agent:{agent_id}:state", mapping={
            "status": "running",
            "phase": "EXECUTE",
            "step": "1",
            "started_at": datetime.utcnow().isoformat()
        })

        # Initialize error counters
        self.redis.hset(f"agent:{agent_id}:errors", mapping={
            "total_errors": "0",
            "procedure_violations": "0"
        })

        # Lock is held by the agent itself and expires after 300 seconds
        self.redis.set(f"agent:{agent_id}:lock", agent_id, ex=300)

        # Heartbeat expires after 60 seconds
        self.redis.set(f"agent:{agent_id}:heartbeat", datetime.utcnow().isoformat(), ex=60)

        # Clear revocation signal
        self.redis.delete(f"agent:{agent_id}:revoke_signal")

    def cleanup_test_agent(self, agent_id: str):
        """Delete every ``agent:<agent_id>:*`` key."""
        # SCAN walks the keyspace incrementally; KEYS would block the server
        # for the duration of a full keyspace sweep.
        keys = list(self.redis.scan_iter(match=f"agent:{agent_id}:*"))
        if keys:
            self.redis.delete(*keys)

    def run_test(self, test_name: str, chaos_type: ChaosType,
                 agent_id: str, check_func: Callable, **chaos_kwargs) -> ChaosResult:
        """
        Run a single chaos test and record its result in ``self.results``.

        ``check_func`` receives the agent id and returns either a truthy/falsy
        value or a ``(detected, recovery_time_ms)`` tuple. The test passes when
        ``detected`` is truthy. The agent's keys are removed afterwards, also
        when setup, injection or the check raises.
        """
        try:
            # Setup lives inside the try so that a partially seeded agent is
            # cleaned up as well.
            self.setup_test_agent(agent_id)
            time.sleep(0.1)  # Let state stabilize

            event = self.injector.inject(chaos_type, agent_id, **chaos_kwargs)

            time.sleep(0.2)  # Allow time for detection

            outcome = check_func(agent_id)
            if isinstance(outcome, tuple):
                detected, recovery_time = outcome
            else:
                detected, recovery_time = outcome, None
            detected = bool(detected)

            result = ChaosResult(
                test_name=test_name,
                chaos_type=chaos_type,
                target_agent=agent_id,
                injected=True,
                detected=detected,
                recovered=recovery_time is not None,
                recovery_time_ms=recovery_time,
                # For every chaos type, detection is what counts as success.
                passed=detected,
                details={"event": str(event.details)}
            )
            self.results.append(result)
            return result
        finally:
            self.cleanup_test_agent(agent_id)

    def run_all_tests(self) -> Dict[str, Any]:
        """
        Run the full chaos test suite and print a per-test report.

        Returns a summary dict with ``total_tests``, ``passed``, ``failed``,
        ``success_rate`` and a ``results`` list describing the tests of this
        call only (``self.results`` keeps the history across calls).
        """
        print("\n" + "=" * 60)
        print("CHAOS TEST SUITE")
        print("=" * 60 + "\n")

        # (name, chaos type, agent id, check function, injector kwargs)
        tests = [
            (
                "Token Revocation Detection",
                ChaosType.REVOKE_TOKEN,
                "chaos-agent-001",
                self.checker.check_revocation_detected,
                {}
            ),
            (
                "Lock Expiration Handling",
                ChaosType.EXPIRE_LOCK,
                "chaos-agent-002",
                # Plain bool: this check measures no recovery, so it must not
                # report a recovery time.
                lambda a: self.redis.get(f"agent:{a}:lock") is None,
                {}
            ),
            (
                "State Corruption Detection",
                ChaosType.CORRUPT_STATE,
                "chaos-agent-003",
                lambda a: not self.checker.check_state_valid(a),
                {"field": "phase"}
            ),
            (
                "Heartbeat Timeout",
                ChaosType.HEARTBEAT_TIMEOUT,
                "chaos-agent-004",
                lambda a: self.redis.get(f"agent:{a}:heartbeat") is None,
                {}
            ),
            (
                "Error Budget Exceeded",
                ChaosType.ERROR_BUDGET_EXCEED,
                "chaos-agent-005",
                lambda a: int(self.redis.hget(f"agent:{a}:errors", "total_errors") or 0) >= 10,
                {}
            ),
            (
                "Random Error Injection",
                ChaosType.RANDOM_ERROR,
                "chaos-agent-006",
                lambda a: int(self.redis.hget(f"agent:{a}:errors", "total_errors") or 0) > 0,
                {}
            ),
        ]

        # Results of this call only, so the summary stays consistent with
        # total_tests even when the runner is reused.
        run_results: List[ChaosResult] = []

        for test_name, chaos_type, agent_id, check_func, kwargs in tests:
            result = self.run_test(test_name, chaos_type, agent_id, check_func, **kwargs)
            run_results.append(result)

            status = "✓ PASS" if result.passed else "✗ FAIL"
            if result.recovery_time_ms is not None:
                recovery = f" (recovered in {result.recovery_time_ms:.0f}ms)"
            else:
                recovery = ""

            print(f"  {status}: {test_name}{recovery}")

        passed = sum(1 for r in run_results if r.passed)
        failed = len(run_results) - passed

        # Summary
        print(f"\n{'='*60}")
        print(f"RESULTS: {passed}/{passed+failed} passed")
        print(f"{'='*60}\n")

        return {
            "total_tests": len(tests),
            "passed": passed,
            "failed": failed,
            "success_rate": passed / len(tests) if tests else 0,
            "results": [
                {
                    "test": r.test_name,
                    "chaos_type": r.chaos_type.value,
                    "passed": r.passed,
                    "detected": r.detected,
                    "recovery_ms": r.recovery_time_ms
                }
                for r in run_results
            ]
        }


class ChaosMonkey:
    """
    Continuous chaos injection for stress testing.

    Randomly injects chaos conditions into randomly chosen target agents for
    a fixed duration, pausing 0.5-2.0 seconds between injections.
    """

    def __init__(self, target_agents: List[str], duration_seconds: int = 60):
        self.target_agents = target_agents
        self.duration = duration_seconds
        self.injector = ChaosInjector()
        # True while start() is looping; stop() clears it.
        self.running = False
        # Number of events injected by the most recent start() call.
        self.events_injected = 0

    def start(self):
        """
        Inject chaos until the duration elapses or ``stop`` is called.

        Blocks the calling thread. No event is injected after the deadline or
        after ``stop``; ``running`` is False again once this method returns.

        Raises:
            ValueError: if there are no target agents to pick from.
        """
        if not self.target_agents:
            # Fail up front instead of with an IndexError after the first sleep.
            raise ValueError("ChaosMonkey needs at least one target agent")

        self.running = True
        self.events_injected = 0

        print(f"\n🐵 Chaos Monkey started (duration: {self.duration}s)")
        print(f"   Target agents: {', '.join(self.target_agents)}")

        # Monotonic clock: the deadline must not move with wall-clock changes.
        deadline = time.monotonic() + self.duration

        try:
            while self.running:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    break

                # Random delay between chaos events, never sleeping past the deadline
                time.sleep(min(random.uniform(0.5, 2.0), remaining))

                # stop() or the deadline may have arrived during the sleep
                if not self.running or time.monotonic() >= deadline:
                    break

                # Pick random agent and chaos type
                agent = random.choice(self.target_agents)
                chaos_type = random.choice(list(ChaosType))

                # Inject chaos
                self.injector.inject(chaos_type, agent)
                self.events_injected += 1

                print(f"   🔥 Injected {chaos_type.value} into {agent}")
        finally:
            self.running = False

        print(f"\n🐵 Chaos Monkey stopped ({self.events_injected} events injected)")

    def stop(self):
        """Ask a running ``start`` loop to finish; takes effect after its current sleep."""
        self.running = False


def main():
    """
    Run the chaos suite, print a per-test breakdown and return an exit code.

    Returns 0 when no test failed and 1 otherwise.
    """
    summary = ChaosTestRunner().run_all_tests()

    print("\nDetailed Results:")
    print("-" * 40)
    for entry in summary["results"]:
        label = "PASS" if entry["passed"] else "FAIL"
        print(f"  [{label}] {entry['test']}")
        print(f"       Type: {entry['chaos_type']}, Detected: {entry['detected']}")

    if summary["failed"] == 0:
        return 0
    return 1


if __name__ == "__main__":
    import sys

    # Use main()'s return value as the process exit status.
    exit_code = main()
    sys.exit(exit_code)