# agent-governance/tests/test_auto_recovery.py

#!/usr/bin/env python3
"""
Auto-Recovery Test for Consensus Failures

Tests the system's ability to automatically recover from consensus failures
by spawning new pipelines with the collected failure context.

Exit codes:
  - 0: All tests passed
  - 1: Some tests failed
  - 2: Consensus failure (triggers auto-recovery)
  - 3: Aborted (timeout/stuck - triggers auto-recovery)

Note: main() in this file only ever returns 0 or 1; codes 2 and 3 are not
produced by this script.
"""

import asyncio
import json
import os
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path

import redis

# Connection settings for the Redis-compatible store used by the tests.
# Each value can be overridden through an environment variable of the same
# name; the defaults are the previously hard-coded local-development values.
REDIS_HOST = os.environ.get("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.environ.get("REDIS_PORT", "6379"))
# NOTE(review): the default password is still committed to source control;
# prefer supplying REDIS_PASSWORD through the environment.
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD", "governance2026")
# Base URL of the UI service; not referenced elsewhere in the visible code.
UI_BASE_URL = os.environ.get("UI_BASE_URL", "http://127.0.0.1:3000")


class AutoRecoveryTester:
    """Exercise the store-side state used by consensus-failure auto-recovery.

    Each ``test_*`` method writes ``pipeline:<id>`` hashes (and, where
    relevant, a ``pipeline:<id>:failure_context`` string) through
    ``self.redis``, reads the state back, and returns a result dict that
    always contains the keys ``"test"`` and ``"passed"``.  On an unexpected
    exception the result additionally carries an ``"error"`` message.

    ``self.results`` holds the result dicts of the most recent
    ``run_all_tests`` call.
    """

    # run_number at which test_max_recovery_limit expects no further recovery.
    MAX_RECOVERY_RUNS = 3
    # Seconds without progress after which test_stuck_detection flags a pipeline.
    STUCK_TIMEOUT_SECONDS = 60

    def __init__(self):
        """Create the Redis client from the module-level REDIS_* settings."""
        self.redis = redis.Redis(
            host=REDIS_HOST,
            port=REDIS_PORT,
            password=REDIS_PASSWORD,
            # Return str instead of bytes so values compare against literals.
            decode_responses=True
        )
        self.results = []

    # ------------------------------------------------------------------
    # Small private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _utc_now_iso() -> str:
        """Return the current UTC time as a naive ISO-8601 string.

        Produces the same format as ``datetime.utcnow().isoformat()`` without
        calling the deprecated ``utcnow``.
        """
        from datetime import timezone

        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    @staticmethod
    def _new_id(prefix: str) -> str:
        """Return ``<prefix>-<unix seconds>`` for use as a test pipeline id."""
        return f"{prefix}-{int(time.time())}"

    @staticmethod
    def _error_result(test_name: str, error) -> dict:
        """Build the failed-result dict used when a test cannot complete."""
        return {"test": test_name, "passed": False, "error": str(error)}

    # ------------------------------------------------------------------
    # Building blocks used by the tests
    # ------------------------------------------------------------------

    def log(self, msg: str, level: str = "INFO"):
        """Print ``msg`` prefixed with a UTC ``HH:MM:SS`` timestamp and ``level``."""
        ts = time.strftime("%H:%M:%S", time.gmtime())
        print(f"[{ts}] [{level}] {msg}")

    def setup_mock_pipeline(self, pipeline_id: str, task_id: str, objective: str) -> bool:
        """Write a RUNNING mock pipeline hash at ``pipeline:<pipeline_id>``.

        Returns:
            True when the hash was written, False when the store call raised
            (the error is logged, not propagated).
        """
        try:
            self.redis.hset(f"pipeline:{pipeline_id}", mapping={
                "task_id": task_id,
                "objective": objective,
                "status": "RUNNING",
                "created_at": self._utc_now_iso(),
                "agents": json.dumps(["ALPHA", "BETA"]),
                "run_number": "1"
            })
            self.log(f"Created mock pipeline: {pipeline_id}")
            return True
        except Exception as e:
            self.log(f"Failed to create pipeline: {e}", "ERROR")
            return False

    def simulate_consensus_failure(self, pipeline_id: str) -> bool:
        """Mark a pipeline as CONSENSUS_FAILED and store its failure context.

        Updates the ``status`` and ``consensus`` fields of the pipeline hash
        and writes a JSON document to ``pipeline:<id>:failure_context``.

        Returns:
            True on success, False when a store call raised (the error is
            logged, not propagated).
        """
        # The same two proposals are recorded in both the hash and the context.
        proposals = [
            {"agent": "ALPHA", "proposal": "Solution A", "score": 0.6},
            {"agent": "BETA", "proposal": "Solution B", "score": 0.5}
        ]
        try:
            self.redis.hset(f"pipeline:{pipeline_id}", mapping={
                "status": "CONSENSUS_FAILED",
                "consensus": json.dumps({
                    "achieved": False,
                    "proposals": proposals,
                    "conflicts": [
                        {"type": "approach", "agents": ["ALPHA", "BETA"]}
                    ]
                })
            })

            failure_context = {
                "pipeline_id": pipeline_id,
                "failure_reason": "consensus_failed",
                "proposals": proposals,
                "conflicts": [{"type": "approach"}],
                "iteration_count": 5,
                "run_number": 1,
                "timestamp": self._utc_now_iso()
            }
            self.redis.set(
                f"pipeline:{pipeline_id}:failure_context",
                json.dumps(failure_context)
            )

            self.log(f"Simulated consensus failure for pipeline: {pipeline_id}")
            return True
        except Exception as e:
            self.log(f"Failed to simulate failure: {e}", "ERROR")
            return False

    def check_recovery_pipeline_created(self, original_pipeline_id: str, timeout: float = 5.0) -> dict:
        """Poll until the original pipeline references an existing recovery pipeline.

        Every 0.2 s the ``recovery_pipeline`` field of the original hash is
        read; the lookup succeeds once that field is set and the referenced
        ``pipeline:<recovery_id>`` hash is non-empty.

        Returns:
            On success ``{"found": True, "recovery_pipeline_id", "recovery_data",
            "elapsed_ms"}``.  After ``timeout`` seconds
            ``{"found": False, "elapsed_ms": timeout * 1000}``, plus an
            ``"error"`` entry when a store call raised during polling.
        """
        # monotonic() is immune to wall-clock adjustments during the wait.
        start = time.monotonic()
        last_error = None
        while time.monotonic() - start < timeout:
            try:
                recovery_id = self.redis.hget(f"pipeline:{original_pipeline_id}", "recovery_pipeline")
                if recovery_id:
                    recovery_data = self.redis.hgetall(f"pipeline:{recovery_id}")
                    if recovery_data:
                        return {
                            "found": True,
                            "recovery_pipeline_id": recovery_id,
                            "recovery_data": recovery_data,
                            "elapsed_ms": (time.monotonic() - start) * 1000
                        }
            except Exception as e:
                # Keep polling (best effort) but remember the failure so it
                # is reported once instead of being silently dropped.
                last_error = e
            time.sleep(0.2)

        result = {"found": False, "elapsed_ms": timeout * 1000}
        if last_error is not None:
            self.log(f"Recovery lookup for {original_pipeline_id} hit an error: {last_error}", "WARN")
            result["error"] = str(last_error)
        return result

    def verify_failure_context_passed(self, recovery_pipeline_id: str) -> dict:
        """Summarise the recovery-related fields of a recovery pipeline hash.

        Returns:
            A dict with ``has_prior_context``, ``prior_pipeline``,
            ``run_number`` and ``force_gamma`` (True only when the stored
            value is the string ``"true"``), or ``{"error": ...}`` when the
            store call raised.
        """
        try:
            data = self.redis.hgetall(f"pipeline:{recovery_pipeline_id}")
            return {
                "has_prior_context": data.get("prior_context") is not None,
                "prior_pipeline": data.get("prior_pipeline"),
                "run_number": data.get("run_number"),
                "force_gamma": data.get("force_gamma") == "true"
            }
        except Exception as e:
            return {"error": str(e)}

    def cleanup_test_pipelines(self, pipeline_ids: list):
        """Delete the hash and all ``pipeline:<id>:*`` sub-keys of each pipeline.

        Best effort: a failure for one pipeline is logged and the remaining
        ids are still processed; nothing is raised.
        """
        for pid in pipeline_ids:
            try:
                # Exact hash key plus its sub-keys. SCAN is used instead of
                # KEYS so the server is not blocked, and the ":" separator
                # keeps ids that merely share a prefix from being deleted.
                keys = [f"pipeline:{pid}"]
                keys.extend(self.redis.scan_iter(match=f"pipeline:{pid}:*"))
                self.redis.delete(*keys)
                self.log(f"Cleaned up pipeline: {pid}")
            except Exception as e:
                self.log(f"Failed to clean up pipeline {pid}: {e}", "WARN")

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    def test_consensus_failure_detection(self) -> dict:
        """Test 1: a simulated failure leaves CONSENSUS_FAILED status and a stored context."""
        test_name = "Consensus Failure Detection"
        pipeline_id = self._new_id("test-recovery")

        try:
            if not self.setup_mock_pipeline(pipeline_id, "test-task-001", "Test consensus failure detection"):
                return self._error_result(test_name, "failed to create mock pipeline")
            if not self.simulate_consensus_failure(pipeline_id):
                return self._error_result(test_name, "failed to simulate consensus failure")

            status = self.redis.hget(f"pipeline:{pipeline_id}", "status")
            failure_ctx = self.redis.get(f"pipeline:{pipeline_id}:failure_context")
            context_stored = failure_ctx is not None

            return {
                "test": test_name,
                "passed": status == "CONSENSUS_FAILED" and context_stored,
                "status_detected": status,
                "failure_context_stored": context_stored
            }
        except Exception as e:
            return self._error_result(test_name, e)
        finally:
            # Runs on every exit path so failed tests do not leak keys.
            self.cleanup_test_pipelines([pipeline_id])

    def test_max_recovery_limit(self) -> dict:
        """Test 2: a pipeline already at the max run number gets no recovery link.

        Writes a CONSENSUS_FAILED pipeline with ``run_number`` equal to
        ``MAX_RECOVERY_RUNS``, waits 0.5 s, and passes when the hash has no
        ``recovery_pipeline`` field.
        """
        test_name = "Max Recovery Limit"
        pipeline_id = self._new_id("test-max-recovery")
        max_runs = self.MAX_RECOVERY_RUNS

        try:
            self.redis.hset(f"pipeline:{pipeline_id}", mapping={
                "task_id": "test-task",
                "objective": "Test max recovery",
                "status": "CONSENSUS_FAILED",
                "run_number": str(max_runs)
            })
            self.redis.set(
                f"pipeline:{pipeline_id}:failure_context",
                json.dumps({
                    "run_number": max_runs,
                    "failure_reason": "consensus_failed",
                    "proposals": [],
                    "conflicts": []
                })
            )

            # Leave a short window in which a recovery could be attached.
            time.sleep(0.5)
            recovery_id = self.redis.hget(f"pipeline:{pipeline_id}", "recovery_pipeline")

            return {
                "test": test_name,
                "passed": recovery_id is None,
                "run_number": max_runs,
                "recovery_attempted": recovery_id is not None
            }
        except Exception as e:
            return self._error_result(test_name, e)
        finally:
            self.cleanup_test_pipelines([pipeline_id])

    def test_failure_context_structure(self) -> dict:
        """Test 3: a sample failure context contains every required field.

        Pure in-memory check; the store is not touched.
        """
        test_name = "Failure Context Structure"
        required_fields = [
            "pipeline_id",
            "failure_reason",
            "proposals",
            "conflicts",
            "run_number",
            "timestamp"
        ]

        try:
            sample_context = {
                "pipeline_id": "test-123",
                "failure_reason": "consensus_failed",
                "proposals": [
                    {"agent": "ALPHA", "proposal": "Test", "score": 0.5}
                ],
                "conflicts": [{"type": "approach"}],
                "iteration_count": 5,
                "run_number": 1,
                "timestamp": self._utc_now_iso()
            }

            missing = [name for name in required_fields if name not in sample_context]
            passed = not missing

            return {
                "test": test_name,
                "passed": passed,
                "required_fields": required_fields,
                "missing_fields": missing,
                "sample_valid": passed
            }
        except Exception as e:
            return self._error_result(test_name, e)

    def test_gamma_force_on_recovery(self) -> dict:
        """Test 4: a recovery pipeline carries ``force_gamma`` and is linked back.

        Creates an original and a recovery pipeline, links them through the
        ``recovery_pipeline`` field, and passes when both the flag and the
        link can be read back from the store.
        """
        test_name = "GAMMA Force on Recovery"
        pipeline_id = self._new_id("test-gamma")
        recovery_id = self._new_id("test-gamma-recovery")

        try:
            if not self.setup_mock_pipeline(pipeline_id, "task-gamma", "Test GAMMA forcing"):
                return self._error_result(test_name, "failed to create mock pipeline")

            self.redis.hset(f"pipeline:{recovery_id}", mapping={
                "task_id": "task-gamma",
                "objective": "[RECOVERY] Test GAMMA forcing",
                "status": "SPAWNED",
                "prior_pipeline": pipeline_id,
                "run_number": "2",
                "force_gamma": "true"
            })
            self.redis.hset(f"pipeline:{pipeline_id}", "recovery_pipeline", recovery_id)

            force_gamma_set = self.redis.hget(f"pipeline:{recovery_id}", "force_gamma") == "true"
            # Read the link back instead of assuming the write succeeded.
            recovery_linked = self.redis.hget(f"pipeline:{pipeline_id}", "recovery_pipeline") == recovery_id

            return {
                "test": test_name,
                "passed": force_gamma_set and recovery_linked,
                "force_gamma_set": force_gamma_set,
                "recovery_linked": recovery_linked
            }
        except Exception as e:
            return self._error_result(test_name, e)
        finally:
            self.cleanup_test_pipelines([pipeline_id, recovery_id])

    def test_iteration_timeout_abort(self) -> dict:
        """Test 5: an ABORTED pipeline with ``iteration_limit`` reason reads back intact."""
        test_name = "Iteration Timeout Abort"
        pipeline_id = self._new_id("test-timeout")

        try:
            self.redis.hset(f"pipeline:{pipeline_id}", mapping={
                "task_id": "task-timeout",
                "objective": "Test timeout",
                "status": "ABORTED",
                "iteration_count": "12",
                "max_iterations": "10",
                "abort_reason": "iteration_limit"
            })

            status = self.redis.hget(f"pipeline:{pipeline_id}", "status")
            reason = self.redis.hget(f"pipeline:{pipeline_id}", "abort_reason")

            return {
                "test": test_name,
                "passed": status == "ABORTED" and reason == "iteration_limit",
                "status": status,
                "abort_reason": reason
            }
        except Exception as e:
            return self._error_result(test_name, e)
        finally:
            self.cleanup_test_pipelines([pipeline_id])

    def test_stuck_detection(self) -> dict:
        """Test 6: a pipeline whose last progress is 2 minutes old counts as stuck.

        Stores ``last_progress`` as a Unix epoch value, reads it back, and
        passes when the age exceeds ``STUCK_TIMEOUT_SECONDS``.
        """
        test_name = "Stuck Agent Detection"
        pipeline_id = self._new_id("test-stuck")

        try:
            # time.time() is a true Unix epoch; a naive utcnow().timestamp()
            # would be shifted by the local UTC offset.
            old_time = time.time() - 120  # 2 minutes ago

            self.redis.hset(f"pipeline:{pipeline_id}", mapping={
                "task_id": "task-stuck",
                "objective": "Test stuck detection",
                "status": "RUNNING",
                "last_progress": str(old_time)
            })

            # A missing field is treated as epoch 0, i.e. maximally stale.
            last_progress = float(self.redis.hget(f"pipeline:{pipeline_id}", "last_progress") or 0)
            idle_seconds = time.time() - last_progress
            stuck = idle_seconds > self.STUCK_TIMEOUT_SECONDS

            return {
                "test": test_name,
                "passed": stuck,
                "time_since_progress_seconds": idle_seconds,
                "stuck_detected": stuck
            }
        except Exception as e:
            return self._error_result(test_name, e)
        finally:
            self.cleanup_test_pipelines([pipeline_id])

    def run_all_tests(self) -> dict:
        """Run every test, print a PASS/FAIL line for each, and return a summary.

        ``self.results`` is reset at the start so the returned ``results``
        list always matches ``total_tests`` for this run.

        Returns:
            A dict with ``total_tests``, ``passed``, ``failed``,
            ``success_rate`` (0 when there are no tests) and ``results``.
        """
        print("\n" + "=" * 60)
        print("AUTO-RECOVERY TEST SUITE")
        print("=" * 60 + "\n")

        tests = [
            self.test_consensus_failure_detection,
            self.test_max_recovery_limit,
            self.test_failure_context_structure,
            self.test_gamma_force_on_recovery,
            self.test_iteration_timeout_abort,
            self.test_stuck_detection,
        ]

        self.results = []
        passed = 0
        failed = 0

        for test_func in tests:
            result = test_func()
            self.results.append(result)

            if result["passed"]:
                passed += 1
                print(f"  ✓ PASS: {result['test']}")
            else:
                failed += 1
                print(f"  ✗ FAIL: {result['test']}")
                if "error" in result:
                    print(f"       Error: {result['error']}")

        print(f"\n{'='*60}")
        print(f"RESULTS: {passed}/{passed+failed} passed")
        print(f"{'='*60}\n")

        return {
            "total_tests": len(tests),
            "passed": passed,
            "failed": failed,
            "success_rate": passed / len(tests) if tests else 0,
            "results": self.results
        }


def main():
    """Run the suite, print a per-test breakdown, and return the exit status.

    Returns:
        0 when the summary reports no failed tests, otherwise 1.
    """
    summary = AutoRecoveryTester().run_all_tests()

    print("\nDetailed Results:")
    print("-" * 40)

    # These two keys are already shown on the headline line of each entry.
    headline_keys = ("test", "passed")
    for entry in summary["results"]:
        label = "PASS" if entry["passed"] else "FAIL"
        print(f"  [{label}] {entry['test']}")
        for key, value in entry.items():
            if key in headline_keys:
                continue
            print(f"       {key}: {value}")

    if summary["failed"] == 0:
        return 0
    return 1


# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    raise SystemExit(main())