agent-governance/tests/test_e2e_auto_recovery.py
commit c96919fe35 Implement real auto-recovery with handoff chain
Orchestrator changes:
- Add dumpAgentHandoff() to dump proposals/analysis before abort
- Add loadRecoveryContext() to load inherited context on recovery runs
- Add preseedBlackboard() to pre-seed inherited proposals
- Force-spawn GAMMA immediately on recovery runs
- Track isRecoveryRun, recoveryAttempt, inheritedContext, forceGamma

Server changes:
- Update recordConsensusFailure() to read orchestrator handoff JSON
- Add collectFromBlackboard() helper as fallback
- Update triggerAutoRecovery() with comprehensive context passing
- Store inherited_handoff reference for recovery pipelines
- Track retry_count, abort_reason, handoff_ref in recovery:* keys
- Add recovery badge and prior pipeline link in UI

Test coverage:
- test_auto_recovery.py: 6 unit tests
- test_e2e_auto_recovery.py: 5 E2E tests (handoff dump, recovery
  pipeline creation, inherited context, retry tracking, status update)

Redis tracking keys:
- handoff:{pipeline_id}:agents - orchestrator dumps proposals here
- handoff:{recovery_id}:inherited - recovery pipeline inherits from
- recovery:{pipeline_id} - retry_count, abort_reason, handoff_ref

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 19:39:52 -05:00

519 lines
19 KiB
Python

#!/usr/bin/env python3
"""
End-to-End Auto-Recovery Test
Tests the complete auto-recovery flow:
1. Start a pipeline
2. Simulate iteration_limit abort
3. Verify handoff JSON is dumped
4. Verify recovery pipeline is spawned
5. Verify inherited context is loaded
6. Track retry_count and abort_reason in Redis
This test requires the UI server to be running on localhost:3000
"""
import asyncio
import json
import time
import subprocess
import sys
import os
from datetime import datetime
from pathlib import Path
import redis
import requests
# Connection settings for the local Redis instance backing the governance stack.
REDIS_HOST = "127.0.0.1"
REDIS_PORT = 6379
REDIS_PASSWORD = "governance2026"
# Base URL of the UI server; the module docstring says it must be running on
# localhost:3000 for the full E2E flow (not contacted directly in this file).
UI_BASE_URL = "http://127.0.0.1:3000"
class E2EAutoRecoveryTest:
    """End-to-end auto-recovery test runner.

    Drives the complete recovery flow directly against Redis:

    1. Create a RUNNING test pipeline.
    2. Simulate an ``iteration_limit`` abort, including the handoff JSON
       dump the orchestrator performs before exiting.
    3. Trigger auto-recovery the way the server does on exit code 3.
    4. Verify the handoff dump, recovery-pipeline creation, inherited
       context, retry tracking, and the original pipeline's status.
    """

    def __init__(self):
        # decode_responses=True so hget/hgetall return str values, matching
        # the plain string comparisons used in the verify_* checks below.
        self.redis = redis.Redis(
            host=REDIS_HOST,
            port=REDIS_PORT,
            password=REDIS_PASSWORD,
            decode_responses=True
        )
        self.test_pipeline_id = None  # set by setup_test_pipeline()
        self.test_results = []        # reserved for accumulated results

    def log(self, msg: str, level: str = "INFO"):
        """Log a message with a millisecond-precision UTC timestamp."""
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
        # kept for output compatibility with the rest of the suite.
        ts = datetime.utcnow().strftime("%H:%M:%S.%f")[:-3]
        print(f"[{ts}] [{level}] {msg}")

    def setup_test_pipeline(self) -> str:
        """Create a RUNNING test pipeline hash in Redis and return its id."""
        pipeline_id = f"test-e2e-recovery-{int(time.time())}"
        task_id = f"task-e2e-{int(time.time())}"
        # Create pipeline with low iteration limit to trigger abort quickly.
        self.redis.hset(f"pipeline:{pipeline_id}", mapping={
            "task_id": task_id,
            "objective": "Test auto-recovery on iteration_limit",
            "status": "RUNNING",
            "created_at": datetime.utcnow().isoformat(),
            "agents": json.dumps(["ALPHA", "BETA"]),
            "run_number": "1",
            "model": "anthropic/claude-sonnet-4",
            "timeout": "30"
        })
        self.log(f"Created test pipeline: {pipeline_id}")
        self.test_pipeline_id = pipeline_id
        return pipeline_id

    def simulate_orchestrator_abort(self, pipeline_id: str) -> bool:
        """Simulate an orchestrator abort due to iteration_limit.

        Writes agent proposals and states to the blackboard, then dumps the
        handoff JSON under ``handoff:{pipeline_id}:agents`` exactly as the
        orchestrator would before aborting. Returns True on success.
        """
        try:
            task_id = self.redis.hget(f"pipeline:{pipeline_id}", "task_id")
            # Simulate agent proposals being written to blackboard.
            proposals = [
                {"agent": "ALPHA", "key": "proposal_1", "value": {"solution": "Solution A approach"}, "version": 1},
                {"agent": "BETA", "key": "proposal_1", "value": {"solution": "Solution B approach"}, "version": 1},
            ]
            for p in proposals:
                self.redis.hset(
                    f"blackboard:{task_id}:solutions",
                    f"{p['agent']}_{p['key']}",
                    json.dumps(p)
                )
            # Simulate agent state.
            self.redis.hset(f"agents:{task_id}", mapping={
                "ALPHA": json.dumps({"role": "ALPHA", "status": "WORKING", "progress": 0.7}),
                "BETA": json.dumps({"role": "BETA", "status": "WORKING", "progress": 0.6}),
            })
            # Simulate handoff dump (what orchestrator does before abort).
            handoff_key = f"handoff:{pipeline_id}:agents"
            handoff = {
                "pipeline_id": pipeline_id,
                "task_id": task_id,
                "dump_time": datetime.utcnow().isoformat(),
                "iteration_count": 12,   # deliberately past max_iterations
                "max_iterations": 10,
                "gamma_active": False,   # GAMMA never spawned -> recovery must force it
                "proposals": proposals,
                "synthesis_attempts": [
                    {"agent": "ALPHA", "key": "synthesis_1", "value": {"merged": "Combined approach"}}
                ],
                "consensus_state": [],
                "problem_analysis": [
                    {"key": "analysis", "value": {"complexity_score": 0.85}, "author": "ALPHA"}
                ],
                "agent_states": [
                    {"role": "ALPHA", "status": "WORKING", "progress": 0.7},
                    {"role": "BETA", "status": "WORKING", "progress": 0.6},
                ],
                "message_summary": {
                    "alpha_last_messages": [],
                    "beta_last_messages": [],
                    "gamma_last_messages": []
                },
                "recovery_hints": [
                    "Iteration limit (10) exceeded after 12 iterations",
                    "GAMMA was not spawned",
                    "2 proposals generated, 1 synthesis attempts"
                ]
            }
            # 24h TTL mirrors the orchestrator's handoff retention.
            self.redis.set(handoff_key, json.dumps(handoff), ex=86400)
            self.redis.hset(f"pipeline:{pipeline_id}", "handoff_key", handoff_key)
            self.redis.hset(f"pipeline:{pipeline_id}", "handoff_time", handoff["dump_time"])
            self.log(f"Simulated handoff dump: {len(proposals)} proposals")
            return True
        except Exception as e:
            self.log(f"Failed to simulate abort: {e}", "ERROR")
            return False

    def trigger_auto_recovery(self, pipeline_id: str) -> dict:
        """Trigger auto-recovery by simulating the orchestration completion with abort.

        Mirrors the server's exit-code-3 path: records the consensus-failure
        context, creates a recovery pipeline that inherits the handoff, flips
        the original pipeline to REBOOTING, and writes ``recovery:*`` metrics.

        Returns a dict with ``success`` plus ``recovery_pipeline_id`` and
        ``proposals_inherited`` on success, or ``error`` on failure.
        """
        try:
            task_id = self.redis.hget(f"pipeline:{pipeline_id}", "task_id")
            objective = self.redis.hget(f"pipeline:{pipeline_id}", "objective")
            # Set abort state.
            self.redis.hset(f"pipeline:{pipeline_id}", mapping={
                "status": "ABORTED",
                "final_consensus": "false",
                "abort_reason": "iteration_limit"
            })
            # Call the failure context recording.
            metrics = {
                "abort_reason": "iteration_limit",
                "iteration_count": 12,
                "gamma_spawned": False
            }
            # Simulate what the server does on exit code 3:
            # record failure context, reading proposals back out of the
            # handoff dump ("{}" fallback if the dump is missing/expired).
            failure_context = {
                "pipeline_id": pipeline_id,
                "task_id": task_id,
                "objective": objective,
                "failure_time": datetime.utcnow().isoformat(),
                "metrics": metrics,
                "proposals": json.loads(self.redis.get(f"handoff:{pipeline_id}:agents") or "{}").get("proposals", []),
                "agent_states": [],
                "conflict_history": [],
                "blackboard_snapshot": {},
                "run_number": 1,
                "handoff_ref": f"handoff:{pipeline_id}:agents"
            }
            # Store failure context.
            failure_key = f"consensus_failure:{pipeline_id}:run_1"
            self.redis.set(failure_key, json.dumps(failure_context))
            self.redis.rpush(f"consensus_failures:{pipeline_id}", failure_key)
            # Create recovery pipeline.
            recovery_id = f"pipeline-recovery-{int(time.time() * 1000)}"
            context_summary = {
                "prior_run": 1,
                "prior_pipeline": pipeline_id,
                "handoff_ref": f"handoff:{pipeline_id}:agents",
                "failure_reason": "iteration_limit",
                "iteration_count": 12,
                # Cap at 5 proposals to keep the summary bounded.
                "prior_proposals": failure_context["proposals"][:5],
                "recovery_hints": [
                    "Previous run aborted after 12 iterations",
                    "GAMMA was not spawned - will be forced this time",
                    "2 proposals were generated"
                ]
            }
            # Store inherited handoff (24h TTL, same as the source dump).
            inherited_key = f"handoff:{recovery_id}:inherited"
            self.redis.set(inherited_key, json.dumps({
                "from_pipeline": pipeline_id,
                "from_handoff": f"handoff:{pipeline_id}:agents",
                "inherited_at": datetime.utcnow().isoformat(),
                "proposals": failure_context["proposals"],
                "recovery_hints": context_summary["recovery_hints"]
            }), ex=86400)
            # Create recovery pipeline hash.
            self.redis.hset(f"pipeline:{recovery_id}", mapping={
                "task_id": task_id,
                "objective": f"[RECOVERY ATTEMPT 2] [FORCE GAMMA] {objective}",
                "status": "STARTING",
                "created_at": datetime.utcnow().isoformat(),
                "agents": json.dumps([]),
                "parent_pipeline": pipeline_id,
                "is_recovery": "true",
                "recovery_attempt": "2",
                "run_number": "2",
                "prior_context": json.dumps(context_summary),
                "inherited_handoff": inherited_key,
                "force_gamma": "true",
                "model": "anthropic/claude-sonnet-4",
                "timeout": "60",
                "auto_continue": "true"
            })
            # Update original pipeline to point at its recovery run.
            self.redis.hset(f"pipeline:{pipeline_id}", mapping={
                "status": "REBOOTING",
                "recovery_pipeline": recovery_id,
                "recovery_triggered_at": datetime.utcnow().isoformat()
            })
            # Track recovery metrics under recovery:{pipeline_id}.
            self.redis.hset(f"recovery:{pipeline_id}", mapping={
                "retry_count": "2",
                "abort_reason": "iteration_limit",
                "latest_recovery": recovery_id,
                "handoff_ref": f"handoff:{pipeline_id}:agents",
                "proposals_passed": str(len(failure_context["proposals"])),
                "last_attempt": datetime.utcnow().isoformat()
            })
            self.log(f"Created recovery pipeline: {recovery_id}")
            return {
                "success": True,
                "recovery_pipeline_id": recovery_id,
                "proposals_inherited": len(failure_context["proposals"])
            }
        except Exception as e:
            self.log(f"Failed to trigger auto-recovery: {e}", "ERROR")
            return {"success": False, "error": str(e)}

    def verify_handoff_dump(self, pipeline_id: str) -> dict:
        """Verify the handoff JSON was properly dumped.

        Returns a result dict with ``test``, ``passed``, and either per-field
        ``checks`` or an ``error`` message.
        """
        test_name = "Handoff Dump Verification"
        try:
            handoff_key = f"handoff:{pipeline_id}:agents"
            handoff_data = self.redis.get(handoff_key)
            if not handoff_data:
                return {"test": test_name, "passed": False, "error": "No handoff data found"}
            handoff = json.loads(handoff_data)
            checks = {
                "has_proposals": len(handoff.get("proposals", [])) > 0,
                "has_iteration_count": "iteration_count" in handoff,
                "has_agent_states": "agent_states" in handoff,
                "has_recovery_hints": len(handoff.get("recovery_hints", [])) > 0,
                "has_dump_time": "dump_time" in handoff
            }
            passed = all(checks.values())
            return {
                "test": test_name,
                "passed": passed,
                "checks": checks,
                "proposals_count": len(handoff.get("proposals", [])),
                "iteration_count": handoff.get("iteration_count")
            }
        except Exception as e:
            return {"test": test_name, "passed": False, "error": str(e)}

    def verify_recovery_pipeline(self, original_id: str) -> dict:
        """Verify a recovery pipeline was properly created for *original_id*."""
        test_name = "Recovery Pipeline Creation"
        try:
            recovery_id = self.redis.hget(f"pipeline:{original_id}", "recovery_pipeline")
            if not recovery_id:
                return {"test": test_name, "passed": False, "error": "No recovery pipeline found"}
            recovery_data = self.redis.hgetall(f"pipeline:{recovery_id}")
            checks = {
                "is_recovery_flag": recovery_data.get("is_recovery") == "true",
                "has_parent_pipeline": recovery_data.get("parent_pipeline") == original_id,
                "has_force_gamma": recovery_data.get("force_gamma") == "true",
                "has_inherited_handoff": "inherited_handoff" in recovery_data,
                "has_prior_context": "prior_context" in recovery_data,
                # A non-numeric run_number raises ValueError here and is
                # surfaced via the except branch as a failed test.
                "run_number_incremented": int(recovery_data.get("run_number", 0)) > 1
            }
            passed = all(checks.values())
            return {
                "test": test_name,
                "passed": passed,
                "recovery_pipeline_id": recovery_id,
                "checks": checks,
                "run_number": recovery_data.get("run_number")
            }
        except Exception as e:
            return {"test": test_name, "passed": False, "error": str(e)}

    def verify_inherited_context(self, recovery_id: str) -> dict:
        """Verify the recovery pipeline properly inherited context."""
        test_name = "Inherited Context Verification"
        try:
            inherited_key = self.redis.hget(f"pipeline:{recovery_id}", "inherited_handoff")
            if not inherited_key:
                return {"test": test_name, "passed": False, "error": "No inherited handoff key"}
            inherited_data = self.redis.get(inherited_key)
            if not inherited_data:
                return {"test": test_name, "passed": False, "error": "Inherited data not found"}
            inherited = json.loads(inherited_data)
            checks = {
                "has_from_pipeline": "from_pipeline" in inherited,
                "has_proposals": len(inherited.get("proposals", [])) > 0,
                "has_recovery_hints": len(inherited.get("recovery_hints", [])) > 0,
                "has_inherited_at": "inherited_at" in inherited
            }
            passed = all(checks.values())
            return {
                "test": test_name,
                "passed": passed,
                "checks": checks,
                "proposals_inherited": len(inherited.get("proposals", []))
            }
        except Exception as e:
            return {"test": test_name, "passed": False, "error": str(e)}

    def verify_retry_tracking(self, original_id: str) -> dict:
        """Verify retry_count, abort_reason, and handoff references are tracked."""
        test_name = "Retry Tracking Verification"
        try:
            recovery_data = self.redis.hgetall(f"recovery:{original_id}")
            if not recovery_data:
                return {"test": test_name, "passed": False, "error": "No recovery tracking data"}
            checks = {
                "has_retry_count": "retry_count" in recovery_data,
                "has_abort_reason": "abort_reason" in recovery_data,
                "has_handoff_ref": "handoff_ref" in recovery_data,
                "has_latest_recovery": "latest_recovery" in recovery_data,
                "abort_reason_correct": recovery_data.get("abort_reason") == "iteration_limit"
            }
            passed = all(checks.values())
            return {
                "test": test_name,
                "passed": passed,
                "checks": checks,
                "retry_count": recovery_data.get("retry_count"),
                "abort_reason": recovery_data.get("abort_reason")
            }
        except Exception as e:
            return {"test": test_name, "passed": False, "error": str(e)}

    def verify_original_status(self, original_id: str) -> dict:
        """Verify the original pipeline status was updated correctly."""
        test_name = "Original Pipeline Status"
        try:
            original_data = self.redis.hgetall(f"pipeline:{original_id}")
            checks = {
                "status_rebooting": original_data.get("status") == "REBOOTING",
                "has_recovery_pipeline": "recovery_pipeline" in original_data,
                "has_recovery_triggered_at": "recovery_triggered_at" in original_data
            }
            passed = all(checks.values())
            return {
                "test": test_name,
                "passed": passed,
                "checks": checks,
                "status": original_data.get("status"),
                "recovery_pipeline": original_data.get("recovery_pipeline")
            }
        except Exception as e:
            return {"test": test_name, "passed": False, "error": str(e)}

    def cleanup(self, pipeline_ids: list):
        """Delete every Redis key mentioning any of *pipeline_ids*.

        Best-effort: per-pipeline failures are swallowed so one bad id
        cannot block cleanup of the others. KEYS is acceptable here because
        this only runs against a local test instance; production code
        should use SCAN.
        """
        for pid in pipeline_ids:
            try:
                keys = self.redis.keys(f"*{pid}*")
                if keys:
                    self.redis.delete(*keys)
                self.log(f"Cleaned up: {pid}")
            except Exception:
                pass

    def run_all_tests(self) -> dict:
        """Run the complete end-to-end test.

        Returns ``{"passed": int, "failed": int, "tests": [result dicts]}``.
        Test data is cleaned up on both success and failure paths.
        """
        print("\n" + "=" * 70)
        print("END-TO-END AUTO-RECOVERY TEST")
        print("=" * 70 + "\n")
        # Setup
        pipeline_id = self.setup_test_pipeline()
        # Step 1: Simulate orchestrator dumping handoff before abort.
        # Bail out early if the simulation itself failed - the verification
        # steps would otherwise run against missing state.
        self.log("Step 1: Simulating orchestrator abort with handoff dump...")
        if not self.simulate_orchestrator_abort(pipeline_id):
            print("\nFAILED: Could not simulate orchestrator abort")
            self.cleanup([pipeline_id])
            return {"passed": 0, "failed": 1, "tests": []}
        # Step 2: Trigger auto-recovery
        self.log("Step 2: Triggering auto-recovery...")
        recovery_result = self.trigger_auto_recovery(pipeline_id)
        if not recovery_result["success"]:
            print(f"\nFAILED: Auto-recovery trigger failed: {recovery_result.get('error')}")
            # Don't leak the partially-created test pipeline on failure.
            self.cleanup([pipeline_id])
            return {"passed": 0, "failed": 1, "tests": []}
        recovery_id = recovery_result["recovery_pipeline_id"]
        # Run verification tests
        self.log("\nStep 3: Running verification tests...")
        tests = [
            self.verify_handoff_dump(pipeline_id),
            self.verify_recovery_pipeline(pipeline_id),
            self.verify_inherited_context(recovery_id),
            self.verify_retry_tracking(pipeline_id),
            self.verify_original_status(pipeline_id),
        ]
        passed = 0
        failed = 0
        for result in tests:
            status = "PASS" if result["passed"] else "FAIL"
            symbol = "+" if result["passed"] else "x"
            print(f" {symbol} {status}: {result['test']}")
            if result["passed"]:
                passed += 1
            else:
                failed += 1
                if "error" in result:
                    print(f" Error: {result['error']}")
                elif "checks" in result:
                    failed_checks = [k for k, v in result["checks"].items() if not v]
                    print(f" Failed checks: {', '.join(failed_checks)}")
        print(f"\n{'=' * 70}")
        print(f"RESULTS: {passed}/{passed + failed} passed")
        print(f"{'=' * 70}")
        # Show recovery chain summary
        print("\nRECOVERY CHAIN SUMMARY:")
        print(f" Original Pipeline: {pipeline_id}")
        print(" Status: REBOOTING")
        print(f" Recovery Pipeline: {recovery_id}")
        print(f" Proposals Inherited: {recovery_result['proposals_inherited']}")
        retry_data = self.redis.hgetall(f"recovery:{pipeline_id}")
        print(f" Retry Count: {retry_data.get('retry_count', 'N/A')}")
        print(f" Abort Reason: {retry_data.get('abort_reason', 'N/A')}")
        print(f" Handoff Ref: {retry_data.get('handoff_ref', 'N/A')}")
        # Cleanup
        self.log("\nCleaning up test data...")
        self.cleanup([pipeline_id, recovery_id])
        print(f"\n{'=' * 70}\n")
        return {
            "passed": passed,
            "failed": failed,
            "tests": tests
        }
def main():
    """Run the E2E auto-recovery suite and map the outcome to an exit code."""
    runner = E2EAutoRecoveryTest()
    summary = runner.run_all_tests()
    if summary["failed"]:
        return 1
    return 0
if __name__ == "__main__":
sys.exit(main())