commit 77655c298c Initial commit: Agent Governance System Phase 8
Phase 8 Production Hardening with complete governance infrastructure:

- Vault integration with tiered policies (T0-T4)
- DragonflyDB state management
- SQLite audit ledger
- Pipeline DSL and templates
- Promotion/revocation engine
- Checkpoint system for session persistence
- Health manager and circuit breaker for fault tolerance
- GitHub/Slack integrations
- Architectural test pipeline with bug watcher, suggestion engine, council review
- Multi-agent chaos testing framework

Test Results:
- Governance tests: 68/68 passing
- E2E workflow: 16/16 passing
- Phase 2 Vault: 14/14 passing
- Integration tests: 27/27 passing

Coverage: 57.6% average across 12 phases

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 22:07:06 -05:00

1716 lines
64 KiB
Python

#!/usr/bin/env python3
"""
Multi-Agent Chaos Test Orchestrator
Launches multiple real agents (Python, Bun, Diagnostic) working simultaneously
on the same project with chaos injection and overwatch coordination.
Features:
- Real pipeline execution using official pipeline definitions
- Overwatch agent spawns helpers on errors
- DragonflyDB readiness criteria
- History recording for learning
- Unified objective verification
- Alpha/Beta/Gamma output tracking with plan clarification
Architecture Reference: /opt/agent-governance/docs/ARCHITECTURE.md
Pipeline Reference: /opt/agent-governance/pipeline/core.py
"""
import asyncio
import json
import os
import signal
import sqlite3
import subprocess
import sys
import time
import threading
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
import redis
import hashlib
# Add parent directory to path to import from pipeline module
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
# Import official pipeline definitions from unified core module
from pipeline.core import (
# Enums
AgentPhase,
OutputType,
ChaosCondition,
StageStatus,
# Data classes
AgentOutput,
ClarifiedPlan,
ErrorBudget,
# Constants
AGENT_PHASE_NAMES,
PHASE_OUTPUT_TYPES,
DEFAULT_REDIS_HOST,
DEFAULT_REDIS_PORT,
DEFAULT_REDIS_PASSWORD,
DEFAULT_LEDGER_PATH,
DEFAULT_HEARTBEAT_TTL,
DEFAULT_LOCK_TTL,
DEFAULT_OUTPUT_TTL,
# Key patterns
RedisKeys,
# Utilities
get_output_type_for_phase,
)
# =============================================================================
# Configuration (using defaults from pipeline.core where possible)
# =============================================================================
REDIS_HOST = DEFAULT_REDIS_HOST
REDIS_PORT = DEFAULT_REDIS_PORT
REDIS_PASSWORD = DEFAULT_REDIS_PASSWORD
LEDGER_PATH = Path(DEFAULT_LEDGER_PATH)
# Local Vault endpoint; TLS verification is disabled for child agents via
# VAULT_SKIP_VERIFY=1 in _start_agent_process.
VAULT_ADDR = "https://127.0.0.1:8200"
# JSON file containing the Vault "root_token" key (read by get_vault_token).
VAULT_TOKEN_FILE = Path("/opt/vault/init-keys.json")
# Per-agent stdout/stderr logs are written here by _start_agent_process.
LOG_DIR = Path("/opt/agent-governance/tests/multi-agent-chaos/logs")
# Ensure log directory exists
LOG_DIR.mkdir(parents=True, exist_ok=True)
# =============================================================================
# Chaos Test Specific Enums (extend core definitions for testing)
# =============================================================================
class AgentType(Enum):
    """Agent types specific to the chaos test orchestrator."""
    PYTHON = "python"          # primary worker running the Python pipeline
    BUN = "bun"                # primary worker running the Bun pipeline
    DIAGNOSTIC = "diagnostic"  # spawned by overwatch to investigate error spikes
    OVERWATCH = "overwatch"    # monitor agent; spawns helpers, clarifies plans
    HELPER = "helper"          # generic recovery helper spawned by overwatch
class AgentStatus(Enum):
    """
    Agent runtime status for chaos testing.
    Extends the core AgentStatus with chaos-specific states.
    Note: Uses lowercase values for Redis compatibility in this test context.
    """
    PENDING = "pending"        # created but not yet started
    STARTING = "starting"      # registered; process launch in progress
    RUNNING = "running"        # process launched successfully
    COMPLETED = "completed"    # finished its pipeline
    FAILED = "failed"          # process launch failed or died
    CHAOS_INJECTED = "chaos_injected"  # Chaos test specific
    RECOVERING = "recovering"  # overwatch is mitigating an injected fault
    PAUSED = "paused"          # held during plan clarification
    REVOKED = "revoked"        # token/credentials revoked
# Note: We intentionally shadow pipeline.core.AgentStatus here because:
# 1. Chaos test needs CHAOS_INJECTED state not in core
# 2. Test uses lowercase values for Redis state storage
# 3. Core module remains the authoritative reference for production code
@dataclass
class AgentInstance:
    """Represents a running agent instance"""
    agent_id: str                                # unique id, e.g. "python-<HHMMSSff>"
    agent_type: AgentType
    process: Optional[subprocess.Popen] = None   # OS process handle once launched
    status: AgentStatus = AgentStatus.PENDING
    pipeline_id: Optional[str] = None            # pipeline/task this agent works on
    started_at: Optional[datetime] = None        # set when the process is launched
    completed_at: Optional[datetime] = None
    exit_code: Optional[int] = None
    chaos_condition: ChaosCondition = ChaosCondition.NONE  # last injected chaos
    error_count: int = 0                         # drives overwatch interventions
    output_log: str = ""
    spawned_by: Optional[str] = None             # overwatch id if this is a helper
@dataclass
class ReadinessCheck:
    """DragonflyDB readiness criteria"""
    name: str                 # human-readable check name (shown in results)
    check_func: callable      # zero-arg callable returning (passed: bool, message: str)
    required: bool = True     # if True, a failure makes the whole readiness fail
    passed: bool = False      # last observed result (updated by run_checks)
    message: str = ""         # last observed message (updated by run_checks)
class DragonflyReadiness:
    """Manages DragonflyDB readiness checks for multi-agent coordination"""

    def __init__(self, redis_client: redis.Redis, project_id: str):
        self.redis = redis_client
        self.project_id = project_id
        self.checks: List[ReadinessCheck] = []

    def add_check(self, name: str, check_func: callable, required: bool = True):
        """Register a named readiness check."""
        check = ReadinessCheck(name=name, check_func=check_func, required=required)
        self.checks.append(check)

    def run_checks(self) -> Tuple[bool, List[Dict]]:
        """Execute every registered check once.

        Returns (ready, results): ready is False only when a *required*
        check failed; results holds one summary dict per check. Exceptions
        raised by a check are captured as a failure, never propagated.
        """
        results: List[Dict] = []
        ready = True
        for check in self.checks:
            try:
                check.passed, check.message = check.check_func()
            except Exception as e:
                check.passed = False
                check.message = f"Error: {e}"
            if check.required and not check.passed:
                ready = False
            results.append({
                "name": check.name,
                "passed": check.passed,
                "required": check.required,
                "message": check.message,
            })
        return ready, results

    def wait_for_ready(self, timeout_seconds: int = 60) -> bool:
        """Poll run_checks() every 0.5s until ready or the timeout elapses."""
        deadline = time.time() + timeout_seconds
        while time.time() < deadline:
            ready, _ = self.run_checks()
            if ready:
                return True
            time.sleep(0.5)
        return False
class HistoryRecorder:
    """Records agent history for learning and refinement.

    Recent runs are cached in Redis (a bounded list per agent) for fast
    access and also appended to the SQLite audit ledger for durability.
    """

    def __init__(self, redis_client: "redis.Redis", ledger_path: Path):
        # Annotation is quoted so the module can be imported without redis
        # being resolvable at annotation-evaluation time; any object exposing
        # lpush/ltrim/lrange works.
        self.redis = redis_client
        self.ledger_path = ledger_path

    def record_run(self, agent_id: str, run_data: Dict):
        """Record a run to history (Redis cache + SQLite ledger).

        Note: mutates run_data by stamping a "recorded_at" timestamp.
        Ledger writes are best-effort: failures are reported and swallowed
        so a ledger problem never breaks the chaos test itself.
        """
        # Store in Redis for quick access
        history_key = f"history:{agent_id}:runs"
        run_data["recorded_at"] = datetime.utcnow().isoformat()
        self.redis.lpush(history_key, json.dumps(run_data))
        self.redis.ltrim(history_key, 0, 99)  # Keep last 100 runs
        # Store in ledger for persistence
        try:
            conn = sqlite3.connect(self.ledger_path)
            try:
                cursor = conn.cursor()
                cursor.execute("""
                    INSERT INTO agent_actions
                    (timestamp, agent_id, agent_version, tier, action, decision,
                     confidence, success, error_type, error_message, session_id, created_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    datetime.utcnow().isoformat(),
                    agent_id,
                    run_data.get("version", "1.0.0"),
                    run_data.get("tier", 1),
                    run_data.get("action", "chaos_test_run"),
                    run_data.get("decision", "EXECUTE"),
                    run_data.get("confidence", 0.8),
                    1 if run_data.get("success") else 0,
                    run_data.get("error_type"),
                    run_data.get("error_message"),
                    run_data.get("session_id"),
                    datetime.utcnow().isoformat()
                ))
                conn.commit()
            finally:
                # BUG FIX: the connection previously leaked whenever execute()
                # or commit() raised, because close() was only reached on the
                # success path before the broad except swallowed the error.
                conn.close()
        except Exception as e:
            print(f"Warning: Failed to record to ledger: {e}")

    def get_agent_history(self, agent_id: str, limit: int = 10) -> List[Dict]:
        """Return up to `limit` most recent runs for an agent (newest first)."""
        history_key = f"history:{agent_id}:runs"
        runs = self.redis.lrange(history_key, 0, limit - 1)
        return [json.loads(r) for r in runs]

    def get_learning_insights(self, agent_id: str) -> Dict:
        """Analyze the last 20 runs and return refinement recommendations.

        Returns a dict with success_rate, total_runs, error_patterns,
        chaos_recoveries and a list of recommendation strings; when there is
        no history at all, returns {"status": "no_history", ...} instead.
        """
        history = self.get_agent_history(agent_id, 20)
        if not history:
            return {"status": "no_history", "recommendations": []}
        # Analyze success rate (history is non-empty here, so no ZeroDivision)
        successes = sum(1 for h in history if h.get("success"))
        total = len(history)
        success_rate = successes / total
        # Analyze error patterns (frequency per error_type)
        error_types = {}
        for h in history:
            if h.get("error_type"):
                error_types[h["error_type"]] = error_types.get(h["error_type"], 0) + 1
        # Analyze chaos recovery
        chaos_recoveries = sum(1 for h in history if h.get("recovered_from_chaos"))
        recommendations = []
        if success_rate < 0.7:
            recommendations.append("Consider reducing task complexity")
        if error_types:
            most_common = max(error_types, key=error_types.get)
            recommendations.append(f"Focus on handling '{most_common}' errors")
        if chaos_recoveries > 0:
            recommendations.append(f"Agent has recovered from {chaos_recoveries} chaos events")
        return {
            "success_rate": success_rate,
            "total_runs": total,
            "error_patterns": error_types,
            "chaos_recoveries": chaos_recoveries,
            "recommendations": recommendations
        }
class OverwatchAgent:
    """
    Overwatch agent that monitors other agents, spawns helpers on errors,
    and generates clarified plans when error thresholds are exceeded.
    """
    def __init__(self, orchestrator: 'MultiAgentOrchestrator'):
        self.orchestrator = orchestrator
        self.agent_id = f"overwatch-{datetime.utcnow().strftime('%H%M%S')}"
        self.running = False
        # Helper budget: at most max_helpers helpers may be spawned in total,
        # across failure recovery, chaos mitigation and error-spike diagnosis.
        self.helpers_spawned = 0
        self.max_helpers = 3
        self.intervention_log: List[Dict] = []
        # Error threshold tracking
        self.error_threshold = 5  # Errors before plan clarification (error_spike adds 5)
        self.total_errors_seen = 0
        self.threshold_triggered = False
        self.clarified_plans: List[ClarifiedPlan] = []
        self.pipeline_paused = False
        self.startup_grace_period = 1.0  # seconds before error checking starts

    async def start(self):
        """Start overwatch monitoring"""
        self.running = True
        print(f" [OVERWATCH] Started monitoring ({self.agent_id})")
        print(f" [OVERWATCH] Error threshold: {self.error_threshold}")
        # Main monitor loop: runs until stop() flips self.running.
        while self.running:
            await self.check_agents()
            await self.check_error_threshold()
            await asyncio.sleep(0.5)

    def stop(self):
        """Stop overwatch"""
        self.running = False
        print(f" [OVERWATCH] Stopped ({self.helpers_spawned} helpers spawned, {len(self.clarified_plans)} plans clarified)")

    async def check_agents(self):
        """Check all agents and intervene if needed"""
        # Create a copy to avoid modification during iteration
        # (spawn_helper_agent adds entries to orchestrator.agents mid-loop).
        agents_snapshot = list(self.orchestrator.agents.items())
        for agent_id, agent in agents_snapshot:
            if agent.agent_type in [AgentType.OVERWATCH, AgentType.HELPER]:
                continue
            # Check for error conditions - prioritize failures
            if agent.status == AgentStatus.FAILED:
                await self.handle_failure(agent)
            elif agent.status == AgentStatus.CHAOS_INJECTED:
                await self.handle_chaos(agent)
            elif agent.status == AgentStatus.RECOVERING:
                # Check if recovery is stalled
                await self.check_recovery_progress(agent)
            elif agent.error_count >= 3:
                await self.handle_error_spike(agent)
            # Also check for stale heartbeats on running agents (with grace period)
            if agent.status == AgentStatus.RUNNING and agent.started_at:
                age = (datetime.utcnow() - agent.started_at).total_seconds()
                if age > 2.0:  # Grace period of 2 seconds for startup
                    hb = self.orchestrator.redis.get(f"agent:{agent_id}:heartbeat")
                    if not hb:
                        # Missing heartbeat is treated as a chaos event so the
                        # next loop iteration routes it through handle_chaos.
                        print(f" [OVERWATCH] Heartbeat missing for {agent_id}, marking as chaos")
                        agent.status = AgentStatus.CHAOS_INJECTED
                        agent.chaos_condition = ChaosCondition.HEARTBEAT_TIMEOUT

    async def handle_failure(self, agent: AgentInstance):
        """Handle agent failure by spawning helper"""
        if self.helpers_spawned >= self.max_helpers:
            print(f" [OVERWATCH] Max helpers reached, cannot spawn for {agent.agent_id}")
            return
        intervention = {
            "type": "failure_recovery",
            "target_agent": agent.agent_id,
            "timestamp": datetime.utcnow().isoformat(),
            "action": "spawn_helper"
        }
        print(f" [OVERWATCH] Spawning helper for failed agent {agent.agent_id}")
        # Spawn a helper agent
        helper_id = await self.orchestrator.spawn_helper_agent(
            spawned_by=self.agent_id,
            reason=f"Recovery for {agent.agent_id}",
            target_task=agent.pipeline_id
        )
        intervention["helper_id"] = helper_id
        self.intervention_log.append(intervention)
        self.helpers_spawned += 1

    async def handle_chaos(self, agent: AgentInstance):
        """Handle chaos condition by attempting recovery and spawning helper"""
        intervention = {
            "type": "chaos_mitigation",
            "target_agent": agent.agent_id,
            "chaos_condition": agent.chaos_condition.value,
            "timestamp": datetime.utcnow().isoformat()
        }
        print(f" [OVERWATCH] Mitigating chaos for {agent.agent_id}: {agent.chaos_condition.value}")
        # Attempt to stabilize the agent based on chaos type
        if agent.chaos_condition == ChaosCondition.LOCK_LOST:
            # Re-acquire lock for the agent
            # nx=False: overwrite unconditionally, even if another holder exists.
            self.orchestrator.redis.set(
                f"agent:{agent.agent_id}:lock",
                agent.agent_id,
                ex=300,
                nx=False
            )
            print(f" [OVERWATCH] Re-acquired lock for {agent.agent_id}")
        elif agent.chaos_condition == ChaosCondition.HEARTBEAT_TIMEOUT:
            # Restore heartbeat
            self.orchestrator.redis.set(
                f"agent:{agent.agent_id}:heartbeat",
                datetime.utcnow().isoformat(),
                ex=60
            )
            print(f" [OVERWATCH] Restored heartbeat for {agent.agent_id}")
        elif agent.chaos_condition == ChaosCondition.STATE_CORRUPTED:
            # Fix corrupted state
            self.orchestrator.redis.hset(
                f"agent:{agent.agent_id}:state",
                "phase", "EXECUTE"  # Reset to safe phase
            )
            print(f" [OVERWATCH] Fixed corrupted state for {agent.agent_id}")
        elif agent.chaos_condition == ChaosCondition.TOKEN_REVOKED:
            # Clear revoke signal and spawn helper
            self.orchestrator.redis.delete(f"agent:{agent.agent_id}:revoke_signal")
            print(f" [OVERWATCH] Cleared revoke signal for {agent.agent_id}")
        # Mark as recovering
        agent.status = AgentStatus.RECOVERING
        self.orchestrator.redis.hset(
            f"agent:{agent.agent_id}:state",
            "status", "recovering"
        )
        # Spawn a helper to assist with recovery if we have budget
        if self.helpers_spawned < self.max_helpers:
            helper_id = await self.orchestrator.spawn_helper_agent(
                spawned_by=self.agent_id,
                reason=f"Chaos recovery ({agent.chaos_condition.value}) for {agent.agent_id}",
                target_task=agent.pipeline_id
            )
            intervention["helper_id"] = helper_id
            self.helpers_spawned += 1
            print(f" [OVERWATCH] Spawned helper {helper_id} for chaos recovery")
        self.intervention_log.append(intervention)

    async def check_recovery_progress(self, agent: AgentInstance):
        """Check if a recovering agent is making progress"""
        state = self.orchestrator.redis.hgetall(f"agent:{agent.agent_id}:state")
        # If still recovering after a while, the agent might be stuck
        if state.get("status") == "recovering":
            # Check if process is still alive (poll() is non-None once exited)
            if agent.process and agent.process.poll() is not None:
                # Process died during recovery
                print(f" [OVERWATCH] Agent {agent.agent_id} died during recovery")
                agent.status = AgentStatus.FAILED
                await self.handle_failure(agent)

    async def handle_error_spike(self, agent: AgentInstance):
        """Handle error spike by spawning diagnostic agent"""
        if self.helpers_spawned >= self.max_helpers:
            return
        intervention = {
            "type": "error_spike_response",
            "target_agent": agent.agent_id,
            "error_count": agent.error_count,
            "timestamp": datetime.utcnow().isoformat()
        }
        print(f" [OVERWATCH] Error spike detected for {agent.agent_id}, spawning diagnostic")
        helper_id = await self.orchestrator.spawn_helper_agent(
            spawned_by=self.agent_id,
            reason=f"Diagnose errors for {agent.agent_id}",
            target_task=agent.pipeline_id,
            agent_type=AgentType.DIAGNOSTIC
        )
        intervention["helper_id"] = helper_id
        self.intervention_log.append(intervention)
        self.helpers_spawned += 1

    async def check_error_threshold(self):
        """Check if error threshold is crossed and trigger plan clarification"""
        # Wait for startup grace period
        if self.orchestrator.start_time:
            elapsed = (datetime.utcnow() - self.orchestrator.start_time).total_seconds()
            if elapsed < self.startup_grace_period:
                return
        # Count total errors across all agents
        current_errors = sum(
            agent.error_count for agent in self.orchestrator.agents.values()
            if agent.agent_type not in [AgentType.OVERWATCH, AgentType.HELPER]
        )
        # Check if we've crossed the threshold
        if current_errors >= self.error_threshold and not self.threshold_triggered:
            self.threshold_triggered = True
            self.total_errors_seen = current_errors
            print(f"\n {'='*60}")
            print(f" [OVERWATCH] ERROR THRESHOLD CROSSED!")
            print(f" [OVERWATCH] Total errors: {current_errors} >= threshold: {self.error_threshold}")
            print(f" {'='*60}")
            # Pause pipeline
            await self.pause_pipeline()
            # Inspect history and outputs
            history_review = await self.review_history()
            outputs_review = await self.review_outputs()
            # Generate clarified plan
            clarified_plan = await self.generate_clarified_plan(
                history_review, outputs_review, current_errors
            )
            # Broadcast plan to all agents
            await self.broadcast_plan(clarified_plan)
            # Resume pipeline
            await self.resume_pipeline()
            # Reset for next threshold check (incremental threshold)
            # Each clarification raises the bar by 3 so repeated errors do not
            # re-trigger immediately on the next loop iteration.
            self.error_threshold += 3
            self.threshold_triggered = False

    async def pause_pipeline(self):
        """Pause all running agents for plan clarification"""
        self.pipeline_paused = True
        print(f" [OVERWATCH] PAUSING PIPELINE for plan clarification...")
        for agent_id, agent in self.orchestrator.agents.items():
            if agent.agent_type in [AgentType.OVERWATCH, AgentType.HELPER]:
                continue
            if agent.status == AgentStatus.RUNNING:
                self.orchestrator.redis.hset(
                    f"agent:{agent_id}:state",
                    "status", "paused"
                )
                # Worker scripts poll this key and spin until it clears.
                self.orchestrator.redis.set(
                    f"agent:{agent_id}:pause_signal", "1", ex=60
                )
        print(f" [OVERWATCH] Pipeline paused - {len(self.orchestrator.agents)} agents notified")

    async def resume_pipeline(self):
        """Resume all paused agents after plan broadcast"""
        print(f" [OVERWATCH] RESUMING PIPELINE with clarified plan...")
        for agent_id, agent in self.orchestrator.agents.items():
            if agent.agent_type in [AgentType.OVERWATCH, AgentType.HELPER]:
                continue
            # Clear pause signal
            self.orchestrator.redis.delete(f"agent:{agent_id}:pause_signal")
            # Set resume signal with plan reference
            if self.clarified_plans:
                self.orchestrator.redis.set(
                    f"agent:{agent_id}:resume_signal",
                    self.clarified_plans[-1].plan_id,
                    ex=60
                )
        self.pipeline_paused = False
        print(f" [OVERWATCH] Pipeline resumed")

    async def review_history(self) -> Dict[str, Any]:
        """Review history of all agents for plan clarification"""
        print(f"\n [OVERWATCH] --- REVIEWING HISTORY ---")
        history_data = {
            "agents_reviewed": [],
            "total_runs": 0,
            "success_rate": 0,
            "common_errors": {},
            "chaos_events": []
        }
        for agent_id, agent in self.orchestrator.agents.items():
            if agent.agent_type in [AgentType.OVERWATCH, AgentType.HELPER]:
                continue
            # Get history from Redis
            history = self.orchestrator.history.get_agent_history(agent_id, 5)
            history_data["agents_reviewed"].append(agent_id)
            history_data["total_runs"] += len(history)
            successes = sum(1 for h in history if h.get("success"))
            for h in history:
                if h.get("error_type"):
                    err = h["error_type"]
                    history_data["common_errors"][err] = history_data["common_errors"].get(err, 0) + 1
                if h.get("chaos_condition") and h["chaos_condition"] != "none":
                    history_data["chaos_events"].append({
                        "agent": agent_id,
                        "chaos": h["chaos_condition"],
                        "recovered": h.get("recovered_from_chaos", False)
                    })
            print(f" Agent {agent_id}: {len(history)} runs, {successes} successes")
        if history_data["total_runs"] > 0:
            # NOTE(review): this re-fetches each agent's history from Redis a
            # second time to compute the aggregate; results could differ from
            # the loop above if a run is recorded in between.
            total_successes = sum(
                sum(1 for h in self.orchestrator.history.get_agent_history(aid, 5) if h.get("success"))
                for aid in history_data["agents_reviewed"]
            )
            history_data["success_rate"] = total_successes / history_data["total_runs"]
        print(f" Total runs: {history_data['total_runs']}, Success rate: {history_data['success_rate']:.1%}")
        print(f" Common errors: {history_data['common_errors']}")
        return history_data

    async def review_outputs(self) -> Dict[str, List[AgentOutput]]:
        """Review Alpha/Beta/Gamma outputs from all agents"""
        print(f"\n [OVERWATCH] --- REVIEWING OUTPUTS ---")
        outputs_data = {
            "alpha": [],
            "beta": [],
            "gamma": [],
            "by_agent": {}
        }
        for agent_id, agent in self.orchestrator.agents.items():
            if agent.agent_type in [AgentType.OVERWATCH, AgentType.HELPER]:
                continue
            agent_outputs = []
            # Get outputs from Redis
            for output_type in OutputType:
                key = f"agent:{agent_id}:output:{output_type.value}"
                output_data = self.orchestrator.redis.get(key)
                if output_data:
                    try:
                        data = json.loads(output_data)
                        output = AgentOutput(
                            agent_id=agent_id,
                            output_type=output_type,
                            phase=data.get("phase", "unknown"),
                            content=data.get("content", {}),
                            # NOTE(review): worker scripts emit timestamps like
                            # '...%SZ'; datetime.fromisoformat only accepts a
                            # trailing 'Z' on Python 3.11+, and a ValueError here
                            # is not caught by the except below — confirm the
                            # target interpreter version.
                            timestamp=datetime.fromisoformat(data.get("timestamp", datetime.utcnow().isoformat())),
                            checksum=data.get("checksum", "")
                        )
                        outputs_data[output_type.value].append(output)
                        agent_outputs.append(output)
                    except (json.JSONDecodeError, KeyError) as e:
                        print(f" Warning: Failed to parse output for {agent_id}: {e}")
            outputs_data["by_agent"][agent_id] = agent_outputs
            print(f" Agent {agent_id}: {len(agent_outputs)} outputs")
            for o in agent_outputs:
                print(f" - {o.output_type.value.upper()}: phase={o.phase}, checksum={o.checksum}")
        print(f" Total: Alpha={len(outputs_data['alpha'])}, Beta={len(outputs_data['beta'])}, Gamma={len(outputs_data['gamma'])}")
        return outputs_data

    async def generate_clarified_plan(self, history: Dict, outputs: Dict, error_count: int) -> ClarifiedPlan:
        """Generate a clarified plan based on history and outputs analysis"""
        print(f"\n [OVERWATCH] --- GENERATING CLARIFIED PLAN ---")
        plan_id = f"plan-{datetime.utcnow().strftime('%H%M%S%f')[:10]}"
        # Analyze issues and generate adjustments
        adjustments = []
        # Based on common errors
        for error_type, count in history.get("common_errors", {}).items():
            if count >= 2:
                adjustments.append(f"Add retry logic for '{error_type}' errors (seen {count}x)")
        # Based on chaos events
        chaos_events = history.get("chaos_events", [])
        if chaos_events:
            unrecovered = [e for e in chaos_events if not e.get("recovered")]
            if unrecovered:
                adjustments.append(f"Strengthen recovery for: {', '.join(e['chaos'] for e in unrecovered)}")
        # Based on output gaps
        if len(outputs.get("gamma", [])) < len(outputs.get("alpha", [])):
            adjustments.append("Focus on completing work (more Alpha than Gamma outputs)")
        # Based on success rate
        if history.get("success_rate", 1.0) < 0.7:
            adjustments.append("Reduce task complexity or add more checkpoints")
        # Default adjustment if none found
        if not adjustments:
            adjustments.append("Continue with increased monitoring")
        # Create clarified plan
        clarified_plan = ClarifiedPlan(
            plan_id=plan_id,
            trigger_reason=f"Error threshold crossed: {error_count} errors",
            history_reviewed=[a for a in history.get("agents_reviewed", [])],
            outputs_analyzed=[f"{o.agent_id}:{o.output_type.value}" for o in outputs.get("alpha", []) + outputs.get("beta", []) + outputs.get("gamma", [])],
            original_objective=self.orchestrator.redis.get(f"project:{self.orchestrator.project_id}:objective") or "Complete chaos test successfully",
            clarified_objective=f"Complete chaos test with enhanced resilience (error recovery priority)",
            adjustments=adjustments,
            broadcast_at=datetime.utcnow()
        )
        self.clarified_plans.append(clarified_plan)
        print(f" Plan ID: {plan_id}")
        print(f" Trigger: {clarified_plan.trigger_reason}")
        print(f" History reviewed: {len(clarified_plan.history_reviewed)} agents")
        print(f" Outputs analyzed: {len(clarified_plan.outputs_analyzed)}")
        print(f" Adjustments:")
        for adj in adjustments:
            print(f" - {adj}")
        # Store plan in Redis for agents to read
        self.orchestrator.redis.set(
            f"project:{self.orchestrator.project_id}:plan:{plan_id}",
            json.dumps({
                "plan_id": plan_id,
                "objective": clarified_plan.clarified_objective,
                "adjustments": adjustments,
                "broadcast_at": clarified_plan.broadcast_at.isoformat()
            }),
            ex=300
        )
        return clarified_plan

    async def broadcast_plan(self, plan: ClarifiedPlan):
        """Broadcast clarified plan to all agents"""
        print(f"\n [OVERWATCH] --- BROADCASTING PLAN ---")
        broadcast_key = f"project:{self.orchestrator.project_id}:broadcast"
        # Store broadcast
        self.orchestrator.redis.set(
            broadcast_key,
            json.dumps({
                "plan_id": plan.plan_id,
                "objective": plan.clarified_objective,
                "adjustments": plan.adjustments,
                "broadcast_at": plan.broadcast_at.isoformat()
            }),
            ex=300
        )
        # Notify each agent
        acknowledged = []
        for agent_id, agent in self.orchestrator.agents.items():
            if agent.agent_type in [AgentType.OVERWATCH, AgentType.HELPER]:
                continue
            # Set notification for agent; workers poll agent:<id>:plan_update.
            self.orchestrator.redis.set(
                f"agent:{agent_id}:plan_update",
                plan.plan_id,
                ex=60
            )
            acknowledged.append(agent_id)
            print(f" Notified: {agent_id}")
        plan.acknowledged_by = acknowledged
        print(f" Broadcast complete: {len(acknowledged)} agents notified")
        print(f" {'='*60}\n")
class MultiAgentOrchestrator:
    """
    Orchestrates multiple agents working on the same project.
    """
    def __init__(self, project_id: str):
        self.project_id = project_id
        # decode_responses=True: Redis values come back as str, not bytes.
        self.redis = redis.Redis(
            host=REDIS_HOST,
            port=REDIS_PORT,
            password=REDIS_PASSWORD,
            decode_responses=True
        )
        self.agents: Dict[str, AgentInstance] = {}       # agent_id -> instance
        self.overwatch: Optional[OverwatchAgent] = None
        self.history = HistoryRecorder(self.redis, LEDGER_PATH)
        self.readiness = DragonflyReadiness(self.redis, project_id)
        self.chaos_events: List[Dict] = []
        self.start_time: Optional[datetime] = None       # set when a run begins
        self.end_time: Optional[datetime] = None
        # Setup readiness checks
        self._setup_readiness_checks()
    def _setup_readiness_checks(self):
        """Setup DragonflyDB readiness checks.

        Each check is a closure returning (passed, message); they are
        registered on self.readiness and evaluated by run_checks().
        """
        def check_all_agents_registered():
            # Every non-overwatch agent must appear in the project's agent set.
            registered = self.redis.smembers(f"project:{self.project_id}:agents")
            expected = len([a for a in self.agents.values()
                            if a.agent_type != AgentType.OVERWATCH])
            if len(registered) >= expected:
                return True, f"{len(registered)} agents registered"
            return False, f"Only {len(registered)}/{expected} agents registered"

        def check_no_active_locks_conflict():
            # A lock held by anyone other than its own agent is a conflict.
            for agent_id in self.agents:
                lock = self.redis.get(f"agent:{agent_id}:lock")
                if lock and lock != agent_id:
                    return False, f"Lock conflict for {agent_id}"
            return True, "No lock conflicts"

        def check_unified_objective():
            # All primary (non-overwatch, non-helper) agents must have completed.
            primary_agents = [a for a in self.agents.values()
                              if a.agent_type not in [AgentType.OVERWATCH, AgentType.HELPER]]
            completed = sum(1 for a in primary_agents if a.status == AgentStatus.COMPLETED)
            total = len(primary_agents)
            if total > 0 and completed >= total:
                return True, f"All {completed} primary agents completed"
            return False, f"{completed}/{total} primary agents completed"

        def check_no_critical_errors():
            for agent_id, agent in self.agents.items():
                # Skip helper agents and completed agents (they recovered)
                if agent.agent_type == AgentType.HELPER:
                    continue
                if agent.status == AgentStatus.COMPLETED:
                    continue  # Completed agents recovered from errors
                if agent.error_count >= 5:
                    return False, f"Agent {agent_id} has critical errors (count: {agent.error_count})"
            return True, "No critical unrecovered errors"

        self.readiness.add_check("all_agents_registered", check_all_agents_registered)
        self.readiness.add_check("no_lock_conflicts", check_no_active_locks_conflict)
        self.readiness.add_check("unified_objective", check_unified_objective)
        self.readiness.add_check("no_critical_errors", check_no_critical_errors)
def get_vault_token(self) -> str:
"""Get Vault root token"""
with open(VAULT_TOKEN_FILE) as f:
return json.load(f)["root_token"]
    async def spawn_agent(self, agent_type: AgentType, pipeline_name: str) -> str:
        """Spawn a new agent.

        Creates the AgentInstance, registers it in DragonflyDB (set + state
        hash), launches its OS process, and returns the new agent_id.
        """
        # Id is type-prefixed with a truncated HHMMSS+microseconds suffix.
        agent_id = f"{agent_type.value}-{datetime.utcnow().strftime('%H%M%S%f')[:10]}"
        agent = AgentInstance(
            agent_id=agent_id,
            agent_type=agent_type,
            pipeline_id=pipeline_name,
            status=AgentStatus.STARTING
        )
        self.agents[agent_id] = agent
        # Register in DragonflyDB
        self.redis.sadd(f"project:{self.project_id}:agents", agent_id)
        self.redis.hset(f"agent:{agent_id}:state", mapping={
            "status": "starting",
            "type": agent_type.value,
            "pipeline": pipeline_name,
            "project": self.project_id,
            "started_at": datetime.utcnow().isoformat()
        })
        # Start the agent process
        await self._start_agent_process(agent)
        return agent_id
    async def spawn_helper_agent(self, spawned_by: str, reason: str,
                                 target_task: str, agent_type: AgentType = AgentType.HELPER) -> str:
        """Spawn a helper agent.

        Called by the overwatch agent for failure recovery, chaos mitigation
        or (with agent_type=DIAGNOSTIC) error-spike diagnosis. Mirrors
        spawn_agent but records who spawned it and why.
        """
        # Note: id is always "helper-..." even for DIAGNOSTIC helpers; the
        # real type is kept in agent.agent_type and the state hash.
        agent_id = f"helper-{datetime.utcnow().strftime('%H%M%S%f')[:10]}"
        agent = AgentInstance(
            agent_id=agent_id,
            agent_type=agent_type,
            pipeline_id=target_task,
            status=AgentStatus.STARTING,
            spawned_by=spawned_by
        )
        self.agents[agent_id] = agent
        # Register
        self.redis.sadd(f"project:{self.project_id}:agents", agent_id)
        self.redis.hset(f"agent:{agent_id}:state", mapping={
            "status": "starting",
            "type": agent_type.value,
            "spawned_by": spawned_by,
            "reason": reason,
            "target_task": target_task,
            "started_at": datetime.utcnow().isoformat()
        })
        # Start helper process
        await self._start_agent_process(agent)
        return agent_id
    async def _start_agent_process(self, agent: AgentInstance):
        """Start the actual agent process.

        Builds the child environment (Vault + Redis credentials, learning
        insights), selects the command for the agent type, and launches it
        with stdout/stderr redirected to a per-agent log file. On launch
        failure the agent is marked FAILED instead of raising.
        """
        agent.started_at = datetime.utcnow()
        log_file = LOG_DIR / f"{agent.agent_id}.log"
        # Get learning insights for this agent type
        insights = self.history.get_learning_insights(agent.agent_type.value)
        env = os.environ.copy()
        env["VAULT_ADDR"] = VAULT_ADDR
        env["VAULT_SKIP_VERIFY"] = "1"
        # NOTE(review): the Vault *root* token is handed to every child
        # process — acceptable for a local chaos test, not for production.
        env["VAULT_TOKEN"] = self.get_vault_token()
        env["AGENT_ID"] = agent.agent_id
        env["PROJECT_ID"] = self.project_id
        env["REDIS_PASSWORD"] = REDIS_PASSWORD
        env["LEARNING_INSIGHTS"] = json.dumps(insights)
        if agent.agent_type == AgentType.PYTHON:
            cmd = self._get_python_agent_cmd(agent)
        elif agent.agent_type == AgentType.BUN:
            cmd = self._get_bun_agent_cmd(agent)
        elif agent.agent_type == AgentType.DIAGNOSTIC:
            cmd = self._get_diagnostic_agent_cmd(agent)
        else:
            cmd = self._get_helper_agent_cmd(agent)
        try:
            # Closing our handle after Popen is safe: the child inherits a
            # duplicated file descriptor and keeps writing to the log.
            with open(log_file, "w") as log:
                # NOTE: shell=True with a generated command string; inputs are
                # orchestrator-generated, but keep any user data out of cmd.
                agent.process = subprocess.Popen(
                    cmd,
                    shell=True,
                    stdout=log,
                    stderr=subprocess.STDOUT,
                    env=env,
                    cwd="/opt/agent-governance"
                )
            agent.status = AgentStatus.RUNNING
            self.redis.hset(f"agent:{agent.agent_id}:state", "status", "running")
        except Exception as e:
            agent.status = AgentStatus.FAILED
            agent.error_count += 1
            self.redis.hset(f"agent:{agent.agent_id}:state", "status", "failed")
            print(f" [ERROR] Failed to start {agent.agent_id}: {e}")
def _get_python_agent_cmd(self, agent: AgentInstance) -> str:
    """Build the shell command that runs a simulated Python worker agent.

    The returned string is a ``python3 -c`` invocation whose embedded
    script walks the official agent phases (BOOTSTRAP..REPORT) against
    Redis/DragonflyDB, emits Alpha/Beta/Gamma outputs keyed under
    ``agent:<id>:output:<type>``, and self-recovers from injected chaos
    (revoked tokens, lost locks, pause signals, plan updates) by polling
    the well-known signal keys the orchestrator/overwatch writes.

    Args:
        agent: The AgentInstance to run; its ``agent_id`` is interpolated
            into the script along with this orchestrator's ``project_id``.

    Returns:
        A shell command string suitable for ``Popen(..., shell=True)``.
    """
    # NOTE: doubled braces ({{ }}) are outer-f-string escapes; they become
    # single braces in the emitted script, where they serve as the inner
    # script's own f-string placeholders and dict literals.
    return f"""
python3 -c "
import redis
import time
import json
import os
import hashlib
r = redis.Redis(host='127.0.0.1', port=6379, password='{REDIS_PASSWORD}', decode_responses=True)
agent_id = '{agent.agent_id}'
project_id = '{self.project_id}'
current_plan = None
def emit_output(output_type, phase, content):
    '''Emit an Alpha/Beta/Gamma output'''
    chksum = hashlib.sha256(json.dumps(content, sort_keys=True).encode()).hexdigest()[:12]
    output = {{
        'phase': phase,
        'content': content,
        'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ'),
        'checksum': chksum
    }}
    key = f'agent:{{agent_id}}:output:{{output_type}}'
    r.set(key, json.dumps(output), ex=300)
    print(f'[{{agent_id}}] Emitted {{output_type.upper()}} output: phase={{phase}}, checksum={{chksum}}')
def check_for_plan_update():
    '''Check if overwatch has broadcast a new plan'''
    global current_plan
    plan_id = r.get(f'agent:{{agent_id}}:plan_update')
    if plan_id and plan_id != current_plan:
        current_plan = plan_id
        plan_data = r.get(f'project:{{project_id}}:plan:{{plan_id}}')
        if plan_data:
            plan = json.loads(plan_data)
            obj = plan.get('objective', 'unknown')
            adjs = plan.get('adjustments', [])
            print(f'[{{agent_id}}] RECEIVED PLAN UPDATE: {{plan_id}}')
            print(f'[{{agent_id}}] Objective: {{obj}}')
            for adj in adjs:
                print(f'[{{agent_id}}] Adjustment: {{adj}}')
            return plan
    return None
def check_for_pause():
    '''Check if pipeline is paused'''
    if r.get(f'agent:{{agent_id}}:pause_signal') == '1':
        print(f'[{{agent_id}}] Pipeline PAUSED - waiting for resume...')
        for _ in range(30): # Wait up to 15 seconds
            if r.get(f'agent:{{agent_id}}:pause_signal') != '1':
                resume = r.get(f'agent:{{agent_id}}:resume_signal')
                print(f'[{{agent_id}}] Pipeline RESUMED with plan: {{resume}}')
                check_for_plan_update()
                return True
            time.sleep(0.5)
        print(f'[{{agent_id}}] Pause timeout, continuing...')
    return True
def check_and_recover():
    '''Check for chaos conditions and attempt recovery'''
    # First check for pause/plan updates
    check_for_pause()
    check_for_plan_update()
    # Check for revoke signal
    if r.get(f'agent:{{agent_id}}:revoke_signal') == '1':
        print(f'[{{agent_id}}] Token revoked, waiting for recovery...')
        for _ in range(10):
            if r.get(f'agent:{{agent_id}}:revoke_signal') != '1':
                print(f'[{{agent_id}}] Revoke cleared, resuming')
                return True
            time.sleep(0.2)
        return False
    # Check for lost lock
    lock = r.get(f'agent:{{agent_id}}:lock')
    if lock is None:
        print(f'[{{agent_id}}] Lock lost, attempting re-acquire...')
        for _ in range(5):
            if r.set(f'agent:{{agent_id}}:lock', agent_id, ex=300, nx=True):
                print(f'[{{agent_id}}] Lock re-acquired')
                return True
            if r.get(f'agent:{{agent_id}}:lock') == agent_id:
                print(f'[{{agent_id}}] Lock restored by overwatch')
                return True
            time.sleep(0.2)
        return False
    # Check status
    status = r.hget(f'agent:{{agent_id}}:state', 'status')
    if status == 'recovering':
        print(f'[{{agent_id}}] In recovery mode, resuming...')
        r.hset(f'agent:{{agent_id}}:state', 'status', 'running')
    return True
# Acquire initial lock
r.set(f'agent:{{agent_id}}:lock', agent_id, ex=300)
# Define phase outputs (aligned with pipeline.core.PHASE_OUTPUT_TYPES)
phase_outputs = {{
    'BOOTSTRAP': ('alpha', {{'status': 'initialized', 'capabilities': ['execute', 'verify']}}),
    'PREFLIGHT': ('alpha', {{'checks_passed': True, 'resources_available': True}}),
    'PLAN': ('beta', {{'steps': ['analyze', 'transform', 'validate'], 'estimated_phases': 3}}),
    'EXECUTE': ('beta', {{'progress': 100, 'artifacts_created': ['output.json']}}),
    'VERIFY': ('gamma', {{'tests_passed': True, 'quality_score': 0.95}}),
    'PACKAGE': ('gamma', {{'artifacts_bundled': True, 'evidence_collected': True}}),
    'REPORT': ('gamma', {{'summary': 'Pipeline completed successfully', 'metrics': {{'duration': 0, 'errors': 0}}}}),
}}
# Simulate pipeline execution with resilience and outputs
# Official agent phases from pipeline.core (excludes EXIT - that's the terminal state)
phases = ['BOOTSTRAP', 'PREFLIGHT', 'PLAN', 'EXECUTE', 'VERIFY', 'PACKAGE', 'REPORT']
start_time = time.time()
for i, phase in enumerate(phases):
    # Check for chaos conditions before each phase
    if not check_and_recover():
        print(f'[{{agent_id}}] Cannot recover from chaos, marking failed')
        r.hset(f'agent:{{agent_id}}:state', 'status', 'failed')
        # Emit failure output
        emit_output('gamma', phase, {{'status': 'failed', 'reason': 'chaos_unrecoverable'}})
        exit(1)
    r.hset(f'agent:{{agent_id}}:state', 'phase', phase)
    r.hset(f'agent:{{agent_id}}:state', 'status', 'running')
    r.set(f'agent:{{agent_id}}:heartbeat', time.time(), ex=60)
    # Emit output for this phase
    output_type, content = phase_outputs.get(phase, ('alpha', {{'phase': phase}}))
    content['agent_id'] = agent_id
    content['phase_index'] = i
    emit_output(output_type, phase, content)
    time.sleep(0.3)
    # Extra check mid-phase
    check_and_recover()
# Final gamma output with summary
duration = time.time() - start_time
emit_output('gamma', 'EXIT', {{
    'status': 'completed',
    'total_phases': len(phases),
    'duration_seconds': round(duration, 2),
    'plan_updates_received': 1 if current_plan else 0
}})
# Mark complete
r.hset(f'agent:{{agent_id}}:state', mapping={{
    'status': 'completed',
    'phase': 'EXIT',
    'completed_at': time.strftime('%Y-%m-%dT%H:%M:%SZ')
}})
r.delete(f'agent:{{agent_id}}:lock')
print(f'Agent {{agent_id}} completed pipeline ({{len(phases)}} phases, {{duration:.2f}}s)')
"
"""
def _get_bun_agent_cmd(self, agent: AgentInstance) -> str:
    """Build the shell command that runs a simulated Bun worker agent.

    NOTE(review): despite the name, this emits a ``python3 -c`` script
    (it merely reports ``'agent_type': 'bun'`` in its alpha output) —
    presumably a stand-in until a real Bun runner exists; confirm.

    The embedded script walks the official agent phases, emits beta
    outputs at PLAN/EXECUTE and a gamma output at VERIFY/EXIT, and
    recovers from error spikes, lost locks, and pause signals by
    polling/repairing the corresponding Redis keys.

    Args:
        agent: The AgentInstance to run; its ``agent_id`` is interpolated
            into the script along with this orchestrator's ``project_id``.

    Returns:
        A shell command string suitable for ``Popen(..., shell=True)``.
    """
    # Doubled braces are outer-f-string escapes for the inner script.
    return f"""
python3 -c "
import redis
import time
import json
import hashlib
r = redis.Redis(host='127.0.0.1', port=6379, password='{REDIS_PASSWORD}', decode_responses=True)
agent_id = '{agent.agent_id}'
project_id = '{self.project_id}'
current_plan = None
def emit_output(output_type, phase, content):
    '''Emit an Alpha/Beta/Gamma output'''
    output = {{
        'phase': phase,
        'content': content,
        'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ'),
        'checksum': hashlib.sha256(json.dumps(content, sort_keys=True).encode()).hexdigest()[:12]
    }}
    key = f'agent:{{agent_id}}:output:{{output_type}}'
    r.set(key, json.dumps(output), ex=300)
    print(f'[{{agent_id}}] Emitted {{output_type.upper()}} output: phase={{phase}}')
def check_for_plan_update():
    '''Check if overwatch has broadcast a new plan'''
    global current_plan
    plan_id = r.get(f'agent:{{agent_id}}:plan_update')
    if plan_id and plan_id != current_plan:
        current_plan = plan_id
        plan_data = r.get(f'project:{{project_id}}:plan:{{plan_id}}')
        if plan_data:
            plan = json.loads(plan_data)
            print(f'[{{agent_id}}] RECEIVED PLAN UPDATE: {{plan_id}}')
            return plan
    return None
def check_for_pause():
    '''Check if pipeline is paused'''
    if r.get(f'agent:{{agent_id}}:pause_signal') == '1':
        print(f'[{{agent_id}}] Pipeline PAUSED - waiting...')
        for _ in range(30):
            if r.get(f'agent:{{agent_id}}:pause_signal') != '1':
                print(f'[{{agent_id}}] Pipeline RESUMED')
                check_for_plan_update()
                return True
            time.sleep(0.5)
    return True
def check_and_recover():
    '''Check for chaos and recover'''
    check_for_pause()
    check_for_plan_update()
    # Check error count
    errors = r.hget(f'agent:{{agent_id}}:errors', 'total_errors')
    if errors and int(errors) >= 5:
        print(f'[{{agent_id}}] Error spike detected, resetting...')
        r.hset(f'agent:{{agent_id}}:errors', 'total_errors', '0')
        time.sleep(0.1)
    # Check lock
    lock = r.get(f'agent:{{agent_id}}:lock')
    if lock is None:
        print(f'[{{agent_id}}] Lock lost, re-acquiring...')
        r.set(f'agent:{{agent_id}}:lock', agent_id, ex=300)
        time.sleep(0.1)
    # Restore heartbeat
    r.set(f'agent:{{agent_id}}:heartbeat', time.time(), ex=60)
    # Check and clear recovery status
    status = r.hget(f'agent:{{agent_id}}:state', 'status')
    if status == 'recovering':
        r.hset(f'agent:{{agent_id}}:state', 'status', 'running')
    return True
# Acquire lock
r.set(f'agent:{{agent_id}}:lock', agent_id, ex=300)
# Emit initial alpha output
emit_output('alpha', 'INIT', {{'agent_type': 'bun', 'ready': True}})
# Official agent phases from pipeline.core (excludes EXIT - that's the terminal state)
phases = ['BOOTSTRAP', 'PREFLIGHT', 'PLAN', 'EXECUTE', 'VERIFY', 'PACKAGE', 'REPORT']
start_time = time.time()
for i, phase in enumerate(phases):
    check_and_recover()
    r.hset(f'agent:{{agent_id}}:state', 'phase', phase)
    r.hset(f'agent:{{agent_id}}:state', 'status', 'running')
    r.set(f'agent:{{agent_id}}:heartbeat', time.time(), ex=60)
    # Emit outputs at key phases
    if phase == 'PLAN':
        emit_output('beta', phase, {{'analysis_complete': True, 'items_to_process': 5}})
    elif phase == 'EXECUTE':
        emit_output('beta', phase, {{'items_processed': 5, 'success_rate': 1.0}})
    elif phase == 'VERIFY':
        emit_output('gamma', phase, {{'verified': True, 'confidence': 0.98}})
    time.sleep(0.25)
# Final gamma output
duration = time.time() - start_time
emit_output('gamma', 'EXIT', {{
    'status': 'completed',
    'duration_seconds': round(duration, 2),
    'plan_updates': 1 if current_plan else 0
}})
r.hset(f'agent:{{agent_id}}:state', mapping={{
    'status': 'completed',
    'phase': 'EXIT',
    'completed_at': time.strftime('%Y-%m-%dT%H:%M:%SZ')
}})
r.delete(f'agent:{{agent_id}}:lock')
print(f'Agent {{agent_id}} completed pipeline (bun, {{duration:.2f}}s)')
"
"""
def _get_diagnostic_agent_cmd(self, agent: AgentInstance) -> str:
    """Build the shell command that runs the diagnostic agent.

    The embedded ``python3 -c`` script does a single sweep rather than a
    multi-phase pipeline: it pings Redis, enumerates all agents registered
    under ``project:<id>:agents``, tallies their statuses and which
    Alpha/Beta/Gamma output keys exist, then writes a gamma REPORT output
    and stores the full diagnostics JSON in its own state hash.

    Args:
        agent: The AgentInstance to run; its ``agent_id`` is interpolated
            into the script along with this orchestrator's ``project_id``.

    Returns:
        A shell command string suitable for ``Popen(..., shell=True)``.
    """
    # Doubled braces are outer-f-string escapes for the inner script.
    return f"""
python3 -c "
import redis
import json
import time
import hashlib
r = redis.Redis(host='127.0.0.1', port=6379, password='{REDIS_PASSWORD}', decode_responses=True)
agent_id = '{agent.agent_id}'
project_id = '{self.project_id}'
def emit_output(output_type, phase, content):
    '''Emit an Alpha/Beta/Gamma output'''
    output = {{
        'phase': phase,
        'content': content,
        'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ'),
        'checksum': hashlib.sha256(json.dumps(content, sort_keys=True).encode()).hexdigest()[:12]
    }}
    key = f'agent:{{agent_id}}:output:{{output_type}}'
    r.set(key, json.dumps(output), ex=300)
    print(f'[{{agent_id}}] Emitted {{output_type.upper()}} output: phase={{phase}}')
def check_for_plan_update():
    '''Check if overwatch has broadcast a new plan'''
    plan_id = r.get(f'agent:{{agent_id}}:plan_update')
    if plan_id:
        plan_data = r.get(f'project:{{project_id}}:plan:{{plan_id}}')
        if plan_data:
            plan = json.loads(plan_data)
            print(f'[{{agent_id}}] Received plan update: {{plan_id}}')
            return plan
    return None
# Alpha output: starting diagnostics
emit_output('alpha', 'INIT', {{'agent_type': 'diagnostic', 'starting': True}})
# Check for plan updates
check_for_plan_update()
# Run diagnostics
diagnostics = {{
    'redis_ping': r.ping(),
    'agent_count': r.scard(f'project:{{project_id}}:agents'),
    'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ')
}}
# Beta output: initial diagnostics
emit_output('beta', 'SCAN', {{'redis_ok': diagnostics['redis_ping'], 'agents_found': diagnostics['agent_count']}})
# Check all agents
agent_states = {{}}
outputs_found = {{'alpha': 0, 'beta': 0, 'gamma': 0}}
for aid in r.smembers(f'project:{{project_id}}:agents'):
    state = r.hgetall(f'agent:{{aid}}:state')
    agent_states[aid] = state.get('status', 'unknown')
    # Check outputs
    for otype in ['alpha', 'beta', 'gamma']:
        if r.exists(f'agent:{{aid}}:output:{{otype}}'):
            outputs_found[otype] += 1
diagnostics['agent_states'] = agent_states
diagnostics['healthy_count'] = sum(1 for s in agent_states.values() if s in ['running', 'completed'])
diagnostics['outputs_found'] = outputs_found
# Gamma output: full diagnostic report
emit_output('gamma', 'REPORT', {{
    'agents_checked': len(agent_states),
    'healthy_agents': diagnostics['healthy_count'],
    'outputs': outputs_found,
    'status_breakdown': dict((s, list(agent_states.values()).count(s)) for s in set(agent_states.values()))
}})
r.hset(f'agent:{{agent_id}}:state', mapping={{
    'status': 'completed',
    'phase': 'EXIT',
    'diagnostics': json.dumps(diagnostics),
    'completed_at': time.strftime('%Y-%m-%dT%H:%M:%SZ')
}})
print(f'Diagnostic agent {{agent_id}} complete: {{len(agent_states)}} agents, outputs: {{outputs_found}}')
"
"""
def _get_helper_agent_cmd(self, agent: AgentInstance) -> str:
    """Build the shell command for a helper agent that repairs chaos damage.

    The embedded ``python3 -c`` script runs a three-step recovery:
    analyze (find agents whose status is failed/recovering/chaos_injected),
    recover (clear revoke signals, restore locks and heartbeats, zero error
    counts, fix 'CORRUPT' phases, flip 'recovering' back to 'running'), and
    verify (recount how many troubled agents left the troubled states).

    Args:
        agent: The helper AgentInstance; ``agent_id``, ``project_id`` and
            ``spawned_by`` (empty string if None) are interpolated in.

    Returns:
        A shell command string suitable for ``Popen(..., shell=True)``.
    """
    # Doubled braces are outer-f-string escapes for the inner script.
    return f"""
python3 -c "
import redis
import time
import json
r = redis.Redis(host='127.0.0.1', port=6379, password='{REDIS_PASSWORD}', decode_responses=True)
agent_id = '{agent.agent_id}'
project_id = '{self.project_id}'
spawned_by = '{agent.spawned_by or ""}'
print(f'[{{agent_id}}] Helper starting recovery work')
# Step 1: Analyze - find agents needing help
r.hset(f'agent:{{agent_id}}:state', 'step', 'analyze')
r.set(f'agent:{{agent_id}}:heartbeat', time.time(), ex=60)
troubled_agents = []
for aid in r.smembers(f'project:{{project_id}}:agents'):
    if aid == agent_id:
        continue
    state = r.hgetall(f'agent:{{aid}}:state')
    status = state.get('status', '')
    if status in ['failed', 'recovering', 'chaos_injected']:
        troubled_agents.append(aid)
        print(f'[{{agent_id}}] Found troubled agent: {{aid}} ({{status}})')
time.sleep(0.2)
# Step 2: Recover - fix issues for troubled agents
r.hset(f'agent:{{agent_id}}:state', 'step', 'recover')
for aid in troubled_agents:
    # Clear revoke signals
    r.delete(f'agent:{{aid}}:revoke_signal')
    # Restore locks
    lock = r.get(f'agent:{{aid}}:lock')
    if lock is None:
        r.set(f'agent:{{aid}}:lock', aid, ex=300)
        print(f'[{{agent_id}}] Restored lock for {{aid}}')
    # Restore heartbeats
    r.set(f'agent:{{aid}}:heartbeat', time.time(), ex=60)
    # Reset error counts
    r.hset(f'agent:{{aid}}:errors', 'total_errors', '0')
    # Fix corrupted state
    phase = r.hget(f'agent:{{aid}}:state', 'phase')
    if phase and 'CORRUPT' in phase:
        r.hset(f'agent:{{aid}}:state', 'phase', 'EXECUTE')
        print(f'[{{agent_id}}] Fixed corrupted state for {{aid}}')
    # Set back to running if recovering
    status = r.hget(f'agent:{{aid}}:state', 'status')
    if status == 'recovering':
        r.hset(f'agent:{{aid}}:state', 'status', 'running')
        print(f'[{{agent_id}}] Set {{aid}} back to running')
time.sleep(0.2)
# Step 3: Verify - check recovery worked
r.hset(f'agent:{{agent_id}}:state', 'step', 'verify')
recovered = 0
for aid in troubled_agents:
    state = r.hgetall(f'agent:{{aid}}:state')
    if state.get('status') not in ['failed', 'recovering', 'chaos_injected']:
        recovered += 1
print(f'[{{agent_id}}] Recovery complete: {{recovered}}/{{len(troubled_agents)}} agents recovered')
r.hset(f'agent:{{agent_id}}:state', mapping={{
    'status': 'completed',
    'phase': 'EXIT',
    'recovered_count': str(recovered),
    'completed_at': time.strftime('%Y-%m-%dT%H:%M:%SZ')
}})
print(f'Helper agent {{agent_id}} completed recovery work')
"
"""
async def inject_chaos(self, agent_id: str, chaos_type: ChaosCondition):
    """Apply one chaos condition to a running agent.

    Marks the agent as CHAOS_INJECTED, records the event in
    ``self.chaos_events`` for the final report, then mutates the Redis
    keys that realize the failure so both the agent's own recovery logic
    and the overwatch can observe it. Unknown agent ids are ignored.
    """
    agent = self.agents.get(agent_id)
    if agent is None:
        return
    agent.chaos_condition = chaos_type
    agent.status = AgentStatus.CHAOS_INJECTED
    self.chaos_events.append({
        "agent_id": agent_id,
        "chaos_type": chaos_type.value,
        "timestamp": datetime.utcnow().isoformat()
    })
    print(f" [CHAOS] Injected {chaos_type.value} into {agent_id}")
    # Realize the chaos in Redis; every injection also bumps the local
    # error count (an error spike counts for 5 at once).
    prefix = f"agent:{agent_id}"
    if chaos_type is ChaosCondition.TOKEN_REVOKED:
        self.redis.set(f"{prefix}:revoke_signal", "1")
        agent.error_count += 1
    elif chaos_type is ChaosCondition.LOCK_LOST:
        self.redis.delete(f"{prefix}:lock")
        agent.error_count += 1
    elif chaos_type is ChaosCondition.STATE_CORRUPTED:
        self.redis.hset(f"{prefix}:state", "phase", "CORRUPTED")
        agent.error_count += 1
    elif chaos_type is ChaosCondition.HEARTBEAT_TIMEOUT:
        self.redis.delete(f"{prefix}:heartbeat")
        agent.error_count += 1
    elif chaos_type is ChaosCondition.ERROR_SPIKE:
        self.redis.hincrby(f"{prefix}:errors", "total_errors", 5)
        agent.error_count += 5
async def monitor_agents(self):
    """Poll worker subprocesses until every non-overwatch agent has exited.

    Every 0.5s this checks each spawned process. When a process exits,
    its final status is read from the agent's Redis state hash
    ('completed' => COMPLETED, anything else => FAILED + one error).
    Agents still marked RUNNING with no live heartbeat key accrue one
    error per tick, which is what drives overwatch escalation.

    Fix over the previous version: an exited process is now handled
    exactly once. Before, ``poll()`` kept returning the exit code on
    every tick, so ``completed_at`` was overwritten and ``error_count``
    was re-incremented for failed agents every 0.5s until the whole
    fleet finished, inflating the counts used for reporting/escalation.
    """
    terminal = (AgentStatus.COMPLETED, AgentStatus.FAILED)
    while True:
        all_done = True
        for agent_id, agent in self.agents.items():
            if agent.agent_type == AgentType.OVERWATCH:
                continue
            if agent.process:
                ret = agent.process.poll()
                if ret is None:
                    all_done = False
                elif agent.status not in terminal:
                    # Process just exited: record the result exactly once.
                    agent.exit_code = ret
                    agent.completed_at = datetime.utcnow()
                    # The agent reports its own outcome via Redis state.
                    state = self.redis.hgetall(f"agent:{agent_id}:state")
                    if state.get("status") == "completed":
                        agent.status = AgentStatus.COMPLETED
                    else:
                        agent.status = AgentStatus.FAILED
                        agent.error_count += 1
            # Missed-heartbeat accounting for agents still running.
            hb = self.redis.get(f"agent:{agent_id}:heartbeat")
            if agent.status == AgentStatus.RUNNING and not hb:
                agent.error_count += 1
        if all_done:
            break
        await asyncio.sleep(0.5)
async def run(self, agents_config: List[Dict], chaos_schedule: List[Dict] = None) -> Dict:
    """
    Run the multi-agent chaos test end to end.

    Starts the overwatch, spawns every configured agent, schedules the
    chaos injections as background tasks, waits for all agents to exit,
    runs the DragonflyDB readiness checks, records per-agent history,
    and prints/returns the collected results document.

    Args:
        agents_config: List of {"type": AgentType, "pipeline": str}
        chaos_schedule: List of {"delay": float, "agent_index": int, "chaos": ChaosCondition}

    Returns:
        The results dict assembled by ``_collect_results``.
    """
    self.start_time = datetime.utcnow()
    print("\n" + "=" * 70)
    print("MULTI-AGENT CHAOS TEST")
    print("=" * 70)
    print(f"Project: {self.project_id}")
    print(f"Agents: {len(agents_config)}")
    print(f"Started: {self.start_time.isoformat()}")
    print()
    # Start overwatch (runs concurrently for the whole test)
    self.overwatch = OverwatchAgent(self)
    overwatch_task = asyncio.create_task(self.overwatch.start())
    # Spawn all agents
    print("--- SPAWNING AGENTS ---")
    agent_ids = []
    for config in agents_config:
        agent_type = config["type"]
        pipeline = config.get("pipeline", "default")
        agent_id = await self.spawn_agent(agent_type, pipeline)
        agent_ids.append(agent_id)
        print(f" Started: {agent_id} ({agent_type.value})")
    print()
    # Schedule chaos events as fire-and-forget delayed tasks
    if chaos_schedule:
        print("--- CHAOS SCHEDULE ---")
        for event in chaos_schedule:
            asyncio.create_task(self._delayed_chaos(
                event["delay"],
                agent_ids[event["agent_index"]],
                event["chaos"]
            ))
            print(f" Scheduled: {event['chaos'].value} for agent {event['agent_index']} at +{event['delay']}s")
        print()
    # Monitor agents until all worker processes exit
    print("--- AGENT PROGRESS ---")
    await self.monitor_agents()
    # Stop overwatch and reap its task
    self.overwatch.stop()
    overwatch_task.cancel()
    try:
        await overwatch_task
    except asyncio.CancelledError:
        pass
    self.end_time = datetime.utcnow()
    # Run readiness checks
    print("\n--- DRAGONFLY READINESS CHECKS ---")
    ready, checks = self.readiness.run_checks()
    for check in checks:
        # NOTE(review): both branches are empty strings here — the
        # pass/fail glyphs (presumably ✓/✗) look lost; confirm intent.
        status = "" if check["passed"] else ""
        print(f" {status} {check['name']}: {check['message']}")
    # Collect results
    results = self._collect_results(ready, checks)
    # Record history for each agent (feeds the learning store)
    for agent_id, agent in self.agents.items():
        self.history.record_run(agent_id, {
            "agent_type": agent.agent_type.value,
            "pipeline": agent.pipeline_id,
            "success": agent.status == AgentStatus.COMPLETED,
            "error_count": agent.error_count,
            "chaos_condition": agent.chaos_condition.value,
            "recovered_from_chaos": agent.chaos_condition != ChaosCondition.NONE and agent.status == AgentStatus.COMPLETED,
            "duration_ms": (agent.completed_at - agent.started_at).total_seconds() * 1000 if agent.completed_at and agent.started_at else None,
            "session_id": self.project_id,
            "action": "chaos_test_run"
        })
    # Print summary
    self._print_summary(results)
    return results
async def _delayed_chaos(self, delay: float, agent_id: str, chaos: ChaosCondition):
    """Sleep for ``delay`` seconds, then inject ``chaos`` into ``agent_id``.

    Scheduled as a fire-and-forget task by ``run`` to realize the
    chaos schedule without blocking agent monitoring.
    """
    await asyncio.sleep(delay)
    await self.inject_chaos(agent_id, chaos)
def _collect_results(self, ready: bool, checks: List[Dict]) -> Dict:
    """Assemble the final results document for this chaos-test run.

    Args:
        ready: Whether the DragonflyDB readiness checks all passed.
        checks: The individual readiness-check records.

    Returns:
        A JSON-serializable dict with per-agent outcomes, output counts,
        chaos/overwatch stats, clarified plans, and readiness checks.
    """
    statuses = [a.status for a in self.agents.values()]
    completed = statuses.count(AgentStatus.COMPLETED)
    failed = statuses.count(AgentStatus.FAILED)
    duration = (self.end_time - self.start_time).total_seconds()

    # Per-agent records, including the tail of each agent's log file.
    agent_results = []
    for agent_id, agent in self.agents.items():
        log_file = LOG_DIR / f"{agent_id}.log"
        tail = log_file.read_text()[-500:] if log_file.exists() else ""
        agent_results.append({
            "agent_id": agent_id,
            "type": agent.agent_type.value,
            "status": agent.status.value,
            "pipeline": agent.pipeline_id,
            "chaos_condition": agent.chaos_condition.value,
            "error_count": agent.error_count,
            "exit_code": agent.exit_code,
            "spawned_by": agent.spawned_by,
            "output_tail": tail
        })

    # How many agents emitted each Alpha/Beta/Gamma output key.
    outputs_summary = {
        otype: sum(
            1 for agent_id in self.agents
            if self.redis.exists(f"agent:{agent_id}:output:{otype}")
        )
        for otype in ("alpha", "beta", "gamma")
    }

    # Plans the overwatch clarified and broadcast during the run.
    clarified_plans = []
    if self.overwatch:
        clarified_plans = [
            {
                "plan_id": plan.plan_id,
                "trigger": plan.trigger_reason,
                "adjustments": plan.adjustments,
                "agents_notified": len(plan.acknowledged_by),
                "broadcast_at": plan.broadcast_at.isoformat()
            }
            for plan in self.overwatch.clarified_plans
        ]

    return {
        "project_id": self.project_id,
        "success": ready and failed == 0,
        "unified_objective_reached": ready,
        "duration_seconds": duration,
        "agents": {
            "total": len(self.agents),
            "completed": completed,
            "failed": failed,
            "helpers_spawned": self.overwatch.helpers_spawned if self.overwatch else 0
        },
        "outputs": outputs_summary,
        "chaos_events": len(self.chaos_events),
        "overwatch_interventions": len(self.overwatch.intervention_log) if self.overwatch else 0,
        "clarified_plans": clarified_plans,
        "readiness_checks": checks,
        "agent_results": agent_results,
        "timestamp": datetime.utcnow().isoformat()
    }
def _print_summary(self, results: Dict):
    """Pretty-print the chaos-test results document to stdout.

    Args:
        results: The dict produced by ``_collect_results``.
    """
    print("\n" + "=" * 70)
    print("TEST SUMMARY")
    print("=" * 70)
    status = "✓ SUCCESS" if results["success"] else "✗ FAILED"
    print(f"\nResult: {status}")
    print(f"Unified Objective: {'Reached' if results['unified_objective_reached'] else 'Not Reached'}")
    print(f"Duration: {results['duration_seconds']:.2f}s")
    print(f"\nAgents:")
    print(f" Total: {results['agents']['total']}")
    print(f" Completed: {results['agents']['completed']}")
    print(f" Failed: {results['agents']['failed']}")
    print(f" Helpers Spawned: {results['agents']['helpers_spawned']}")
    print(f"\nOutputs (Alpha/Beta/Gamma):")
    outputs = results.get("outputs", {})
    print(f" Alpha: {outputs.get('alpha', 0)}")
    print(f" Beta: {outputs.get('beta', 0)}")
    print(f" Gamma: {outputs.get('gamma', 0)}")
    print(f"\nChaos:")
    print(f" Events Injected: {results['chaos_events']}")
    print(f" Overwatch Interventions: {results['overwatch_interventions']}")
    # Plan clarification details (section omitted when no plans were broadcast)
    clarified_plans = results.get("clarified_plans", [])
    if clarified_plans:
        print(f"\nPlan Clarifications: {len(clarified_plans)}")
        for plan in clarified_plans:
            print(f" Plan {plan['plan_id']}:")
            print(f" Trigger: {plan['trigger']}")
            print(f" Agents Notified: {plan['agents_notified']}")
            print(f" Adjustments:")
            for adj in plan.get('adjustments', []):
                print(f" - {adj}")
    print(f"\nAgent Details:")
    for ar in results["agent_results"]:
        # Suffixes only when a chaos condition was injected / helper-spawned
        chaos = f" [CHAOS: {ar['chaos_condition']}]" if ar["chaos_condition"] != "none" else ""
        spawned = f" (spawned by {ar['spawned_by']})" if ar["spawned_by"] else ""
        print(f" {ar['agent_id']}: {ar['status']}{chaos}{spawned}")
    print("\n" + "=" * 70)
async def main():
    """Run the multi-agent chaos test with the default scenario.

    Creates a timestamped project, sets its objective in Redis, launches
    one Python, one Bun, and one Diagnostic agent, injects an error spike
    (agent 1, +0.8s) and a heartbeat timeout (agent 0, +2.5s), then saves
    the results JSON under LOG_DIR.

    Returns:
        Process exit code: 0 when the run succeeded, 1 otherwise.
    """
    project_id = f"chaos-test-{datetime.utcnow().strftime('%Y%m%d-%H%M%S')}"
    orchestrator = MultiAgentOrchestrator(project_id)
    # Set project objective
    orchestrator.redis.set(
        f"project:{project_id}:objective",
        "Complete multi-agent chaos test with all agents reaching EXIT phase"
    )
    # Configure agents
    agents_config = [
        {"type": AgentType.PYTHON, "pipeline": "infrastructure"},
        {"type": AgentType.BUN, "pipeline": "code-analysis"},
        {"type": AgentType.DIAGNOSTIC, "pipeline": "diagnostics"},
    ]
    # Configure chaos schedule - designed to trigger error threshold
    # Single error_spike triggers plan clarification while allowing recovery
    chaos_schedule = [
        {"delay": 0.8, "agent_index": 1, "chaos": ChaosCondition.ERROR_SPIKE},  # +5 errors triggers threshold
        {"delay": 2.5, "agent_index": 0, "chaos": ChaosCondition.HEARTBEAT_TIMEOUT},  # Late, after most work done
    ]
    # Run test (overwatch error_threshold defaults to 3, error_spike adds 5)
    results = await orchestrator.run(agents_config, chaos_schedule)
    # Save results
    results_file = LOG_DIR / f"results-{project_id}.json"
    with open(results_file, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to: {results_file}")
    return 0 if results["success"] else 1
# Script entry point: run the chaos test and propagate its pass/fail
# status to the shell via the process exit code (0 = success, 1 = failure).
if __name__ == "__main__":
    sys.exit(asyncio.run(main()))