#!/usr/bin/env python3
"""
Multi-Agent Chaos Test Orchestrator

Launches multiple real agents (Python, Bun, Diagnostic) working simultaneously
on the same project with chaos injection and overwatch coordination.

Features:
- Real pipeline execution using official pipeline definitions
- Overwatch agent spawns helpers on errors
- DragonflyDB readiness criteria
- History recording for learning
- Unified objective verification
- Alpha/Beta/Gamma output tracking with plan clarification

Architecture Reference: /opt/agent-governance/docs/ARCHITECTURE.md
Pipeline Reference: /opt/agent-governance/pipeline/core.py
"""

# --- standard library ---
import asyncio
import hashlib
import json
import os
import signal
import sqlite3
import subprocess
import sys
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple

# --- third party ---
import redis

# Make the repository root importable so `pipeline.core` resolves.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

# Import official pipeline definitions from unified core module
from pipeline.core import (
    # Enums
    AgentPhase,
    OutputType,
    ChaosCondition,
    StageStatus,
    # Data classes
    AgentOutput,
    ClarifiedPlan,
    ErrorBudget,
    # Constants
    AGENT_PHASE_NAMES,
    PHASE_OUTPUT_TYPES,
    DEFAULT_REDIS_HOST,
    DEFAULT_REDIS_PORT,
    DEFAULT_REDIS_PASSWORD,
    DEFAULT_LEDGER_PATH,
    DEFAULT_HEARTBEAT_TTL,
    DEFAULT_LOCK_TTL,
    DEFAULT_OUTPUT_TTL,
    # Key patterns
    RedisKeys,
    # Utilities
    get_output_type_for_phase,
)

# =============================================================================
# Configuration (using defaults from pipeline.core where possible)
# =============================================================================

REDIS_HOST = DEFAULT_REDIS_HOST
REDIS_PORT = DEFAULT_REDIS_PORT
REDIS_PASSWORD = DEFAULT_REDIS_PASSWORD
LEDGER_PATH = Path(DEFAULT_LEDGER_PATH)
VAULT_ADDR = "https://127.0.0.1:8200"
VAULT_TOKEN_FILE = Path("/opt/vault/init-keys.json")
LOG_DIR = Path("/opt/agent-governance/tests/multi-agent-chaos/logs")

# Ensure log directory exists before any agent process is launched.
LOG_DIR.mkdir(parents=True, exist_ok=True)

# =============================================================================
# Chaos Test Specific Enums (extend core definitions for testing)
# =============================================================================


class AgentType(Enum):
    """Agent types specific to the chaos test orchestrator."""
    PYTHON = "python"
    BUN = "bun"
    DIAGNOSTIC = "diagnostic"
    OVERWATCH = "overwatch"
    HELPER = "helper"


class AgentStatus(Enum):
    """
    Agent runtime status for chaos testing.

    Extends the core AgentStatus with chaos-specific states.
    Note: Uses lowercase values for Redis compatibility in this test context.
    """
    PENDING = "pending"
    STARTING = "starting"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CHAOS_INJECTED = "chaos_injected"  # Chaos test specific
    RECOVERING = "recovering"
    PAUSED = "paused"
    REVOKED = "revoked"

# Note: We intentionally shadow pipeline.core.AgentStatus here because:
# 1. Chaos test needs CHAOS_INJECTED state not in core
# 2. Test uses lowercase values for Redis state storage
# 3.
@dataclass
class AgentInstance:
    """Represents a running agent instance."""
    agent_id: str
    agent_type: AgentType
    process: Optional[subprocess.Popen] = None
    status: AgentStatus = AgentStatus.PENDING
    pipeline_id: Optional[str] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    exit_code: Optional[int] = None
    chaos_condition: ChaosCondition = ChaosCondition.NONE
    error_count: int = 0
    output_log: str = ""
    spawned_by: Optional[str] = None  # agent_id of the spawner (set for helpers only)


@dataclass
class ReadinessCheck:
    """DragonflyDB readiness criteria."""
    name: str
    check_func: callable  # zero-arg callable returning (passed: bool, message: str)
    required: bool = True
    passed: bool = False
    message: str = ""


class DragonflyReadiness:
    """Manages DragonflyDB readiness checks for multi-agent coordination."""

    def __init__(self, redis_client: redis.Redis, project_id: str):
        self.redis = redis_client
        self.project_id = project_id
        self.checks: List[ReadinessCheck] = []

    def add_check(self, name: str, check_func: callable, required: bool = True):
        """Add a readiness check."""
        self.checks.append(ReadinessCheck(
            name=name,
            check_func=check_func,
            required=required
        ))

    def run_checks(self) -> Tuple[bool, List[Dict]]:
        """Run all readiness checks.

        Returns (all_passed, results). `all_passed` only reflects checks
        flagged as required; optional checks are reported but never block
        readiness.
        """
        results = []
        all_passed = True
        for check in self.checks:
            try:
                passed, message = check.check_func()
                check.passed = passed
                check.message = message
            except Exception as e:
                # A crashing check counts as a failed check, not as a crash
                # of the readiness sweep itself.
                check.passed = False
                check.message = f"Error: {e}"
            if check.required and not check.passed:
                all_passed = False
            results.append({
                "name": check.name,
                "passed": check.passed,
                "required": check.required,
                "message": check.message
            })
        return all_passed, results

    def wait_for_ready(self, timeout_seconds: int = 60) -> bool:
        """Poll every 0.5s until all required checks pass or timeout expires."""
        start = time.time()
        while time.time() - start < timeout_seconds:
            all_ready, _ = self.run_checks()
            if all_ready:
                return True
            time.sleep(0.5)
        return False


class HistoryRecorder:
    """Records agent history for learning and refinement."""

    def __init__(self, redis_client: redis.Redis, ledger_path: Path):
        self.redis = redis_client
        self.ledger_path = ledger_path

    def record_run(self, agent_id: str, run_data: Dict):
        """Record a run to history (Redis ring buffer + SQLite ledger).

        The Redis copy is the primary read path for this test; ledger
        failures are logged but non-fatal.
        """
        # Store in Redis for quick access (most recent first, capped at 100).
        history_key = f"history:{agent_id}:runs"
        run_data["recorded_at"] = datetime.utcnow().isoformat()
        self.redis.lpush(history_key, json.dumps(run_data))
        self.redis.ltrim(history_key, 0, 99)  # Keep last 100 runs

        # Store in ledger for persistence.
        try:
            conn = sqlite3.connect(self.ledger_path)
            try:
                cursor = conn.cursor()
                cursor.execute("""
                    INSERT INTO agent_actions
                    (timestamp, agent_id, agent_version, tier, action, decision,
                     confidence, success, error_type, error_message, session_id, created_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    datetime.utcnow().isoformat(),
                    agent_id,
                    run_data.get("version", "1.0.0"),
                    run_data.get("tier", 1),
                    run_data.get("action", "chaos_test_run"),
                    run_data.get("decision", "EXECUTE"),
                    run_data.get("confidence", 0.8),
                    1 if run_data.get("success") else 0,
                    run_data.get("error_type"),
                    run_data.get("error_message"),
                    run_data.get("session_id"),
                    datetime.utcnow().isoformat()
                ))
                conn.commit()
            finally:
                # Always release the connection — the previous version leaked
                # it whenever the INSERT raised.
                conn.close()
        except Exception as e:
            print(f"Warning: Failed to record to ledger: {e}")

    def get_agent_history(self, agent_id: str, limit: int = 10) -> List[Dict]:
        """Get recent history for an agent (most recent first)."""
        history_key = f"history:{agent_id}:runs"
        runs = self.redis.lrange(history_key, 0, limit - 1)
        return [json.loads(r) for r in runs]

    def get_learning_insights(self, agent_id: str) -> Dict:
        """Analyze history and provide insights for agent refinement."""
        history = self.get_agent_history(agent_id, 20)
        if not history:
            return {"status": "no_history", "recommendations": []}

        # Analyze success rate (history is non-empty here, so total > 0).
        successes = sum(1 for h in history if h.get("success"))
        total = len(history)
        success_rate = successes / total

        # Analyze error patterns
        error_types = {}
        for h in history:
            if h.get("error_type"):
                error_types[h["error_type"]] = error_types.get(h["error_type"], 0) + 1

        # Analyze chaos recovery
        chaos_recoveries = sum(1 for h in history if h.get("recovered_from_chaos"))

        recommendations = []
        if success_rate < 0.7:
            recommendations.append("Consider reducing task complexity")
        if error_types:
            most_common = max(error_types, key=error_types.get)
            recommendations.append(f"Focus on handling '{most_common}' errors")
        if chaos_recoveries > 0:
            recommendations.append(f"Agent has recovered from {chaos_recoveries} chaos events")

        return {
            "success_rate": success_rate,
            "total_runs": total,
            "error_patterns": error_types,
            "chaos_recoveries": chaos_recoveries,
            "recommendations": recommendations
        }
class OverwatchAgent:
    """
    Overwatch agent that monitors other agents, spawns helpers on errors,
    and generates clarified plans when error thresholds are exceeded.
    """

    def __init__(self, orchestrator: 'MultiAgentOrchestrator'):
        self.orchestrator = orchestrator
        self.agent_id = f"overwatch-{datetime.utcnow().strftime('%H%M%S')}"
        self.running = False
        # Helper-spawning budget: at most max_helpers spawned over the run.
        self.helpers_spawned = 0
        self.max_helpers = 3
        self.intervention_log: List[Dict] = []
        # Error threshold tracking
        self.error_threshold = 5  # Errors before plan clarification (error_spike adds 5)
        self.total_errors_seen = 0
        self.threshold_triggered = False
        self.clarified_plans: List[ClarifiedPlan] = []
        self.pipeline_paused = False
        self.startup_grace_period = 1.0  # seconds before error checking starts

    async def start(self):
        """Start overwatch monitoring"""
        self.running = True
        print(f" [OVERWATCH] Started monitoring ({self.agent_id})")
        print(f" [OVERWATCH] Error threshold: {self.error_threshold}")
        # Poll loop: per-agent health first, then the aggregate error threshold.
        while self.running:
            await self.check_agents()
            await self.check_error_threshold()
            await asyncio.sleep(0.5)

    def stop(self):
        """Stop overwatch"""
        self.running = False
        print(f" [OVERWATCH] Stopped ({self.helpers_spawned} helpers spawned, {len(self.clarified_plans)} plans clarified)")

    async def check_agents(self):
        """Check all agents and intervene if needed"""
        # Create a copy to avoid modification during iteration
        # (spawn_helper_agent mutates orchestrator.agents while we iterate).
        agents_snapshot = list(self.orchestrator.agents.items())
        for agent_id, agent in agents_snapshot:
            # Overwatch never supervises itself or its own helpers.
            if agent.agent_type in [AgentType.OVERWATCH, AgentType.HELPER]:
                continue
            # Check for error conditions - prioritize failures
            if agent.status == AgentStatus.FAILED:
                await self.handle_failure(agent)
            elif agent.status == AgentStatus.CHAOS_INJECTED:
                await self.handle_chaos(agent)
            elif agent.status == AgentStatus.RECOVERING:
                # Check if recovery is stalled
                await self.check_recovery_progress(agent)
            elif agent.error_count >= 3:
                await self.handle_error_spike(agent)
            # Also check for stale heartbeats on running agents (with grace period)
            if agent.status == AgentStatus.RUNNING and agent.started_at:
                age = (datetime.utcnow() - agent.started_at).total_seconds()
                if age > 2.0:  # Grace period of 2 seconds for startup
                    hb = self.orchestrator.redis.get(f"agent:{agent_id}:heartbeat")
                    if not hb:
                        # Missing heartbeat is treated as an injected chaos
                        # condition so handle_chaos() restores it next sweep.
                        print(f" [OVERWATCH] Heartbeat missing for {agent_id}, marking as chaos")
                        agent.status = AgentStatus.CHAOS_INJECTED
                        agent.chaos_condition = ChaosCondition.HEARTBEAT_TIMEOUT

    async def handle_failure(self, agent: AgentInstance):
        """Handle agent failure by spawning helper"""
        # Respect the helper budget; beyond it we only log.
        if self.helpers_spawned >= self.max_helpers:
            print(f" [OVERWATCH] Max helpers reached, cannot spawn for {agent.agent_id}")
            return
        intervention = {
            "type": "failure_recovery",
            "target_agent": agent.agent_id,
            "timestamp": datetime.utcnow().isoformat(),
            "action": "spawn_helper"
        }
        print(f" [OVERWATCH] Spawning helper for failed agent {agent.agent_id}")
        # Spawn a helper agent
        helper_id = await self.orchestrator.spawn_helper_agent(
            spawned_by=self.agent_id,
            reason=f"Recovery for {agent.agent_id}",
            target_task=agent.pipeline_id
        )
        intervention["helper_id"] = helper_id
        self.intervention_log.append(intervention)
        self.helpers_spawned += 1

    async def handle_chaos(self, agent: AgentInstance):
        """Handle chaos condition by attempting recovery and spawning helper"""
        intervention = {
            "type": "chaos_mitigation",
            "target_agent": agent.agent_id,
            "chaos_condition": agent.chaos_condition.value,
            "timestamp": datetime.utcnow().isoformat()
        }
        print(f" [OVERWATCH] Mitigating chaos for {agent.agent_id}: {agent.chaos_condition.value}")
        # Attempt to stabilize the agent based on chaos type
        if agent.chaos_condition == ChaosCondition.LOCK_LOST:
            # Re-acquire lock for the agent (nx=False: overwrite whatever
            # holds the key so the named agent regains ownership).
            self.orchestrator.redis.set(
                f"agent:{agent.agent_id}:lock",
                agent.agent_id,
                ex=300,
                nx=False
            )
            print(f" [OVERWATCH] Re-acquired lock for {agent.agent_id}")
        elif agent.chaos_condition == ChaosCondition.HEARTBEAT_TIMEOUT:
            # Restore heartbeat
            self.orchestrator.redis.set(
                f"agent:{agent.agent_id}:heartbeat",
                datetime.utcnow().isoformat(),
                ex=60
            )
            print(f" [OVERWATCH] Restored heartbeat for {agent.agent_id}")
        elif agent.chaos_condition == ChaosCondition.STATE_CORRUPTED:
            # Fix corrupted state
            self.orchestrator.redis.hset(
                f"agent:{agent.agent_id}:state",
                "phase", "EXECUTE"  # Reset to safe phase
            )
            print(f" [OVERWATCH] Fixed corrupted state for {agent.agent_id}")
        elif agent.chaos_condition == ChaosCondition.TOKEN_REVOKED:
            # Clear revoke signal and spawn helper
            self.orchestrator.redis.delete(f"agent:{agent.agent_id}:revoke_signal")
            print(f" [OVERWATCH] Cleared revoke signal for {agent.agent_id}")
        # Mark as recovering — the agent process polls this status and resumes.
        agent.status = AgentStatus.RECOVERING
        self.orchestrator.redis.hset(
            f"agent:{agent.agent_id}:state",
            "status", "recovering"
        )
        # Spawn a helper to assist with recovery if we have budget
        if self.helpers_spawned < self.max_helpers:
            helper_id = await self.orchestrator.spawn_helper_agent(
                spawned_by=self.agent_id,
                reason=f"Chaos recovery ({agent.chaos_condition.value}) for {agent.agent_id}",
                target_task=agent.pipeline_id
            )
            intervention["helper_id"] = helper_id
            self.helpers_spawned += 1
            print(f" [OVERWATCH] Spawned helper {helper_id} for chaos recovery")
        self.intervention_log.append(intervention)
    async def check_recovery_progress(self, agent: AgentInstance):
        """Check if a recovering agent is making progress"""
        state = self.orchestrator.redis.hgetall(f"agent:{agent.agent_id}:state")
        # If still recovering after a while, the agent might be stuck
        if state.get("status") == "recovering":
            # Check if process is still alive (poll() is non-None once the
            # child has exited).
            if agent.process and agent.process.poll() is not None:
                # Process died during recovery
                print(f" [OVERWATCH] Agent {agent.agent_id} died during recovery")
                agent.status = AgentStatus.FAILED
                await self.handle_failure(agent)

    async def handle_error_spike(self, agent: AgentInstance):
        """Handle error spike by spawning diagnostic agent"""
        # Silently skip when the helper budget is exhausted.
        if self.helpers_spawned >= self.max_helpers:
            return
        intervention = {
            "type": "error_spike_response",
            "target_agent": agent.agent_id,
            "error_count": agent.error_count,
            "timestamp": datetime.utcnow().isoformat()
        }
        print(f" [OVERWATCH] Error spike detected for {agent.agent_id}, spawning diagnostic")
        helper_id = await self.orchestrator.spawn_helper_agent(
            spawned_by=self.agent_id,
            reason=f"Diagnose errors for {agent.agent_id}",
            target_task=agent.pipeline_id,
            agent_type=AgentType.DIAGNOSTIC
        )
        intervention["helper_id"] = helper_id
        self.intervention_log.append(intervention)
        self.helpers_spawned += 1

    async def check_error_threshold(self):
        """Check if error threshold is crossed and trigger plan clarification"""
        # Wait for startup grace period
        if self.orchestrator.start_time:
            elapsed = (datetime.utcnow() - self.orchestrator.start_time).total_seconds()
            if elapsed < self.startup_grace_period:
                return
        # Count total errors across all agents (helpers/overwatch excluded).
        current_errors = sum(
            agent.error_count
            for agent in self.orchestrator.agents.values()
            if agent.agent_type not in [AgentType.OVERWATCH, AgentType.HELPER]
        )
        # Check if we've crossed the threshold
        if current_errors >= self.error_threshold and not self.threshold_triggered:
            self.threshold_triggered = True
            self.total_errors_seen = current_errors
            print(f"\n {'='*60}")
            print(f" [OVERWATCH] ERROR THRESHOLD CROSSED!")
            print(f" [OVERWATCH] Total errors: {current_errors} >= threshold: {self.error_threshold}")
            print(f" {'='*60}")
            # Pause pipeline
            await self.pause_pipeline()
            # Inspect history and outputs
            history_review = await self.review_history()
            outputs_review = await self.review_outputs()
            # Generate clarified plan
            clarified_plan = await self.generate_clarified_plan(
                history_review, outputs_review, current_errors
            )
            # Broadcast plan to all agents
            await self.broadcast_plan(clarified_plan)
            # Resume pipeline
            await self.resume_pipeline()
            # Reset for next threshold check (incremental threshold):
            # raising it by 3 prevents immediate re-triggering on the same
            # accumulated error count.
            self.error_threshold += 3
            self.threshold_triggered = False
Pause pipeline await self.pause_pipeline() # Inspect history and outputs history_review = await self.review_history() outputs_review = await self.review_outputs() # Generate clarified plan clarified_plan = await self.generate_clarified_plan( history_review, outputs_review, current_errors ) # Broadcast plan to all agents await self.broadcast_plan(clarified_plan) # Resume pipeline await self.resume_pipeline() # Reset for next threshold check (incremental threshold) self.error_threshold += 3 self.threshold_triggered = False async def pause_pipeline(self): """Pause all running agents for plan clarification""" self.pipeline_paused = True print(f" [OVERWATCH] PAUSING PIPELINE for plan clarification...") for agent_id, agent in self.orchestrator.agents.items(): if agent.agent_type in [AgentType.OVERWATCH, AgentType.HELPER]: continue if agent.status == AgentStatus.RUNNING: self.orchestrator.redis.hset( f"agent:{agent_id}:state", "status", "paused" ) self.orchestrator.redis.set( f"agent:{agent_id}:pause_signal", "1", ex=60 ) print(f" [OVERWATCH] Pipeline paused - {len(self.orchestrator.agents)} agents notified") async def resume_pipeline(self): """Resume all paused agents after plan broadcast""" print(f" [OVERWATCH] RESUMING PIPELINE with clarified plan...") for agent_id, agent in self.orchestrator.agents.items(): if agent.agent_type in [AgentType.OVERWATCH, AgentType.HELPER]: continue # Clear pause signal self.orchestrator.redis.delete(f"agent:{agent_id}:pause_signal") # Set resume signal with plan reference if self.clarified_plans: self.orchestrator.redis.set( f"agent:{agent_id}:resume_signal", self.clarified_plans[-1].plan_id, ex=60 ) self.pipeline_paused = False print(f" [OVERWATCH] Pipeline resumed") async def review_history(self) -> Dict[str, Any]: """Review history of all agents for plan clarification""" print(f"\n [OVERWATCH] --- REVIEWING HISTORY ---") history_data = { "agents_reviewed": [], "total_runs": 0, "success_rate": 0, "common_errors": {}, "chaos_events": [] 
} for agent_id, agent in self.orchestrator.agents.items(): if agent.agent_type in [AgentType.OVERWATCH, AgentType.HELPER]: continue # Get history from Redis history = self.orchestrator.history.get_agent_history(agent_id, 5) history_data["agents_reviewed"].append(agent_id) history_data["total_runs"] += len(history) successes = sum(1 for h in history if h.get("success")) for h in history: if h.get("error_type"): err = h["error_type"] history_data["common_errors"][err] = history_data["common_errors"].get(err, 0) + 1 if h.get("chaos_condition") and h["chaos_condition"] != "none": history_data["chaos_events"].append({ "agent": agent_id, "chaos": h["chaos_condition"], "recovered": h.get("recovered_from_chaos", False) }) print(f" Agent {agent_id}: {len(history)} runs, {successes} successes") if history_data["total_runs"] > 0: total_successes = sum( sum(1 for h in self.orchestrator.history.get_agent_history(aid, 5) if h.get("success")) for aid in history_data["agents_reviewed"] ) history_data["success_rate"] = total_successes / history_data["total_runs"] print(f" Total runs: {history_data['total_runs']}, Success rate: {history_data['success_rate']:.1%}") print(f" Common errors: {history_data['common_errors']}") return history_data async def review_outputs(self) -> Dict[str, List[AgentOutput]]: """Review Alpha/Beta/Gamma outputs from all agents""" print(f"\n [OVERWATCH] --- REVIEWING OUTPUTS ---") outputs_data = { "alpha": [], "beta": [], "gamma": [], "by_agent": {} } for agent_id, agent in self.orchestrator.agents.items(): if agent.agent_type in [AgentType.OVERWATCH, AgentType.HELPER]: continue agent_outputs = [] # Get outputs from Redis for output_type in OutputType: key = f"agent:{agent_id}:output:{output_type.value}" output_data = self.orchestrator.redis.get(key) if output_data: try: data = json.loads(output_data) output = AgentOutput( agent_id=agent_id, output_type=output_type, phase=data.get("phase", "unknown"), content=data.get("content", {}), 
timestamp=datetime.fromisoformat(data.get("timestamp", datetime.utcnow().isoformat())), checksum=data.get("checksum", "") ) outputs_data[output_type.value].append(output) agent_outputs.append(output) except (json.JSONDecodeError, KeyError) as e: print(f" Warning: Failed to parse output for {agent_id}: {e}") outputs_data["by_agent"][agent_id] = agent_outputs print(f" Agent {agent_id}: {len(agent_outputs)} outputs") for o in agent_outputs: print(f" - {o.output_type.value.upper()}: phase={o.phase}, checksum={o.checksum}") print(f" Total: Alpha={len(outputs_data['alpha'])}, Beta={len(outputs_data['beta'])}, Gamma={len(outputs_data['gamma'])}") return outputs_data async def generate_clarified_plan(self, history: Dict, outputs: Dict, error_count: int) -> ClarifiedPlan: """Generate a clarified plan based on history and outputs analysis""" print(f"\n [OVERWATCH] --- GENERATING CLARIFIED PLAN ---") plan_id = f"plan-{datetime.utcnow().strftime('%H%M%S%f')[:10]}" # Analyze issues and generate adjustments adjustments = [] # Based on common errors for error_type, count in history.get("common_errors", {}).items(): if count >= 2: adjustments.append(f"Add retry logic for '{error_type}' errors (seen {count}x)") # Based on chaos events chaos_events = history.get("chaos_events", []) if chaos_events: unrecovered = [e for e in chaos_events if not e.get("recovered")] if unrecovered: adjustments.append(f"Strengthen recovery for: {', '.join(e['chaos'] for e in unrecovered)}") # Based on output gaps if len(outputs.get("gamma", [])) < len(outputs.get("alpha", [])): adjustments.append("Focus on completing work (more Alpha than Gamma outputs)") # Based on success rate if history.get("success_rate", 1.0) < 0.7: adjustments.append("Reduce task complexity or add more checkpoints") # Default adjustment if none found if not adjustments: adjustments.append("Continue with increased monitoring") # Create clarified plan clarified_plan = ClarifiedPlan( plan_id=plan_id, trigger_reason=f"Error threshold 
crossed: {error_count} errors", history_reviewed=[a for a in history.get("agents_reviewed", [])], outputs_analyzed=[f"{o.agent_id}:{o.output_type.value}" for o in outputs.get("alpha", []) + outputs.get("beta", []) + outputs.get("gamma", [])], original_objective=self.orchestrator.redis.get(f"project:{self.orchestrator.project_id}:objective") or "Complete chaos test successfully", clarified_objective=f"Complete chaos test with enhanced resilience (error recovery priority)", adjustments=adjustments, broadcast_at=datetime.utcnow() ) self.clarified_plans.append(clarified_plan) print(f" Plan ID: {plan_id}") print(f" Trigger: {clarified_plan.trigger_reason}") print(f" History reviewed: {len(clarified_plan.history_reviewed)} agents") print(f" Outputs analyzed: {len(clarified_plan.outputs_analyzed)}") print(f" Adjustments:") for adj in adjustments: print(f" - {adj}") # Store plan in Redis for agents to read self.orchestrator.redis.set( f"project:{self.orchestrator.project_id}:plan:{plan_id}", json.dumps({ "plan_id": plan_id, "objective": clarified_plan.clarified_objective, "adjustments": adjustments, "broadcast_at": clarified_plan.broadcast_at.isoformat() }), ex=300 ) return clarified_plan async def broadcast_plan(self, plan: ClarifiedPlan): """Broadcast clarified plan to all agents""" print(f"\n [OVERWATCH] --- BROADCASTING PLAN ---") broadcast_key = f"project:{self.orchestrator.project_id}:broadcast" # Store broadcast self.orchestrator.redis.set( broadcast_key, json.dumps({ "plan_id": plan.plan_id, "objective": plan.clarified_objective, "adjustments": plan.adjustments, "broadcast_at": plan.broadcast_at.isoformat() }), ex=300 ) # Notify each agent acknowledged = [] for agent_id, agent in self.orchestrator.agents.items(): if agent.agent_type in [AgentType.OVERWATCH, AgentType.HELPER]: continue # Set notification for agent self.orchestrator.redis.set( f"agent:{agent_id}:plan_update", plan.plan_id, ex=60 ) acknowledged.append(agent_id) print(f" Notified: {agent_id}") 
class MultiAgentOrchestrator:
    """
    Orchestrates multiple agents working on the same project.

    Owns the Redis connection, the agent registry, the history recorder,
    and the DragonflyDB readiness checks.
    """

    def __init__(self, project_id: str):
        self.project_id = project_id
        # decode_responses=True: all Redis reads return str, not bytes.
        self.redis = redis.Redis(
            host=REDIS_HOST,
            port=REDIS_PORT,
            password=REDIS_PASSWORD,
            decode_responses=True
        )
        self.agents: Dict[str, AgentInstance] = {}
        self.overwatch: Optional[OverwatchAgent] = None
        self.history = HistoryRecorder(self.redis, LEDGER_PATH)
        self.readiness = DragonflyReadiness(self.redis, project_id)
        self.chaos_events: List[Dict] = []
        self.start_time: Optional[datetime] = None
        self.end_time: Optional[datetime] = None
        # Setup readiness checks
        self._setup_readiness_checks()

    def _setup_readiness_checks(self):
        """Setup DragonflyDB readiness checks.

        Each closure returns (passed, message) as expected by
        DragonflyReadiness.run_checks().
        """
        def check_all_agents_registered():
            # Overwatch runs in-process and never registers itself.
            registered = self.redis.smembers(f"project:{self.project_id}:agents")
            expected = len([a for a in self.agents.values() if a.agent_type != AgentType.OVERWATCH])
            if len(registered) >= expected:
                return True, f"{len(registered)} agents registered"
            return False, f"Only {len(registered)}/{expected} agents registered"

        def check_no_active_locks_conflict():
            # A lock held by a different agent_id than its key owner is a conflict.
            for agent_id in self.agents:
                lock = self.redis.get(f"agent:{agent_id}:lock")
                if lock and lock != agent_id:
                    return False, f"Lock conflict for {agent_id}"
            return True, "No lock conflicts"

        def check_unified_objective():
            # "Unified objective" = every primary (non-overwatch, non-helper)
            # agent reached COMPLETED.
            primary_agents = [a for a in self.agents.values() if a.agent_type not in [AgentType.OVERWATCH, AgentType.HELPER]]
            completed = sum(1 for a in primary_agents if a.status == AgentStatus.COMPLETED)
            total = len(primary_agents)
            if total > 0 and completed >= total:
                return True, f"All {completed} primary agents completed"
            return False, f"{completed}/{total} primary agents completed"

        def check_no_critical_errors():
            for agent_id, agent in self.agents.items():
                # Skip helper agents and completed agents (they recovered)
                if agent.agent_type == AgentType.HELPER:
                    continue
                if agent.status == AgentStatus.COMPLETED:
                    continue  # Completed agents recovered from errors
                if agent.error_count >= 5:
                    return False, f"Agent {agent_id} has critical errors (count: {agent.error_count})"
            return True, "No critical unrecovered errors"

        self.readiness.add_check("all_agents_registered", check_all_agents_registered)
        self.readiness.add_check("no_lock_conflicts", check_no_active_locks_conflict)
        self.readiness.add_check("unified_objective", check_unified_objective)
        self.readiness.add_check("no_critical_errors", check_no_critical_errors)

    def get_vault_token(self) -> str:
        """Get Vault root token.

        Reads the root token from the Vault init-keys JSON file; raises
        OSError/KeyError if the file or key is missing.
        """
        with open(VAULT_TOKEN_FILE) as f:
            return json.load(f)["root_token"]

    async def spawn_agent(self, agent_type: AgentType, pipeline_name: str) -> str:
        """Spawn a new agent.

        Registers the agent in Redis, launches its process, and returns
        the generated agent_id.
        """
        agent_id = f"{agent_type.value}-{datetime.utcnow().strftime('%H%M%S%f')[:10]}"
        agent = AgentInstance(
            agent_id=agent_id,
            agent_type=agent_type,
            pipeline_id=pipeline_name,
            status=AgentStatus.STARTING
        )
        self.agents[agent_id] = agent
        # Register in DragonflyDB
        self.redis.sadd(f"project:{self.project_id}:agents", agent_id)
        self.redis.hset(f"agent:{agent_id}:state", mapping={
            "status": "starting",
            "type": agent_type.value,
            "pipeline": pipeline_name,
            "project": self.project_id,
            "started_at": datetime.utcnow().isoformat()
        })
        # Start the agent process
        await self._start_agent_process(agent)
        return agent_id

    async def spawn_helper_agent(self, spawned_by: str, reason: str, target_task: str, agent_type: AgentType = AgentType.HELPER) -> str:
        """Spawn a helper agent.

        Like spawn_agent but records who requested it and why; agent_type
        may be overridden (e.g. DIAGNOSTIC for error-spike analysis).
        """
        agent_id = f"helper-{datetime.utcnow().strftime('%H%M%S%f')[:10]}"
        agent = AgentInstance(
            agent_id=agent_id,
            agent_type=agent_type,
            pipeline_id=target_task,
            status=AgentStatus.STARTING,
            spawned_by=spawned_by
        )
        self.agents[agent_id] = agent
        # Register
        self.redis.sadd(f"project:{self.project_id}:agents", agent_id)
        self.redis.hset(f"agent:{agent_id}:state", mapping={
            "status": "starting",
            "type": agent_type.value,
            "spawned_by": spawned_by,
            "reason": reason,
            "target_task": target_task,
            "started_at": datetime.utcnow().isoformat()
        })
        # Start helper process
        await self._start_agent_process(agent)
        return agent_id
    async def _start_agent_process(self, agent: AgentInstance):
        """Start the actual agent process.

        Builds the environment (Vault token, project/agent ids, learning
        insights), picks the command for the agent type, and launches it
        with stdout/stderr redirected to a per-agent log file.
        """
        agent.started_at = datetime.utcnow()
        log_file = LOG_DIR / f"{agent.agent_id}.log"
        # Get learning insights for this agent type
        insights = self.history.get_learning_insights(agent.agent_type.value)
        env = os.environ.copy()
        env["VAULT_ADDR"] = VAULT_ADDR
        env["VAULT_SKIP_VERIFY"] = "1"
        env["VAULT_TOKEN"] = self.get_vault_token()
        env["AGENT_ID"] = agent.agent_id
        env["PROJECT_ID"] = self.project_id
        env["REDIS_PASSWORD"] = REDIS_PASSWORD
        env["LEARNING_INSIGHTS"] = json.dumps(insights)
        if agent.agent_type == AgentType.PYTHON:
            cmd = self._get_python_agent_cmd(agent)
        elif agent.agent_type == AgentType.BUN:
            cmd = self._get_bun_agent_cmd(agent)
        elif agent.agent_type == AgentType.DIAGNOSTIC:
            cmd = self._get_diagnostic_agent_cmd(agent)
        else:
            cmd = self._get_helper_agent_cmd(agent)
        try:
            # NOTE(review): shell=True with an interpolated command string —
            # inputs are orchestrator-generated here, but keep it that way.
            # The log file handle may be closed after Popen returns; the
            # child keeps its own inherited descriptor.
            with open(log_file, "w") as log:
                agent.process = subprocess.Popen(
                    cmd,
                    shell=True,
                    stdout=log,
                    stderr=subprocess.STDOUT,
                    env=env,
                    cwd="/opt/agent-governance"
                )
            agent.status = AgentStatus.RUNNING
            self.redis.hset(f"agent:{agent.agent_id}:state", "status", "running")
        except Exception as e:
            agent.status = AgentStatus.FAILED
            agent.error_count += 1
            self.redis.hset(f"agent:{agent.agent_id}:state", "status", "failed")
            print(f" [ERROR] Failed to start {agent.agent_id}: {e}")

    def _get_python_agent_cmd(self, agent: AgentInstance) -> str:
        """Get command for Python agent with chaos resilience and Alpha/Beta/Gamma outputs"""
        # The returned string is a `python3 -c` one-liner script; `{{ }}` are
        # literal braces inside this f-string (the embedded script's own
        # f-strings), while `{...}` interpolates orchestrator-side values.
        return f"""
python3 -c "
import redis
import time
import json
import os
import hashlib

r = redis.Redis(host='127.0.0.1', port=6379, password='{REDIS_PASSWORD}', decode_responses=True)
agent_id = '{agent.agent_id}'
project_id = '{self.project_id}'
current_plan = None

def emit_output(output_type, phase, content):
    '''Emit an Alpha/Beta/Gamma output'''
    chksum = hashlib.sha256(json.dumps(content, sort_keys=True).encode()).hexdigest()[:12]
    output = {{
        'phase': phase,
        'content': content,
        'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ'),
        'checksum': chksum
    }}
    key = f'agent:{{agent_id}}:output:{{output_type}}'
    r.set(key, json.dumps(output), ex=300)
    print(f'[{{agent_id}}] Emitted {{output_type.upper()}} output: phase={{phase}}, checksum={{chksum}}')

def check_for_plan_update():
    '''Check if overwatch has broadcast a new plan'''
    global current_plan
    plan_id = r.get(f'agent:{{agent_id}}:plan_update')
    if plan_id and plan_id != current_plan:
        current_plan = plan_id
        plan_data = r.get(f'project:{{project_id}}:plan:{{plan_id}}')
        if plan_data:
            plan = json.loads(plan_data)
            obj = plan.get('objective', 'unknown')
            adjs = plan.get('adjustments', [])
            print(f'[{{agent_id}}] RECEIVED PLAN UPDATE: {{plan_id}}')
            print(f'[{{agent_id}}] Objective: {{obj}}')
            for adj in adjs:
                print(f'[{{agent_id}}] Adjustment: {{adj}}')
            return plan
    return None

def check_for_pause():
    '''Check if pipeline is paused'''
    if r.get(f'agent:{{agent_id}}:pause_signal') == '1':
        print(f'[{{agent_id}}] Pipeline PAUSED - waiting for resume...')
        for _ in range(30):  # Wait up to 15 seconds
            if r.get(f'agent:{{agent_id}}:pause_signal') != '1':
                resume = r.get(f'agent:{{agent_id}}:resume_signal')
                print(f'[{{agent_id}}] Pipeline RESUMED with plan: {{resume}}')
                check_for_plan_update()
                return True
            time.sleep(0.5)
        print(f'[{{agent_id}}] Pause timeout, continuing...')
    return True

def check_and_recover():
    '''Check for chaos conditions and attempt recovery'''
    # First check for pause/plan updates
    check_for_pause()
    check_for_plan_update()
    # Check for revoke signal
    if r.get(f'agent:{{agent_id}}:revoke_signal') == '1':
        print(f'[{{agent_id}}] Token revoked, waiting for recovery...')
        for _ in range(10):
            if r.get(f'agent:{{agent_id}}:revoke_signal') != '1':
                print(f'[{{agent_id}}] Revoke cleared, resuming')
                return True
            time.sleep(0.2)
        return False
    # Check for lost lock
    lock = r.get(f'agent:{{agent_id}}:lock')
    if lock is None:
        print(f'[{{agent_id}}] Lock lost, attempting re-acquire...')
        for _ in range(5):
            if r.set(f'agent:{{agent_id}}:lock', agent_id, ex=300, nx=True):
                print(f'[{{agent_id}}] Lock re-acquired')
                return True
            if r.get(f'agent:{{agent_id}}:lock') == agent_id:
                print(f'[{{agent_id}}] Lock restored by overwatch')
                return True
            time.sleep(0.2)
        return False
    # Check status
    status = r.hget(f'agent:{{agent_id}}:state', 'status')
    if status == 'recovering':
        print(f'[{{agent_id}}] In recovery mode, resuming...')
        r.hset(f'agent:{{agent_id}}:state', 'status', 'running')
    return True

# Acquire initial lock
r.set(f'agent:{{agent_id}}:lock', agent_id, ex=300)

# Define phase outputs (aligned with pipeline.core.PHASE_OUTPUT_TYPES)
phase_outputs = {{
    'BOOTSTRAP': ('alpha', {{'status': 'initialized', 'capabilities': ['execute', 'verify']}}),
    'PREFLIGHT': ('alpha', {{'checks_passed': True, 'resources_available': True}}),
    'PLAN': ('beta', {{'steps': ['analyze', 'transform', 'validate'], 'estimated_phases': 3}}),
    'EXECUTE': ('beta', {{'progress': 100, 'artifacts_created': ['output.json']}}),
    'VERIFY': ('gamma', {{'tests_passed': True, 'quality_score': 0.95}}),
    'PACKAGE': ('gamma', {{'artifacts_bundled': True, 'evidence_collected': True}}),
    'REPORT': ('gamma', {{'summary': 'Pipeline completed successfully', 'metrics': {{'duration': 0, 'errors': 0}}}}),
}}

# Simulate pipeline execution with resilience and outputs
# Official agent phases from pipeline.core (excludes EXIT - that's the terminal state)
phases = ['BOOTSTRAP', 'PREFLIGHT', 'PLAN', 'EXECUTE', 'VERIFY', 'PACKAGE', 'REPORT']
start_time = time.time()
for i, phase in enumerate(phases):
    # Check for chaos conditions before each phase
    if not check_and_recover():
        print(f'[{{agent_id}}] Cannot recover from chaos, marking failed')
        r.hset(f'agent:{{agent_id}}:state', 'status', 'failed')
        # Emit failure output
        emit_output('gamma', phase, {{'status': 'failed', 'reason': 
'chaos_unrecoverable'}}) exit(1) r.hset(f'agent:{{agent_id}}:state', 'phase', phase) r.hset(f'agent:{{agent_id}}:state', 'status', 'running') r.set(f'agent:{{agent_id}}:heartbeat', time.time(), ex=60) # Emit output for this phase output_type, content = phase_outputs.get(phase, ('alpha', {{'phase': phase}})) content['agent_id'] = agent_id content['phase_index'] = i emit_output(output_type, phase, content) time.sleep(0.3) # Extra check mid-phase check_and_recover() # Final gamma output with summary duration = time.time() - start_time emit_output('gamma', 'EXIT', {{ 'status': 'completed', 'total_phases': len(phases), 'duration_seconds': round(duration, 2), 'plan_updates_received': 1 if current_plan else 0 }}) # Mark complete r.hset(f'agent:{{agent_id}}:state', mapping={{ 'status': 'completed', 'phase': 'EXIT', 'completed_at': time.strftime('%Y-%m-%dT%H:%M:%SZ') }}) r.delete(f'agent:{{agent_id}}:lock') print(f'Agent {{agent_id}} completed pipeline ({{len(phases)}} phases, {{duration:.2f}}s)') " """ def _get_bun_agent_cmd(self, agent: AgentInstance) -> str: """Get command for Bun agent with chaos resilience and Alpha/Beta/Gamma outputs""" return f""" python3 -c " import redis import time import json import hashlib r = redis.Redis(host='127.0.0.1', port=6379, password='{REDIS_PASSWORD}', decode_responses=True) agent_id = '{agent.agent_id}' project_id = '{self.project_id}' current_plan = None def emit_output(output_type, phase, content): '''Emit an Alpha/Beta/Gamma output''' output = {{ 'phase': phase, 'content': content, 'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ'), 'checksum': hashlib.sha256(json.dumps(content, sort_keys=True).encode()).hexdigest()[:12] }} key = f'agent:{{agent_id}}:output:{{output_type}}' r.set(key, json.dumps(output), ex=300) print(f'[{{agent_id}}] Emitted {{output_type.upper()}} output: phase={{phase}}') def check_for_plan_update(): '''Check if overwatch has broadcast a new plan''' global current_plan plan_id = 
r.get(f'agent:{{agent_id}}:plan_update') if plan_id and plan_id != current_plan: current_plan = plan_id plan_data = r.get(f'project:{{project_id}}:plan:{{plan_id}}') if plan_data: plan = json.loads(plan_data) print(f'[{{agent_id}}] RECEIVED PLAN UPDATE: {{plan_id}}') return plan return None def check_for_pause(): '''Check if pipeline is paused''' if r.get(f'agent:{{agent_id}}:pause_signal') == '1': print(f'[{{agent_id}}] Pipeline PAUSED - waiting...') for _ in range(30): if r.get(f'agent:{{agent_id}}:pause_signal') != '1': print(f'[{{agent_id}}] Pipeline RESUMED') check_for_plan_update() return True time.sleep(0.5) return True def check_and_recover(): '''Check for chaos and recover''' check_for_pause() check_for_plan_update() # Check error count errors = r.hget(f'agent:{{agent_id}}:errors', 'total_errors') if errors and int(errors) >= 5: print(f'[{{agent_id}}] Error spike detected, resetting...') r.hset(f'agent:{{agent_id}}:errors', 'total_errors', '0') time.sleep(0.1) # Check lock lock = r.get(f'agent:{{agent_id}}:lock') if lock is None: print(f'[{{agent_id}}] Lock lost, re-acquiring...') r.set(f'agent:{{agent_id}}:lock', agent_id, ex=300) time.sleep(0.1) # Restore heartbeat r.set(f'agent:{{agent_id}}:heartbeat', time.time(), ex=60) # Check and clear recovery status status = r.hget(f'agent:{{agent_id}}:state', 'status') if status == 'recovering': r.hset(f'agent:{{agent_id}}:state', 'status', 'running') return True # Acquire lock r.set(f'agent:{{agent_id}}:lock', agent_id, ex=300) # Emit initial alpha output emit_output('alpha', 'INIT', {{'agent_type': 'bun', 'ready': True}}) # Official agent phases from pipeline.core (excludes EXIT - that's the terminal state) phases = ['BOOTSTRAP', 'PREFLIGHT', 'PLAN', 'EXECUTE', 'VERIFY', 'PACKAGE', 'REPORT'] start_time = time.time() for i, phase in enumerate(phases): check_and_recover() r.hset(f'agent:{{agent_id}}:state', 'phase', phase) r.hset(f'agent:{{agent_id}}:state', 'status', 'running') 
r.set(f'agent:{{agent_id}}:heartbeat', time.time(), ex=60) # Emit outputs at key phases if phase == 'PLAN': emit_output('beta', phase, {{'analysis_complete': True, 'items_to_process': 5}}) elif phase == 'EXECUTE': emit_output('beta', phase, {{'items_processed': 5, 'success_rate': 1.0}}) elif phase == 'VERIFY': emit_output('gamma', phase, {{'verified': True, 'confidence': 0.98}}) time.sleep(0.25) # Final gamma output duration = time.time() - start_time emit_output('gamma', 'EXIT', {{ 'status': 'completed', 'duration_seconds': round(duration, 2), 'plan_updates': 1 if current_plan else 0 }}) r.hset(f'agent:{{agent_id}}:state', mapping={{ 'status': 'completed', 'phase': 'EXIT', 'completed_at': time.strftime('%Y-%m-%dT%H:%M:%SZ') }}) r.delete(f'agent:{{agent_id}}:lock') print(f'Agent {{agent_id}} completed pipeline (bun, {{duration:.2f}}s)') " """ def _get_diagnostic_agent_cmd(self, agent: AgentInstance) -> str: """Get command for diagnostic agent with Alpha/Beta/Gamma outputs""" return f""" python3 -c " import redis import json import time import hashlib r = redis.Redis(host='127.0.0.1', port=6379, password='{REDIS_PASSWORD}', decode_responses=True) agent_id = '{agent.agent_id}' project_id = '{self.project_id}' def emit_output(output_type, phase, content): '''Emit an Alpha/Beta/Gamma output''' output = {{ 'phase': phase, 'content': content, 'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ'), 'checksum': hashlib.sha256(json.dumps(content, sort_keys=True).encode()).hexdigest()[:12] }} key = f'agent:{{agent_id}}:output:{{output_type}}' r.set(key, json.dumps(output), ex=300) print(f'[{{agent_id}}] Emitted {{output_type.upper()}} output: phase={{phase}}') def check_for_plan_update(): '''Check if overwatch has broadcast a new plan''' plan_id = r.get(f'agent:{{agent_id}}:plan_update') if plan_id: plan_data = r.get(f'project:{{project_id}}:plan:{{plan_id}}') if plan_data: plan = json.loads(plan_data) print(f'[{{agent_id}}] Received plan update: {{plan_id}}') return plan return 
None # Alpha output: starting diagnostics emit_output('alpha', 'INIT', {{'agent_type': 'diagnostic', 'starting': True}}) # Check for plan updates check_for_plan_update() # Run diagnostics diagnostics = {{ 'redis_ping': r.ping(), 'agent_count': r.scard(f'project:{{project_id}}:agents'), 'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ') }} # Beta output: initial diagnostics emit_output('beta', 'SCAN', {{'redis_ok': diagnostics['redis_ping'], 'agents_found': diagnostics['agent_count']}}) # Check all agents agent_states = {{}} outputs_found = {{'alpha': 0, 'beta': 0, 'gamma': 0}} for aid in r.smembers(f'project:{{project_id}}:agents'): state = r.hgetall(f'agent:{{aid}}:state') agent_states[aid] = state.get('status', 'unknown') # Check outputs for otype in ['alpha', 'beta', 'gamma']: if r.exists(f'agent:{{aid}}:output:{{otype}}'): outputs_found[otype] += 1 diagnostics['agent_states'] = agent_states diagnostics['healthy_count'] = sum(1 for s in agent_states.values() if s in ['running', 'completed']) diagnostics['outputs_found'] = outputs_found # Gamma output: full diagnostic report emit_output('gamma', 'REPORT', {{ 'agents_checked': len(agent_states), 'healthy_agents': diagnostics['healthy_count'], 'outputs': outputs_found, 'status_breakdown': dict((s, list(agent_states.values()).count(s)) for s in set(agent_states.values())) }}) r.hset(f'agent:{{agent_id}}:state', mapping={{ 'status': 'completed', 'phase': 'EXIT', 'diagnostics': json.dumps(diagnostics), 'completed_at': time.strftime('%Y-%m-%dT%H:%M:%SZ') }}) print(f'Diagnostic agent {{agent_id}} complete: {{len(agent_states)}} agents, outputs: {{outputs_found}}') " """ def _get_helper_agent_cmd(self, agent: AgentInstance) -> str: """Get command for helper agent that performs actual recovery""" return f""" python3 -c " import redis import time import json r = redis.Redis(host='127.0.0.1', port=6379, password='{REDIS_PASSWORD}', decode_responses=True) agent_id = '{agent.agent_id}' project_id = '{self.project_id}' 
spawned_by = '{agent.spawned_by or ""}' print(f'[{{agent_id}}] Helper starting recovery work') # Step 1: Analyze - find agents needing help r.hset(f'agent:{{agent_id}}:state', 'step', 'analyze') r.set(f'agent:{{agent_id}}:heartbeat', time.time(), ex=60) troubled_agents = [] for aid in r.smembers(f'project:{{project_id}}:agents'): if aid == agent_id: continue state = r.hgetall(f'agent:{{aid}}:state') status = state.get('status', '') if status in ['failed', 'recovering', 'chaos_injected']: troubled_agents.append(aid) print(f'[{{agent_id}}] Found troubled agent: {{aid}} ({{status}})') time.sleep(0.2) # Step 2: Recover - fix issues for troubled agents r.hset(f'agent:{{agent_id}}:state', 'step', 'recover') for aid in troubled_agents: # Clear revoke signals r.delete(f'agent:{{aid}}:revoke_signal') # Restore locks lock = r.get(f'agent:{{aid}}:lock') if lock is None: r.set(f'agent:{{aid}}:lock', aid, ex=300) print(f'[{{agent_id}}] Restored lock for {{aid}}') # Restore heartbeats r.set(f'agent:{{aid}}:heartbeat', time.time(), ex=60) # Reset error counts r.hset(f'agent:{{aid}}:errors', 'total_errors', '0') # Fix corrupted state phase = r.hget(f'agent:{{aid}}:state', 'phase') if phase and 'CORRUPT' in phase: r.hset(f'agent:{{aid}}:state', 'phase', 'EXECUTE') print(f'[{{agent_id}}] Fixed corrupted state for {{aid}}') # Set back to running if recovering status = r.hget(f'agent:{{aid}}:state', 'status') if status == 'recovering': r.hset(f'agent:{{aid}}:state', 'status', 'running') print(f'[{{agent_id}}] Set {{aid}} back to running') time.sleep(0.2) # Step 3: Verify - check recovery worked r.hset(f'agent:{{agent_id}}:state', 'step', 'verify') recovered = 0 for aid in troubled_agents: state = r.hgetall(f'agent:{{aid}}:state') if state.get('status') not in ['failed', 'recovering', 'chaos_injected']: recovered += 1 print(f'[{{agent_id}}] Recovery complete: {{recovered}}/{{len(troubled_agents)}} agents recovered') r.hset(f'agent:{{agent_id}}:state', mapping={{ 'status': 'completed', 
'phase': 'EXIT', 'recovered_count': str(recovered), 'completed_at': time.strftime('%Y-%m-%dT%H:%M:%SZ') }}) print(f'Helper agent {{agent_id}} completed recovery work') " """ async def inject_chaos(self, agent_id: str, chaos_type: ChaosCondition): """Inject chaos condition into an agent""" if agent_id not in self.agents: return agent = self.agents[agent_id] agent.chaos_condition = chaos_type agent.status = AgentStatus.CHAOS_INJECTED event = { "agent_id": agent_id, "chaos_type": chaos_type.value, "timestamp": datetime.utcnow().isoformat() } self.chaos_events.append(event) print(f" [CHAOS] Injected {chaos_type.value} into {agent_id}") # Apply chaos - each chaos type increments error count if chaos_type == ChaosCondition.TOKEN_REVOKED: self.redis.set(f"agent:{agent_id}:revoke_signal", "1") agent.error_count += 1 elif chaos_type == ChaosCondition.LOCK_LOST: self.redis.delete(f"agent:{agent_id}:lock") agent.error_count += 1 elif chaos_type == ChaosCondition.STATE_CORRUPTED: self.redis.hset(f"agent:{agent_id}:state", "phase", "CORRUPTED") agent.error_count += 1 elif chaos_type == ChaosCondition.HEARTBEAT_TIMEOUT: self.redis.delete(f"agent:{agent_id}:heartbeat") agent.error_count += 1 elif chaos_type == ChaosCondition.ERROR_SPIKE: self.redis.hincrby(f"agent:{agent_id}:errors", "total_errors", 5) agent.error_count += 5 async def monitor_agents(self): """Monitor agent progress""" while True: all_done = True for agent_id, agent in self.agents.items(): if agent.agent_type == AgentType.OVERWATCH: continue if agent.process: ret = agent.process.poll() if ret is not None: agent.exit_code = ret agent.completed_at = datetime.utcnow() # Check final status from Redis state = self.redis.hgetall(f"agent:{agent_id}:state") if state.get("status") == "completed": agent.status = AgentStatus.COMPLETED else: agent.status = AgentStatus.FAILED agent.error_count += 1 else: all_done = False # Update heartbeat check hb = self.redis.get(f"agent:{agent_id}:heartbeat") if agent.status == 
AgentStatus.RUNNING and not hb: agent.error_count += 1 if all_done: break await asyncio.sleep(0.5) async def run(self, agents_config: List[Dict], chaos_schedule: List[Dict] = None) -> Dict: """ Run the multi-agent chaos test. Args: agents_config: List of {"type": AgentType, "pipeline": str} chaos_schedule: List of {"delay": float, "agent_index": int, "chaos": ChaosCondition} """ self.start_time = datetime.utcnow() print("\n" + "=" * 70) print("MULTI-AGENT CHAOS TEST") print("=" * 70) print(f"Project: {self.project_id}") print(f"Agents: {len(agents_config)}") print(f"Started: {self.start_time.isoformat()}") print() # Start overwatch self.overwatch = OverwatchAgent(self) overwatch_task = asyncio.create_task(self.overwatch.start()) # Spawn all agents print("--- SPAWNING AGENTS ---") agent_ids = [] for config in agents_config: agent_type = config["type"] pipeline = config.get("pipeline", "default") agent_id = await self.spawn_agent(agent_type, pipeline) agent_ids.append(agent_id) print(f" Started: {agent_id} ({agent_type.value})") print() # Schedule chaos events if chaos_schedule: print("--- CHAOS SCHEDULE ---") for event in chaos_schedule: asyncio.create_task(self._delayed_chaos( event["delay"], agent_ids[event["agent_index"]], event["chaos"] )) print(f" Scheduled: {event['chaos'].value} for agent {event['agent_index']} at +{event['delay']}s") print() # Monitor agents print("--- AGENT PROGRESS ---") await self.monitor_agents() # Stop overwatch self.overwatch.stop() overwatch_task.cancel() try: await overwatch_task except asyncio.CancelledError: pass self.end_time = datetime.utcnow() # Run readiness checks print("\n--- DRAGONFLY READINESS CHECKS ---") ready, checks = self.readiness.run_checks() for check in checks: status = "✓" if check["passed"] else "✗" print(f" {status} {check['name']}: {check['message']}") # Collect results results = self._collect_results(ready, checks) # Record history for each agent for agent_id, agent in self.agents.items(): 
self.history.record_run(agent_id, { "agent_type": agent.agent_type.value, "pipeline": agent.pipeline_id, "success": agent.status == AgentStatus.COMPLETED, "error_count": agent.error_count, "chaos_condition": agent.chaos_condition.value, "recovered_from_chaos": agent.chaos_condition != ChaosCondition.NONE and agent.status == AgentStatus.COMPLETED, "duration_ms": (agent.completed_at - agent.started_at).total_seconds() * 1000 if agent.completed_at and agent.started_at else None, "session_id": self.project_id, "action": "chaos_test_run" }) # Print summary self._print_summary(results) return results async def _delayed_chaos(self, delay: float, agent_id: str, chaos: ChaosCondition): """Inject chaos after a delay""" await asyncio.sleep(delay) await self.inject_chaos(agent_id, chaos) def _collect_results(self, ready: bool, checks: List[Dict]) -> Dict: """Collect test results""" completed = sum(1 for a in self.agents.values() if a.status == AgentStatus.COMPLETED) failed = sum(1 for a in self.agents.values() if a.status == AgentStatus.FAILED) total = len(self.agents) duration = (self.end_time - self.start_time).total_seconds() agent_results = [] for agent_id, agent in self.agents.items(): log_file = LOG_DIR / f"{agent_id}.log" output = "" if log_file.exists(): output = log_file.read_text()[-500:] # Last 500 chars agent_results.append({ "agent_id": agent_id, "type": agent.agent_type.value, "status": agent.status.value, "pipeline": agent.pipeline_id, "chaos_condition": agent.chaos_condition.value, "error_count": agent.error_count, "exit_code": agent.exit_code, "spawned_by": agent.spawned_by, "output_tail": output }) # Collect outputs outputs_summary = {"alpha": 0, "beta": 0, "gamma": 0} for agent_id in self.agents: for otype in ["alpha", "beta", "gamma"]: if self.redis.exists(f"agent:{agent_id}:output:{otype}"): outputs_summary[otype] += 1 # Collect clarified plans clarified_plans = [] if self.overwatch: for plan in self.overwatch.clarified_plans: clarified_plans.append({ 
"plan_id": plan.plan_id, "trigger": plan.trigger_reason, "adjustments": plan.adjustments, "agents_notified": len(plan.acknowledged_by), "broadcast_at": plan.broadcast_at.isoformat() }) return { "project_id": self.project_id, "success": ready and failed == 0, "unified_objective_reached": ready, "duration_seconds": duration, "agents": { "total": total, "completed": completed, "failed": failed, "helpers_spawned": self.overwatch.helpers_spawned if self.overwatch else 0 }, "outputs": outputs_summary, "chaos_events": len(self.chaos_events), "overwatch_interventions": len(self.overwatch.intervention_log) if self.overwatch else 0, "clarified_plans": clarified_plans, "readiness_checks": checks, "agent_results": agent_results, "timestamp": datetime.utcnow().isoformat() } def _print_summary(self, results: Dict): """Print test summary""" print("\n" + "=" * 70) print("TEST SUMMARY") print("=" * 70) status = "✓ SUCCESS" if results["success"] else "✗ FAILED" print(f"\nResult: {status}") print(f"Unified Objective: {'Reached' if results['unified_objective_reached'] else 'Not Reached'}") print(f"Duration: {results['duration_seconds']:.2f}s") print(f"\nAgents:") print(f" Total: {results['agents']['total']}") print(f" Completed: {results['agents']['completed']}") print(f" Failed: {results['agents']['failed']}") print(f" Helpers Spawned: {results['agents']['helpers_spawned']}") print(f"\nOutputs (Alpha/Beta/Gamma):") outputs = results.get("outputs", {}) print(f" Alpha: {outputs.get('alpha', 0)}") print(f" Beta: {outputs.get('beta', 0)}") print(f" Gamma: {outputs.get('gamma', 0)}") print(f"\nChaos:") print(f" Events Injected: {results['chaos_events']}") print(f" Overwatch Interventions: {results['overwatch_interventions']}") # Plan clarification details clarified_plans = results.get("clarified_plans", []) if clarified_plans: print(f"\nPlan Clarifications: {len(clarified_plans)}") for plan in clarified_plans: print(f" Plan {plan['plan_id']}:") print(f" Trigger: {plan['trigger']}") 
print(f" Agents Notified: {plan['agents_notified']}") print(f" Adjustments:") for adj in plan.get('adjustments', []): print(f" - {adj}") print(f"\nAgent Details:") for ar in results["agent_results"]: chaos = f" [CHAOS: {ar['chaos_condition']}]" if ar["chaos_condition"] != "none" else "" spawned = f" (spawned by {ar['spawned_by']})" if ar["spawned_by"] else "" print(f" {ar['agent_id']}: {ar['status']}{chaos}{spawned}") print("\n" + "=" * 70) async def main(): """Run the multi-agent chaos test""" project_id = f"chaos-test-{datetime.utcnow().strftime('%Y%m%d-%H%M%S')}" orchestrator = MultiAgentOrchestrator(project_id) # Set project objective orchestrator.redis.set( f"project:{project_id}:objective", "Complete multi-agent chaos test with all agents reaching EXIT phase" ) # Configure agents agents_config = [ {"type": AgentType.PYTHON, "pipeline": "infrastructure"}, {"type": AgentType.BUN, "pipeline": "code-analysis"}, {"type": AgentType.DIAGNOSTIC, "pipeline": "diagnostics"}, ] # Configure chaos schedule - designed to trigger error threshold # Single error_spike triggers plan clarification while allowing recovery chaos_schedule = [ {"delay": 0.8, "agent_index": 1, "chaos": ChaosCondition.ERROR_SPIKE}, # +5 errors triggers threshold {"delay": 2.5, "agent_index": 0, "chaos": ChaosCondition.HEARTBEAT_TIMEOUT}, # Late, after most work done ] # Run test (overwatch error_threshold defaults to 3, error_spike adds 5) results = await orchestrator.run(agents_config, chaos_schedule) # Save results results_file = LOG_DIR / f"results-{project_id}.json" with open(results_file, "w") as f: json.dump(results, f, indent=2) print(f"\nResults saved to: {results_file}") return 0 if results["success"] else 1 if __name__ == "__main__": sys.exit(asyncio.run(main()))