#!/usr/bin/env python3
"""
Governance Monitor Agents
=========================

These agents watch and enforce, they don't do work.

- Execution Watcher: Heartbeats, stuck detection, lock expiry
- Compliance Watcher: Artifact checks, forbidden action detection
- Vault Lease Watcher: Token validity, revocation enforcement
"""

import json
import sys
import time
from datetime import datetime, timezone, timedelta
from typing import Optional
import threading

from governance import (
    GovernanceManager,
    AgentPhase,
    AgentStatus,
    RevocationType
)


class ExecutionWatcher:
    """
    Monitors agent execution health.

    - Checks heartbeats
    - Detects stuck agents
    - Handles lock expiry
    """

    def __init__(self, heartbeat_timeout: int = 60, stuck_threshold: int = 300):
        """
        Args:
            heartbeat_timeout: Heartbeat timeout in seconds.
            stuck_threshold: Seconds without progress before an agent is
                reported as stuck.
        """
        self.gov = GovernanceManager()
        # NOTE(review): stored but not referenced anywhere else in this file;
        # the heartbeat check below delegates entirely to gov.is_alive().
        self.heartbeat_timeout = heartbeat_timeout  # seconds
        self.stuck_threshold = stuck_threshold  # seconds without progress

    def check_agent(self, agent_id: str) -> dict:
        """
        Check an agent's execution health.

        Returns:
            A dict with keys ``agent_id``, ``healthy`` (bool) and ``issues``
            (list of strings: ``NO_STATE``, ``HEARTBEAT_TIMEOUT``,
            ``LOCK_EXPIRED`` or ``STUCK_<seconds>s``).
        """
        result = {
            "agent_id": agent_id,
            "healthy": True,
            "issues": []
        }

        state = self.gov.get_agent_state(agent_id)
        if not state:
            result["healthy"] = False
            result["issues"].append("NO_STATE")
            return result

        # Skip completed/revoked agents
        if state.status in (AgentStatus.COMPLETED, AgentStatus.REVOKED):
            return result

        # Check heartbeat
        if not self.gov.is_alive(agent_id):
            result["healthy"] = False
            result["issues"].append("HEARTBEAT_TIMEOUT")

        # Check lock (only a RUNNING agent is expected to hold one)
        if not self.gov.has_lock(agent_id) and state.status == AgentStatus.RUNNING:
            result["healthy"] = False
            result["issues"].append("LOCK_EXPIRED")

        # Check for stuck (no progress)
        if state.last_progress_at:
            # fromisoformat() on older Pythons rejects a trailing "Z".
            last_progress = datetime.fromisoformat(
                state.last_progress_at.replace("Z", "+00:00")
            )
            if last_progress.tzinfo is None:
                # Subtracting a naive datetime from an aware one raises
                # TypeError. Assumes naive timestamps are UTC -- TODO confirm
                # against whatever writes last_progress_at.
                last_progress = last_progress.replace(tzinfo=timezone.utc)
            age = (datetime.now(timezone.utc) - last_progress).total_seconds()
            if age > self.stuck_threshold:
                result["healthy"] = False
                result["issues"].append(f"STUCK_{int(age)}s")

        return result

    def enforce(self, agent_id: str) -> Optional[str]:
        """
        Enforce health requirements, return action taken.

        Only the first actionable issue is acted on; the agent is revoked
        once and the matching ``REVOKED:...`` label is returned. Returns
        ``None`` when the agent is healthy or no issue is actionable
        (e.g. ``NO_STATE``).
        """
        check = self.check_agent(agent_id)
        if check["healthy"]:
            return None

        for issue in check["issues"]:
            if issue == "HEARTBEAT_TIMEOUT":
                self.gov.revoke_agent(
                    agent_id,
                    RevocationType.HEARTBEAT_TIMEOUT,
                    "No heartbeat received within timeout",
                )
                return "REVOKED:HEARTBEAT"
            elif issue == "LOCK_EXPIRED":
                self.gov.revoke_agent(
                    agent_id,
                    RevocationType.LOCK_EXPIRED,
                    "Lock expired while agent was running",
                )
                return "REVOKED:LOCK"
            elif issue.startswith("STUCK_"):
                # NOTE(review): stuck agents are revoked with the
                # HEARTBEAT_TIMEOUT type; confirm whether RevocationType has
                # a more specific member.
                self.gov.revoke_agent(
                    agent_id,
                    RevocationType.HEARTBEAT_TIMEOUT,
                    f"Agent stuck with no progress: {issue}",
                )
                return f"REVOKED:{issue}"

        return None


class ComplianceWatcher:
    """
    Monitors agent compliance with procedures.

    - Checks required artifacts exist
    - Detects forbidden actions
    - Validates phase transitions
    """

    def __init__(self):
        self.gov = GovernanceManager()

    def check_agent(self, agent_id: str) -> dict:
        """
        Check an agent's compliance.

        Returns:
            A dict with keys ``agent_id``, ``compliant`` (bool) and
            ``violations`` (list of strings: ``MISSING_STATE_OR_PACKET`` or
            ``EXECUTE_WITHOUT_PLAN_ARTIFACT``).
        """
        result = {
            "agent_id": agent_id,
            "compliant": True,
            "violations": []
        }

        state = self.gov.get_agent_state(agent_id)
        packet = self.gov.get_instruction_packet(agent_id)

        if not state or not packet:
            result["compliant"] = False
            result["violations"].append("MISSING_STATE_OR_PACKET")
            return result

        # Check if EXECUTE (or a later phase) was entered without PLAN artifact
        if state.phase in (AgentPhase.EXECUTE, AgentPhase.VERIFY, AgentPhase.PACKAGE):
            if not self.gov.has_required_artifact(packet.task_id, "plan"):
                result["compliant"] = False
                result["violations"].append("EXECUTE_WITHOUT_PLAN_ARTIFACT")

        # TODO: check packet.constraints["required_steps"]; this would need
        # more sophisticated tracking in production.

        return result

    def enforce(self, agent_id: str) -> Optional[str]:
        """
        Enforce compliance, return action taken.

        Revokes the agent on ``EXECUTE_WITHOUT_PLAN_ARTIFACT``. Other
        violations are reported by ``check_agent`` but not acted on here.
        """
        check = self.check_agent(agent_id)
        if check["compliant"]:
            return None

        for violation in check["violations"]:
            if violation == "EXECUTE_WITHOUT_PLAN_ARTIFACT":
                self.gov.revoke_agent(
                    agent_id,
                    RevocationType.PROCEDURE_VIOLATION,
                    "Attempted EXECUTE phase without plan artifact",
                )
                return "REVOKED:NO_PLAN_ARTIFACT"

        return None


class VaultLeaseWatcher:
    """
    Monitors Vault token/lease validity.

    - Checks token accessibility
    - Confirms revocation signals
    - Enforces token revocation
    """

    def __init__(self):
        self.gov = GovernanceManager()
        # Key/value store exposed by the governance manager (presumably a
        # Redis-style client, given set(ex=...)/exists/delete/keys -- verify).
        self.db = self.gov.db

    def set_revocation_signal(self, agent_id: str):
        """Set a signal that this agent should be revoked (expires after 300s)."""
        self.db.set(f"agent:{agent_id}:revoke_signal", "1", ex=300)

    def has_revocation_signal(self, agent_id: str) -> bool:
        """Check if revocation signal is set."""
        # exists() may return a count rather than a bool; normalise so the
        # return value matches the annotation.
        return bool(self.db.exists(f"agent:{agent_id}:revoke_signal"))

    def clear_revocation_signal(self, agent_id: str):
        """Clear revocation signal after enforcement."""
        self.db.delete(f"agent:{agent_id}:revoke_signal")

    def enforce(self, agent_id: str) -> Optional[str]:
        """
        Enforce revocation signal.

        If a signal is set, revokes the agent, clears the signal and returns
        ``"REVOKED:SIGNAL"``; otherwise returns ``None``.
        """
        if self.has_revocation_signal(agent_id):
            self.gov.revoke_agent(
                agent_id,
                RevocationType.MANUAL,
                "Revocation signal received",
            )
            self.clear_revocation_signal(agent_id)
            return "REVOKED:SIGNAL"
        return None


class GovernanceMonitorDaemon:
    """
    Background daemon that runs all monitors periodically.
    """

    _STATE_KEY_PREFIX = "agent:"
    _STATE_KEY_SUFFIX = ":state"

    def __init__(self, interval: int = 10):
        """
        Args:
            interval: Seconds to sleep between monitoring passes.
        """
        self.interval = interval
        self.gov = GovernanceManager()
        self.execution_watcher = ExecutionWatcher()
        self.compliance_watcher = ComplianceWatcher()
        self.vault_watcher = VaultLeaseWatcher()
        self.running = False

    @classmethod
    def _agent_id_from_key(cls, key) -> str:
        """Extract the agent id from an ``agent:<id>:state`` key."""
        if isinstance(key, bytes):
            # Some clients return keys as bytes unless configured to decode.
            key = key.decode("utf-8")
        # Strip the fixed prefix/suffix instead of splitting on ":" so agent
        # ids that themselves contain ":" are not truncated.
        return key.removeprefix(cls._STATE_KEY_PREFIX).removesuffix(
            cls._STATE_KEY_SUFFIX
        )

    def _can_enforce(self, agent_id: str) -> bool:
        """Return True if the agent still has state and is not REVOKED."""
        state = self.gov.get_agent_state(agent_id)
        return bool(state) and state.status != AgentStatus.REVOKED

    def get_active_agents(self) -> list[str]:
        """Get list of agents that need monitoring (status RUNNING)."""
        # Get all agent state keys
        keys = self.gov.db.keys("agent:*:state")
        agents = []
        for key in keys:
            agent_id = self._agent_id_from_key(key)
            state = self.gov.get_agent_state(agent_id)
            if state and state.status == AgentStatus.RUNNING:
                agents.append(agent_id)
        return agents

    def run_checks(self) -> list[dict]:
        """
        Run all monitors once.

        Returns:
            One dict per agent on which at least one action was taken, with
            keys ``agent_id``, ``timestamp`` (UTC ISO-8601) and ``actions``.
        """
        results = []

        for agent_id in self.get_active_agents():
            result = {
                "agent_id": agent_id,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "actions": []
            }

            # Execution check
            action = self.execution_watcher.enforce(agent_id)
            if action:
                result["actions"].append(action)

            # Compliance check (only if not already revoked)
            if self._can_enforce(agent_id):
                action = self.compliance_watcher.enforce(agent_id)
                if action:
                    result["actions"].append(action)

            # Vault check. State is re-read here so that a revocation issued
            # by the compliance check above is honoured and the agent is not
            # revoked a second time.
            if self._can_enforce(agent_id):
                action = self.vault_watcher.enforce(agent_id)
                if action:
                    result["actions"].append(action)

            if result["actions"]:
                results.append(result)

        return results

    def run_daemon(self):
        """Run as a daemon (blocking) until ``stop()`` clears ``running``."""
        self.running = True
        print(f"[MONITOR] Starting governance monitor daemon (interval: {self.interval}s)")

        while self.running:
            results = self.run_checks()
            for r in results:
                print(f"[MONITOR] {r['agent_id']}: {r['actions']}")
            time.sleep(self.interval)

    def stop(self):
        """Stop the daemon (takes effect after the current sleep finishes)."""
        self.running = False


# =============================================================================
# CLI
# =============================================================================

def _print_usage() -> None:
    """Print command-line usage."""
    print("Usage: monitors.py <command> [agent_id]")
    print("Commands:")
    print("  check <agent_id>   - Run all checks on an agent")
    print("  daemon             - Run monitor daemon")
    print("  signal <agent_id>  - Send revocation signal")


def main(argv: Optional[list[str]] = None) -> int:
    """
    Command-line entry point.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 1 on usage errors.
    """
    args = sys.argv if argv is None else argv

    if len(args) < 2:
        _print_usage()
        return 1

    cmd = args[1]

    # "check" and "signal" need an agent id; fail with usage instead of an
    # IndexError when it is missing.
    if cmd in ("check", "signal") and len(args) < 3:
        print(f"Error: '{cmd}' requires an agent_id")
        _print_usage()
        return 1

    if cmd == "check":
        agent_id = args[2]

        exec_watch = ExecutionWatcher()
        comp_watch = ComplianceWatcher()

        print(f"=== Checking Agent: {agent_id} ===\n")

        print("Execution Check:")
        result = exec_watch.check_agent(agent_id)
        print(f"  Healthy: {result['healthy']}")
        print(f"  Issues: {result['issues']}")

        print("\nCompliance Check:")
        result = comp_watch.check_agent(agent_id)
        print(f"  Compliant: {result['compliant']}")
        print(f"  Violations: {result['violations']}")

    elif cmd == "daemon":
        daemon = GovernanceMonitorDaemon(interval=10)
        try:
            daemon.run_daemon()
        except KeyboardInterrupt:
            daemon.stop()
            print("\n[MONITOR] Stopped")

    elif cmd == "signal":
        agent_id = args[2]
        watcher = VaultLeaseWatcher()
        watcher.set_revocation_signal(agent_id)
        print(f"Revocation signal set for {agent_id}")

    else:
        print(f"Unknown command: {cmd}")
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())