# agent-governance/agents/llm-planner/monitors.py

#!/usr/bin/env python3
"""
Governance Monitor Agents
=========================
These agents watch and enforce; they do not perform task work themselves.

- Execution Watcher: Heartbeats, stuck detection, lock expiry
- Compliance Watcher: Artifact checks, forbidden action detection
- Vault Lease Watcher: Token validity, revocation enforcement
"""

import json
import time
from datetime import datetime, timezone, timedelta
from typing import Optional
import threading

from governance import (
    GovernanceManager,
    AgentPhase,
    AgentStatus,
    RevocationType
)


class ExecutionWatcher:
    """
    Monitors agent execution health.
    - Checks heartbeats
    - Detects stuck agents
    - Handles lock expiry

    Heartbeat liveness and lock ownership are delegated to the
    GovernanceManager (``is_alive`` / ``has_lock``); only the "stuck"
    detection (time since last progress) is computed here.
    """

    def __init__(self, heartbeat_timeout: int = 60, stuck_threshold: int = 300):
        """
        Args:
            heartbeat_timeout: Heartbeat timeout in seconds. It is stored on
                the instance but not read by any method of this class.
                NOTE(review): liveness is decided by ``gov.is_alive`` --
                confirm where this timeout is meant to be applied.
            stuck_threshold: Maximum number of seconds without recorded
                progress before an agent is reported as stuck.
        """
        self.gov = GovernanceManager()
        self.heartbeat_timeout = heartbeat_timeout  # seconds
        self.stuck_threshold = stuck_threshold  # seconds without progress

    @staticmethod
    def _seconds_since(timestamp: str) -> float:
        """Return the seconds elapsed between an ISO-8601 timestamp and now (UTC).

        A trailing "Z" is rewritten to "+00:00" because
        ``datetime.fromisoformat`` only accepts the "Z" suffix from
        Python 3.11 onwards. A timestamp without any UTC offset is treated
        as UTC; otherwise subtracting it from the timezone-aware "now"
        would raise ``TypeError``.

        Raises:
            ValueError: If ``timestamp`` is not a valid ISO-8601 string.
        """
        moment = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
        if moment.tzinfo is None:
            moment = moment.replace(tzinfo=timezone.utc)
        return (datetime.now(timezone.utc) - moment).total_seconds()

    def check_agent(self, agent_id: str) -> dict:
        """Check an agent's execution health.

        Returns:
            A dict with keys ``agent_id``, ``healthy`` (bool) and ``issues``
            (list of str). Possible issues are ``NO_STATE``,
            ``HEARTBEAT_TIMEOUT``, ``LOCK_EXPIRED`` and ``STUCK_<seconds>s``.
            Agents whose status is COMPLETED or REVOKED are always reported
            healthy with no issues.
        """
        result = {
            "agent_id": agent_id,
            "healthy": True,
            "issues": []
        }

        state = self.gov.get_agent_state(agent_id)
        if not state:
            result["healthy"] = False
            result["issues"].append("NO_STATE")
            return result

        # Skip completed/revoked agents: there is nothing left to enforce.
        if state.status in (AgentStatus.COMPLETED, AgentStatus.REVOKED):
            return result

        # Check heartbeat
        if not self.gov.is_alive(agent_id):
            result["healthy"] = False
            result["issues"].append("HEARTBEAT_TIMEOUT")

        # Check lock: a missing lock is only a problem while the agent runs.
        if not self.gov.has_lock(agent_id) and state.status == AgentStatus.RUNNING:
            result["healthy"] = False
            result["issues"].append("LOCK_EXPIRED")

        # Check for stuck (no progress); skipped when no progress was recorded.
        if state.last_progress_at:
            age = self._seconds_since(state.last_progress_at)
            if age > self.stuck_threshold:
                result["healthy"] = False
                result["issues"].append(f"STUCK_{int(age)}s")

        return result

    def enforce(self, agent_id: str) -> Optional[str]:
        """Enforce health requirements, return action taken.

        Only the first actionable issue reported by ``check_agent`` triggers
        a revocation; the method returns immediately afterwards.

        Returns:
            ``"REVOKED:HEARTBEAT"``, ``"REVOKED:LOCK"`` or
            ``"REVOKED:STUCK_<seconds>s"`` when the agent was revoked, or
            ``None`` when the agent is healthy or the only issue is one that
            has no enforcement action (``NO_STATE``).
        """
        check = self.check_agent(agent_id)

        if check["healthy"]:
            return None

        for issue in check["issues"]:
            if issue == "HEARTBEAT_TIMEOUT":
                self.gov.revoke_agent(agent_id, RevocationType.HEARTBEAT_TIMEOUT,
                                      "No heartbeat received within timeout")
                return "REVOKED:HEARTBEAT"

            elif issue == "LOCK_EXPIRED":
                self.gov.revoke_agent(agent_id, RevocationType.LOCK_EXPIRED,
                                      "Lock expired while agent was running")
                return "REVOKED:LOCK"

            elif issue.startswith("STUCK_"):
                # Stuck agents are revoked under the HEARTBEAT_TIMEOUT type.
                # NOTE(review): presumably because no dedicated "stuck"
                # revocation type exists -- confirm against RevocationType.
                self.gov.revoke_agent(agent_id, RevocationType.HEARTBEAT_TIMEOUT,
                                      f"Agent stuck with no progress: {issue}")
                return f"REVOKED:{issue}"

        return None


class ComplianceWatcher:
    """
    Monitors agent compliance with procedures.

    Currently implemented:
    - Checks that a "plan" artifact exists once an agent is in the
      EXECUTE, VERIFY or PACKAGE phase

    Not implemented yet (see the TODO in ``check_agent``):
    - Tracking of the ``required_steps`` listed in the packet constraints
    """

    def __init__(self):
        self.gov = GovernanceManager()

    def check_agent(self, agent_id: str) -> dict:
        """Check an agent's compliance.

        Returns:
            A dict with keys ``agent_id``, ``compliant`` (bool) and
            ``violations`` (list of str). Possible violations are
            ``MISSING_STATE_OR_PACKET`` and ``EXECUTE_WITHOUT_PLAN_ARTIFACT``.
        """
        result = {
            "agent_id": agent_id,
            "compliant": True,
            "violations": []
        }

        state = self.gov.get_agent_state(agent_id)
        packet = self.gov.get_instruction_packet(agent_id)

        # Without both the state and the instruction packet nothing else
        # can be evaluated, so report and stop here.
        if not state or not packet:
            result["compliant"] = False
            result["violations"].append("MISSING_STATE_OR_PACKET")
            return result

        # Check if EXECUTE (or a later phase) was entered without PLAN artifact
        post_plan_phases = (AgentPhase.EXECUTE, AgentPhase.VERIFY, AgentPhase.PACKAGE)
        if state.phase in post_plan_phases:
            if not self.gov.has_required_artifact(packet.task_id, "plan"):
                result["compliant"] = False
                result["violations"].append("EXECUTE_WITHOUT_PLAN_ARTIFACT")

        # TODO: check the "required_steps" entry of packet.constraints.
        # This would need more sophisticated tracking in production.

        return result

    def enforce(self, agent_id: str) -> Optional[str]:
        """Enforce compliance, return action taken.

        Returns:
            ``"REVOKED:NO_PLAN_ARTIFACT"`` when the agent was revoked for
            running past PLAN without a plan artifact, otherwise ``None``
            (including for ``MISSING_STATE_OR_PACKET``, which has no
            enforcement action).
        """
        check = self.check_agent(agent_id)

        if check["compliant"]:
            return None

        for violation in check["violations"]:
            if violation == "EXECUTE_WITHOUT_PLAN_ARTIFACT":
                self.gov.revoke_agent(agent_id, RevocationType.PROCEDURE_VIOLATION,
                                      "Attempted EXECUTE phase without plan artifact")
                return "REVOKED:NO_PLAN_ARTIFACT"

        return None


class VaultLeaseWatcher:
    """
    Enforces revocation signals for agents.

    A signal is a key ``agent:<agent_id>:revoke_signal`` in the governance
    database. Setting it asks the monitor to revoke the agent; ``enforce``
    performs the revocation and then clears the signal.

    NOTE(review): this class does not itself inspect Vault tokens or
    leases; presumably token revocation happens inside
    ``GovernanceManager.revoke_agent`` -- confirm.
    """

    def __init__(self):
        self.gov = GovernanceManager()
        # Share the manager's database handle for the signal keys.
        self.db = self.gov.db

    def set_revocation_signal(self, agent_id: str):
        """Set a signal that this agent should be revoked.

        The key is written with ``ex=300``.
        NOTE(review): presumably a 300-second expiry, so an unenforced
        signal lapses on its own -- confirm against the database client.
        """
        self.db.set(f"agent:{agent_id}:revoke_signal", "1", ex=300)

    def has_revocation_signal(self, agent_id: str) -> bool:
        """Check if revocation signal is set.

        The database result is coerced with ``bool()`` because Redis-style
        clients return an integer key count from ``exists`` rather than a
        boolean.
        """
        return bool(self.db.exists(f"agent:{agent_id}:revoke_signal"))

    def clear_revocation_signal(self, agent_id: str):
        """Clear revocation signal after enforcement."""
        self.db.delete(f"agent:{agent_id}:revoke_signal")

    def enforce(self, agent_id: str) -> Optional[str]:
        """Enforce revocation signal.

        Returns:
            ``"REVOKED:SIGNAL"`` when a signal was present and the agent was
            revoked, otherwise ``None``.
        """
        if self.has_revocation_signal(agent_id):
            self.gov.revoke_agent(agent_id, RevocationType.MANUAL,
                                  "Revocation signal received")
            # Clear only after revoking, so the signal is not lost if
            # revoke_agent raises.
            self.clear_revocation_signal(agent_id)
            return "REVOKED:SIGNAL"
        return None


class GovernanceMonitorDaemon:
    """
    Background daemon that runs all monitors periodically.

    Each pass runs the execution, compliance and vault watchers, in that
    order, against every agent whose status is RUNNING.
    """

    def __init__(self, interval: int = 10):
        """
        Args:
            interval: Seconds to sleep between two monitoring passes.
        """
        self.interval = interval
        self.gov = GovernanceManager()
        self.execution_watcher = ExecutionWatcher()
        self.compliance_watcher = ComplianceWatcher()
        self.vault_watcher = VaultLeaseWatcher()
        self.running = False

    @staticmethod
    def _agent_id_from_key(key) -> str:
        """Extract the agent ID from an ``agent:<agent_id>:state`` key.

        Bytes keys are decoded as UTF-8 first. The prefix and suffix are
        stripped instead of splitting on ":", so agent IDs that themselves
        contain a colon are returned whole.
        """
        if isinstance(key, bytes):
            key = key.decode("utf-8")
        return key.removeprefix("agent:").removesuffix(":state")

    def _can_still_enforce(self, agent_id: str) -> bool:
        """Return True if the agent still has state and is not REVOKED.

        The state is re-read on every call so that a revocation performed
        by an earlier watcher in the same pass is seen.
        """
        state = self.gov.get_agent_state(agent_id)
        return bool(state) and state.status != AgentStatus.REVOKED

    def get_active_agents(self) -> list[str]:
        """Get list of agents that need monitoring.

        Returns:
            The IDs of all agents that have a state key and whose status is
            RUNNING.
        """
        agents = []
        # Get all agent state keys
        for key in self.gov.db.keys("agent:*:state"):
            agent_id = self._agent_id_from_key(key)
            state = self.gov.get_agent_state(agent_id)
            if state and state.status == AgentStatus.RUNNING:
                agents.append(agent_id)
        return agents

    def run_checks(self) -> list[dict]:
        """Run all monitors once.

        Returns:
            One dict per agent on which at least one action was taken, with
            keys ``agent_id``, ``timestamp`` (ISO-8601, UTC) and ``actions``
            (list of action strings returned by the watchers).
        """
        results = []

        for agent_id in self.get_active_agents():
            result = {
                "agent_id": agent_id,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "actions": []
            }

            # Execution check
            action = self.execution_watcher.enforce(agent_id)
            if action:
                result["actions"].append(action)

            # Compliance check (only if not already revoked)
            if self._can_still_enforce(agent_id):
                action = self.compliance_watcher.enforce(agent_id)
                if action:
                    result["actions"].append(action)

            # Vault check (only if not already revoked); the state is
            # re-read because the compliance check may just have revoked
            # the agent.
            if self._can_still_enforce(agent_id):
                action = self.vault_watcher.enforce(agent_id)
                if action:
                    result["actions"].append(action)

            if result["actions"]:
                results.append(result)

        return results

    def run_daemon(self):
        """Run as a daemon (blocking).

        Loops until ``stop()`` sets ``self.running`` to False, printing the
        actions taken in each pass and sleeping ``self.interval`` seconds
        between passes. A stop request is only noticed after the current
        sleep has finished.
        """
        self.running = True
        print(f"[MONITOR] Starting governance monitor daemon (interval: {self.interval}s)")

        while self.running:
            results = self.run_checks()
            for r in results:
                print(f"[MONITOR] {r['agent_id']}: {r['actions']}")
            time.sleep(self.interval)

    def stop(self):
        """Stop the daemon."""
        self.running = False


# =============================================================================
# CLI
# =============================================================================

if __name__ == "__main__":
    import sys

    # Usage text, printed whenever the command line cannot be acted on.
    usage = "\n".join((
        "Usage: monitors.py <command>",
        "Commands:",
        "  check <agent_id>  - Run all checks on an agent",
        "  daemon            - Run monitor daemon",
        "  signal <agent_id> - Send revocation signal",
    ))

    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    cmd = sys.argv[1]

    # "check" and "signal" read sys.argv[2]; report a missing agent ID
    # with the usage text instead of failing with an IndexError traceback.
    if cmd in ("check", "signal") and len(sys.argv) < 3:
        print(f"Error: '{cmd}' requires an <agent_id> argument")
        print(usage)
        sys.exit(1)

    if cmd == "check":
        # Read-only report: runs check_agent (not enforce) on the execution
        # and compliance watchers, so no agent is revoked by this command.
        agent_id = sys.argv[2]

        exec_watch = ExecutionWatcher()
        comp_watch = ComplianceWatcher()

        print(f"=== Checking Agent: {agent_id} ===\n")

        print("Execution Check:")
        result = exec_watch.check_agent(agent_id)
        print(f"  Healthy: {result['healthy']}")
        print(f"  Issues: {result['issues']}")

        print("\nCompliance Check:")
        result = comp_watch.check_agent(agent_id)
        print(f"  Compliant: {result['compliant']}")
        print(f"  Violations: {result['violations']}")

    elif cmd == "daemon":
        # Blocks until interrupted with Ctrl-C.
        daemon = GovernanceMonitorDaemon(interval=10)
        try:
            daemon.run_daemon()
        except KeyboardInterrupt:
            daemon.stop()
            print("\n[MONITOR] Stopped")

    elif cmd == "signal":
        agent_id = sys.argv[2]
        watcher = VaultLeaseWatcher()
        watcher.set_revocation_signal(agent_id)
        print(f"Revocation signal set for {agent_id}")

    else:
        print(f"Unknown command: {cmd}")
        # Exit non-zero so calling scripts can detect the bad invocation.
        sys.exit(1)