Phase 8 Production Hardening with complete governance infrastructure: - Vault integration with tiered policies (T0-T4) - DragonflyDB state management - SQLite audit ledger - Pipeline DSL and templates - Promotion/revocation engine - Checkpoint system for session persistence - Health manager and circuit breaker for fault tolerance - GitHub/Slack integrations - Architectural test pipeline with bug watcher, suggestion engine, council review - Multi-agent chaos testing framework Test Results: - Governance tests: 68/68 passing - E2E workflow: 16/16 passing - Phase 2 Vault: 14/14 passing - Integration tests: 27/27 passing Coverage: 57.6% average across 12 phases Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
318 lines
10 KiB
Python
Executable File
318 lines
10 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Governance Monitor Agents
|
|
=========================
|
|
These agents watch and enforce, they don't do work.
|
|
|
|
- Execution Watcher: Heartbeats, stuck detection, lock expiry
|
|
- Compliance Watcher: Artifact checks, forbidden action detection
|
|
- Vault Lease Watcher: Token validity, revocation enforcement
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
from datetime import datetime, timezone, timedelta
|
|
from typing import Optional
|
|
import threading
|
|
|
|
from governance import (
|
|
GovernanceManager,
|
|
AgentPhase,
|
|
AgentStatus,
|
|
RevocationType
|
|
)
|
|
|
|
|
|
class ExecutionWatcher:
    """
    Monitors agent execution health.

    - Checks heartbeats
    - Detects stuck agents
    - Handles lock expiry

    The watcher only inspects state exposed by ``GovernanceManager`` and
    revokes unhealthy agents; it performs no agent work itself.
    """

    def __init__(self, heartbeat_timeout: int = 60, stuck_threshold: int = 300):
        """Create a watcher.

        Args:
            heartbeat_timeout: Seconds. Stored on the instance; this class
                does not read it itself (liveness is delegated to
                ``GovernanceManager.is_alive``).
            stuck_threshold: Seconds without recorded progress after which
                an agent is reported as stuck.
        """
        self.gov = GovernanceManager()
        self.heartbeat_timeout = heartbeat_timeout  # seconds
        self.stuck_threshold = stuck_threshold  # seconds without progress

    @staticmethod
    def _parse_timestamp(value: str) -> datetime:
        """Parse an ISO-8601 string into a timezone-aware datetime.

        A trailing "Z" is rewritten to "+00:00" because
        ``datetime.fromisoformat`` only accepts it natively on newer Python
        versions. A timestamp without any UTC offset is assumed to be UTC so
        that it can be subtracted from an aware "now" without a TypeError.

        Raises:
            ValueError: If ``value`` is not a valid ISO-8601 timestamp.
        """
        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    def check_agent(self, agent_id: str) -> dict:
        """Check an agent's execution health.

        Returns:
            A dict with keys ``agent_id``, ``healthy`` (bool) and ``issues``
            (list of str). Possible issues are ``NO_STATE``,
            ``HEARTBEAT_TIMEOUT``, ``LOCK_EXPIRED`` and ``STUCK_<seconds>s``.
        """
        result = {
            "agent_id": agent_id,
            "healthy": True,
            "issues": []
        }

        state = self.gov.get_agent_state(agent_id)
        if not state:
            result["healthy"] = False
            result["issues"].append("NO_STATE")
            return result

        # Skip completed/revoked agents
        if state.status in [AgentStatus.COMPLETED, AgentStatus.REVOKED]:
            return result

        # Check heartbeat
        if not self.gov.is_alive(agent_id):
            result["healthy"] = False
            result["issues"].append("HEARTBEAT_TIMEOUT")

        # Check lock: only a RUNNING agent is required to hold one
        if not self.gov.has_lock(agent_id) and state.status == AgentStatus.RUNNING:
            result["healthy"] = False
            result["issues"].append("LOCK_EXPIRED")

        # Check for stuck (no progress)
        if state.last_progress_at:
            last_progress = self._parse_timestamp(state.last_progress_at)
            age = (datetime.now(timezone.utc) - last_progress).total_seconds()
            if age > self.stuck_threshold:
                result["healthy"] = False
                result["issues"].append(f"STUCK_{int(age)}s")

        return result

    def enforce(self, agent_id: str) -> Optional[str]:
        """Enforce health requirements, return action taken.

        Only the first actionable issue reported by ``check_agent`` is acted
        on. ``NO_STATE`` has no enforcement action, so it yields ``None``.

        Returns:
            ``"REVOKED:HEARTBEAT"``, ``"REVOKED:LOCK"``,
            ``"REVOKED:STUCK_<seconds>s"``, or ``None`` when nothing was done.
        """
        check = self.check_agent(agent_id)

        if check["healthy"]:
            return None

        for issue in check["issues"]:
            if issue == "HEARTBEAT_TIMEOUT":
                self.gov.revoke_agent(agent_id, RevocationType.HEARTBEAT_TIMEOUT,
                                      "No heartbeat received within timeout")
                return "REVOKED:HEARTBEAT"

            elif issue == "LOCK_EXPIRED":
                self.gov.revoke_agent(agent_id, RevocationType.LOCK_EXPIRED,
                                      "Lock expired while agent was running")
                return "REVOKED:LOCK"

            elif issue.startswith("STUCK_"):
                # Stuck agents are revoked under the HEARTBEAT_TIMEOUT type.
                self.gov.revoke_agent(agent_id, RevocationType.HEARTBEAT_TIMEOUT,
                                      f"Agent stuck with no progress: {issue}")
                return f"REVOKED:{issue}"

        return None
|
|
|
|
|
|
class ComplianceWatcher:
    """
    Monitors agent compliance with procedures.

    - Checks required artifacts exist
    - Detects forbidden actions
    - Validates phase transitions

    The only rule implemented here is that an agent in the EXECUTE, VERIFY
    or PACKAGE phase must have a "plan" artifact for its task.
    """

    def __init__(self):
        """Create a watcher backed by its own ``GovernanceManager``."""
        self.gov = GovernanceManager()

    def check_agent(self, agent_id: str) -> dict:
        """Check an agent's compliance.

        Returns:
            A dict with keys ``agent_id``, ``compliant`` (bool) and
            ``violations`` (list of str). Possible violations are
            ``MISSING_STATE_OR_PACKET`` and ``EXECUTE_WITHOUT_PLAN_ARTIFACT``.
        """
        result = {
            "agent_id": agent_id,
            "compliant": True,
            "violations": []
        }

        state = self.gov.get_agent_state(agent_id)
        packet = self.gov.get_instruction_packet(agent_id)

        if not state or not packet:
            result["compliant"] = False
            result["violations"].append("MISSING_STATE_OR_PACKET")
            return result

        # Check if EXECUTE (or any later phase) was entered without PLAN artifact
        if state.phase in [AgentPhase.EXECUTE, AgentPhase.VERIFY, AgentPhase.PACKAGE]:
            if not self.gov.has_required_artifact(packet.task_id, "plan"):
                result["compliant"] = False
                result["violations"].append("EXECUTE_WITHOUT_PLAN_ARTIFACT")

        # TODO: validate packet.constraints["required_steps"]; this would
        # need more sophisticated step tracking in production.

        return result

    def enforce(self, agent_id: str) -> Optional[str]:
        """Enforce compliance, return action taken.

        ``MISSING_STATE_OR_PACKET`` has no enforcement action and yields
        ``None``.

        Returns:
            ``"REVOKED:NO_PLAN_ARTIFACT"`` when the agent was revoked,
            otherwise ``None``.
        """
        check = self.check_agent(agent_id)

        if check["compliant"]:
            return None

        for violation in check["violations"]:
            if violation == "EXECUTE_WITHOUT_PLAN_ARTIFACT":
                self.gov.revoke_agent(agent_id, RevocationType.PROCEDURE_VIOLATION,
                                      "Attempted EXECUTE phase without plan artifact")
                return "REVOKED:NO_PLAN_ARTIFACT"

        return None
|
|
|
|
|
|
class VaultLeaseWatcher:
    """
    Monitors Vault token/lease validity.

    - Checks token accessibility
    - Confirms revocation signals
    - Enforces token revocation

    The code in this class only manages and enforces the per-agent
    revocation signal key; it does not itself talk to Vault.
    """

    def __init__(self):
        """Create a watcher sharing the ``GovernanceManager`` database handle."""
        self.gov = GovernanceManager()
        self.db = self.gov.db

    @staticmethod
    def _signal_key(agent_id: str) -> str:
        """Return the database key that holds an agent's revocation signal."""
        return f"agent:{agent_id}:revoke_signal"

    def set_revocation_signal(self, agent_id: str):
        """Set a signal that this agent should be revoked"""
        # ex=300: presumably a 300-second expiry so stale signals do not
        # linger -- confirm against the db client's `set` semantics.
        self.db.set(self._signal_key(agent_id), "1", ex=300)

    def has_revocation_signal(self, agent_id: str) -> bool:
        """Check if revocation signal is set"""
        # `exists` may return a count rather than a bool; normalise so the
        # declared return type holds.
        return bool(self.db.exists(self._signal_key(agent_id)))

    def clear_revocation_signal(self, agent_id: str):
        """Clear revocation signal after enforcement"""
        self.db.delete(self._signal_key(agent_id))

    def enforce(self, agent_id: str) -> Optional[str]:
        """Enforce revocation signal.

        Returns:
            ``"REVOKED:SIGNAL"`` when a pending signal was found and the
            agent was revoked, otherwise ``None``.
        """
        if self.has_revocation_signal(agent_id):
            self.gov.revoke_agent(agent_id, RevocationType.MANUAL,
                                  "Revocation signal received")
            # Cleared only after revoke_agent returns, so a failed revocation
            # leaves the signal in place for the next pass.
            self.clear_revocation_signal(agent_id)
            return "REVOKED:SIGNAL"
        return None
|
|
|
|
|
|
class GovernanceMonitorDaemon:
    """
    Background daemon that runs all monitors periodically.

    Each pass runs the execution, compliance and Vault watchers, in that
    order, against every agent whose stored status is RUNNING.
    """

    def __init__(self, interval: int = 10):
        """Create the daemon.

        Args:
            interval: Seconds to sleep between monitoring passes.
        """
        self.interval = interval
        self.gov = GovernanceManager()
        self.execution_watcher = ExecutionWatcher()
        self.compliance_watcher = ComplianceWatcher()
        self.vault_watcher = VaultLeaseWatcher()
        self.running = False

    @staticmethod
    def _agent_id_from_key(key: "str | bytes") -> str:
        """Extract the agent id from an ``agent:<id>:state`` key.

        Accepts ``bytes`` keys (returned by clients that do not decode
        responses) and keeps ids that themselves contain ":" intact.
        """
        if isinstance(key, bytes):
            key = key.decode("utf-8")
        return key.removeprefix("agent:").removesuffix(":state")

    def get_active_agents(self) -> list[str]:
        """Get list of agents that need monitoring"""
        # Get all agent state keys
        keys = self.gov.db.keys("agent:*:state")
        agents = []
        for key in keys:
            agent_id = self._agent_id_from_key(key)
            state = self.gov.get_agent_state(agent_id)
            if state and state.status == AgentStatus.RUNNING:
                agents.append(agent_id)
        return agents

    def run_checks(self) -> list[dict]:
        """Run all monitors once.

        Returns:
            One dict per agent on which at least one action was taken, with
            keys ``agent_id``, ``timestamp`` (UTC ISO-8601) and ``actions``.
        """
        results = []

        for agent_id in self.get_active_agents():
            result = {
                "agent_id": agent_id,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "actions": []
            }

            # Execution check
            action = self.execution_watcher.enforce(agent_id)
            if action:
                result["actions"].append(action)

            # Compliance check, then Vault check. State is re-read before
            # each one so an agent revoked by an earlier watcher in this
            # pass is not handed to the next watcher.
            for watcher in (self.compliance_watcher, self.vault_watcher):
                state = self.gov.get_agent_state(agent_id)
                if not state or state.status == AgentStatus.REVOKED:
                    break
                action = watcher.enforce(agent_id)
                if action:
                    result["actions"].append(action)

            if result["actions"]:
                results.append(result)

        return results

    def run_daemon(self):
        """Run as a daemon (blocking).

        Loops until ``stop()`` clears ``self.running``; the flag is only
        re-checked after each sleep, so stopping takes effect at the end of
        the current interval.
        """
        self.running = True
        print(f"[MONITOR] Starting governance monitor daemon (interval: {self.interval}s)")

        while self.running:
            results = self.run_checks()
            for r in results:
                print(f"[MONITOR] {r['agent_id']}: {r['actions']}")
            time.sleep(self.interval)

    def stop(self):
        """Stop the daemon"""
        self.running = False
|
|
|
|
|
|
# =============================================================================
|
|
# CLI
|
|
# =============================================================================
|
|
|
|
def main(argv: list[str]) -> int:
    """Run the monitor command-line interface.

    Args:
        argv: Full argument vector, including the program name at index 0.

    Returns:
        Process exit status: 0 on success, 1 on usage errors (no command,
        missing ``<agent_id>``, or unknown command).
    """
    def print_usage() -> None:
        print("Usage: monitors.py <command>")
        print("Commands:")
        print(" check <agent_id> - Run all checks on an agent")
        print(" daemon - Run monitor daemon")
        print(" signal <agent_id> - Send revocation signal")

    if len(argv) < 2:
        print_usage()
        return 1

    cmd = argv[1]

    # Both commands index argv[2]; fail with usage instead of an IndexError.
    if cmd in ("check", "signal") and len(argv) < 3:
        print(f"Missing <agent_id> for command: {cmd}")
        print_usage()
        return 1

    if cmd == "check":
        agent_id = argv[2]

        exec_watch = ExecutionWatcher()
        comp_watch = ComplianceWatcher()

        print(f"=== Checking Agent: {agent_id} ===\n")

        print("Execution Check:")
        result = exec_watch.check_agent(agent_id)
        print(f" Healthy: {result['healthy']}")
        print(f" Issues: {result['issues']}")

        print("\nCompliance Check:")
        result = comp_watch.check_agent(agent_id)
        print(f" Compliant: {result['compliant']}")
        print(f" Violations: {result['violations']}")

    elif cmd == "daemon":
        daemon = GovernanceMonitorDaemon(interval=10)
        try:
            daemon.run_daemon()
        except KeyboardInterrupt:
            daemon.stop()
            print("\n[MONITOR] Stopped")

    elif cmd == "signal":
        agent_id = argv[2]
        watcher = VaultLeaseWatcher()
        watcher.set_revocation_signal(agent_id)
        print(f"Revocation signal set for {agent_id}")

    else:
        print(f"Unknown command: {cmd}")
        # An unrecognised command is a usage error, so report failure.
        return 1

    return 0


if __name__ == "__main__":
    import sys

    sys.exit(main(sys.argv))
|