profit 77655c298c Initial commit: Agent Governance System Phase 8
Phase 8 Production Hardening with complete governance infrastructure:

- Vault integration with tiered policies (T0-T4)
- DragonflyDB state management
- SQLite audit ledger
- Pipeline DSL and templates
- Promotion/revocation engine
- Checkpoint system for session persistence
- Health manager and circuit breaker for fault tolerance
- GitHub/Slack integrations
- Architectural test pipeline with bug watcher, suggestion engine, council review
- Multi-agent chaos testing framework

Test Results:
- Governance tests: 68/68 passing
- E2E workflow: 16/16 passing
- Phase 2 Vault: 14/14 passing
- Integration tests: 27/27 passing

Coverage: 57.6% average across 12 phases

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 22:07:06 -05:00

318 lines
10 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Governance Monitor Agents
=========================
These agents watch and enforce; they don't do work.
- Execution Watcher: Heartbeats, stuck detection, lock expiry
- Compliance Watcher: Artifact checks, forbidden action detection
- Vault Lease Watcher: Token validity, revocation enforcement
"""
import json
import time
from datetime import datetime, timezone, timedelta
from typing import Optional
import threading
from governance import (
GovernanceManager,
AgentPhase,
AgentStatus,
RevocationType
)
class ExecutionWatcher:
    """
    Monitors agent execution health.

    - Checks heartbeats
    - Detects stuck agents
    - Handles lock expiry

    All agent state is read through a ``GovernanceManager``; ``check_agent``
    only inspects it, while ``enforce`` asks the manager to revoke the agent.
    """

    def __init__(self, heartbeat_timeout: int = 60, stuck_threshold: int = 300):
        """
        Args:
            heartbeat_timeout: Heartbeat timeout in seconds. It is stored on
                the instance but not read by any method of this class;
                liveness is decided by ``gov.is_alive``.
            stuck_threshold: Seconds without recorded progress after which an
                agent is reported as stuck.
        """
        self.gov = GovernanceManager()
        self.heartbeat_timeout = heartbeat_timeout  # seconds
        self.stuck_threshold = stuck_threshold  # seconds without progress

    @staticmethod
    def _seconds_since(timestamp: str) -> float:
        """Return the seconds elapsed between an ISO-8601 timestamp and now (UTC).

        A trailing ``Z`` is accepted. A timestamp without any UTC offset is
        interpreted as UTC, because subtracting a naive datetime from an
        aware one raises ``TypeError``.

        Raises:
            ValueError: If ``timestamp`` is not a valid ISO-8601 string.
        """
        moment = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
        if moment.tzinfo is None:
            moment = moment.replace(tzinfo=timezone.utc)
        return (datetime.now(timezone.utc) - moment).total_seconds()

    def check_agent(self, agent_id: str) -> dict:
        """Check an agent's execution health.

        Returns:
            A dict with keys ``agent_id``, ``healthy`` (bool) and ``issues``
            (list of strings). Possible issues are ``NO_STATE``,
            ``HEARTBEAT_TIMEOUT``, ``LOCK_EXPIRED`` and ``STUCK_<seconds>s``.
        """
        result = {
            "agent_id": agent_id,
            "healthy": True,
            "issues": [],
        }

        state = self.gov.get_agent_state(agent_id)
        if not state:
            result["healthy"] = False
            result["issues"].append("NO_STATE")
            return result

        # Skip completed/revoked agents: they are reported as healthy.
        if state.status in (AgentStatus.COMPLETED, AgentStatus.REVOKED):
            return result

        # Check heartbeat
        if not self.gov.is_alive(agent_id):
            result["healthy"] = False
            result["issues"].append("HEARTBEAT_TIMEOUT")

        # Check lock: only a RUNNING agent is expected to hold one.
        if not self.gov.has_lock(agent_id) and state.status == AgentStatus.RUNNING:
            result["healthy"] = False
            result["issues"].append("LOCK_EXPIRED")

        # Check for stuck (no progress); skipped when no progress was recorded.
        if state.last_progress_at:
            age = self._seconds_since(state.last_progress_at)
            if age > self.stuck_threshold:
                result["healthy"] = False
                result["issues"].append(f"STUCK_{int(age)}s")

        return result

    def enforce(self, agent_id: str) -> Optional[str]:
        """Enforce health requirements, return action taken.

        Only the first actionable issue (in the order ``check_agent`` reports
        them) triggers a revocation. Issues with no handler here, such as
        ``NO_STATE``, result in no action.

        Returns:
            ``"REVOKED:HEARTBEAT"``, ``"REVOKED:LOCK"``,
            ``"REVOKED:STUCK_<seconds>s"``, or ``None`` when nothing was done.
        """
        check = self.check_agent(agent_id)
        if check["healthy"]:
            return None

        for issue in check["issues"]:
            if issue == "HEARTBEAT_TIMEOUT":
                self.gov.revoke_agent(agent_id, RevocationType.HEARTBEAT_TIMEOUT,
                                      "No heartbeat received within timeout")
                return "REVOKED:HEARTBEAT"
            elif issue == "LOCK_EXPIRED":
                self.gov.revoke_agent(agent_id, RevocationType.LOCK_EXPIRED,
                                      "Lock expired while agent was running")
                return "REVOKED:LOCK"
            elif issue.startswith("STUCK_"):
                # NOTE(review): stuck agents are revoked with the
                # HEARTBEAT_TIMEOUT type; confirm whether a dedicated
                # revocation type should be used instead.
                self.gov.revoke_agent(agent_id, RevocationType.HEARTBEAT_TIMEOUT,
                                      f"Agent stuck with no progress: {issue}")
                return f"REVOKED:{issue}"

        return None
class ComplianceWatcher:
    """
    Monitors agent compliance with procedures.

    - Checks required artifacts exist
    - Detects forbidden actions
    - Validates phase transitions

    Only the plan-artifact check is implemented in this class at present.
    """

    def __init__(self):
        self.gov = GovernanceManager()

    def check_agent(self, agent_id: str) -> dict:
        """Check an agent's compliance.

        Returns:
            A dict with keys ``agent_id``, ``compliant`` (bool) and
            ``violations`` (list of strings). Possible violations are
            ``MISSING_STATE_OR_PACKET`` and ``EXECUTE_WITHOUT_PLAN_ARTIFACT``.
        """
        result = {
            "agent_id": agent_id,
            "compliant": True,
            "violations": [],
        }

        state = self.gov.get_agent_state(agent_id)
        packet = self.gov.get_instruction_packet(agent_id)
        if not state or not packet:
            result["compliant"] = False
            result["violations"].append("MISSING_STATE_OR_PACKET")
            return result

        # Check if EXECUTE (or a later phase) was entered without PLAN artifact
        if state.phase in (AgentPhase.EXECUTE, AgentPhase.VERIFY, AgentPhase.PACKAGE):
            if not self.gov.has_required_artifact(packet.task_id, "plan"):
                result["compliant"] = False
                result["violations"].append("EXECUTE_WITHOUT_PLAN_ARTIFACT")

        # TODO: packet.constraints may carry "required_steps"; they are not
        # enforced yet. This would need more sophisticated tracking in
        # production.
        return result

    def enforce(self, agent_id: str) -> Optional[str]:
        """Enforce compliance, return action taken.

        Revokes the agent on ``EXECUTE_WITHOUT_PLAN_ARTIFACT``. Other
        violations (e.g. ``MISSING_STATE_OR_PACKET``) cause no action.

        Returns:
            ``"REVOKED:NO_PLAN_ARTIFACT"`` or ``None`` when nothing was done.
        """
        check = self.check_agent(agent_id)
        if check["compliant"]:
            return None

        for violation in check["violations"]:
            if violation == "EXECUTE_WITHOUT_PLAN_ARTIFACT":
                self.gov.revoke_agent(agent_id, RevocationType.PROCEDURE_VIOLATION,
                                      "Attempted EXECUTE phase without plan artifact")
                return "REVOKED:NO_PLAN_ARTIFACT"

        return None
class VaultLeaseWatcher:
    """
    Monitors Vault token/lease validity.

    - Checks token accessibility
    - Confirms revocation signals
    - Enforces token revocation

    As implemented here, the watcher manages a per-agent revocation signal
    key in the governance key-value store and revokes agents whose signal is
    set.
    """

    # Lifetime of a revocation signal key, in seconds.
    SIGNAL_TTL_SECONDS = 300

    def __init__(self):
        self.gov = GovernanceManager()
        self.db = self.gov.db

    @staticmethod
    def _signal_key(agent_id: str) -> str:
        """Return the store key that holds the revocation signal for an agent."""
        return f"agent:{agent_id}:revoke_signal"

    def set_revocation_signal(self, agent_id: str):
        """Set a signal that this agent should be revoked.

        The key is written with an expiry of ``SIGNAL_TTL_SECONDS``.
        """
        self.db.set(self._signal_key(agent_id), "1", ex=self.SIGNAL_TTL_SECONDS)

    def has_revocation_signal(self, agent_id: str) -> bool:
        """Check if revocation signal is set.

        The store's ``exists`` result is coerced to a real ``bool`` so the
        return value matches the annotation.
        """
        return bool(self.db.exists(self._signal_key(agent_id)))

    def clear_revocation_signal(self, agent_id: str):
        """Clear revocation signal after enforcement."""
        self.db.delete(self._signal_key(agent_id))

    def enforce(self, agent_id: str) -> Optional[str]:
        """Enforce revocation signal.

        Returns:
            ``"REVOKED:SIGNAL"`` when a signal was present and the agent was
            revoked, otherwise ``None``.
        """
        if not self.has_revocation_signal(agent_id):
            return None

        self.gov.revoke_agent(agent_id, RevocationType.MANUAL,
                              "Revocation signal received")
        # Cleared only after the revoke call returns, so a failed revoke
        # leaves the signal in place for the next pass.
        self.clear_revocation_signal(agent_id)
        return "REVOKED:SIGNAL"
class GovernanceMonitorDaemon:
    """
    Background daemon that runs all monitors periodically.
    """

    def __init__(self, interval: int = 10):
        """
        Args:
            interval: Seconds to sleep between monitoring passes.
        """
        self.interval = interval
        self.gov = GovernanceManager()
        self.execution_watcher = ExecutionWatcher()
        self.compliance_watcher = ComplianceWatcher()
        self.vault_watcher = VaultLeaseWatcher()
        self.running = False

    @staticmethod
    def _agent_id_from_key(key) -> str:
        """Extract the agent id from an ``agent:<agent_id>:state`` key.

        Accepts ``str`` or ``bytes`` keys. Only the leading ``agent:`` and the
        trailing ``:state`` are stripped, so agent ids that themselves contain
        ``:`` are returned intact.
        """
        if isinstance(key, bytes):
            key = key.decode("utf-8")
        return key.removeprefix("agent:").removesuffix(":state")

    def _still_enforceable(self, agent_id: str) -> bool:
        """Return True if the agent has state and is not currently revoked.

        State is re-read on every call so that a revocation performed by an
        earlier watcher in the same pass is seen by the later ones.
        """
        state = self.gov.get_agent_state(agent_id)
        return bool(state) and state.status != AgentStatus.REVOKED

    def get_active_agents(self) -> list[str]:
        """Get list of agents that need monitoring (status RUNNING)."""
        agents = []
        # Get all agent state keys
        for key in self.gov.db.keys("agent:*:state"):
            agent_id = self._agent_id_from_key(key)
            state = self.gov.get_agent_state(agent_id)
            if state and state.status == AgentStatus.RUNNING:
                agents.append(agent_id)
        return agents

    def run_checks(self) -> list[dict]:
        """Run all monitors once.

        For each active agent the execution watcher runs first, then the
        compliance watcher, then the vault watcher. The latter two run only
        while the agent is not revoked.

        Returns:
            One dict per agent for which at least one action was taken, with
            keys ``agent_id``, ``timestamp`` (ISO-8601, UTC) and ``actions``.
        """
        results = []
        for agent_id in self.get_active_agents():
            result = {
                "agent_id": agent_id,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "actions": [],
            }

            # Execution check
            action = self.execution_watcher.enforce(agent_id)
            if action:
                result["actions"].append(action)

            # Compliance check, then Vault check (each only if not already
            # revoked; state is re-read before each one).
            for watcher in (self.compliance_watcher, self.vault_watcher):
                if not self._still_enforceable(agent_id):
                    break
                action = watcher.enforce(agent_id)
                if action:
                    result["actions"].append(action)

            if result["actions"]:
                results.append(result)
        return results

    def run_daemon(self):
        """Run as a daemon (blocking).

        Loops until ``stop`` clears ``self.running``; the flag is checked once
        per pass, after the sleep.
        """
        self.running = True
        print(f"[MONITOR] Starting governance monitor daemon (interval: {self.interval}s)")
        while self.running:
            results = self.run_checks()
            for r in results:
                print(f"[MONITOR] {r['agent_id']}: {r['actions']}")
            time.sleep(self.interval)

    def stop(self):
        """Stop the daemon."""
        self.running = False
# =============================================================================
# CLI
# =============================================================================
def _print_usage() -> None:
    """Print the command-line usage summary."""
    print("Usage: monitors.py <command>")
    print("Commands:")
    print(" check <agent_id> - Run all checks on an agent")
    print(" daemon - Run monitor daemon")
    print(" signal <agent_id> - Send revocation signal")


def main(argv: list[str]) -> int:
    """Run the monitors CLI.

    Args:
        argv: Full argument vector, including the program name at index 0.

    Returns:
        Process exit code: 0 on success, 1 for a missing command, a missing
        ``<agent_id>`` argument, or an unknown command.
    """
    if len(argv) < 2:
        _print_usage()
        return 1

    cmd = argv[1]

    # Both of these commands need an agent id; fail with a usage message
    # rather than an IndexError when it is missing.
    if cmd in ("check", "signal") and len(argv) < 3:
        print(f"Usage: monitors.py {cmd} <agent_id>")
        return 1

    if cmd == "check":
        agent_id = argv[2]
        exec_watch = ExecutionWatcher()
        comp_watch = ComplianceWatcher()

        print(f"=== Checking Agent: {agent_id} ===\n")

        print("Execution Check:")
        result = exec_watch.check_agent(agent_id)
        print(f" Healthy: {result['healthy']}")
        print(f" Issues: {result['issues']}")

        print("\nCompliance Check:")
        result = comp_watch.check_agent(agent_id)
        print(f" Compliant: {result['compliant']}")
        print(f" Violations: {result['violations']}")
    elif cmd == "daemon":
        daemon = GovernanceMonitorDaemon(interval=10)
        try:
            daemon.run_daemon()
        except KeyboardInterrupt:
            daemon.stop()
            print("\n[MONITOR] Stopped")
    elif cmd == "signal":
        agent_id = argv[2]
        watcher = VaultLeaseWatcher()
        watcher.set_revocation_signal(agent_id)
        print(f"Revocation signal set for {agent_id}")
    else:
        print(f"Unknown command: {cmd}")
        return 1

    return 0


if __name__ == "__main__":
    import sys

    sys.exit(main(sys.argv))