#!/usr/bin/env python3
"""
Agent Runtime Governance via DragonflyDB
=========================================
Implements real-time agent control: instruction packets, state tracking,
error budgets, revocation, and handoffs.
"""

import json
import hashlib
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Optional
import redis
import subprocess


# =============================================================================
# Configuration
# =============================================================================

def get_dragonfly_client() -> redis.Redis:
    """Get DragonflyDB client with credentials from Vault.

    Reads the Vault root token from ``/opt/vault/init-keys.json``, fetches the
    ``secret/data/services/dragonfly`` secret from the local Vault with
    ``curl``, and builds a client from the secret's ``host``, ``port`` and
    ``password`` fields.

    Returns:
        A ``redis.Redis`` client created with ``decode_responses=True``.

    Raises:
        RuntimeError: If ``curl`` exits non-zero, or the reply is not JSON
            carrying a ``data.data`` payload.
        OSError: If the token file cannot be opened.
    """
    # Get credentials from Vault
    with open("/opt/vault/init-keys.json") as f:
        token = json.load(f)["root_token"]

    # NOTE(review): "-k" turns off TLS certificate verification, and the root
    # token is passed on curl's argv where other local users may be able to
    # see it in the process list. Both are kept as-is here; confirm whether a
    # scoped token and a trusted CA bundle are available.
    result = subprocess.run(
        [
            "curl", "-sSk",  # -S keeps curl's error text even though -s is set
            "-H", f"X-Vault-Token: {token}",
            "https://127.0.0.1:8200/v1/secret/data/services/dragonfly",
        ],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"Vault request failed (curl exit {result.returncode}): "
            f"{result.stderr.strip()}"
        )

    try:
        creds = json.loads(result.stdout)["data"]["data"]
    except (json.JSONDecodeError, KeyError, TypeError) as exc:
        # The response body is deliberately left out of the message: on a
        # partial success it could contain secret material.
        raise RuntimeError(
            "Vault response did not contain the DragonflyDB secret"
        ) from exc

    return redis.Redis(
        host=creds["host"],
        port=int(creds["port"]),
        password=creds["password"],
        decode_responses=True,
    )


# =============================================================================
# Enums and Types
# =============================================================================

class AgentPhase(str, Enum):
    """Phase of an agent run, stored in the agent state record."""

    BOOTSTRAP = "BOOTSTRAP"
    PREFLIGHT = "PREFLIGHT"
    PLAN = "PLAN"
    EXECUTE = "EXECUTE"
    VERIFY = "VERIFY"
    PACKAGE = "PACKAGE"
    REPORT = "REPORT"
    EXIT = "EXIT"
    REVOKED = "REVOKED"


class AgentStatus(str, Enum):
    """Overall status of an agent, stored in the agent state record."""

    PENDING = "PENDING"
    RUNNING = "RUNNING"
    PAUSED = "PAUSED"
    COMPLETED = "COMPLETED"
    REVOKED = "REVOKED"
    FAILED = "FAILED"


class RevocationType(str, Enum):
    """Reason code written to the revocation ledger and task history."""

    ERROR_BUDGET_EXCEEDED = "ERROR_BUDGET_EXCEEDED"
    PROCEDURE_VIOLATION = "PROCEDURE_VIOLATION"
    FORBIDDEN_ACTION = "FORBIDDEN_ACTION"
    HEARTBEAT_TIMEOUT = "HEARTBEAT_TIMEOUT"
    LOCK_EXPIRED = "LOCK_EXPIRED"
    MANUAL = "MANUAL"
# =============================================================================
# Data Classes
# =============================================================================

@dataclass
class ErrorBudget:
    """Limits an agent may reach before its error budget check fails."""

    max_total_errors: int = 12
    max_same_error_repeats: int = 3
    max_procedure_violations: int = 1


@dataclass
class InstructionPacket:
    """Task assignment for one agent, stored as JSON by the governance layer."""

    agent_id: str
    task_id: str
    created_for: str
    objective: str
    deliverables: list[str]
    constraints: dict
    success_criteria: list[str]
    error_budget: ErrorBudget
    escalation_rules: list[str]
    repo: str = ""
    pr_context: dict = field(default_factory=dict)
    revocation_context: dict = field(default_factory=dict)
    created_at: str = ""

    def __post_init__(self):
        # Stamp creation time only when the caller did not supply one, so a
        # packet rebuilt from stored JSON keeps its original timestamp.
        if not self.created_at:
            self.created_at = datetime.now(timezone.utc).isoformat()
        # A packet rebuilt from JSON carries the budget as a plain dict.
        if isinstance(self.error_budget, dict):
            self.error_budget = ErrorBudget(**self.error_budget)

    def to_dict(self) -> dict:
        """Return the packet as a plain dict, with ``error_budget`` as a dict."""
        # asdict() recurses into nested dataclasses, so error_budget is
        # already converted; no separate pass is needed.
        return asdict(self)


@dataclass
class AgentState:
    """Runtime state of one agent."""

    agent_id: str
    status: AgentStatus
    phase: AgentPhase
    step: str = ""
    started_at: str = ""
    last_progress_at: str = ""
    current_pr: Optional[int] = None
    notes: str = ""

    def to_dict(self) -> dict:
        """Return the state as a plain dict, with enums reduced to ``.value``."""
        return {
            "agent_id": self.agent_id,
            "status": self.status.value,
            "phase": self.phase.value,
            "step": self.step,
            "started_at": self.started_at,
            "last_progress_at": self.last_progress_at,
            "current_pr": self.current_pr,
            "notes": self.notes,
        }


@dataclass
class HandoffObject:
    """Context left by one agent for whichever agent picks up the task next."""

    task_id: str
    previous_agent_id: str
    revoked: bool
    revocation_reason: dict
    last_known_state: dict
    what_was_tried: list[str]
    blocking_issue: str
    required_next_actions: list[str]
    constraints_reminder: list[str]
    artifacts: list[str]
    created_at: str = ""

    def __post_init__(self):
        # Keep a supplied timestamp (e.g. when rebuilt from stored JSON).
        if not self.created_at:
            self.created_at = datetime.now(timezone.utc).isoformat()

    def to_dict(self) -> dict:
        """Return the handoff as a plain dict."""
        return asdict(self)


# =============================================================================
# Governance Manager
# =============================================================================

class GovernanceManager:
    """
    Core runtime governance via DragonflyDB.
    Manages instruction packets, state, errors, locks, and handoffs.
    """

    def __init__(self):
        self.db = get_dragonfly_client()
        self.lock_ttl = 300      # 5 minutes default lock TTL
        self.heartbeat_ttl = 60  # 1 minute heartbeat timeout

    def _now(self) -> str:
        """Return the current UTC time as an ISO-8601 string."""
        return datetime.now(timezone.utc).isoformat()

    def _error_signature(self, error_type: str, message: str) -> str:
        """Generate a normalized error signature.

        The signature is the first 12 hex characters of the MD5 of
        ``"<error_type>:<first 100 chars of message>"`` lower-cased. MD5 is
        used only to bucket similar errors, not for security.
        """
        normalized = f"{error_type}:{message[:100]}".lower()
        return hashlib.md5(normalized.encode()).hexdigest()[:12]

    # -------------------------------------------------------------------------
    # Instruction Packets
    # -------------------------------------------------------------------------

    def create_instruction_packet(self, packet: InstructionPacket) -> bool:
        """Store an instruction packet for an agent under ``agent:<id>:packet``."""
        key = f"agent:{packet.agent_id}:packet"
        return bool(self.db.set(key, json.dumps(packet.to_dict())))

    def get_instruction_packet(self, agent_id: str) -> Optional[InstructionPacket]:
        """Retrieve an agent's instruction packet, or None if none is stored."""
        key = f"agent:{agent_id}:packet"
        data = self.db.get(key)
        if data:
            return InstructionPacket(**json.loads(data))
        return None

    # -------------------------------------------------------------------------
    # Agent State
    # -------------------------------------------------------------------------

    def set_agent_state(self, state: AgentState) -> bool:
        """Update agent's runtime state.

        Mutates ``state.last_progress_at`` to the current time before storing.
        """
        state.last_progress_at = self._now()
        key = f"agent:{state.agent_id}:state"
        return bool(self.db.set(key, json.dumps(state.to_dict())))

    def get_agent_state(self, agent_id: str) -> Optional[AgentState]:
        """Get agent's current state, or None if none is stored."""
        key = f"agent:{agent_id}:state"
        data = self.db.get(key)
        if not data:
            return None
        d = json.loads(data)
        return AgentState(
            agent_id=d["agent_id"],
            status=AgentStatus(d["status"]),
            phase=AgentPhase(d["phase"]),
            step=d.get("step", ""),
            started_at=d.get("started_at", ""),
            last_progress_at=d.get("last_progress_at", ""),
            current_pr=d.get("current_pr"),
            notes=d.get("notes", ""),
        )

    def transition_phase(self, agent_id: str, phase: AgentPhase,
                         step: str = "", notes: str = "") -> bool:
        """Transition agent to a new phase; False if the agent has no state."""
        state = self.get_agent_state(agent_id)
        if not state:
            return False
        state.phase = phase
        state.step = step
        state.notes = notes
        return self.set_agent_state(state)

    # -------------------------------------------------------------------------
    # Locking
    # -------------------------------------------------------------------------

    def acquire_lock(self, agent_id: str, ttl: Optional[int] = None) -> bool:
        """Acquire execution lock for an agent.

        Args:
            agent_id: Agent whose lock key (``agent:<id>:lock``) is set.
            ttl: Lock lifetime in seconds; None or 0 falls back to
                ``self.lock_ttl``.

        Returns:
            True if the lock was newly created, False if it already existed.
        """
        key = f"agent:{agent_id}:lock"
        ttl = ttl or self.lock_ttl
        # NX = only set if not exists. redis-py reports a refused NX write as
        # None, so coerce to honour the declared bool return type.
        return bool(self.db.set(key, self._now(), nx=True, ex=ttl))

    def refresh_lock(self, agent_id: str, ttl: Optional[int] = None) -> bool:
        """Refresh an existing lock; False if there is no lock to refresh."""
        key = f"agent:{agent_id}:lock"
        ttl = ttl or self.lock_ttl
        # EXPIRE already reports a missing key, so a separate EXISTS check
        # would only add a window in which the lock can vanish in between.
        return bool(self.db.expire(key, ttl))

    def release_lock(self, agent_id: str) -> bool:
        """Release execution lock; True if a lock key was actually deleted."""
        key = f"agent:{agent_id}:lock"
        return self.db.delete(key) > 0

    def has_lock(self, agent_id: str) -> bool:
        """Check if agent has a valid lock."""
        key = f"agent:{agent_id}:lock"
        # EXISTS yields a count, not a bool.
        return bool(self.db.exists(key))

    # -------------------------------------------------------------------------
    # Heartbeat
    # -------------------------------------------------------------------------

    def heartbeat(self, agent_id: str) -> bool:
        """Update agent heartbeat; the key expires after ``heartbeat_ttl`` s."""
        key = f"agent:{agent_id}:heartbeat"
        return bool(self.db.set(key, self._now(), ex=self.heartbeat_ttl))

    def is_alive(self, agent_id: str) -> bool:
        """Check if agent heartbeat is fresh (its key has not expired)."""
        key = f"agent:{agent_id}:heartbeat"
        return bool(self.db.exists(key))

    # -------------------------------------------------------------------------
    # Error Tracking
    # -------------------------------------------------------------------------

    def record_error(self, agent_id: str, error_type: str, message: str) -> dict:
        """Record an error and return current counts.

        Increments the total and the per-signature counter and stores the
        last error's signature, time, type and message (truncated to 500
        characters) in the ``agent:<id>:errors`` hash.
        """
        key = f"agent:{agent_id}:errors"
        sig = self._error_signature(error_type, message)

        pipe = self.db.pipeline()
        pipe.hincrby(key, "total_errors", 1)
        pipe.hincrby(key, f"same_error:{sig}", 1)
        pipe.hset(key, "last_error_signature", sig)
        pipe.hset(key, "last_error_at", self._now())
        pipe.hset(key, "last_error_type", error_type)
        pipe.hset(key, "last_error_message", message[:500])
        pipe.execute()

        return self.get_error_counts(agent_id)

    def record_procedure_violation(self, agent_id: str, violation: str) -> int:
        """Record a procedure violation and return the new violation count."""
        key = f"agent:{agent_id}:errors"
        # One pipeline, as in record_error, so the description, timestamp and
        # counter are sent together instead of in three round-trips.
        pipe = self.db.pipeline()
        pipe.hset(key, "last_violation", violation)
        pipe.hset(key, "last_violation_at", self._now())
        pipe.hincrby(key, "procedure_violations", 1)
        return pipe.execute()[-1]

    def get_error_counts(self, agent_id: str) -> dict:
        """Get all error counts for an agent.

        Returns a dict with ``total_errors``, ``procedure_violations``,
        ``last_error_signature``, ``last_error_at``, ``last_error_type`` and
        ``same_error_counts`` (signature -> count). Missing fields default to
        0 or the empty string.
        """
        key = f"agent:{agent_id}:errors"
        data = self.db.hgetall(key)
        prefix = "same_error:"
        return {
            "total_errors": int(data.get("total_errors", 0)),
            "procedure_violations": int(data.get("procedure_violations", 0)),
            "last_error_signature": data.get("last_error_signature", ""),
            "last_error_at": data.get("last_error_at", ""),
            "last_error_type": data.get("last_error_type", ""),
            "same_error_counts": {
                k[len(prefix):]: int(v)
                for k, v in data.items()
                if k.startswith(prefix)
            },
        }

    def check_error_budget(self, agent_id: str) -> tuple[bool, Optional[str]]:
        """Check if error budget is exceeded. Returns (ok, reason).

        ``ok`` is False when the agent has no instruction packet or when any
        counter has reached (``>=``) its limit; ``reason`` then names the
        first limit hit. Violations are checked first, then total errors,
        then per-signature repeats.
        """
        packet = self.get_instruction_packet(agent_id)
        if not packet:
            return False, "NO_INSTRUCTION_PACKET"

        counts = self.get_error_counts(agent_id)
        budget = packet.error_budget

        # Check procedure violations
        if counts["procedure_violations"] >= budget.max_procedure_violations:
            return False, f"PROCEDURE_VIOLATIONS ({counts['procedure_violations']} >= {budget.max_procedure_violations})"

        # Check total errors
        if counts["total_errors"] >= budget.max_total_errors:
            return False, f"TOTAL_ERRORS ({counts['total_errors']} >= {budget.max_total_errors})"

        # Check same error repeats
        for sig, count in counts["same_error_counts"].items():
            if count >= budget.max_same_error_repeats:
                return False, f"SAME_ERROR_REPEATED ({sig}: {count} >= {budget.max_same_error_repeats})"

        return True, None

    # -------------------------------------------------------------------------
    # Task Management
    # -------------------------------------------------------------------------

    def assign_agent_to_task(self, task_id: str, agent_id: str) -> bool:
        """Assign an agent to a task and append an ASSIGNED history event."""
        # Set active agent
        self.db.set(f"task:{task_id}:active_agent", agent_id)

        # Add to history
        self.db.rpush(f"task:{task_id}:history", json.dumps({
            "agent_id": agent_id,
            "assigned_at": self._now(),
            "event": "ASSIGNED",
        }))
        return True

    def get_active_agent(self, task_id: str) -> Optional[str]:
        """Get the currently active agent for a task"""
        return self.db.get(f"task:{task_id}:active_agent")

    def get_task_history(self, task_id: str) -> list[dict]:
        """Get task agent history (oldest event first)."""
        data = self.db.lrange(f"task:{task_id}:history", 0, -1)
        return [json.loads(d) for d in data]

    # -------------------------------------------------------------------------
    # Revocation
    # -------------------------------------------------------------------------

    def revoke_agent(self, agent_id: str, reason_type: RevocationType,
                     details: str) -> bool:
        """Revoke an agent's access.

        Marks any stored state as REVOKED, releases the agent's lock, appends
        an event to ``revocations:ledger`` and, when the agent has an
        instruction packet, to that task's history. Always returns True.
        """
        # Update state
        state = self.get_agent_state(agent_id)
        if state:
            state.status = AgentStatus.REVOKED
            state.phase = AgentPhase.REVOKED
            state.notes = f"Revoked: {reason_type.value} - {details}"
            self.set_agent_state(state)

        # Release lock
        self.release_lock(agent_id)

        # Write to revocation ledger
        revocation_event = {
            "agent_id": agent_id,
            "reason_type": reason_type.value,
            "details": details,
            "revoked_at": self._now(),
        }
        self.db.rpush("revocations:ledger", json.dumps(revocation_event))

        # Add to task history if we know the task
        packet = self.get_instruction_packet(agent_id)
        if packet:
            self.db.rpush(f"task:{packet.task_id}:history", json.dumps({
                "agent_id": agent_id,
                "event": "REVOKED",
                "reason": reason_type.value,
                "revoked_at": self._now(),
            }))

        return True

    def get_recent_revocations(self, count: int = 50) -> list[dict]:
        """Get recent revocation events (the last ``count`` ledger entries)."""
        # LRANGE -0 .. -1 is the same as 0 .. -1, i.e. the whole ledger, so a
        # non-positive count must be answered without querying.
        if count <= 0:
            return []
        data = self.db.lrange("revocations:ledger", -count, -1)
        return [json.loads(d) for d in data]

    # -------------------------------------------------------------------------
    # Handoff
    # -------------------------------------------------------------------------

    def create_handoff(self, handoff: HandoffObject) -> bool:
        """Create a handoff object for task continuity.

        Overwrites any previous handoff stored at ``handoff:<task_id>:latest``.
        """
        key = f"handoff:{handoff.task_id}:latest"
        return bool(self.db.set(key, json.dumps(handoff.to_dict())))

    def get_handoff(self, task_id: str) -> Optional[HandoffObject]:
        """Get the latest handoff for a task, or None if none is stored."""
        key = f"handoff:{task_id}:latest"
        data = self.db.get(key)
        if data:
            return HandoffObject(**json.loads(data))
        return None

    # -------------------------------------------------------------------------
    # Artifacts
    # -------------------------------------------------------------------------

    def register_artifact(self, task_id: str, artifact_type: str,
                          reference: str) -> bool:
        """Register an artifact for a task"""
        key = f"task:{task_id}:artifacts"
        artifact = {
            "type": artifact_type,
            "reference": reference,
            "created_at": self._now(),
        }
        self.db.rpush(key, json.dumps(artifact))
        return True

    def get_artifacts(self, task_id: str) -> list[dict]:
        """Get all artifacts for a task (in registration order)."""
        key = f"task:{task_id}:artifacts"
        data = self.db.lrange(key, 0, -1)
        return [json.loads(d) for d in data]

    def has_required_artifact(self, task_id: str, artifact_type: str) -> bool:
        """Check if a required artifact exists"""
        artifacts = self.get_artifacts(task_id)
        return any(a["type"] == artifact_type for a in artifacts)


# =============================================================================
# Worker Agent Runtime
# =============================================================================

class WorkerRuntime:
    """
    Runtime for worker agents.
    Handles bootstrap, state transitions, error reporting, and compliance.
    """

    def __init__(self, agent_id: str):
        self.agent_id = agent_id
        self.gov = GovernanceManager()
        self.packet: Optional[InstructionPacket] = None
        self.state: Optional[AgentState] = None

    def _revoke(self, reason_type: RevocationType, reason: Optional[str]) -> None:
        """Revoke this agent once and mirror the revoked state locally.

        Without the local sync, ``self.state`` would still say RUNNING after
        the store says REVOKED, so a later ``complete()`` would overwrite the
        revocation and ``create_handoff()`` would report ``revoked=False``.
        """
        if self.state and self.state.status == AgentStatus.REVOKED:
            return  # already revoked; avoid duplicate ledger entries
        self.gov.revoke_agent(self.agent_id, reason_type, reason)
        refreshed = self.gov.get_agent_state(self.agent_id)
        if refreshed:
            self.state = refreshed
        elif self.state:
            self.state.status = AgentStatus.REVOKED
            self.state.phase = AgentPhase.REVOKED

    def bootstrap(self) -> tuple[bool, str]:
        """
        Bootstrap the agent runtime.
        Returns (success, message)
        """
        # Step 1: Read revocation ledger
        revocations = self.gov.get_recent_revocations(50)
        if revocations:
            print(f"[BOOTSTRAP] Read {len(revocations)} recent revocations")
            # Check if this agent was previously revoked
            for rev in revocations:
                if rev["agent_id"] == self.agent_id:
                    return False, f"AGENT_PREVIOUSLY_REVOKED: {rev['reason_type']}"

        # Step 2: Read instruction packet
        self.packet = self.gov.get_instruction_packet(self.agent_id)
        if not self.packet:
            return False, "NO_INSTRUCTION_PACKET"

        print(f"[BOOTSTRAP] Loaded instruction packet for task: {self.packet.task_id}")

        # Step 3: Check for existing handoff
        handoff = self.gov.get_handoff(self.packet.task_id)
        if handoff and handoff.previous_agent_id != self.agent_id:
            print(f"[BOOTSTRAP] Found handoff from previous agent: {handoff.previous_agent_id}")
            print(f"[BOOTSTRAP] Revocation reason: {handoff.revocation_reason}")
            print(f"[BOOTSTRAP] Required next actions: {handoff.required_next_actions}")

        # Step 4: Acquire lock
        if not self.gov.acquire_lock(self.agent_id):
            return False, "CANNOT_ACQUIRE_LOCK"

        # Step 5: Initialize state
        self.state = AgentState(
            agent_id=self.agent_id,
            status=AgentStatus.RUNNING,
            phase=AgentPhase.BOOTSTRAP,
            step="initialized",
            started_at=datetime.now(timezone.utc).isoformat(),
        )
        self.gov.set_agent_state(self.state)

        # Step 6: Start heartbeat
        self.gov.heartbeat(self.agent_id)

        # Step 7: Assign to task
        self.gov.assign_agent_to_task(self.packet.task_id, self.agent_id)

        return True, "BOOTSTRAP_COMPLETE"

    def transition(self, phase: AgentPhase, step: str = "", notes: str = "") -> bool:
        """Transition to a new phase.

        Returns False when the runtime is not bootstrapped, the agent is
        already revoked, or the error budget check fails (which revokes it).
        """
        if not self.state:
            return False
        if self.state.status == AgentStatus.REVOKED:
            return False

        # Refresh heartbeat and lock
        self.gov.heartbeat(self.agent_id)
        self.gov.refresh_lock(self.agent_id)

        # Check error budget before transition
        ok, reason = self.gov.check_error_budget(self.agent_id)
        if not ok:
            self._revoke(RevocationType.ERROR_BUDGET_EXCEEDED, reason)
            return False

        # Update state
        self.state.phase = phase
        self.state.step = step
        self.state.notes = notes
        self.gov.set_agent_state(self.state)

        print(f"[PHASE] {phase.value} - {step}")
        return True

    def report_error(self, error_type: str, message: str) -> bool:
        """Report an error and check if we should continue.

        Returns False (after revoking the agent) once the budget is spent.
        """
        counts = self.gov.record_error(self.agent_id, error_type, message)

        print(f"[ERROR] {error_type}: {message}")
        print(f"[ERROR] Counts: total={counts['total_errors']}, violations={counts['procedure_violations']}")

        ok, reason = self.gov.check_error_budget(self.agent_id)
        if not ok:
            print(f"[REVOKE] Error budget exceeded: {reason}")
            self._revoke(RevocationType.ERROR_BUDGET_EXCEEDED, reason)
            return False

        return True

    def report_violation(self, violation: str) -> bool:
        """Report a procedure violation.

        Returns False (after revoking the agent) once the budget is spent.
        """
        count = self.gov.record_procedure_violation(self.agent_id, violation)

        print(f"[VIOLATION] {violation} (count: {count})")

        ok, reason = self.gov.check_error_budget(self.agent_id)
        if not ok:
            print(f"[REVOKE] Procedure violation limit: {reason}")
            self._revoke(RevocationType.PROCEDURE_VIOLATION, reason)
            return False

        return True

    def register_artifact(self, artifact_type: str, reference: str):
        """Register a work artifact (no-op until a packet has been loaded)."""
        if self.packet:
            self.gov.register_artifact(self.packet.task_id, artifact_type, reference)
            print(f"[ARTIFACT] Registered: {artifact_type} -> {reference}")

    def complete(self, notes: str = "") -> bool:
        """Mark work as complete.

        Returns False without touching stored state when the agent has been
        revoked, so a revocation is never overwritten by COMPLETED.
        """
        if self.state and self.state.status == AgentStatus.REVOKED:
            print(f"[COMPLETE] Refused: agent {self.agent_id} was revoked")
            return False

        if self.state:
            self.state.status = AgentStatus.COMPLETED
            self.state.phase = AgentPhase.EXIT
            self.state.notes = notes
            self.gov.set_agent_state(self.state)

        self.gov.release_lock(self.agent_id)
        print(f"[COMPLETE] {notes}")
        return True

    def create_handoff(self, blocking_issue: str, what_was_tried: list[str],
                       required_next_actions: list[str]) -> bool:
        """Create a handoff for the next agent.

        Returns False if the runtime has no packet or state yet.
        """
        if not self.packet or not self.state:
            return False

        handoff = HandoffObject(
            task_id=self.packet.task_id,
            previous_agent_id=self.agent_id,
            revoked=self.state.status == AgentStatus.REVOKED,
            revocation_reason={
                "type": self.state.status.value,
                "details": self.state.notes,
            },
            last_known_state={
                "phase": self.state.phase.value,
                "step": self.state.step,
            },
            what_was_tried=what_was_tried,
            blocking_issue=blocking_issue,
            required_next_actions=required_next_actions,
            constraints_reminder=self.packet.constraints.get("forbidden", []),
            artifacts=[a["reference"] for a in self.gov.get_artifacts(self.packet.task_id)],
        )

        return self.gov.create_handoff(handoff)


# =============================================================================
# CLI
# =============================================================================

def _print_usage() -> None:
    """Print the CLI usage summary."""
    print("Usage: governance.py <command> [args]")
    print("Commands:")
    print("  create-packet <agent_id> <task_id> <objective>")
    print("  get-state <agent_id>")
    print("  get-errors <agent_id>")
    print("  revocations")
    print("  test")


def _run_self_test(gov: GovernanceManager) -> int:
    """Run the end-to-end smoke test against the live store; 0 on success."""
    print("=== Testing Governance System ===\n")

    # Create test packet
    packet = InstructionPacket(
        agent_id="test-agent-001",
        task_id="test-task-001",
        created_for="Governance test",
        objective="Test the governance system",
        deliverables=["test output"],
        constraints={
            "scope": ["sandbox"],
            "forbidden": ["prod access"],
            "required_steps": ["plan first"],
        },
        success_criteria=["test passes"],
        error_budget=ErrorBudget(max_total_errors=5, max_same_error_repeats=2),
        escalation_rules=[],
    )
    gov.create_instruction_packet(packet)
    print("[OK] Created instruction packet")

    # Test worker runtime
    worker = WorkerRuntime("test-agent-001")
    ok, msg = worker.bootstrap()
    if not ok:
        # Nothing below can work without a bootstrapped runtime.
        print(f"[FAIL] Bootstrap: {msg}")
        return 1
    print(f"[OK] Bootstrap: {msg}")

    # Simulate work phases
    worker.transition(AgentPhase.PREFLIGHT, "scope_check")
    worker.transition(AgentPhase.PLAN, "generating_plan")
    worker.register_artifact("plan", "plan_output_001")
    worker.transition(AgentPhase.EXECUTE, "applying")

    # Simulate some errors. The second identical error reaches the
    # max_same_error_repeats=2 limit configured above, so the budget check
    # below is expected to report a failure and the agent to be revoked.
    worker.report_error("VALIDATION_ERROR", "Variable not defined")
    worker.report_error("VALIDATION_ERROR", "Variable not defined")  # Same error

    # Check error budget
    ok, reason = gov.check_error_budget("test-agent-001")
    print(f"[OK] Error budget check: ok={ok}, reason={reason}")

    # Complete
    worker.complete("Test completed successfully")

    print("\n=== Test Complete ===")
    return 0


def main(argv: Optional[list[str]] = None) -> int:
    """Run the governance CLI and return a process exit code.

    Args:
        argv: Arguments after the program name; defaults to ``sys.argv[1:]``.

    Returns:
        0 on success, 1 for a missing/unknown command or missing arguments.
    """
    import sys

    args = sys.argv[1:] if argv is None else list(argv)
    # Number of positional arguments each command needs.
    required_args = {
        "create-packet": 3,
        "get-state": 1,
        "get-errors": 1,
        "revocations": 0,
        "test": 0,
    }

    if not args:
        _print_usage()
        return 1

    cmd, params = args[0], args[1:]
    if cmd not in required_args:
        print(f"Unknown command: {cmd}")
        return 1
    if len(params) < required_args[cmd]:
        _print_usage()
        return 1

    # Connect only after the command line is known to be valid, so usage
    # errors do not require Vault or the database to be reachable.
    gov = GovernanceManager()

    if cmd == "create-packet":
        agent_id, task_id, objective = params[0], params[1], params[2]
        packet = InstructionPacket(
            agent_id=agent_id,
            task_id=task_id,
            created_for="CLI test",
            objective=objective,
            deliverables=["plan artifacts", "run logs"],
            constraints={
                "scope": ["sandbox only"],
                "forbidden": ["no prod access", "no unrecorded root"],
                "required_steps": ["plan before apply"],
            },
            success_criteria=["CI passes", "artifacts uploaded"],
            error_budget=ErrorBudget(),
            escalation_rules=["If blocked > 20m -> escalate"],
        )
        gov.create_instruction_packet(packet)
        print(f"Created packet for {agent_id}")

    elif cmd == "get-state":
        state = gov.get_agent_state(params[0])
        if state:
            print(json.dumps(state.to_dict(), indent=2))
        else:
            print("No state found")

    elif cmd == "get-errors":
        errors = gov.get_error_counts(params[0])
        print(json.dumps(errors, indent=2))

    elif cmd == "revocations":
        for r in gov.get_recent_revocations():
            print(json.dumps(r))

    elif cmd == "test":
        return _run_self_test(gov)

    return 0


if __name__ == "__main__":
    raise SystemExit(main())