agent-governance/runtime/governance.py
profit 77655c298c Initial commit: Agent Governance System Phase 8
Phase 8 Production Hardening with complete governance infrastructure:

- Vault integration with tiered policies (T0-T4)
- DragonflyDB state management
- SQLite audit ledger
- Pipeline DSL and templates
- Promotion/revocation engine
- Checkpoint system for session persistence
- Health manager and circuit breaker for fault tolerance
- GitHub/Slack integrations
- Architectural test pipeline with bug watcher, suggestion engine, council review
- Multi-agent chaos testing framework

Test Results:
- Governance tests: 68/68 passing
- E2E workflow: 16/16 passing
- Phase 2 Vault: 14/14 passing
- Integration tests: 27/27 passing

Coverage: 57.6% average across 12 phases

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 22:07:06 -05:00

713 lines
26 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Agent Runtime Governance via DragonflyDB
=========================================
Implements real-time agent control: instruction packets, state tracking,
error budgets, revocation, and handoffs.
"""
import json
import hashlib
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Optional
import redis
import subprocess
# =============================================================================
# Configuration
# =============================================================================
def get_dragonfly_client() -> redis.Redis:
    """Get a DragonflyDB client with credentials fetched from Vault.

    Reads the Vault root token from ``/opt/vault/init-keys.json``, requests the
    ``secret/data/services/dragonfly`` secret from the local Vault server via
    ``curl``, and builds the client from the ``host``, ``port`` and
    ``password`` fields of that secret.

    Returns:
        A ``redis.Redis`` client created with ``decode_responses=True``.

    Raises:
        RuntimeError: If the ``curl`` process exits with a non-zero status.
        OSError: If the Vault key file cannot be opened.
        json.JSONDecodeError: If the key file or the Vault reply is not JSON.
        KeyError: If an expected field is missing from the key file or reply.
    """
    # NOTE(review): this authenticates with the Vault *root* token, passes it
    # on the curl command line (visible in the process list) and disables TLS
    # verification with -k. Consider a scoped token and proper CA validation.
    with open("/opt/vault/init-keys.json") as f:
        token = json.load(f)["root_token"]

    result = subprocess.run(
        [
            "curl", "-sk",
            "-H", f"X-Vault-Token: {token}",
            "https://127.0.0.1:8200/v1/secret/data/services/dragonfly",
        ],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        # Without this check a failed request only shows up later as an
        # opaque JSONDecodeError raised while parsing empty stdout.
        raise RuntimeError(
            "Vault request for DragonflyDB credentials failed "
            f"(curl exit code {result.returncode}): {result.stderr.strip()}"
        )

    # KV v2 responses nest the secret payload under data.data.
    creds = json.loads(result.stdout)["data"]["data"]
    return redis.Redis(
        host=creds["host"],
        port=int(creds["port"]),
        password=creds["password"],
        decode_responses=True,
    )
# =============================================================================
# Import Core Definitions (from unified pipeline.core module)
# =============================================================================
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from pipeline.core import (
AgentPhase,
AgentStatus,
RevocationType,
ErrorBudget,
AGENT_PHASE_NAMES,
AGENT_PHASES_ORDERED,
RedisKeys,
DEFAULT_HEARTBEAT_TTL,
DEFAULT_LOCK_TTL,
)
@dataclass
class InstructionPacket:
    """Instruction packet stored for one agent working on one task.

    ``error_budget`` may be passed either as an ``ErrorBudget`` or as a plain
    dict (for example after a JSON round-trip); a dict is converted to an
    ``ErrorBudget`` during initialisation.
    """

    agent_id: str
    task_id: str
    created_for: str
    objective: str
    deliverables: list[str]
    constraints: dict
    success_criteria: list[str]
    error_budget: ErrorBudget
    escalation_rules: list[str]
    repo: str = ""
    pr_context: dict = field(default_factory=dict)
    revocation_context: dict = field(default_factory=dict)
    # ISO-8601 UTC timestamp; filled in by __post_init__ when left empty.
    created_at: str = ""

    def __post_init__(self):
        """Rehydrate a dict error budget and default the creation timestamp."""
        budget = self.error_budget
        if isinstance(budget, dict):
            self.error_budget = ErrorBudget(**budget)
        self.created_at = self.created_at or datetime.now(timezone.utc).isoformat()

    def to_dict(self) -> dict:
        """Return the packet as a plain dict with ``error_budget`` as a dict."""
        return {**asdict(self), "error_budget": asdict(self.error_budget)}
@dataclass
class AgentState:
    """Runtime state record for a single agent."""

    agent_id: str
    status: AgentStatus
    phase: AgentPhase
    step: str = ""
    started_at: str = ""
    last_progress_at: str = ""
    current_pr: Optional[int] = None
    notes: str = ""

    def to_dict(self) -> dict:
        """Return a JSON-friendly dict; enum members are reduced to ``.value``."""
        serialized = {
            "agent_id": self.agent_id,
            "status": self.status.value,
            "phase": self.phase.value,
        }
        # The remaining fields are copied through unchanged, in field order.
        passthrough = ("step", "started_at", "last_progress_at", "current_pr", "notes")
        serialized.update((name, getattr(self, name)) for name in passthrough)
        return serialized
@dataclass
class HandoffObject:
    """Handoff record passed from one agent to the next for the same task."""

    task_id: str
    previous_agent_id: str
    revoked: bool
    revocation_reason: dict
    last_known_state: dict
    what_was_tried: list[str]
    blocking_issue: str
    required_next_actions: list[str]
    constraints_reminder: list[str]
    artifacts: list[str]
    # ISO-8601 UTC timestamp; filled in by __post_init__ when left empty.
    created_at: str = ""

    def __post_init__(self):
        """Stamp the creation time unless the caller supplied one."""
        if self.created_at:
            return
        self.created_at = datetime.now(timezone.utc).isoformat()

    def to_dict(self) -> dict:
        """Return the handoff as a plain dict."""
        serialized = asdict(self)
        return serialized
# =============================================================================
# Governance Manager
# =============================================================================
class GovernanceManager:
    """
    Core runtime governance via DragonflyDB.

    Manages instruction packets, agent state, execution locks, heartbeats,
    error counters, task assignment, the revocation ledger, handoffs and
    artifacts. Data is stored under ``agent:<id>:*``, ``task:<id>:*``,
    ``handoff:<id>:latest`` and ``revocations:ledger`` keys.
    """

    def __init__(self):
        """Connect to DragonflyDB and set the default lock/heartbeat TTLs."""
        self.db = get_dragonfly_client()
        self.lock_ttl = 300  # 5 minutes default lock TTL
        self.heartbeat_ttl = 60  # 1 minute heartbeat timeout

    def _now(self) -> str:
        """Return the current UTC time as an ISO-8601 string."""
        return datetime.now(timezone.utc).isoformat()

    def _error_signature(self, error_type: str, message: str) -> str:
        """Generate a normalized error signature.

        The signature is the first 12 hex digits of the MD5 of
        ``"<error_type>:<first 100 chars of message>"`` lower-cased, so errors
        that differ only in case or beyond the 100th character collapse to
        the same signature. MD5 is used as a fingerprint here, not for security.
        """
        normalized = f"{error_type}:{message[:100]}".lower()
        return hashlib.md5(normalized.encode()).hexdigest()[:12]

    # -------------------------------------------------------------------------
    # Instruction Packets
    # -------------------------------------------------------------------------
    def create_instruction_packet(self, packet: InstructionPacket) -> bool:
        """Store an instruction packet for an agent; return True on success."""
        key = f"agent:{packet.agent_id}:packet"
        return bool(self.db.set(key, json.dumps(packet.to_dict())))

    def get_instruction_packet(self, agent_id: str) -> Optional[InstructionPacket]:
        """Retrieve an agent's instruction packet, or None if none is stored."""
        key = f"agent:{agent_id}:packet"
        data = self.db.get(key)
        if data:
            return InstructionPacket(**json.loads(data))
        return None

    # -------------------------------------------------------------------------
    # Agent State
    # -------------------------------------------------------------------------
    def set_agent_state(self, state: AgentState) -> bool:
        """Persist the agent's runtime state.

        Side effect: ``state.last_progress_at`` is overwritten with the
        current time before the state is stored.
        """
        state.last_progress_at = self._now()
        key = f"agent:{state.agent_id}:state"
        return bool(self.db.set(key, json.dumps(state.to_dict())))

    def get_agent_state(self, agent_id: str) -> Optional[AgentState]:
        """Get the agent's stored state, or None if none is stored."""
        key = f"agent:{agent_id}:state"
        data = self.db.get(key)
        if not data:
            return None
        d = json.loads(data)
        return AgentState(
            agent_id=d["agent_id"],
            status=AgentStatus(d["status"]),
            phase=AgentPhase(d["phase"]),
            step=d.get("step", ""),
            started_at=d.get("started_at", ""),
            last_progress_at=d.get("last_progress_at", ""),
            current_pr=d.get("current_pr"),
            notes=d.get("notes", ""),
        )

    def transition_phase(self, agent_id: str, phase: AgentPhase, step: str = "", notes: str = "") -> bool:
        """Move the stored state to a new phase; False if no state exists."""
        state = self.get_agent_state(agent_id)
        if not state:
            return False
        state.phase = phase
        state.step = step
        state.notes = notes
        return self.set_agent_state(state)

    # -------------------------------------------------------------------------
    # Locking
    # -------------------------------------------------------------------------
    def acquire_lock(self, agent_id: str, ttl: Optional[int] = None) -> bool:
        """Acquire the execution lock for an agent.

        Returns True if the lock was taken, False if it is already held.
        A falsy ``ttl`` (None or 0) falls back to ``self.lock_ttl``.
        """
        key = f"agent:{agent_id}:lock"
        ttl = ttl or self.lock_ttl
        # NX = only set if not exists. The client returns None when the key
        # already exists, so coerce the reply to a real bool.
        return bool(self.db.set(key, self._now(), nx=True, ex=ttl))

    def refresh_lock(self, agent_id: str, ttl: Optional[int] = None) -> bool:
        """Extend an existing lock; False if the lock no longer exists."""
        key = f"agent:{agent_id}:lock"
        ttl = ttl or self.lock_ttl
        # EXPIRE reports whether the key existed, so no separate EXISTS call
        # is needed and the check-then-act race between the two is avoided.
        return bool(self.db.expire(key, ttl))

    def release_lock(self, agent_id: str) -> bool:
        """Release the execution lock; True if a lock was actually deleted."""
        key = f"agent:{agent_id}:lock"
        return self.db.delete(key) > 0

    def has_lock(self, agent_id: str) -> bool:
        """Check if the agent's lock key currently exists."""
        key = f"agent:{agent_id}:lock"
        return bool(self.db.exists(key))

    # -------------------------------------------------------------------------
    # Heartbeat
    # -------------------------------------------------------------------------
    def heartbeat(self, agent_id: str) -> bool:
        """Write the heartbeat key; it expires after ``self.heartbeat_ttl`` seconds."""
        key = f"agent:{agent_id}:heartbeat"
        return bool(self.db.set(key, self._now(), ex=self.heartbeat_ttl))

    def is_alive(self, agent_id: str) -> bool:
        """Check if the agent's heartbeat key has not yet expired."""
        key = f"agent:{agent_id}:heartbeat"
        return bool(self.db.exists(key))

    # -------------------------------------------------------------------------
    # Error Tracking
    # -------------------------------------------------------------------------
    def record_error(self, agent_id: str, error_type: str, message: str) -> dict:
        """Record an error and return the agent's current error counts.

        Increments the total and per-signature counters and stores the last
        error's signature, time, type and message (truncated to 500 chars).
        """
        key = f"agent:{agent_id}:errors"
        sig = self._error_signature(error_type, message)
        pipe = self.db.pipeline()
        pipe.hincrby(key, "total_errors", 1)
        pipe.hincrby(key, f"same_error:{sig}", 1)
        pipe.hset(key, "last_error_signature", sig)
        pipe.hset(key, "last_error_at", self._now())
        pipe.hset(key, "last_error_type", error_type)
        pipe.hset(key, "last_error_message", message[:500])
        pipe.execute()
        return self.get_error_counts(agent_id)

    def record_procedure_violation(self, agent_id: str, violation: str) -> int:
        """Record a procedure violation and return the new violation count."""
        key = f"agent:{agent_id}:errors"
        # Batched in one pipeline, like record_error, so the counter and the
        # "last violation" fields cannot be observed half-updated.
        pipe = self.db.pipeline()
        pipe.hset(key, "last_violation", violation)
        pipe.hset(key, "last_violation_at", self._now())
        pipe.hincrby(key, "procedure_violations", 1)
        return pipe.execute()[-1]

    def get_error_counts(self, agent_id: str) -> dict:
        """Get all error counts for an agent; missing counters read as 0/""."""
        key = f"agent:{agent_id}:errors"
        data = self.db.hgetall(key)
        prefix = "same_error:"
        return {
            "total_errors": int(data.get("total_errors", 0)),
            "procedure_violations": int(data.get("procedure_violations", 0)),
            "last_error_signature": data.get("last_error_signature", ""),
            "last_error_at": data.get("last_error_at", ""),
            "last_error_type": data.get("last_error_type", ""),
            "same_error_counts": {
                k.removeprefix(prefix): int(v)
                for k, v in data.items()
                if k.startswith(prefix)
            },
        }

    def check_error_budget(self, agent_id: str) -> tuple[bool, Optional[str]]:
        """Check if the error budget is exceeded. Returns (ok, reason).

        ``ok`` is False when the agent has no instruction packet or when any
        counter has reached its limit; ``reason`` then names the limit hit.
        """
        packet = self.get_instruction_packet(agent_id)
        if not packet:
            return False, "NO_INSTRUCTION_PACKET"
        counts = self.get_error_counts(agent_id)
        budget = packet.error_budget
        # Check procedure violations
        if counts["procedure_violations"] >= budget.max_procedure_violations:
            return False, f"PROCEDURE_VIOLATIONS ({counts['procedure_violations']} >= {budget.max_procedure_violations})"
        # Check total errors
        if counts["total_errors"] >= budget.max_total_errors:
            return False, f"TOTAL_ERRORS ({counts['total_errors']} >= {budget.max_total_errors})"
        # Check same error repeats
        for sig, count in counts["same_error_counts"].items():
            if count >= budget.max_same_error_repeats:
                return False, f"SAME_ERROR_REPEATED ({sig}: {count} >= {budget.max_same_error_repeats})"
        return True, None

    # -------------------------------------------------------------------------
    # Task Management
    # -------------------------------------------------------------------------
    def assign_agent_to_task(self, task_id: str, agent_id: str) -> bool:
        """Make ``agent_id`` the task's active agent and log an ASSIGNED event."""
        # Both writes go through one pipeline so the active agent and the
        # history entry are recorded together.
        pipe = self.db.pipeline()
        pipe.set(f"task:{task_id}:active_agent", agent_id)
        pipe.rpush(f"task:{task_id}:history", json.dumps({
            "agent_id": agent_id,
            "assigned_at": self._now(),
            "event": "ASSIGNED"
        }))
        pipe.execute()
        return True

    def get_active_agent(self, task_id: str) -> Optional[str]:
        """Get the currently active agent for a task, or None."""
        return self.db.get(f"task:{task_id}:active_agent")

    def get_task_history(self, task_id: str) -> list[dict]:
        """Get the task's history events, oldest first."""
        data = self.db.lrange(f"task:{task_id}:history", 0, -1)
        return [json.loads(d) for d in data]

    # -------------------------------------------------------------------------
    # Revocation
    # -------------------------------------------------------------------------
    def revoke_agent(self, agent_id: str, reason_type: RevocationType, details: str) -> bool:
        """Revoke an agent.

        Marks the stored state (if any) as REVOKED, releases the agent's lock,
        appends an event to the revocation ledger and, when the agent has an
        instruction packet, appends a REVOKED event to that task's history.
        Always returns True.
        """
        # One timestamp for the whole revocation so the ledger entry and the
        # task-history entry agree.
        revoked_at = self._now()

        # Update state
        state = self.get_agent_state(agent_id)
        if state:
            state.status = AgentStatus.REVOKED
            state.phase = AgentPhase.REVOKED
            state.notes = f"Revoked: {reason_type.value} - {details}"
            self.set_agent_state(state)

        # Release lock
        self.release_lock(agent_id)

        # Write to revocation ledger
        revocation_event = {
            "agent_id": agent_id,
            "reason_type": reason_type.value,
            "details": details,
            "revoked_at": revoked_at
        }
        self.db.rpush("revocations:ledger", json.dumps(revocation_event))

        # Add to task history if we know the task
        packet = self.get_instruction_packet(agent_id)
        if packet:
            self.db.rpush(f"task:{packet.task_id}:history", json.dumps({
                "agent_id": agent_id,
                "event": "REVOKED",
                "reason": reason_type.value,
                "revoked_at": revoked_at
            }))
        return True

    def get_recent_revocations(self, count: int = 50) -> list[dict]:
        """Get the last ``count`` revocation events, oldest first.

        A non-positive ``count`` yields an empty list.
        """
        if count <= 0:
            # LRANGE -0..-1 is 0..-1, i.e. the *entire* ledger, and a negative
            # count would select an arbitrary tail; neither is "the last N".
            return []
        data = self.db.lrange("revocations:ledger", -count, -1)
        return [json.loads(d) for d in data]

    # -------------------------------------------------------------------------
    # Handoff
    # -------------------------------------------------------------------------
    def create_handoff(self, handoff: HandoffObject) -> bool:
        """Store the handoff as the task's latest, replacing any previous one."""
        key = f"handoff:{handoff.task_id}:latest"
        return bool(self.db.set(key, json.dumps(handoff.to_dict())))

    def get_handoff(self, task_id: str) -> Optional[HandoffObject]:
        """Get the latest handoff for a task, or None if none is stored."""
        key = f"handoff:{task_id}:latest"
        data = self.db.get(key)
        if data:
            return HandoffObject(**json.loads(data))
        return None

    # -------------------------------------------------------------------------
    # Artifacts
    # -------------------------------------------------------------------------
    def register_artifact(self, task_id: str, artifact_type: str, reference: str) -> bool:
        """Append an artifact record (type, reference, timestamp) to a task."""
        key = f"task:{task_id}:artifacts"
        artifact = {
            "type": artifact_type,
            "reference": reference,
            "created_at": self._now()
        }
        self.db.rpush(key, json.dumps(artifact))
        return True

    def get_artifacts(self, task_id: str) -> list[dict]:
        """Get all artifacts for a task, in registration order."""
        key = f"task:{task_id}:artifacts"
        data = self.db.lrange(key, 0, -1)
        return [json.loads(d) for d in data]

    def has_required_artifact(self, task_id: str, artifact_type: str) -> bool:
        """Check if at least one artifact of ``artifact_type`` is registered."""
        artifacts = self.get_artifacts(task_id)
        return any(a["type"] == artifact_type for a in artifacts)
# =============================================================================
# Worker Agent Runtime
# =============================================================================
class WorkerRuntime:
    """
    Runtime for worker agents.

    Handles bootstrap, state transitions, error reporting, and compliance.
    The runtime keeps a local copy of the agent's state in ``self.state`` and
    mirrors revocations into it, so handoffs report the revocation and a
    revoked agent cannot mark itself completed.
    """

    def __init__(self, agent_id: str):
        """Create a runtime for ``agent_id``; nothing is loaded until bootstrap()."""
        self.agent_id = agent_id
        self.gov = GovernanceManager()
        self.packet: Optional[InstructionPacket] = None
        self.state: Optional[AgentState] = None

    def bootstrap(self) -> tuple[bool, str]:
        """
        Bootstrap the agent runtime.

        Checks the recent revocation ledger, loads the instruction packet,
        reports any handoff left by a different agent, acquires the lock,
        initialises state, sends a heartbeat and assigns the agent to its task.

        Returns (success, message).
        """
        # Step 1: Read revocation ledger
        revocations = self.gov.get_recent_revocations(50)
        if revocations:
            print(f"[BOOTSTRAP] Read {len(revocations)} recent revocations")
        # Check if this agent was previously revoked
        for rev in revocations:
            if rev["agent_id"] == self.agent_id:
                return False, f"AGENT_PREVIOUSLY_REVOKED: {rev['reason_type']}"

        # Step 2: Read instruction packet
        self.packet = self.gov.get_instruction_packet(self.agent_id)
        if not self.packet:
            return False, "NO_INSTRUCTION_PACKET"
        print(f"[BOOTSTRAP] Loaded instruction packet for task: {self.packet.task_id}")

        # Step 3: Check for existing handoff
        handoff = self.gov.get_handoff(self.packet.task_id)
        if handoff and handoff.previous_agent_id != self.agent_id:
            print(f"[BOOTSTRAP] Found handoff from previous agent: {handoff.previous_agent_id}")
            print(f"[BOOTSTRAP] Revocation reason: {handoff.revocation_reason}")
            print(f"[BOOTSTRAP] Required next actions: {handoff.required_next_actions}")

        # Step 4: Acquire lock
        if not self.gov.acquire_lock(self.agent_id):
            return False, "CANNOT_ACQUIRE_LOCK"

        # Step 5: Initialize state
        self.state = AgentState(
            agent_id=self.agent_id,
            status=AgentStatus.RUNNING,
            phase=AgentPhase.BOOTSTRAP,
            step="initialized",
            started_at=datetime.now(timezone.utc).isoformat()
        )
        self.gov.set_agent_state(self.state)

        # Step 6: Start heartbeat
        self.gov.heartbeat(self.agent_id)

        # Step 7: Assign to task
        self.gov.assign_agent_to_task(self.packet.task_id, self.agent_id)
        return True, "BOOTSTRAP_COMPLETE"

    def _is_revoked(self) -> bool:
        """Return True when the local state records this agent as revoked."""
        return self.state is not None and self.state.status == AgentStatus.REVOKED

    def _revoke(self, reason_type: RevocationType, reason: Optional[str]) -> None:
        """Revoke this agent once and mirror the revocation into local state."""
        if self._is_revoked():
            # Already revoked: do not append duplicate ledger/history entries.
            return
        self.gov.revoke_agent(self.agent_id, reason_type, reason)
        if self.state:
            # Keep the local copy in step with what revoke_agent stored, so
            # create_handoff() and complete() see the revocation.
            self.state.status = AgentStatus.REVOKED
            self.state.phase = AgentPhase.REVOKED
            self.state.notes = f"Revoked: {reason_type.value} - {reason}"

    def transition(self, phase: AgentPhase, step: str = "", notes: str = "") -> bool:
        """Transition to a new phase.

        Returns False when the runtime is not bootstrapped, the agent is
        revoked, or the error budget is exceeded (which revokes the agent).
        """
        if not self.state or self._is_revoked():
            return False

        # Refresh heartbeat and lock
        self.gov.heartbeat(self.agent_id)
        self.gov.refresh_lock(self.agent_id)

        # Check error budget before transition
        ok, reason = self.gov.check_error_budget(self.agent_id)
        if not ok:
            self._revoke(RevocationType.ERROR_BUDGET_EXCEEDED, reason)
            return False

        # Update state
        self.state.phase = phase
        self.state.step = step
        self.state.notes = notes
        self.gov.set_agent_state(self.state)
        print(f"[PHASE] {phase.value} - {step}")
        return True

    def report_error(self, error_type: str, message: str) -> bool:
        """Report an error; return False if the agent is (now) revoked."""
        counts = self.gov.record_error(self.agent_id, error_type, message)
        print(f"[ERROR] {error_type}: {message}")
        print(f"[ERROR] Counts: total={counts['total_errors']}, violations={counts['procedure_violations']}")
        ok, reason = self.gov.check_error_budget(self.agent_id)
        if not ok:
            print(f"[REVOKE] Error budget exceeded: {reason}")
            self._revoke(RevocationType.ERROR_BUDGET_EXCEEDED, reason)
            return False
        return True

    def report_violation(self, violation: str) -> bool:
        """Report a procedure violation; return False if the agent is (now) revoked."""
        count = self.gov.record_procedure_violation(self.agent_id, violation)
        print(f"[VIOLATION] {violation} (count: {count})")
        ok, reason = self.gov.check_error_budget(self.agent_id)
        if not ok:
            print(f"[REVOKE] Procedure violation limit: {reason}")
            self._revoke(RevocationType.PROCEDURE_VIOLATION, reason)
            return False
        return True

    def register_artifact(self, artifact_type: str, reference: str):
        """Register a work artifact for the current task (no-op before bootstrap)."""
        if self.packet:
            self.gov.register_artifact(self.packet.task_id, artifact_type, reference)
            print(f"[ARTIFACT] Registered: {artifact_type} -> {reference}")

    def complete(self, notes: str = "") -> bool:
        """Mark work as complete and release the lock.

        Returns False, leaving the stored state untouched, when the agent has
        been revoked: a revoked agent must not overwrite its REVOKED record
        with COMPLETED.
        """
        if self._is_revoked():
            print("[COMPLETE] Skipped: agent has been revoked")
            return False
        if self.state:
            self.state.status = AgentStatus.COMPLETED
            self.state.phase = AgentPhase.EXIT
            self.state.notes = notes
            self.gov.set_agent_state(self.state)
        self.gov.release_lock(self.agent_id)
        print(f"[COMPLETE] {notes}")
        return True

    def create_handoff(self, blocking_issue: str, what_was_tried: list[str],
                       required_next_actions: list[str]) -> bool:
        """Create a handoff for the next agent.

        Returns False when the runtime has no packet or state (not
        bootstrapped); otherwise stores the handoff and returns the result.
        """
        if not self.packet or not self.state:
            return False
        handoff = HandoffObject(
            task_id=self.packet.task_id,
            previous_agent_id=self.agent_id,
            revoked=self.state.status == AgentStatus.REVOKED,
            revocation_reason={
                "type": self.state.status.value,
                "details": self.state.notes
            },
            last_known_state={
                "phase": self.state.phase.value,
                "step": self.state.step
            },
            what_was_tried=what_was_tried,
            blocking_issue=blocking_issue,
            required_next_actions=required_next_actions,
            constraints_reminder=self.packet.constraints.get("forbidden", []),
            artifacts=[a["reference"] for a in self.gov.get_artifacts(self.packet.task_id)]
        )
        return self.gov.create_handoff(handoff)
# =============================================================================
# CLI
# =============================================================================
if __name__ == "__main__":
    import sys

    # Positional arguments each command requires after the command name.
    COMMAND_ARGS = {
        "create-packet": ("agent_id", "task_id", "objective"),
        "get-state": ("agent_id",),
        "get-errors": ("agent_id",),
        "revocations": (),
        "test": (),
    }

    if len(sys.argv) < 2:
        print("Usage: governance.py <command> [args]")
        print("Commands:")
        print(" create-packet <agent_id> <task_id> <objective>")
        print(" get-state <agent_id>")
        print(" get-errors <agent_id>")
        print(" revocations")
        print(" test")
        sys.exit(1)

    cmd = sys.argv[1]
    if cmd not in COMMAND_ARGS:
        print(f"Unknown command: {cmd}")
        sys.exit(1)

    # Validate argument count up front so a missing argument produces a usage
    # message instead of an IndexError traceback.
    args = sys.argv[2:]
    required = COMMAND_ARGS[cmd]
    if len(args) < len(required):
        placeholders = " ".join(f"<{name}>" for name in required)
        print(f"Usage: governance.py {cmd} {placeholders}")
        sys.exit(1)

    # Connect only once the command line is known to be valid; constructing
    # the manager reaches out to Vault and DragonflyDB.
    gov = GovernanceManager()

    if cmd == "create-packet":
        agent_id, task_id, objective = args[0], args[1], args[2]
        packet = InstructionPacket(
            agent_id=agent_id,
            task_id=task_id,
            created_for="CLI test",
            objective=objective,
            deliverables=["plan artifacts", "run logs"],
            constraints={
                "scope": ["sandbox only"],
                "forbidden": ["no prod access", "no unrecorded root"],
                "required_steps": ["plan before apply"]
            },
            success_criteria=["CI passes", "artifacts uploaded"],
            error_budget=ErrorBudget(),
            escalation_rules=["If blocked > 20m -> escalate"]
        )
        gov.create_instruction_packet(packet)
        print(f"Created packet for {agent_id}")

    elif cmd == "get-state":
        agent_id = args[0]
        state = gov.get_agent_state(agent_id)
        if state:
            print(json.dumps(state.to_dict(), indent=2))
        else:
            print("No state found")

    elif cmd == "get-errors":
        agent_id = args[0]
        errors = gov.get_error_counts(agent_id)
        print(json.dumps(errors, indent=2))

    elif cmd == "revocations":
        revs = gov.get_recent_revocations()
        for r in revs:
            print(json.dumps(r))

    elif cmd == "test":
        print("=== Testing Governance System ===\n")

        # Create test packet
        packet = InstructionPacket(
            agent_id="test-agent-001",
            task_id="test-task-001",
            created_for="Governance test",
            objective="Test the governance system",
            deliverables=["test output"],
            constraints={
                "scope": ["sandbox"],
                "forbidden": ["prod access"],
                "required_steps": ["plan first"]
            },
            success_criteria=["test passes"],
            error_budget=ErrorBudget(max_total_errors=5, max_same_error_repeats=2),
            escalation_rules=[]
        )
        gov.create_instruction_packet(packet)
        print("[OK] Created instruction packet")

        # Test worker runtime
        worker = WorkerRuntime("test-agent-001")
        ok, msg = worker.bootstrap()
        if not ok:
            # Do not report success or keep driving a runtime that never
            # bootstrapped (e.g. the agent is already in the revocation ledger
            # from a previous run of this test).
            print(f"[FAIL] Bootstrap: {msg}")
            sys.exit(1)
        print(f"[OK] Bootstrap: {msg}")

        # Simulate work phases
        worker.transition(AgentPhase.PREFLIGHT, "scope_check")
        worker.transition(AgentPhase.PLAN, "generating_plan")
        worker.register_artifact("plan", "plan_output_001")
        worker.transition(AgentPhase.EXECUTE, "applying")

        # Simulate some errors
        worker.report_error("VALIDATION_ERROR", "Variable not defined")
        worker.report_error("VALIDATION_ERROR", "Variable not defined")  # Same error

        # Check error budget
        ok, reason = gov.check_error_budget("test-agent-001")
        print(f"[OK] Error budget check: ok={ok}, reason={reason}")

        # Complete
        worker.complete("Test completed successfully")
        print("\n=== Test Complete ===")