agent-governance/runtime/governance.py

#!/usr/bin/env python3
"""
Agent Runtime Governance via DragonflyDB
=========================================
Implements real-time agent control: instruction packets, state tracking,
error budgets, revocation, and handoffs.
"""

import json
import hashlib
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Optional
import redis
import subprocess


# =============================================================================
# Configuration
# =============================================================================

def get_dragonfly_client() -> redis.Redis:
    """Build a DragonflyDB client using credentials stored in Vault.

    Reads the Vault root token from ``/opt/vault/init-keys.json``, asks the
    local Vault (``https://127.0.0.1:8200``) for the
    ``secret/data/services/dragonfly`` secret via ``curl``, and creates a
    client from the ``host``, ``port`` and ``password`` fields of the reply.

    Returns:
        A ``redis.Redis`` client created with ``decode_responses=True``.

    Raises:
        OSError: If the Vault key file cannot be opened.
        RuntimeError: If ``curl`` exits non-zero, or the reply is not JSON
            containing the expected credential fields.
    """
    with open("/opt/vault/init-keys.json", encoding="utf-8") as f:
        token = json.load(f)["root_token"]

    # NOTE(review): "-k" disables TLS certificate verification and the token
    # is passed on the command line, where it is visible in the process list.
    # "-S" keeps curl quiet on success but lets it report failures on stderr.
    result = subprocess.run([
        "curl", "-sSk",
        "-H", f"X-Vault-Token: {token}",
        "https://127.0.0.1:8200/v1/secret/data/services/dragonfly"
    ], capture_output=True, text=True)

    if result.returncode != 0:
        raise RuntimeError(
            f"Vault request failed (curl exit {result.returncode}): "
            f"{result.stderr.strip()}"
        )

    try:
        creds = json.loads(result.stdout)["data"]["data"]
        host = creds["host"]
        port = int(creds["port"])
        password = creds["password"]
    except (ValueError, KeyError, TypeError) as exc:
        # The reply body is deliberately not echoed: it may contain secrets.
        raise RuntimeError(
            "Vault response did not contain DragonflyDB credentials"
        ) from exc

    return redis.Redis(
        host=host,
        port=port,
        password=password,
        decode_responses=True
    )


# =============================================================================
# Import Core Definitions (from unified pipeline.core module)
# =============================================================================

import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))

from pipeline.core import (
    AgentPhase,
    AgentStatus,
    RevocationType,
    ErrorBudget,
    AGENT_PHASE_NAMES,
    AGENT_PHASES_ORDERED,
    RedisKeys,
    DEFAULT_HEARTBEAT_TTL,
    DEFAULT_LOCK_TTL,
)


@dataclass
class InstructionPacket:
    """Instruction record for one agent working on one task.

    ``error_budget`` may be passed either as an ``ErrorBudget`` or as a plain
    dict of its fields; a dict is converted during initialisation.
    ``created_at`` is filled with the current UTC time (ISO 8601) when the
    caller leaves it empty.
    """

    agent_id: str
    task_id: str
    created_for: str
    objective: str
    deliverables: list[str]
    constraints: dict
    success_criteria: list[str]
    error_budget: ErrorBudget
    escalation_rules: list[str]
    repo: str = ""
    pr_context: dict = field(default_factory=dict)
    revocation_context: dict = field(default_factory=dict)
    created_at: str = ""

    def __post_init__(self):
        # Keep a caller-supplied timestamp; otherwise stamp "now" in UTC.
        self.created_at = self.created_at or datetime.now(timezone.utc).isoformat()

        # Accept the budget in dict form and promote it to an ErrorBudget.
        budget = self.error_budget
        if isinstance(budget, dict):
            self.error_budget = ErrorBudget(**budget)

    def to_dict(self) -> dict:
        """Return the packet as a plain dict, with the budget as a dict too."""
        return {**asdict(self), "error_budget": asdict(self.error_budget)}


@dataclass
class AgentState:
    """Runtime state of a single agent: status, phase and progress details."""

    agent_id: str
    status: AgentStatus
    phase: AgentPhase
    step: str = ""
    started_at: str = ""
    last_progress_at: str = ""
    current_pr: Optional[int] = None
    notes: str = ""

    def to_dict(self) -> dict:
        """Return the state as a dict, with the enums replaced by ``.value``."""
        field_names = (
            "agent_id",
            "status",
            "phase",
            "step",
            "started_at",
            "last_progress_at",
            "current_pr",
            "notes",
        )
        payload = {name: getattr(self, name) for name in field_names}
        # Enum members are stored by their underlying value.
        payload["status"] = self.status.value
        payload["phase"] = self.phase.value
        return payload


@dataclass
class HandoffObject:
    """Record passed on for a task after an agent stops working on it.

    ``created_at`` is filled with the current UTC time (ISO 8601) when the
    caller leaves it empty.
    """

    task_id: str
    previous_agent_id: str
    revoked: bool
    revocation_reason: dict
    last_known_state: dict
    what_was_tried: list[str]
    blocking_issue: str
    required_next_actions: list[str]
    constraints_reminder: list[str]
    artifacts: list[str]
    created_at: str = ""

    def __post_init__(self):
        # Keep a caller-supplied timestamp; otherwise stamp "now" in UTC.
        self.created_at = self.created_at or datetime.now(timezone.utc).isoformat()

    def to_dict(self) -> dict:
        """Return every field of the handoff as a plain dict."""
        as_mapping = asdict(self)
        return as_mapping


# =============================================================================
# Governance Manager
# =============================================================================

class GovernanceManager:
    """
    Core runtime governance via DragonflyDB.
    Manages instruction packets, state, errors, locks, and handoffs.

    Keys written by this class:
      agent:<id>:packet      JSON instruction packet
      agent:<id>:state       JSON agent state
      agent:<id>:lock        execution lock (expires after ``lock_ttl``)
      agent:<id>:heartbeat   liveness marker (expires after ``heartbeat_ttl``)
      agent:<id>:errors      hash of error counters and last-error details
      task:<id>:active_agent / task:<id>:history / task:<id>:artifacts
      handoff:<task_id>:latest
      revocations:ledger     list of JSON revocation events
    """

    def __init__(self, db: Optional[redis.Redis] = None):
        """Create a manager.

        Args:
            db: Client to use. When omitted, one is built with
                ``get_dragonfly_client()``. The code assumes the client
                returns ``str`` values (``decode_responses=True``).
        """
        self.db = db if db is not None else get_dragonfly_client()
        self.lock_ttl = 300  # 5 minutes default lock TTL
        self.heartbeat_ttl = 60  # 1 minute heartbeat timeout

    def _now(self) -> str:
        """Return the current UTC time as an ISO 8601 string."""
        return datetime.now(timezone.utc).isoformat()

    def _error_signature(self, error_type: str, message: str) -> str:
        """Generate a normalized error signature.

        The signature is the first 12 hex characters of the MD5 of the
        lower-cased ``"<error_type>:<first 100 chars of message>"``. MD5 is
        used only to group identical errors, not for security.
        """
        normalized = f"{error_type}:{message[:100]}".lower()
        return hashlib.md5(normalized.encode()).hexdigest()[:12]

    # -------------------------------------------------------------------------
    # Instruction Packets
    # -------------------------------------------------------------------------

    def create_instruction_packet(self, packet: InstructionPacket) -> bool:
        """Store an instruction packet for an agent. Returns True on success."""
        key = f"agent:{packet.agent_id}:packet"
        return bool(self.db.set(key, json.dumps(packet.to_dict())))

    def get_instruction_packet(self, agent_id: str) -> Optional[InstructionPacket]:
        """Retrieve an agent's instruction packet, or None if none is stored."""
        key = f"agent:{agent_id}:packet"
        data = self.db.get(key)
        if data:
            return InstructionPacket(**json.loads(data))
        return None

    # -------------------------------------------------------------------------
    # Agent State
    # -------------------------------------------------------------------------

    def set_agent_state(self, state: AgentState) -> bool:
        """Persist an agent's runtime state.

        Side effect: ``state.last_progress_at`` is set to the current time
        before the state is written.
        """
        state.last_progress_at = self._now()
        key = f"agent:{state.agent_id}:state"
        return bool(self.db.set(key, json.dumps(state.to_dict())))

    def get_agent_state(self, agent_id: str) -> Optional[AgentState]:
        """Get an agent's stored state, or None if none is stored."""
        key = f"agent:{agent_id}:state"
        data = self.db.get(key)
        if data:
            d = json.loads(data)
            return AgentState(
                agent_id=d["agent_id"],
                status=AgentStatus(d["status"]),
                phase=AgentPhase(d["phase"]),
                step=d.get("step", ""),
                started_at=d.get("started_at", ""),
                last_progress_at=d.get("last_progress_at", ""),
                current_pr=d.get("current_pr"),
                notes=d.get("notes", "")
            )
        return None

    def transition_phase(self, agent_id: str, phase: AgentPhase, step: str = "", notes: str = "") -> bool:
        """Move an agent's stored state to a new phase.

        Returns False when the agent has no stored state.
        """
        state = self.get_agent_state(agent_id)
        if not state:
            return False

        state.phase = phase
        state.step = step
        state.notes = notes
        return self.set_agent_state(state)

    # -------------------------------------------------------------------------
    # Locking
    # -------------------------------------------------------------------------

    def acquire_lock(self, agent_id: str, ttl: Optional[int] = None) -> bool:
        """Acquire the execution lock for an agent.

        Args:
            agent_id: Agent whose lock key is set.
            ttl: Lock lifetime in seconds; a falsy value uses ``lock_ttl``.

        Returns:
            True if the lock was acquired, False if it is already held.
        """
        key = f"agent:{agent_id}:lock"
        ttl = ttl or self.lock_ttl
        # NX = only set if not exists; the client returns None when it does.
        return bool(self.db.set(key, self._now(), nx=True, ex=ttl))

    def refresh_lock(self, agent_id: str, ttl: Optional[int] = None) -> bool:
        """Extend an existing lock. Returns False if no lock is held."""
        key = f"agent:{agent_id}:lock"
        ttl = ttl or self.lock_ttl
        # EXPIRE alone reports whether the key existed, so no separate EXISTS
        # check is needed (and the check-then-act race goes away).
        return bool(self.db.expire(key, ttl))

    def release_lock(self, agent_id: str) -> bool:
        """Release the execution lock. Returns True if a lock was removed."""
        key = f"agent:{agent_id}:lock"
        return self.db.delete(key) > 0

    def has_lock(self, agent_id: str) -> bool:
        """Check whether the agent's lock key currently exists."""
        key = f"agent:{agent_id}:lock"
        return bool(self.db.exists(key))

    # -------------------------------------------------------------------------
    # Heartbeat
    # -------------------------------------------------------------------------

    def heartbeat(self, agent_id: str) -> bool:
        """Write the agent's heartbeat key, expiring after ``heartbeat_ttl``."""
        key = f"agent:{agent_id}:heartbeat"
        return bool(self.db.set(key, self._now(), ex=self.heartbeat_ttl))

    def is_alive(self, agent_id: str) -> bool:
        """Check whether the agent's heartbeat key has not yet expired."""
        key = f"agent:{agent_id}:heartbeat"
        return bool(self.db.exists(key))

    # -------------------------------------------------------------------------
    # Error Tracking
    # -------------------------------------------------------------------------

    def record_error(self, agent_id: str, error_type: str, message: str) -> dict:
        """Record an error and return the agent's current counts.

        Increments the total and per-signature counters and stores the
        details of this error (message truncated to 500 characters).
        """
        key = f"agent:{agent_id}:errors"
        sig = self._error_signature(error_type, message)

        pipe = self.db.pipeline()
        pipe.hincrby(key, "total_errors", 1)
        pipe.hincrby(key, f"same_error:{sig}", 1)
        pipe.hset(key, "last_error_signature", sig)
        pipe.hset(key, "last_error_at", self._now())
        pipe.hset(key, "last_error_type", error_type)
        pipe.hset(key, "last_error_message", message[:500])
        pipe.execute()

        return self.get_error_counts(agent_id)

    def record_procedure_violation(self, agent_id: str, violation: str) -> int:
        """Record a procedure violation and return the new violation count."""
        key = f"agent:{agent_id}:errors"
        # Batched like record_error so the three writes go out together.
        pipe = self.db.pipeline()
        pipe.hset(key, "last_violation", violation)
        pipe.hset(key, "last_violation_at", self._now())
        pipe.hincrby(key, "procedure_violations", 1)
        return int(pipe.execute()[-1])

    def get_error_counts(self, agent_id: str) -> dict:
        """Get all error counts for an agent.

        Missing counters are reported as 0 and missing text fields as "".
        ``same_error_counts`` maps each error signature to its repeat count.
        """
        key = f"agent:{agent_id}:errors"
        data = self.db.hgetall(key)
        prefix = "same_error:"
        return {
            "total_errors": int(data.get("total_errors", 0)),
            "procedure_violations": int(data.get("procedure_violations", 0)),
            "last_error_signature": data.get("last_error_signature", ""),
            "last_error_at": data.get("last_error_at", ""),
            "last_error_type": data.get("last_error_type", ""),
            "same_error_counts": {
                k.removeprefix(prefix): int(v)
                for k, v in data.items()
                if k.startswith(prefix)
            }
        }

    def check_error_budget(self, agent_id: str) -> tuple[bool, Optional[str]]:
        """Check if the error budget is exceeded. Returns (ok, reason).

        ``ok`` is False when the agent has no instruction packet or when any
        counter has reached its limit from the packet's error budget; in
        that case ``reason`` names the limit that was hit.
        """
        packet = self.get_instruction_packet(agent_id)
        if not packet:
            return False, "NO_INSTRUCTION_PACKET"

        counts = self.get_error_counts(agent_id)
        budget = packet.error_budget

        # Check procedure violations
        if counts["procedure_violations"] >= budget.max_procedure_violations:
            return False, f"PROCEDURE_VIOLATIONS ({counts['procedure_violations']} >= {budget.max_procedure_violations})"

        # Check total errors
        if counts["total_errors"] >= budget.max_total_errors:
            return False, f"TOTAL_ERRORS ({counts['total_errors']} >= {budget.max_total_errors})"

        # Check same error repeats
        for sig, count in counts["same_error_counts"].items():
            if count >= budget.max_same_error_repeats:
                return False, f"SAME_ERROR_REPEATED ({sig}: {count} >= {budget.max_same_error_repeats})"

        return True, None

    # -------------------------------------------------------------------------
    # Task Management
    # -------------------------------------------------------------------------

    def assign_agent_to_task(self, task_id: str, agent_id: str) -> bool:
        """Make an agent the task's active agent and log it in task history."""
        # Set active agent
        self.db.set(f"task:{task_id}:active_agent", agent_id)
        # Add to history
        self.db.rpush(f"task:{task_id}:history", json.dumps({
            "agent_id": agent_id,
            "assigned_at": self._now(),
            "event": "ASSIGNED"
        }))
        return True

    def get_active_agent(self, task_id: str) -> Optional[str]:
        """Get the currently active agent for a task, or None if unset."""
        return self.db.get(f"task:{task_id}:active_agent")

    def get_task_history(self, task_id: str) -> list[dict]:
        """Get the task's history events, oldest first."""
        data = self.db.lrange(f"task:{task_id}:history", 0, -1)
        return [json.loads(d) for d in data]

    # -------------------------------------------------------------------------
    # Revocation
    # -------------------------------------------------------------------------

    def revoke_agent(self, agent_id: str, reason_type: RevocationType, details: str) -> bool:
        """Revoke an agent.

        Marks any stored state as revoked, releases the agent's lock, appends
        an event to the revocation ledger and, when the agent has an
        instruction packet, to that task's history. Always returns True.
        """
        # Update state
        state = self.get_agent_state(agent_id)
        if state:
            state.status = AgentStatus.REVOKED
            state.phase = AgentPhase.REVOKED
            state.notes = f"Revoked: {reason_type.value} - {details}"
            self.set_agent_state(state)

        # Release lock
        self.release_lock(agent_id)

        # One timestamp so the ledger and the task history agree.
        revoked_at = self._now()

        # Write to revocation ledger
        revocation_event = {
            "agent_id": agent_id,
            "reason_type": reason_type.value,
            "details": details,
            "revoked_at": revoked_at
        }
        self.db.rpush("revocations:ledger", json.dumps(revocation_event))

        # Add to task history if we know the task
        packet = self.get_instruction_packet(agent_id)
        if packet:
            self.db.rpush(f"task:{packet.task_id}:history", json.dumps({
                "agent_id": agent_id,
                "event": "REVOKED",
                "reason": reason_type.value,
                "revoked_at": revoked_at
            }))

        return True

    def get_recent_revocations(self, count: int = 50) -> list[dict]:
        """Get the last ``count`` revocation events, oldest first.

        A non-positive ``count`` returns an empty list.
        """
        # -0 == 0, so without this guard count=0 would become the range
        # (0, -1) and return the entire ledger.
        if count <= 0:
            return []
        data = self.db.lrange("revocations:ledger", -count, -1)
        return [json.loads(d) for d in data]

    # -------------------------------------------------------------------------
    # Handoff
    # -------------------------------------------------------------------------

    def create_handoff(self, handoff: HandoffObject) -> bool:
        """Store a handoff as the latest one for its task."""
        key = f"handoff:{handoff.task_id}:latest"
        return bool(self.db.set(key, json.dumps(handoff.to_dict())))

    def get_handoff(self, task_id: str) -> Optional[HandoffObject]:
        """Get the latest handoff for a task, or None if there is none."""
        key = f"handoff:{task_id}:latest"
        data = self.db.get(key)
        if data:
            return HandoffObject(**json.loads(data))
        return None

    # -------------------------------------------------------------------------
    # Artifacts
    # -------------------------------------------------------------------------

    def register_artifact(self, task_id: str, artifact_type: str, reference: str) -> bool:
        """Append an artifact record to a task. Always returns True."""
        key = f"task:{task_id}:artifacts"
        artifact = {
            "type": artifact_type,
            "reference": reference,
            "created_at": self._now()
        }
        self.db.rpush(key, json.dumps(artifact))
        return True

    def get_artifacts(self, task_id: str) -> list[dict]:
        """Get all artifacts for a task, in registration order."""
        key = f"task:{task_id}:artifacts"
        data = self.db.lrange(key, 0, -1)
        return [json.loads(d) for d in data]

    def has_required_artifact(self, task_id: str, artifact_type: str) -> bool:
        """Check whether the task has at least one artifact of the given type."""
        artifacts = self.get_artifacts(task_id)
        return any(a["type"] == artifact_type for a in artifacts)


# =============================================================================
# Worker Agent Runtime
# =============================================================================

class WorkerRuntime:
    """
    Runtime for worker agents.
    Handles bootstrap, state transitions, error reporting, and compliance.

    The runtime keeps a local copy of the agent's state in ``self.state`` and
    keeps it in step with what the governance manager stores, including when
    the agent is revoked. Once revoked, ``transition`` and ``complete``
    refuse to act and return False.
    """

    def __init__(self, agent_id: str, gov: Optional[GovernanceManager] = None):
        """Create a runtime for ``agent_id``.

        Args:
            agent_id: Identifier of the agent this runtime drives.
            gov: Governance manager to use; a new ``GovernanceManager()`` is
                created when omitted.
        """
        self.agent_id = agent_id
        self.gov = gov if gov is not None else GovernanceManager()
        self.packet: Optional[InstructionPacket] = None
        self.state: Optional[AgentState] = None

    def _is_revoked(self) -> bool:
        """Return True when the local state says this agent is revoked."""
        return self.state is not None and self.state.status == AgentStatus.REVOKED

    def _revoke(self, reason_type: RevocationType, reason: str) -> None:
        """Revoke this agent once and bring the local state in line with it."""
        if self._is_revoked():
            # Already revoked: avoid appending duplicate ledger entries.
            return

        self.gov.revoke_agent(self.agent_id, reason_type, reason)

        if self.state is None:
            return
        # revoke_agent rewrites the stored state; mirror it locally so that
        # create_handoff() and complete() see the revocation.
        stored = self.gov.get_agent_state(self.agent_id)
        if stored is not None:
            self.state = stored
        else:
            self.state.status = AgentStatus.REVOKED
            self.state.phase = AgentPhase.REVOKED

    def bootstrap(self) -> tuple[bool, str]:
        """
        Bootstrap the agent runtime.
        Returns (success, message)

        Fails when this agent appears in the recent revocation ledger, has no
        instruction packet, or cannot acquire its lock. On success the agent
        is RUNNING in the BOOTSTRAP phase, has a heartbeat, and is assigned
        to the packet's task.
        """
        # Step 1: Read revocation ledger
        revocations = self.gov.get_recent_revocations(50)
        if revocations:
            print(f"[BOOTSTRAP] Read {len(revocations)} recent revocations")
            # Check if this agent was previously revoked
            for rev in revocations:
                if rev["agent_id"] == self.agent_id:
                    return False, f"AGENT_PREVIOUSLY_REVOKED: {rev['reason_type']}"

        # Step 2: Read instruction packet
        self.packet = self.gov.get_instruction_packet(self.agent_id)
        if not self.packet:
            return False, "NO_INSTRUCTION_PACKET"

        print(f"[BOOTSTRAP] Loaded instruction packet for task: {self.packet.task_id}")

        # Step 3: Check for existing handoff
        handoff = self.gov.get_handoff(self.packet.task_id)
        if handoff and handoff.previous_agent_id != self.agent_id:
            print(f"[BOOTSTRAP] Found handoff from previous agent: {handoff.previous_agent_id}")
            print(f"[BOOTSTRAP] Revocation reason: {handoff.revocation_reason}")
            print(f"[BOOTSTRAP] Required next actions: {handoff.required_next_actions}")

        # Step 4: Acquire lock
        if not self.gov.acquire_lock(self.agent_id):
            return False, "CANNOT_ACQUIRE_LOCK"

        # Step 5: Initialize state
        self.state = AgentState(
            agent_id=self.agent_id,
            status=AgentStatus.RUNNING,
            phase=AgentPhase.BOOTSTRAP,
            step="initialized",
            started_at=datetime.now(timezone.utc).isoformat()
        )
        self.gov.set_agent_state(self.state)

        # Step 6: Start heartbeat
        self.gov.heartbeat(self.agent_id)

        # Step 7: Assign to task
        self.gov.assign_agent_to_task(self.packet.task_id, self.agent_id)

        return True, "BOOTSTRAP_COMPLETE"

    def transition(self, phase: AgentPhase, step: str = "", notes: str = "") -> bool:
        """Transition to a new phase.

        Returns False when the runtime has no state (not bootstrapped), when
        the agent is already revoked, or when the error budget is exhausted
        (in which case the agent is revoked).
        """
        if not self.state:
            return False
        if self._is_revoked():
            return False

        # Refresh heartbeat and lock
        self.gov.heartbeat(self.agent_id)
        self.gov.refresh_lock(self.agent_id)

        # Check error budget before transition
        ok, reason = self.gov.check_error_budget(self.agent_id)
        if not ok:
            self._revoke(RevocationType.ERROR_BUDGET_EXCEEDED, reason)
            return False

        # Update state
        self.state.phase = phase
        self.state.step = step
        self.state.notes = notes
        self.gov.set_agent_state(self.state)

        print(f"[PHASE] {phase.value} - {step}")
        return True

    def report_error(self, error_type: str, message: str) -> bool:
        """Report an error; returns False if the agent is (now) revoked."""
        counts = self.gov.record_error(self.agent_id, error_type, message)
        print(f"[ERROR] {error_type}: {message}")
        print(f"[ERROR] Counts: total={counts['total_errors']}, violations={counts['procedure_violations']}")

        ok, reason = self.gov.check_error_budget(self.agent_id)
        if not ok:
            print(f"[REVOKE] Error budget exceeded: {reason}")
            self._revoke(RevocationType.ERROR_BUDGET_EXCEEDED, reason)
            return False

        return True

    def report_violation(self, violation: str) -> bool:
        """Report a procedure violation; returns False if the agent is (now) revoked."""
        count = self.gov.record_procedure_violation(self.agent_id, violation)
        print(f"[VIOLATION] {violation} (count: {count})")

        ok, reason = self.gov.check_error_budget(self.agent_id)
        if not ok:
            print(f"[REVOKE] Procedure violation limit: {reason}")
            self._revoke(RevocationType.PROCEDURE_VIOLATION, reason)
            return False

        return True

    def register_artifact(self, artifact_type: str, reference: str):
        """Register a work artifact against the packet's task.

        Does nothing when no instruction packet has been loaded.
        """
        if self.packet:
            self.gov.register_artifact(self.packet.task_id, artifact_type, reference)
            print(f"[ARTIFACT] Registered: {artifact_type} -> {reference}")

    def complete(self, notes: str = "") -> bool:
        """Mark work as complete and release the lock.

        Returns False, leaving the stored state untouched, when the agent
        has been revoked: a revoked agent must not report itself COMPLETED.
        """
        if self._is_revoked():
            print(f"[COMPLETE] Refused: agent {self.agent_id} is revoked")
            return False

        if self.state:
            self.state.status = AgentStatus.COMPLETED
            self.state.phase = AgentPhase.EXIT
            self.state.notes = notes
            self.gov.set_agent_state(self.state)

        self.gov.release_lock(self.agent_id)
        print(f"[COMPLETE] {notes}")
        return True

    def create_handoff(self, blocking_issue: str, what_was_tried: list[str],
                       required_next_actions: list[str]) -> bool:
        """Create a handoff for the next agent.

        The handoff records this agent's last local state, the task's
        registered artifact references, and the packet's "forbidden"
        constraints. Returns False when no packet or state is loaded.
        """
        if not self.packet or not self.state:
            return False

        handoff = HandoffObject(
            task_id=self.packet.task_id,
            previous_agent_id=self.agent_id,
            revoked=self.state.status == AgentStatus.REVOKED,
            revocation_reason={
                "type": self.state.status.value,
                "details": self.state.notes
            },
            last_known_state={
                "phase": self.state.phase.value,
                "step": self.state.step
            },
            what_was_tried=what_was_tried,
            blocking_issue=blocking_issue,
            required_next_actions=required_next_actions,
            constraints_reminder=self.packet.constraints.get("forbidden", []),
            artifacts=[a["reference"] for a in self.gov.get_artifacts(self.packet.task_id)]
        )

        return self.gov.create_handoff(handoff)


# =============================================================================
# CLI
# =============================================================================

if __name__ == "__main__":
    # Command-line entry point. Arguments are validated before connecting, so
    # usage errors never touch Vault or DragonflyDB; every usage error exits
    # with status 1.
    import sys

    usage_lines = (
        "Usage: governance.py <command> [args]",
        "Commands:",
        "  create-packet <agent_id> <task_id> <objective>",
        "  get-state <agent_id>",
        "  get-errors <agent_id>",
        "  revocations",
        "  test",
    )

    # Positional arguments each command requires after its own name.
    required_args = {
        "create-packet": 3,
        "get-state": 1,
        "get-errors": 1,
        "revocations": 0,
        "test": 0,
    }

    if len(sys.argv) < 2:
        for line in usage_lines:
            print(line)
        sys.exit(1)

    cmd = sys.argv[1]

    if cmd not in required_args:
        print(f"Unknown command: {cmd}")
        sys.exit(1)

    if len(sys.argv) - 2 < required_args[cmd]:
        print(f"Missing arguments for command: {cmd}")
        for line in usage_lines:
            print(line)
        sys.exit(1)

    gov = GovernanceManager()

    if cmd == "create-packet":
        agent_id, task_id, objective = sys.argv[2], sys.argv[3], sys.argv[4]
        packet = InstructionPacket(
            agent_id=agent_id,
            task_id=task_id,
            created_for="CLI test",
            objective=objective,
            deliverables=["plan artifacts", "run logs"],
            constraints={
                "scope": ["sandbox only"],
                "forbidden": ["no prod access", "no unrecorded root"],
                "required_steps": ["plan before apply"]
            },
            success_criteria=["CI passes", "artifacts uploaded"],
            error_budget=ErrorBudget(),
            escalation_rules=["If blocked > 20m -> escalate"]
        )
        gov.create_instruction_packet(packet)
        print(f"Created packet for {agent_id}")

    elif cmd == "get-state":
        agent_id = sys.argv[2]
        state = gov.get_agent_state(agent_id)
        if state:
            print(json.dumps(state.to_dict(), indent=2))
        else:
            print("No state found")

    elif cmd == "get-errors":
        agent_id = sys.argv[2]
        errors = gov.get_error_counts(agent_id)
        print(json.dumps(errors, indent=2))

    elif cmd == "revocations":
        revs = gov.get_recent_revocations()
        for r in revs:
            print(json.dumps(r))

    elif cmd == "test":
        print("=== Testing Governance System ===\n")

        # Create test packet
        packet = InstructionPacket(
            agent_id="test-agent-001",
            task_id="test-task-001",
            created_for="Governance test",
            objective="Test the governance system",
            deliverables=["test output"],
            constraints={
                "scope": ["sandbox"],
                "forbidden": ["prod access"],
                "required_steps": ["plan first"]
            },
            success_criteria=["test passes"],
            error_budget=ErrorBudget(max_total_errors=5, max_same_error_repeats=2),
            escalation_rules=[]
        )
        gov.create_instruction_packet(packet)
        print("[OK] Created instruction packet")

        # Test worker runtime
        worker = WorkerRuntime("test-agent-001")
        ok, msg = worker.bootstrap()
        if not ok:
            # Nothing below can work without a bootstrapped runtime, so stop
            # here instead of reporting success.
            print(f"[FAIL] Bootstrap: {msg}")
            sys.exit(1)
        print(f"[OK] Bootstrap: {msg}")

        # Simulate work phases
        worker.transition(AgentPhase.PREFLIGHT, "scope_check")
        worker.transition(AgentPhase.PLAN, "generating_plan")
        worker.register_artifact("plan", "plan_output_001")
        worker.transition(AgentPhase.EXECUTE, "applying")

        # Simulate some errors
        worker.report_error("VALIDATION_ERROR", "Variable not defined")
        worker.report_error("VALIDATION_ERROR", "Variable not defined")  # Same error

        # Check error budget
        ok, reason = gov.check_error_budget("test-agent-001")
        print(f"[OK] Error budget check: ok={ok}, reason={reason}")

        # Complete
        worker.complete("Test completed successfully")

        print("\n=== Test Complete ===")