# agent-governance/runtime/revocation.py

#!/usr/bin/env python3
"""
Revocation Engine
=================
Real-time violation detection and immediate credential revocation.
Part of Phase 4: Promotion and Revocation Engine.

Immediate Revocation Events:
- Resource created outside approved pool
- Terraform apply without stored plan
- Ansible run without check-mode (no waiver)
- Prod access without gate approval
- Unrecorded root session
- Direct baseline mutation
- Error budget exceeded
- Procedure violations
"""

import json
import sqlite3
import subprocess
import sys
import redis
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
from typing import Optional


# =============================================================================
# Configuration
# =============================================================================

# Path of the SQLite governance ledger; RevocationEngine uses it as the
# default database when no db_path is supplied.
LEDGER_DB = "/opt/agent-governance/ledger/governance.db"


class ViolationType(str, Enum):
    """Kinds of policy violation that can be reported against an agent.

    Members are grouped below by the severity that VIOLATION_SEVERITY
    assigns to them. The class mixes in ``str``, so every member is also a
    plain string equal to its value.
    """

    # Critical - Immediate revocation (an alert is sent as well)
    UNAUTHORIZED_POOL = "UNAUTHORIZED_POOL"
    APPLY_WITHOUT_PLAN = "APPLY_WITHOUT_PLAN"
    RUN_WITHOUT_CHECK = "RUN_WITHOUT_CHECK"
    UNAUTHORIZED_PROD = "UNAUTHORIZED_PROD"
    UNRECORDED_ROOT = "UNRECORDED_ROOT"
    BASELINE_MUTATION = "BASELINE_MUTATION"

    # High - Immediate revocation
    ERROR_BUDGET_EXCEEDED = "ERROR_BUDGET_EXCEEDED"
    PROCEDURE_VIOLATION = "PROCEDURE_VIOLATION"
    HEARTBEAT_TIMEOUT = "HEARTBEAT_TIMEOUT"
    LOCK_EXPIRED = "LOCK_EXPIRED"

    # Medium - Warning first, revocation on repeated offenses
    SCOPE_VIOLATION = "SCOPE_VIOLATION"
    FORBIDDEN_ACTION = "FORBIDDEN_ACTION"

    # Low - Warning only
    CONFIDENCE_BELOW_THRESHOLD = "CONFIDENCE_BELOW_THRESHOLD"
    MISSING_ARTIFACT = "MISSING_ARTIFACT"


class Severity(str, Enum):
    """Severity levels that decide how a violation is handled.

    The values are the lowercase strings stored in the ledger's
    ``severity`` column.
    """

    CRITICAL = "critical"  # Immediate revocation, alert
    HIGH = "high"          # Immediate revocation
    MEDIUM = "medium"      # Warning, second offense = revoke
    LOW = "low"            # Warning only


# Severity assigned to every violation type. The table is assembled one
# severity level at a time; the resulting key order is the same as the
# declaration order of ViolationType.
VIOLATION_SEVERITY = {
    **dict.fromkeys(
        (
            ViolationType.UNAUTHORIZED_POOL,
            ViolationType.APPLY_WITHOUT_PLAN,
            ViolationType.RUN_WITHOUT_CHECK,
            ViolationType.UNAUTHORIZED_PROD,
            ViolationType.UNRECORDED_ROOT,
            ViolationType.BASELINE_MUTATION,
        ),
        Severity.CRITICAL,
    ),
    **dict.fromkeys(
        (
            ViolationType.ERROR_BUDGET_EXCEEDED,
            ViolationType.PROCEDURE_VIOLATION,
            ViolationType.HEARTBEAT_TIMEOUT,
            ViolationType.LOCK_EXPIRED,
        ),
        Severity.HIGH,
    ),
    **dict.fromkeys(
        (
            ViolationType.SCOPE_VIOLATION,
            ViolationType.FORBIDDEN_ACTION,
        ),
        Severity.MEDIUM,
    ),
    **dict.fromkeys(
        (
            ViolationType.CONFIDENCE_BELOW_THRESHOLD,
            ViolationType.MISSING_ARTIFACT,
        ),
        Severity.LOW,
    ),
}


@dataclass
class Violation:
    """One policy violation attributed to an agent."""

    agent_id: str
    violation_type: ViolationType
    severity: Severity
    description: str
    triggering_action: str
    evidence: dict
    timestamp: str
    remediation: str = ""

    def to_dict(self) -> dict:
        """Return the violation as a plain dict.

        The two enum fields are replaced by their string values; all other
        fields are passed through unchanged (``evidence`` is not copied).
        """
        payload = dict(
            agent_id=self.agent_id,
            violation_type=self.violation_type.value,
            severity=self.severity.value,
            description=self.description,
            triggering_action=self.triggering_action,
            evidence=self.evidence,
            timestamp=self.timestamp,
            remediation=self.remediation,
        )
        return payload


@dataclass
class RevocationResult:
    """Outcome of processing one violation."""

    agent_id: str
    success: bool
    action_taken: str
    violation: Violation
    vault_revoked: bool
    dragonfly_revoked: bool
    timestamp: str

    def to_dict(self) -> dict:
        """Return the result as a plain dict, with the violation expanded
        through its own ``to_dict``."""
        payload = dict(
            agent_id=self.agent_id,
            success=self.success,
            action_taken=self.action_taken,
            violation=self.violation.to_dict(),
            vault_revoked=self.vault_revoked,
            dragonfly_revoked=self.dragonfly_revoked,
            timestamp=self.timestamp,
        )
        return payload


class RevocationEngine:
    """
    Detects violations and revokes agent access.
    """

    def __init__(self, db_path: str = LEDGER_DB):
        """Create an engine bound to a ledger database.

        Args:
            db_path: Path of the SQLite ledger; defaults to LEDGER_DB.
        """
        self.db_path = db_path
        # The Vault token must be loaded first: _get_redis() sends it to
        # Vault to fetch the Dragonfly credentials.
        self.vault_token = self._get_vault_token()
        # None when the Dragonfly client could not be set up.
        self.redis = self._get_redis()

    def _now(self) -> str:
        """Return the current UTC time as an ISO 8601 string."""
        current = datetime.now(tz=timezone.utc)
        return current.isoformat()

    def _get_vault_token(self) -> str:
        """Load the Vault root token from the local init-keys file.

        Returns:
            The ``root_token`` entry of ``/opt/vault/init-keys.json``, or an
            empty string when the file is missing, unreadable, not valid
            JSON, or has no such entry.
        """
        try:
            with open("/opt/vault/init-keys.json", encoding="utf-8") as f:
                return json.load(f)["root_token"]
        except (OSError, ValueError, KeyError, TypeError):
            # Best effort by design: callers treat "" as "no token".
            # Only expected I/O and parse failures are caught, so
            # KeyboardInterrupt and SystemExit still propagate.
            return ""

    def _get_redis(self) -> Optional[redis.Redis]:
        """Build a DragonflyDB client from credentials stored in Vault.

        Reads ``secret/data/services/dragonfly`` from the local Vault with
        curl and uses the ``host``, ``port`` and ``password`` fields of the
        response to configure the client.

        Returns:
            A ``redis.Redis`` client with ``decode_responses=True``, or
            ``None`` when the credentials cannot be fetched or parsed.
        """
        try:
            # Get credentials from Vault.
            # NOTE(review): the token is passed on curl's command line, so it
            # is visible in the process list while curl runs.
            result = subprocess.run([
                "curl", "-sk",
                "-H", f"X-Vault-Token: {self.vault_token}",
                "https://127.0.0.1:8200/v1/secret/data/services/dragonfly"
            ], capture_output=True, text=True)

            creds = json.loads(result.stdout)["data"]["data"]
            return redis.Redis(
                host=creds["host"],
                port=int(creds["port"]),
                password=creds["password"],
                decode_responses=True
            )
        except Exception as exc:
            # Still best effort, but no longer invisible and no longer
            # swallowing KeyboardInterrupt/SystemExit. Only the exception
            # type is reported (on stderr, so --json output stays clean)
            # to avoid echoing anything taken from the Vault response.
            print(
                f"[WARN] DragonflyDB client unavailable: {type(exc).__name__}",
                file=sys.stderr,
            )
            return None

    def _get_conn(self) -> sqlite3.Connection:
        """Open a new ledger connection whose rows can be read by column name."""
        connection = sqlite3.connect(self.db_path)
        connection.row_factory = sqlite3.Row
        return connection

    def record_violation(self, violation: Violation) -> int:
        """Record a violation in the ledger and reset the agent's metrics.

        Inserts one row into ``violations`` and, in the same transaction,
        sets ``consecutive_compliant`` and ``promotion_eligible`` to 0 for
        the agent in ``agent_metrics``.

        Args:
            violation: The violation to persist; ``evidence`` is stored as
                JSON text.

        Returns:
            The row id of the inserted ``violations`` row.

        Raises:
            sqlite3.Error: If the ledger cannot be opened or written. The
                transaction is rolled back and the connection is closed.
        """
        conn = self._get_conn()
        try:
            # ``with conn`` commits when the body succeeds and rolls back
            # when it raises, so the insert and the metrics update are
            # applied together or not at all.
            with conn:
                cursor = conn.cursor()

                cursor.execute("""
                    INSERT INTO violations
                    (timestamp, agent_id, violation_type, severity, description,
                     triggering_action, evidence, remediation)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    violation.timestamp,
                    violation.agent_id,
                    violation.violation_type.value,
                    violation.severity.value,
                    violation.description,
                    violation.triggering_action,
                    json.dumps(violation.evidence),
                    violation.remediation
                ))

                violation_id = cursor.lastrowid

                # Update agent metrics
                cursor.execute("""
                    UPDATE agent_metrics
                    SET consecutive_compliant = 0,
                        last_violation_at = ?,
                        promotion_eligible = 0,
                        updated_at = ?
                    WHERE agent_id = ?
                """, (violation.timestamp, violation.timestamp, violation.agent_id))
        finally:
            # Previously the connection leaked whenever a statement raised.
            conn.close()

        return violation_id

    def revoke_vault_token(self, agent_id: str) -> bool:
        """Write a revocation marker for the agent into Vault.

        In production, this would look up the agent's token accessor and
        revoke it via the Vault API. For now it stores a revocation signal
        at ``secret/data/revocations/<agent_id>`` that the agent must
        respect.

        Args:
            agent_id: Identifier of the agent being revoked.

        Returns:
            True only when curl completed successfully and Vault's response
            carries no ``errors`` entry; False otherwise.
        """
        # This is a KV v2 path (``secret/data/...``), and KV v2 writes must
        # wrap the secret under a top-level "data" key.
        payload = {"data": {"revoked_at": self._now(), "agent_id": agent_id}}

        try:
            # NOTE(review): the token is passed on curl's command line, so it
            # is visible in the process list while curl runs.
            result = subprocess.run([
                "curl", "-sk", "-X", "POST",
                "-H", f"X-Vault-Token: {self.vault_token}",
                "-d", json.dumps(payload),
                f"https://127.0.0.1:8200/v1/secret/data/revocations/{agent_id}"
            ], capture_output=True, text=True)
        except (OSError, subprocess.SubprocessError):
            return False

        # With -s a failed transfer leaves stdout empty, which the former
        # substring test mistook for success. Check curl's exit code first.
        if result.returncode != 0:
            return False

        body = result.stdout.strip()
        if not body:
            return True

        try:
            response = json.loads(body)
        except ValueError:
            # Vault answers in JSON; anything else is not a confirmed write.
            return False

        return isinstance(response, dict) and "errors" not in response

    def revoke_dragonfly_access(self, agent_id: str) -> bool:
        """Mark the agent as revoked in DragonflyDB.

        Sets a 24-hour revoke signal, flips an existing state record to
        REVOKED, deletes the agent's lock key and appends an event to the
        ``revocations:ledger`` list.

        Returns:
            True when every step succeeded; False when no client is
            available or any step raised.
        """
        client = self.redis
        if not client:
            return False

        prefix = f"agent:{agent_id}"
        try:
            # Signal the agent; the key expires after one day.
            client.set(f"{prefix}:revoke_signal", "1", ex=86400)

            # Only an existing state record is rewritten.
            raw_state = client.get(f"{prefix}:state")
            if raw_state:
                state = json.loads(raw_state)
                state["status"] = "REVOKED"
                state["revoked_at"] = self._now()
                client.set(f"{prefix}:state", json.dumps(state))

            # Drop any lock the agent holds.
            client.delete(f"{prefix}:lock")

            # Append to the revocation ledger.
            ledger_entry = json.dumps({
                "agent_id": agent_id,
                "revoked_at": self._now(),
                "reason": "VIOLATION"
            })
            client.rpush("revocations:ledger", ledger_entry)
        except Exception as e:
            print(f"[ERROR] Failed to revoke DragonflyDB access: {e}")
            return False

        return True

    def send_alert(self, violation: Violation):
        """Print an alert banner and queue the alert for the dashboard.

        In production, this would integrate with PagerDuty, Slack, etc.
        Queueing is best effort: a DragonflyDB failure is reported on
        stdout and does not raise, so it cannot abort violation processing.

        Args:
            violation: The violation to alert on.
        """
        banner = "!" * 60
        print(f"\n{banner}")
        print(f"CRITICAL ALERT: {violation.violation_type.value}")
        print(banner)
        print(f"Agent: {violation.agent_id}")
        print(f"Description: {violation.description}")
        print(f"Severity: {violation.severity.value}")
        print(f"Time: {violation.timestamp}")
        print(f"{banner}\n")

        # Store alert for dashboard
        if self.redis:
            alert = {
                "type": "VIOLATION",
                "violation_type": violation.violation_type.value,
                "agent_id": violation.agent_id,
                "severity": violation.severity.value,
                "message": violation.description,
                "timestamp": violation.timestamp
            }
            try:
                self.redis.rpush("alerts:queue", json.dumps(alert))
                self.redis.ltrim("alerts:queue", -100, -1)  # Keep last 100 alerts
            except Exception as e:
                # Same handling as revoke_dragonfly_access: report and go on.
                print(f"[ERROR] Failed to queue alert: {e}")

    def process_violation(self, violation: Violation) -> RevocationResult:
        """Record a violation and take the action its severity calls for.

        - CRITICAL / HIGH: revoke immediately; CRITICAL also sends an alert.
        - MEDIUM: revoke when the ledger holds at least two medium
          violations for the agent in the last 7 days, otherwise warn. The
          count is taken after this violation is recorded, so a violation
          with a current timestamp counts itself: the second medium
          violation within the window triggers revocation.
        - LOW: warn only.

        Args:
            violation: The violation to process.

        Returns:
            A RevocationResult. ``success`` is False when a revocation was
            attempted but Vault or DragonflyDB did not confirm it, and True
            in every other case.

        Raises:
            sqlite3.Error: If the ledger cannot be read or written.
        """
        print(f"\n[VIOLATION] Processing: {violation.violation_type.value}")
        print(f"[VIOLATION] Agent: {violation.agent_id}")
        print(f"[VIOLATION] Severity: {violation.severity.value}")

        # Record before acting; the MEDIUM branch below counts ledger rows.
        self.record_violation(violation)

        # Determine action based on severity
        vault_revoked = False
        dragonfly_revoked = False
        action_taken = "RECORDED"

        if violation.severity in (Severity.CRITICAL, Severity.HIGH):
            # Immediate revocation
            print(f"[REVOKE] Initiating immediate revocation for {violation.agent_id}")

            vault_revoked = self.revoke_vault_token(violation.agent_id)
            dragonfly_revoked = self.revoke_dragonfly_access(violation.agent_id)

            action_taken = "REVOKED"

            if violation.severity == Severity.CRITICAL:
                self.send_alert(violation)

        elif violation.severity == Severity.MEDIUM:
            # Count recent medium violations for this agent.
            conn = self._get_conn()
            try:
                cursor = conn.cursor()
                cursor.execute("""
                    SELECT COUNT(*) as count FROM violations
                    WHERE agent_id = ?
                    AND severity = 'medium'
                    AND datetime(timestamp) >= datetime('now', '-7 days')
                """, (violation.agent_id,))
                recent_medium = cursor.fetchone()['count']
            finally:
                # Close even when the query fails.
                conn.close()

            if recent_medium >= 2:
                print(f"[REVOKE] Multiple medium violations, revoking {violation.agent_id}")
                vault_revoked = self.revoke_vault_token(violation.agent_id)
                dragonfly_revoked = self.revoke_dragonfly_access(violation.agent_id)
                action_taken = "REVOKED"
            else:
                print(f"[WARNING] Warning issued to {violation.agent_id}")
                action_taken = "WARNING"

        else:  # LOW severity
            print(f"[WARNING] Low severity violation recorded for {violation.agent_id}")
            action_taken = "WARNING"

        # A revocation only counts as successful when both backends
        # confirmed it; otherwise the agent may still hold working access.
        if action_taken == "REVOKED":
            success = vault_revoked and dragonfly_revoked
        else:
            success = True

        return RevocationResult(
            agent_id=violation.agent_id,
            success=success,
            action_taken=action_taken,
            violation=violation,
            vault_revoked=vault_revoked,
            dragonfly_revoked=dragonfly_revoked,
            timestamp=self._now()
        )

    def create_violation(
        self,
        agent_id: str,
        violation_type: ViolationType,
        description: str,
        triggering_action: str = "",
        evidence: dict = None,
        remediation: str = ""
    ) -> Violation:
        """Build a Violation stamped with the current time.

        The severity is looked up in VIOLATION_SEVERITY, and a missing or
        empty ``evidence`` becomes a fresh empty dict.
        """
        mapped_severity = VIOLATION_SEVERITY[violation_type]
        evidence_payload = evidence if evidence else {}
        created_at = self._now()

        return Violation(
            agent_id=agent_id,
            violation_type=violation_type,
            severity=mapped_severity,
            description=description,
            triggering_action=triggering_action,
            evidence=evidence_payload,
            timestamp=created_at,
            remediation=remediation,
        )

    def get_violations(self, agent_id: str = None, severity: str = None,
                       limit: int = 50) -> list:
        """Return violation history from the ledger, newest first.

        Args:
            agent_id: When given, only this agent's violations are returned.
            severity: When given, only violations of this severity are
                returned.
            limit: Maximum number of rows to return.

        Returns:
            A list of dicts, one per ``violations`` row, keyed by column
            name.

        Raises:
            sqlite3.Error: If the ledger cannot be opened or queried.
        """
        query = "SELECT * FROM violations WHERE 1=1"
        params = []

        if agent_id:
            query += " AND agent_id = ?"
            params.append(agent_id)

        if severity:
            query += " AND severity = ?"
            params.append(severity)

        query += " ORDER BY timestamp DESC LIMIT ?"
        params.append(limit)

        conn = self._get_conn()
        try:
            cursor = conn.cursor()
            cursor.execute(query, params)
            return [dict(row) for row in cursor.fetchall()]
        finally:
            # Previously the connection leaked when the query raised.
            conn.close()

    def get_active_revocations(self) -> list:
        """Get currently revoked agents from DragonflyDB.

        Walks every ``agent:*:state`` key and reports those whose stored
        state has ``status == "REVOKED"``. Entries that are empty, not
        valid JSON, or not JSON objects are skipped.

        Returns:
            A list of dicts with ``agent_id``, ``revoked_at`` and ``notes``;
            empty when no DragonflyDB client is available.
        """
        if not self.redis:
            return []

        revoked = []
        seen = set()

        # SCAN walks the keyspace incrementally instead of blocking the
        # server the way KEYS does. It may return a key more than once,
        # hence the ``seen`` set.
        for key in self.redis.scan_iter(match="agent:*:state"):
            if key in seen:
                continue
            seen.add(key)

            data = self.redis.get(key)
            if not data:
                continue

            try:
                state = json.loads(data)
            except ValueError:
                # One corrupt record must not hide every other revocation.
                continue
            if not isinstance(state, dict):
                continue

            if state.get("status") == "REVOKED":
                revoked.append({
                    "agent_id": state.get("agent_id"),
                    "revoked_at": state.get("revoked_at"),
                    "notes": state.get("notes", "")
                })

        return revoked

    def acknowledge_violation(self, violation_id: int, acknowledged_by: str) -> bool:
        """Acknowledge a violation (for remediation tracking).

        Args:
            violation_id: ``id`` of the ``violations`` row to acknowledge.
            acknowledged_by: Name stored in ``acknowledged_by``.

        Returns:
            True when a row was updated, False when no row has that id.

        Raises:
            sqlite3.Error: If the ledger cannot be opened or written. The
                transaction is rolled back and the connection is closed.
        """
        conn = self._get_conn()
        try:
            # Commit on success, roll back on error.
            with conn:
                cursor = conn.cursor()
                cursor.execute("""
                    UPDATE violations
                    SET acknowledged = 1,
                        acknowledged_by = ?
                    WHERE id = ?
                """, (acknowledged_by, violation_id))
                success = cursor.rowcount > 0
        finally:
            # Previously the connection leaked when the update raised.
            conn.close()

        return success


# =============================================================================
# Violation Detectors
# =============================================================================

class ViolationDetector:
    """
    Detects violations in real-time.

    Each ``check_*`` method inspects one condition and returns a Violation
    when it is breached, or None otherwise. The checks only build Violation
    objects; recording and revocation are left to the caller.
    """

    # Pools that each agent tier is NOT allowed to touch.
    _FORBIDDEN_POOLS = {
        0: ("pve-sandbox", "pve-staging", "pve-prod"),
        1: ("pve-staging", "pve-prod"),
        2: ("pve-prod",),
        3: (),
        4: (),
    }

    def __init__(self, engine: Optional[RevocationEngine] = None):
        """Create a detector.

        Args:
            engine: Engine used to build Violation objects. When omitted a
                new RevocationEngine is created, as before; passing one in
                avoids a second round of Vault/DragonflyDB setup.
        """
        self.engine = engine if engine is not None else RevocationEngine()

    def check_plan_artifact(self, agent_id: str, action: str, artifact_exists: bool) -> Optional[Violation]:
        """Check if apply/run has a corresponding plan artifact.

        Returns:
            An APPLY_WITHOUT_PLAN violation when ``action`` is
            ``terraform_apply`` or ``ansible_run`` and no artifact exists;
            otherwise None.
        """
        # NOTE(review): ansible_run is also reported as APPLY_WITHOUT_PLAN;
        # RUN_WITHOUT_CHECK exists but is unused here - confirm the intent.
        if action in ("terraform_apply", "ansible_run") and not artifact_exists:
            return self.engine.create_violation(
                agent_id=agent_id,
                violation_type=ViolationType.APPLY_WITHOUT_PLAN,
                description=f"Attempted {action} without required plan artifact",
                triggering_action=action,
                remediation="Always run plan/check before apply/run"
            )
        return None

    def check_pool_authorization(self, agent_id: str, agent_tier: int,
                                  target_pool: str) -> Optional[Violation]:
        """Check if agent is authorized for target pool.

        Returns:
            An UNAUTHORIZED_POOL violation when ``target_pool`` is forbidden
            for ``agent_tier``; otherwise None.
        """
        # Fail closed: a tier missing from the table gets the most
        # restrictive (tier 0) list rather than no restrictions at all.
        forbidden = self._FORBIDDEN_POOLS.get(agent_tier, self._FORBIDDEN_POOLS[0])

        if target_pool in forbidden:
            return self.engine.create_violation(
                agent_id=agent_id,
                violation_type=ViolationType.UNAUTHORIZED_POOL,
                description=f"Tier {agent_tier} agent accessed forbidden pool: {target_pool}",
                evidence={"agent_tier": agent_tier, "target_pool": target_pool},
                remediation="Request promotion or use authorized pools only"
            )
        return None

    def check_production_access(self, agent_id: str, target: str,
                                 has_approval: bool) -> Optional[Violation]:
        """Check if production access has gate approval.

        A target counts as production when its name contains ``prod``
        (case-insensitive).

        Returns:
            An UNAUTHORIZED_PROD violation when such a target is accessed
            without approval; otherwise None.
        """
        if "prod" in target.lower() and not has_approval:
            return self.engine.create_violation(
                agent_id=agent_id,
                violation_type=ViolationType.UNAUTHORIZED_PROD,
                description=f"Production access attempted without approval: {target}",
                evidence={"target": target, "approval": False},
                remediation="Request production access through gate approval process"
            )
        return None


# =============================================================================
# CLI
# =============================================================================

# Horizontal rule used to frame CLI output.
_RULE = "=" * 60


def _build_parser():
    """Build the argument parser for the revocation CLI."""
    import argparse

    parser = argparse.ArgumentParser(description="Revocation Engine")
    subparsers = parser.add_subparsers(dest="command", required=True)

    # Report violation command
    report_parser = subparsers.add_parser("report", help="Report a violation")
    report_parser.add_argument("agent_id", help="Agent ID")
    report_parser.add_argument("--type", required=True,
                               choices=[v.value for v in ViolationType],
                               help="Violation type")
    report_parser.add_argument("--description", required=True, help="Description")
    report_parser.add_argument("--action", default="", help="Triggering action")

    # List violations command
    list_parser = subparsers.add_parser("list", help="List violations")
    list_parser.add_argument("--agent-id", help="Filter by agent")
    list_parser.add_argument("--severity", choices=["critical", "high", "medium", "low"])
    list_parser.add_argument("--limit", type=int, default=20)
    list_parser.add_argument("--json", action="store_true")

    # List revoked command
    revoked_parser = subparsers.add_parser("revoked", help="List revoked agents")
    revoked_parser.add_argument("--json", action="store_true")

    # Acknowledge command
    ack_parser = subparsers.add_parser("acknowledge", help="Acknowledge violation")
    ack_parser.add_argument("violation_id", type=int)
    ack_parser.add_argument("--by", required=True, help="Acknowledger name")

    # Types command (takes no arguments)
    subparsers.add_parser("types", help="List violation types")

    return parser


def _cmd_report(engine, args):
    """Create and process a violation, then print the outcome."""
    violation = engine.create_violation(
        agent_id=args.agent_id,
        violation_type=ViolationType(args.type),
        description=args.description,
        triggering_action=args.action
    )

    result = engine.process_violation(violation)

    print(f"\n{_RULE}")
    print("VIOLATION PROCESSED")
    print(_RULE)
    print(f"Agent: {result.agent_id}")
    print(f"Type: {result.violation.violation_type.value}")
    print(f"Severity: {result.violation.severity.value}")
    print(f"Action Taken: {result.action_taken}")
    print(f"Vault Revoked: {result.vault_revoked}")
    print(f"DragonflyDB Revoked: {result.dragonfly_revoked}")
    print(_RULE)


def _cmd_list(engine, args):
    """Print the violation history as JSON or as a text report."""
    violations = engine.get_violations(
        agent_id=args.agent_id,
        severity=args.severity,
        limit=args.limit
    )

    if args.json:
        print(json.dumps(violations, indent=2))
        return

    severity_icons = {
        "critical": "[CRIT]",
        "high": "[HIGH]",
        "medium": "[MED]",
        "low": "[LOW]"
    }

    print(f"\n{_RULE}")
    print("VIOLATION HISTORY")
    print(_RULE)

    if not violations:
        print("No violations found")
    else:
        for v in violations:
            severity_icon = severity_icons.get(v['severity'], "[?]")
            ack = "[ACK]" if v['acknowledged'] else ""
            print(f"\n{severity_icon} {v['violation_type']} {ack}")
            print(f"  Agent: {v['agent_id']}")
            print(f"  {v['description']}")
            print(f"  Time: {v['timestamp']}")

    print(_RULE)


def _cmd_revoked(engine, args):
    """Print the currently revoked agents as JSON or as a text report."""
    revoked = engine.get_active_revocations()

    if args.json:
        print(json.dumps(revoked, indent=2))
        return

    print(f"\n{_RULE}")
    print("CURRENTLY REVOKED AGENTS")
    print(_RULE)

    if not revoked:
        print("No agents currently revoked")
    else:
        for r in revoked:
            print(f"\n  {r['agent_id']}")
            print(f"    Revoked: {r['revoked_at']}")
            if r['notes']:
                print(f"    Notes: {r['notes']}")

    print(_RULE)


def _cmd_acknowledge(engine, args):
    """Acknowledge a violation; exit with status 1 when nothing was updated."""
    if engine.acknowledge_violation(args.violation_id, args.by):
        print(f"[OK] Violation {args.violation_id} acknowledged by {args.by}")
    else:
        print(f"[FAILED] Could not acknowledge violation {args.violation_id}")
        sys.exit(1)


def _cmd_types():
    """Print every violation type with its severity."""
    print(f"\n{_RULE}")
    print("VIOLATION TYPES")
    print(_RULE)

    for vtype in ViolationType:
        severity = VIOLATION_SEVERITY[vtype]
        print(f"\n  {vtype.value}")
        print(f"    Severity: {severity.value}")

    print(_RULE)


def main(argv=None):
    """Run the revocation CLI.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.
    """
    args = _build_parser().parse_args(argv)

    # "types" only reads static tables, so it needs no engine and
    # therefore no Vault/DragonflyDB setup.
    if args.command == "types":
        _cmd_types()
        return

    engine = RevocationEngine()
    handlers = {
        "report": _cmd_report,
        "list": _cmd_list,
        "revoked": _cmd_revoked,
        "acknowledge": _cmd_acknowledge,
    }
    handlers[args.command](engine, args)


if __name__ == "__main__":
    main()