profit 77655c298c Initial commit: Agent Governance System Phase 8
Phase 8 Production Hardening with complete governance infrastructure:

- Vault integration with tiered policies (T0-T4)
- DragonflyDB state management
- SQLite audit ledger
- Pipeline DSL and templates
- Promotion/revocation engine
- Checkpoint system for session persistence
- Health manager and circuit breaker for fault tolerance
- GitHub/Slack integrations
- Architectural test pipeline with bug watcher, suggestion engine, council review
- Multi-agent chaos testing framework

Test Results:
- Governance tests: 68/68 passing
- E2E workflow: 16/16 passing
- Phase 2 Vault: 14/14 passing
- Integration tests: 27/27 passing

Coverage: 57.6% average across 12 phases

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 22:07:06 -05:00

470 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Pipeline Core Definitions
=========================
This module contains the authoritative definitions for the agent governance
pipeline system. All other code (tests, demos, orchestrators) should import
from here to ensure consistency with the architecture specification.
Architecture Reference: /opt/agent-governance/docs/ARCHITECTURE.md Section 4.2
"""
import hashlib
import json
import os
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Dict, List, Any, Optional
# =============================================================================
# Agent Lifecycle Phases (OFFICIAL)
# =============================================================================
# These phases define the complete agent lifecycle as specified in ARCHITECTURE.md
# Order: BOOTSTRAP -> PREFLIGHT -> PLAN -> EXECUTE -> VERIFY -> PACKAGE -> REPORT -> EXIT
class AgentPhase(str, Enum):
    """
    Lifecycle phases of an agent (authoritative list).

    Members mix in ``str``, so each one compares equal to its plain string
    value. Per ARCHITECTURE.md Section 4.2:

    - BOOTSTRAP: agent initialization and authentication
    - PREFLIGHT: pre-execution validation (sandbox, inventory, dependencies)
    - PLAN:      generate and validate the execution plan
    - EXECUTE:   perform the planned actions
    - VERIFY:    validate execution results
    - PACKAGE:   bundle artifacts and evidence
    - REPORT:    generate the completion report
    - EXIT:      clean shutdown and resource release
    - REVOKED:   the agent was revoked (terminal state)
    """

    # Regular lifecycle, declared in execution order.
    BOOTSTRAP = "BOOTSTRAP"
    PREFLIGHT = "PREFLIGHT"
    PLAN = "PLAN"
    EXECUTE = "EXECUTE"
    VERIFY = "VERIFY"
    PACKAGE = "PACKAGE"
    REPORT = "REPORT"
    EXIT = "EXIT"

    # Terminal state reached through revocation.
    REVOKED = "REVOKED"
# Lifecycle phases in execution order. REVOKED is left out on purpose: it is
# a terminal state rather than a step in the sequence.
AGENT_PHASES_ORDERED: List[AgentPhase] = [
    AgentPhase.BOOTSTRAP, AgentPhase.PREFLIGHT,
    AgentPhase.PLAN, AgentPhase.EXECUTE,
    AgentPhase.VERIFY, AgentPhase.PACKAGE,
    AgentPhase.REPORT, AgentPhase.EXIT,
]

# The same sequence as plain strings, for callers that do not use the enum.
AGENT_PHASE_NAMES: List[str] = [phase.value for phase in AGENT_PHASES_ORDERED]
# =============================================================================
# Agent Status (OFFICIAL)
# =============================================================================
class AgentStatus(str, Enum):
    """
    Runtime status values for an agent (authoritative list).

    Sourced from ARCHITECTURE.md and runtime/governance.py:

    - PENDING:    awaiting start
    - STARTING:   initialization in progress
    - RUNNING:    actively executing
    - PAUSED:     temporarily suspended (for plan clarification)
    - COMPLETED:  successfully finished
    - FAILED:     execution failed
    - REVOKED:    forcibly terminated
    - RECOVERING: recovering from a chaos/error condition
    """

    PENDING = "PENDING"
    STARTING = "STARTING"
    RUNNING = "RUNNING"
    PAUSED = "PAUSED"
    COMPLETED = "COMPLETED"
    FAILED = "FAILED"
    REVOKED = "REVOKED"
    RECOVERING = "RECOVERING"
# =============================================================================
# Pipeline Stage Types (OFFICIAL)
# =============================================================================
class StageType(str, Enum):
    """
    Kinds of pipeline stage, as defined by pipeline.schema.json.

    - AGENT:     executes an agent task
    - GATE:      approval/consensus checkpoint (human or automated)
    - PARALLEL:  concurrent execution of multiple branches
    - CONDITION: conditional branching (if/then/else)
    """

    AGENT = "agent"
    GATE = "gate"
    PARALLEL = "parallel"
    CONDITION = "condition"
class StageStatus(str, Enum):
    """Execution status of a pipeline stage (authoritative list)."""

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    SKIPPED = "skipped"
# =============================================================================
# Output Types (Alpha/Beta/Gamma)
# =============================================================================
class OutputType(str, Enum):
    """
    Maturity class of an agent output, used for checkpoint tracking.

    - ALPHA: initial/draft outputs (plans, analysis)
    - BETA:  refined outputs (validated plans, partial results)
    - GAMMA: final outputs (completed work, verified results)
    """

    ALPHA = "alpha"
    BETA = "beta"
    GAMMA = "gamma"
# Typical output type produced by each lifecycle phase, grouped by type.
# Insertion order follows the lifecycle order (BOOTSTRAP ... EXIT).
PHASE_OUTPUT_TYPES: Dict[AgentPhase, OutputType] = {
    **dict.fromkeys(
        (AgentPhase.BOOTSTRAP, AgentPhase.PREFLIGHT),
        OutputType.ALPHA,
    ),
    **dict.fromkeys(
        (AgentPhase.PLAN, AgentPhase.EXECUTE),
        OutputType.BETA,
    ),
    **dict.fromkeys(
        (
            AgentPhase.VERIFY,
            AgentPhase.PACKAGE,
            AgentPhase.REPORT,
            AgentPhase.EXIT,
        ),
        OutputType.GAMMA,
    ),
}
# =============================================================================
# Chaos Conditions
# =============================================================================
class ChaosCondition(str, Enum):
    """
    Fault conditions that can be injected to test resilience.

    ``NONE`` is the member that names no condition.
    """

    NONE = "none"
    TOKEN_REVOKED = "token_revoked"
    LOCK_LOST = "lock_lost"
    STATE_CORRUPTED = "state_corrupted"
    HEARTBEAT_TIMEOUT = "heartbeat_timeout"
    ERROR_SPIKE = "error_spike"
    NETWORK_DELAY = "network_delay"
class ViolationSeverity(str, Enum):
    """
    Severity level attached to a violation.

    Kept in sync with runtime/revocation.py. Consequence per level:

    - CRITICAL: immediate revocation plus an alert
    - HIGH:     immediate revocation
    - MEDIUM:   warning; a second offense leads to revocation
    - LOW:      warning only
    """

    CRITICAL = "critical"
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
class ViolationType(str, Enum):
    """
    Complete violation taxonomy, grouped by severity class.

    Kept in sync with runtime/revocation.py.

    Critical (immediate revocation + alert):
        UNAUTHORIZED_POOL, APPLY_WITHOUT_PLAN, RUN_WITHOUT_CHECK,
        UNAUTHORIZED_PROD, UNRECORDED_ROOT, BASELINE_MUTATION
    High (immediate revocation):
        ERROR_BUDGET_EXCEEDED, PROCEDURE_VIOLATION, HEARTBEAT_TIMEOUT,
        LOCK_EXPIRED
    Medium (warning, then revocation):
        SCOPE_VIOLATION, FORBIDDEN_ACTION
    Low (warning only):
        CONFIDENCE_BELOW_THRESHOLD, MISSING_ARTIFACT
    """

    # --- Critical ---
    UNAUTHORIZED_POOL = "UNAUTHORIZED_POOL"
    APPLY_WITHOUT_PLAN = "APPLY_WITHOUT_PLAN"
    RUN_WITHOUT_CHECK = "RUN_WITHOUT_CHECK"
    UNAUTHORIZED_PROD = "UNAUTHORIZED_PROD"
    UNRECORDED_ROOT = "UNRECORDED_ROOT"
    BASELINE_MUTATION = "BASELINE_MUTATION"

    # --- High ---
    ERROR_BUDGET_EXCEEDED = "ERROR_BUDGET_EXCEEDED"
    PROCEDURE_VIOLATION = "PROCEDURE_VIOLATION"
    HEARTBEAT_TIMEOUT = "HEARTBEAT_TIMEOUT"
    LOCK_EXPIRED = "LOCK_EXPIRED"

    # --- Medium ---
    SCOPE_VIOLATION = "SCOPE_VIOLATION"
    FORBIDDEN_ACTION = "FORBIDDEN_ACTION"

    # --- Low ---
    CONFIDENCE_BELOW_THRESHOLD = "CONFIDENCE_BELOW_THRESHOLD"
    MISSING_ARTIFACT = "MISSING_ARTIFACT"


# Older name for the same enum, kept so existing imports keep working.
RevocationType = ViolationType
# Severity assigned to every violation type, grouped by severity level.
# Insertion order is CRITICAL, HIGH, MEDIUM, LOW, with members in the same
# order as they are declared on ViolationType.
VIOLATION_SEVERITY_MAP: Dict[ViolationType, ViolationSeverity] = {
    **dict.fromkeys(
        (
            ViolationType.UNAUTHORIZED_POOL,
            ViolationType.APPLY_WITHOUT_PLAN,
            ViolationType.RUN_WITHOUT_CHECK,
            ViolationType.UNAUTHORIZED_PROD,
            ViolationType.UNRECORDED_ROOT,
            ViolationType.BASELINE_MUTATION,
        ),
        ViolationSeverity.CRITICAL,
    ),
    **dict.fromkeys(
        (
            ViolationType.ERROR_BUDGET_EXCEEDED,
            ViolationType.PROCEDURE_VIOLATION,
            ViolationType.HEARTBEAT_TIMEOUT,
            ViolationType.LOCK_EXPIRED,
        ),
        ViolationSeverity.HIGH,
    ),
    **dict.fromkeys(
        (ViolationType.SCOPE_VIOLATION, ViolationType.FORBIDDEN_ACTION),
        ViolationSeverity.MEDIUM,
    ),
    **dict.fromkeys(
        (
            ViolationType.CONFIDENCE_BELOW_THRESHOLD,
            ViolationType.MISSING_ARTIFACT,
        ),
        ViolationSeverity.LOW,
    ),
}
# =============================================================================
# Integration Event Types
# =============================================================================
class IntegrationEventType(str, Enum):
    """
    Events published to external integrations (GitHub, Slack, etc.).

    Each event corresponds to an AgentPhase lifecycle step or to a
    governance/workflow occurrence.
    """

    # --- Lifecycle ---
    # AgentPhase.PLAN completed
    PLAN_CREATED = "plan_created"
    # AgentPhase.EXECUTE started
    EXECUTION_STARTED = "execution_started"
    # AgentPhase.EXECUTE completed
    EXECUTION_COMPLETE = "execution_complete"

    # --- Governance ---
    # A ViolationType was triggered
    VIOLATION_DETECTED = "violation_detected"
    # Tier upgrade requested
    PROMOTION_REQUESTED = "promotion_requested"
    # Tier upgrade granted
    PROMOTION_APPROVED = "promotion_approved"
    # Agent token revoked
    AGENT_REVOKED = "agent_revoked"

    # --- Workflow ---
    # A StageType.GATE stage was reached
    APPROVAL_REQUIRED = "approval_required"
    # Agent health signal
    HEARTBEAT = "heartbeat"
# Lifecycle phase associated with an integration event. Only the events
# listed here have an entry; look up the others with ``.get``.
INTEGRATION_EVENT_PHASE_MAP: Dict[IntegrationEventType, AgentPhase] = {
    IntegrationEventType.PLAN_CREATED: AgentPhase.PLAN,
    # Both execution events belong to the EXECUTE phase.
    IntegrationEventType.EXECUTION_STARTED: AgentPhase.EXECUTE,
    IntegrationEventType.EXECUTION_COMPLETE: AgentPhase.EXECUTE,
    IntegrationEventType.AGENT_REVOKED: AgentPhase.REVOKED,
}
# =============================================================================
# Data Classes
# =============================================================================
def _utc_now() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Behaves like ``datetime.utcnow()`` (naive value expressed in UTC) without
    calling that function, which is deprecated since Python 3.12.
    """
    # Local import: the file-level import only brings in ``datetime``.
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None)


@dataclass
class AgentOutput:
    """
    An Alpha/Beta/Gamma output emitted by an agent at a checkpoint.

    Attributes:
        agent_id: Identifier of the agent that produced the output.
        output_type: Alpha/Beta/Gamma classification of the output.
        phase: Lifecycle phase the output belongs to.
        content: Output payload. It must be JSON-serializable whenever the
            checksum has to be derived from it.
        timestamp: Creation time as a naive UTC ``datetime``; defaults to the
            moment the instance is created.
        checksum: Short content fingerprint. When left empty it is derived
            in ``__post_init__``.

    Raises:
        TypeError: From ``json.dumps`` when ``checksum`` is empty and
            ``content`` is not JSON-serializable.
    """

    agent_id: str
    output_type: OutputType
    phase: AgentPhase
    content: Dict[str, Any]
    timestamp: datetime = field(default_factory=_utc_now)
    checksum: str = ""

    def __post_init__(self) -> None:
        """Derive ``checksum`` from ``content`` unless the caller supplied one."""
        if not self.checksum:
            # Key-sorted JSON makes the digest independent of dict ordering.
            canonical = json.dumps(self.content, sort_keys=True)
            # Only the first 12 hex characters of the SHA-256 digest are kept.
            self.checksum = hashlib.sha256(canonical.encode()).hexdigest()[:12]
@dataclass
class StageResult:
    """Outcome record for one executed pipeline stage."""

    # Required fields.
    name: str
    status: StageStatus

    # Optional fields; each defaults to None until a value is supplied.
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None

    # Each instance gets its own empty dict.
    artifacts: Dict[str, Any] = field(default_factory=dict)

    error: Optional[str] = None
    agent_id: Optional[str] = None
@dataclass
class PipelineContext:
    """Runtime context handed from stage to stage during a pipeline run."""

    # Required fields.
    pipeline_name: str
    run_id: str
    inputs: Dict[str, Any]

    # Mutable containers; each instance starts with its own empty dict.
    variables: Dict[str, Any] = field(default_factory=dict)
    artifacts: Dict[str, Any] = field(default_factory=dict)
    stage_results: Dict[str, StageResult] = field(default_factory=dict)
@dataclass
class ErrorBudget:
    """
    Error-budget limits applied to agents.

    Every limit has a default, so ``ErrorBudget()`` yields the standard
    budget.
    """

    max_total_errors: int = 12
    max_same_error_repeats: int = 3
    max_procedure_violations: int = 1
@dataclass
class ClarifiedPlan:
    """Clarified plan that overwatch broadcasts after an error threshold."""

    plan_id: str
    trigger_reason: str

    # Material consulted while producing the clarification.
    history_reviewed: List[str]
    outputs_analyzed: List[str]

    # Objective before and after clarification, plus the adjustments made.
    original_objective: str
    clarified_objective: str
    adjustments: List[str]

    broadcast_at: datetime

    # Starts as a fresh empty list on every instance.
    acknowledged_by: List[str] = field(default_factory=list)
# =============================================================================
# DragonflyDB Key Patterns (OFFICIAL)
# =============================================================================
class RedisKeys:
    """
    DragonflyDB keyspace patterns from ARCHITECTURE.md (authoritative list).

    The upper-case attributes are ``str.format`` templates. The lower-case
    classmethods fill in a template and return the concrete key, so callers
    never have to spell out a placeholder name.
    """

    # Agent keys
    AGENT_PACKET = "agent:{agent_id}:packet"          # Instruction packet
    AGENT_STATE = "agent:{agent_id}:state"            # Runtime state
    AGENT_ERRORS = "agent:{agent_id}:errors"          # Error counters
    AGENT_HEARTBEAT = "agent:{agent_id}:heartbeat"    # Last seen
    AGENT_LOCK = "agent:{agent_id}:lock"              # Execution lock
    AGENT_OUTPUT = "agent:{agent_id}:output:{type}"   # Alpha/Beta/Gamma outputs

    # Task keys
    TASK_ACTIVE_AGENT = "task:{task_id}:active_agent"
    TASK_ARTIFACTS = "task:{task_id}:artifacts"

    # Project keys
    PROJECT_AGENTS = "project:{project_id}:agents"
    PROJECT_OBJECTIVE = "project:{project_id}:objective"
    PROJECT_PLAN = "project:{project_id}:plan:{plan_id}"
    PROJECT_BROADCAST = "project:{project_id}:broadcast"

    # Coordination keys
    BLACKBOARD = "blackboard:{task}:{section}"
    MESSAGE = "msg:{task}:{channel}"
    REVOCATIONS = "revocations:ledger"  # Fixed key, no placeholders
    HANDOFF = "handoff:{task}:latest"

    # History keys
    HISTORY_RUNS = "history:{agent_id}:runs"

    # ------------------------------------------------------------------
    # Agent key builders
    # ------------------------------------------------------------------
    @classmethod
    def agent_packet(cls, agent_id: str) -> str:
        """Return the instruction-packet key for ``agent_id``."""
        return cls.AGENT_PACKET.format(agent_id=agent_id)

    @classmethod
    def agent_state(cls, agent_id: str) -> str:
        """Return the runtime-state key for ``agent_id``."""
        return cls.AGENT_STATE.format(agent_id=agent_id)

    @classmethod
    def agent_errors(cls, agent_id: str) -> str:
        """Return the error-counter key for ``agent_id``."""
        return cls.AGENT_ERRORS.format(agent_id=agent_id)

    @classmethod
    def agent_heartbeat(cls, agent_id: str) -> str:
        """Return the heartbeat key for ``agent_id``."""
        return cls.AGENT_HEARTBEAT.format(agent_id=agent_id)

    @classmethod
    def agent_lock(cls, agent_id: str) -> str:
        """Return the execution-lock key for ``agent_id``."""
        return cls.AGENT_LOCK.format(agent_id=agent_id)

    @classmethod
    def agent_output(cls, agent_id: str, output_type: str) -> str:
        """Return the output key for ``agent_id`` and ``output_type``.

        ``output_type`` may be a plain string or an enum member; an enum
        member is replaced by its ``.value`` before formatting.
        """
        # Python 3.11 changed format() of mixed-in enums to follow __str__,
        # so a (str, Enum) member would otherwise render as
        # "ClassName.MEMBER" instead of its value and yield a different key.
        if isinstance(output_type, Enum):
            output_type = output_type.value
        return cls.AGENT_OUTPUT.format(agent_id=agent_id, type=output_type)

    # ------------------------------------------------------------------
    # Task key builders
    # ------------------------------------------------------------------
    @classmethod
    def task_active_agent(cls, task_id: str) -> str:
        """Return the active-agent key for ``task_id``."""
        return cls.TASK_ACTIVE_AGENT.format(task_id=task_id)

    @classmethod
    def task_artifacts(cls, task_id: str) -> str:
        """Return the artifacts key for ``task_id``."""
        return cls.TASK_ARTIFACTS.format(task_id=task_id)

    # ------------------------------------------------------------------
    # Project key builders
    # ------------------------------------------------------------------
    @classmethod
    def project_agents(cls, project_id: str) -> str:
        """Return the agents key for ``project_id``."""
        return cls.PROJECT_AGENTS.format(project_id=project_id)

    @classmethod
    def project_objective(cls, project_id: str) -> str:
        """Return the objective key for ``project_id``."""
        return cls.PROJECT_OBJECTIVE.format(project_id=project_id)

    @classmethod
    def project_plan(cls, project_id: str, plan_id: str) -> str:
        """Return the key of plan ``plan_id`` within ``project_id``."""
        return cls.PROJECT_PLAN.format(project_id=project_id, plan_id=plan_id)

    @classmethod
    def project_broadcast(cls, project_id: str) -> str:
        """Return the broadcast key for ``project_id``."""
        return cls.PROJECT_BROADCAST.format(project_id=project_id)

    # ------------------------------------------------------------------
    # Coordination key builders
    # ------------------------------------------------------------------
    @classmethod
    def blackboard(cls, task: str, section: str) -> str:
        """Return the blackboard key for ``section`` of ``task``."""
        return cls.BLACKBOARD.format(task=task, section=section)

    @classmethod
    def message(cls, task: str, channel: str) -> str:
        """Return the message key for ``channel`` of ``task``."""
        return cls.MESSAGE.format(task=task, channel=channel)

    @classmethod
    def handoff(cls, task: str) -> str:
        """Return the latest-handoff key for ``task``."""
        return cls.HANDOFF.format(task=task)

    # ------------------------------------------------------------------
    # History key builders
    # ------------------------------------------------------------------
    @classmethod
    def history_runs(cls, agent_id: str) -> str:
        """Return the run-history key for ``agent_id``."""
        return cls.HISTORY_RUNS.format(agent_id=agent_id)
# =============================================================================
# Configuration Constants
# =============================================================================
# Default Redis connection settings
DEFAULT_REDIS_HOST = "127.0.0.1"
DEFAULT_REDIS_PORT = 6379
# SECURITY: a credential should not live only in source control. The password
# is taken from the AGENT_GOVERNANCE_REDIS_PASSWORD environment variable when
# it is set; the literal below is kept as the fallback so existing
# deployments behave exactly as before.
# NOTE(review): the variable name is introduced here - wire it into the
# deployment configuration and rotate the fallback value.
DEFAULT_REDIS_PASSWORD = os.environ.get(
    "AGENT_GOVERNANCE_REDIS_PASSWORD", "governance2026"
)

# Default paths
DEFAULT_LEDGER_PATH = "/opt/agent-governance/ledger/governance.db"
DEFAULT_SCHEMA_PATH = "/opt/agent-governance/pipeline/schemas/pipeline.schema.json"
DEFAULT_TEMPLATES_PATH = "/opt/agent-governance/pipeline/templates"

# Timeouts (seconds)
DEFAULT_HEARTBEAT_TTL = 60
DEFAULT_LOCK_TTL = 300
DEFAULT_OUTPUT_TTL = 300
# =============================================================================
# Utility Functions
# =============================================================================
def get_output_type_for_phase(phase: AgentPhase) -> OutputType:
    """Return the output type mapped to ``phase`` in ``PHASE_OUTPUT_TYPES``.

    A phase with no entry in the mapping falls back to ``OutputType.ALPHA``.
    """
    try:
        return PHASE_OUTPUT_TYPES[phase]
    except KeyError:
        return OutputType.ALPHA
def is_terminal_phase(phase: AgentPhase) -> bool:
    """Return True when ``phase`` is a terminal phase (EXIT or REVOKED)."""
    terminal_phases = (AgentPhase.EXIT, AgentPhase.REVOKED)
    return any(phase == terminal for terminal in terminal_phases)
def next_phase(current: AgentPhase) -> Optional[AgentPhase]:
    """Return the phase that follows ``current`` in ``AGENT_PHASES_ORDERED``.

    Returns None when ``current`` is REVOKED, is the last phase in the
    ordered list, or is not in the ordered list at all.
    """
    if current == AgentPhase.REVOKED or current not in AGENT_PHASES_ORDERED:
        return None

    successor_idx = AGENT_PHASES_ORDERED.index(current) + 1
    if successor_idx >= len(AGENT_PHASES_ORDERED):
        # ``current`` is the final phase; nothing follows it.
        return None
    return AGENT_PHASES_ORDERED[successor_idx]