Phase 8 Production Hardening with complete governance infrastructure: - Vault integration with tiered policies (T0-T4) - DragonflyDB state management - SQLite audit ledger - Pipeline DSL and templates - Promotion/revocation engine - Checkpoint system for session persistence - Health manager and circuit breaker for fault tolerance - GitHub/Slack integrations - Architectural test pipeline with bug watcher, suggestion engine, council review - Multi-agent chaos testing framework Test Results: - Governance tests: 68/68 passing - E2E workflow: 16/16 passing - Phase 2 Vault: 14/14 passing - Integration tests: 27/27 passing Coverage: 57.6% average across 12 phases Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
470 lines
15 KiB
Python
470 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Pipeline Core Definitions
|
|
=========================
|
|
|
|
This module contains the authoritative definitions for the agent governance
|
|
pipeline system. All other code (tests, demos, orchestrators) should import
|
|
from here to ensure consistency with the architecture specification.
|
|
|
|
Architecture Reference: /opt/agent-governance/docs/ARCHITECTURE.md Section 4.2
|
|
"""
|
|
|
|
from enum import Enum
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from typing import Dict, List, Any, Optional
|
|
import hashlib
|
|
import json
|
|
|
|
|
|
# =============================================================================
|
|
# Agent Lifecycle Phases (OFFICIAL)
|
|
# =============================================================================
|
|
# These phases define the complete agent lifecycle as specified in ARCHITECTURE.md
|
|
# Order: BOOTSTRAP -> PREFLIGHT -> PLAN -> EXECUTE -> VERIFY -> PACKAGE -> REPORT -> EXIT
|
|
|
|
class AgentPhase(str, Enum):
    """
    Lifecycle phases of an agent, as string-valued enum members.

    Taken from ARCHITECTURE.md Section 4.2. Every member's value is identical
    to its name. REVOKED is a terminal state rather than a step in the normal
    BOOTSTRAP -> ... -> EXIT progression.
    """

    # Agent initialization and authentication
    BOOTSTRAP = "BOOTSTRAP"
    # Pre-execution validation (sandbox, inventory, dependencies)
    PREFLIGHT = "PREFLIGHT"
    # Generate and validate the execution plan
    PLAN = "PLAN"
    # Perform the planned actions
    EXECUTE = "EXECUTE"
    # Validate execution results
    VERIFY = "VERIFY"
    # Bundle artifacts and evidence
    PACKAGE = "PACKAGE"
    # Generate the completion report
    REPORT = "REPORT"
    # Clean shutdown and resource release
    EXIT = "EXIT"
    # Agent was revoked (terminal state)
    REVOKED = "REVOKED"
|
|
|
|
|
|
# Lifecycle phases in execution order. Enum iteration follows definition
# order, so this yields BOOTSTRAP through EXIT; REVOKED is filtered out
# because it is a terminal state, not a step in the progression.
AGENT_PHASES_ORDERED = [
    phase for phase in AgentPhase if phase is not AgentPhase.REVOKED
]

# The same ordering expressed as plain string values, for compatibility.
AGENT_PHASE_NAMES = [phase.value for phase in AGENT_PHASES_ORDERED]
|
|
|
|
|
|
# =============================================================================
|
|
# Agent Status (OFFICIAL)
|
|
# =============================================================================
|
|
|
|
class AgentStatus(str, Enum):
    """
    Runtime status of an agent, as string-valued enum members.

    Values follow ARCHITECTURE.md and runtime/governance.py. Every member's
    value is identical to its name.
    """

    # Awaiting start
    PENDING = "PENDING"
    # Initialization in progress
    STARTING = "STARTING"
    # Actively executing
    RUNNING = "RUNNING"
    # Temporarily suspended (for plan clarification)
    PAUSED = "PAUSED"
    # Successfully finished
    COMPLETED = "COMPLETED"
    # Execution failed
    FAILED = "FAILED"
    # Forcibly terminated
    REVOKED = "REVOKED"
    # Recovering from a chaos/error condition
    RECOVERING = "RECOVERING"
|
|
|
|
|
|
# =============================================================================
|
|
# Pipeline Stage Types (OFFICIAL)
|
|
# =============================================================================
|
|
|
|
class StageType(str, Enum):
    """
    Kinds of pipeline stage, mirroring pipeline.schema.json.

    Member values are the lowercase strings used in pipeline definitions.
    """

    # Executes an agent task
    AGENT = "agent"
    # Approval/consensus checkpoint (human or automated)
    GATE = "gate"
    # Concurrent execution of multiple branches
    PARALLEL = "parallel"
    # Conditional branching (if/then/else)
    CONDITION = "condition"
|
|
|
|
|
|
class StageStatus(str, Enum):
    """Execution status of a single pipeline stage (lowercase string values)."""

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    SKIPPED = "skipped"
|
|
|
|
|
|
# =============================================================================
|
|
# Output Types (Alpha/Beta/Gamma)
|
|
# =============================================================================
|
|
|
|
class OutputType(str, Enum):
    """
    Classification of agent outputs, used for checkpoint tracking.

    Outputs are graded from draft (ALPHA) through refined (BETA) to final
    (GAMMA). Member values are the lowercase names.
    """

    # Initial/draft outputs (plans, analysis)
    ALPHA = "alpha"
    # Refined outputs (validated plans, partial results)
    BETA = "beta"
    # Final outputs (completed work, verified results)
    GAMMA = "gamma"
|
|
|
|
|
|
# Typical output classification for each lifecycle phase, grouped by type.
# Key order matches the lifecycle order (BOOTSTRAP first, EXIT last).
PHASE_OUTPUT_TYPES = {
    # Early phases yield draft output.
    **dict.fromkeys(
        (AgentPhase.BOOTSTRAP, AgentPhase.PREFLIGHT),
        OutputType.ALPHA,
    ),
    # Planning and execution yield refined output.
    **dict.fromkeys(
        (AgentPhase.PLAN, AgentPhase.EXECUTE),
        OutputType.BETA,
    ),
    # Verification onwards yields final output.
    **dict.fromkeys(
        (
            AgentPhase.VERIFY,
            AgentPhase.PACKAGE,
            AgentPhase.REPORT,
            AgentPhase.EXIT,
        ),
        OutputType.GAMMA,
    ),
}
|
|
|
|
|
|
# =============================================================================
|
|
# Chaos Conditions
|
|
# =============================================================================
|
|
|
|
class ChaosCondition(str, Enum):
    """
    Fault conditions available for injection when testing resilience.

    Member values are lowercase snake_case strings.
    """

    NONE = "none"
    TOKEN_REVOKED = "token_revoked"
    LOCK_LOST = "lock_lost"
    STATE_CORRUPTED = "state_corrupted"
    HEARTBEAT_TIMEOUT = "heartbeat_timeout"
    ERROR_SPIKE = "error_spike"
    NETWORK_DELAY = "network_delay"
|
|
|
|
|
|
class ViolationSeverity(str, Enum):
    """
    Severity levels assigned to violations.

    Kept in sync with runtime/revocation.py. Consequence per level:

    - CRITICAL: immediate revocation + alert
    - HIGH: immediate revocation
    - MEDIUM: warning, second offense = revoke
    - LOW: warning only
    """

    CRITICAL = "critical"
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
|
|
|
|
|
|
class ViolationType(str, Enum):
    """
    Complete violation taxonomy, kept in sync with runtime/revocation.py.

    Members are grouped below by the severity they are classified under;
    each member's value is identical to its name.
    """

    # --- Critical: immediate revocation + alert ---
    UNAUTHORIZED_POOL = "UNAUTHORIZED_POOL"
    APPLY_WITHOUT_PLAN = "APPLY_WITHOUT_PLAN"
    RUN_WITHOUT_CHECK = "RUN_WITHOUT_CHECK"
    UNAUTHORIZED_PROD = "UNAUTHORIZED_PROD"
    UNRECORDED_ROOT = "UNRECORDED_ROOT"
    BASELINE_MUTATION = "BASELINE_MUTATION"

    # --- High: immediate revocation ---
    ERROR_BUDGET_EXCEEDED = "ERROR_BUDGET_EXCEEDED"
    PROCEDURE_VIOLATION = "PROCEDURE_VIOLATION"
    HEARTBEAT_TIMEOUT = "HEARTBEAT_TIMEOUT"
    LOCK_EXPIRED = "LOCK_EXPIRED"

    # --- Medium: warning, then revocation ---
    SCOPE_VIOLATION = "SCOPE_VIOLATION"
    FORBIDDEN_ACTION = "FORBIDDEN_ACTION"

    # --- Low: warning only ---
    CONFIDENCE_BELOW_THRESHOLD = "CONFIDENCE_BELOW_THRESHOLD"
    MISSING_ARTIFACT = "MISSING_ARTIFACT"
|
|
|
|
|
|
# Older name for ViolationType, retained for backwards compatibility.
RevocationType = ViolationType

# Severity assigned to each violation type, grouped by severity level.
VIOLATION_SEVERITY_MAP = {
    **dict.fromkeys(
        (
            ViolationType.UNAUTHORIZED_POOL,
            ViolationType.APPLY_WITHOUT_PLAN,
            ViolationType.RUN_WITHOUT_CHECK,
            ViolationType.UNAUTHORIZED_PROD,
            ViolationType.UNRECORDED_ROOT,
            ViolationType.BASELINE_MUTATION,
        ),
        ViolationSeverity.CRITICAL,
    ),
    **dict.fromkeys(
        (
            ViolationType.ERROR_BUDGET_EXCEEDED,
            ViolationType.PROCEDURE_VIOLATION,
            ViolationType.HEARTBEAT_TIMEOUT,
            ViolationType.LOCK_EXPIRED,
        ),
        ViolationSeverity.HIGH,
    ),
    **dict.fromkeys(
        (ViolationType.SCOPE_VIOLATION, ViolationType.FORBIDDEN_ACTION),
        ViolationSeverity.MEDIUM,
    ),
    **dict.fromkeys(
        (ViolationType.CONFIDENCE_BELOW_THRESHOLD, ViolationType.MISSING_ARTIFACT),
        ViolationSeverity.LOW,
    ),
}
|
|
|
|
|
|
# =============================================================================
|
|
# Integration Event Types
|
|
# =============================================================================
|
|
|
|
class IntegrationEventType(str, Enum):
    """
    Events emitted to external integrations (GitHub, Slack, etc.).

    Events correspond to AgentPhase lifecycle steps and governance actions:

    Lifecycle
        - PLAN_CREATED: AgentPhase.PLAN completed
        - EXECUTION_STARTED: AgentPhase.EXECUTE started
        - EXECUTION_COMPLETE: AgentPhase.EXECUTE completed

    Governance
        - VIOLATION_DETECTED: a ViolationType was triggered
        - PROMOTION_REQUESTED: tier upgrade request
        - PROMOTION_APPROVED: tier upgrade granted
        - AGENT_REVOKED: agent token revoked

    Workflow
        - APPROVAL_REQUIRED: a StageType.GATE was reached
        - HEARTBEAT: agent health signal
    """

    # Lifecycle
    PLAN_CREATED = "plan_created"
    EXECUTION_STARTED = "execution_started"
    EXECUTION_COMPLETE = "execution_complete"

    # Governance
    VIOLATION_DETECTED = "violation_detected"
    PROMOTION_REQUESTED = "promotion_requested"
    PROMOTION_APPROVED = "promotion_approved"
    AGENT_REVOKED = "agent_revoked"

    # Workflow
    APPROVAL_REQUIRED = "approval_required"
    HEARTBEAT = "heartbeat"
|
|
|
|
|
|
# Lifecycle phase associated with each phase-bound integration event.
# Event types without an entry here are not mapped to a phase.
INTEGRATION_EVENT_PHASE_MAP = {
    IntegrationEventType.PLAN_CREATED: AgentPhase.PLAN,
    # Both execution events belong to the EXECUTE phase.
    **dict.fromkeys(
        (
            IntegrationEventType.EXECUTION_STARTED,
            IntegrationEventType.EXECUTION_COMPLETE,
        ),
        AgentPhase.EXECUTE,
    ),
    IntegrationEventType.AGENT_REVOKED: AgentPhase.REVOKED,
}
|
|
|
|
|
|
# =============================================================================
|
|
# Data Classes
|
|
# =============================================================================
|
|
|
|
def _naive_utc_now() -> datetime:
    """
    Return the current UTC time as a naive ``datetime`` (``tzinfo`` is None).

    Yields the same kind of value as ``datetime.utcnow()`` without calling
    it; ``utcnow()`` is deprecated as of Python 3.12.
    """
    # Local import: this module only imports the `datetime` class itself.
    from datetime import timezone

    # Drop tzinfo so the result stays comparable with existing naive values.
    return datetime.now(timezone.utc).replace(tzinfo=None)


@dataclass
class AgentOutput:
    """
    Represents an Alpha/Beta/Gamma output from an agent at a checkpoint.

    Attributes:
        agent_id: Identifier of the agent that produced the output.
        output_type: Alpha/Beta/Gamma classification of the output.
        phase: Lifecycle phase the output belongs to.
        content: Output payload; must be JSON-serializable whenever the
            checksum has to be derived from it.
        timestamp: Creation time as a naive UTC datetime; defaults to now.
        checksum: Short content digest. When left empty it is filled in with
            the first 12 hex characters of the SHA-256 of ``content``
            serialized as key-sorted JSON.

    Raises:
        TypeError: From ``json.dumps`` if ``checksum`` is empty and
            ``content`` is not JSON-serializable.
    """

    agent_id: str
    output_type: OutputType
    phase: AgentPhase
    content: Dict[str, Any]
    timestamp: datetime = field(default_factory=_naive_utc_now)
    checksum: str = ""

    def __post_init__(self):
        # An explicitly supplied checksum is kept as-is.
        if not self.checksum:
            # sort_keys makes the digest independent of dict insertion order.
            self.checksum = hashlib.sha256(
                json.dumps(self.content, sort_keys=True).encode()
            ).hexdigest()[:12]
|
|
|
|
|
|
@dataclass
class StageResult:
    """
    Outcome record for one executed pipeline stage.

    Only ``name`` and ``status`` are required. The remaining fields default
    to ``None``, except ``artifacts``, which defaults to a fresh empty dict
    per instance.
    """

    # Required
    name: str
    status: StageStatus

    # Optional timing information
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None

    # Optional outputs / diagnostics
    artifacts: Dict[str, Any] = field(default_factory=dict)
    error: Optional[str] = None
    agent_id: Optional[str] = None
|
|
|
|
|
|
@dataclass
class PipelineContext:
    """
    Runtime context handed from stage to stage during a pipeline run.

    ``pipeline_name``, ``run_id`` and ``inputs`` are required. The three
    remaining mappings each start out as a fresh empty dict per instance.
    """

    # Required identity and inputs of the run
    pipeline_name: str
    run_id: str
    inputs: Dict[str, Any]

    # Per-run mappings, empty by default
    variables: Dict[str, Any] = field(default_factory=dict)
    artifacts: Dict[str, Any] = field(default_factory=dict)
    stage_results: Dict[str, StageResult] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class ErrorBudget:
    """
    Error budget limits applied to agents.

    Defaults: 12 total errors, 3 repeats of the same error, and 1 procedure
    violation.
    """

    max_total_errors: int = 12
    max_same_error_repeats: int = 3
    max_procedure_violations: int = 1
|
|
|
|
|
|
@dataclass
class ClarifiedPlan:
    """
    Clarified plan that overwatch broadcasts once an error threshold is hit.

    Every field is required except ``acknowledged_by``, which starts out as a
    fresh empty list per instance.
    """

    # Identity and cause
    plan_id: str
    trigger_reason: str

    # Material consulted while clarifying
    history_reviewed: List[str]
    outputs_analyzed: List[str]

    # Objective before and after clarification, plus the resulting changes
    original_objective: str
    clarified_objective: str
    adjustments: List[str]

    # Broadcast bookkeeping
    broadcast_at: datetime
    acknowledged_by: List[str] = field(default_factory=list)
|
|
|
|
|
|
# =============================================================================
|
|
# DragonflyDB Key Patterns (OFFICIAL)
|
|
# =============================================================================
|
|
|
|
class RedisKeys:
    """
    Official DragonflyDB keyspace patterns from ARCHITECTURE.md.

    The upper-case attributes are ``str.format`` templates; the lower-case
    classmethods fill a template in and return the concrete key.
    """

    # Agent keys
    AGENT_PACKET = "agent:{agent_id}:packet"          # Instruction packet
    AGENT_STATE = "agent:{agent_id}:state"            # Runtime state
    AGENT_ERRORS = "agent:{agent_id}:errors"          # Error counters
    AGENT_HEARTBEAT = "agent:{agent_id}:heartbeat"    # Last seen
    AGENT_LOCK = "agent:{agent_id}:lock"              # Execution lock
    AGENT_OUTPUT = "agent:{agent_id}:output:{type}"   # Alpha/Beta/Gamma outputs

    # Task keys
    TASK_ACTIVE_AGENT = "task:{task_id}:active_agent"
    TASK_ARTIFACTS = "task:{task_id}:artifacts"

    # Project keys
    PROJECT_AGENTS = "project:{project_id}:agents"
    PROJECT_OBJECTIVE = "project:{project_id}:objective"
    PROJECT_PLAN = "project:{project_id}:plan:{plan_id}"
    PROJECT_BROADCAST = "project:{project_id}:broadcast"

    # Coordination keys
    BLACKBOARD = "blackboard:{task}:{section}"
    MESSAGE = "msg:{task}:{channel}"
    REVOCATIONS = "revocations:ledger"
    HANDOFF = "handoff:{task}:latest"

    # History keys
    HISTORY_RUNS = "history:{agent_id}:runs"

    # ------------------------------------------------------------------
    # Agent key builders
    # ------------------------------------------------------------------

    @classmethod
    def agent_packet(cls, agent_id: str) -> str:
        """Return the instruction-packet key for ``agent_id``."""
        return cls.AGENT_PACKET.format(agent_id=agent_id)

    @classmethod
    def agent_state(cls, agent_id: str) -> str:
        """Return the runtime-state key for ``agent_id``."""
        return cls.AGENT_STATE.format(agent_id=agent_id)

    @classmethod
    def agent_errors(cls, agent_id: str) -> str:
        """Return the error-counter key for ``agent_id``."""
        return cls.AGENT_ERRORS.format(agent_id=agent_id)

    @classmethod
    def agent_heartbeat(cls, agent_id: str) -> str:
        """Return the heartbeat key for ``agent_id``."""
        return cls.AGENT_HEARTBEAT.format(agent_id=agent_id)

    @classmethod
    def agent_lock(cls, agent_id: str) -> str:
        """Return the execution-lock key for ``agent_id``."""
        return cls.AGENT_LOCK.format(agent_id=agent_id)

    @classmethod
    def agent_output(cls, agent_id: str, output_type: str) -> str:
        """
        Return the output key for ``agent_id`` and ``output_type``.

        ``output_type`` may be a plain string or an enum member; an enum
        member is replaced by its ``.value`` before formatting.
        """
        # Since Python 3.11, formatting a (str, Enum) member yields
        # "ClassName.MEMBER" rather than the value, which would corrupt the
        # key. Normalizing keeps keys identical across Python versions.
        if isinstance(output_type, Enum):
            output_type = output_type.value
        return cls.AGENT_OUTPUT.format(agent_id=agent_id, type=output_type)

    # ------------------------------------------------------------------
    # Task key builders
    # ------------------------------------------------------------------

    @classmethod
    def task_active_agent(cls, task_id: str) -> str:
        """Return the active-agent key for ``task_id``."""
        return cls.TASK_ACTIVE_AGENT.format(task_id=task_id)

    @classmethod
    def task_artifacts(cls, task_id: str) -> str:
        """Return the artifacts key for ``task_id``."""
        return cls.TASK_ARTIFACTS.format(task_id=task_id)

    # ------------------------------------------------------------------
    # Project key builders
    # ------------------------------------------------------------------

    @classmethod
    def project_agents(cls, project_id: str) -> str:
        """Return the agents key for ``project_id``."""
        return cls.PROJECT_AGENTS.format(project_id=project_id)

    @classmethod
    def project_objective(cls, project_id: str) -> str:
        """Return the objective key for ``project_id``."""
        return cls.PROJECT_OBJECTIVE.format(project_id=project_id)

    @classmethod
    def project_plan(cls, project_id: str, plan_id: str) -> str:
        """Return the key of plan ``plan_id`` within ``project_id``."""
        return cls.PROJECT_PLAN.format(project_id=project_id, plan_id=plan_id)

    @classmethod
    def project_broadcast(cls, project_id: str) -> str:
        """Return the broadcast key for ``project_id``."""
        return cls.PROJECT_BROADCAST.format(project_id=project_id)

    # ------------------------------------------------------------------
    # Coordination key builders
    # ------------------------------------------------------------------

    @classmethod
    def blackboard(cls, task: str, section: str) -> str:
        """Return the blackboard key for ``section`` of ``task``."""
        return cls.BLACKBOARD.format(task=task, section=section)

    @classmethod
    def message(cls, task: str, channel: str) -> str:
        """Return the message key for ``channel`` of ``task``."""
        return cls.MESSAGE.format(task=task, channel=channel)

    @classmethod
    def handoff(cls, task: str) -> str:
        """Return the latest-handoff key for ``task``."""
        return cls.HANDOFF.format(task=task)

    # ------------------------------------------------------------------
    # History key builders
    # ------------------------------------------------------------------

    @classmethod
    def history_runs(cls, agent_id: str) -> str:
        """Return the run-history key for ``agent_id``."""
        return cls.HISTORY_RUNS.format(agent_id=agent_id)
|
|
|
|
|
|
# =============================================================================
|
|
# Configuration Constants
|
|
# =============================================================================
|
|
|
|
# --- Redis connection defaults ---
DEFAULT_REDIS_HOST = "127.0.0.1"
DEFAULT_REDIS_PORT = 6379
# NOTE(review): this credential is committed in source; consider supplying it
# from the environment or a secret store instead.
DEFAULT_REDIS_PASSWORD = "governance2026"

# --- Filesystem defaults ---
DEFAULT_LEDGER_PATH = "/opt/agent-governance/ledger/governance.db"
DEFAULT_SCHEMA_PATH = "/opt/agent-governance/pipeline/schemas/pipeline.schema.json"
DEFAULT_TEMPLATES_PATH = "/opt/agent-governance/pipeline/templates"

# --- TTLs / timeouts, in seconds ---
DEFAULT_HEARTBEAT_TTL = 60   # one minute
DEFAULT_LOCK_TTL = 5 * 60    # five minutes
DEFAULT_OUTPUT_TTL = 5 * 60  # five minutes
|
|
|
|
|
|
# =============================================================================
|
|
# Utility Functions
|
|
# =============================================================================
|
|
|
|
def get_output_type_for_phase(phase: AgentPhase) -> OutputType:
    """
    Look up the output type associated with ``phase``.

    Returns:
        The entry from ``PHASE_OUTPUT_TYPES`` for ``phase``, or
        ``OutputType.ALPHA`` when the phase has no entry.
    """
    try:
        return PHASE_OUTPUT_TYPES[phase]
    except KeyError:
        # Phases without a mapping fall back to ALPHA.
        return OutputType.ALPHA
|
|
|
|
|
|
def is_terminal_phase(phase: AgentPhase) -> bool:
    """Return True when ``phase`` is one of the terminal phases (EXIT or REVOKED)."""
    terminal_phases = (AgentPhase.EXIT, AgentPhase.REVOKED)
    return phase in terminal_phases
|
|
|
|
|
|
def next_phase(current: AgentPhase) -> Optional[AgentPhase]:
    """
    Return the phase that follows ``current`` in ``AGENT_PHASES_ORDERED``.

    Returns:
        The successor phase, or ``None`` when ``current`` is REVOKED, is the
        last ordered phase, or does not appear in the ordered list.
    """
    if current == AgentPhase.REVOKED:
        return None

    # Walk (phase, successor) pairs. The last phase never appears on the left
    # of a pair, so it falls through to None, as does an unknown phase.
    for phase, successor in zip(AGENT_PHASES_ORDERED, AGENT_PHASES_ORDERED[1:]):
        if phase == current:
            return successor
    return None
|