commit 77655c298c Initial commit: Agent Governance System Phase 8
Phase 8 Production Hardening with complete governance infrastructure:

- Vault integration with tiered policies (T0-T4)
- DragonflyDB state management
- SQLite audit ledger
- Pipeline DSL and templates
- Promotion/revocation engine
- Checkpoint system for session persistence
- Health manager and circuit breaker for fault tolerance
- GitHub/Slack integrations
- Architectural test pipeline with bug watcher, suggestion engine, council review
- Multi-agent chaos testing framework

Test Results:
- Governance tests: 68/68 passing
- E2E workflow: 16/16 passing
- Phase 2 Vault: 14/14 passing
- Integration tests: 27/27 passing

Coverage: 57.6% average across 12 phases

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 22:07:06 -05:00

630 lines
21 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Pipeline DSL Parser and Executor
Parses YAML pipeline definitions and orchestrates agent execution.
Architecture Reference: /opt/agent-governance/docs/ARCHITECTURE.md
Core Definitions: /opt/agent-governance/pipeline/core.py
"""
import json
import yaml
import asyncio
import argparse
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass, field
from enum import Enum
import jsonschema
import redis
import sqlite3
import sys
# Import core definitions from unified module
from pipeline.core import (
StageType,
StageStatus,
StageResult,
PipelineContext,
AgentPhase,
AGENT_PHASE_NAMES,
DEFAULT_REDIS_HOST,
DEFAULT_REDIS_PORT,
DEFAULT_REDIS_PASSWORD,
DEFAULT_LEDGER_PATH,
DEFAULT_SCHEMA_PATH,
DEFAULT_TEMPLATES_PATH,
RedisKeys,
)
# Paths and connection settings, resolved from pipeline.core defaults so all
# pipeline tooling shares a single configuration source.
SCHEMA_PATH = Path(DEFAULT_SCHEMA_PATH)  # JSON Schema used to validate pipeline definitions
TEMPLATES_PATH = Path(DEFAULT_TEMPLATES_PATH)  # directory of agent template YAML files
LEDGER_PATH = Path(DEFAULT_LEDGER_PATH)  # SQLite audit ledger database file
# DragonflyDB (Redis-protocol) connection settings
REDIS_HOST = DEFAULT_REDIS_HOST
REDIS_PORT = DEFAULT_REDIS_PORT
REDIS_PASSWORD = DEFAULT_REDIS_PASSWORD
class PipelineParser:
    """Parses and validates pipeline definitions against the JSON schema.

    The schema is loaded once at construction from SCHEMA_PATH; every parse
    method validates the result before returning it.
    """

    def __init__(self):
        # Load the JSON Schema used by validate(); fails fast at startup if
        # the schema file is missing or malformed.
        with open(SCHEMA_PATH) as f:
            self.schema = json.load(f)

    def parse_file(self, path: Path) -> Dict[str, Any]:
        """Parse and validate a pipeline file.

        YAML is assumed for .yaml/.yml extensions; anything else is read as JSON.
        Raises ValueError if the pipeline fails schema validation.
        """
        with open(path) as f:
            if path.suffix in ['.yaml', '.yml']:
                pipeline = yaml.safe_load(f)
            else:
                pipeline = json.load(f)
        self.validate(pipeline)
        return pipeline

    def parse_string(self, content: str, format: str = 'yaml') -> Dict[str, Any]:
        """Parse and validate a pipeline from a string ('yaml' or, otherwise, JSON)."""
        if format == 'yaml':
            pipeline = yaml.safe_load(content)
        else:
            pipeline = json.loads(content)
        self.validate(pipeline)
        return pipeline

    def validate(self, pipeline: Dict[str, Any]) -> bool:
        """Validate a parsed pipeline against the schema.

        Returns True on success; raises ValueError on failure, chained to the
        underlying jsonschema error so callers can inspect the schema path.
        """
        try:
            jsonschema.validate(pipeline, self.schema)
            return True
        except jsonschema.ValidationError as e:
            # BUG FIX: chain the original ValidationError instead of
            # discarding it, preserving the full validation context.
            raise ValueError(f"Pipeline validation failed: {e.message}") from e
class AgentTemplate:
    """Agent template loader.

    Resolves a named template from TEMPLATES_PATH, falling back to a
    restrictive tier-0 default when no template file exists on disk.
    """

    def __init__(self, template_name: str):
        self.name = template_name
        self.config = self._load_template()

    def _load_template(self) -> Dict[str, Any]:
        """Read `<name>.yaml` from the templates directory, or return the default."""
        candidate = TEMPLATES_PATH / f"{self.name}.yaml"
        if not candidate.exists():
            # Default: read-only planning agent with shell/SSH explicitly denied.
            return {
                "name": self.name,
                "tier": 0,
                "allowed_actions": ["read_docs", "read_inventory", "generate_plan"],
                "forbidden_actions": ["execute_shell", "ssh_connect"],
                "default_config": {}
            }
        with open(candidate) as fh:
            return yaml.safe_load(fh)

    def instantiate(self, config: Dict[str, Any] = None) -> Dict[str, Any]:
        """Return a shallow copy of the template config merged with per-run overrides."""
        merged = dict(self.config)
        if config:
            merged.update(config)
        return merged
class StageExecutor:
    """Executes individual pipeline stages.

    Each execute_* method returns a StageResult. Agent state and approval
    requests are mirrored into DragonflyDB (Redis protocol) so external
    tooling can observe in-flight runs.

    Project types (redis.Redis, PipelineContext, StageResult) are annotated
    as forward references so this module stays importable in isolation.
    """

    def __init__(self, redis_client: "redis.Redis", db_path: Path):
        self.redis = redis_client
        # Stored for ledger access; not read by any method in this class yet.
        self.db_path = db_path

    async def execute_agent_stage(
        self,
        stage: Dict[str, Any],
        context: "PipelineContext"
    ) -> "StageResult":
        """Execute an agent stage: load its template, dispatch to the matching runner."""
        stage_name = stage["name"]
        agent_config = stage.get("agent", {})
        result = StageResult(
            name=stage_name,
            status=StageStatus.RUNNING,
            started_at=datetime.utcnow()
        )
        try:
            # Load template if specified (falls back to built-in default template).
            template_name = agent_config.get("template", "default")
            template = AgentTemplate(template_name)
            agent = template.instantiate(agent_config.get("config", {}))
            # Deterministic agent ID for this run so state keys are traceable.
            agent_id = f"pipeline-{context.run_id}-{stage_name}"
            result.agent_id = agent_id
            # Store agent state in DragonflyDB for external observers.
            self.redis.hset(f"agent:{agent_id}:state", mapping={
                "status": "running",
                "stage": stage_name,
                "pipeline": context.pipeline_name,
                "started_at": datetime.utcnow().isoformat()
            })
            # Dispatch to the specialist runner matching the template type.
            if template_name == "terraform":
                output = await self._run_terraform_agent(agent, context, stage)
            elif template_name == "ansible":
                output = await self._run_ansible_agent(agent, context, stage)
            elif template_name == "code-review":
                output = await self._run_code_review_agent(agent, context, stage)
            else:
                output = await self._run_generic_agent(agent, context, stage)
            result.artifacts = output.get("artifacts", {})
            result.status = StageStatus.COMPLETED
            self.redis.hset(f"agent:{agent_id}:state", "status", "completed")
        except Exception as e:
            result.status = StageStatus.FAILED
            result.error = str(e)
            # agent_id may be unset if the failure happened before it was assigned.
            if result.agent_id:
                self.redis.hset(f"agent:{result.agent_id}:state", "status", "failed")
        result.completed_at = datetime.utcnow()
        return result

    async def execute_gate_stage(
        self,
        stage: Dict[str, Any],
        context: "PipelineContext"
    ) -> "StageResult":
        """Execute a gate (approval) stage: 'auto', 'human', or 'consensus'."""
        stage_name = stage["name"]
        gate_config = stage.get("gate", {})
        approval_type = gate_config.get("approval", "auto")
        result = StageResult(
            name=stage_name,
            status=StageStatus.RUNNING,
            started_at=datetime.utcnow()
        )
        try:
            if approval_type == "auto":
                # Auto-approve based on previous stage results.
                result.status = StageStatus.COMPLETED
            elif approval_type == "human":
                # Queue for human approval, bounded by the configured timeout.
                timeout = self._parse_timeout(gate_config.get("timeout", "30m"))
                approval = await self._wait_for_human_approval(
                    context, stage_name, timeout
                )
                result.status = StageStatus.COMPLETED if approval else StageStatus.FAILED
            elif approval_type == "consensus":
                # Multi-agent consensus.
                result.status = await self._wait_for_consensus(context, stage_name)
            else:
                # BUG FIX: an unknown approval type previously left the stage
                # stuck in RUNNING; fail fast with an explicit error instead.
                result.status = StageStatus.FAILED
                result.error = f"Unknown approval type: {approval_type}"
        except Exception as e:
            result.status = StageStatus.FAILED
            result.error = str(e)
        result.completed_at = datetime.utcnow()
        return result

    async def execute_parallel_stage(
        self,
        stage: Dict[str, Any],
        context: "PipelineContext"
    ) -> "StageResult":
        """Execute parallel branches with 'all', 'any', or 'none' wait modes."""
        stage_name = stage["name"]
        parallel_config = stage.get("parallel", {})
        branches = parallel_config.get("branches", [])
        wait_mode = parallel_config.get("wait", "all")
        result = StageResult(
            name=stage_name,
            status=StageStatus.RUNNING,
            started_at=datetime.utcnow()
        )
        try:
            tasks = [
                self._execute_stage(branch, context)
                for branch in branches
            ]
            if wait_mode == "all":
                branch_results = await asyncio.gather(*tasks)
                all_ok = all(r.status == StageStatus.COMPLETED for r in branch_results)
                result.status = StageStatus.COMPLETED if all_ok else StageStatus.FAILED
                result.artifacts["branch_results"] = [
                    {"name": r.name, "status": r.status.value}
                    for r in branch_results
                ]
            elif wait_mode == "any":
                done, pending = await asyncio.wait(
                    [asyncio.create_task(t) for t in tasks],
                    return_when=asyncio.FIRST_COMPLETED
                )
                # BUG FIX: this mode previously reported COMPLETED no matter
                # what; honor the status of the branch(es) that finished.
                any_ok = any(
                    t.result().status == StageStatus.COMPLETED for t in done
                )
                result.status = StageStatus.COMPLETED if any_ok else StageStatus.FAILED
                for p in pending:
                    p.cancel()
                result.artifacts["branch_results"] = []
            else:  # none: fire and forget
                for task in tasks:
                    asyncio.create_task(task)
                result.status = StageStatus.COMPLETED
                result.artifacts["branch_results"] = []
        except Exception as e:
            result.status = StageStatus.FAILED
            result.error = str(e)
        result.completed_at = datetime.utcnow()
        return result

    async def execute_condition_stage(
        self,
        stage: Dict[str, Any],
        context: "PipelineContext"
    ) -> "StageResult":
        """Execute a conditional stage: run 'then' or 'else' based on the 'if' expression."""
        stage_name = stage["name"]
        condition_config = stage.get("condition", {})
        result = StageResult(
            name=stage_name,
            status=StageStatus.RUNNING,
            started_at=datetime.utcnow()
        )
        try:
            condition_expr = condition_config.get("if", "true")
            condition_met = self._evaluate_condition(condition_expr, context)
            # Pick the branch matching the evaluated condition; a missing
            # branch means there is nothing to do for this stage.
            branch = condition_config.get("then" if condition_met else "else")
            if branch:
                sub_result = await self._execute_stage(branch, context)
                result.status = sub_result.status
                result.artifacts = sub_result.artifacts
            else:
                # BUG FIX: a met condition with no 'then' branch previously
                # left the stage stuck in RUNNING; mark it SKIPPED instead.
                result.status = StageStatus.SKIPPED
        except Exception as e:
            result.status = StageStatus.FAILED
            result.error = str(e)
        result.completed_at = datetime.utcnow()
        return result

    async def _execute_stage(
        self,
        stage: Dict[str, Any],
        context: "PipelineContext"
    ) -> "StageResult":
        """Route a stage dict to the executor matching its 'type' field."""
        stage_type = StageType(stage["type"])
        if stage_type == StageType.AGENT:
            return await self.execute_agent_stage(stage, context)
        elif stage_type == StageType.GATE:
            return await self.execute_gate_stage(stage, context)
        elif stage_type == StageType.PARALLEL:
            return await self.execute_parallel_stage(stage, context)
        elif stage_type == StageType.CONDITION:
            return await self.execute_condition_stage(stage, context)
        else:
            raise ValueError(f"Unknown stage type: {stage_type}")

    async def _run_terraform_agent(
        self, agent: Dict, context: "PipelineContext", stage: Dict
    ) -> Dict[str, Any]:
        """Run Terraform specialist agent.

        Placeholder - would integrate with tf-governed.sh.
        """
        return {
            "artifacts": {
                "plan_id": f"tf-plan-{context.run_id}",
                "resource_count": 0
            }
        }

    async def _run_ansible_agent(
        self, agent: Dict, context: "PipelineContext", stage: Dict
    ) -> Dict[str, Any]:
        """Run Ansible specialist agent.

        Placeholder - would integrate with ansible-governed.sh.
        """
        return {
            "artifacts": {
                "check_id": f"ansible-check-{context.run_id}",
                "task_count": 0
            }
        }

    async def _run_code_review_agent(
        self, agent: Dict, context: "PipelineContext", stage: Dict
    ) -> Dict[str, Any]:
        """Run code review agent (placeholder: returns an empty findings list)."""
        return {
            "artifacts": {
                "review_id": f"review-{context.run_id}",
                "findings": []
            }
        }

    async def _run_generic_agent(
        self, agent: Dict, context: "PipelineContext", stage: Dict
    ) -> Dict[str, Any]:
        """Run generic agent (placeholder: no artifacts)."""
        return {"artifacts": {}}

    async def _wait_for_human_approval(
        self, context: "PipelineContext", stage_name: str, timeout: timedelta
    ) -> bool:
        """Wait for human approval via DragonflyDB.

        Writes a 'pending' approval key that expires with the gate timeout.
        Placeholder: a real implementation would poll the key for a decision;
        for now it auto-approves after a short delay.
        """
        approval_key = f"pipeline:{context.run_id}:approval:{stage_name}"
        # NOTE(review): a zero or negative timeout would make `ex` invalid
        # for Redis — confirm gate timeouts are always positive.
        self.redis.set(approval_key, "pending", ex=int(timeout.total_seconds()))
        await asyncio.sleep(1)
        return True

    async def _wait_for_consensus(
        self, context: "PipelineContext", stage_name: str
    ) -> "StageStatus":
        """Wait for multi-agent consensus.

        Placeholder - would integrate with multi-agent coordination.
        """
        return StageStatus.COMPLETED

    def _evaluate_condition(self, expr: str, context: "PipelineContext") -> bool:
        """Evaluate a condition expression.

        Supports 'stage.<name>' / 'stage.<name>.completed' / 'stage.<name>.failed'
        lookups against prior stage results; any other expression is true only
        if it equals 'true' (case-insensitive).
        """
        if expr.startswith("stage."):
            parts = expr.split(".")
            # BUG FIX: guard against a bare 'stage.' expression, which
            # previously produced an empty stage name lookup / IndexError-prone
            # parsing; treat it as unmet.
            if len(parts) < 2 or not parts[1]:
                return False
            stage_name = parts[1]
            attr = parts[2] if len(parts) > 2 else "completed"
            if stage_name in context.stage_results:
                result = context.stage_results[stage_name]
                if attr == "completed":
                    return result.status == StageStatus.COMPLETED
                elif attr == "failed":
                    return result.status == StageStatus.FAILED
        return expr.lower() == "true"

    def _parse_timeout(self, timeout_str: str) -> timedelta:
        """Parse a timeout string like '45s', '30m', '2h'.

        Falls back to 30 minutes for malformed input or an unknown unit.
        """
        try:
            value = int(timeout_str[:-1])
        except (ValueError, IndexError, TypeError):
            # BUG FIX: malformed input (e.g. '30', '') previously raised.
            return timedelta(minutes=30)
        unit = timeout_str[-1]
        if unit == 's':
            return timedelta(seconds=value)
        elif unit == 'm':
            return timedelta(minutes=value)
        elif unit == 'h':
            return timedelta(hours=value)
        else:
            return timedelta(minutes=30)
class PipelineExecutor:
    """Orchestrates full pipeline execution.

    Runs stages in order via StageExecutor, mirrors run state into
    DragonflyDB, and records each run to the SQLite audit ledger.
    """

    def __init__(self):
        self.parser = PipelineParser()
        self.redis = redis.Redis(
            host=REDIS_HOST,
            port=REDIS_PORT,
            password=REDIS_PASSWORD,
            decode_responses=True
        )
        self.stage_executor = StageExecutor(self.redis, LEDGER_PATH)

    def generate_run_id(self) -> str:
        """Generate a unique 12-hex-character pipeline run ID."""
        import hashlib
        import uuid
        # BUG FIX: hashing the timestamp alone could collide for two runs
        # started in the same instant; mix in a random UUID.
        seed = f"{datetime.utcnow().isoformat()}-{uuid.uuid4()}"
        return hashlib.sha256(seed.encode()).hexdigest()[:12]

    async def execute(
        self,
        pipeline: Dict[str, Any],
        inputs: Dict[str, Any] = None
    ) -> Tuple[bool, Dict[str, Any]]:
        """Execute a pipeline.

        Returns (success, summary) where summary carries the run ID plus a
        per-stage status/duration/artifacts report.
        """
        run_id = self.generate_run_id()
        context = PipelineContext(
            pipeline_name=pipeline["name"],
            run_id=run_id,
            inputs=inputs or {}
        )
        # Store pipeline run state for external observers.
        self.redis.hset(f"pipeline:{run_id}:state", mapping={
            "name": pipeline["name"],
            "status": "running",
            "started_at": datetime.utcnow().isoformat()
        })
        print(f"Starting pipeline: {pipeline['name']} (run: {run_id})")
        success = True
        for stage in pipeline.get("stages", []):
            stage_name = stage["name"]
            # BUG FIX: the old dependency check used `continue` inside the
            # inner requirements loop, which only advanced to the next
            # requirement — the stage was executed anyway despite the
            # "Skipping" message. Evaluate all requirements, then skip.
            if not self._dependencies_met(stage, context):
                continue
            print(f" Executing stage: {stage_name}")
            result = await self.stage_executor._execute_stage(stage, context)
            context.stage_results[stage_name] = result
            if result.status == StageStatus.FAILED:
                print(f" FAILED: {result.error}")
                # Handle failure per the stage's on_failure policy.
                on_failure = stage.get("on_failure", {})
                action = on_failure.get("action", "fail")
                if action == "fail":
                    success = False
                    break
                elif action == "skip":
                    continue
                elif action == "retry":
                    retries = on_failure.get("retries", 1)
                    for i in range(retries):
                        print(f" Retry {i+1}/{retries}...")
                        result = await self.stage_executor._execute_stage(stage, context)
                        # BUG FIX: record the retried result so later
                        # dependency checks see the final outcome, not the
                        # stale failed attempt.
                        context.stage_results[stage_name] = result
                        if result.status == StageStatus.COMPLETED:
                            break
                    else:
                        # All retries exhausted without success.
                        success = False
                        break
            else:
                print(f" OK ({result.status.value})")
        # Store final state.
        self.redis.hset(f"pipeline:{run_id}:state", mapping={
            "status": "completed" if success else "failed",
            "completed_at": datetime.utcnow().isoformat()
        })
        # Record in ledger (best-effort).
        self._record_to_ledger(context, success)
        return success, {
            "run_id": run_id,
            "stages": {
                name: {
                    "status": r.status.value,
                    "duration": (r.completed_at - r.started_at).total_seconds()
                    if r.started_at and r.completed_at else 0,
                    "artifacts": r.artifacts
                }
                for name, r in context.stage_results.items()
            }
        }

    def _dependencies_met(self, stage: Dict[str, Any], context: "PipelineContext") -> bool:
        """Return True when every stage listed in 'requires' has COMPLETED."""
        stage_name = stage["name"]
        for req in stage.get("requires", []):
            if req not in context.stage_results:
                print(f" Skipping {stage_name}: dependency {req} not met")
                return False
            if context.stage_results[req].status != StageStatus.COMPLETED:
                print(f" Skipping {stage_name}: dependency {req} failed")
                return False
        return True

    def _record_to_ledger(self, context: "PipelineContext", success: bool):
        """Best-effort append of this run to the SQLite audit ledger.

        Deliberately swallows errors with a warning: a failed audit write
        must not fail the pipeline itself.
        """
        try:
            conn = sqlite3.connect(LEDGER_PATH)
            try:
                cursor = conn.cursor()
                cursor.execute("""
                    INSERT INTO agent_actions
                    (timestamp, agent_id, agent_version, tier, action, decision,
                    confidence, success, created_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    datetime.utcnow().isoformat(),
                    f"pipeline-{context.run_id}",
                    "1.0.0",
                    0,
                    f"pipeline:{context.pipeline_name}",
                    "EXECUTE",
                    1.0 if success else 0.0,
                    1 if success else 0,
                    datetime.utcnow().isoformat()
                ))
                conn.commit()
            finally:
                # BUG FIX: the connection leaked if the INSERT raised.
                conn.close()
        except Exception as e:
            print(f"Warning: Failed to record to ledger: {e}")
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser with validate/run/list subcommands."""
    parser = argparse.ArgumentParser(description="Pipeline DSL Executor")
    subparsers = parser.add_subparsers(dest="command", required=True)
    validate_parser = subparsers.add_parser("validate", help="Validate a pipeline file")
    validate_parser.add_argument("file", type=Path, help="Pipeline YAML file")
    run_parser = subparsers.add_parser("run", help="Execute a pipeline")
    run_parser.add_argument("file", type=Path, help="Pipeline YAML file")
    run_parser.add_argument("--input", "-i", action="append", nargs=2,
                            metavar=("KEY", "VALUE"), help="Input parameter")
    run_parser.add_argument("--dry-run", action="store_true", help="Validate only")
    subparsers.add_parser("list", help="List available templates")
    return parser


def _cmd_validate(args) -> None:
    """Handle 'validate': parse the file and report validity, then exit."""
    try:
        pipeline = PipelineParser().parse_file(args.file)
        print(f"Pipeline '{pipeline['name']}' is valid")
        print(f" Version: {pipeline.get('version', 'N/A')}")
        print(f" Stages: {len(pipeline.get('stages', []))}")
        sys.exit(0)
    except Exception as e:
        print(f"Validation failed: {e}")
        sys.exit(1)


def _cmd_run(args) -> None:
    """Handle 'run': validate, optionally dry-run, then execute the pipeline."""
    try:
        pipeline = PipelineParser().parse_file(args.file)
        inputs = {}
        if args.input:
            for key, value in args.input:
                inputs[key] = value
        if args.dry_run:
            print(f"Dry run: Pipeline '{pipeline['name']}' validated")
            print(f" Would execute {len(pipeline.get('stages', []))} stages")
            sys.exit(0)
        executor = PipelineExecutor()
        success, result = asyncio.run(executor.execute(pipeline, inputs))
        print(f"\nPipeline {'completed' if success else 'failed'}")
        print(f"Run ID: {result['run_id']}")
        sys.exit(0 if success else 1)
    except Exception as e:
        print(f"Execution failed: {e}")
        sys.exit(1)


def _cmd_list() -> None:
    """Handle 'list': print the template names found on disk."""
    print("Available templates:")
    for template_file in TEMPLATES_PATH.glob("*.yaml"):
        print(f" - {template_file.stem}")


def main():
    """CLI entry point: parse arguments and dispatch to the selected subcommand."""
    args = _build_arg_parser().parse_args()
    if args.command == "validate":
        _cmd_validate(args)
    elif args.command == "run":
        _cmd_run(args)
    elif args.command == "list":
        _cmd_list()


if __name__ == "__main__":
    main()