// agent-governance/agents/multi-agent/orchestrator.ts

/**
 * Multi-Agent Coordination System - Orchestrator
 * Manages parallel agent execution, spawn conditions, and metrics
 *
 * Environment variables:
 * - PIPELINE_ID: Parent pipeline ID for error reporting
 * - TASK_ID: Task ID override
 */

import type { TaskDefinition, CoordinationMetrics, SpawnCondition, AgentRole } from "./types";
import {
  Blackboard,
  MessageBus,
  AgentStateManager,
  SpawnController,
  MetricsCollector,
} from "./coordination";
import { AgentAlpha, AgentBeta, AgentGamma } from "./agents";

/** Returns the current wall-clock time as an ISO-8601 string. */
function now(): string {
  const current = new Date();
  return current.toISOString();
}

/**
 * Builds a task identifier of the form `task-<random>-<timestamp>`, where
 * both variable parts are base-36 encoded.
 */
function generateId(): string {
  const randomPart = Math.random().toString(36).slice(2, 8);
  const timePart = Date.now().toString(36);
  return `task-${randomPart}-${timePart}`;
}

/**
 * Reports an error to the parent pipeline's observability system.
 *
 * Sends a JSON POST to the UI server's error tracking API. Reporting is
 * best-effort: a non-OK response or any thrown value is written to stderr and
 * never propagated, so error reporting cannot itself cause more errors.
 *
 * @param pipelineId - Parent pipeline ID the error belongs to.
 * @param errorType - Short machine-readable error category.
 * @param severity - Severity level attached to the report.
 * @param details - Free-form description of what went wrong.
 */
async function reportErrorToObservability(
  pipelineId: string,
  errorType: string,
  severity: "low" | "medium" | "high" | "critical",
  details: string
): Promise<void> {
  try {
    const response = await fetch("http://localhost:3000/api/pipeline/errors/record", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        pipeline_id: pipelineId,
        error_type: errorType,
        severity,
        details
      })
    });
    if (!response.ok) {
      console.error(`[ERROR_REPORT] Failed to report error: ${response.status}`);
    }
  } catch (e: unknown) {
    // A rejection is not guaranteed to be an Error (it may even be null), so
    // narrow before reading `.message`; otherwise this handler could throw.
    const reason = e instanceof Error ? e.message : String(e);
    console.error(`[ERROR_REPORT] Error reporting failed: ${reason}`);
  }
}

// =============================================================================
// Multi-Agent Orchestrator
// =============================================================================

/** Abort reasons the monitor loop can raise while a task is running. */
type OrchestratorAbortReason = "stuck" | "iteration_limit";

/**
 * Runs the ALPHA and BETA agents in parallel against a shared blackboard,
 * periodically monitors spawn/abort conditions, optionally spawns and runs a
 * GAMMA agent, and returns the finalized coordination metrics.
 *
 * Typical lifecycle: `initialize()` -> `runTask(task)` -> `cleanup()`.
 *
 * The task ID is taken from the `TASK_ID` environment variable when set
 * (otherwise generated), and `PIPELINE_ID`, when set, enables reporting to the
 * parent pipeline's HTTP API.
 */
export class MultiAgentOrchestrator {
  private taskId: string;
  private pipelineId?: string;
  private blackboard!: Blackboard;
  private stateManager!: AgentStateManager;
  private spawnController!: SpawnController;
  private metrics!: MetricsCollector;

  private alphaAgent!: AgentAlpha;
  private betaAgent!: AgentBeta;
  private gammaAgent?: AgentGamma;

  private alphaBus!: MessageBus;
  private betaBus!: MessageBus;
  private gammaBus?: MessageBus;

  private model: string;
  private startTime!: number;
  private monitorInterval?: ReturnType<typeof setInterval>;
  private errorCount: number = 0;
  private iterationCount: number = 0;
  private maxIterations: number = 10;
  private lastProgressTime: number = 0;
  private progressTimeout: number = 60000; // 60 seconds without progress = stuck

  /** In-flight GAMMA spawn, shared so overlapping callers cannot spawn twice. */
  private gammaSpawnInFlight?: Promise<void>;

  /**
   * @param model - Model identifier handed to every agent this orchestrator creates.
   */
  constructor(model: string = "anthropic/claude-sonnet-4") {
    // Use environment variable for task ID if provided
    this.taskId = process.env.TASK_ID || generateId();
    this.pipelineId = process.env.PIPELINE_ID;
    this.model = model;
  }

  /** Extracts a printable message from an arbitrary thrown value. */
  private static describeError(e: unknown): string {
    return e instanceof Error ? e.message : String(e);
  }

  /**
   * POSTs `payload` as JSON to `url`.
   *
   * @throws Error when the request fails or the response status is not OK.
   */
  private static async postJson(url: string, payload: unknown): Promise<void> {
    const response = await fetch(url, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(payload)
    });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
  }

  /**
   * Counts an error, forwards it to the pipeline's observability endpoint when
   * a pipeline ID is configured, and logs it locally.
   */
  private async reportError(errorType: string, severity: "low" | "medium" | "high" | "critical", details: string): Promise<void> {
    this.errorCount++;
    if (this.pipelineId) {
      await reportErrorToObservability(this.pipelineId, errorType, severity, details);
    }
    this.log(`ERROR [${severity}] ${errorType}: ${details}`);
  }

  /** Stamps the current time as the last progress time and counts one iteration. */
  private updateProgress(): void {
    this.lastProgressTime = Date.now();
    this.iterationCount++;
  }

  /** True when more than `progressTimeout` ms have elapsed since the last recorded progress. */
  private isStuck(): boolean {
    if (this.lastProgressTime === 0) return false;
    return (Date.now() - this.lastProgressTime) > this.progressTimeout;
  }

  /** True once the iteration counter has reached `maxIterations`. */
  private isIterationLimitExceeded(): boolean {
    return this.iterationCount >= this.maxIterations;
  }

  /** Clears the monitor interval, if one is running. */
  private stopMonitoring(): void {
    if (this.monitorInterval) {
      clearInterval(this.monitorInterval);
      this.monitorInterval = undefined;
    }
  }

  /**
   * Requests token revocation for stuck agents from the pipeline API.
   * No-op without a pipeline ID; failures (including non-OK responses) are
   * logged and not propagated.
   */
  private async revokeStuckAgentTokens(): Promise<void> {
    if (!this.pipelineId) return;

    try {
      await MultiAgentOrchestrator.postJson("http://localhost:3000/api/pipeline/revoke", {
        pipeline_id: this.pipelineId,
        reason: "iteration_timeout",
        details: `Agents stuck after ${this.iterationCount} iterations`
      });
      this.log("Token revocation requested for stuck agents");
    } catch (e: unknown) {
      this.log(`Failed to revoke tokens: ${MultiAgentOrchestrator.describeError(e)}`);
    }
  }

  /**
   * Records structured failure context for auto-recovery via the pipeline API.
   * No-op without a pipeline ID; failures (including non-OK responses) are
   * logged and not propagated.
   *
   * @param reason - Why the task did not succeed.
   * @param metrics - Metrics snapshot to attach to the failure record.
   */
  private async recordFailureContext(
    reason: "stuck" | "iteration_limit" | "consensus_failed",
    metrics: CoordinationMetrics
  ): Promise<void> {
    if (!this.pipelineId) return;

    const failureContext = {
      pipeline_id: this.pipelineId,
      task_id: this.taskId,
      failure_reason: reason,
      failure_time: new Date().toISOString(),
      iteration_count: this.iterationCount,
      elapsed_ms: Date.now() - this.startTime,
      metrics: metrics,
      gamma_spawned: this.gammaAgent !== undefined,
      error_count: this.errorCount,
      recovery_hint: this.getRecoveryHint(reason)
    };

    try {
      await MultiAgentOrchestrator.postJson("http://localhost:3000/api/pipeline/failure-context", failureContext);
      this.log(`Failure context recorded: ${reason}`);
    } catch (e: unknown) {
      this.log(`Failed to record failure context: ${MultiAgentOrchestrator.describeError(e)}`);
    }
  }

  /** Maps a failure reason to a human-readable recovery suggestion. */
  private getRecoveryHint(reason: string): string {
    switch (reason) {
      case "stuck":
        return "Agents became unresponsive. Try with fresh agents or GAMMA mediator.";
      case "iteration_limit":
        return "Max iterations reached without consensus. Consider simplifying objective.";
      case "consensus_failed":
        return "Agents completed but disagreed. GAMMA mediator may help resolve conflicts.";
      default:
        return "Unknown failure. Review logs for details.";
    }
  }

  /** Logs a message prefixed with the seconds elapsed since `initialize()` started. */
  private log(msg: string) {
    const elapsed = this.startTime ? ((Date.now() - this.startTime) / 1000).toFixed(1) : "0.0";
    console.log(`[${elapsed}s] [ORCHESTRATOR] ${msg}`);
  }

  /**
   * Creates and connects the shared coordination infrastructure and the
   * ALPHA/BETA message buses, then constructs and initializes both agents.
   * Must complete before `runTask()` is called.
   */
  async initialize(): Promise<void> {
    this.startTime = Date.now();

    console.log("\n" + "=".repeat(70));
    console.log("MULTI-AGENT COORDINATION SYSTEM");
    console.log("Task ID: " + this.taskId);
    if (this.pipelineId) {
      console.log("Pipeline ID: " + this.pipelineId);
    }
    console.log("Model: " + this.model);
    console.log("=".repeat(70) + "\n");

    this.log("Initializing coordination infrastructure...");

    // Initialize shared infrastructure
    this.blackboard = new Blackboard(this.taskId);
    this.stateManager = new AgentStateManager(this.taskId);
    this.spawnController = new SpawnController(this.taskId);
    this.metrics = new MetricsCollector(this.taskId);

    await Promise.all([
      this.blackboard.connect(),
      this.stateManager.connect(),
      this.spawnController.connect(),
      this.metrics.connect(),
    ]);

    this.log("Infrastructure connected");

    // Initialize message buses for ALPHA and BETA
    this.alphaBus = new MessageBus(this.taskId, "ALPHA");
    this.betaBus = new MessageBus(this.taskId, "BETA");

    await Promise.all([
      this.alphaBus.connect(),
      this.betaBus.connect(),
    ]);

    this.log("Message buses connected");

    // Create initial agents
    this.alphaAgent = new AgentAlpha(
      this.taskId, this.blackboard, this.alphaBus, this.stateManager, this.metrics, this.model
    );
    this.betaAgent = new AgentBeta(
      this.taskId, this.blackboard, this.betaBus, this.stateManager, this.metrics, this.model
    );

    await Promise.all([
      this.alphaAgent.init(),
      this.betaAgent.init(),
    ]);

    this.log("Agents ALPHA and BETA initialized");
  }

  /**
   * Spawns and initializes the GAMMA agent at most once.
   *
   * Monitor ticks and `runTask()` can both request a spawn while an earlier
   * request is still awaiting I/O; such callers share the in-flight spawn
   * instead of creating a second bus and agent.
   *
   * @param reason - The spawn condition that triggered the request.
   */
  async spawnGamma(reason: SpawnCondition): Promise<void> {
    if (this.gammaAgent) {
      this.log("GAMMA already spawned, skipping");
      return;
    }
    if (this.gammaSpawnInFlight) {
      this.log("GAMMA spawn already in progress, waiting");
      return this.gammaSpawnInFlight;
    }

    this.gammaSpawnInFlight = this.performGammaSpawn(reason).finally(() => {
      this.gammaSpawnInFlight = undefined;
    });
    return this.gammaSpawnInFlight;
  }

  /** Connects GAMMA's bus, creates and initializes the agent, and marks it spawned. */
  private async performGammaSpawn(reason: SpawnCondition): Promise<void> {
    this.log(`SPAWNING GAMMA - Reason: ${reason.type} (threshold: ${reason.threshold}, current: ${reason.current_value})`);

    // Create message bus for GAMMA
    this.gammaBus = new MessageBus(this.taskId, "GAMMA");
    await this.gammaBus.connect();

    // Create and initialize GAMMA agent
    this.gammaAgent = new AgentGamma(
      this.taskId, this.blackboard, this.gammaBus, this.stateManager, this.metrics,
      reason.type, this.model
    );
    await this.gammaAgent.init();

    await this.spawnController.markGammaSpawned(reason);

    this.log("GAMMA agent spawned and initialized");
  }

  /**
   * One monitoring pass: checks abort conditions, then the STUCK / CONFLICT /
   * COMPLEXITY spawn conditions, and logs a status line.
   *
   * NOTE(review): `updateProgress()` refreshes `lastProgressTime` immediately
   * before `isStuck()` is evaluated, so the "stuck" branch cannot trigger as
   * written, and every pass counts as one iteration, so the iteration limit is
   * reached after `maxIterations` passes. Confirm this is the intended policy.
   *
   * @returns `abort: true` with a reason when the task should be aborted.
   */
  private async monitorConditions(): Promise<{ abort: boolean; reason?: OrchestratorAbortReason }> {
    // Track progress
    this.updateProgress();

    // Check for iteration timeout (stuck without progress)
    if (this.isStuck()) {
      this.log("TIMEOUT: No progress detected - triggering auto-recovery");
      await this.reportError("iteration_timeout", "high", "Agents stuck without progress");
      return { abort: true, reason: "stuck" };
    }

    // Check for iteration limit exceeded
    if (this.isIterationLimitExceeded()) {
      this.log("LIMIT: Maximum iterations exceeded - triggering auto-recovery");
      await this.reportError("iteration_limit", "high", `Max iterations (${this.maxIterations}) exceeded`);
      return { abort: true, reason: "iteration_limit" };
    }

    // Check stuck condition
    const stuckAgents = await this.stateManager.detectStuckAgents(30);
    if (stuckAgents.length > 0) {
      this.log(`Stuck agents detected: ${stuckAgents.join(", ")}`);
      const condition = await this.spawnController.updateCondition("STUCK", stuckAgents.length);
      if (condition?.triggered) {
        const { shouldSpawn, reason } = await this.spawnController.checkSpawnConditions();
        if (shouldSpawn && reason) {
          await this.spawnGamma(reason);
        }
      }
    }

    // Check conflict condition
    const metricsData = await this.metrics.getMetrics();
    const unresolvedConflicts = metricsData.conflicts_detected - metricsData.conflicts_resolved;
    const conflictCondition = await this.spawnController.updateCondition("CONFLICT", unresolvedConflicts);
    if (conflictCondition?.triggered && !this.spawnController.isGammaSpawned()) {
      const { shouldSpawn, reason } = await this.spawnController.checkSpawnConditions();
      if (shouldSpawn && reason) {
        await this.spawnGamma(reason);
      }
    }

    // Check complexity condition (from blackboard)
    const analysis = await this.blackboard.read("problem", "analysis");
    if (analysis?.value?.complexity_score) {
      const complexityCondition = await this.spawnController.updateCondition("COMPLEXITY", analysis.value.complexity_score);
      if (complexityCondition?.triggered && !this.spawnController.isGammaSpawned()) {
        const { shouldSpawn, reason } = await this.spawnController.checkSpawnConditions();
        if (shouldSpawn && reason) {
          await this.spawnGamma(reason);
        }
      }
    }

    // Log current state
    const states = await this.stateManager.getAllStates();
    const statesSummary = states.map(s => `${s.role}:${s.status}(${(s.progress * 100).toFixed(0)}%)`).join(", ");
    this.log(`Status: ${statesSummary} | Messages: ${metricsData.total_messages} | Conflicts: ${unresolvedConflicts} | Iter: ${this.iterationCount}`);

    return { abort: false };
  }

  /**
   * Runs one monitoring pass from the interval timer.
   *
   * The pass is async and nothing awaits the timer callback, so any failure is
   * caught and logged here rather than surfacing as an unhandled rejection.
   *
   * @param onAbort - Invoked with the reason when the pass decides to abort.
   */
  private async monitorTick(onAbort: (reason: OrchestratorAbortReason) => void): Promise<void> {
    try {
      const verdict = await this.monitorConditions();
      if (verdict.abort && verdict.reason) {
        this.stopMonitoring();
        onAbort(verdict.reason);
      }
    } catch (e: unknown) {
      this.log(`Monitor tick failed: ${MultiAgentOrchestrator.describeError(e)}`);
    }
  }

  /**
   * Executes a task: writes it to the blackboard, runs ALPHA and BETA in
   * parallel under a 2-second monitor loop, optionally spawns and runs GAMMA,
   * then checks consensus and finalizes metrics.
   *
   * The wait for ALPHA/BETA ends when both finish, when `task.timeout_seconds`
   * elapses, or as soon as the monitor raises an abort. The timeout timer and
   * the monitor interval are always released, even if this method throws.
   *
   * @param task - Task definition to execute.
   * @returns Finalized metrics; `abort_reason` is set when the monitor aborted the task.
   */
  async runTask(task: TaskDefinition): Promise<CoordinationMetrics & { abort_reason?: string }> {
    this.log(`Starting task: ${task.objective.slice(0, 60)}...`);
    this.lastProgressTime = Date.now();

    // Write task to blackboard
    await this.blackboard.write("problem", "task_definition", task, "ALPHA");

    // Abort state is written by monitor ticks; the promise lets the wait below
    // end immediately instead of only noticing the abort after agents finish.
    const abortState: { reason?: OrchestratorAbortReason } = {};
    let signalAbort: () => void = () => undefined;
    const abortSignalled = new Promise<void>(resolve => { signalAbort = resolve; });

    // Start monitoring with abort detection
    this.monitorInterval = setInterval(() => {
      void this.monitorTick(reason => {
        if (abortState.reason === undefined) {
          abortState.reason = reason;
        }
        signalAbort();
      });
    }, 2000);

    // Overall task timeout; the handle is kept so the timer can be cleared.
    let expire: () => void = () => undefined;
    const timeoutPromise = new Promise<void>(resolve => { expire = resolve; });
    const timeoutHandle = setTimeout(() => expire(), task.timeout_seconds * 1000);

    try {
      // Run agents in parallel
      this.log("Launching ALPHA and BETA in parallel...");

      const alphaPromise = this.alphaAgent.run(task).catch(async (e: unknown) => {
        await this.reportError("agent_failure", "high", `ALPHA error: ${MultiAgentOrchestrator.describeError(e)}`);
      });

      const betaPromise = this.betaAgent.run(task).catch(async (e: unknown) => {
        await this.reportError("agent_failure", "high", `BETA error: ${MultiAgentOrchestrator.describeError(e)}`);
      });

      // Wait for initial agents to complete, the timeout, or a monitor abort
      await Promise.race([
        Promise.all([alphaPromise, betaPromise]),
        timeoutPromise,
        abortSignalled,
      ]);

      // Check if we were aborted during execution
      if (abortState.reason) {
        const abortReason = abortState.reason;
        this.log(`Task aborted: ${abortReason}`);
        this.stopMonitoring();

        // Revoke tokens for stuck agents
        await this.revokeStuckAgentTokens();

        // Get partial metrics and mark as failed
        const partialMetrics = await this.metrics.finalize(false);
        await this.recordFailureContext(abortReason, partialMetrics);

        return { ...partialMetrics, abort_reason: abortReason };
      }

      this.log("Initial agents completed or timeout reached");

      // Check if GAMMA needs to be spawned for success validation
      const states = await this.stateManager.getAllStates();
      const bothComplete = states.every(s => s.status === "WAITING" || s.status === "COMPLETED");

      if (bothComplete && !this.spawnController.isGammaSpawned()) {
        await this.spawnController.updateCondition("SUCCESS", 1.0);
        const { shouldSpawn, reason } = await this.spawnController.checkSpawnConditions();
        if (shouldSpawn && reason) {
          await this.spawnGamma(reason);
        }
      }

      // A monitor tick may still be mid-spawn; let it settle so GAMMA is not
      // run before its initialization finished. A failure there is logged by
      // the tick that started the spawn.
      if (this.gammaSpawnInFlight) {
        await this.gammaSpawnInFlight.catch(() => undefined);
      }

      // If GAMMA was spawned, run it
      if (this.gammaAgent) {
        this.log("Running GAMMA for resolution...");
        await this.gammaAgent.run(task).catch(async (e: unknown) => {
          await this.reportError("agent_failure", "high", `GAMMA error: ${MultiAgentOrchestrator.describeError(e)}`);
        });
      }

      // Stop monitoring
      this.stopMonitoring();

      // Check consensus
      const consensus = await this.blackboard.checkConsensus("synthesis", ["ALPHA", "BETA"]);
      const consensusAchieved = consensus.reached ||
        (await this.blackboard.read("consensus", "final"))?.value?.achieved === true;

      this.log(`Consensus achieved: ${consensusAchieved}`);

      // Finalize metrics
      const finalMetrics = await this.metrics.finalize(consensusAchieved);

      // If consensus failed, record structured failure context
      if (!consensusAchieved) {
        await this.recordFailureContext("consensus_failed", finalMetrics);
      }

      return finalMetrics;
    } finally {
      // Release timers on every exit path so nothing keeps firing after a throw.
      clearTimeout(timeoutHandle);
      this.stopMonitoring();
    }
  }

  /**
   * Stops monitoring and disconnects every bus and infrastructure component
   * that was created. Cleanup is best-effort: each disconnect is attempted and
   * a failure is logged without preventing the others or rejecting.
   */
  async cleanup(): Promise<void> {
    this.log("Cleaning up...");

    this.stopMonitoring();

    const components: Array<[string, { disconnect(): unknown } | undefined]> = [
      ["alphaBus", this.alphaBus],
      ["betaBus", this.betaBus],
      ["gammaBus", this.gammaBus],
      ["blackboard", this.blackboard],
      ["stateManager", this.stateManager],
      ["spawnController", this.spawnController],
      ["metrics", this.metrics],
    ];

    await Promise.all(components.map(async ([name, component]) => {
      try {
        // Components are undefined when initialize() never ran or failed early.
        await component?.disconnect();
      } catch (e: unknown) {
        this.log(`Failed to disconnect ${name}: ${MultiAgentOrchestrator.describeError(e)}`);
      }
    }));

    this.log("Cleanup complete");
  }

  /** Returns the task ID this orchestrator operates under. */
  getTaskId(): string {
    return this.taskId;
  }
}

// =============================================================================
// Performance Analysis
// =============================================================================

/**
 * Prints a human-readable performance report for a finished run to stdout:
 * timing, communication volume, conflict handling, GAMMA usage, outcome and a
 * simple threshold analysis.
 *
 * When `end_time` is missing or the timestamps cannot be parsed, the duration
 * is reported as 0 and the communication overhead as "n/a" instead of a
 * division-by-zero artifact.
 *
 * @param metrics - Metrics of the run to report on.
 */
export function analyzePerformance(metrics: CoordinationMetrics): void {
  console.log("\n" + "=".repeat(70));
  console.log("PERFORMANCE ANALYSIS");
  console.log("=".repeat(70));

  const elapsedSeconds = metrics.end_time
    ? (new Date(metrics.end_time).getTime() - new Date(metrics.start_time).getTime()) / 1000
    : 0;
  // Unparseable timestamps yield NaN; treat that like an unknown duration.
  const duration = Number.isFinite(elapsedSeconds) ? elapsedSeconds : 0;

  console.log("\nTiming:");
  console.log(`  Duration: ${duration.toFixed(1)}s`);

  // Only compute a rate when there is a positive duration to divide by.
  const coordinationOps = metrics.total_messages + metrics.blackboard_writes;
  const overhead = duration > 0
    ? `${(coordinationOps / duration).toFixed(2)} ops/sec`
    : "n/a (no measurable duration)";

  console.log("\nCommunication:");
  console.log(`  Total messages: ${metrics.total_messages}`);
  console.log(`  Direct messages: ${metrics.direct_messages}`);
  console.log(`  Blackboard writes: ${metrics.blackboard_writes}`);
  console.log(`  Blackboard reads: ${metrics.blackboard_reads}`);
  console.log(`  Communication overhead: ${overhead}`);

  console.log("\nCoordination:");
  console.log(`  Conflicts detected: ${metrics.conflicts_detected}`);
  console.log(`  Conflicts resolved: ${metrics.conflicts_resolved}`);
  console.log(`  Conflict resolution rate: ${metrics.conflicts_detected > 0 ? ((metrics.conflicts_resolved / metrics.conflicts_detected) * 100).toFixed(1) : 100}%`);

  console.log("\nGamma Agent:");
  console.log(`  Spawned: ${metrics.gamma_spawned ? "Yes" : "No"}`);
  if (metrics.gamma_spawned) {
    console.log(`  Spawn reason: ${metrics.gamma_spawn_reason}`);
    console.log(`  Spawn time: ${metrics.gamma_spawn_time}`);
  }

  console.log("\nOutcome:");
  console.log(`  Consensus achieved: ${metrics.final_consensus ? "Yes" : "No"}`);
  console.log(`  Performance score: ${(metrics.performance_score * 100).toFixed(1)}%`);

  // Threshold analysis
  console.log("\nThreshold Effects:");
  const messageThreshold = 50;
  const conflictThreshold = 3;

  if (metrics.total_messages > messageThreshold) {
    console.log(`  ! High message volume (${metrics.total_messages} > ${messageThreshold}) - potential coordination overhead`);
  } else {
    console.log(`  + Message volume within threshold (${metrics.total_messages} <= ${messageThreshold})`);
  }

  if (metrics.conflicts_detected > conflictThreshold) {
    console.log(`  ! High conflict rate (${metrics.conflicts_detected} > ${conflictThreshold}) - agents may have divergent strategies`);
  } else {
    console.log(`  + Conflict rate within threshold (${metrics.conflicts_detected} <= ${conflictThreshold})`);
  }

  if (metrics.gamma_spawned && metrics.gamma_spawn_reason === "STUCK") {
    console.log(`  ! Gamma spawned due to stuck condition - consider adjusting agent strategies`);
  }

  console.log("\n" + "=".repeat(70));
}

// =============================================================================
// CLI Entry Point
// =============================================================================

/**
 * CLI entry point.
 *
 * Usage: `[objective] [--model <id>] [--timeout <seconds>]`. The first
 * argument is used as the objective unless it is a `--flag`; otherwise a
 * built-in default objective is used. An invalid or non-positive `--timeout`
 * is ignored in favour of the default of 120 seconds.
 *
 * Always terminates the process. Exit codes:
 * 0 = consensus achieved, 1 = crash or exception, 2 = consensus failure,
 * 3 = aborted (timeout/stuck/iteration limit - auto-recovery needed).
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);

  // Default complex task
  const defaultObjective = `Design a distributed event-driven architecture for a real-time analytics platform that handles:
1) High-throughput data ingestion from multiple sources
2) Stream processing with exactly-once semantics
3) Real-time aggregations and windowed computations
4) Low-latency query serving for dashboards
5) Horizontal scalability to handle 1M events/second
The solution should consider fault tolerance, data consistency, and cost optimization.`;

  // A leading flag (e.g. `--model x`) is an option, not the objective.
  const firstArg = args[0];
  const objective = firstArg && !firstArg.startsWith("--") ? firstArg : defaultObjective;

  let model = "anthropic/claude-sonnet-4";
  const modelIdx = args.indexOf("--model");
  const modelArg = modelIdx !== -1 ? args[modelIdx + 1] : undefined;
  if (modelArg) {
    model = modelArg;
  }

  // Parse timeout
  let timeout = 120;
  const timeoutIdx = args.indexOf("--timeout");
  const timeoutArg = timeoutIdx !== -1 ? args[timeoutIdx + 1] : undefined;
  if (timeoutArg) {
    const parsedTimeout = Number.parseInt(timeoutArg, 10);
    if (Number.isFinite(parsedTimeout) && parsedTimeout > 0) {
      timeout = parsedTimeout;
    } else {
      // A NaN timeout would make the task timer fire almost immediately.
      console.error(`[ORCHESTRATOR] Ignoring invalid --timeout value "${timeoutArg}"; using ${timeout}s`);
    }
  }

  const task: TaskDefinition = {
    task_id: generateId(),
    objective,
    complexity: "high",
    subtasks: [
      { id: "s1", description: "Analyze data ingestion requirements", status: "pending", dependencies: [] },
      { id: "s2", description: "Design stream processing pipeline", status: "pending", dependencies: ["s1"] },
      { id: "s3", description: "Plan storage and query layer", status: "pending", dependencies: ["s1"] },
      { id: "s4", description: "Define scalability strategy", status: "pending", dependencies: ["s2", "s3"] },
      { id: "s5", description: "Integrate fault tolerance mechanisms", status: "pending", dependencies: ["s4"] },
    ],
    constraints: [
      "Must use open-source technologies where possible",
      "Latency < 100ms for query responses",
      "Support for multiple data formats (JSON, Avro, Protobuf)",
      "Cost-effective for variable workloads",
    ],
    success_criteria: [
      "Complete architecture design with component diagrams",
      "Data flow specifications",
      "Scalability analysis",
      "Fault tolerance mechanisms documented",
      "Cost estimation provided",
    ],
    timeout_seconds: timeout,
  };

  const orchestrator = new MultiAgentOrchestrator(model);
  // Errors are only reported upstream when a parent pipeline ID is configured.
  const pipelineId = process.env.PIPELINE_ID;

  let exitCode = 0;
  try {
    await orchestrator.initialize();
    const metrics = await orchestrator.runTask(task);
    const abortReason = metrics.abort_reason;

    console.log("\n" + "=".repeat(70));
    console.log("FINAL METRICS");
    console.log("=".repeat(70));
    console.log(JSON.stringify(metrics, null, 2));

    analyzePerformance(metrics);

    // Output special marker for server to parse consensus status
    // Format: ORCHESTRATION_RESULT:{"consensus":true/false,"metrics":{...},"abort_reason":...}
    console.log("\nORCHESTRATION_RESULT:" + JSON.stringify({
      consensus: metrics.final_consensus,
      task_id: metrics.task_id,
      metrics: metrics,
      abort_reason: abortReason || null,
      requires_auto_recovery: !metrics.final_consensus || !!abortReason
    }));

    if (abortReason) {
      console.log(`\n[ORCHESTRATOR] Task aborted: ${abortReason} - exiting with code 3 (auto-recovery needed)`);
      exitCode = 3;

      if (pipelineId) {
        await reportErrorToObservability(pipelineId, "orchestrator_aborted", "high",
          `Orchestration aborted: ${abortReason}. Auto-recovery required.`);
      }
    } else if (!metrics.final_consensus) {
      console.log("\n[ORCHESTRATOR] Consensus NOT achieved - exiting with code 2 (auto-recovery needed)");
      exitCode = 2;

      // Report consensus failure to observability
      if (pipelineId) {
        await reportErrorToObservability(pipelineId, "consensus_failure", "high",
          `Agents failed to reach consensus. Conflicts: ${metrics.conflicts_detected}, Resolved: ${metrics.conflicts_resolved}`);
      }
    }

  } catch (e: unknown) {
    const message = e instanceof Error ? e.message : String(e);
    console.error("Orchestrator error:", message);
    exitCode = 1;

    // Report critical error to observability if pipeline ID is set
    if (pipelineId) {
      await reportErrorToObservability(pipelineId, "orchestrator_failure", "critical", message);
    }
  } finally {
    // A cleanup failure must not replace the exit code computed above, which
    // the parent process relies on to decide about auto-recovery.
    try {
      await orchestrator.cleanup();
    } catch (e: unknown) {
      console.error("Cleanup error:", e instanceof Error ? e.message : String(e));
    }
    // Explicitly exit to ensure all connections are closed
    process.exit(exitCode);
  }
}

// Last-resort handler: anything that escapes main() is logged and the process
// exits with code 1.
main().then(undefined, (fatal) => {
  console.error("Fatal error:", fatal);
  process.exit(1);
});