/**
 * Multi-Agent Coordination System - Orchestrator
 * Manages parallel agent execution, spawn conditions, and metrics
 *
 * Environment variables:
 * - PIPELINE_ID: Parent pipeline ID for error reporting
 * - TASK_ID: Task ID override
 */

import type { TaskDefinition, CoordinationMetrics, SpawnCondition, AgentRole } from "./types";
import {
  Blackboard,
  MessageBus,
  AgentStateManager,
  SpawnController,
  MetricsCollector,
} from "./coordination";
import { AgentAlpha, AgentBeta, AgentGamma } from "./agents";

/** Base URL of the UI server that receives error, revocation and failure-context reports. */
const OBSERVABILITY_BASE_URL = "http://localhost:3000";

/** Delay between two monitor ticks while a task is running. */
const MONITOR_INTERVAL_MS = 2000;

/** Model used when neither the constructor argument nor `--model` provides one. */
const DEFAULT_MODEL = "anthropic/claude-sonnet-4";

/** Task timeout used when `--timeout` is absent or not a positive integer. */
const DEFAULT_TIMEOUT_SECONDS = 120;

/** Objective used by the CLI when no objective is passed as the first argument. */
const DEFAULT_OBJECTIVE = [
  "Design a distributed event-driven architecture for a real-time analytics platform that handles:",
  "1) High-throughput data ingestion from multiple sources",
  "2) Stream processing with exactly-once semantics",
  "3) Real-time aggregations and windowed computations",
  "4) Low-latency query serving for dashboards",
  "5) Horizontal scalability to handle 1M events/second",
  "The solution should consider fault tolerance, data consistency, and cost optimization.",
].join(" ");

type ErrorSeverity = "low" | "medium" | "high" | "critical";

/** Reasons for which a failure context is recorded. */
type FailureReason = "stuck" | "iteration_limit" | "consensus_failed";

/** Subset of failure reasons that the monitor can raise while agents are running. */
type AbortReason = Exclude<FailureReason, "consensus_failed">;

/** Outcome of one monitor tick. */
type MonitorResult = { abort: false } | { abort: true; reason: AbortReason };

/** Metrics returned by `runTask`; `abort_reason` is only present when the run was aborted. */
export type OrchestrationMetrics = CoordinationMetrics & { abort_reason?: string };

/** Current time as an ISO-8601 string. */
function now(): string {
  return new Date().toISOString();
}

/** Builds an id of the form `task-<random base36>-<timestamp base36>`. */
function generateId(): string {
  return "task-" + Math.random().toString(36).slice(2, 8) + "-" + Date.now().toString(36);
}

/** Extracts a printable message from a value caught in a `catch` clause. */
function errorMessage(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/** POSTs `payload` as JSON to `path` on the observability server. */
function postJson(path: string, payload: unknown): ReturnType<typeof fetch> {
  return fetch(`${OBSERVABILITY_BASE_URL}${path}`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });
}

/**
 * Error reporting to parent pipeline's observability system.
 *
 * Best effort: a non-2xx response or a network failure is logged to stderr and
 * never thrown, so reporting an error cannot itself fail the caller.
 */
async function reportErrorToObservability(
  pipelineId: string,
  errorType: string,
  severity: ErrorSeverity,
  details: string
): Promise<void> {
  try {
    // Report to the UI server's error tracking API
    const response = await postJson("/api/pipeline/errors/record", {
      pipeline_id: pipelineId,
      error_type: errorType,
      severity,
      details,
    });
    if (!response.ok) {
      console.error(`[ERROR_REPORT] Failed to report error: ${response.status}`);
    }
  } catch (e: unknown) {
    // Deliberately swallowed - don't let error reporting cause more errors
    console.error(`[ERROR_REPORT] Error reporting failed: ${errorMessage(e)}`);
  }
}

// =============================================================================
// Multi-Agent Orchestrator
// =============================================================================

/**
 * Runs agents ALPHA and BETA in parallel on a task, monitors them on a fixed
 * interval, spawns agent GAMMA when the spawn controller says so, and
 * finalizes coordination metrics.
 *
 * Typical use: `initialize()`, then `runTask(task)`, then `cleanup()`.
 */
export class MultiAgentOrchestrator {
  private taskId: string;
  private pipelineId?: string;
  private blackboard!: Blackboard;
  private stateManager!: AgentStateManager;
  private spawnController!: SpawnController;
  private metrics!: MetricsCollector;
  private alphaAgent!: AgentAlpha;
  private betaAgent!: AgentBeta;
  private gammaAgent?: AgentGamma;
  private alphaBus!: MessageBus;
  private betaBus!: MessageBus;
  private gammaBus?: MessageBus;
  private model: string;
  private startTime!: number;
  private monitorInterval?: ReturnType<typeof setInterval>;
  // Set while a GAMMA spawn is running so concurrent callers share it.
  private gammaSpawnInFlight?: Promise<void>;
  private errorCount: number = 0;
  private iterationCount: number = 0;
  private maxIterations: number = 10;
  private lastProgressTime: number = 0;
  private progressTimeout: number = 60000; // 60 seconds without progress = stuck

  /**
   * @param model Model identifier handed to every agent.
   */
  constructor(model: string = DEFAULT_MODEL) {
    // Use environment variable for task ID if provided
    this.taskId = process.env.TASK_ID || generateId();
    this.pipelineId = process.env.PIPELINE_ID;
    this.model = model;
  }

  /** Counts the error, forwards it to observability when a pipeline ID is set, and logs it. */
  private async reportError(errorType: string, severity: ErrorSeverity, details: string): Promise<void> {
    this.errorCount++;
    if (this.pipelineId) {
      await reportErrorToObservability(this.pipelineId, errorType, severity, details);
    }
    this.log(`ERROR [${severity}] ${errorType}: ${details}`);
  }

  /** Stamps the progress clock and counts one iteration (called once per monitor tick). */
  private updateProgress(): void {
    this.lastProgressTime = Date.now();
    this.iterationCount++;
  }

  /**
   * True when more than `progressTimeout` ms have passed since the last progress stamp.
   *
   * NOTE(review): `monitorConditions` calls `updateProgress()` immediately before
   * this check, so the stamp is always fresh there and this effectively never
   * fires. Confirm what should count as "progress" before changing it.
   */
  private isStuck(): boolean {
    if (this.lastProgressTime === 0) return false;
    return Date.now() - this.lastProgressTime > this.progressTimeout;
  }

  /** True once the number of monitor ticks has reached `maxIterations`. */
  private isIterationLimitExceeded(): boolean {
    return this.iterationCount >= this.maxIterations;
  }

  /** Stops the periodic monitor if it is running. Safe to call repeatedly. */
  private stopMonitoring(): void {
    if (this.monitorInterval !== undefined) {
      clearInterval(this.monitorInterval);
      this.monitorInterval = undefined;
    }
  }

  /** Request token revocation for stuck agents. No-op without a pipeline ID; never throws. */
  private async revokeStuckAgentTokens(): Promise<void> {
    if (!this.pipelineId) return;
    try {
      const response = await postJson("/api/pipeline/revoke", {
        pipeline_id: this.pipelineId,
        reason: "iteration_timeout",
        details: `Agents stuck after ${this.iterationCount} iterations`,
      });
      if (response.ok) {
        this.log("Token revocation requested for stuck agents");
      } else {
        this.log(`Failed to revoke tokens: HTTP ${response.status}`);
      }
    } catch (e: unknown) {
      this.log(`Failed to revoke tokens: ${errorMessage(e)}`);
    }
  }

  /** Record structured failure context for auto-recovery. No-op without a pipeline ID; never throws. */
  private async recordFailureContext(reason: FailureReason, metrics: CoordinationMetrics): Promise<void> {
    if (!this.pipelineId) return;
    const failureContext = {
      pipeline_id: this.pipelineId,
      task_id: this.taskId,
      failure_reason: reason,
      failure_time: now(),
      iteration_count: this.iterationCount,
      elapsed_ms: Date.now() - this.startTime,
      metrics: metrics,
      gamma_spawned: this.gammaAgent !== undefined,
      error_count: this.errorCount,
      recovery_hint: this.getRecoveryHint(reason),
    };
    try {
      const response = await postJson("/api/pipeline/failure-context", failureContext);
      if (response.ok) {
        this.log(`Failure context recorded: ${reason}`);
      } else {
        this.log(`Failed to record failure context: HTTP ${response.status}`);
      }
    } catch (e: unknown) {
      this.log(`Failed to record failure context: ${errorMessage(e)}`);
    }
  }

  /** Maps a failure reason to a human-readable recovery suggestion. */
  private getRecoveryHint(reason: string): string {
    switch (reason) {
      case "stuck":
        return "Agents became unresponsive. Try with fresh agents or GAMMA mediator.";
      case "iteration_limit":
        return "Max iterations reached without consensus. Consider simplifying objective.";
      case "consensus_failed":
        return "Agents completed but disagreed. GAMMA mediator may help resolve conflicts.";
      default:
        return "Unknown failure. Review logs for details.";
    }
  }

  /** Logs `msg` prefixed with the seconds elapsed since `initialize()` ("0.0" before it). */
  private log(msg: string): void {
    const elapsed = this.startTime ? ((Date.now() - this.startTime) / 1000).toFixed(1) : "0.0";
    console.log(`[${elapsed}s] [ORCHESTRATOR] ${msg}`);
  }

  /**
   * Connects the shared infrastructure and message buses, then creates and
   * initializes agents ALPHA and BETA. Must complete before `runTask`.
   */
  async initialize(): Promise<void> {
    this.startTime = Date.now();

    console.log("\n" + "=".repeat(70));
    console.log("MULTI-AGENT COORDINATION SYSTEM");
    console.log("Task ID: " + this.taskId);
    if (this.pipelineId) {
      console.log("Pipeline ID: " + this.pipelineId);
    }
    console.log("Model: " + this.model);
    console.log("=".repeat(70) + "\n");

    this.log("Initializing coordination infrastructure...");

    // Initialize shared infrastructure
    this.blackboard = new Blackboard(this.taskId);
    this.stateManager = new AgentStateManager(this.taskId);
    this.spawnController = new SpawnController(this.taskId);
    this.metrics = new MetricsCollector(this.taskId);

    await Promise.all([
      this.blackboard.connect(),
      this.stateManager.connect(),
      this.spawnController.connect(),
      this.metrics.connect(),
    ]);
    this.log("Infrastructure connected");

    // Initialize message buses for ALPHA and BETA
    this.alphaBus = new MessageBus(this.taskId, "ALPHA");
    this.betaBus = new MessageBus(this.taskId, "BETA");
    await Promise.all([this.alphaBus.connect(), this.betaBus.connect()]);
    this.log("Message buses connected");

    // Create initial agents
    this.alphaAgent = new AgentAlpha(
      this.taskId,
      this.blackboard,
      this.alphaBus,
      this.stateManager,
      this.metrics,
      this.model
    );
    this.betaAgent = new AgentBeta(
      this.taskId,
      this.blackboard,
      this.betaBus,
      this.stateManager,
      this.metrics,
      this.model
    );
    await Promise.all([this.alphaAgent.init(), this.betaAgent.init()]);
    this.log("Agents ALPHA and BETA initialized");
  }

  /**
   * Creates, connects and initializes agent GAMMA, then marks it as spawned.
   *
   * Does nothing when GAMMA already exists. When another spawn is still in
   * progress (the monitor tick and `runTask` can both request one), this call
   * waits for that spawn instead of creating a second agent.
   *
   * @param reason Spawn condition that triggered the spawn.
   */
  async spawnGamma(reason: SpawnCondition): Promise<void> {
    if (this.gammaAgent) {
      this.log("GAMMA already spawned, skipping");
      return;
    }
    if (this.gammaSpawnInFlight) {
      this.log("GAMMA spawn already in progress, waiting for it");
      await this.gammaSpawnInFlight;
      return;
    }

    this.gammaSpawnInFlight = this.createGamma(reason);
    try {
      await this.gammaSpawnInFlight;
    } finally {
      this.gammaSpawnInFlight = undefined;
    }
  }

  /** Performs the actual GAMMA bus/agent creation for `spawnGamma`. */
  private async createGamma(reason: SpawnCondition): Promise<void> {
    this.log(
      `SPAWNING GAMMA - Reason: ${reason.type} (threshold: ${reason.threshold}, current: ${reason.current_value})`
    );

    // Create message bus for GAMMA
    const bus = new MessageBus(this.taskId, "GAMMA");
    this.gammaBus = bus;
    await bus.connect();

    // Create and initialize GAMMA agent
    const agent = new AgentGamma(
      this.taskId,
      this.blackboard,
      bus,
      this.stateManager,
      this.metrics,
      reason.type,
      this.model
    );
    this.gammaAgent = agent;
    await agent.init();

    await this.spawnController.markGammaSpawned(reason);
    this.log("GAMMA agent spawned and initialized");
  }

  /** Asks the spawn controller whether GAMMA should be spawned and spawns it if so. */
  private async spawnGammaIfWarranted(): Promise<void> {
    const { shouldSpawn, reason } = await this.spawnController.checkSpawnConditions();
    if (shouldSpawn && reason) {
      await this.spawnGamma(reason);
    }
  }

  /**
   * One monitor tick: counts an iteration, checks the abort conditions, feeds
   * the STUCK / CONFLICT / COMPLEXITY values to the spawn controller (spawning
   * GAMMA when triggered) and logs a status line.
   *
   * @returns `{ abort: true, reason }` when the run must be aborted, otherwise `{ abort: false }`.
   */
  private async monitorConditions(): Promise<MonitorResult> {
    // Track progress
    this.updateProgress();

    // Check for iteration timeout (stuck without progress)
    if (this.isStuck()) {
      this.log("TIMEOUT: No progress detected - triggering auto-recovery");
      await this.reportError("iteration_timeout", "high", "Agents stuck without progress");
      return { abort: true, reason: "stuck" };
    }

    // Check for iteration limit exceeded
    if (this.isIterationLimitExceeded()) {
      this.log("LIMIT: Maximum iterations exceeded - triggering auto-recovery");
      await this.reportError("iteration_limit", "high", `Max iterations (${this.maxIterations}) exceeded`);
      return { abort: true, reason: "iteration_limit" };
    }

    // Check stuck condition
    const stuckAgents = await this.stateManager.detectStuckAgents(30);
    if (stuckAgents.length > 0) {
      this.log(`Stuck agents detected: ${stuckAgents.join(", ")}`);
      const condition = await this.spawnController.updateCondition("STUCK", stuckAgents.length);
      if (condition?.triggered) {
        await this.spawnGammaIfWarranted();
      }
    }

    // Check conflict condition
    const metricsData = await this.metrics.getMetrics();
    const unresolvedConflicts = metricsData.conflicts_detected - metricsData.conflicts_resolved;
    const conflictCondition = await this.spawnController.updateCondition("CONFLICT", unresolvedConflicts);
    if (conflictCondition?.triggered && !this.spawnController.isGammaSpawned()) {
      await this.spawnGammaIfWarranted();
    }

    // Check complexity condition (from blackboard)
    const analysis = await this.blackboard.read("problem", "analysis");
    if (analysis?.value?.complexity_score) {
      const complexityCondition = await this.spawnController.updateCondition(
        "COMPLEXITY",
        analysis.value.complexity_score
      );
      if (complexityCondition?.triggered && !this.spawnController.isGammaSpawned()) {
        await this.spawnGammaIfWarranted();
      }
    }

    // Log current state
    const states = await this.stateManager.getAllStates();
    const statesSummary = states
      .map(s => `${s.role}:${s.status}(${(s.progress * 100).toFixed(0)}%)`)
      .join(", ");
    this.log(
      `Status: ${statesSummary} | Messages: ${metricsData.total_messages} | Conflicts: ${unresolvedConflicts} | Iter: ${this.iterationCount}`
    );

    return { abort: false };
  }

  /**
   * Runs `task` with ALPHA and BETA in parallel while a monitor ticks every
   * `MONITOR_INTERVAL_MS`, optionally runs GAMMA, checks consensus and
   * finalizes the metrics.
   *
   * The wait for the initial agents ends when both agents settle, when
   * `task.timeout_seconds` elapses, or as soon as the monitor requests an
   * abort. On abort the returned metrics carry `abort_reason`.
   *
   * @param task Task to execute; it is also written to the blackboard.
   * @returns Final (or partial, on abort) coordination metrics.
   */
  async runTask(task: TaskDefinition): Promise<OrchestrationMetrics> {
    this.log(`Starting task: ${task.objective.slice(0, 60)}...`);
    this.lastProgressTime = Date.now();

    // Write task to blackboard
    await this.blackboard.write("problem", "task_definition", task, "ALPHA");

    // Track if we need to abort due to timeout/iteration limit.
    // Declared without an initializer on purpose: it is assigned from the
    // monitor closure, and an initializer would make the checker narrow it away.
    let abortReason: AbortReason | undefined;
    let signalAbort: () => void = () => {};
    const abortPromise = new Promise<void>(resolve => {
      signalAbort = resolve;
    });

    // Start monitoring with abort detection. Ticks never overlap, and a
    // failing tick is logged instead of surfacing as an unhandled rejection.
    let tickInFlight = false;
    const tick = async (): Promise<void> => {
      if (tickInFlight) return;
      tickInFlight = true;
      try {
        const result = await this.monitorConditions();
        if (result.abort) {
          abortReason = result.reason;
          this.stopMonitoring();
          signalAbort();
        }
      } catch (e: unknown) {
        this.log(`Monitor tick failed: ${errorMessage(e)}`);
      } finally {
        tickInFlight = false;
      }
    };
    this.monitorInterval = setInterval(() => void tick(), MONITOR_INTERVAL_MS);

    let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
    try {
      // Run agents in parallel
      this.log("Launching ALPHA and BETA in parallel...");
      const alphaPromise = this.alphaAgent.run(task).catch(async (e: unknown) => {
        await this.reportError("agent_failure", "high", `ALPHA error: ${errorMessage(e)}`);
      });
      const betaPromise = this.betaAgent.run(task).catch(async (e: unknown) => {
        await this.reportError("agent_failure", "high", `BETA error: ${errorMessage(e)}`);
      });

      // Wait for initial agents to complete (or timeout, or monitor abort)
      const timeoutPromise = new Promise<void>(resolve => {
        timeoutHandle = setTimeout(resolve, task.timeout_seconds * 1000);
      });
      await Promise.race([Promise.all([alphaPromise, betaPromise]), timeoutPromise, abortPromise]);

      // Check if we were aborted during execution
      if (abortReason) {
        this.log(`Task aborted: ${abortReason}`);
        this.stopMonitoring();

        // Revoke tokens for stuck agents
        await this.revokeStuckAgentTokens();

        // Get partial metrics and mark as failed
        const partialMetrics = await this.metrics.finalize(false);
        await this.recordFailureContext(abortReason, partialMetrics);
        return { ...partialMetrics, abort_reason: abortReason };
      }

      this.log("Initial agents completed or timeout reached");

      // Check if GAMMA needs to be spawned for success validation
      const states = await this.stateManager.getAllStates();
      const bothComplete = states.every(s => s.status === "WAITING" || s.status === "COMPLETED");
      if (bothComplete && !this.spawnController.isGammaSpawned()) {
        await this.spawnController.updateCondition("SUCCESS", 1.0);
        await this.spawnGammaIfWarranted();
      }

      // If GAMMA was spawned, run it
      if (this.gammaAgent) {
        this.log("Running GAMMA for resolution...");
        await this.gammaAgent.run(task).catch(async (e: unknown) => {
          await this.reportError("agent_failure", "high", `GAMMA error: ${errorMessage(e)}`);
        });
      }

      // Stop monitoring
      this.stopMonitoring();

      // Check consensus
      const consensus = await this.blackboard.checkConsensus("synthesis", ["ALPHA", "BETA"]);
      const consensusAchieved =
        consensus.reached ||
        (await this.blackboard.read("consensus", "final"))?.value?.achieved === true;
      this.log(`Consensus achieved: ${consensusAchieved}`);

      // Finalize metrics
      const finalMetrics = await this.metrics.finalize(consensusAchieved);

      // If consensus failed, record structured failure context
      if (!consensusAchieved) {
        await this.recordFailureContext("consensus_failed", finalMetrics);
      }

      return finalMetrics;
    } finally {
      // Never leave timers behind, including when a step above throws.
      if (timeoutHandle !== undefined) {
        clearTimeout(timeoutHandle);
      }
      this.stopMonitoring();
    }
  }

  /**
   * Stops monitoring and disconnects every bus and infrastructure component.
   *
   * Safe to call when `initialize()` did not complete. A failing disconnect is
   * logged and does not prevent the remaining ones or reject this method.
   */
  async cleanup(): Promise<void> {
    this.log("Cleaning up...");
    this.stopMonitoring();

    const results = await Promise.allSettled([
      this.alphaBus?.disconnect(),
      this.betaBus?.disconnect(),
      this.gammaBus?.disconnect(),
      this.blackboard?.disconnect(),
      this.stateManager?.disconnect(),
      this.spawnController?.disconnect(),
      this.metrics?.disconnect(),
    ]);
    for (const result of results) {
      if (result.status === "rejected") {
        this.log(`Disconnect failed: ${errorMessage(result.reason)}`);
      }
    }

    this.log("Cleanup complete");
  }

  /** Task ID in use (the `TASK_ID` environment variable or a generated id). */
  getTaskId(): string {
    return this.taskId;
  }
}

// =============================================================================
// Performance Analysis
// =============================================================================

/**
 * Prints a human-readable performance report for `metrics` to stdout.
 *
 * The duration is 0 when `end_time` is missing or the timestamps cannot be
 * parsed; in that case the communication overhead is reported as "n/a"
 * instead of dividing by zero.
 */
export function analyzePerformance(metrics: CoordinationMetrics): void {
  console.log("\n" + "=".repeat(70));
  console.log("PERFORMANCE ANALYSIS");
  console.log("=".repeat(70));

  const rawDuration = metrics.end_time
    ? (new Date(metrics.end_time).getTime() - new Date(metrics.start_time).getTime()) / 1000
    : 0;
  const duration = Number.isFinite(rawDuration) ? rawDuration : 0;

  console.log("\nTiming:");
  console.log(` Duration: ${duration.toFixed(1)}s`);

  const operations = metrics.total_messages + metrics.blackboard_writes;
  const overhead = duration > 0 ? `${(operations / duration).toFixed(2)} ops/sec` : "n/a";

  console.log("\nCommunication:");
  console.log(` Total messages: ${metrics.total_messages}`);
  console.log(` Direct messages: ${metrics.direct_messages}`);
  console.log(` Blackboard writes: ${metrics.blackboard_writes}`);
  console.log(` Blackboard reads: ${metrics.blackboard_reads}`);
  console.log(` Communication overhead: ${overhead}`);

  console.log("\nCoordination:");
  console.log(` Conflicts detected: ${metrics.conflicts_detected}`);
  console.log(` Conflicts resolved: ${metrics.conflicts_resolved}`);
  console.log(
    ` Conflict resolution rate: ${metrics.conflicts_detected > 0 ? ((metrics.conflicts_resolved / metrics.conflicts_detected) * 100).toFixed(1) : 100}%`
  );

  console.log("\nGamma Agent:");
  console.log(` Spawned: ${metrics.gamma_spawned ? "Yes" : "No"}`);
  if (metrics.gamma_spawned) {
    console.log(` Spawn reason: ${metrics.gamma_spawn_reason}`);
    console.log(` Spawn time: ${metrics.gamma_spawn_time}`);
  }

  console.log("\nOutcome:");
  console.log(` Consensus achieved: ${metrics.final_consensus ? "Yes" : "No"}`);
  console.log(` Performance score: ${(metrics.performance_score * 100).toFixed(1)}%`);

  // Threshold analysis
  console.log("\nThreshold Effects:");
  const messageThreshold = 50;
  const conflictThreshold = 3;

  if (metrics.total_messages > messageThreshold) {
    console.log(
      ` ! High message volume (${metrics.total_messages} > ${messageThreshold}) - potential coordination overhead`
    );
  } else {
    console.log(` + Message volume within threshold (${metrics.total_messages} <= ${messageThreshold})`);
  }

  if (metrics.conflicts_detected > conflictThreshold) {
    console.log(
      ` ! High conflict rate (${metrics.conflicts_detected} > ${conflictThreshold}) - agents may have divergent strategies`
    );
  } else {
    console.log(` + Conflict rate within threshold (${metrics.conflicts_detected} <= ${conflictThreshold})`);
  }

  if (metrics.gamma_spawned && metrics.gamma_spawn_reason === "STUCK") {
    console.log(` ! Gamma spawned due to stuck condition - consider adjusting agent strategies`);
  }

  console.log("\n" + "=".repeat(70));
}

// =============================================================================
// CLI Entry Point
// =============================================================================

/**
 * CLI entry point.
 *
 * Usage: `[objective] [--model <id>] [--timeout <seconds>]`
 *
 * Exit codes:
 *   0 = Success (consensus achieved)
 *   1 = Error (crash or exception)
 *   2 = Consensus failure (agents completed but no agreement)
 *   3 = Aborted (timeout/stuck/iteration limit - auto-recovery needed)
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const pipelineId = process.env.PIPELINE_ID;

  /** Value following the first occurrence of `flag`, if any. */
  const flagValue = (flag: string): string | undefined => {
    const idx = args.indexOf(flag);
    return idx !== -1 ? args[idx + 1] : undefined;
  };

  // The objective is the first argument, unless that argument is a flag.
  const firstArg = args[0];
  const objective = firstArg && !firstArg.startsWith("--") ? firstArg : DEFAULT_OBJECTIVE;

  const model = flagValue("--model") || DEFAULT_MODEL;

  // Parse timeout; a missing, non-numeric or non-positive value keeps the default
  // (a NaN delay would make the task timeout fire immediately).
  let timeout = DEFAULT_TIMEOUT_SECONDS;
  const timeoutArg = flagValue("--timeout");
  if (timeoutArg) {
    const parsed = Number.parseInt(timeoutArg, 10);
    if (Number.isFinite(parsed) && parsed > 0) {
      timeout = parsed;
    } else {
      console.error(`Invalid --timeout value "${timeoutArg}", using ${DEFAULT_TIMEOUT_SECONDS}s`);
    }
  }

  const task: TaskDefinition = {
    task_id: generateId(),
    objective,
    complexity: "high",
    subtasks: [
      { id: "s1", description: "Analyze data ingestion requirements", status: "pending", dependencies: [] },
      { id: "s2", description: "Design stream processing pipeline", status: "pending", dependencies: ["s1"] },
      { id: "s3", description: "Plan storage and query layer", status: "pending", dependencies: ["s1"] },
      { id: "s4", description: "Define scalability strategy", status: "pending", dependencies: ["s2", "s3"] },
      { id: "s5", description: "Integrate fault tolerance mechanisms", status: "pending", dependencies: ["s4"] },
    ],
    constraints: [
      "Must use open-source technologies where possible",
      "Latency < 100ms for query responses",
      "Support for multiple data formats (JSON, Avro, Protobuf)",
      "Cost-effective for variable workloads",
    ],
    success_criteria: [
      "Complete architecture design with component diagrams",
      "Data flow specifications",
      "Scalability analysis",
      "Fault tolerance mechanisms documented",
      "Cost estimation provided",
    ],
    timeout_seconds: timeout,
  };

  const orchestrator = new MultiAgentOrchestrator(model);
  let exitCode = 0;

  try {
    await orchestrator.initialize();
    const metrics = await orchestrator.runTask(task);

    console.log("\n" + "=".repeat(70));
    console.log("FINAL METRICS");
    console.log("=".repeat(70));
    console.log(JSON.stringify(metrics, null, 2));

    analyzePerformance(metrics);

    // Output special marker for server to parse consensus status
    // Format: ORCHESTRATION_RESULT:{"consensus":true/false,"metrics":{...},"abort_reason":...}
    console.log(
      "\nORCHESTRATION_RESULT:" +
        JSON.stringify({
          consensus: metrics.final_consensus,
          task_id: metrics.task_id,
          metrics: metrics,
          abort_reason: metrics.abort_reason || null,
          requires_auto_recovery: !metrics.final_consensus || !!metrics.abort_reason,
        })
    );

    if (metrics.abort_reason) {
      console.log(
        `\n[ORCHESTRATOR] Task aborted: ${metrics.abort_reason} - exiting with code 3 (auto-recovery needed)`
      );
      exitCode = 3;
      if (pipelineId) {
        await reportErrorToObservability(
          pipelineId,
          "orchestrator_aborted",
          "high",
          `Orchestration aborted: ${metrics.abort_reason}. Auto-recovery required.`
        );
      }
    } else if (!metrics.final_consensus) {
      console.log("\n[ORCHESTRATOR] Consensus NOT achieved - exiting with code 2 (auto-recovery needed)");
      exitCode = 2;
      // Report consensus failure to observability
      if (pipelineId) {
        await reportErrorToObservability(
          pipelineId,
          "consensus_failure",
          "high",
          `Agents failed to reach consensus. Conflicts: ${metrics.conflicts_detected}, Resolved: ${metrics.conflicts_resolved}`
        );
      }
    }
  } catch (e: unknown) {
    const message = errorMessage(e);
    console.error("Orchestrator error:", message);
    exitCode = 1;
    // Report critical error to observability if pipeline ID is set
    if (pipelineId) {
      await reportErrorToObservability(pipelineId, "orchestrator_failure", "critical", message);
    }
  } finally {
    await orchestrator.cleanup();
    // Explicitly exit to ensure all connections are closed
    process.exit(exitCode);
  }
}

main().catch((e: unknown) => {
  console.error("Fatal error:", e);
  process.exit(1);
});