profit a19535b580 Implement auto-recovery for consensus failures
- Add iteration tracking and stuck detection to orchestrator
- Add triggerAutoRecovery function for automatic pipeline respawn
- Store structured failure context (proposals, conflicts, reason)
- Force GAMMA agent on recovery attempts for conflict resolution
- Limit auto-recovery to 3 attempts to prevent infinite loops
- Add UI status badges for rebooting/aborted states
- Add failure-context API endpoint for orchestrator handoff
- Add test_auto_recovery.py with 6 passing tests

Exit codes: 0=success, 1=error, 2=consensus failure, 3=aborted

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 19:28:27 -05:00

643 lines
22 KiB
TypeScript

/**
* Multi-Agent Coordination System - Orchestrator
* Manages parallel agent execution, spawn conditions, and metrics
*
* Environment variables:
* - PIPELINE_ID: Parent pipeline ID for error reporting
* - TASK_ID: Task ID override
*/
import type { TaskDefinition, CoordinationMetrics, SpawnCondition, AgentRole } from "./types";
import {
Blackboard,
MessageBus,
AgentStateManager,
SpawnController,
MetricsCollector,
} from "./coordination";
import { AgentAlpha, AgentBeta, AgentGamma } from "./agents";
/** Returns the current instant formatted as an ISO-8601 timestamp string. */
function now(): string {
  const current = new Date();
  return current.toISOString();
}
/**
 * Builds a task identifier of the form `task-<random>-<time>`, where both
 * suffixes are base-36 encoded (a slice of Math.random() and Date.now()).
 */
function generateId(): string {
  const randomPart = Math.random().toString(36).slice(2, 8);
  const timePart = Date.now().toString(36);
  return `task-${randomPart}-${timePart}`;
}
/**
 * Reports an error to the parent pipeline's observability system by POSTing
 * a JSON record to the UI server's error tracking API.
 *
 * This is best-effort: a non-OK HTTP status or a thrown failure is written to
 * `console.error` and never propagated, so error reporting cannot itself
 * cause further errors in the caller.
 *
 * @param pipelineId - Sent as `pipeline_id` in the request body.
 * @param errorType - Sent as `error_type` in the request body.
 * @param severity - Sent as `severity` in the request body.
 * @param details - Sent as `details` in the request body.
 */
async function reportErrorToObservability(
  pipelineId: string,
  errorType: string,
  severity: "low" | "medium" | "high" | "critical",
  details: string
): Promise<void> {
  try {
    const response = await fetch("http://localhost:3000/api/pipeline/errors/record", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        pipeline_id: pipelineId,
        error_type: errorType,
        severity,
        details
      })
    });
    if (!response.ok) {
      console.error(`[ERROR_REPORT] Failed to report error: ${response.status}`);
    }
  } catch (e: unknown) {
    // Anything can be thrown, not just Error instances; narrow before reading
    // `.message` so the log line never degrades to "undefined".
    const reason = e instanceof Error ? e.message : String(e);
    console.error(`[ERROR_REPORT] Error reporting failed: ${reason}`);
  }
}
// =============================================================================
// Multi-Agent Orchestrator
// =============================================================================
/** Reasons for which the monitor loop aborts a running task. */
type AbortReason = "stuck" | "iteration_limit";

/** Every failure category recorded as structured failure context. */
type FailureReason = AbortReason | "consensus_failed";

/**
 * Orchestrates one multi-agent task: connects the shared coordination
 * infrastructure, runs the ALPHA and BETA agents in parallel, spawns the
 * GAMMA agent when the spawn controller says so, monitors for stuck /
 * over-limit runs, and finalizes metrics.
 *
 * Typical lifecycle: `initialize()` -> `runTask(task)` -> `cleanup()`.
 *
 * The task id comes from the `TASK_ID` environment variable when set
 * (otherwise it is generated); `PIPELINE_ID`, when set, enables reporting to
 * the pipeline API on localhost:3000.
 */
export class MultiAgentOrchestrator {
  /** Base URL of the pipeline API used for revocation and failure context. */
  private static readonly PIPELINE_API = "http://localhost:3000/api/pipeline";
  /** Delay between two monitor ticks. */
  private static readonly MONITOR_INTERVAL_MS = 2000;

  private taskId: string;
  private pipelineId?: string;
  private blackboard!: Blackboard;
  private stateManager!: AgentStateManager;
  private spawnController!: SpawnController;
  private metrics!: MetricsCollector;
  private alphaAgent!: AgentAlpha;
  private betaAgent!: AgentBeta;
  private gammaAgent?: AgentGamma;
  private alphaBus!: MessageBus;
  private betaBus!: MessageBus;
  private gammaBus?: MessageBus;
  private model: string;
  private startTime!: number;
  private monitorInterval?: ReturnType<typeof setInterval>;
  private errorCount: number = 0;
  private iterationCount: number = 0;
  private maxIterations: number = 10;
  private lastProgressTime: number = 0;
  private progressTimeout: number = 60000; // 60 seconds without progress = stuck
  /** Snapshot of the last observed activity; a change counts as progress. */
  private lastActivitySignature?: string;
  /** Pending GAMMA spawn, shared so concurrent callers do not spawn twice. */
  private gammaSpawnInFlight?: Promise<void>;

  /**
   * @param model - Model identifier handed to every agent this orchestrator creates.
   */
  constructor(model: string = "anthropic/claude-sonnet-4") {
    // Use environment variable for task ID if provided
    this.taskId = process.env.TASK_ID || generateId();
    this.pipelineId = process.env.PIPELINE_ID;
    this.model = model;
  }

  /** Extracts a printable message from any thrown value. */
  private static describeError(e: unknown): string {
    return e instanceof Error ? e.message : String(e);
  }

  /**
   * Counts the error, forwards it to the pipeline's observability API when a
   * pipeline id is configured, and writes it to the orchestrator log.
   */
  private async reportError(errorType: string, severity: "low" | "medium" | "high" | "critical", details: string): Promise<void> {
    this.errorCount++;
    if (this.pipelineId) {
      await reportErrorToObservability(this.pipelineId, errorType, severity, details);
    }
    this.log(`ERROR [${severity}] ${errorType}: ${details}`);
  }

  /**
   * Refreshes the progress timestamp only when the observed activity actually
   * changed. Refreshing unconditionally on every monitor tick would make
   * `isStuck()` unable to ever report a stall.
   */
  private noteActivity(signature: string): void {
    if (signature !== this.lastActivitySignature) {
      this.lastActivitySignature = signature;
      this.lastProgressTime = Date.now();
    }
  }

  /** True once more than `progressTimeout` ms elapsed since the last recorded progress. */
  private isStuck(): boolean {
    if (this.lastProgressTime === 0) return false;
    return (Date.now() - this.lastProgressTime) > this.progressTimeout;
  }

  /** True once the number of monitor ticks reached `maxIterations`. */
  private isIterationLimitExceeded(): boolean {
    return this.iterationCount >= this.maxIterations;
  }

  /** Stops the periodic monitor if it is running. Safe to call repeatedly. */
  private stopMonitoring(): void {
    if (this.monitorInterval) {
      clearInterval(this.monitorInterval);
      this.monitorInterval = undefined;
    }
  }

  /**
   * POSTs a JSON payload to a pipeline API endpoint.
   * @throws Error when the request fails or the server answers with a non-OK status.
   */
  private async postToPipelineApi(endpoint: string, payload: unknown): Promise<void> {
    const response = await fetch(`${MultiAgentOrchestrator.PIPELINE_API}/${endpoint}`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(payload)
    });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
  }

  /** Requests token revocation for stuck agents (no-op without a pipeline id; failures are only logged). */
  private async revokeStuckAgentTokens(): Promise<void> {
    if (!this.pipelineId) return;
    try {
      await this.postToPipelineApi("revoke", {
        pipeline_id: this.pipelineId,
        reason: "iteration_timeout",
        details: `Agents stuck after ${this.iterationCount} iterations`
      });
      this.log("Token revocation requested for stuck agents");
    } catch (e: unknown) {
      this.log(`Failed to revoke tokens: ${MultiAgentOrchestrator.describeError(e)}`);
    }
  }

  /** Records structured failure context for auto-recovery (no-op without a pipeline id; failures are only logged). */
  private async recordFailureContext(
    reason: FailureReason,
    metrics: CoordinationMetrics
  ): Promise<void> {
    if (!this.pipelineId) return;
    const failureContext = {
      pipeline_id: this.pipelineId,
      task_id: this.taskId,
      failure_reason: reason,
      failure_time: new Date().toISOString(),
      iteration_count: this.iterationCount,
      elapsed_ms: Date.now() - this.startTime,
      metrics: metrics,
      gamma_spawned: this.gammaAgent !== undefined,
      error_count: this.errorCount,
      recovery_hint: this.getRecoveryHint(reason)
    };
    try {
      await this.postToPipelineApi("failure-context", failureContext);
      this.log(`Failure context recorded: ${reason}`);
    } catch (e: unknown) {
      this.log(`Failed to record failure context: ${MultiAgentOrchestrator.describeError(e)}`);
    }
  }

  /** Maps a failure reason to the human-readable hint stored with the failure context. */
  private getRecoveryHint(reason: string): string {
    switch (reason) {
      case "stuck":
        return "Agents became unresponsive. Try with fresh agents or GAMMA mediator.";
      case "iteration_limit":
        return "Max iterations reached without consensus. Consider simplifying objective.";
      case "consensus_failed":
        return "Agents completed but disagreed. GAMMA mediator may help resolve conflicts.";
      default:
        return "Unknown failure. Review logs for details.";
    }
  }

  /** Writes a log line prefixed with the seconds elapsed since `initialize()`. */
  private log(msg: string) {
    const elapsed = this.startTime ? ((Date.now() - this.startTime) / 1000).toFixed(1) : "0.0";
    console.log(`[${elapsed}s] [ORCHESTRATOR] ${msg}`);
  }

  /**
   * Connects the shared infrastructure and the ALPHA/BETA message buses, then
   * creates and initializes the ALPHA and BETA agents. Must complete before
   * `runTask()` is called.
   */
  async initialize(): Promise<void> {
    this.startTime = Date.now();
    console.log("\n" + "=".repeat(70));
    console.log("MULTI-AGENT COORDINATION SYSTEM");
    console.log("Task ID: " + this.taskId);
    if (this.pipelineId) {
      console.log("Pipeline ID: " + this.pipelineId);
    }
    console.log("Model: " + this.model);
    console.log("=".repeat(70) + "\n");
    this.log("Initializing coordination infrastructure...");

    // Initialize shared infrastructure
    this.blackboard = new Blackboard(this.taskId);
    this.stateManager = new AgentStateManager(this.taskId);
    this.spawnController = new SpawnController(this.taskId);
    this.metrics = new MetricsCollector(this.taskId);
    await Promise.all([
      this.blackboard.connect(),
      this.stateManager.connect(),
      this.spawnController.connect(),
      this.metrics.connect(),
    ]);
    this.log("Infrastructure connected");

    // Initialize message buses for ALPHA and BETA
    this.alphaBus = new MessageBus(this.taskId, "ALPHA");
    this.betaBus = new MessageBus(this.taskId, "BETA");
    await Promise.all([
      this.alphaBus.connect(),
      this.betaBus.connect(),
    ]);
    this.log("Message buses connected");

    // Create initial agents
    this.alphaAgent = new AgentAlpha(
      this.taskId, this.blackboard, this.alphaBus, this.stateManager, this.metrics, this.model
    );
    this.betaAgent = new AgentBeta(
      this.taskId, this.blackboard, this.betaBus, this.stateManager, this.metrics, this.model
    );
    await Promise.all([
      this.alphaAgent.init(),
      this.betaAgent.init(),
    ]);
    this.log("Agents ALPHA and BETA initialized");
  }

  /**
   * Creates, connects and initializes the GAMMA agent for the given spawn
   * condition. At most one spawn runs at a time: the monitor loop and
   * `runTask()` can both request a spawn, so a caller arriving while a spawn
   * is pending waits for that spawn instead of starting a second one.
   */
  async spawnGamma(reason: SpawnCondition): Promise<void> {
    if (this.gammaSpawnInFlight) {
      this.log("GAMMA spawn already in progress, waiting for it");
      await this.gammaSpawnInFlight;
      return;
    }
    if (this.gammaAgent) {
      this.log("GAMMA already spawned, skipping");
      return;
    }
    this.gammaSpawnInFlight = this.createGamma(reason);
    try {
      await this.gammaSpawnInFlight;
    } finally {
      this.gammaSpawnInFlight = undefined;
    }
  }

  /** Performs the actual GAMMA creation; callers go through `spawnGamma()`. */
  private async createGamma(reason: SpawnCondition): Promise<void> {
    this.log(`SPAWNING GAMMA - Reason: ${reason.type} (threshold: ${reason.threshold}, current: ${reason.current_value})`);

    // Create message bus for GAMMA
    this.gammaBus = new MessageBus(this.taskId, "GAMMA");
    await this.gammaBus.connect();

    // Create and initialize GAMMA agent
    this.gammaAgent = new AgentGamma(
      this.taskId, this.blackboard, this.gammaBus, this.stateManager, this.metrics,
      reason.type, this.model
    );
    await this.gammaAgent.init();
    await this.spawnController.markGammaSpawned(reason);
    this.log("GAMMA agent spawned and initialized");
  }

  /**
   * One monitor tick: counts the iteration, decides whether the task must be
   * aborted (no progress for `progressTimeout` ms, or `maxIterations` ticks
   * reached), evaluates the GAMMA spawn conditions, and logs a status line.
   *
   * @returns `{ abort: true, reason }` when the task should be aborted,
   *          `{ abort: false }` otherwise.
   */
  private async monitorConditions(): Promise<{ abort: boolean; reason?: AbortReason }> {
    this.iterationCount++;

    // Check for iteration timeout (stuck without progress). The progress
    // timestamp is refreshed at the end of a tick, and only when the observed
    // activity changed, so this reflects the state seen by earlier ticks.
    if (this.isStuck()) {
      this.log("TIMEOUT: No progress detected - triggering auto-recovery");
      await this.reportError("iteration_timeout", "high", "Agents stuck without progress");
      return { abort: true, reason: "stuck" };
    }

    // Check for iteration limit exceeded
    if (this.isIterationLimitExceeded()) {
      this.log("LIMIT: Maximum iterations exceeded - triggering auto-recovery");
      await this.reportError("iteration_limit", "high", `Max iterations (${this.maxIterations}) exceeded`);
      return { abort: true, reason: "iteration_limit" };
    }

    // Check stuck condition
    const stuckAgents = await this.stateManager.detectStuckAgents(30);
    if (stuckAgents.length > 0) {
      this.log(`Stuck agents detected: ${stuckAgents.join(", ")}`);
      const condition = await this.spawnController.updateCondition("STUCK", stuckAgents.length);
      if (condition?.triggered) {
        const { shouldSpawn, reason } = await this.spawnController.checkSpawnConditions();
        if (shouldSpawn && reason) {
          await this.spawnGamma(reason);
        }
      }
    }

    // Check conflict condition
    const metricsData = await this.metrics.getMetrics();
    const unresolvedConflicts = metricsData.conflicts_detected - metricsData.conflicts_resolved;
    const conflictCondition = await this.spawnController.updateCondition("CONFLICT", unresolvedConflicts);
    if (conflictCondition?.triggered && !this.spawnController.isGammaSpawned()) {
      const { shouldSpawn, reason } = await this.spawnController.checkSpawnConditions();
      if (shouldSpawn && reason) {
        await this.spawnGamma(reason);
      }
    }

    // Check complexity condition (from blackboard)
    const analysis = await this.blackboard.read("problem", "analysis");
    if (analysis?.value?.complexity_score) {
      const complexityCondition = await this.spawnController.updateCondition("COMPLEXITY", analysis.value.complexity_score);
      if (complexityCondition?.triggered && !this.spawnController.isGammaSpawned()) {
        const { shouldSpawn, reason } = await this.spawnController.checkSpawnConditions();
        if (shouldSpawn && reason) {
          await this.spawnGamma(reason);
        }
      }
    }

    // Log current state
    const states = await this.stateManager.getAllStates();
    const statesSummary = states.map(s => `${s.role}:${s.status}(${(s.progress * 100).toFixed(0)}%)`).join(", ");

    // Agent states, message count and conflict counters form the activity
    // signature; any change between ticks counts as progress.
    this.noteActivity([
      statesSummary,
      metricsData.total_messages,
      metricsData.conflicts_detected,
      metricsData.conflicts_resolved,
    ].join("|"));

    this.log(`Status: ${statesSummary} | Messages: ${metricsData.total_messages} | Conflicts: ${unresolvedConflicts} | Iter: ${this.iterationCount}`);
    return { abort: false };
  }

  /**
   * Runs a task end to end: writes it to the blackboard, runs ALPHA and BETA
   * in parallel under a periodic monitor, optionally runs GAMMA, checks
   * consensus and finalizes metrics.
   *
   * The wait for ALPHA/BETA ends as soon as one of these happens: both agents
   * settle, `task.timeout_seconds` elapses, or the monitor requests an abort.
   *
   * @param task - The task to execute.
   * @returns The finalized metrics; `abort_reason` is set when the monitor aborted the run.
   */
  async runTask(task: TaskDefinition): Promise<CoordinationMetrics & { abort_reason?: string }> {
    this.log(`Starting task: ${task.objective.slice(0, 60)}...`);
    this.lastProgressTime = Date.now();

    // Write task to blackboard
    await this.blackboard.write("problem", "task_definition", task, "ALPHA");

    // Abort state shared with the monitor callback; the promise lets an abort
    // end the wait below immediately instead of after the agents or timeout.
    const monitor: { abortReason?: AbortReason } = {};
    let wakeOnAbort: () => void = () => {};
    const abortSignal = new Promise<void>(resolve => { wakeOnAbort = resolve; });

    // Start monitoring with abort detection. Ticks never overlap, and a
    // failing tick is logged rather than surfacing as an unhandled rejection.
    let tickRunning = false;
    this.stopMonitoring();
    this.monitorInterval = setInterval(async () => {
      if (tickRunning) return;
      tickRunning = true;
      try {
        const result = await this.monitorConditions();
        if (result.abort && result.reason) {
          monitor.abortReason = result.reason;
          this.stopMonitoring();
          wakeOnAbort();
        }
      } catch (e: unknown) {
        this.log(`Monitor tick failed: ${MultiAgentOrchestrator.describeError(e)}`);
      } finally {
        tickRunning = false;
      }
    }, MultiAgentOrchestrator.MONITOR_INTERVAL_MS);

    let deadlineTimer: ReturnType<typeof setTimeout> | undefined;
    try {
      // Run agents in parallel
      this.log("Launching ALPHA and BETA in parallel...");
      const alphaRun = this.alphaAgent.run(task).catch(async (e: unknown) => {
        await this.reportError("agent_failure", "high", `ALPHA error: ${MultiAgentOrchestrator.describeError(e)}`);
      });
      const betaRun = this.betaAgent.run(task).catch(async (e: unknown) => {
        await this.reportError("agent_failure", "high", `BETA error: ${MultiAgentOrchestrator.describeError(e)}`);
      });

      // Wait for initial agents to complete (or timeout, or monitor abort)
      const timeoutMs = task.timeout_seconds * 1000;
      const deadline = new Promise<void>(resolve => {
        deadlineTimer = setTimeout(resolve, timeoutMs);
      });
      await Promise.race([
        Promise.all([alphaRun, betaRun]),
        deadline,
        abortSignal,
      ]);

      // Check if we were aborted during execution
      if (monitor.abortReason) {
        const abortReason = monitor.abortReason;
        this.log(`Task aborted: ${abortReason}`);
        this.stopMonitoring();
        // Revoke tokens for stuck agents
        await this.revokeStuckAgentTokens();
        // Get partial metrics and mark as failed
        const partialMetrics = await this.metrics.finalize(false);
        await this.recordFailureContext(abortReason, partialMetrics);
        return { ...partialMetrics, abort_reason: abortReason };
      }
      this.log("Initial agents completed or timeout reached");

      // Check if GAMMA needs to be spawned for success validation
      const states = await this.stateManager.getAllStates();
      const bothComplete = states.every(s => s.status === "WAITING" || s.status === "COMPLETED");
      if (bothComplete && !this.spawnController.isGammaSpawned()) {
        await this.spawnController.updateCondition("SUCCESS", 1.0);
        const { shouldSpawn, reason } = await this.spawnController.checkSpawnConditions();
        if (shouldSpawn && reason) {
          await this.spawnGamma(reason);
        }
      }

      // If GAMMA was spawned, run it
      if (this.gammaAgent) {
        this.log("Running GAMMA for resolution...");
        await this.gammaAgent.run(task).catch(async (e: unknown) => {
          await this.reportError("agent_failure", "high", `GAMMA error: ${MultiAgentOrchestrator.describeError(e)}`);
        });
      }

      // Stop monitoring
      this.stopMonitoring();

      // Check consensus
      const consensus = await this.blackboard.checkConsensus("synthesis", ["ALPHA", "BETA"]);
      const consensusAchieved = consensus.reached ||
        (await this.blackboard.read("consensus", "final"))?.value?.achieved === true;
      this.log(`Consensus achieved: ${consensusAchieved}`);

      // Finalize metrics
      const finalMetrics = await this.metrics.finalize(consensusAchieved);

      // If consensus failed, record structured failure context
      if (!consensusAchieved) {
        await this.recordFailureContext("consensus_failed", finalMetrics);
      }
      return finalMetrics;
    } finally {
      // Never leave the deadline timer or the monitor running, including
      // when one of the steps above throws.
      clearTimeout(deadlineTimer);
      this.stopMonitoring();
    }
  }

  /**
   * Stops monitoring and disconnects every bus and infrastructure component
   * that was created. Each disconnect is attempted independently; a failure
   * is logged and does not prevent the remaining disconnects or reject the
   * returned promise.
   */
  async cleanup(): Promise<void> {
    this.log("Cleaning up...");
    this.stopMonitoring();
    const closers: Array<[string, () => unknown]> = [
      ["ALPHA bus", () => this.alphaBus?.disconnect()],
      ["BETA bus", () => this.betaBus?.disconnect()],
      ["GAMMA bus", () => this.gammaBus?.disconnect()],
      ["blackboard", () => this.blackboard?.disconnect()],
      ["state manager", () => this.stateManager?.disconnect()],
      ["spawn controller", () => this.spawnController?.disconnect()],
      ["metrics", () => this.metrics?.disconnect()],
    ];
    const outcomes = await Promise.allSettled(
      closers.map(async ([, close]) => { await close(); })
    );
    outcomes.forEach((outcome, index) => {
      if (outcome.status === "rejected") {
        this.log(`Failed to disconnect ${closers[index][0]}: ${MultiAgentOrchestrator.describeError(outcome.reason)}`);
      }
    });
    this.log("Cleanup complete");
  }

  /** Returns the task id this orchestrator operates on. */
  getTaskId(): string {
    return this.taskId;
  }
}
// =============================================================================
// Performance Analysis
// =============================================================================
/**
 * Prints a human-readable performance report for a finished run to stdout:
 * timing, communication volume, conflict handling, GAMMA usage, outcome and
 * a threshold analysis (more than 50 messages / more than 3 conflicts are
 * flagged).
 *
 * @param metrics - Metrics of the run. When `end_time` is missing or the
 *   timestamps cannot be parsed, the duration is reported as 0 and the
 *   communication overhead as "n/a" instead of dividing by zero.
 */
export function analyzePerformance(metrics: CoordinationMetrics): void {
  const banner = "=".repeat(70);
  console.log("\n" + banner);
  console.log("PERFORMANCE ANALYSIS");
  console.log(banner);

  const rawDuration = metrics.end_time
    ? (new Date(metrics.end_time).getTime() - new Date(metrics.start_time).getTime()) / 1000
    : 0;
  // Unparsable timestamps yield NaN; treat that like "no duration known".
  const duration = Number.isFinite(rawDuration) ? rawDuration : 0;
  console.log("\nTiming:");
  console.log(` Duration: ${duration.toFixed(1)}s`);

  // A rate is only meaningful for a positive duration; otherwise the
  // division would print "Infinity" or "NaN".
  const operations = metrics.total_messages + metrics.blackboard_writes;
  const overhead = duration > 0 ? `${(operations / duration).toFixed(2)} ops/sec` : "n/a";
  console.log("\nCommunication:");
  console.log(` Total messages: ${metrics.total_messages}`);
  console.log(` Direct messages: ${metrics.direct_messages}`);
  console.log(` Blackboard writes: ${metrics.blackboard_writes}`);
  console.log(` Blackboard reads: ${metrics.blackboard_reads}`);
  console.log(` Communication overhead: ${overhead}`);

  // With no conflicts detected the resolution rate is reported as 100%.
  const resolutionRate = metrics.conflicts_detected > 0
    ? ((metrics.conflicts_resolved / metrics.conflicts_detected) * 100).toFixed(1)
    : 100;
  console.log("\nCoordination:");
  console.log(` Conflicts detected: ${metrics.conflicts_detected}`);
  console.log(` Conflicts resolved: ${metrics.conflicts_resolved}`);
  console.log(` Conflict resolution rate: ${resolutionRate}%`);

  console.log("\nGamma Agent:");
  console.log(` Spawned: ${metrics.gamma_spawned ? "Yes" : "No"}`);
  if (metrics.gamma_spawned) {
    console.log(` Spawn reason: ${metrics.gamma_spawn_reason}`);
    console.log(` Spawn time: ${metrics.gamma_spawn_time}`);
  }

  console.log("\nOutcome:");
  console.log(` Consensus achieved: ${metrics.final_consensus ? "Yes" : "No"}`);
  console.log(` Performance score: ${(metrics.performance_score * 100).toFixed(1)}%`);

  // Threshold analysis
  console.log("\nThreshold Effects:");
  const messageThreshold = 50;
  const conflictThreshold = 3;
  if (metrics.total_messages > messageThreshold) {
    console.log(` ! High message volume (${metrics.total_messages} > ${messageThreshold}) - potential coordination overhead`);
  } else {
    console.log(` + Message volume within threshold (${metrics.total_messages} <= ${messageThreshold})`);
  }
  if (metrics.conflicts_detected > conflictThreshold) {
    console.log(` ! High conflict rate (${metrics.conflicts_detected} > ${conflictThreshold}) - agents may have divergent strategies`);
  } else {
    console.log(` + Conflict rate within threshold (${metrics.conflicts_detected} <= ${conflictThreshold})`);
  }
  if (metrics.gamma_spawned && metrics.gamma_spawn_reason === "STUCK") {
    console.log(` ! Gamma spawned due to stuck condition - consider adjusting agent strategies`);
  }
  console.log("\n" + banner);
}
// =============================================================================
// CLI Entry Point
// =============================================================================
/**
 * Parses the CLI arguments (without the node/script prefix).
 *
 * - The objective is the first non-empty argument that is neither a `--flag`
 *   nor the value consumed by `--model` / `--timeout`.
 * - `--model <id>` and `--timeout <seconds>` use their first occurrence.
 * - A timeout that is not a positive integer is rejected with a warning and
 *   the default of 120 seconds is kept.
 */
function parseCliArgs(args: readonly string[]): { objective: string | undefined; model: string; timeoutSeconds: number } {
  let objective: string | undefined;
  let model: string | undefined;
  let timeoutSeconds: number | undefined;
  let index = 0;
  while (index < args.length) {
    const arg = args[index];
    index++;
    if (arg === "--model" || arg === "--timeout") {
      const value = args[index];
      if (!value) continue; // flag given without a value: ignore it
      index++; // the value belongs to the flag, never to the objective
      if (arg === "--model") {
        model = model ?? value;
      } else if (timeoutSeconds === undefined) {
        const parsed = Number.parseInt(value, 10);
        if (Number.isFinite(parsed) && parsed > 0) {
          timeoutSeconds = parsed;
        } else {
          console.error(`[ORCHESTRATOR] Ignoring invalid --timeout value "${value}", using default`);
        }
      }
    } else if (arg && !arg.startsWith("--") && objective === undefined) {
      objective = arg;
    }
  }
  return {
    objective,
    model: model ?? "anthropic/claude-sonnet-4",
    timeoutSeconds: timeoutSeconds ?? 120,
  };
}

/**
 * CLI entry point: builds the task from the command line, runs the
 * orchestrator, prints the metrics plus the `ORCHESTRATION_RESULT:` marker
 * line, and terminates the process.
 *
 * Exit codes:
 *   0 = Success (consensus achieved)
 *   1 = Error (crash or exception)
 *   2 = Consensus failure (agents completed but no agreement)
 *   3 = Aborted (timeout/stuck/iteration limit - auto-recovery needed)
 */
async function main(): Promise<void> {
  const cli = parseCliArgs(process.argv.slice(2));
  // Default complex task
  const objective = cli.objective ?? `Design a distributed event-driven architecture for a real-time analytics platform that handles:
1) High-throughput data ingestion from multiple sources
2) Stream processing with exactly-once semantics
3) Real-time aggregations and windowed computations
4) Low-latency query serving for dashboards
5) Horizontal scalability to handle 1M events/second
The solution should consider fault tolerance, data consistency, and cost optimization.`;

  const task: TaskDefinition = {
    task_id: generateId(),
    objective,
    complexity: "high",
    subtasks: [
      { id: "s1", description: "Analyze data ingestion requirements", status: "pending", dependencies: [] },
      { id: "s2", description: "Design stream processing pipeline", status: "pending", dependencies: ["s1"] },
      { id: "s3", description: "Plan storage and query layer", status: "pending", dependencies: ["s1"] },
      { id: "s4", description: "Define scalability strategy", status: "pending", dependencies: ["s2", "s3"] },
      { id: "s5", description: "Integrate fault tolerance mechanisms", status: "pending", dependencies: ["s4"] },
    ],
    constraints: [
      "Must use open-source technologies where possible",
      "Latency < 100ms for query responses",
      "Support for multiple data formats (JSON, Avro, Protobuf)",
      "Cost-effective for variable workloads",
    ],
    success_criteria: [
      "Complete architecture design with component diagrams",
      "Data flow specifications",
      "Scalability analysis",
      "Fault tolerance mechanisms documented",
      "Cost estimation provided",
    ],
    timeout_seconds: cli.timeoutSeconds,
  };

  // Parent pipeline to notify about failures, if any.
  const pipelineId = process.env.PIPELINE_ID;
  const orchestrator = new MultiAgentOrchestrator(cli.model);
  let exitCode = 0;
  try {
    await orchestrator.initialize();
    const metrics = await orchestrator.runTask(task);
    console.log("\n" + "=".repeat(70));
    console.log("FINAL METRICS");
    console.log("=".repeat(70));
    console.log(JSON.stringify(metrics, null, 2));
    analyzePerformance(metrics);

    const abortReason = metrics.abort_reason;
    // Output special marker for server to parse consensus status
    // Format: ORCHESTRATION_RESULT:{"consensus":true/false,"metrics":{...},"abort_reason":...}
    console.log("\nORCHESTRATION_RESULT:" + JSON.stringify({
      consensus: metrics.final_consensus,
      task_id: metrics.task_id,
      metrics: metrics,
      abort_reason: abortReason || null,
      requires_auto_recovery: !metrics.final_consensus || !!abortReason
    }));

    if (abortReason) {
      console.log(`\n[ORCHESTRATOR] Task aborted: ${abortReason} - exiting with code 3 (auto-recovery needed)`);
      exitCode = 3;
      if (pipelineId) {
        await reportErrorToObservability(pipelineId, "orchestrator_aborted", "high",
          `Orchestration aborted: ${abortReason}. Auto-recovery required.`);
      }
    } else if (!metrics.final_consensus) {
      console.log("\n[ORCHESTRATOR] Consensus NOT achieved - exiting with code 2 (auto-recovery needed)");
      exitCode = 2;
      // Report consensus failure to observability
      if (pipelineId) {
        await reportErrorToObservability(pipelineId, "consensus_failure", "high",
          `Agents failed to reach consensus. Conflicts: ${metrics.conflicts_detected}, Resolved: ${metrics.conflicts_resolved}`);
      }
    }
  } catch (e: unknown) {
    const message = e instanceof Error ? e.message : String(e);
    console.error("Orchestrator error:", message);
    exitCode = 1;
    // Report critical error to observability if pipeline ID is set
    if (pipelineId) {
      await reportErrorToObservability(pipelineId, "orchestrator_failure", "critical", message);
    }
  } finally {
    // A cleanup failure must not replace the exit code decided above, so it
    // is logged here instead of escaping the finally block.
    try {
      await orchestrator.cleanup();
    } catch (cleanupError: unknown) {
      console.error("Cleanup error:", cleanupError instanceof Error ? cleanupError.message : String(cleanupError));
    }
    // Explicitly exit to ensure all connections are closed
    process.exit(exitCode);
  }
}
// Script entry: start the CLI. If main() itself rejects, print the failure
// and terminate with exit code 1.
function exitOnFatalError(fatal: unknown): void {
  console.error("Fatal error:", fatal);
  process.exit(1);
}
main().catch(exitOnFatalError);