Orchestrator changes:
- Add dumpAgentHandoff() to dump proposals/analysis before abort
- Add loadRecoveryContext() to load inherited context on recovery runs
- Add preseedBlackboard() to pre-seed inherited proposals
- Force-spawn GAMMA immediately on recovery runs
- Track isRecoveryRun, recoveryAttempt, inheritedContext, forceGamma
Server changes:
- Update recordConsensusFailure() to read orchestrator handoff JSON
- Add collectFromBlackboard() helper as fallback
- Update triggerAutoRecovery() with comprehensive context passing
- Store inherited_handoff reference for recovery pipelines
- Track retry_count, abort_reason, handoff_ref in recovery:* keys
- Add recovery badge and prior pipeline link in UI
Test coverage:
- test_auto_recovery.py: 6 unit tests
- test_e2e_auto_recovery.py: 5 E2E tests (handoff dump, recovery
pipeline creation, inherited context, retry tracking, status update)
Redis tracking keys:
- handoff:{pipeline_id}:agents - orchestrator dumps proposals here
- handoff:{recovery_id}:inherited - key the recovery pipeline inherits prior-run context from
- recovery:{pipeline_id} - retry_count, abort_reason, handoff_ref
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
922 lines
33 KiB
TypeScript
922 lines
33 KiB
TypeScript
/**
|
|
* Multi-Agent Coordination System - Orchestrator
|
|
* Manages parallel agent execution, spawn conditions, and metrics
|
|
*
|
|
* Environment variables:
|
|
* - PIPELINE_ID: Parent pipeline ID for error reporting
|
|
* - TASK_ID: Task ID override
|
|
*/
|
|
|
|
import type { TaskDefinition, CoordinationMetrics, SpawnCondition, AgentRole } from "./types";
|
|
import {
|
|
Blackboard,
|
|
MessageBus,
|
|
AgentStateManager,
|
|
SpawnController,
|
|
MetricsCollector,
|
|
} from "./coordination";
|
|
import { AgentAlpha, AgentBeta, AgentGamma } from "./agents";
|
|
import { createClient, RedisClientType } from "redis";
|
|
|
|
// Redis client for direct handoff writes (created lazily, shared module-wide)
let handoffRedis: RedisClientType | null = null;

/**
 * Returns the shared Redis client used for handoff reads/writes, creating and
 * connecting it on first use (or when the cached client is no longer open).
 *
 * Credentials are fetched from the local Vault at
 * `secret/data/services/dragonfly` using the token stored in
 * `/opt/vault/init-keys.json`.
 *
 * NOTE(review): this uses the Vault root token and disables TLS certificate
 * verification. Both are kept as-is to preserve behavior, but should be
 * replaced by a scoped token and a trusted CA.
 *
 * @returns A connected Redis client.
 * @throws Error when Vault answers with a non-2xx status, when the secret has
 *   no host/port, or when the Redis connection cannot be established.
 */
async function getHandoffRedis(): Promise<RedisClientType> {
  if (handoffRedis && handoffRedis.isOpen) return handoffRedis;

  const initKeys = await Bun.file("/opt/vault/init-keys.json").json();
  const token = initKeys.root_token;

  const response = await fetch(`https://127.0.0.1:8200/v1/secret/data/services/dragonfly`, {
    headers: { "X-Vault-Token": token },
    // @ts-ignore - Bun supports this
    tls: { rejectUnauthorized: false }
  });
  if (!response.ok) {
    // Fail with a readable message instead of a TypeError on `result.data.data`.
    throw new Error(`Vault request for handoff Redis credentials failed: HTTP ${response.status}`);
  }

  const result: any = await response.json();
  const creds = result?.data?.data;
  if (!creds?.host || !creds?.port) {
    throw new Error("Vault secret services/dragonfly is missing host/port");
  }

  const client: RedisClientType = createClient({
    url: `redis://${creds.host}:${creds.port}`,
    password: creds.password,
  });
  // Without an 'error' listener a socket error is emitted with no handler,
  // which Node turns into an uncaught exception.
  client.on("error", (err: unknown) => {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`[HANDOFF_REDIS] Client error: ${reason}`);
  });

  await client.connect();

  // Cache only after a successful connect so a failed attempt is never reused.
  handoffRedis = client;
  return client;
}
|
|
|
|
/** Returns the current time as an ISO-8601 string (UTC, millisecond precision). */
function now(): string {
  const current = new Date();
  return current.toISOString();
}
|
|
|
|
/**
 * Builds a task identifier of the form `task-<random>-<time>`, where both
 * parts are base-36: up to six random characters and the current epoch
 * milliseconds.
 */
function generateId(): string {
  const randomPart = Math.random().toString(36).slice(2, 8);
  const timePart = Date.now().toString(36);
  return `task-${randomPart}-${timePart}`;
}
|
|
|
|
/**
 * Posts an error record to the UI server's error-tracking endpoint for the
 * parent pipeline.
 *
 * This is best-effort: a non-2xx answer or a network failure is written to
 * stderr and never thrown, so reporting an error cannot cause another one.
 *
 * @param pipelineId Pipeline the error belongs to.
 * @param errorType  Short machine-readable error category.
 * @param severity   One of "low", "medium", "high" or "critical".
 * @param details    Human-readable description.
 */
async function reportErrorToObservability(
  pipelineId: string,
  errorType: string,
  severity: "low" | "medium" | "high" | "critical",
  details: string
): Promise<void> {
  const endpoint = "http://localhost:3000/api/pipeline/errors/record";

  try {
    const record = {
      pipeline_id: pipelineId,
      error_type: errorType,
      severity,
      details
    };
    const response = await fetch(endpoint, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(record)
    });

    if (response.ok) return;
    console.error(`[ERROR_REPORT] Failed to report error: ${response.status}`);
  } catch (e: any) {
    // Deliberately swallowed: reporting must not raise further errors.
    console.error(`[ERROR_REPORT] Error reporting failed: ${e.message}`);
  }
}
|
|
|
|
// =============================================================================
|
|
// Multi-Agent Orchestrator
|
|
// =============================================================================
|
|
|
|
export class MultiAgentOrchestrator {
|
|
private taskId: string;
|
|
private pipelineId?: string;
|
|
private blackboard!: Blackboard;
|
|
private stateManager!: AgentStateManager;
|
|
private spawnController!: SpawnController;
|
|
private metrics!: MetricsCollector;
|
|
|
|
private alphaAgent!: AgentAlpha;
|
|
private betaAgent!: AgentBeta;
|
|
private gammaAgent?: AgentGamma;
|
|
|
|
private alphaBus!: MessageBus;
|
|
private betaBus!: MessageBus;
|
|
private gammaBus?: MessageBus;
|
|
|
|
private model: string;
|
|
private startTime!: number;
|
|
private monitorInterval?: ReturnType<typeof setInterval>;
|
|
private errorCount: number = 0;
|
|
private iterationCount: number = 0;
|
|
private maxIterations: number = 10;
|
|
private lastProgressTime: number = 0;
|
|
private progressTimeout: number = 60000; // 60 seconds without progress = stuck
|
|
|
|
// Recovery context from prior runs
|
|
private isRecoveryRun: boolean = false;
|
|
private recoveryAttempt: number = 1;
|
|
private inheritedContext: any = null;
|
|
private forceGamma: boolean = false;
|
|
|
|
/**
 * @param model Model identifier handed to every agent this orchestrator creates.
 *
 * The task ID comes from the TASK_ID environment variable when set, otherwise
 * a fresh one is generated; the pipeline ID comes from PIPELINE_ID and may be
 * undefined.
 */
constructor(model: string = "anthropic/claude-sonnet-4") {
  const { TASK_ID: taskIdOverride, PIPELINE_ID: parentPipeline } = process.env;

  this.model = model;
  this.pipelineId = parentPipeline;
  this.taskId = taskIdOverride || generateId();
}
|
|
|
|
/**
 * Counts an error, forwards it to the observability endpoint when a pipeline
 * ID is known, and writes it to the orchestrator log.
 */
private async reportError(errorType: string, severity: "low" | "medium" | "high" | "critical", details: string): Promise<void> {
  this.errorCount += 1;

  const parentPipeline = this.pipelineId;
  if (parentPipeline) {
    await reportErrorToObservability(parentPipeline, errorType, severity, details);
  }

  this.log(`ERROR [${severity}] ${errorType}: ${details}`);
}
|
|
|
|
/** Stamps the current time as the last progress time and counts one more iteration. */
private updateProgress(): void {
  const stamp = Date.now();
  this.lastProgressTime = stamp;
  this.iterationCount += 1;
}
|
|
|
|
/**
 * True when progress has been recorded at least once and more than
 * `progressTimeout` milliseconds have passed since the last record.
 */
private isStuck(): boolean {
  const lastSeen = this.lastProgressTime;
  if (lastSeen === 0) {
    // No progress has been recorded yet, so there is nothing to time out.
    return false;
  }

  const idleMs = Date.now() - lastSeen;
  return idleMs > this.progressTimeout;
}
|
|
|
|
/** True once the iteration counter has reached `maxIterations`. */
private isIterationLimitExceeded(): boolean {
  const { iterationCount, maxIterations } = this;
  return maxIterations <= iterationCount;
}
|
|
|
|
/**
 * Asks the UI server to revoke the tokens of this pipeline's agents.
 *
 * Does nothing without a pipeline ID. Failures (network errors and non-2xx
 * answers) are logged and never thrown, so an abort sequence can continue.
 */
private async revokeStuckAgentTokens(): Promise<void> {
  if (!this.pipelineId) return;

  try {
    const response = await fetch("http://localhost:3000/api/pipeline/revoke", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        pipeline_id: this.pipelineId,
        reason: "iteration_timeout",
        details: `Agents stuck after ${this.iterationCount} iterations`
      })
    });

    // A rejected request must not be logged as a success.
    if (!response.ok) {
      this.log(`Failed to revoke tokens: HTTP ${response.status}`);
      return;
    }
    this.log("Token revocation requested for stuck agents");
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    this.log(`Failed to revoke tokens: ${reason}`);
  }
}
|
|
|
|
/**
 * Sends a structured description of a failed run to the UI server so that
 * auto-recovery can pick it up.
 *
 * Does nothing without a pipeline ID. Failures (network errors and non-2xx
 * answers) are logged and never thrown.
 *
 * @param reason  Why the run failed.
 * @param metrics Coordination metrics collected up to the failure.
 */
private async recordFailureContext(
  reason: "stuck" | "iteration_limit" | "consensus_failed",
  metrics: CoordinationMetrics
): Promise<void> {
  if (!this.pipelineId) return;

  const failureContext = {
    pipeline_id: this.pipelineId,
    task_id: this.taskId,
    failure_reason: reason,
    failure_time: new Date().toISOString(),
    iteration_count: this.iterationCount,
    elapsed_ms: Date.now() - this.startTime,
    metrics: metrics,
    gamma_spawned: this.gammaAgent !== undefined,
    error_count: this.errorCount,
    recovery_hint: this.getRecoveryHint(reason)
  };

  try {
    const response = await fetch("http://localhost:3000/api/pipeline/failure-context", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(failureContext)
    });

    // A rejected request must not be logged as "recorded".
    if (!response.ok) {
      this.log(`Failed to record failure context: HTTP ${response.status}`);
      return;
    }
    this.log(`Failure context recorded: ${reason}`);
  } catch (e: unknown) {
    const message = e instanceof Error ? e.message : String(e);
    this.log(`Failed to record failure context: ${message}`);
  }
}
|
|
|
|
/**
 * Maps a failure reason to a one-sentence suggestion for the next attempt.
 * Unrecognized reasons get a generic "review logs" hint.
 */
private getRecoveryHint(reason: string): string {
  if (reason === "stuck") {
    return "Agents became unresponsive. Try with fresh agents or GAMMA mediator.";
  }
  if (reason === "iteration_limit") {
    return "Max iterations reached without consensus. Consider simplifying objective.";
  }
  if (reason === "consensus_failed") {
    return "Agents completed but disagreed. GAMMA mediator may help resolve conflicts.";
  }
  return "Unknown failure. Review logs for details.";
}
|
|
|
|
/**
 * Detects whether this pipeline is a recovery run and, if so, loads what the
 * prior run left behind.
 *
 * Reads the `pipeline:<id>` hash; when `is_recovery` is "true" it sets
 * `isRecoveryRun`, `recoveryAttempt` and `forceGamma`, loads the JSON stored
 * under the key named by `inherited_handoff` into `inheritedContext`, and
 * prints a summary banner.
 *
 * Best-effort: any failure is logged as a warning and never thrown. A
 * malformed JSON blob only skips that blob; the rest of the context still
 * loads.
 */
private async loadRecoveryContext(): Promise<void> {
  if (!this.pipelineId) return;

  // Parses a JSON document expected to be an object; logs and yields null otherwise.
  const parseObject = (raw: string, label: string): any => {
    try {
      const value = JSON.parse(raw);
      if (value !== null && typeof value === "object") return value;
      this.log(`Warning: ${label} is not a JSON object; ignoring it`);
    } catch (e: unknown) {
      const reason = e instanceof Error ? e.message : String(e);
      this.log(`Warning: ${label} is not valid JSON: ${reason}`);
    }
    return null;
  };

  try {
    const redis = await getHandoffRedis();
    const pipelineData = await redis.hGetAll(`pipeline:${this.pipelineId}`);

    if (pipelineData.is_recovery !== "true") return;

    this.isRecoveryRun = true;
    // Explicit radix, and fall back to 1 so a garbled value cannot yield NaN.
    const parsedAttempt = Number.parseInt(pipelineData.recovery_attempt || "1", 10);
    this.recoveryAttempt = Number.isNaN(parsedAttempt) ? 1 : parsedAttempt;
    this.forceGamma = pipelineData.force_gamma === "true";

    console.log("\n" + "=".repeat(70));
    console.log(`RECOVERY RUN - Attempt ${this.recoveryAttempt}`);
    console.log("=".repeat(70));

    // Load inherited handoff
    const inheritedKey = pipelineData.inherited_handoff;
    if (inheritedKey) {
      const inheritedData = await redis.get(inheritedKey);
      const inherited = inheritedData ? parseObject(inheritedData, `inherited handoff ${inheritedKey}`) : null;
      if (inherited) {
        this.inheritedContext = inherited;
        console.log(`Loaded inherited context from: ${inheritedKey}`);
        console.log(`- Prior proposals: ${inherited.proposals?.length || 0}`);
        console.log(`- Prior synthesis attempts: ${inherited.synthesis_attempts?.length || 0}`);
        console.log(`- Recovery hints:`);
        if (Array.isArray(inherited.recovery_hints)) {
          inherited.recovery_hints.forEach((hint: string, i: number) => {
            console.log(` ${i + 1}. ${hint}`);
          });
        }
      }
    }

    // Also try loading prior_context (JSON string)
    if (pipelineData.prior_context) {
      const priorContext = parseObject(pipelineData.prior_context, "prior_context");
      if (priorContext) {
        console.log(`Prior run: ${priorContext.prior_pipeline}`);
        console.log(`Prior failure reason: ${priorContext.failure_reason}`);
        console.log(`Prior iteration count: ${priorContext.iteration_count}`);
      }
    }

    if (this.forceGamma) {
      console.log("\nFORCE GAMMA MODE: GAMMA mediator will be spawned immediately");
    }

    console.log("=".repeat(70) + "\n");
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    this.log(`Warning: Could not load recovery context: ${reason}`);
  }
}
|
|
|
|
/**
 * Exposes the context inherited from a prior run so agents can use it.
 * Returns the stored value as-is; it is `null` unless a recovery context was loaded.
 */
getInheritedContext(): any {
  const { inheritedContext } = this;
  return inheritedContext;
}
|
|
|
|
/** Reports whether GAMMA is flagged to be spawned immediately (the `forceGamma` flag). */
shouldForceGamma(): boolean {
  const { forceGamma } = this;
  return forceGamma;
}
|
|
|
|
/**
 * Copies what the prior run produced onto this run's blackboard.
 *
 * Inherited proposals go to the "solutions" section and inherited synthesis
 * attempts to the "synthesis" section, each under an `inherited_<agent>_<key>`
 * key and tagged with `_inherited` / `_from_run`. A `recovery_context` entry
 * describing the recovery is then written to the "problem" section.
 *
 * Does nothing without inherited context. Any failure is logged as a warning
 * and stops the remaining seeding; nothing is thrown.
 */
private async preseedBlackboard(): Promise<void> {
  if (!this.inheritedContext) return;

  // Writes every entry of one inherited list into the given blackboard section.
  const seedSection = async (
    section: "solutions" | "synthesis",
    entries: any[],
    fallbackKey: string
  ): Promise<void> => {
    for (const entry of entries) {
      const taggedValue = {
        ...entry.value,
        _inherited: true,
        _from_run: this.recoveryAttempt - 1
      };
      await this.blackboard.write(
        section,
        `inherited_${entry.agent}_${entry.key || fallbackKey}`,
        taggedValue,
        entry.agent as any
      );
    }
  };

  try {
    // Prior proposals -> "solutions"
    if (this.inheritedContext.proposals?.length > 0) {
      await seedSection("solutions", this.inheritedContext.proposals, "proposal");
      this.log(`Seeded ${this.inheritedContext.proposals.length} proposals from prior run`);
    }

    // Prior synthesis attempts -> "synthesis"
    if (this.inheritedContext.synthesis_attempts?.length > 0) {
      await seedSection("synthesis", this.inheritedContext.synthesis_attempts, "synthesis");
      this.log(`Seeded ${this.inheritedContext.synthesis_attempts.length} synthesis attempts from prior run`);
    }

    // Recovery metadata -> "problem"
    const recoveryNote = {
      is_recovery: true,
      recovery_attempt: this.recoveryAttempt,
      prior_pipeline: this.inheritedContext.from_pipeline,
      prior_proposals_count: this.inheritedContext.proposals?.length || 0,
      recovery_hints: this.inheritedContext.recovery_hints || [],
      instructions: [
        "This is a RECOVERY run - prior agents failed to reach consensus",
        "Review the inherited proposals in the 'solutions' section",
        "Look for common ground between prior proposals",
        "GAMMA mediator will help resolve conflicts",
        "Try to synthesize a solution that incorporates the best ideas from prior attempts"
      ]
    };
    await this.blackboard.write("problem", "recovery_context", recoveryNote, "ALPHA");
  } catch (e: any) {
    this.log(`Warning: Failed to pre-seed blackboard: ${e.message}`);
  }
}
|
|
|
|
/**
 * Collects the agents' proposals, synthesis attempts, consensus entries,
 * problem analysis, agent states and recent messages, and stores them as one
 * JSON document under `handoff:<pipelineId>:agents` (24-hour TTL). The key and
 * dump time are also recorded on the `pipeline:<pipelineId>` hash.
 *
 * Does nothing without a pipeline ID. Failures are logged and never thrown.
 */
private async dumpAgentHandoff(): Promise<void> {
  if (!this.pipelineId) return;

  try {
    const redis = await getHandoffRedis();
    const handoffKey = `handoff:${this.pipelineId}:agents`;

    // The blackboard sections are independent of each other; read them together.
    const [solutions, synthesis, consensus, problem] = await Promise.all([
      this.blackboard.readSection("solutions"),
      this.blackboard.readSection("synthesis"),
      this.blackboard.readSection("consensus"),
      this.blackboard.readSection("problem"),
    ]);

    // Get agent states
    const agentStates = await this.stateManager.getAllStates();

    // Get message history (GAMMA only has a bus once it has been spawned)
    const [alphaMessages, betaMessages, gammaMessages] = await Promise.all([
      this.alphaBus.getMessageLog(50),
      this.betaBus.getMessageLog(50),
      this.gammaBus ? this.gammaBus.getMessageLog(50) : [],
    ]);

    // Only claim the limit was exceeded when it actually was; this method is
    // also reached for other abort reasons.
    const iterationHint = this.isIterationLimitExceeded()
      ? `Iteration limit (${this.maxIterations}) exceeded after ${this.iterationCount} iterations`
      : `Aborted after ${this.iterationCount} of ${this.maxIterations} iterations (iteration limit not reached)`;

    // Build structured handoff
    const handoff = {
      pipeline_id: this.pipelineId,
      task_id: this.taskId,
      dump_time: new Date().toISOString(),
      iteration_count: this.iterationCount,
      max_iterations: this.maxIterations,
      gamma_active: this.gammaAgent !== undefined,

      // Agent proposals and analysis
      proposals: solutions.map(s => ({
        agent: s.author,
        key: s.key,
        value: s.value,
        version: s.version,
        timestamp: s.timestamp
      })),

      // Synthesis attempts
      synthesis_attempts: synthesis.map(s => ({
        agent: s.author,
        key: s.key,
        value: s.value,
        timestamp: s.timestamp
      })),

      // Consensus votes and discussions
      consensus_state: consensus.map(c => ({
        key: c.key,
        value: c.value,
        author: c.author,
        timestamp: c.timestamp
      })),

      // Problem analysis
      problem_analysis: problem.map(p => ({
        key: p.key,
        value: p.value,
        author: p.author
      })),

      // Agent states at abort time
      agent_states: agentStates.map(s => ({
        role: s.role,
        status: s.status,
        phase: s.phase,
        progress: s.progress,
        blocked_reason: s.blocked_reason,
        last_activity: s.last_activity
      })),

      // Recent message history for context (last 10 per agent)
      message_summary: {
        alpha_last_messages: alphaMessages.slice(-10),
        beta_last_messages: betaMessages.slice(-10),
        gamma_last_messages: gammaMessages.slice(-10)
      },

      // Recovery hints
      recovery_hints: [
        iterationHint,
        this.gammaAgent ? "GAMMA was active but could not resolve conflicts" : "GAMMA was not spawned",
        `${solutions.length} proposals generated, ${synthesis.length} synthesis attempts`,
        "Consider: simplifying objective, forcing GAMMA earlier, or increasing iteration limit"
      ]
    };

    // Store handoff JSON
    await redis.set(handoffKey, JSON.stringify(handoff), { EX: 86400 }); // 24hr TTL
    // One HSET for both fields so the reference and its timestamp land together.
    await redis.hSet(`pipeline:${this.pipelineId}`, {
      handoff_key: handoffKey,
      handoff_time: handoff.dump_time,
    });

    this.log(`Agent handoff dumped: ${solutions.length} proposals, ${synthesis.length} synthesis attempts`);
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    this.log(`Failed to dump agent handoff: ${reason}`);
  }
}
|
|
|
|
/**
 * Prints a message prefixed with the seconds elapsed since `startTime`
 * ("0.0" while `startTime` is still unset) and the orchestrator tag.
 */
private log(msg: string) {
  let elapsed = "0.0";
  if (this.startTime) {
    const seconds = (Date.now() - this.startTime) / 1000;
    elapsed = seconds.toFixed(1);
  }
  console.log(`[${elapsed}s] [ORCHESTRATOR] ${msg}`);
}
|
|
|
|
/**
 * Prepares everything a run needs, in this order: prints the start banner,
 * loads recovery context, connects the shared infrastructure (blackboard,
 * state manager, spawn controller, metrics), pre-seeds the blackboard on
 * recovery runs, connects the ALPHA/BETA message buses, and creates and
 * initializes the ALPHA and BETA agents.
 */
async initialize(): Promise<void> {
  this.startTime = Date.now();

  // Start banner: one console.log call per line.
  const rule = "=".repeat(70);
  const banner = [
    "\n" + rule,
    "MULTI-AGENT COORDINATION SYSTEM",
    "Task ID: " + this.taskId,
  ];
  if (this.pipelineId) {
    banner.push("Pipeline ID: " + this.pipelineId);
  }
  banner.push("Model: " + this.model, rule + "\n");
  for (const line of banner) {
    console.log(line);
  }

  // Recovery detection happens before any infrastructure is created.
  await this.loadRecoveryContext();

  this.log("Initializing coordination infrastructure...");

  // Shared infrastructure, all keyed by the task ID.
  const taskId = this.taskId;
  this.blackboard = new Blackboard(taskId);
  this.stateManager = new AgentStateManager(taskId);
  this.spawnController = new SpawnController(taskId);
  this.metrics = new MetricsCollector(taskId);

  const infrastructureReady = [
    this.blackboard.connect(),
    this.stateManager.connect(),
    this.spawnController.connect(),
    this.metrics.connect(),
  ];
  await Promise.all(infrastructureReady);

  this.log("Infrastructure connected");

  // Recovery runs start from the prior run's proposals.
  const shouldPreseed = this.isRecoveryRun && this.inheritedContext;
  if (shouldPreseed) {
    this.log("Pre-seeding blackboard with inherited context...");
    await this.preseedBlackboard();
  }

  // Message buses for the two initial agents.
  this.alphaBus = new MessageBus(taskId, "ALPHA");
  this.betaBus = new MessageBus(taskId, "BETA");

  const busesReady = [this.alphaBus.connect(), this.betaBus.connect()];
  await Promise.all(busesReady);

  this.log("Message buses connected");

  // Initial agents share the blackboard, state manager and metrics.
  this.alphaAgent = new AgentAlpha(
    taskId, this.blackboard, this.alphaBus, this.stateManager, this.metrics, this.model
  );
  this.betaAgent = new AgentBeta(
    taskId, this.blackboard, this.betaBus, this.stateManager, this.metrics, this.model
  );

  const agentsReady = [this.alphaAgent.init(), this.betaAgent.init()];
  await Promise.all(agentsReady);

  this.log("Agents ALPHA and BETA initialized");
}
|
|
|
|
/**
 * Creates, connects and initializes the GAMMA agent with its own message bus,
 * then records the spawn on the spawn controller. A second call while GAMMA
 * already exists only logs and returns.
 *
 * @param reason The spawn condition that triggered GAMMA; its `type` is passed
 *   to the agent and its threshold/current value are logged.
 */
async spawnGamma(reason: SpawnCondition): Promise<void> {
  const alreadySpawned = Boolean(this.gammaAgent);
  if (alreadySpawned) {
    this.log("GAMMA already spawned, skipping");
    return;
  }

  const { type, threshold, current_value } = reason;
  this.log(`SPAWNING GAMMA - Reason: ${type} (threshold: ${threshold}, current: ${current_value})`);

  // Message bus for GAMMA (assigned to the field before connecting, as before)
  const bus = new MessageBus(this.taskId, "GAMMA");
  this.gammaBus = bus;
  await bus.connect();

  // GAMMA agent itself
  const gamma = new AgentGamma(
    this.taskId, this.blackboard, bus, this.stateManager, this.metrics,
    type, this.model
  );
  this.gammaAgent = gamma;
  await gamma.init();

  await this.spawnController.markGammaSpawned(reason);

  this.log("GAMMA agent spawned and initialized");
}
|
|
|
|
/**
 * One monitoring pass: records progress, checks the abort conditions, feeds
 * the STUCK / CONFLICT / COMPLEXITY spawn conditions (spawning GAMMA when the
 * spawn controller says so), and logs a status line.
 *
 * @returns `{ abort: true, reason }` with reason "stuck" or "iteration_limit"
 *   when the run should be aborted, otherwise `{ abort: false }`.
 */
private async monitorConditions(): Promise<{ abort: boolean; reason?: string }> {
  // Asks the spawn controller for a verdict and spawns GAMMA if it agrees.
  const spawnIfWarranted = async (): Promise<void> => {
    const { shouldSpawn, reason } = await this.spawnController.checkSpawnConditions();
    if (shouldSpawn && reason) {
      await this.spawnGamma(reason);
    }
  };

  // NOTE(review): updateProgress() refreshes lastProgressTime on every pass,
  // immediately before isStuck() reads it, so the stuck check below looks like
  // it can never fire. Confirm whether progress should be tied to real agent
  // activity instead.
  this.updateProgress();

  // Abort: no progress within the progress timeout
  if (this.isStuck()) {
    this.log("TIMEOUT: No progress detected - triggering auto-recovery");
    await this.reportError("iteration_timeout", "high", "Agents stuck without progress");
    return { abort: true, reason: "stuck" };
  }

  // Abort: iteration budget used up
  if (this.isIterationLimitExceeded()) {
    this.log("LIMIT: Maximum iterations exceeded - triggering auto-recovery");
    await this.reportError("iteration_limit", "high", `Max iterations (${this.maxIterations}) exceeded`);
    return { abort: true, reason: "iteration_limit" };
  }

  // STUCK spawn condition
  const stuckAgents = await this.stateManager.detectStuckAgents(30);
  if (stuckAgents.length > 0) {
    this.log(`Stuck agents detected: ${stuckAgents.join(", ")}`);
    const stuckCondition = await this.spawnController.updateCondition("STUCK", stuckAgents.length);
    if (stuckCondition?.triggered) {
      await spawnIfWarranted();
    }
  }

  // CONFLICT spawn condition
  const metricsData = await this.metrics.getMetrics();
  const unresolvedConflicts = metricsData.conflicts_detected - metricsData.conflicts_resolved;
  const conflictCondition = await this.spawnController.updateCondition("CONFLICT", unresolvedConflicts);
  if (conflictCondition?.triggered && !this.spawnController.isGammaSpawned()) {
    await spawnIfWarranted();
  }

  // COMPLEXITY spawn condition (score read from the blackboard)
  const analysis = await this.blackboard.read("problem", "analysis");
  const complexityScore = analysis?.value?.complexity_score;
  if (complexityScore) {
    const complexityCondition = await this.spawnController.updateCondition("COMPLEXITY", complexityScore);
    if (complexityCondition?.triggered && !this.spawnController.isGammaSpawned()) {
      await spawnIfWarranted();
    }
  }

  // Status line
  const states = await this.stateManager.getAllStates();
  const statesSummary = states
    .map(s => `${s.role}:${s.status}(${(s.progress * 100).toFixed(0)}%)`)
    .join(", ");
  this.log(`Status: ${statesSummary} | Messages: ${metricsData.total_messages} | Conflicts: ${unresolvedConflicts} | Iter: ${this.iterationCount}`);

  return { abort: false };
}
|
|
|
|
/**
 * Runs one task end to end: publishes it on the blackboard, optionally
 * force-spawns GAMMA (recovery runs), runs ALPHA and BETA in parallel under a
 * 2-second monitor, runs GAMMA if it was spawned, and finalizes the metrics.
 *
 * The wait for ALPHA/BETA ends as soon as one of these happens: both agents
 * finish, `task.timeout_seconds` elapses, or the monitor requests an abort.
 * On abort the agent handoff is dumped, tokens are revoked, the failure
 * context is recorded, and the partial metrics are returned with
 * `abort_reason` set.
 *
 * @param task The task to run.
 * @returns Final (or partial) metrics; `abort_reason` is present only when the
 *   run was aborted by the monitor.
 */
async runTask(task: TaskDefinition): Promise<CoordinationMetrics & { abort_reason?: string }> {
  this.log(`Starting task: ${task.objective.slice(0, 60)}...`);
  this.lastProgressTime = Date.now();

  // Write task to blackboard
  await this.blackboard.write("problem", "task_definition", task, "ALPHA");

  // FORCE GAMMA: If this is a recovery run, spawn GAMMA immediately
  if (this.forceGamma && !this.gammaAgent) {
    this.log("FORCE GAMMA MODE: Spawning GAMMA mediator immediately for recovery");
    const forceReason: SpawnCondition = {
      type: "STUCK",
      threshold: 0,
      current_value: 1,
      triggered: true,
      description: "Force-spawned for recovery run"
    };
    await this.spawnGamma(forceReason);
  }

  // Abort signalling: the monitor sets the reason and resolves abortPromise,
  // which ends the wait below right away instead of only after the agents
  // finish or the timeout elapses.
  let abortReason: string | undefined;
  let signalAbort: () => void = () => {};
  const abortPromise = new Promise<void>(resolve => {
    signalAbort = resolve;
  });

  // Start monitoring with abort detection
  let tickRunning = false;
  this.monitorInterval = setInterval(async () => {
    // Skip a tick while the previous one is still awaiting, so passes never overlap.
    if (tickRunning || abortReason) return;
    tickRunning = true;
    try {
      const result = await this.monitorConditions();
      if (result.abort && result.reason) {
        abortReason = result.reason;
        if (this.monitorInterval) {
          clearInterval(this.monitorInterval);
        }
        signalAbort();
      }
    } catch (e: unknown) {
      // A failing pass must not become an unhandled rejection.
      const reason = e instanceof Error ? e.message : String(e);
      this.log(`Monitor pass failed: ${reason}`);
    } finally {
      tickRunning = false;
    }
  }, 2000);

  // Run agents in parallel
  this.log("Launching ALPHA and BETA in parallel...");

  const alphaPromise = this.alphaAgent.run(task).catch(async e => {
    await this.reportError("agent_failure", "high", `ALPHA error: ${e.message}`);
  });

  const betaPromise = this.betaAgent.run(task).catch(async e => {
    await this.reportError("agent_failure", "high", `BETA error: ${e.message}`);
  });

  // Wait for initial agents to complete, the timeout, or an abort
  const timeoutMs = task.timeout_seconds * 1000;
  let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
  const timeoutPromise = new Promise<void>(resolve => {
    timeoutHandle = setTimeout(resolve, timeoutMs);
  });

  await Promise.race([
    Promise.all([alphaPromise, betaPromise]),
    timeoutPromise,
    abortPromise,
  ]);
  // The timer is no longer needed once the race has settled.
  clearTimeout(timeoutHandle);

  // Check if we were aborted during execution
  if (abortReason) {
    this.log(`Task aborted: ${abortReason}`);
    if (this.monitorInterval) {
      clearInterval(this.monitorInterval);
    }

    // CRITICAL: Dump all agent proposals/analysis to handoff JSON BEFORE revoking tokens
    this.log("Dumping agent handoff data for recovery pipeline...");
    await this.dumpAgentHandoff();

    // Now revoke tokens for stuck agents
    await this.revokeStuckAgentTokens();

    // Get partial metrics and mark as failed
    const partialMetrics = await this.metrics.finalize(false);
    // monitorConditions only ever reports these two abort reasons.
    await this.recordFailureContext(abortReason as "stuck" | "iteration_limit", partialMetrics);

    return { ...partialMetrics, abort_reason: abortReason };
  }

  this.log("Initial agents completed or timeout reached");

  // Check if GAMMA needs to be spawned for success validation
  const states = await this.stateManager.getAllStates();
  const bothComplete = states.every(s => s.status === "WAITING" || s.status === "COMPLETED");

  if (bothComplete && !this.spawnController.isGammaSpawned()) {
    await this.spawnController.updateCondition("SUCCESS", 1.0);
    const { shouldSpawn, reason } = await this.spawnController.checkSpawnConditions();
    if (shouldSpawn && reason) {
      await this.spawnGamma(reason);
    }
  }

  // If GAMMA was spawned, run it
  if (this.gammaAgent) {
    this.log("Running GAMMA for resolution...");
    await this.gammaAgent.run(task).catch(async e => {
      await this.reportError("agent_failure", "high", `GAMMA error: ${e.message}`);
    });
  }

  // Stop monitoring
  if (this.monitorInterval) {
    clearInterval(this.monitorInterval);
  }

  // Check consensus: either the synthesis section agrees, or an explicit
  // consensus/final entry says it was achieved.
  const consensus = await this.blackboard.checkConsensus("synthesis", ["ALPHA", "BETA"]);
  const consensusAchieved = consensus.reached ||
    (await this.blackboard.read("consensus", "final"))?.value?.achieved === true;

  this.log(`Consensus achieved: ${consensusAchieved}`);

  // Finalize metrics
  const finalMetrics = await this.metrics.finalize(consensusAchieved);

  // If consensus failed, record structured failure context
  if (!consensusAchieved) {
    await this.recordFailureContext("consensus_failed", finalMetrics);
  }

  return finalMetrics;
}
|
|
|
|
/**
 * Stops the monitor and disconnects every message bus and infrastructure
 * component that was created.
 *
 * Every component gets a disconnect attempt even when another one fails;
 * failures are logged and never thrown, so callers can run this from a
 * `finally` block without losing their own result.
 */
async cleanup(): Promise<void> {
  this.log("Cleaning up...");

  if (this.monitorInterval) {
    clearInterval(this.monitorInterval);
    this.monitorInterval = undefined;
  }

  // Components may be missing when initialize() failed part-way.
  const components = [
    this.alphaBus,
    this.betaBus,
    this.gammaBus,
    this.blackboard,
    this.stateManager,
    this.spawnController,
    this.metrics,
  ];

  // allSettled instead of all: one failing disconnect must not skip the rest
  // or reject the whole cleanup.
  const outcomes = await Promise.allSettled(
    components.map(async component => {
      await component?.disconnect();
    })
  );

  for (const outcome of outcomes) {
    if (outcome.status === "rejected") {
      const reason = outcome.reason instanceof Error ? outcome.reason.message : String(outcome.reason);
      this.log(`Cleanup warning: disconnect failed: ${reason}`);
    }
  }

  this.log("Cleanup complete");
}
|
|
|
|
/** Returns the task ID this orchestrator was constructed with. */
getTaskId(): string {
  const { taskId } = this;
  return taskId;
}
|
|
}
|
|
|
|
// =============================================================================
|
|
// Performance Analysis
|
|
// =============================================================================
|
|
|
|
/**
 * Prints a human-readable performance report for a finished run: timing,
 * communication volume, conflict handling, GAMMA usage, outcome, and a few
 * fixed threshold checks (50 messages, 3 conflicts).
 *
 * When the run has no usable duration (missing `end_time`, unparsable dates,
 * or a non-positive span) the duration prints as 0.0s and the communication
 * overhead prints as "n/a" instead of dividing by zero.
 *
 * @param metrics Metrics of the run to report on.
 */
export function analyzePerformance(metrics: CoordinationMetrics): void {
  const rule = "=".repeat(70);

  console.log("\n" + rule);
  console.log("PERFORMANCE ANALYSIS");
  console.log(rule);

  const rawDuration = metrics.end_time
    ? (new Date(metrics.end_time).getTime() - new Date(metrics.start_time).getTime()) / 1000
    : 0;
  // Invalid dates produce NaN; treat that like "no duration known".
  const duration = Number.isFinite(rawDuration) ? rawDuration : 0;

  console.log("\nTiming:");
  console.log(` Duration: ${duration.toFixed(1)}s`);

  const operations = metrics.total_messages + metrics.blackboard_writes;
  const overhead = duration > 0
    ? `${(operations / duration).toFixed(2)} ops/sec`
    : "n/a (duration unavailable)";

  console.log("\nCommunication:");
  console.log(` Total messages: ${metrics.total_messages}`);
  console.log(` Direct messages: ${metrics.direct_messages}`);
  console.log(` Blackboard writes: ${metrics.blackboard_writes}`);
  console.log(` Blackboard reads: ${metrics.blackboard_reads}`);
  console.log(` Communication overhead: ${overhead}`);

  console.log("\nCoordination:");
  console.log(` Conflicts detected: ${metrics.conflicts_detected}`);
  console.log(` Conflicts resolved: ${metrics.conflicts_resolved}`);
  console.log(` Conflict resolution rate: ${metrics.conflicts_detected > 0 ? ((metrics.conflicts_resolved / metrics.conflicts_detected) * 100).toFixed(1) : 100}%`);

  console.log("\nGamma Agent:");
  console.log(` Spawned: ${metrics.gamma_spawned ? "Yes" : "No"}`);
  if (metrics.gamma_spawned) {
    console.log(` Spawn reason: ${metrics.gamma_spawn_reason}`);
    console.log(` Spawn time: ${metrics.gamma_spawn_time}`);
  }

  console.log("\nOutcome:");
  console.log(` Consensus achieved: ${metrics.final_consensus ? "Yes" : "No"}`);
  console.log(` Performance score: ${(metrics.performance_score * 100).toFixed(1)}%`);

  // Threshold analysis
  console.log("\nThreshold Effects:");
  const messageThreshold = 50;
  const conflictThreshold = 3;

  if (metrics.total_messages > messageThreshold) {
    console.log(` ! High message volume (${metrics.total_messages} > ${messageThreshold}) - potential coordination overhead`);
  } else {
    console.log(` + Message volume within threshold (${metrics.total_messages} <= ${messageThreshold})`);
  }

  if (metrics.conflicts_detected > conflictThreshold) {
    console.log(` ! High conflict rate (${metrics.conflicts_detected} > ${conflictThreshold}) - agents may have divergent strategies`);
  } else {
    console.log(` + Conflict rate within threshold (${metrics.conflicts_detected} <= ${conflictThreshold})`);
  }

  if (metrics.gamma_spawned && metrics.gamma_spawn_reason === "STUCK") {
    console.log(` ! Gamma spawned due to stuck condition - consider adjusting agent strategies`);
  }

  console.log("\n" + rule);
}
|
|
|
|
// =============================================================================
|
|
// CLI Entry Point
|
|
// =============================================================================
|
|
|
|
/**
 * CLI entry point.
 *
 * Usage: `[objective] [--model <id>] [--timeout <seconds>]`
 *
 * The objective is the first argument that is neither a flag nor a flag's
 * value; without one a built-in architecture-design objective is used. An
 * invalid or non-positive `--timeout` is ignored in favor of the default.
 *
 * Exit codes:
 *   0 = Success (consensus achieved)
 *   1 = Error (crash or exception)
 *   2 = Consensus failure (agents completed but no agreement)
 *   3 = Aborted (timeout/stuck/iteration limit - auto-recovery needed)
 */
async function main() {
  const args = process.argv.slice(2);

  // Default complex task
  const defaultObjective = `Design a distributed event-driven architecture for a real-time analytics platform that handles:
1) High-throughput data ingestion from multiple sources
2) Stream processing with exactly-once semantics
3) Real-time aggregations and windowed computations
4) Low-latency query serving for dashboards
5) Horizontal scalability to handle 1M events/second
The solution should consider fault tolerance, data consistency, and cost optimization.`;

  // Pick the first positional argument, so that `--model x` alone does not
  // turn "--model" into the objective.
  const flagsWithValue = new Set(["--model", "--timeout"]);
  let objective = defaultObjective;
  for (let i = 0; i < args.length; i++) {
    const arg = args[i] ?? "";
    if (flagsWithValue.has(arg)) {
      i++; // skip the flag's value
      continue;
    }
    if (arg.startsWith("--")) continue;
    if (arg) objective = arg;
    break;
  }

  let model = "anthropic/claude-sonnet-4";
  const modelIdx = args.indexOf("--model");
  const modelArg = modelIdx !== -1 ? args[modelIdx + 1] : undefined;
  if (modelArg) {
    model = modelArg;
  }

  // Parse timeout (seconds)
  let timeout = 120;
  const timeoutIdx = args.indexOf("--timeout");
  const timeoutArg = timeoutIdx !== -1 ? args[timeoutIdx + 1] : undefined;
  if (timeoutArg) {
    const parsedTimeout = Number.parseInt(timeoutArg, 10);
    if (Number.isFinite(parsedTimeout) && parsedTimeout > 0) {
      timeout = parsedTimeout;
    } else {
      // NaN would make the run's timeout timer fire immediately.
      console.error(`[ORCHESTRATOR] Ignoring invalid --timeout value "${timeoutArg}"; using ${timeout}s`);
    }
  }

  const orchestrator = new MultiAgentOrchestrator(model);

  const task: TaskDefinition = {
    // Same ID the orchestrator uses for its infrastructure (honors TASK_ID).
    task_id: orchestrator.getTaskId(),
    objective,
    complexity: "high",
    subtasks: [
      { id: "s1", description: "Analyze data ingestion requirements", status: "pending", dependencies: [] },
      { id: "s2", description: "Design stream processing pipeline", status: "pending", dependencies: ["s1"] },
      { id: "s3", description: "Plan storage and query layer", status: "pending", dependencies: ["s1"] },
      { id: "s4", description: "Define scalability strategy", status: "pending", dependencies: ["s2", "s3"] },
      { id: "s5", description: "Integrate fault tolerance mechanisms", status: "pending", dependencies: ["s4"] },
    ],
    constraints: [
      "Must use open-source technologies where possible",
      "Latency < 100ms for query responses",
      "Support for multiple data formats (JSON, Avro, Protobuf)",
      "Cost-effective for variable workloads",
    ],
    success_criteria: [
      "Complete architecture design with component diagrams",
      "Data flow specifications",
      "Scalability analysis",
      "Fault tolerance mechanisms documented",
      "Cost estimation provided",
    ],
    timeout_seconds: timeout,
  };

  const pipelineId = process.env.PIPELINE_ID;
  let exitCode = 0;

  try {
    await orchestrator.initialize();
    const metrics = await orchestrator.runTask(task);
    const abortReason = metrics.abort_reason;

    console.log("\n" + "=".repeat(70));
    console.log("FINAL METRICS");
    console.log("=".repeat(70));
    console.log(JSON.stringify(metrics, null, 2));

    analyzePerformance(metrics);

    // Output special marker for server to parse consensus status
    // Format: ORCHESTRATION_RESULT:{"consensus":true/false,"metrics":{...},"abort_reason":...}
    console.log("\nORCHESTRATION_RESULT:" + JSON.stringify({
      consensus: metrics.final_consensus,
      task_id: metrics.task_id,
      metrics: metrics,
      abort_reason: abortReason || null,
      requires_auto_recovery: !metrics.final_consensus || !!abortReason
    }));

    if (abortReason) {
      console.log(`\n[ORCHESTRATOR] Task aborted: ${abortReason} - exiting with code 3 (auto-recovery needed)`);
      exitCode = 3;

      if (pipelineId) {
        await reportErrorToObservability(pipelineId, "orchestrator_aborted", "high",
          `Orchestration aborted: ${abortReason}. Auto-recovery required.`);
      }
    } else if (!metrics.final_consensus) {
      console.log("\n[ORCHESTRATOR] Consensus NOT achieved - exiting with code 2 (auto-recovery needed)");
      exitCode = 2;

      // Report consensus failure to observability
      if (pipelineId) {
        await reportErrorToObservability(pipelineId, "consensus_failure", "high",
          `Agents failed to reach consensus. Conflicts: ${metrics.conflicts_detected}, Resolved: ${metrics.conflicts_resolved}`);
      }
    }
  } catch (e: unknown) {
    const message = e instanceof Error ? e.message : String(e);
    console.error("Orchestrator error:", message);
    exitCode = 1;

    // Report critical error to observability if pipeline ID is set
    if (pipelineId) {
      await reportErrorToObservability(pipelineId, "orchestrator_failure", "critical", message);
    }
  } finally {
    // A cleanup failure must not replace the exit code decided above.
    try {
      await orchestrator.cleanup();
    } catch (e: unknown) {
      console.error("Cleanup error:", e instanceof Error ? e.message : String(e));
    }
    // Explicitly exit to ensure all connections are closed
    process.exit(exitCode);
  }
}

main().catch((e) => {
  console.error("Fatal error:", e);
  process.exit(1);
});
|