Implement auto-recovery for consensus failures
- Add iteration tracking and stuck detection to orchestrator
- Add triggerAutoRecovery function for automatic pipeline respawn
- Store structured failure context (proposals, conflicts, reason)
- Force GAMMA agent on recovery attempts for conflict resolution
- Limit auto-recovery to 3 attempts to prevent infinite loops
- Add UI status badges for rebooting/aborted states
- Add failure-context API endpoint for orchestrator handoff
- Add test_auto_recovery.py with 6 passing tests

Exit codes: 0=success, 1=error, 2=consensus failure, 3=aborted

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
c6554a8b3d
commit
a19535b580
@ -77,6 +77,10 @@ export class MultiAgentOrchestrator {
|
|||||||
private startTime!: number;
|
private startTime!: number;
|
||||||
private monitorInterval?: ReturnType<typeof setInterval>;
|
private monitorInterval?: ReturnType<typeof setInterval>;
|
||||||
private errorCount: number = 0;
|
private errorCount: number = 0;
|
||||||
|
private iterationCount: number = 0;
|
||||||
|
private maxIterations: number = 10;
|
||||||
|
private lastProgressTime: number = 0;
|
||||||
|
private progressTimeout: number = 60000; // 60 seconds without progress = stuck
|
||||||
|
|
||||||
constructor(model: string = "anthropic/claude-sonnet-4") {
|
constructor(model: string = "anthropic/claude-sonnet-4") {
|
||||||
// Use environment variable for task ID if provided
|
// Use environment variable for task ID if provided
|
||||||
@ -93,6 +97,85 @@ export class MultiAgentOrchestrator {
|
|||||||
this.log(`ERROR [${severity}] ${errorType}: ${details}`);
|
this.log(`ERROR [${severity}] ${errorType}: ${details}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private updateProgress(): void {
|
||||||
|
this.lastProgressTime = Date.now();
|
||||||
|
this.iterationCount++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * True when no progress has been recorded for longer than `progressTimeout`.
 * Returns false until the first progress timestamp has been set.
 */
private isStuck(): boolean {
  const last = this.lastProgressTime;
  return last !== 0 && Date.now() - last > this.progressTimeout;
}
|
||||||
|
|
||||||
|
/** True once the iteration counter has reached the configured maximum. */
private isIterationLimitExceeded(): boolean {
  return !(this.iterationCount < this.maxIterations);
}
|
||||||
|
|
||||||
|
// Request token revocation for stuck agents
|
||||||
|
private async revokeStuckAgentTokens(): Promise<void> {
|
||||||
|
if (!this.pipelineId) return;
|
||||||
|
|
||||||
|
try {
|
||||||
|
await fetch("http://localhost:3000/api/pipeline/revoke", {
|
||||||
|
method: "POST",
|
||||||
|
headers: { "Content-Type": "application/json" },
|
||||||
|
body: JSON.stringify({
|
||||||
|
pipeline_id: this.pipelineId,
|
||||||
|
reason: "iteration_timeout",
|
||||||
|
details: `Agents stuck after ${this.iterationCount} iterations`
|
||||||
|
})
|
||||||
|
});
|
||||||
|
this.log("Token revocation requested for stuck agents");
|
||||||
|
} catch (e: any) {
|
||||||
|
this.log(`Failed to revoke tokens: ${e.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Record structured failure context for auto-recovery
|
||||||
|
private async recordFailureContext(
|
||||||
|
reason: "stuck" | "iteration_limit" | "consensus_failed",
|
||||||
|
metrics: CoordinationMetrics
|
||||||
|
): Promise<void> {
|
||||||
|
if (!this.pipelineId) return;
|
||||||
|
|
||||||
|
const failureContext = {
|
||||||
|
pipeline_id: this.pipelineId,
|
||||||
|
task_id: this.taskId,
|
||||||
|
failure_reason: reason,
|
||||||
|
failure_time: new Date().toISOString(),
|
||||||
|
iteration_count: this.iterationCount,
|
||||||
|
elapsed_ms: Date.now() - this.startTime,
|
||||||
|
metrics: metrics,
|
||||||
|
gamma_spawned: this.gammaAgent !== undefined,
|
||||||
|
error_count: this.errorCount,
|
||||||
|
recovery_hint: this.getRecoveryHint(reason)
|
||||||
|
};
|
||||||
|
|
||||||
|
try {
|
||||||
|
await fetch("http://localhost:3000/api/pipeline/failure-context", {
|
||||||
|
method: "POST",
|
||||||
|
headers: { "Content-Type": "application/json" },
|
||||||
|
body: JSON.stringify(failureContext)
|
||||||
|
});
|
||||||
|
this.log(`Failure context recorded: ${reason}`);
|
||||||
|
} catch (e: any) {
|
||||||
|
this.log(`Failed to record failure context: ${e.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Map a failure reason to a human-readable recovery suggestion. */
private getRecoveryHint(reason: string): string {
  const hints: Record<string, string> = {
    stuck: "Agents became unresponsive. Try with fresh agents or GAMMA mediator.",
    iteration_limit: "Max iterations reached without consensus. Consider simplifying objective.",
    consensus_failed: "Agents completed but disagreed. GAMMA mediator may help resolve conflicts."
  };
  return hints[reason] ?? "Unknown failure. Review logs for details.";
}
|
||||||
|
|
||||||
private log(msg: string) {
|
private log(msg: string) {
|
||||||
const elapsed = this.startTime ? ((Date.now() - this.startTime) / 1000).toFixed(1) : "0.0";
|
const elapsed = this.startTime ? ((Date.now() - this.startTime) / 1000).toFixed(1) : "0.0";
|
||||||
console.log(`[${elapsed}s] [ORCHESTRATOR] ${msg}`);
|
console.log(`[${elapsed}s] [ORCHESTRATOR] ${msg}`);
|
||||||
@ -178,7 +261,24 @@ export class MultiAgentOrchestrator {
|
|||||||
this.log("GAMMA agent spawned and initialized");
|
this.log("GAMMA agent spawned and initialized");
|
||||||
}
|
}
|
||||||
|
|
||||||
private async monitorConditions(): Promise<void> {
|
private async monitorConditions(): Promise<{ abort: boolean; reason?: string }> {
|
||||||
|
// Track progress
|
||||||
|
this.updateProgress();
|
||||||
|
|
||||||
|
// Check for iteration timeout (stuck without progress)
|
||||||
|
if (this.isStuck()) {
|
||||||
|
this.log("TIMEOUT: No progress detected - triggering auto-recovery");
|
||||||
|
await this.reportError("iteration_timeout", "high", "Agents stuck without progress");
|
||||||
|
return { abort: true, reason: "stuck" };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for iteration limit exceeded
|
||||||
|
if (this.isIterationLimitExceeded()) {
|
||||||
|
this.log("LIMIT: Maximum iterations exceeded - triggering auto-recovery");
|
||||||
|
await this.reportError("iteration_limit", "high", `Max iterations (${this.maxIterations}) exceeded`);
|
||||||
|
return { abort: true, reason: "iteration_limit" };
|
||||||
|
}
|
||||||
|
|
||||||
// Check stuck condition
|
// Check stuck condition
|
||||||
const stuckAgents = await this.stateManager.detectStuckAgents(30);
|
const stuckAgents = await this.stateManager.detectStuckAgents(30);
|
||||||
if (stuckAgents.length > 0) {
|
if (stuckAgents.length > 0) {
|
||||||
@ -218,17 +318,31 @@ export class MultiAgentOrchestrator {
|
|||||||
// Log current state
|
// Log current state
|
||||||
const states = await this.stateManager.getAllStates();
|
const states = await this.stateManager.getAllStates();
|
||||||
const statesSummary = states.map(s => `${s.role}:${s.status}(${(s.progress * 100).toFixed(0)}%)`).join(", ");
|
const statesSummary = states.map(s => `${s.role}:${s.status}(${(s.progress * 100).toFixed(0)}%)`).join(", ");
|
||||||
this.log(`Status: ${statesSummary} | Messages: ${metricsData.total_messages} | Conflicts: ${unresolvedConflicts}`);
|
this.log(`Status: ${statesSummary} | Messages: ${metricsData.total_messages} | Conflicts: ${unresolvedConflicts} | Iter: ${this.iterationCount}`);
|
||||||
|
|
||||||
|
return { abort: false };
|
||||||
}
|
}
|
||||||
|
|
||||||
async runTask(task: TaskDefinition): Promise<CoordinationMetrics> {
|
async runTask(task: TaskDefinition): Promise<CoordinationMetrics & { abort_reason?: string }> {
|
||||||
this.log(`Starting task: ${task.objective.slice(0, 60)}...`);
|
this.log(`Starting task: ${task.objective.slice(0, 60)}...`);
|
||||||
|
this.lastProgressTime = Date.now();
|
||||||
|
|
||||||
// Write task to blackboard
|
// Write task to blackboard
|
||||||
await this.blackboard.write("problem", "task_definition", task, "ALPHA");
|
await this.blackboard.write("problem", "task_definition", task, "ALPHA");
|
||||||
|
|
||||||
// Start monitoring
|
// Track if we need to abort due to timeout/iteration limit
|
||||||
this.monitorInterval = setInterval(() => this.monitorConditions(), 2000);
|
let abortReason: string | undefined;
|
||||||
|
|
||||||
|
// Start monitoring with abort detection
|
||||||
|
this.monitorInterval = setInterval(async () => {
|
||||||
|
const result = await this.monitorConditions();
|
||||||
|
if (result.abort && result.reason) {
|
||||||
|
abortReason = result.reason;
|
||||||
|
if (this.monitorInterval) {
|
||||||
|
clearInterval(this.monitorInterval);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}, 2000);
|
||||||
|
|
||||||
// Run agents in parallel
|
// Run agents in parallel
|
||||||
this.log("Launching ALPHA and BETA in parallel...");
|
this.log("Launching ALPHA and BETA in parallel...");
|
||||||
@ -250,6 +364,23 @@ export class MultiAgentOrchestrator {
|
|||||||
timeoutPromise,
|
timeoutPromise,
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
// Check if we were aborted during execution
|
||||||
|
if (abortReason) {
|
||||||
|
this.log(`Task aborted: ${abortReason}`);
|
||||||
|
if (this.monitorInterval) {
|
||||||
|
clearInterval(this.monitorInterval);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Revoke tokens for stuck agents
|
||||||
|
await this.revokeStuckAgentTokens();
|
||||||
|
|
||||||
|
// Get partial metrics and mark as failed
|
||||||
|
const partialMetrics = await this.metrics.finalize(false);
|
||||||
|
await this.recordFailureContext(abortReason as any, partialMetrics);
|
||||||
|
|
||||||
|
return { ...partialMetrics, abort_reason: abortReason };
|
||||||
|
}
|
||||||
|
|
||||||
this.log("Initial agents completed or timeout reached");
|
this.log("Initial agents completed or timeout reached");
|
||||||
|
|
||||||
// Check if GAMMA needs to be spawned for success validation
|
// Check if GAMMA needs to be spawned for success validation
|
||||||
@ -287,6 +418,11 @@ export class MultiAgentOrchestrator {
|
|||||||
// Finalize metrics
|
// Finalize metrics
|
||||||
const finalMetrics = await this.metrics.finalize(consensusAchieved);
|
const finalMetrics = await this.metrics.finalize(consensusAchieved);
|
||||||
|
|
||||||
|
// If consensus failed, record structured failure context
|
||||||
|
if (!consensusAchieved) {
|
||||||
|
await this.recordFailureContext("consensus_failed", finalMetrics);
|
||||||
|
}
|
||||||
|
|
||||||
return finalMetrics;
|
return finalMetrics;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -449,16 +585,31 @@ The solution should consider fault tolerance, data consistency, and cost optimiz
|
|||||||
analyzePerformance(metrics);
|
analyzePerformance(metrics);
|
||||||
|
|
||||||
// Output special marker for server to parse consensus status
|
// Output special marker for server to parse consensus status
|
||||||
// Format: ORCHESTRATION_RESULT:{"consensus":true/false,"metrics":{...}}
|
// Format: ORCHESTRATION_RESULT:{"consensus":true/false,"metrics":{...},"abort_reason":...}
|
||||||
console.log("\nORCHESTRATION_RESULT:" + JSON.stringify({
|
console.log("\nORCHESTRATION_RESULT:" + JSON.stringify({
|
||||||
consensus: metrics.final_consensus,
|
consensus: metrics.final_consensus,
|
||||||
task_id: metrics.task_id,
|
task_id: metrics.task_id,
|
||||||
metrics: metrics
|
metrics: metrics,
|
||||||
|
abort_reason: (metrics as any).abort_reason || null,
|
||||||
|
requires_auto_recovery: !metrics.final_consensus || !!(metrics as any).abort_reason
|
||||||
}));
|
}));
|
||||||
|
|
||||||
// Exit with code 2 for consensus failure (distinct from error=1, success=0)
|
// Exit codes:
|
||||||
if (!metrics.final_consensus) {
|
// 0 = Success (consensus achieved)
|
||||||
console.log("\n[ORCHESTRATOR] Consensus NOT achieved - exiting with code 2");
|
// 1 = Error (crash or exception)
|
||||||
|
// 2 = Consensus failure (agents completed but no agreement)
|
||||||
|
// 3 = Aborted (timeout/stuck/iteration limit - auto-recovery needed)
|
||||||
|
if ((metrics as any).abort_reason) {
|
||||||
|
console.log(`\n[ORCHESTRATOR] Task aborted: ${(metrics as any).abort_reason} - exiting with code 3 (auto-recovery needed)`);
|
||||||
|
exitCode = 3;
|
||||||
|
|
||||||
|
const pipelineId = process.env.PIPELINE_ID;
|
||||||
|
if (pipelineId) {
|
||||||
|
await reportErrorToObservability(pipelineId, "orchestrator_aborted", "high",
|
||||||
|
`Orchestration aborted: ${(metrics as any).abort_reason}. Auto-recovery required.`);
|
||||||
|
}
|
||||||
|
} else if (!metrics.final_consensus) {
|
||||||
|
console.log("\n[ORCHESTRATOR] Consensus NOT achieved - exiting with code 2 (auto-recovery needed)");
|
||||||
exitCode = 2;
|
exitCode = 2;
|
||||||
|
|
||||||
// Report consensus failure to observability
|
// Report consensus failure to observability
|
||||||
|
|||||||
451
tests/test_auto_recovery.py
Normal file
451
tests/test_auto_recovery.py
Normal file
@ -0,0 +1,451 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Auto-Recovery Test for Consensus Failures
|
||||||
|
|
||||||
|
Tests the system's ability to automatically recover from consensus failures
|
||||||
|
by spawning new pipelines with the collected failure context.
|
||||||
|
|
||||||
|
Exit codes:
|
||||||
|
- 0: All tests passed
|
||||||
|
- 1: Some tests failed
|
||||||
|
- 2: Consensus failure (triggers auto-recovery)
|
||||||
|
- 3: Aborted (timeout/stuck - triggers auto-recovery)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
import redis
|
||||||
|
|
||||||
|
REDIS_HOST = "127.0.0.1"
|
||||||
|
REDIS_PORT = 6379
|
||||||
|
REDIS_PASSWORD = "governance2026"
|
||||||
|
UI_BASE_URL = "http://127.0.0.1:3000"
|
||||||
|
|
||||||
|
|
||||||
|
class AutoRecoveryTester:
    """Tests auto-recovery for consensus failures.

    Each ``test_*`` method builds mock pipeline state directly in
    DragonflyDB (redis protocol), inspects the resulting keys, and
    returns a result dict with at least ``test`` and ``passed``.
    """

    def __init__(self):
        # decode_responses=True so hget/get/hgetall return str, not bytes.
        self.redis = redis.Redis(
            host=REDIS_HOST,
            port=REDIS_PORT,
            password=REDIS_PASSWORD,
            decode_responses=True
        )
        self.results = []

    def log(self, msg: str, level: str = "INFO"):
        """Print `msg` prefixed with a UTC HH:MM:SS timestamp and level tag."""
        stamp = datetime.utcnow().strftime("%H:%M:%S")
        print(f"[{stamp}] [{level}] {msg}")

    def setup_mock_pipeline(self, pipeline_id: str, task_id: str, objective: str) -> bool:
        """Create a mock RUNNING pipeline hash; True on success."""
        fields = {
            "task_id": task_id,
            "objective": objective,
            "status": "RUNNING",
            "created_at": datetime.utcnow().isoformat(),
            "agents": json.dumps(["ALPHA", "BETA"]),
            "run_number": "1"
        }
        try:
            self.redis.hset(f"pipeline:{pipeline_id}", mapping=fields)
            self.log(f"Created mock pipeline: {pipeline_id}")
            return True
        except Exception as exc:
            self.log(f"Failed to create pipeline: {exc}", "ERROR")
            return False

    def simulate_consensus_failure(self, pipeline_id: str) -> bool:
        """Flip the pipeline to CONSENSUS_FAILED and store a failure-context blob."""
        proposals = [
            {"agent": "ALPHA", "proposal": "Solution A", "score": 0.6},
            {"agent": "BETA", "proposal": "Solution B", "score": 0.5}
        ]
        try:
            # Mark the pipeline itself as failed, with the disagreeing proposals.
            self.redis.hset(f"pipeline:{pipeline_id}", mapping={
                "status": "CONSENSUS_FAILED",
                "consensus": json.dumps({
                    "achieved": False,
                    "proposals": proposals,
                    "conflicts": [
                        {"type": "approach", "agents": ["ALPHA", "BETA"]}
                    ]
                })
            })

            # Persist the structured context the auto-recovery logic consumes.
            failure_context = {
                "pipeline_id": pipeline_id,
                "failure_reason": "consensus_failed",
                "proposals": proposals,
                "conflicts": [{"type": "approach"}],
                "iteration_count": 5,
                "run_number": 1,
                "timestamp": datetime.utcnow().isoformat()
            }
            self.redis.set(
                f"pipeline:{pipeline_id}:failure_context",
                json.dumps(failure_context)
            )

            self.log(f"Simulated consensus failure for pipeline: {pipeline_id}")
            return True
        except Exception as exc:
            self.log(f"Failed to simulate failure: {exc}", "ERROR")
            return False

    def check_recovery_pipeline_created(self, original_pipeline_id: str, timeout: float = 5.0) -> dict:
        """Poll until a recovery pipeline is linked from the original, or timeout."""
        started = time.time()
        while time.time() - started < timeout:
            try:
                recovery_id = self.redis.hget(f"pipeline:{original_pipeline_id}", "recovery_pipeline")
                if recovery_id:
                    # Confirm the referenced recovery pipeline actually exists.
                    recovery_data = self.redis.hgetall(f"pipeline:{recovery_id}")
                    if recovery_data:
                        return {
                            "found": True,
                            "recovery_pipeline_id": recovery_id,
                            "recovery_data": recovery_data,
                            "elapsed_ms": (time.time() - started) * 1000
                        }
            except Exception:
                # Transient redis errors just mean "retry until the deadline".
                pass
            time.sleep(0.2)

        return {"found": False, "elapsed_ms": timeout * 1000}

    def verify_failure_context_passed(self, recovery_pipeline_id: str) -> dict:
        """Verify the recovery pipeline carries the handed-off failure context."""
        try:
            pipeline = self.redis.hgetall(f"pipeline:{recovery_pipeline_id}")
            return {
                "has_prior_context": pipeline.get("prior_context") is not None,
                "prior_pipeline": pipeline.get("prior_pipeline"),
                "run_number": pipeline.get("run_number"),
                "force_gamma": pipeline.get("force_gamma") == "true"
            }
        except Exception as exc:
            return {"error": str(exc)}

    def cleanup_test_pipelines(self, pipeline_ids: list):
        """Best-effort deletion of every key belonging to the given pipelines."""
        for pid in pipeline_ids:
            try:
                matched = self.redis.keys(f"pipeline:{pid}*")
                if matched:
                    self.redis.delete(*matched)
                self.log(f"Cleaned up pipeline: {pid}")
            except Exception:
                pass

    def test_consensus_failure_detection(self) -> dict:
        """Test 1: Verify consensus failure is properly detected."""
        test_name = "Consensus Failure Detection"
        pipeline_id = f"test-recovery-{int(time.time())}"

        try:
            self.setup_mock_pipeline(
                pipeline_id, "test-task-001", "Test consensus failure detection"
            )
            self.simulate_consensus_failure(pipeline_id)

            # Read back what the simulation wrote.
            status = self.redis.hget(f"pipeline:{pipeline_id}", "status")
            failure_ctx = self.redis.get(f"pipeline:{pipeline_id}:failure_context")
            has_context = failure_ctx is not None

            result = {
                "test": test_name,
                "passed": status == "CONSENSUS_FAILED" and has_context,
                "status_detected": status,
                "failure_context_stored": has_context
            }
            self.cleanup_test_pipelines([pipeline_id])
            return result
        except Exception as exc:
            return {"test": test_name, "passed": False, "error": str(exc)}

    def test_max_recovery_limit(self) -> dict:
        """Test 2: Verify max recovery attempts are respected."""
        test_name = "Max Recovery Limit"
        pipeline_id = f"test-max-recovery-{int(time.time())}"

        try:
            # A pipeline already at the maximum (3) recovery attempts.
            self.redis.hset(f"pipeline:{pipeline_id}", mapping={
                "task_id": "test-task",
                "objective": "Test max recovery",
                "status": "CONSENSUS_FAILED",
                "run_number": "3"
            })
            self.redis.set(
                f"pipeline:{pipeline_id}:failure_context",
                json.dumps({
                    "run_number": 3,
                    "failure_reason": "consensus_failed",
                    "proposals": [],
                    "conflicts": []
                })
            )

            # Give any (incorrect) recovery logic a moment to fire, then
            # confirm no recovery pipeline was linked.
            time.sleep(0.5)
            recovery_id = self.redis.hget(f"pipeline:{pipeline_id}", "recovery_pipeline")

            result = {
                "test": test_name,
                "passed": recovery_id is None,
                "run_number": 3,
                "recovery_attempted": recovery_id is not None
            }
            self.cleanup_test_pipelines([pipeline_id])
            return result
        except Exception as exc:
            return {"test": test_name, "passed": False, "error": str(exc)}

    def test_failure_context_structure(self) -> dict:
        """Test 3: Verify failure context has required structure."""
        test_name = "Failure Context Structure"

        required_fields = [
            "pipeline_id",
            "failure_reason",
            "proposals",
            "conflicts",
            "run_number",
            "timestamp"
        ]

        try:
            sample_context = {
                "pipeline_id": "test-123",
                "failure_reason": "consensus_failed",
                "proposals": [
                    {"agent": "ALPHA", "proposal": "Test", "score": 0.5}
                ],
                "conflicts": [{"type": "approach"}],
                "iteration_count": 5,
                "run_number": 1,
                "timestamp": datetime.utcnow().isoformat()
            }

            absent = [field for field in required_fields if field not in sample_context]
            ok = not absent

            return {
                "test": test_name,
                "passed": ok,
                "required_fields": required_fields,
                "missing_fields": absent,
                "sample_valid": ok
            }
        except Exception as exc:
            return {"test": test_name, "passed": False, "error": str(exc)}

    def test_gamma_force_on_recovery(self) -> dict:
        """Test 4: Verify GAMMA is forced on recovery attempts."""
        test_name = "GAMMA Force on Recovery"
        pipeline_id = f"test-gamma-{int(time.time())}"
        recovery_id = f"test-gamma-recovery-{int(time.time())}"

        try:
            self.setup_mock_pipeline(pipeline_id, "task-gamma", "Test GAMMA forcing")

            # Mock recovery pipeline carrying the force_gamma flag.
            self.redis.hset(f"pipeline:{recovery_id}", mapping={
                "task_id": "task-gamma",
                "objective": "[RECOVERY] Test GAMMA forcing",
                "status": "SPAWNED",
                "prior_pipeline": pipeline_id,
                "run_number": "2",
                "force_gamma": "true"
            })
            # Link original -> recovery.
            self.redis.hset(f"pipeline:{pipeline_id}", "recovery_pipeline", recovery_id)

            gamma_flag = self.redis.hget(f"pipeline:{recovery_id}", "force_gamma")

            result = {
                "test": test_name,
                "passed": gamma_flag == "true",
                "force_gamma_set": gamma_flag == "true",
                "recovery_linked": True
            }
            self.cleanup_test_pipelines([pipeline_id, recovery_id])
            return result
        except Exception as exc:
            return {"test": test_name, "passed": False, "error": str(exc)}

    def test_iteration_timeout_abort(self) -> dict:
        """Test 5: Verify iteration timeout triggers abort."""
        test_name = "Iteration Timeout Abort"
        pipeline_id = f"test-timeout-{int(time.time())}"

        try:
            # Pipeline that ran past its iteration budget and was aborted.
            self.redis.hset(f"pipeline:{pipeline_id}", mapping={
                "task_id": "task-timeout",
                "objective": "Test timeout",
                "status": "ABORTED",
                "iteration_count": "12",
                "max_iterations": "10",
                "abort_reason": "iteration_limit"
            })

            status = self.redis.hget(f"pipeline:{pipeline_id}", "status")
            reason = self.redis.hget(f"pipeline:{pipeline_id}", "abort_reason")

            result = {
                "test": test_name,
                "passed": status == "ABORTED" and reason == "iteration_limit",
                "status": status,
                "abort_reason": reason
            }
            self.cleanup_test_pipelines([pipeline_id])
            return result
        except Exception as exc:
            return {"test": test_name, "passed": False, "error": str(exc)}

    def test_stuck_detection(self) -> dict:
        """Test 6: Verify stuck agent detection."""
        test_name = "Stuck Agent Detection"
        pipeline_id = f"test-stuck-{int(time.time())}"

        try:
            # last_progress set 2 minutes in the past — well beyond the
            # 60-second stuck threshold used by the orchestrator.
            stale_ts = datetime.utcnow().timestamp() - 120
            self.redis.hset(f"pipeline:{pipeline_id}", mapping={
                "task_id": "task-stuck",
                "objective": "Test stuck detection",
                "status": "RUNNING",
                "last_progress": str(stale_ts)
            })

            last_progress = float(
                self.redis.hget(f"pipeline:{pipeline_id}", "last_progress") or 0
            )
            now = datetime.utcnow().timestamp()
            is_stuck = (now - last_progress) > 60

            result = {
                "test": test_name,
                "passed": is_stuck,
                "time_since_progress_seconds": now - last_progress,
                "stuck_detected": is_stuck
            }
            self.cleanup_test_pipelines([pipeline_id])
            return result
        except Exception as exc:
            return {"test": test_name, "passed": False, "error": str(exc)}

    def run_all_tests(self) -> dict:
        """Run every auto-recovery test, print a summary, and return totals."""
        print("\n" + "=" * 60)
        print("AUTO-RECOVERY TEST SUITE")
        print("=" * 60 + "\n")

        suite = [
            self.test_consensus_failure_detection,
            self.test_max_recovery_limit,
            self.test_failure_context_structure,
            self.test_gamma_force_on_recovery,
            self.test_iteration_timeout_abort,
            self.test_stuck_detection,
        ]

        passed = 0
        failed = 0
        for case in suite:
            outcome = case()
            self.results.append(outcome)

            ok = outcome["passed"]
            status = "PASS" if ok else "FAIL"
            symbol = "✓" if ok else "✗"
            print(f" {symbol} {status}: {outcome['test']}")

            if ok:
                passed += 1
            else:
                failed += 1
                if "error" in outcome:
                    print(f" Error: {outcome['error']}")

        print(f"\n{'='*60}")
        print(f"RESULTS: {passed}/{passed+failed} passed")
        print(f"{'='*60}\n")

        return {
            "total_tests": len(suite),
            "passed": passed,
            "failed": failed,
            "success_rate": passed / len(suite) if suite else 0,
            "results": self.results
        }
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run auto-recovery tests.

    Returns 0 when every test passed, 1 otherwise (used as the exit code).
    """
    tester = AutoRecoveryTester()
    summary = tester.run_all_tests()

    # Dump each result dict beyond the pass/fail headline.
    print("\nDetailed Results:")
    print("-" * 40)
    for entry in summary["results"]:
        verdict = "PASS" if entry["passed"] else "FAIL"
        print(f" [{verdict}] {entry['test']}")
        for key, value in entry.items():
            if key not in ("test", "passed"):
                print(f" {key}: {value}")

    return 0 if summary["failed"] == 0 else 1


if __name__ == "__main__":
    sys.exit(main())
|
||||||
303
ui/server.ts
303
ui/server.ts
@ -1778,6 +1778,106 @@ async function handleFallbackAction(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Auto-recovery: Spawn a new pipeline automatically on consensus failure
|
||||||
|
async function triggerAutoRecovery(
|
||||||
|
originalPipelineId: string,
|
||||||
|
taskId: string,
|
||||||
|
objective: string,
|
||||||
|
model: string,
|
||||||
|
timeout: number,
|
||||||
|
failureContext: ConsensusFailureContext
|
||||||
|
): Promise<{ success: boolean; message: string; new_pipeline_id?: string }> {
|
||||||
|
const runNumber = failureContext.run_number;
|
||||||
|
|
||||||
|
// Limit auto-recovery attempts to prevent infinite loops
|
||||||
|
const MAX_AUTO_RECOVERY = 3;
|
||||||
|
if (runNumber >= MAX_AUTO_RECOVERY) {
|
||||||
|
return {
|
||||||
|
success: false,
|
||||||
|
message: `Max auto-recovery attempts (${MAX_AUTO_RECOVERY}) reached. User intervention required.`
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Create a new recovery pipeline
|
||||||
|
const newPipelineId = `pipeline-recovery-${Date.now().toString(36)}`;
|
||||||
|
|
||||||
|
// Prepare context summary for new agents
|
||||||
|
const contextSummary = {
|
||||||
|
prior_run: runNumber,
|
||||||
|
prior_pipeline: originalPipelineId,
|
||||||
|
failure_reason: failureContext.metrics?.abort_reason || "consensus_failed",
|
||||||
|
prior_proposals: failureContext.proposals?.slice(0, 3) || [], // Top 3 proposals
|
||||||
|
prior_conflicts: failureContext.conflict_history?.slice(-5) || [], // Last 5 conflicts
|
||||||
|
recovery_hints: [
|
||||||
|
"Previous agents failed to reach consensus",
|
||||||
|
"Review prior proposals for common ground",
|
||||||
|
"Consider a different approach if prior attempts converged on same solution"
|
||||||
|
]
|
||||||
|
};
|
||||||
|
|
||||||
|
await redis.hSet(`pipeline:${newPipelineId}`, {
|
||||||
|
task_id: taskId,
|
||||||
|
objective: objective,
|
||||||
|
status: "STARTING",
|
||||||
|
created_at: new Date().toISOString(),
|
||||||
|
agents: JSON.stringify([]),
|
||||||
|
parent_pipeline: originalPipelineId,
|
||||||
|
is_recovery: "true",
|
||||||
|
recovery_attempt: String(runNumber + 1),
|
||||||
|
prior_context: JSON.stringify(contextSummary),
|
||||||
|
force_gamma: "true", // Always spawn GAMMA on recovery attempts
|
||||||
|
model: model,
|
||||||
|
timeout: String(timeout),
|
||||||
|
auto_continue: "true"
|
||||||
|
});
|
||||||
|
|
||||||
|
// Update original pipeline
|
||||||
|
await redis.hSet(`pipeline:${originalPipelineId}`, {
|
||||||
|
status: "REBOOTING",
|
||||||
|
recovery_pipeline: newPipelineId
|
||||||
|
});
|
||||||
|
|
||||||
|
// Log the handoff reason to Dragonfly metrics
|
||||||
|
await redis.hSet(`handoff:${originalPipelineId}`, {
|
||||||
|
to_pipeline: newPipelineId,
|
||||||
|
reason: failureContext.metrics?.abort_reason || "consensus_failed",
|
||||||
|
handoff_time: new Date().toISOString(),
|
||||||
|
context_size: JSON.stringify(contextSummary).length,
|
||||||
|
proposals_passed: failureContext.proposals?.length || 0
|
||||||
|
});
|
||||||
|
|
||||||
|
await appendPipelineLog(newPipelineId, "SYSTEM",
|
||||||
|
`Recovery pipeline started (attempt ${runNumber + 1}/${MAX_AUTO_RECOVERY}). GAMMA mediator will be spawned.`, "INFO");
|
||||||
|
|
||||||
|
await appendPipelineLog(newPipelineId, "CONTEXT",
|
||||||
|
`Prior run had ${failureContext.proposals?.length || 0} proposals, ` +
|
||||||
|
`${failureContext.conflict_history?.length || 0} conflicts. Force-spawning GAMMA.`, "INFO");
|
||||||
|
|
||||||
|
// Trigger orchestration with GAMMA hint in objective
|
||||||
|
triggerOrchestration(
|
||||||
|
newPipelineId,
|
||||||
|
taskId,
|
||||||
|
`[RECOVERY ATTEMPT ${runNumber + 1}] [FORCE GAMMA] ${objective}`,
|
||||||
|
model,
|
||||||
|
timeout
|
||||||
|
);
|
||||||
|
|
||||||
|
return {
|
||||||
|
success: true,
|
||||||
|
message: `Recovery pipeline spawned (attempt ${runNumber + 1})`,
|
||||||
|
new_pipeline_id: newPipelineId
|
||||||
|
};
|
||||||
|
|
||||||
|
} catch (e: any) {
|
||||||
|
console.error(`[AUTO-RECOVERY] Failed: ${e.message}`);
|
||||||
|
return {
|
||||||
|
success: false,
|
||||||
|
message: `Auto-recovery error: ${e.message}`
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async function generateFailureReport(pipelineId: string): Promise<any> {
|
async function generateFailureReport(pipelineId: string): Promise<any> {
|
||||||
const failureContext = await getConsensusFailureContext(pipelineId);
|
const failureContext = await getConsensusFailureContext(pipelineId);
|
||||||
const failureHistory = await getFailureHistory(pipelineId);
|
const failureHistory = await getFailureHistory(pipelineId);
|
||||||
@ -1963,6 +2063,10 @@ async function getActivePipelines(): Promise<any[]> {
|
|||||||
status: data.status,
|
status: data.status,
|
||||||
created_at: data.created_at,
|
created_at: data.created_at,
|
||||||
agents: data.agents ? JSON.parse(data.agents) : [],
|
agents: data.agents ? JSON.parse(data.agents) : [],
|
||||||
|
recovery_pipeline: data.recovery_pipeline || null,
|
||||||
|
failure_reason: data.failure_reason || null,
|
||||||
|
run_number: data.run_number ? parseInt(data.run_number) : 1,
|
||||||
|
prior_pipeline: data.prior_pipeline || null,
|
||||||
});
|
});
|
||||||
} catch {}
|
} catch {}
|
||||||
}
|
}
|
||||||
@ -2245,30 +2349,75 @@ async function triggerOrchestration(
|
|||||||
metrics: orchestrationResult?.metrics
|
metrics: orchestrationResult?.metrics
|
||||||
});
|
});
|
||||||
|
|
||||||
} else if (exitCode === 2) {
|
} else if (exitCode === 2 || exitCode === 3) {
|
||||||
// Consensus failure - agents completed but no agreement
|
// Exit code 2 = Consensus failure, Exit code 3 = Aborted (timeout/stuck)
|
||||||
await redis.hSet(pipelineKey, "status", "CONSENSUS_FAILED");
|
// Both trigger AUTO-RECOVERY by spawning a new pipeline
|
||||||
|
const failureType = exitCode === 2 ? "CONSENSUS_FAILED" : "ABORTED";
|
||||||
|
const abortReason = orchestrationResult?.abort_reason || (exitCode === 2 ? "consensus_failed" : "unknown");
|
||||||
|
|
||||||
|
await redis.hSet(pipelineKey, "status", failureType);
|
||||||
await redis.hSet(pipelineKey, "final_consensus", "false");
|
await redis.hSet(pipelineKey, "final_consensus", "false");
|
||||||
if (orchestrationResult?.metrics) {
|
if (orchestrationResult?.metrics) {
|
||||||
await redis.hSet(pipelineKey, "final_metrics", JSON.stringify(orchestrationResult.metrics));
|
await redis.hSet(pipelineKey, "final_metrics", JSON.stringify(orchestrationResult.metrics));
|
||||||
}
|
}
|
||||||
|
|
||||||
await appendPipelineLog(pipelineId, "ORCHESTRATOR",
|
const logMessage = exitCode === 2
|
||||||
"Orchestration completed but agents failed to reach consensus", "WARN");
|
? "Orchestration completed but agents failed to reach consensus"
|
||||||
|
: `Orchestration aborted: ${abortReason}`;
|
||||||
|
await appendPipelineLog(pipelineId, "ORCHESTRATOR", logMessage, "WARN");
|
||||||
|
|
||||||
// Record the failure context for retry/escalation
|
// Record the failure context for the new pipeline
|
||||||
await recordConsensusFailure(pipelineId, taskId, orchestrationResult?.metrics || {});
|
const failureContext = await recordConsensusFailure(pipelineId, taskId, orchestrationResult?.metrics || {});
|
||||||
|
|
||||||
broadcastUpdate("orchestration_complete", {
|
// Log to Dragonfly metrics for observability
|
||||||
pipeline_id: pipelineId,
|
await redis.hSet(`metrics:${pipelineId}`, {
|
||||||
status: "CONSENSUS_FAILED",
|
failure_type: failureType,
|
||||||
consensus: false,
|
abort_reason: abortReason,
|
||||||
metrics: orchestrationResult?.metrics,
|
failure_time: new Date().toISOString(),
|
||||||
fallback_options: FALLBACK_OPTIONS,
|
auto_recovery_triggered: "true"
|
||||||
awaiting_user_action: true
|
|
||||||
});
|
});
|
||||||
|
|
||||||
// Do NOT mark completed_at - pipeline awaits user decision
|
// Broadcast failure status with auto-recovery notice
|
||||||
|
broadcastUpdate("orchestration_complete", {
|
||||||
|
pipeline_id: pipelineId,
|
||||||
|
status: failureType,
|
||||||
|
consensus: false,
|
||||||
|
metrics: orchestrationResult?.metrics,
|
||||||
|
abort_reason: abortReason,
|
||||||
|
auto_recovery: true,
|
||||||
|
message: "Consensus failure – pipeline rebooting automatically"
|
||||||
|
});
|
||||||
|
|
||||||
|
// AUTO-RECOVERY: Spawn a new pipeline with the collected context
|
||||||
|
await appendPipelineLog(pipelineId, "SYSTEM", "AUTO-RECOVERY: Spawning new pipeline with failure context...", "WARN");
|
||||||
|
|
||||||
|
const recoveryResult = await triggerAutoRecovery(pipelineId, taskId, objective, model, timeout, failureContext);
|
||||||
|
|
||||||
|
if (recoveryResult.success) {
|
||||||
|
await redis.hSet(pipelineKey, "status", "REBOOTING");
|
||||||
|
await redis.hSet(pipelineKey, "recovery_pipeline", recoveryResult.new_pipeline_id!);
|
||||||
|
await appendPipelineLog(pipelineId, "SYSTEM",
|
||||||
|
`Auto-recovery started: ${recoveryResult.new_pipeline_id}`, "INFO");
|
||||||
|
|
||||||
|
broadcastUpdate("pipeline_rebooting", {
|
||||||
|
pipeline_id: pipelineId,
|
||||||
|
new_pipeline_id: recoveryResult.new_pipeline_id,
|
||||||
|
failure_reason: abortReason,
|
||||||
|
failure_log_url: `/api/pipeline/consensus/report?pipeline_id=${pipelineId}`
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
// Auto-recovery failed - fall back to user action
|
||||||
|
await appendPipelineLog(pipelineId, "SYSTEM",
|
||||||
|
`Auto-recovery failed: ${recoveryResult.message}. User action required.`, "ERROR");
|
||||||
|
|
||||||
|
broadcastUpdate("orchestration_complete", {
|
||||||
|
pipeline_id: pipelineId,
|
||||||
|
status: "RECOVERY_FAILED",
|
||||||
|
consensus: false,
|
||||||
|
fallback_options: FALLBACK_OPTIONS,
|
||||||
|
awaiting_user_action: true
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// Error - crash or exception
|
// Error - crash or exception
|
||||||
@ -3917,6 +4066,14 @@ function renderDashboard(): string {
|
|||||||
.status-badge.retrying { background: rgba(88, 166, 255, 0.2); color: var(--accent-blue); }
|
.status-badge.retrying { background: rgba(88, 166, 255, 0.2); color: var(--accent-blue); }
|
||||||
.status-badge.escalated { background: rgba(210, 153, 34, 0.2); color: var(--accent-yellow); }
|
.status-badge.escalated { background: rgba(210, 153, 34, 0.2); color: var(--accent-yellow); }
|
||||||
.status-badge.completed_no_consensus { background: rgba(63, 185, 80, 0.15); color: #8bc34a; }
|
.status-badge.completed_no_consensus { background: rgba(63, 185, 80, 0.15); color: #8bc34a; }
|
||||||
|
.status-badge.rebooting { background: rgba(57, 197, 207, 0.2); color: var(--accent-cyan); animation: pulse 1.5s infinite; }
|
||||||
|
.status-badge.aborted { background: rgba(248, 81, 73, 0.3); color: var(--accent-red); border: 1px solid var(--accent-red); }
|
||||||
|
.status-badge.recovery_failed { background: rgba(248, 81, 73, 0.4); color: #ff6b6b; border: 1px solid #ff6b6b; }
|
||||||
|
|
||||||
|
@keyframes pulse {
|
||||||
|
0%, 100% { opacity: 1; }
|
||||||
|
50% { opacity: 0.6; }
|
||||||
|
}
|
||||||
|
|
||||||
/* Consensus Failure Alert */
|
/* Consensus Failure Alert */
|
||||||
.consensus-failure-alert {
|
.consensus-failure-alert {
|
||||||
@ -5633,11 +5790,25 @@ function renderDashboard(): string {
|
|||||||
if (selectedPipelineId === msg.data.pipeline_id) {
|
if (selectedPipelineId === msg.data.pipeline_id) {
|
||||||
loadLogs(selectedPipelineId);
|
loadLogs(selectedPipelineId);
|
||||||
}
|
}
|
||||||
// Show notification
|
|
||||||
showNotification('Consensus Failed', \`Pipeline \${msg.data.pipeline_id} failed to reach consensus. Action required.\`, 'warn');
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (msg.type === 'orchestration_complete' && msg.data.consensus === false) {
|
// Auto-recovery: Pipeline rebooting
|
||||||
|
if (msg.type === 'pipeline_rebooting') {
|
||||||
|
loadPipelines();
|
||||||
|
if (selectedPipelineId === msg.data.pipeline_id) {
|
||||||
|
loadLogs(selectedPipelineId);
|
||||||
|
}
|
||||||
|
showNotification('Pipeline Rebooting',
|
||||||
|
\`Consensus failure – pipeline \${msg.data.new_pipeline_id} spawning automatically. <a href="#" onclick="showFailureDetails('\${msg.data.pipeline_id}'); return false;">View failure log</a>\`,
|
||||||
|
'info');
|
||||||
|
}
|
||||||
|
|
||||||
|
if (msg.type === 'orchestration_complete' && msg.data.auto_recovery) {
|
||||||
|
loadPipelines();
|
||||||
|
showNotification('Auto-Recovery', msg.data.message || 'Consensus failure – pipeline rebooting automatically.', 'info');
|
||||||
|
}
|
||||||
|
|
||||||
|
if (msg.type === 'orchestration_complete' && msg.data.consensus === false && !msg.data.auto_recovery) {
|
||||||
loadPipelines();
|
loadPipelines();
|
||||||
showNotification('Consensus Failed', 'Agents completed but could not agree. Choose a fallback action.', 'warn');
|
showNotification('Consensus Failed', 'Agents completed but could not agree. Choose a fallback action.', 'warn');
|
||||||
}
|
}
|
||||||
@ -5746,7 +5917,9 @@ function renderDashboard(): string {
|
|||||||
container.innerHTML = pipelinesData.map(p => {
|
container.innerHTML = pipelinesData.map(p => {
|
||||||
const isActive = p.pipeline_id === selectedPipelineId;
|
const isActive = p.pipeline_id === selectedPipelineId;
|
||||||
const agents = p.agents || [];
|
const agents = p.agents || [];
|
||||||
const isConsensusFailed = p.status === 'CONSENSUS_FAILED';
|
const isConsensusFailed = p.status === 'CONSENSUS_FAILED' || p.status === 'RECOVERY_FAILED';
|
||||||
|
const isRebooting = p.status === 'REBOOTING';
|
||||||
|
const isAborted = p.status === 'ABORTED';
|
||||||
|
|
||||||
const agentPills = agents.map(a => {
|
const agentPills = agents.map(a => {
|
||||||
const type = (a.type || 'UNKNOWN').toLowerCase();
|
const type = (a.type || 'UNKNOWN').toLowerCase();
|
||||||
@ -5757,10 +5930,37 @@ function renderDashboard(): string {
|
|||||||
</span>\`;
|
</span>\`;
|
||||||
}).join('');
|
}).join('');
|
||||||
|
|
||||||
const consensusAlert = isConsensusFailed ? \`
|
// Rebooting alert (auto-recovery in progress)
|
||||||
|
const rebootingAlert = isRebooting ? \`
|
||||||
|
<div class="consensus-failure-alert" style="border-color: var(--accent-cyan); background: rgba(57, 197, 207, 0.1);" onclick="event.stopPropagation()">
|
||||||
|
<div class="alert-title" style="color: var(--accent-cyan);">Consensus Failure – Pipeline Rebooting</div>
|
||||||
|
<div class="alert-desc">Auto-recovery in progress. A new pipeline is being spawned with failure context.</div>
|
||||||
|
<div class="fallback-options">
|
||||||
|
<div class="fallback-option" onclick="showFailureDetails('\${p.pipeline_id}')">
|
||||||
|
<div>
|
||||||
|
<div class="option-label">Failure Log</div>
|
||||||
|
<div class="option-desc">View what went wrong</div>
|
||||||
|
</div>
|
||||||
|
<button>View Log</button>
|
||||||
|
</div>
|
||||||
|
\${p.recovery_pipeline ? \`
|
||||||
|
<div class="fallback-option" onclick="selectPipeline('\${p.recovery_pipeline}')">
|
||||||
|
<div>
|
||||||
|
<div class="option-label">Recovery Pipeline</div>
|
||||||
|
<div class="option-desc">\${p.recovery_pipeline}</div>
|
||||||
|
</div>
|
||||||
|
<button class="primary">View</button>
|
||||||
|
</div>
|
||||||
|
\` : ''}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
\` : '';
|
||||||
|
|
||||||
|
// Consensus failed alert (manual action needed - auto-recovery failed or max attempts)
|
||||||
|
const consensusAlert = (isConsensusFailed && !isRebooting) ? \`
|
||||||
<div class="consensus-failure-alert" onclick="event.stopPropagation()">
|
<div class="consensus-failure-alert" onclick="event.stopPropagation()">
|
||||||
<div class="alert-title">Consensus Failed</div>
|
<div class="alert-title">Consensus Failed – Action Required</div>
|
||||||
<div class="alert-desc">Agents completed but failed to reach agreement. Choose an action:</div>
|
<div class="alert-desc">Auto-recovery exhausted or failed. Choose a manual action:</div>
|
||||||
<div class="fallback-options">
|
<div class="fallback-options">
|
||||||
<div class="fallback-option" onclick="handleFallback('\${p.pipeline_id}', 'rerun_same')">
|
<div class="fallback-option" onclick="handleFallback('\${p.pipeline_id}', 'rerun_same')">
|
||||||
<div>
|
<div>
|
||||||
@ -5794,15 +5994,34 @@ function renderDashboard(): string {
|
|||||||
</div>
|
</div>
|
||||||
\` : '';
|
\` : '';
|
||||||
|
|
||||||
|
// Aborted alert
|
||||||
|
const abortedAlert = isAborted ? \`
|
||||||
|
<div class="consensus-failure-alert" style="border-color: var(--accent-red);" onclick="event.stopPropagation()">
|
||||||
|
<div class="alert-title" style="color: var(--accent-red);">Pipeline Aborted</div>
|
||||||
|
<div class="alert-desc">Agents were stuck or exceeded iteration limit. Auto-recovery triggered.</div>
|
||||||
|
<div class="fallback-options">
|
||||||
|
<div class="fallback-option" onclick="showFailureDetails('\${p.pipeline_id}')">
|
||||||
|
<div>
|
||||||
|
<div class="option-label">Failure Log</div>
|
||||||
|
<div class="option-desc">View abort reason</div>
|
||||||
|
</div>
|
||||||
|
<button>View Log</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
\` : '';
|
||||||
|
|
||||||
return \`
|
return \`
|
||||||
<div class="pipeline-card \${isActive ? 'active' : ''} \${isConsensusFailed ? 'consensus-failed' : ''}" onclick="selectPipeline('\${p.pipeline_id}')">
|
<div class="pipeline-card \${isActive ? 'active' : ''} \${isConsensusFailed ? 'consensus-failed' : ''} \${isRebooting ? 'rebooting' : ''}" onclick="selectPipeline('\${p.pipeline_id}')">
|
||||||
<div class="pipeline-header">
|
<div class="pipeline-header">
|
||||||
<span class="pipeline-id">\${p.pipeline_id}</span>
|
<span class="pipeline-id">\${p.pipeline_id}</span>
|
||||||
<span class="status-badge \${(p.status || 'unknown').toLowerCase().replace(/_/g, '_')}">\${p.status || 'UNKNOWN'}</span>
|
<span class="status-badge \${(p.status || 'unknown').toLowerCase().replace(/_/g, '_')}">\${p.status || 'UNKNOWN'}</span>
|
||||||
</div>
|
</div>
|
||||||
<div class="pipeline-objective">\${p.objective || 'No objective'}</div>
|
<div class="pipeline-objective">\${p.objective || 'No objective'}</div>
|
||||||
<div class="agent-pills">\${agentPills || '<span style="color: var(--text-muted); font-size: 10px;">No agents</span>'}</div>
|
<div class="agent-pills">\${agentPills || '<span style="color: var(--text-muted); font-size: 10px;">No agents</span>'}</div>
|
||||||
|
\${rebootingAlert}
|
||||||
\${consensusAlert}
|
\${consensusAlert}
|
||||||
|
\${abortedAlert}
|
||||||
</div>
|
</div>
|
||||||
\`;
|
\`;
|
||||||
}).join('');
|
}).join('');
|
||||||
@ -7642,6 +7861,44 @@ const server = Bun.serve({
|
|||||||
return new Response(JSON.stringify(result), { headers });
|
return new Response(JSON.stringify(result), { headers });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Failure context recording from orchestrator (for auto-recovery)
|
||||||
|
if (path === "/api/pipeline/failure-context" && req.method === "POST") {
|
||||||
|
const body = await req.json() as {
|
||||||
|
pipeline_id: string;
|
||||||
|
task_id: string;
|
||||||
|
failure_reason: string;
|
||||||
|
failure_time: string;
|
||||||
|
iteration_count: number;
|
||||||
|
elapsed_ms: number;
|
||||||
|
metrics: any;
|
||||||
|
gamma_spawned: boolean;
|
||||||
|
error_count: number;
|
||||||
|
recovery_hint: string;
|
||||||
|
};
|
||||||
|
if (!body.pipeline_id) {
|
||||||
|
return new Response(JSON.stringify({ error: "pipeline_id required" }), { status: 400, headers });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Store the failure context in Dragonfly
|
||||||
|
const contextKey = `failure_context:${body.pipeline_id}`;
|
||||||
|
await redis.set(contextKey, JSON.stringify(body));
|
||||||
|
|
||||||
|
// Also log to metrics
|
||||||
|
await redis.hSet(`metrics:${body.pipeline_id}`, {
|
||||||
|
failure_reason: body.failure_reason,
|
||||||
|
failure_time: body.failure_time,
|
||||||
|
iteration_count: String(body.iteration_count),
|
||||||
|
gamma_spawned: body.gamma_spawned ? "true" : "false",
|
||||||
|
error_count: String(body.error_count),
|
||||||
|
recovery_hint: body.recovery_hint
|
||||||
|
});
|
||||||
|
|
||||||
|
await appendPipelineLog(body.pipeline_id, "ORCHESTRATOR",
|
||||||
|
`Failure context recorded: ${body.failure_reason} (${body.iteration_count} iterations, ${body.error_count} errors)`, "WARN");
|
||||||
|
|
||||||
|
return new Response(JSON.stringify({ success: true }), { headers });
|
||||||
|
}
|
||||||
|
|
||||||
if (path === "/api/observability/handoff" && req.method === "POST") {
|
if (path === "/api/observability/handoff" && req.method === "POST") {
|
||||||
const body = await req.json() as { pipeline_id: string };
|
const body = await req.json() as { pipeline_id: string };
|
||||||
if (!body.pipeline_id) {
|
if (!body.pipeline_id) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user