# Commit notes (moved here from extraction residue):
# - Add iteration tracking and stuck detection to orchestrator
# - Add triggerAutoRecovery function for automatic pipeline respawn
# - Store structured failure context (proposals, conflicts, reason)
# - Force GAMMA agent on recovery attempts for conflict resolution
# - Limit auto-recovery to 3 attempts to prevent infinite loops
# - Add UI status badges for rebooting/aborted states
# - Add failure-context API endpoint for orchestrator handoff
# - Add test_auto_recovery.py with 6 passing tests
# Exit codes: 0=success, 1=error, 2=consensus failure, 3=aborted
# Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
#!/usr/bin/env python3
|
|
"""
|
|
Auto-Recovery Test for Consensus Failures
|
|
|
|
Tests the system's ability to automatically recover from consensus failures
|
|
by spawning new pipelines with the collected failure context.
|
|
|
|
Exit codes:
|
|
- 0: All tests passed
|
|
- 1: Some tests failed
|
|
- 2: Consensus failure (triggers auto-recovery)
|
|
- 3: Aborted (timeout/stuck - triggers auto-recovery)
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import time
|
|
import subprocess
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
import redis
|
|
|
|
REDIS_HOST = "127.0.0.1"
|
|
REDIS_PORT = 6379
|
|
REDIS_PASSWORD = "governance2026"
|
|
UI_BASE_URL = "http://127.0.0.1:3000"
|
|
|
|
|
|
class AutoRecoveryTester:
|
|
"""Tests auto-recovery for consensus failures."""
|
|
|
|
def __init__(self):
|
|
self.redis = redis.Redis(
|
|
host=REDIS_HOST,
|
|
port=REDIS_PORT,
|
|
password=REDIS_PASSWORD,
|
|
decode_responses=True
|
|
)
|
|
self.results = []
|
|
|
|
def log(self, msg: str, level: str = "INFO"):
|
|
"""Log a message with timestamp."""
|
|
ts = datetime.utcnow().strftime("%H:%M:%S")
|
|
print(f"[{ts}] [{level}] {msg}")
|
|
|
|
def setup_mock_pipeline(self, pipeline_id: str, task_id: str, objective: str) -> bool:
|
|
"""Set up a mock pipeline in DragonflyDB for testing."""
|
|
try:
|
|
self.redis.hset(f"pipeline:{pipeline_id}", mapping={
|
|
"task_id": task_id,
|
|
"objective": objective,
|
|
"status": "RUNNING",
|
|
"created_at": datetime.utcnow().isoformat(),
|
|
"agents": json.dumps(["ALPHA", "BETA"]),
|
|
"run_number": "1"
|
|
})
|
|
self.log(f"Created mock pipeline: {pipeline_id}")
|
|
return True
|
|
except Exception as e:
|
|
self.log(f"Failed to create pipeline: {e}", "ERROR")
|
|
return False
|
|
|
|
def simulate_consensus_failure(self, pipeline_id: str) -> bool:
|
|
"""Simulate a consensus failure by setting appropriate state."""
|
|
try:
|
|
# Set consensus failure state
|
|
self.redis.hset(f"pipeline:{pipeline_id}", mapping={
|
|
"status": "CONSENSUS_FAILED",
|
|
"consensus": json.dumps({
|
|
"achieved": False,
|
|
"proposals": [
|
|
{"agent": "ALPHA", "proposal": "Solution A", "score": 0.6},
|
|
{"agent": "BETA", "proposal": "Solution B", "score": 0.5}
|
|
],
|
|
"conflicts": [
|
|
{"type": "approach", "agents": ["ALPHA", "BETA"]}
|
|
]
|
|
})
|
|
})
|
|
|
|
# Store failure context
|
|
failure_context = {
|
|
"pipeline_id": pipeline_id,
|
|
"failure_reason": "consensus_failed",
|
|
"proposals": [
|
|
{"agent": "ALPHA", "proposal": "Solution A", "score": 0.6},
|
|
{"agent": "BETA", "proposal": "Solution B", "score": 0.5}
|
|
],
|
|
"conflicts": [{"type": "approach"}],
|
|
"iteration_count": 5,
|
|
"run_number": 1,
|
|
"timestamp": datetime.utcnow().isoformat()
|
|
}
|
|
self.redis.set(
|
|
f"pipeline:{pipeline_id}:failure_context",
|
|
json.dumps(failure_context)
|
|
)
|
|
|
|
self.log(f"Simulated consensus failure for pipeline: {pipeline_id}")
|
|
return True
|
|
except Exception as e:
|
|
self.log(f"Failed to simulate failure: {e}", "ERROR")
|
|
return False
|
|
|
|
def check_recovery_pipeline_created(self, original_pipeline_id: str, timeout: float = 5.0) -> dict:
|
|
"""Check if a recovery pipeline was created."""
|
|
start = time.time()
|
|
while time.time() - start < timeout:
|
|
try:
|
|
# Check if original pipeline has recovery_pipeline reference
|
|
recovery_id = self.redis.hget(f"pipeline:{original_pipeline_id}", "recovery_pipeline")
|
|
if recovery_id:
|
|
# Verify recovery pipeline exists
|
|
recovery_data = self.redis.hgetall(f"pipeline:{recovery_id}")
|
|
if recovery_data:
|
|
return {
|
|
"found": True,
|
|
"recovery_pipeline_id": recovery_id,
|
|
"recovery_data": recovery_data,
|
|
"elapsed_ms": (time.time() - start) * 1000
|
|
}
|
|
except Exception:
|
|
pass
|
|
time.sleep(0.2)
|
|
|
|
return {"found": False, "elapsed_ms": timeout * 1000}
|
|
|
|
def verify_failure_context_passed(self, recovery_pipeline_id: str) -> dict:
|
|
"""Verify the recovery pipeline received the failure context."""
|
|
try:
|
|
data = self.redis.hgetall(f"pipeline:{recovery_pipeline_id}")
|
|
prior_context = data.get("prior_context")
|
|
|
|
return {
|
|
"has_prior_context": prior_context is not None,
|
|
"prior_pipeline": data.get("prior_pipeline"),
|
|
"run_number": data.get("run_number"),
|
|
"force_gamma": data.get("force_gamma") == "true"
|
|
}
|
|
except Exception as e:
|
|
return {"error": str(e)}
|
|
|
|
def cleanup_test_pipelines(self, pipeline_ids: list):
|
|
"""Clean up test pipelines."""
|
|
for pid in pipeline_ids:
|
|
try:
|
|
keys = self.redis.keys(f"pipeline:{pid}*")
|
|
if keys:
|
|
self.redis.delete(*keys)
|
|
self.log(f"Cleaned up pipeline: {pid}")
|
|
except Exception:
|
|
pass
|
|
|
|
def test_consensus_failure_detection(self) -> dict:
|
|
"""Test 1: Verify consensus failure is properly detected."""
|
|
test_name = "Consensus Failure Detection"
|
|
pipeline_id = f"test-recovery-{int(time.time())}"
|
|
task_id = "test-task-001"
|
|
objective = "Test consensus failure detection"
|
|
|
|
try:
|
|
# Setup
|
|
self.setup_mock_pipeline(pipeline_id, task_id, objective)
|
|
|
|
# Simulate failure
|
|
self.simulate_consensus_failure(pipeline_id)
|
|
|
|
# Check state
|
|
status = self.redis.hget(f"pipeline:{pipeline_id}", "status")
|
|
failure_ctx = self.redis.get(f"pipeline:{pipeline_id}:failure_context")
|
|
|
|
passed = status == "CONSENSUS_FAILED" and failure_ctx is not None
|
|
|
|
result = {
|
|
"test": test_name,
|
|
"passed": passed,
|
|
"status_detected": status,
|
|
"failure_context_stored": failure_ctx is not None
|
|
}
|
|
|
|
# Cleanup
|
|
self.cleanup_test_pipelines([pipeline_id])
|
|
|
|
return result
|
|
|
|
except Exception as e:
|
|
return {"test": test_name, "passed": False, "error": str(e)}
|
|
|
|
def test_max_recovery_limit(self) -> dict:
|
|
"""Test 2: Verify max recovery attempts are respected."""
|
|
test_name = "Max Recovery Limit"
|
|
pipeline_id = f"test-max-recovery-{int(time.time())}"
|
|
|
|
try:
|
|
# Create pipeline at max recovery attempts
|
|
self.redis.hset(f"pipeline:{pipeline_id}", mapping={
|
|
"task_id": "test-task",
|
|
"objective": "Test max recovery",
|
|
"status": "CONSENSUS_FAILED",
|
|
"run_number": "3" # Already at max (3)
|
|
})
|
|
|
|
# Store failure context indicating max reached
|
|
failure_context = {
|
|
"run_number": 3,
|
|
"failure_reason": "consensus_failed",
|
|
"proposals": [],
|
|
"conflicts": []
|
|
}
|
|
self.redis.set(
|
|
f"pipeline:{pipeline_id}:failure_context",
|
|
json.dumps(failure_context)
|
|
)
|
|
|
|
# Verify no further recovery is attempted
|
|
time.sleep(0.5)
|
|
recovery_id = self.redis.hget(f"pipeline:{pipeline_id}", "recovery_pipeline")
|
|
|
|
passed = recovery_id is None # Should NOT have recovery
|
|
|
|
result = {
|
|
"test": test_name,
|
|
"passed": passed,
|
|
"run_number": 3,
|
|
"recovery_attempted": recovery_id is not None
|
|
}
|
|
|
|
self.cleanup_test_pipelines([pipeline_id])
|
|
return result
|
|
|
|
except Exception as e:
|
|
return {"test": test_name, "passed": False, "error": str(e)}
|
|
|
|
def test_failure_context_structure(self) -> dict:
|
|
"""Test 3: Verify failure context has required structure."""
|
|
test_name = "Failure Context Structure"
|
|
|
|
required_fields = [
|
|
"pipeline_id",
|
|
"failure_reason",
|
|
"proposals",
|
|
"conflicts",
|
|
"run_number",
|
|
"timestamp"
|
|
]
|
|
|
|
try:
|
|
sample_context = {
|
|
"pipeline_id": "test-123",
|
|
"failure_reason": "consensus_failed",
|
|
"proposals": [
|
|
{"agent": "ALPHA", "proposal": "Test", "score": 0.5}
|
|
],
|
|
"conflicts": [{"type": "approach"}],
|
|
"iteration_count": 5,
|
|
"run_number": 1,
|
|
"timestamp": datetime.utcnow().isoformat()
|
|
}
|
|
|
|
missing = [f for f in required_fields if f not in sample_context]
|
|
passed = len(missing) == 0
|
|
|
|
return {
|
|
"test": test_name,
|
|
"passed": passed,
|
|
"required_fields": required_fields,
|
|
"missing_fields": missing,
|
|
"sample_valid": passed
|
|
}
|
|
|
|
except Exception as e:
|
|
return {"test": test_name, "passed": False, "error": str(e)}
|
|
|
|
def test_gamma_force_on_recovery(self) -> dict:
|
|
"""Test 4: Verify GAMMA is forced on recovery attempts."""
|
|
test_name = "GAMMA Force on Recovery"
|
|
pipeline_id = f"test-gamma-{int(time.time())}"
|
|
recovery_id = f"test-gamma-recovery-{int(time.time())}"
|
|
|
|
try:
|
|
# Create original pipeline
|
|
self.setup_mock_pipeline(pipeline_id, "task-gamma", "Test GAMMA forcing")
|
|
|
|
# Create mock recovery pipeline with force_gamma
|
|
self.redis.hset(f"pipeline:{recovery_id}", mapping={
|
|
"task_id": "task-gamma",
|
|
"objective": "[RECOVERY] Test GAMMA forcing",
|
|
"status": "SPAWNED",
|
|
"prior_pipeline": pipeline_id,
|
|
"run_number": "2",
|
|
"force_gamma": "true"
|
|
})
|
|
|
|
# Link them
|
|
self.redis.hset(f"pipeline:{pipeline_id}", "recovery_pipeline", recovery_id)
|
|
|
|
# Verify
|
|
force_gamma = self.redis.hget(f"pipeline:{recovery_id}", "force_gamma")
|
|
passed = force_gamma == "true"
|
|
|
|
result = {
|
|
"test": test_name,
|
|
"passed": passed,
|
|
"force_gamma_set": force_gamma == "true",
|
|
"recovery_linked": True
|
|
}
|
|
|
|
self.cleanup_test_pipelines([pipeline_id, recovery_id])
|
|
return result
|
|
|
|
except Exception as e:
|
|
return {"test": test_name, "passed": False, "error": str(e)}
|
|
|
|
def test_iteration_timeout_abort(self) -> dict:
|
|
"""Test 5: Verify iteration timeout triggers abort."""
|
|
test_name = "Iteration Timeout Abort"
|
|
pipeline_id = f"test-timeout-{int(time.time())}"
|
|
|
|
try:
|
|
# Create pipeline that has exceeded iterations
|
|
self.redis.hset(f"pipeline:{pipeline_id}", mapping={
|
|
"task_id": "task-timeout",
|
|
"objective": "Test timeout",
|
|
"status": "ABORTED",
|
|
"iteration_count": "12",
|
|
"max_iterations": "10",
|
|
"abort_reason": "iteration_limit"
|
|
})
|
|
|
|
# Verify abort state
|
|
status = self.redis.hget(f"pipeline:{pipeline_id}", "status")
|
|
reason = self.redis.hget(f"pipeline:{pipeline_id}", "abort_reason")
|
|
|
|
passed = status == "ABORTED" and reason == "iteration_limit"
|
|
|
|
result = {
|
|
"test": test_name,
|
|
"passed": passed,
|
|
"status": status,
|
|
"abort_reason": reason
|
|
}
|
|
|
|
self.cleanup_test_pipelines([pipeline_id])
|
|
return result
|
|
|
|
except Exception as e:
|
|
return {"test": test_name, "passed": False, "error": str(e)}
|
|
|
|
def test_stuck_detection(self) -> dict:
|
|
"""Test 6: Verify stuck agent detection."""
|
|
test_name = "Stuck Agent Detection"
|
|
pipeline_id = f"test-stuck-{int(time.time())}"
|
|
|
|
try:
|
|
# Create pipeline with old last_progress timestamp
|
|
old_time = datetime.utcnow().timestamp() - 120 # 2 minutes ago
|
|
|
|
self.redis.hset(f"pipeline:{pipeline_id}", mapping={
|
|
"task_id": "task-stuck",
|
|
"objective": "Test stuck detection",
|
|
"status": "RUNNING",
|
|
"last_progress": str(old_time)
|
|
})
|
|
|
|
# Check if stuck detection would trigger (60 second timeout)
|
|
last_progress = float(self.redis.hget(f"pipeline:{pipeline_id}", "last_progress") or 0)
|
|
current = datetime.utcnow().timestamp()
|
|
stuck = (current - last_progress) > 60
|
|
|
|
passed = stuck # Should be detected as stuck
|
|
|
|
result = {
|
|
"test": test_name,
|
|
"passed": passed,
|
|
"time_since_progress_seconds": current - last_progress,
|
|
"stuck_detected": stuck
|
|
}
|
|
|
|
self.cleanup_test_pipelines([pipeline_id])
|
|
return result
|
|
|
|
except Exception as e:
|
|
return {"test": test_name, "passed": False, "error": str(e)}
|
|
|
|
def run_all_tests(self) -> dict:
|
|
"""Run all auto-recovery tests."""
|
|
print("\n" + "=" * 60)
|
|
print("AUTO-RECOVERY TEST SUITE")
|
|
print("=" * 60 + "\n")
|
|
|
|
tests = [
|
|
self.test_consensus_failure_detection,
|
|
self.test_max_recovery_limit,
|
|
self.test_failure_context_structure,
|
|
self.test_gamma_force_on_recovery,
|
|
self.test_iteration_timeout_abort,
|
|
self.test_stuck_detection,
|
|
]
|
|
|
|
passed = 0
|
|
failed = 0
|
|
|
|
for test_func in tests:
|
|
result = test_func()
|
|
self.results.append(result)
|
|
|
|
status = "PASS" if result["passed"] else "FAIL"
|
|
symbol = "✓" if result["passed"] else "✗"
|
|
|
|
print(f" {symbol} {status}: {result['test']}")
|
|
|
|
if result["passed"]:
|
|
passed += 1
|
|
else:
|
|
failed += 1
|
|
if "error" in result:
|
|
print(f" Error: {result['error']}")
|
|
|
|
print(f"\n{'='*60}")
|
|
print(f"RESULTS: {passed}/{passed+failed} passed")
|
|
print(f"{'='*60}\n")
|
|
|
|
return {
|
|
"total_tests": len(tests),
|
|
"passed": passed,
|
|
"failed": failed,
|
|
"success_rate": passed / len(tests) if tests else 0,
|
|
"results": self.results
|
|
}
|
|
|
|
|
|
def main():
|
|
"""Run auto-recovery tests."""
|
|
tester = AutoRecoveryTester()
|
|
results = tester.run_all_tests()
|
|
|
|
print("\nDetailed Results:")
|
|
print("-" * 40)
|
|
for r in results["results"]:
|
|
status = "PASS" if r["passed"] else "FAIL"
|
|
print(f" [{status}] {r['test']}")
|
|
for k, v in r.items():
|
|
if k not in ["test", "passed"]:
|
|
print(f" {k}: {v}")
|
|
|
|
return 0 if results["failed"] == 0 else 1
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|