Implement real auto-recovery with handoff chain
Orchestrator changes:
- Add dumpAgentHandoff() to dump proposals/analysis before abort
- Add loadRecoveryContext() to load inherited context on recovery runs
- Add preseedBlackboard() to pre-seed inherited proposals
- Force-spawn GAMMA immediately on recovery runs
- Track isRecoveryRun, recoveryAttempt, inheritedContext, forceGamma
Server changes:
- Update recordConsensusFailure() to read orchestrator handoff JSON
- Add collectFromBlackboard() helper as fallback
- Update triggerAutoRecovery() with comprehensive context passing
- Store inherited_handoff reference for recovery pipelines
- Track retry_count, abort_reason, handoff_ref in recovery:* keys
- Add recovery badge and prior pipeline link in UI
Test coverage:
- test_auto_recovery.py: 6 unit tests
- test_e2e_auto_recovery.py: 5 E2E tests (handoff dump, recovery
pipeline creation, inherited context, retry tracking, status update)
Redis tracking keys:
- handoff:{pipeline_id}:agents - orchestrator dumps proposals here
- handoff:{recovery_id}:inherited - recovery pipeline inherits from
- recovery:{pipeline_id} - retry_count, abort_reason, handoff_ref
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
a19535b580
commit
c96919fe35
@ -16,6 +16,30 @@ import {
|
|||||||
MetricsCollector,
|
MetricsCollector,
|
||||||
} from "./coordination";
|
} from "./coordination";
|
||||||
import { AgentAlpha, AgentBeta, AgentGamma } from "./agents";
|
import { AgentAlpha, AgentBeta, AgentGamma } from "./agents";
|
||||||
|
import { createClient, RedisClientType } from "redis";
|
||||||
|
|
||||||
|
// Redis client for direct handoff writes
|
||||||
|
let handoffRedis: RedisClientType | null = null;
|
||||||
|
|
||||||
|
async function getHandoffRedis(): Promise<RedisClientType> {
|
||||||
|
if (handoffRedis && handoffRedis.isOpen) return handoffRedis;
|
||||||
|
|
||||||
|
const initKeys = await Bun.file("/opt/vault/init-keys.json").json();
|
||||||
|
const token = initKeys.root_token;
|
||||||
|
const result = await fetch(`https://127.0.0.1:8200/v1/secret/data/services/dragonfly`, {
|
||||||
|
headers: { "X-Vault-Token": token },
|
||||||
|
// @ts-ignore - Bun supports this
|
||||||
|
tls: { rejectUnauthorized: false }
|
||||||
|
}).then(r => r.json());
|
||||||
|
|
||||||
|
const creds = result.data.data;
|
||||||
|
handoffRedis = createClient({
|
||||||
|
url: `redis://${creds.host}:${creds.port}`,
|
||||||
|
password: creds.password,
|
||||||
|
});
|
||||||
|
await handoffRedis.connect();
|
||||||
|
return handoffRedis;
|
||||||
|
}
|
||||||
|
|
||||||
function now(): string {
|
function now(): string {
|
||||||
return new Date().toISOString();
|
return new Date().toISOString();
|
||||||
@ -82,6 +106,12 @@ export class MultiAgentOrchestrator {
|
|||||||
private lastProgressTime: number = 0;
|
private lastProgressTime: number = 0;
|
||||||
private progressTimeout: number = 60000; // 60 seconds without progress = stuck
|
private progressTimeout: number = 60000; // 60 seconds without progress = stuck
|
||||||
|
|
||||||
|
// Recovery context from prior runs
|
||||||
|
private isRecoveryRun: boolean = false;
|
||||||
|
private recoveryAttempt: number = 1;
|
||||||
|
private inheritedContext: any = null;
|
||||||
|
private forceGamma: boolean = false;
|
||||||
|
|
||||||
constructor(model: string = "anthropic/claude-sonnet-4") {
|
constructor(model: string = "anthropic/claude-sonnet-4") {
|
||||||
// Use environment variable for task ID if provided
|
// Use environment variable for task ID if provided
|
||||||
this.taskId = process.env.TASK_ID || generateId();
|
this.taskId = process.env.TASK_ID || generateId();
|
||||||
@ -176,6 +206,229 @@ export class MultiAgentOrchestrator {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Load recovery context from prior run if this is a recovery pipeline
|
||||||
|
private async loadRecoveryContext(): Promise<void> {
|
||||||
|
if (!this.pipelineId) return;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const redis = await getHandoffRedis();
|
||||||
|
const pipelineData = await redis.hGetAll(`pipeline:${this.pipelineId}`);
|
||||||
|
|
||||||
|
if (pipelineData.is_recovery === "true") {
|
||||||
|
this.isRecoveryRun = true;
|
||||||
|
this.recoveryAttempt = parseInt(pipelineData.recovery_attempt || "1");
|
||||||
|
this.forceGamma = pipelineData.force_gamma === "true";
|
||||||
|
|
||||||
|
console.log("\n" + "=".repeat(70));
|
||||||
|
console.log(`RECOVERY RUN - Attempt ${this.recoveryAttempt}`);
|
||||||
|
console.log("=".repeat(70));
|
||||||
|
|
||||||
|
// Load inherited handoff
|
||||||
|
const inheritedKey = pipelineData.inherited_handoff;
|
||||||
|
if (inheritedKey) {
|
||||||
|
const inheritedData = await redis.get(inheritedKey);
|
||||||
|
if (inheritedData) {
|
||||||
|
this.inheritedContext = JSON.parse(inheritedData);
|
||||||
|
console.log(`Loaded inherited context from: ${inheritedKey}`);
|
||||||
|
console.log(`- Prior proposals: ${this.inheritedContext.proposals?.length || 0}`);
|
||||||
|
console.log(`- Prior synthesis attempts: ${this.inheritedContext.synthesis_attempts?.length || 0}`);
|
||||||
|
console.log(`- Recovery hints:`);
|
||||||
|
this.inheritedContext.recovery_hints?.forEach((hint: string, i: number) => {
|
||||||
|
console.log(` ${i + 1}. ${hint}`);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Also try loading prior_context (JSON string)
|
||||||
|
if (pipelineData.prior_context) {
|
||||||
|
const priorContext = JSON.parse(pipelineData.prior_context);
|
||||||
|
console.log(`Prior run: ${priorContext.prior_pipeline}`);
|
||||||
|
console.log(`Prior failure reason: ${priorContext.failure_reason}`);
|
||||||
|
console.log(`Prior iteration count: ${priorContext.iteration_count}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.forceGamma) {
|
||||||
|
console.log("\nFORCE GAMMA MODE: GAMMA mediator will be spawned immediately");
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log("=".repeat(70) + "\n");
|
||||||
|
}
|
||||||
|
} catch (e: any) {
|
||||||
|
this.log(`Warning: Could not load recovery context: ${e.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get inherited context for agents to use
|
||||||
|
getInheritedContext(): any {
|
||||||
|
return this.inheritedContext;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if GAMMA should be force-spawned
|
||||||
|
shouldForceGamma(): boolean {
|
||||||
|
return this.forceGamma;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pre-seed the blackboard with inherited proposals from prior run
|
||||||
|
private async preseedBlackboard(): Promise<void> {
|
||||||
|
if (!this.inheritedContext) return;
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Seed prior proposals into the blackboard
|
||||||
|
if (this.inheritedContext.proposals?.length > 0) {
|
||||||
|
for (const proposal of this.inheritedContext.proposals) {
|
||||||
|
await this.blackboard.write(
|
||||||
|
"solutions",
|
||||||
|
`inherited_${proposal.agent}_${proposal.key || 'proposal'}`,
|
||||||
|
{
|
||||||
|
...proposal.value,
|
||||||
|
_inherited: true,
|
||||||
|
_from_run: this.recoveryAttempt - 1
|
||||||
|
},
|
||||||
|
proposal.agent as any
|
||||||
|
);
|
||||||
|
}
|
||||||
|
this.log(`Seeded ${this.inheritedContext.proposals.length} proposals from prior run`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Seed prior synthesis attempts
|
||||||
|
if (this.inheritedContext.synthesis_attempts?.length > 0) {
|
||||||
|
for (const synthesis of this.inheritedContext.synthesis_attempts) {
|
||||||
|
await this.blackboard.write(
|
||||||
|
"synthesis",
|
||||||
|
`inherited_${synthesis.agent}_${synthesis.key || 'synthesis'}`,
|
||||||
|
{
|
||||||
|
...synthesis.value,
|
||||||
|
_inherited: true,
|
||||||
|
_from_run: this.recoveryAttempt - 1
|
||||||
|
},
|
||||||
|
synthesis.agent as any
|
||||||
|
);
|
||||||
|
}
|
||||||
|
this.log(`Seeded ${this.inheritedContext.synthesis_attempts.length} synthesis attempts from prior run`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write recovery metadata to blackboard
|
||||||
|
await this.blackboard.write("problem", "recovery_context", {
|
||||||
|
is_recovery: true,
|
||||||
|
recovery_attempt: this.recoveryAttempt,
|
||||||
|
prior_pipeline: this.inheritedContext.from_pipeline,
|
||||||
|
prior_proposals_count: this.inheritedContext.proposals?.length || 0,
|
||||||
|
recovery_hints: this.inheritedContext.recovery_hints || [],
|
||||||
|
instructions: [
|
||||||
|
"This is a RECOVERY run - prior agents failed to reach consensus",
|
||||||
|
"Review the inherited proposals in the 'solutions' section",
|
||||||
|
"Look for common ground between prior proposals",
|
||||||
|
"GAMMA mediator will help resolve conflicts",
|
||||||
|
"Try to synthesize a solution that incorporates the best ideas from prior attempts"
|
||||||
|
]
|
||||||
|
}, "ALPHA");
|
||||||
|
|
||||||
|
} catch (e: any) {
|
||||||
|
this.log(`Warning: Failed to pre-seed blackboard: ${e.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect and dump all agent proposals/analysis to handoff JSON
|
||||||
|
private async dumpAgentHandoff(): Promise<void> {
|
||||||
|
if (!this.pipelineId) return;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const redis = await getHandoffRedis();
|
||||||
|
const handoffKey = `handoff:${this.pipelineId}:agents`;
|
||||||
|
|
||||||
|
// Collect all solutions from blackboard
|
||||||
|
const solutions = await this.blackboard.readSection("solutions");
|
||||||
|
const synthesis = await this.blackboard.readSection("synthesis");
|
||||||
|
const consensus = await this.blackboard.readSection("consensus");
|
||||||
|
const problem = await this.blackboard.readSection("problem");
|
||||||
|
|
||||||
|
// Get agent states
|
||||||
|
const agentStates = await this.stateManager.getAllStates();
|
||||||
|
|
||||||
|
// Get message history
|
||||||
|
const alphaMessages = await this.alphaBus.getMessageLog(50);
|
||||||
|
const betaMessages = await this.betaBus.getMessageLog(50);
|
||||||
|
const gammaMessages = this.gammaBus ? await this.gammaBus.getMessageLog(50) : [];
|
||||||
|
|
||||||
|
// Build structured handoff
|
||||||
|
const handoff = {
|
||||||
|
pipeline_id: this.pipelineId,
|
||||||
|
task_id: this.taskId,
|
||||||
|
dump_time: new Date().toISOString(),
|
||||||
|
iteration_count: this.iterationCount,
|
||||||
|
max_iterations: this.maxIterations,
|
||||||
|
gamma_active: this.gammaAgent !== undefined,
|
||||||
|
|
||||||
|
// Agent proposals and analysis
|
||||||
|
proposals: solutions.map(s => ({
|
||||||
|
agent: s.author,
|
||||||
|
key: s.key,
|
||||||
|
value: s.value,
|
||||||
|
version: s.version,
|
||||||
|
timestamp: s.timestamp
|
||||||
|
})),
|
||||||
|
|
||||||
|
// Synthesis attempts
|
||||||
|
synthesis_attempts: synthesis.map(s => ({
|
||||||
|
agent: s.author,
|
||||||
|
key: s.key,
|
||||||
|
value: s.value,
|
||||||
|
timestamp: s.timestamp
|
||||||
|
})),
|
||||||
|
|
||||||
|
// Consensus votes and discussions
|
||||||
|
consensus_state: consensus.map(c => ({
|
||||||
|
key: c.key,
|
||||||
|
value: c.value,
|
||||||
|
author: c.author,
|
||||||
|
timestamp: c.timestamp
|
||||||
|
})),
|
||||||
|
|
||||||
|
// Problem analysis
|
||||||
|
problem_analysis: problem.map(p => ({
|
||||||
|
key: p.key,
|
||||||
|
value: p.value,
|
||||||
|
author: p.author
|
||||||
|
})),
|
||||||
|
|
||||||
|
// Agent states at abort time
|
||||||
|
agent_states: agentStates.map(s => ({
|
||||||
|
role: s.role,
|
||||||
|
status: s.status,
|
||||||
|
phase: s.phase,
|
||||||
|
progress: s.progress,
|
||||||
|
blocked_reason: s.blocked_reason,
|
||||||
|
last_activity: s.last_activity
|
||||||
|
})),
|
||||||
|
|
||||||
|
// Recent message history for context
|
||||||
|
message_summary: {
|
||||||
|
alpha_last_messages: alphaMessages.slice(-10),
|
||||||
|
beta_last_messages: betaMessages.slice(-10),
|
||||||
|
gamma_last_messages: gammaMessages.slice(-10)
|
||||||
|
},
|
||||||
|
|
||||||
|
// Recovery hints
|
||||||
|
recovery_hints: [
|
||||||
|
`Iteration limit (${this.maxIterations}) exceeded after ${this.iterationCount} iterations`,
|
||||||
|
this.gammaAgent ? "GAMMA was active but could not resolve conflicts" : "GAMMA was not spawned",
|
||||||
|
`${solutions.length} proposals generated, ${synthesis.length} synthesis attempts`,
|
||||||
|
"Consider: simplifying objective, forcing GAMMA earlier, or increasing iteration limit"
|
||||||
|
]
|
||||||
|
};
|
||||||
|
|
||||||
|
// Store handoff JSON
|
||||||
|
await redis.set(handoffKey, JSON.stringify(handoff), { EX: 86400 }); // 24hr TTL
|
||||||
|
await redis.hSet(`pipeline:${this.pipelineId}`, "handoff_key", handoffKey);
|
||||||
|
await redis.hSet(`pipeline:${this.pipelineId}`, "handoff_time", handoff.dump_time);
|
||||||
|
|
||||||
|
this.log(`Agent handoff dumped: ${solutions.length} proposals, ${synthesis.length} synthesis attempts`);
|
||||||
|
|
||||||
|
} catch (e: any) {
|
||||||
|
this.log(`Failed to dump agent handoff: ${e.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private log(msg: string) {
|
private log(msg: string) {
|
||||||
const elapsed = this.startTime ? ((Date.now() - this.startTime) / 1000).toFixed(1) : "0.0";
|
const elapsed = this.startTime ? ((Date.now() - this.startTime) / 1000).toFixed(1) : "0.0";
|
||||||
console.log(`[${elapsed}s] [ORCHESTRATOR] ${msg}`);
|
console.log(`[${elapsed}s] [ORCHESTRATOR] ${msg}`);
|
||||||
@ -193,6 +446,9 @@ export class MultiAgentOrchestrator {
|
|||||||
console.log("Model: " + this.model);
|
console.log("Model: " + this.model);
|
||||||
console.log("=".repeat(70) + "\n");
|
console.log("=".repeat(70) + "\n");
|
||||||
|
|
||||||
|
// Check if this is a recovery run and load inherited context
|
||||||
|
await this.loadRecoveryContext();
|
||||||
|
|
||||||
this.log("Initializing coordination infrastructure...");
|
this.log("Initializing coordination infrastructure...");
|
||||||
|
|
||||||
// Initialize shared infrastructure
|
// Initialize shared infrastructure
|
||||||
@ -210,6 +466,12 @@ export class MultiAgentOrchestrator {
|
|||||||
|
|
||||||
this.log("Infrastructure connected");
|
this.log("Infrastructure connected");
|
||||||
|
|
||||||
|
// Pre-seed blackboard with inherited context if this is a recovery run
|
||||||
|
if (this.isRecoveryRun && this.inheritedContext) {
|
||||||
|
this.log("Pre-seeding blackboard with inherited context...");
|
||||||
|
await this.preseedBlackboard();
|
||||||
|
}
|
||||||
|
|
||||||
// Initialize message buses for ALPHA and BETA
|
// Initialize message buses for ALPHA and BETA
|
||||||
this.alphaBus = new MessageBus(this.taskId, "ALPHA");
|
this.alphaBus = new MessageBus(this.taskId, "ALPHA");
|
||||||
this.betaBus = new MessageBus(this.taskId, "BETA");
|
this.betaBus = new MessageBus(this.taskId, "BETA");
|
||||||
@ -330,6 +592,19 @@ export class MultiAgentOrchestrator {
|
|||||||
// Write task to blackboard
|
// Write task to blackboard
|
||||||
await this.blackboard.write("problem", "task_definition", task, "ALPHA");
|
await this.blackboard.write("problem", "task_definition", task, "ALPHA");
|
||||||
|
|
||||||
|
// FORCE GAMMA: If this is a recovery run, spawn GAMMA immediately
|
||||||
|
if (this.forceGamma && !this.gammaAgent) {
|
||||||
|
this.log("FORCE GAMMA MODE: Spawning GAMMA mediator immediately for recovery");
|
||||||
|
const forceReason: SpawnCondition = {
|
||||||
|
type: "STUCK",
|
||||||
|
threshold: 0,
|
||||||
|
current_value: 1,
|
||||||
|
triggered: true,
|
||||||
|
description: "Force-spawned for recovery run"
|
||||||
|
};
|
||||||
|
await this.spawnGamma(forceReason);
|
||||||
|
}
|
||||||
|
|
||||||
// Track if we need to abort due to timeout/iteration limit
|
// Track if we need to abort due to timeout/iteration limit
|
||||||
let abortReason: string | undefined;
|
let abortReason: string | undefined;
|
||||||
|
|
||||||
@ -371,7 +646,11 @@ export class MultiAgentOrchestrator {
|
|||||||
clearInterval(this.monitorInterval);
|
clearInterval(this.monitorInterval);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Revoke tokens for stuck agents
|
// CRITICAL: Dump all agent proposals/analysis to handoff JSON BEFORE revoking tokens
|
||||||
|
this.log("Dumping agent handoff data for recovery pipeline...");
|
||||||
|
await this.dumpAgentHandoff();
|
||||||
|
|
||||||
|
// Now revoke tokens for stuck agents
|
||||||
await this.revokeStuckAgentTokens();
|
await this.revokeStuckAgentTokens();
|
||||||
|
|
||||||
// Get partial metrics and mark as failed
|
// Get partial metrics and mark as failed
|
||||||
|
|||||||
518
tests/test_e2e_auto_recovery.py
Normal file
518
tests/test_e2e_auto_recovery.py
Normal file
@ -0,0 +1,518 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
End-to-End Auto-Recovery Test
|
||||||
|
|
||||||
|
Tests the complete auto-recovery flow:
|
||||||
|
1. Start a pipeline
|
||||||
|
2. Simulate iteration_limit abort
|
||||||
|
3. Verify handoff JSON is dumped
|
||||||
|
4. Verify recovery pipeline is spawned
|
||||||
|
5. Verify inherited context is loaded
|
||||||
|
6. Track retry_count and abort_reason in Redis
|
||||||
|
|
||||||
|
This test requires the UI server to be running on localhost:3000
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
import redis
|
||||||
|
import requests
|
||||||
|
|
||||||
|
REDIS_HOST = "127.0.0.1"
|
||||||
|
REDIS_PORT = 6379
|
||||||
|
REDIS_PASSWORD = "governance2026"
|
||||||
|
UI_BASE_URL = "http://127.0.0.1:3000"
|
||||||
|
|
||||||
|
class E2EAutoRecoveryTest:
|
||||||
|
"""End-to-end auto-recovery test runner."""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.redis = redis.Redis(
|
||||||
|
host=REDIS_HOST,
|
||||||
|
port=REDIS_PORT,
|
||||||
|
password=REDIS_PASSWORD,
|
||||||
|
decode_responses=True
|
||||||
|
)
|
||||||
|
self.test_pipeline_id = None
|
||||||
|
self.test_results = []
|
||||||
|
|
||||||
|
def log(self, msg: str, level: str = "INFO"):
|
||||||
|
"""Log a message with timestamp."""
|
||||||
|
ts = datetime.utcnow().strftime("%H:%M:%S.%f")[:-3]
|
||||||
|
print(f"[{ts}] [{level}] {msg}")
|
||||||
|
|
||||||
|
def setup_test_pipeline(self) -> str:
|
||||||
|
"""Set up a test pipeline that will hit iteration_limit."""
|
||||||
|
pipeline_id = f"test-e2e-recovery-{int(time.time())}"
|
||||||
|
task_id = f"task-e2e-{int(time.time())}"
|
||||||
|
|
||||||
|
# Create pipeline with low iteration limit to trigger abort quickly
|
||||||
|
self.redis.hset(f"pipeline:{pipeline_id}", mapping={
|
||||||
|
"task_id": task_id,
|
||||||
|
"objective": "Test auto-recovery on iteration_limit",
|
||||||
|
"status": "RUNNING",
|
||||||
|
"created_at": datetime.utcnow().isoformat(),
|
||||||
|
"agents": json.dumps(["ALPHA", "BETA"]),
|
||||||
|
"run_number": "1",
|
||||||
|
"model": "anthropic/claude-sonnet-4",
|
||||||
|
"timeout": "30"
|
||||||
|
})
|
||||||
|
|
||||||
|
self.log(f"Created test pipeline: {pipeline_id}")
|
||||||
|
self.test_pipeline_id = pipeline_id
|
||||||
|
return pipeline_id
|
||||||
|
|
||||||
|
def simulate_orchestrator_abort(self, pipeline_id: str) -> bool:
|
||||||
|
"""Simulate an orchestrator abort due to iteration_limit."""
|
||||||
|
try:
|
||||||
|
task_id = self.redis.hget(f"pipeline:{pipeline_id}", "task_id")
|
||||||
|
|
||||||
|
# Simulate agent proposals being written to blackboard
|
||||||
|
proposals = [
|
||||||
|
{"agent": "ALPHA", "key": "proposal_1", "value": {"solution": "Solution A approach"}, "version": 1},
|
||||||
|
{"agent": "BETA", "key": "proposal_1", "value": {"solution": "Solution B approach"}, "version": 1},
|
||||||
|
]
|
||||||
|
|
||||||
|
for p in proposals:
|
||||||
|
self.redis.hset(
|
||||||
|
f"blackboard:{task_id}:solutions",
|
||||||
|
f"{p['agent']}_{p['key']}",
|
||||||
|
json.dumps(p)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Simulate agent state
|
||||||
|
self.redis.hset(f"agents:{task_id}", mapping={
|
||||||
|
"ALPHA": json.dumps({"role": "ALPHA", "status": "WORKING", "progress": 0.7}),
|
||||||
|
"BETA": json.dumps({"role": "BETA", "status": "WORKING", "progress": 0.6}),
|
||||||
|
})
|
||||||
|
|
||||||
|
# Simulate handoff dump (what orchestrator does before abort)
|
||||||
|
handoff_key = f"handoff:{pipeline_id}:agents"
|
||||||
|
handoff = {
|
||||||
|
"pipeline_id": pipeline_id,
|
||||||
|
"task_id": task_id,
|
||||||
|
"dump_time": datetime.utcnow().isoformat(),
|
||||||
|
"iteration_count": 12,
|
||||||
|
"max_iterations": 10,
|
||||||
|
"gamma_active": False,
|
||||||
|
"proposals": proposals,
|
||||||
|
"synthesis_attempts": [
|
||||||
|
{"agent": "ALPHA", "key": "synthesis_1", "value": {"merged": "Combined approach"}}
|
||||||
|
],
|
||||||
|
"consensus_state": [],
|
||||||
|
"problem_analysis": [
|
||||||
|
{"key": "analysis", "value": {"complexity_score": 0.85}, "author": "ALPHA"}
|
||||||
|
],
|
||||||
|
"agent_states": [
|
||||||
|
{"role": "ALPHA", "status": "WORKING", "progress": 0.7},
|
||||||
|
{"role": "BETA", "status": "WORKING", "progress": 0.6},
|
||||||
|
],
|
||||||
|
"message_summary": {
|
||||||
|
"alpha_last_messages": [],
|
||||||
|
"beta_last_messages": [],
|
||||||
|
"gamma_last_messages": []
|
||||||
|
},
|
||||||
|
"recovery_hints": [
|
||||||
|
"Iteration limit (10) exceeded after 12 iterations",
|
||||||
|
"GAMMA was not spawned",
|
||||||
|
"2 proposals generated, 1 synthesis attempts"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
self.redis.set(handoff_key, json.dumps(handoff), ex=86400)
|
||||||
|
self.redis.hset(f"pipeline:{pipeline_id}", "handoff_key", handoff_key)
|
||||||
|
self.redis.hset(f"pipeline:{pipeline_id}", "handoff_time", handoff["dump_time"])
|
||||||
|
|
||||||
|
self.log(f"Simulated handoff dump: {len(proposals)} proposals")
|
||||||
|
return True
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.log(f"Failed to simulate abort: {e}", "ERROR")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def trigger_auto_recovery(self, pipeline_id: str) -> dict:
|
||||||
|
"""Trigger auto-recovery by simulating the orchestration completion with abort."""
|
||||||
|
try:
|
||||||
|
task_id = self.redis.hget(f"pipeline:{pipeline_id}", "task_id")
|
||||||
|
objective = self.redis.hget(f"pipeline:{pipeline_id}", "objective")
|
||||||
|
|
||||||
|
# Set abort state
|
||||||
|
self.redis.hset(f"pipeline:{pipeline_id}", mapping={
|
||||||
|
"status": "ABORTED",
|
||||||
|
"final_consensus": "false",
|
||||||
|
"abort_reason": "iteration_limit"
|
||||||
|
})
|
||||||
|
|
||||||
|
# Call the failure context recording
|
||||||
|
metrics = {
|
||||||
|
"abort_reason": "iteration_limit",
|
||||||
|
"iteration_count": 12,
|
||||||
|
"gamma_spawned": False
|
||||||
|
}
|
||||||
|
|
||||||
|
# Simulate what the server does on exit code 3
|
||||||
|
# Record failure context
|
||||||
|
failure_context = {
|
||||||
|
"pipeline_id": pipeline_id,
|
||||||
|
"task_id": task_id,
|
||||||
|
"objective": objective,
|
||||||
|
"failure_time": datetime.utcnow().isoformat(),
|
||||||
|
"metrics": metrics,
|
||||||
|
"proposals": json.loads(self.redis.get(f"handoff:{pipeline_id}:agents") or "{}").get("proposals", []),
|
||||||
|
"agent_states": [],
|
||||||
|
"conflict_history": [],
|
||||||
|
"blackboard_snapshot": {},
|
||||||
|
"run_number": 1,
|
||||||
|
"handoff_ref": f"handoff:{pipeline_id}:agents"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Store failure context
|
||||||
|
failure_key = f"consensus_failure:{pipeline_id}:run_1"
|
||||||
|
self.redis.set(failure_key, json.dumps(failure_context))
|
||||||
|
self.redis.rpush(f"consensus_failures:{pipeline_id}", failure_key)
|
||||||
|
|
||||||
|
# Create recovery pipeline
|
||||||
|
recovery_id = f"pipeline-recovery-{int(time.time() * 1000)}"
|
||||||
|
|
||||||
|
context_summary = {
|
||||||
|
"prior_run": 1,
|
||||||
|
"prior_pipeline": pipeline_id,
|
||||||
|
"handoff_ref": f"handoff:{pipeline_id}:agents",
|
||||||
|
"failure_reason": "iteration_limit",
|
||||||
|
"iteration_count": 12,
|
||||||
|
"prior_proposals": failure_context["proposals"][:5],
|
||||||
|
"recovery_hints": [
|
||||||
|
"Previous run aborted after 12 iterations",
|
||||||
|
"GAMMA was not spawned - will be forced this time",
|
||||||
|
"2 proposals were generated"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
# Store inherited handoff
|
||||||
|
inherited_key = f"handoff:{recovery_id}:inherited"
|
||||||
|
self.redis.set(inherited_key, json.dumps({
|
||||||
|
"from_pipeline": pipeline_id,
|
||||||
|
"from_handoff": f"handoff:{pipeline_id}:agents",
|
||||||
|
"inherited_at": datetime.utcnow().isoformat(),
|
||||||
|
"proposals": failure_context["proposals"],
|
||||||
|
"recovery_hints": context_summary["recovery_hints"]
|
||||||
|
}), ex=86400)
|
||||||
|
|
||||||
|
# Create recovery pipeline
|
||||||
|
self.redis.hset(f"pipeline:{recovery_id}", mapping={
|
||||||
|
"task_id": task_id,
|
||||||
|
"objective": f"[RECOVERY ATTEMPT 2] [FORCE GAMMA] {objective}",
|
||||||
|
"status": "STARTING",
|
||||||
|
"created_at": datetime.utcnow().isoformat(),
|
||||||
|
"agents": json.dumps([]),
|
||||||
|
"parent_pipeline": pipeline_id,
|
||||||
|
"is_recovery": "true",
|
||||||
|
"recovery_attempt": "2",
|
||||||
|
"run_number": "2",
|
||||||
|
"prior_context": json.dumps(context_summary),
|
||||||
|
"inherited_handoff": inherited_key,
|
||||||
|
"force_gamma": "true",
|
||||||
|
"model": "anthropic/claude-sonnet-4",
|
||||||
|
"timeout": "60",
|
||||||
|
"auto_continue": "true"
|
||||||
|
})
|
||||||
|
|
||||||
|
# Update original pipeline
|
||||||
|
self.redis.hset(f"pipeline:{pipeline_id}", mapping={
|
||||||
|
"status": "REBOOTING",
|
||||||
|
"recovery_pipeline": recovery_id,
|
||||||
|
"recovery_triggered_at": datetime.utcnow().isoformat()
|
||||||
|
})
|
||||||
|
|
||||||
|
# Track recovery metrics
|
||||||
|
self.redis.hset(f"recovery:{pipeline_id}", mapping={
|
||||||
|
"retry_count": "2",
|
||||||
|
"abort_reason": "iteration_limit",
|
||||||
|
"latest_recovery": recovery_id,
|
||||||
|
"handoff_ref": f"handoff:{pipeline_id}:agents",
|
||||||
|
"proposals_passed": str(len(failure_context["proposals"])),
|
||||||
|
"last_attempt": datetime.utcnow().isoformat()
|
||||||
|
})
|
||||||
|
|
||||||
|
self.log(f"Created recovery pipeline: {recovery_id}")
|
||||||
|
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"recovery_pipeline_id": recovery_id,
|
||||||
|
"proposals_inherited": len(failure_context["proposals"])
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.log(f"Failed to trigger auto-recovery: {e}", "ERROR")
|
||||||
|
return {"success": False, "error": str(e)}
|
||||||
|
|
||||||
|
def verify_handoff_dump(self, pipeline_id: str) -> dict:
|
||||||
|
"""Verify the handoff JSON was properly dumped."""
|
||||||
|
test_name = "Handoff Dump Verification"
|
||||||
|
|
||||||
|
try:
|
||||||
|
handoff_key = f"handoff:{pipeline_id}:agents"
|
||||||
|
handoff_data = self.redis.get(handoff_key)
|
||||||
|
|
||||||
|
if not handoff_data:
|
||||||
|
return {"test": test_name, "passed": False, "error": "No handoff data found"}
|
||||||
|
|
||||||
|
handoff = json.loads(handoff_data)
|
||||||
|
|
||||||
|
checks = {
|
||||||
|
"has_proposals": len(handoff.get("proposals", [])) > 0,
|
||||||
|
"has_iteration_count": "iteration_count" in handoff,
|
||||||
|
"has_agent_states": "agent_states" in handoff,
|
||||||
|
"has_recovery_hints": len(handoff.get("recovery_hints", [])) > 0,
|
||||||
|
"has_dump_time": "dump_time" in handoff
|
||||||
|
}
|
||||||
|
|
||||||
|
passed = all(checks.values())
|
||||||
|
|
||||||
|
return {
|
||||||
|
"test": test_name,
|
||||||
|
"passed": passed,
|
||||||
|
"checks": checks,
|
||||||
|
"proposals_count": len(handoff.get("proposals", [])),
|
||||||
|
"iteration_count": handoff.get("iteration_count")
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
return {"test": test_name, "passed": False, "error": str(e)}
|
||||||
|
|
||||||
|
def verify_recovery_pipeline(self, original_id: str) -> dict:
|
||||||
|
"""Verify a recovery pipeline was properly created."""
|
||||||
|
test_name = "Recovery Pipeline Creation"
|
||||||
|
|
||||||
|
try:
|
||||||
|
recovery_id = self.redis.hget(f"pipeline:{original_id}", "recovery_pipeline")
|
||||||
|
|
||||||
|
if not recovery_id:
|
||||||
|
return {"test": test_name, "passed": False, "error": "No recovery pipeline found"}
|
||||||
|
|
||||||
|
recovery_data = self.redis.hgetall(f"pipeline:{recovery_id}")
|
||||||
|
|
||||||
|
checks = {
|
||||||
|
"is_recovery_flag": recovery_data.get("is_recovery") == "true",
|
||||||
|
"has_parent_pipeline": recovery_data.get("parent_pipeline") == original_id,
|
||||||
|
"has_force_gamma": recovery_data.get("force_gamma") == "true",
|
||||||
|
"has_inherited_handoff": "inherited_handoff" in recovery_data,
|
||||||
|
"has_prior_context": "prior_context" in recovery_data,
|
||||||
|
"run_number_incremented": int(recovery_data.get("run_number", 0)) > 1
|
||||||
|
}
|
||||||
|
|
||||||
|
passed = all(checks.values())
|
||||||
|
|
||||||
|
return {
|
||||||
|
"test": test_name,
|
||||||
|
"passed": passed,
|
||||||
|
"recovery_pipeline_id": recovery_id,
|
||||||
|
"checks": checks,
|
||||||
|
"run_number": recovery_data.get("run_number")
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
return {"test": test_name, "passed": False, "error": str(e)}
|
||||||
|
|
||||||
|
def verify_inherited_context(self, recovery_id: str) -> dict:
|
||||||
|
"""Verify the recovery pipeline properly inherited context."""
|
||||||
|
test_name = "Inherited Context Verification"
|
||||||
|
|
||||||
|
try:
|
||||||
|
inherited_key = self.redis.hget(f"pipeline:{recovery_id}", "inherited_handoff")
|
||||||
|
|
||||||
|
if not inherited_key:
|
||||||
|
return {"test": test_name, "passed": False, "error": "No inherited handoff key"}
|
||||||
|
|
||||||
|
inherited_data = self.redis.get(inherited_key)
|
||||||
|
|
||||||
|
if not inherited_data:
|
||||||
|
return {"test": test_name, "passed": False, "error": "Inherited data not found"}
|
||||||
|
|
||||||
|
inherited = json.loads(inherited_data)
|
||||||
|
|
||||||
|
checks = {
|
||||||
|
"has_from_pipeline": "from_pipeline" in inherited,
|
||||||
|
"has_proposals": len(inherited.get("proposals", [])) > 0,
|
||||||
|
"has_recovery_hints": len(inherited.get("recovery_hints", [])) > 0,
|
||||||
|
"has_inherited_at": "inherited_at" in inherited
|
||||||
|
}
|
||||||
|
|
||||||
|
passed = all(checks.values())
|
||||||
|
|
||||||
|
return {
|
||||||
|
"test": test_name,
|
||||||
|
"passed": passed,
|
||||||
|
"checks": checks,
|
||||||
|
"proposals_inherited": len(inherited.get("proposals", []))
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
return {"test": test_name, "passed": False, "error": str(e)}
|
||||||
|
|
||||||
|
def verify_retry_tracking(self, original_id: str) -> dict:
|
||||||
|
"""Verify retry_count, abort_reason, and handoff references are tracked."""
|
||||||
|
test_name = "Retry Tracking Verification"
|
||||||
|
|
||||||
|
try:
|
||||||
|
recovery_data = self.redis.hgetall(f"recovery:{original_id}")
|
||||||
|
|
||||||
|
if not recovery_data:
|
||||||
|
return {"test": test_name, "passed": False, "error": "No recovery tracking data"}
|
||||||
|
|
||||||
|
checks = {
|
||||||
|
"has_retry_count": "retry_count" in recovery_data,
|
||||||
|
"has_abort_reason": "abort_reason" in recovery_data,
|
||||||
|
"has_handoff_ref": "handoff_ref" in recovery_data,
|
||||||
|
"has_latest_recovery": "latest_recovery" in recovery_data,
|
||||||
|
"abort_reason_correct": recovery_data.get("abort_reason") == "iteration_limit"
|
||||||
|
}
|
||||||
|
|
||||||
|
passed = all(checks.values())
|
||||||
|
|
||||||
|
return {
|
||||||
|
"test": test_name,
|
||||||
|
"passed": passed,
|
||||||
|
"checks": checks,
|
||||||
|
"retry_count": recovery_data.get("retry_count"),
|
||||||
|
"abort_reason": recovery_data.get("abort_reason")
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
return {"test": test_name, "passed": False, "error": str(e)}
|
||||||
|
|
||||||
|
def verify_original_status(self, original_id: str) -> dict:
    """Verify the original pipeline status was updated correctly."""
    test_name = "Original Pipeline Status"
    try:
        pipeline = self.redis.hgetall(f"pipeline:{original_id}")

        # The aborted pipeline must have been flipped to REBOOTING and
        # must point at its recovery run.
        checks = {
            "status_rebooting": pipeline.get("status") == "REBOOTING",
            "has_recovery_pipeline": "recovery_pipeline" in pipeline,
            "has_recovery_triggered_at": "recovery_triggered_at" in pipeline,
        }

        result = {
            "test": test_name,
            "passed": all(checks.values()),
            "checks": checks,
            "status": pipeline.get("status"),
            "recovery_pipeline": pipeline.get("recovery_pipeline"),
        }
        return result
    except Exception as e:
        return {"test": test_name, "passed": False, "error": str(e)}
|
||||||
|
|
||||||
|
def cleanup(self, pipeline_ids: list):
    """Best-effort removal of every Redis key created for the given test pipelines."""
    for pid in pipeline_ids:
        try:
            stale_keys = self.redis.keys(f"*{pid}*")
            if stale_keys:
                self.redis.delete(*stale_keys)
            self.log(f"Cleaned up: {pid}")
        except Exception:
            # Cleanup is best-effort; leftover test keys are harmless.
            pass
|
||||||
|
|
||||||
|
def run_all_tests(self) -> dict:
    """Run the complete end-to-end test.

    Drives the full auto-recovery scenario: set up a test pipeline,
    simulate an orchestrator abort (which dumps the agent handoff),
    trigger auto-recovery, then run the five verification checks
    against the resulting Redis state. Test data is cleaned up on
    every exit path.

    Returns:
        dict with "passed" / "failed" counters and the list of
        per-test result dicts ("tests" is empty if the recovery
        trigger itself failed).
    """
    print("\n" + "=" * 70)
    print("END-TO-END AUTO-RECOVERY TEST")
    print("=" * 70 + "\n")

    # Setup
    pipeline_id = self.setup_test_pipeline()

    # Step 1: Simulate orchestrator dumping handoff before abort
    self.log("Step 1: Simulating orchestrator abort with handoff dump...")
    self.simulate_orchestrator_abort(pipeline_id)

    # Step 2: Trigger auto-recovery
    self.log("Step 2: Triggering auto-recovery...")
    recovery_result = self.trigger_auto_recovery(pipeline_id)

    if not recovery_result["success"]:
        print(f"\nFAILED: Auto-recovery trigger failed: {recovery_result.get('error')}")
        # Fix: the early return previously leaked the setup pipeline's
        # keys in Redis; remove them before bailing out.
        self.cleanup([pipeline_id])
        return {"passed": 0, "failed": 1, "tests": []}

    recovery_id = recovery_result["recovery_pipeline_id"]

    # Run verification tests
    self.log("\nStep 3: Running verification tests...")

    tests = [
        self.verify_handoff_dump(pipeline_id),
        self.verify_recovery_pipeline(pipeline_id),
        self.verify_inherited_context(recovery_id),
        self.verify_retry_tracking(pipeline_id),
        self.verify_original_status(pipeline_id),
    ]

    passed = 0
    failed = 0

    for result in tests:
        status = "PASS" if result["passed"] else "FAIL"
        symbol = "+" if result["passed"] else "x"

        print(f" {symbol} {status}: {result['test']}")

        if result["passed"]:
            passed += 1
        else:
            failed += 1
            # Surface the reason: either an exception message or the
            # individual checks that did not hold.
            if "error" in result:
                print(f" Error: {result['error']}")
            elif "checks" in result:
                failed_checks = [k for k, v in result["checks"].items() if not v]
                print(f" Failed checks: {', '.join(failed_checks)}")

    print(f"\n{'=' * 70}")
    print(f"RESULTS: {passed}/{passed + failed} passed")
    print(f"{'=' * 70}")

    # Show recovery chain summary
    print("\nRECOVERY CHAIN SUMMARY:")
    print(f" Original Pipeline: {pipeline_id}")
    print(" Status: REBOOTING")
    print(f" Recovery Pipeline: {recovery_id}")
    print(f" Proposals Inherited: {recovery_result['proposals_inherited']}")

    retry_data = self.redis.hgetall(f"recovery:{pipeline_id}")
    print(f" Retry Count: {retry_data.get('retry_count', 'N/A')}")
    print(f" Abort Reason: {retry_data.get('abort_reason', 'N/A')}")
    print(f" Handoff Ref: {retry_data.get('handoff_ref', 'N/A')}")

    # Cleanup
    self.log("\nCleaning up test data...")
    self.cleanup([pipeline_id, recovery_id])

    print(f"\n{'=' * 70}\n")

    return {
        "passed": passed,
        "failed": failed,
        "tests": tests
    }
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the E2E auto-recovery suite; return 0 only on full success."""
    outcome = E2EAutoRecoveryTest().run_all_tests()
    if outcome["failed"] == 0:
        return 0
    return 1
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: exit status mirrors the test outcome
# (0 = all tests passed, non-zero = at least one failure).
if __name__ == "__main__":
    sys.exit(main())
|
||||||
223
ui/server.ts
223
ui/server.ts
@ -1516,34 +1516,27 @@ const FALLBACK_OPTIONS: FallbackOption[] = [
|
|||||||
}
|
}
|
||||||
];
|
];
|
||||||
|
|
||||||
async function recordConsensusFailure(
|
// Helper: Collect context from blackboard (fallback when handoff not available)
|
||||||
pipelineId: string,
|
async function collectFromBlackboard(context: ConsensusFailureContext, taskId: string): Promise<void> {
|
||||||
taskId: string,
|
|
||||||
metrics: any
|
|
||||||
): Promise<ConsensusFailureContext> {
|
|
||||||
const pipelineKey = `pipeline:${pipelineId}`;
|
|
||||||
const pipelineData = await redis.hGetAll(pipelineKey);
|
|
||||||
|
|
||||||
// Get run number (increment if retrying)
|
|
||||||
const prevRunNumber = parseInt(pipelineData.run_number || "0");
|
|
||||||
const runNumber = prevRunNumber + 1;
|
|
||||||
|
|
||||||
// Collect all context for the failed run
|
|
||||||
const context: ConsensusFailureContext = {
|
|
||||||
pipeline_id: pipelineId,
|
|
||||||
task_id: taskId,
|
|
||||||
objective: pipelineData.objective || "",
|
|
||||||
failure_time: new Date().toISOString(),
|
|
||||||
metrics: metrics,
|
|
||||||
proposals: [],
|
|
||||||
agent_states: [],
|
|
||||||
conflict_history: [],
|
|
||||||
blackboard_snapshot: {},
|
|
||||||
run_number: runNumber
|
|
||||||
};
|
|
||||||
|
|
||||||
// Collect proposals from blackboard (if available in Redis)
|
|
||||||
try {
|
try {
|
||||||
|
// Collect proposals from blackboard solutions section
|
||||||
|
const solutionsData = await redis.hGetAll(`blackboard:${taskId}:solutions`);
|
||||||
|
for (const [key, value] of Object.entries(solutionsData)) {
|
||||||
|
try {
|
||||||
|
const entry = JSON.parse(value as string);
|
||||||
|
context.proposals.push({
|
||||||
|
agent: entry.author,
|
||||||
|
key: entry.key,
|
||||||
|
value: entry.value,
|
||||||
|
version: entry.version,
|
||||||
|
timestamp: entry.timestamp
|
||||||
|
});
|
||||||
|
} catch {
|
||||||
|
context.proposals.push({ key, raw: value });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Also check old key format
|
||||||
const proposalKeys = await redis.keys(`blackboard:${taskId}:solutions:*`);
|
const proposalKeys = await redis.keys(`blackboard:${taskId}:solutions:*`);
|
||||||
for (const key of proposalKeys) {
|
for (const key of proposalKeys) {
|
||||||
const proposal = await redis.get(key);
|
const proposal = await redis.get(key);
|
||||||
@ -1585,7 +1578,73 @@ async function recordConsensusFailure(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (e: any) {
|
} catch (e: any) {
|
||||||
console.error(`[CONSENSUS] Error collecting context: ${e.message}`);
|
console.error(`[CONSENSUS] Error collecting from blackboard: ${e.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function recordConsensusFailure(
|
||||||
|
pipelineId: string,
|
||||||
|
taskId: string,
|
||||||
|
metrics: any
|
||||||
|
): Promise<ConsensusFailureContext> {
|
||||||
|
const pipelineKey = `pipeline:${pipelineId}`;
|
||||||
|
const pipelineData = await redis.hGetAll(pipelineKey);
|
||||||
|
|
||||||
|
// Get run number (increment if retrying)
|
||||||
|
const prevRunNumber = parseInt(pipelineData.run_number || "0");
|
||||||
|
const runNumber = prevRunNumber + 1;
|
||||||
|
|
||||||
|
// Collect all context for the failed run
|
||||||
|
const context: ConsensusFailureContext = {
|
||||||
|
pipeline_id: pipelineId,
|
||||||
|
task_id: taskId,
|
||||||
|
objective: pipelineData.objective || "",
|
||||||
|
failure_time: new Date().toISOString(),
|
||||||
|
metrics: metrics,
|
||||||
|
proposals: [],
|
||||||
|
agent_states: [],
|
||||||
|
conflict_history: [],
|
||||||
|
blackboard_snapshot: {},
|
||||||
|
run_number: runNumber
|
||||||
|
};
|
||||||
|
|
||||||
|
// FIRST: Try to read the structured handoff JSON from the orchestrator
|
||||||
|
try {
|
||||||
|
const handoffKey = `handoff:${pipelineId}:agents`;
|
||||||
|
const handoffData = await redis.get(handoffKey);
|
||||||
|
|
||||||
|
if (handoffData) {
|
||||||
|
const handoff = JSON.parse(handoffData);
|
||||||
|
console.log(`[CONSENSUS] Found orchestrator handoff: ${handoff.proposals?.length || 0} proposals`);
|
||||||
|
|
||||||
|
// Use handoff data directly - it's more complete
|
||||||
|
context.proposals = handoff.proposals || [];
|
||||||
|
context.agent_states = handoff.agent_states || [];
|
||||||
|
context.conflict_history = handoff.message_summary?.alpha_last_messages?.concat(
|
||||||
|
handoff.message_summary?.beta_last_messages || [],
|
||||||
|
handoff.message_summary?.gamma_last_messages || []
|
||||||
|
).filter((m: any) => m.type === "CONFLICT" || m.type === "PROPOSAL" || m.type === "VOTE") || [];
|
||||||
|
|
||||||
|
// Store synthesis attempts and problem analysis in blackboard snapshot
|
||||||
|
context.blackboard_snapshot = {
|
||||||
|
synthesis: handoff.synthesis_attempts || [],
|
||||||
|
consensus: handoff.consensus_state || [],
|
||||||
|
problem: handoff.problem_analysis || [],
|
||||||
|
recovery_hints: handoff.recovery_hints || []
|
||||||
|
};
|
||||||
|
|
||||||
|
// Store handoff reference
|
||||||
|
(context as any).handoff_ref = handoffKey;
|
||||||
|
(context as any).iteration_count = handoff.iteration_count;
|
||||||
|
(context as any).gamma_active = handoff.gamma_active;
|
||||||
|
} else {
|
||||||
|
console.log(`[CONSENSUS] No handoff found, falling back to blackboard scan`);
|
||||||
|
// Fallback to blackboard scan (old method)
|
||||||
|
await collectFromBlackboard(context, taskId);
|
||||||
|
}
|
||||||
|
} catch (e: any) {
|
||||||
|
console.error(`[CONSENSUS] Error reading handoff, falling back: ${e.message}`);
|
||||||
|
await collectFromBlackboard(context, taskId);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Store the failure context in Dragonfly
|
// Store the failure context in Dragonfly
|
||||||
@ -1802,20 +1861,47 @@ async function triggerAutoRecovery(
|
|||||||
// Create a new recovery pipeline
|
// Create a new recovery pipeline
|
||||||
const newPipelineId = `pipeline-recovery-${Date.now().toString(36)}`;
|
const newPipelineId = `pipeline-recovery-${Date.now().toString(36)}`;
|
||||||
|
|
||||||
// Prepare context summary for new agents
|
// Get handoff reference from original pipeline (if available)
|
||||||
|
const handoffRef = (failureContext as any).handoff_ref || `handoff:${originalPipelineId}:agents`;
|
||||||
|
const iterationCount = (failureContext as any).iteration_count || 0;
|
||||||
|
const gammaWasActive = (failureContext as any).gamma_active || false;
|
||||||
|
|
||||||
|
// Prepare comprehensive context summary for new agents
|
||||||
const contextSummary = {
|
const contextSummary = {
|
||||||
prior_run: runNumber,
|
prior_run: runNumber,
|
||||||
prior_pipeline: originalPipelineId,
|
prior_pipeline: originalPipelineId,
|
||||||
|
handoff_ref: handoffRef,
|
||||||
failure_reason: failureContext.metrics?.abort_reason || "consensus_failed",
|
failure_reason: failureContext.metrics?.abort_reason || "consensus_failed",
|
||||||
prior_proposals: failureContext.proposals?.slice(0, 3) || [], // Top 3 proposals
|
iteration_count: iterationCount,
|
||||||
prior_conflicts: failureContext.conflict_history?.slice(-5) || [], // Last 5 conflicts
|
gamma_was_active: gammaWasActive,
|
||||||
|
|
||||||
|
// Include actual proposals (not just references)
|
||||||
|
prior_proposals: failureContext.proposals?.slice(0, 5) || [], // Top 5 proposals
|
||||||
|
prior_synthesis: failureContext.blackboard_snapshot?.synthesis || [],
|
||||||
|
prior_conflicts: failureContext.conflict_history?.slice(-10) || [], // Last 10 conflicts
|
||||||
|
|
||||||
recovery_hints: [
|
recovery_hints: [
|
||||||
"Previous agents failed to reach consensus",
|
`Previous run aborted after ${iterationCount} iterations`,
|
||||||
|
gammaWasActive ? "GAMMA was active but could not resolve conflicts" : "GAMMA was not spawned - will be forced this time",
|
||||||
|
`${failureContext.proposals?.length || 0} proposals were generated`,
|
||||||
"Review prior proposals for common ground",
|
"Review prior proposals for common ground",
|
||||||
"Consider a different approach if prior attempts converged on same solution"
|
"Consider synthesizing the best elements of prior proposals"
|
||||||
]
|
]
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Store full handoff in Redis for the new pipeline
|
||||||
|
const newHandoffKey = `handoff:${newPipelineId}:inherited`;
|
||||||
|
await redis.set(newHandoffKey, JSON.stringify({
|
||||||
|
from_pipeline: originalPipelineId,
|
||||||
|
from_handoff: handoffRef,
|
||||||
|
inherited_at: new Date().toISOString(),
|
||||||
|
proposals: failureContext.proposals,
|
||||||
|
synthesis_attempts: failureContext.blackboard_snapshot?.synthesis,
|
||||||
|
consensus_state: failureContext.blackboard_snapshot?.consensus,
|
||||||
|
agent_states: failureContext.agent_states,
|
||||||
|
recovery_hints: contextSummary.recovery_hints
|
||||||
|
}), { EX: 86400 }); // 24hr TTL
|
||||||
|
|
||||||
await redis.hSet(`pipeline:${newPipelineId}`, {
|
await redis.hSet(`pipeline:${newPipelineId}`, {
|
||||||
task_id: taskId,
|
task_id: taskId,
|
||||||
objective: objective,
|
objective: objective,
|
||||||
@ -1825,34 +1911,57 @@ async function triggerAutoRecovery(
|
|||||||
parent_pipeline: originalPipelineId,
|
parent_pipeline: originalPipelineId,
|
||||||
is_recovery: "true",
|
is_recovery: "true",
|
||||||
recovery_attempt: String(runNumber + 1),
|
recovery_attempt: String(runNumber + 1),
|
||||||
|
run_number: String(runNumber + 1),
|
||||||
prior_context: JSON.stringify(contextSummary),
|
prior_context: JSON.stringify(contextSummary),
|
||||||
|
inherited_handoff: newHandoffKey,
|
||||||
force_gamma: "true", // Always spawn GAMMA on recovery attempts
|
force_gamma: "true", // Always spawn GAMMA on recovery attempts
|
||||||
model: model,
|
model: model,
|
||||||
timeout: String(timeout),
|
timeout: String(timeout),
|
||||||
auto_continue: "true"
|
auto_continue: "true"
|
||||||
});
|
});
|
||||||
|
|
||||||
// Update original pipeline
|
// Update original pipeline with detailed recovery tracking
|
||||||
await redis.hSet(`pipeline:${originalPipelineId}`, {
|
await redis.hSet(`pipeline:${originalPipelineId}`, {
|
||||||
status: "REBOOTING",
|
status: "REBOOTING",
|
||||||
recovery_pipeline: newPipelineId
|
recovery_pipeline: newPipelineId,
|
||||||
|
recovery_triggered_at: new Date().toISOString()
|
||||||
});
|
});
|
||||||
|
|
||||||
// Log the handoff reason to Dragonfly metrics
|
// Track retry metrics in Dragonfly
|
||||||
|
await redis.hSet(`recovery:${originalPipelineId}`, {
|
||||||
|
retry_count: String(runNumber + 1),
|
||||||
|
abort_reason: failureContext.metrics?.abort_reason || "consensus_failed",
|
||||||
|
latest_recovery: newPipelineId,
|
||||||
|
handoff_ref: handoffRef,
|
||||||
|
proposals_passed: String(failureContext.proposals?.length || 0),
|
||||||
|
last_attempt: new Date().toISOString()
|
||||||
|
});
|
||||||
|
|
||||||
|
// Log detailed handoff reason to Dragonfly metrics
|
||||||
await redis.hSet(`handoff:${originalPipelineId}`, {
|
await redis.hSet(`handoff:${originalPipelineId}`, {
|
||||||
to_pipeline: newPipelineId,
|
to_pipeline: newPipelineId,
|
||||||
reason: failureContext.metrics?.abort_reason || "consensus_failed",
|
reason: failureContext.metrics?.abort_reason || "consensus_failed",
|
||||||
handoff_time: new Date().toISOString(),
|
handoff_time: new Date().toISOString(),
|
||||||
context_size: JSON.stringify(contextSummary).length,
|
context_size: String(JSON.stringify(contextSummary).length),
|
||||||
proposals_passed: failureContext.proposals?.length || 0
|
proposals_passed: String(failureContext.proposals?.length || 0),
|
||||||
|
iteration_count: String(iterationCount),
|
||||||
|
gamma_was_active: gammaWasActive ? "true" : "false"
|
||||||
});
|
});
|
||||||
|
|
||||||
await appendPipelineLog(newPipelineId, "SYSTEM",
|
await appendPipelineLog(newPipelineId, "SYSTEM",
|
||||||
`Recovery pipeline started (attempt ${runNumber + 1}/${MAX_AUTO_RECOVERY}). GAMMA mediator will be spawned.`, "INFO");
|
`Recovery pipeline started (attempt ${runNumber + 1}/${MAX_AUTO_RECOVERY}). GAMMA mediator will be force-spawned.`, "INFO");
|
||||||
|
|
||||||
await appendPipelineLog(newPipelineId, "CONTEXT",
|
await appendPipelineLog(newPipelineId, "CONTEXT",
|
||||||
`Prior run had ${failureContext.proposals?.length || 0} proposals, ` +
|
`Inherited from ${originalPipelineId}: ${failureContext.proposals?.length || 0} proposals, ` +
|
||||||
`${failureContext.conflict_history?.length || 0} conflicts. Force-spawning GAMMA.`, "INFO");
|
`${failureContext.conflict_history?.length || 0} conflicts, ${iterationCount} iterations.`, "INFO");
|
||||||
|
|
||||||
|
if (failureContext.proposals && failureContext.proposals.length > 0) {
|
||||||
|
// Log summary of inherited proposals
|
||||||
|
const proposalSummary = failureContext.proposals.slice(0, 3).map((p: any, i: number) =>
|
||||||
|
`${i + 1}. [${p.agent}] ${typeof p.value === 'string' ? p.value.slice(0, 100) : JSON.stringify(p.value).slice(0, 100)}...`
|
||||||
|
).join("\n");
|
||||||
|
await appendPipelineLog(newPipelineId, "CONTEXT", `Top inherited proposals:\n${proposalSummary}`, "INFO");
|
||||||
|
}
|
||||||
|
|
||||||
// Trigger orchestration with GAMMA hint in objective
|
// Trigger orchestration with GAMMA hint in objective
|
||||||
triggerOrchestration(
|
triggerOrchestration(
|
||||||
@ -2067,6 +2176,12 @@ async function getActivePipelines(): Promise<any[]> {
|
|||||||
failure_reason: data.failure_reason || null,
|
failure_reason: data.failure_reason || null,
|
||||||
run_number: data.run_number ? parseInt(data.run_number) : 1,
|
run_number: data.run_number ? parseInt(data.run_number) : 1,
|
||||||
prior_pipeline: data.prior_pipeline || null,
|
prior_pipeline: data.prior_pipeline || null,
|
||||||
|
// Recovery tracking
|
||||||
|
is_recovery: data.is_recovery || null,
|
||||||
|
parent_pipeline: data.parent_pipeline || null,
|
||||||
|
recovery_attempt: data.recovery_attempt ? parseInt(data.recovery_attempt) : null,
|
||||||
|
force_gamma: data.force_gamma || null,
|
||||||
|
inherited_handoff: data.inherited_handoff || null,
|
||||||
});
|
});
|
||||||
} catch {}
|
} catch {}
|
||||||
}
|
}
|
||||||
@ -4069,6 +4184,8 @@ function renderDashboard(): string {
|
|||||||
.status-badge.rebooting { background: rgba(57, 197, 207, 0.2); color: var(--accent-cyan); animation: pulse 1.5s infinite; }
|
.status-badge.rebooting { background: rgba(57, 197, 207, 0.2); color: var(--accent-cyan); animation: pulse 1.5s infinite; }
|
||||||
.status-badge.aborted { background: rgba(248, 81, 73, 0.3); color: var(--accent-red); border: 1px solid var(--accent-red); }
|
.status-badge.aborted { background: rgba(248, 81, 73, 0.3); color: var(--accent-red); border: 1px solid var(--accent-red); }
|
||||||
.status-badge.recovery_failed { background: rgba(248, 81, 73, 0.4); color: #ff6b6b; border: 1px solid #ff6b6b; }
|
.status-badge.recovery_failed { background: rgba(248, 81, 73, 0.4); color: #ff6b6b; border: 1px solid #ff6b6b; }
|
||||||
|
.status-badge.recovery { background: rgba(139, 92, 246, 0.2); color: var(--accent-purple); border: 1px solid var(--accent-purple); }
|
||||||
|
.pipeline-card.recovery-run { border-left: 3px solid var(--accent-purple); }
|
||||||
|
|
||||||
@keyframes pulse {
|
@keyframes pulse {
|
||||||
0%, 100% { opacity: 1; }
|
0%, 100% { opacity: 1; }
|
||||||
@ -5920,6 +6037,8 @@ function renderDashboard(): string {
|
|||||||
const isConsensusFailed = p.status === 'CONSENSUS_FAILED' || p.status === 'RECOVERY_FAILED';
|
const isConsensusFailed = p.status === 'CONSENSUS_FAILED' || p.status === 'RECOVERY_FAILED';
|
||||||
const isRebooting = p.status === 'REBOOTING';
|
const isRebooting = p.status === 'REBOOTING';
|
||||||
const isAborted = p.status === 'ABORTED';
|
const isAborted = p.status === 'ABORTED';
|
||||||
|
const isRecoveryRun = p.is_recovery === 'true' || p.recovery_attempt;
|
||||||
|
const runNumber = p.run_number || p.recovery_attempt || 1;
|
||||||
|
|
||||||
const agentPills = agents.map(a => {
|
const agentPills = agents.map(a => {
|
||||||
const type = (a.type || 'UNKNOWN').toLowerCase();
|
const type = (a.type || 'UNKNOWN').toLowerCase();
|
||||||
@ -6011,13 +6130,31 @@ function renderDashboard(): string {
|
|||||||
</div>
|
</div>
|
||||||
\` : '';
|
\` : '';
|
||||||
|
|
||||||
|
// Recovery run indicator
|
||||||
|
const recoveryBadge = isRecoveryRun ? \`
|
||||||
|
<span class="status-badge recovery" title="Recovery run \${runNumber} - inheriting context from prior run">
|
||||||
|
RETRY \${runNumber}
|
||||||
|
</span>
|
||||||
|
\` : '';
|
||||||
|
|
||||||
|
// Prior pipeline link for recovery runs
|
||||||
|
const priorPipelineLink = (isRecoveryRun && p.parent_pipeline) ? \`
|
||||||
|
<div style="font-size: 10px; color: var(--text-muted); margin-top: 4px;">
|
||||||
|
Prior: <span style="color: var(--accent-cyan); cursor: pointer;" onclick="event.stopPropagation(); selectPipeline('\${p.parent_pipeline}')">\${p.parent_pipeline}</span>
|
||||||
|
</div>
|
||||||
|
\` : '';
|
||||||
|
|
||||||
return \`
|
return \`
|
||||||
<div class="pipeline-card \${isActive ? 'active' : ''} \${isConsensusFailed ? 'consensus-failed' : ''} \${isRebooting ? 'rebooting' : ''}" onclick="selectPipeline('\${p.pipeline_id}')">
|
<div class="pipeline-card \${isActive ? 'active' : ''} \${isConsensusFailed ? 'consensus-failed' : ''} \${isRebooting ? 'rebooting' : ''} \${isRecoveryRun ? 'recovery-run' : ''}" onclick="selectPipeline('\${p.pipeline_id}')">
|
||||||
<div class="pipeline-header">
|
<div class="pipeline-header">
|
||||||
<span class="pipeline-id">\${p.pipeline_id}</span>
|
<span class="pipeline-id">\${p.pipeline_id}</span>
|
||||||
<span class="status-badge \${(p.status || 'unknown').toLowerCase().replace(/_/g, '_')}">\${p.status || 'UNKNOWN'}</span>
|
<div style="display: flex; gap: 6px; align-items: center;">
|
||||||
|
\${recoveryBadge}
|
||||||
|
<span class="status-badge \${(p.status || 'unknown').toLowerCase().replace(/_/g, '_')}">\${p.status || 'UNKNOWN'}</span>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="pipeline-objective">\${p.objective || 'No objective'}</div>
|
<div class="pipeline-objective">\${p.objective || 'No objective'}</div>
|
||||||
|
\${priorPipelineLink}
|
||||||
<div class="agent-pills">\${agentPills || '<span style="color: var(--text-muted); font-size: 10px;">No agents</span>'}</div>
|
<div class="agent-pills">\${agentPills || '<span style="color: var(--text-muted); font-size: 10px;">No agents</span>'}</div>
|
||||||
\${rebootingAlert}
|
\${rebootingAlert}
|
||||||
\${consensusAlert}
|
\${consensusAlert}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user