commit 77655c298c Initial commit: Agent Governance System Phase 8
Phase 8 Production Hardening with complete governance infrastructure:

- Vault integration with tiered policies (T0-T4)
- DragonflyDB state management
- SQLite audit ledger
- Pipeline DSL and templates
- Promotion/revocation engine
- Checkpoint system for session persistence
- Health manager and circuit breaker for fault tolerance
- GitHub/Slack integrations
- Architectural test pipeline with bug watcher, suggestion engine, council review
- Multi-agent chaos testing framework

Test Results:
- Governance tests: 68/68 passing
- E2E workflow: 16/16 passing
- Phase 2 Vault: 14/14 passing
- Integration tests: 27/27 passing

Coverage: 57.6% average across 12 phases

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 22:07:06 -05:00

714 lines
25 KiB
Python

"""
Bug Window Watcher
==================
Real-time monitoring of every pipeline stage with anomaly detection.
Features:
- Monitors all stages in real-time
- Surfaces anomalies, regressions, unhandled errors
- Links findings to phase, directory, STATUS, and checkpoint entries
"""
import hashlib
import json
import re
import sqlite3
import subprocess
import time
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta, timezone
from enum import Enum
from pathlib import Path
from typing import Any, Optional

import redis
class AnomalyType(str, Enum):
"""Types of anomalies the watcher can detect"""
UNHANDLED_ERROR = "unhandled_error"
REGRESSION = "regression"
PERFORMANCE_DEGRADATION = "performance_degradation"
MISSING_ARTIFACT = "missing_artifact"
STATE_INCONSISTENCY = "state_inconsistency"
HEALTH_CHECK_FAILURE = "health_check_failure"
DEPENDENCY_UNAVAILABLE = "dependency_unavailable"
TIMEOUT = "timeout"
UNEXPECTED_OUTPUT = "unexpected_output"
SECURITY_VIOLATION = "security_violation"
class Severity(str, Enum):
"""Severity levels for anomalies"""
CRITICAL = "critical" # System compromised, immediate action
HIGH = "high" # Major functionality impacted
MEDIUM = "medium" # Degraded but functional
LOW = "low" # Minor issue, informational
INFO = "info" # Tracking only
@dataclass
class Anomaly:
"""Represents a detected anomaly"""
id: str
type: AnomalyType
severity: Severity
phase: int
phase_name: str
directory: str
message: str
details: dict = field(default_factory=dict)
stack_trace: Optional[str] = None
checkpoint_id: Optional[str] = None
status_file: Optional[str] = None
detected_at: str = ""
resolved: bool = False
resolution_notes: Optional[str] = None
def __post_init__(self):
if not self.detected_at:
self.detected_at = datetime.now(timezone.utc).isoformat()
if not self.id:
self.id = f"anom-{hashlib.sha256(f'{self.type}{self.phase}{self.message}{self.detected_at}'.encode()).hexdigest()[:12]}"
@dataclass
class WatcherState:
"""Current state of the bug watcher"""
active: bool = False
started_at: Optional[str] = None
anomalies_detected: int = 0
phases_watched: list = field(default_factory=list)
last_scan_at: Optional[str] = None
error_count: int = 0
class BugWindowWatcher:
"""
Real-time anomaly detection across all pipeline stages.
Monitors:
- Phase transitions and state changes
- Error logs and stack traces
- Performance metrics and timeouts
- Dependency availability
- Artifact integrity
- Security boundaries
"""
# Phase definitions
PHASES = {
1: "Foundation (Vault + Basic Infrastructure)",
2: "Vault Policy Engine",
3: "Execution Pipeline",
4: "Promotion and Revocation Engine",
5: "Agent Bootstrapping",
6: "Pipeline DSL, Agent Templates, Testing Framework",
7: "Hierarchical Teams & Learning System",
8: "Production Hardening",
9: "External Integrations",
10: "Multi-Tenant Support",
11: "Agent Marketplace",
12: "Observability",
}
# Phase -> key directories mapping
PHASE_DIRECTORIES = {
1: ["ledger", "bin"],
2: ["runtime"],
3: ["preflight", "wrappers", "evidence"],
4: ["runtime"],
5: ["agents", "checkpoint", "orchestrator"],
6: ["pipeline", "tests"],
7: ["teams", "analytics", "memory"],
8: ["runtime", "testing/oversight"],
9: ["integrations"],
10: ["teams"],
11: ["agents"],
12: ["analytics", "ui"],
}
def __init__(self, base_path: str = "/opt/agent-governance"):
self.base_path = Path(base_path)
self.ledger_db = self.base_path / "ledger" / "governance.db"
self.checkpoint_dir = self.base_path / "checkpoint" / "storage"
self.state = WatcherState()
self.anomalies: list[Anomaly] = []
self._redis: Optional[redis.Redis] = None
self._setup_redis()
def _setup_redis(self):
"""Connect to DragonflyDB for real-time state"""
try:
self._redis = redis.Redis(
host='127.0.0.1',
port=6379,
password='governance2026',
decode_responses=True
)
self._redis.ping()
except Exception:
self._redis = None
def _now(self) -> str:
return datetime.now(timezone.utc).isoformat()
def start(self) -> WatcherState:
"""Start the bug watcher"""
self.state.active = True
self.state.started_at = self._now()
self.state.phases_watched = list(self.PHASES.keys())
if self._redis:
self._redis.hset("oversight:watcher", mapping={
"active": "true",
"started_at": self.state.started_at,
"phases": json.dumps(self.state.phases_watched)
})
return self.state
def stop(self) -> WatcherState:
"""Stop the bug watcher"""
self.state.active = False
if self._redis:
self._redis.hset("oversight:watcher", "active", "false")
return self.state
def scan_all_phases(self) -> list[Anomaly]:
"""Scan all phases for anomalies"""
all_anomalies = []
for phase_num in self.PHASES:
anomalies = self.scan_phase(phase_num)
all_anomalies.extend(anomalies)
self.state.last_scan_at = self._now()
self.state.anomalies_detected = len(all_anomalies)
return all_anomalies
def scan_phase(self, phase_num: int) -> list[Anomaly]:
"""Scan a specific phase for anomalies"""
anomalies = []
phase_name = self.PHASES.get(phase_num, f"Phase {phase_num}")
directories = self.PHASE_DIRECTORIES.get(phase_num, [])
# 1. Check STATUS.md files for issues
for dir_name in directories:
dir_path = self.base_path / dir_name
if dir_path.exists():
status_anomalies = self._check_status_file(dir_path, phase_num, phase_name)
anomalies.extend(status_anomalies)
# 2. Check for recent errors in ledger
ledger_anomalies = self._check_ledger_errors(phase_num, phase_name)
anomalies.extend(ledger_anomalies)
# 3. Check dependency health
dep_anomalies = self._check_dependencies(phase_num, phase_name)
anomalies.extend(dep_anomalies)
# 4. Check checkpoint consistency
ckpt_anomalies = self._check_checkpoint_consistency(phase_num, phase_name)
anomalies.extend(ckpt_anomalies)
# 5. Phase-specific checks
specific_anomalies = self._run_phase_specific_checks(phase_num, phase_name)
anomalies.extend(specific_anomalies)
# Store anomalies
self.anomalies.extend(anomalies)
self._persist_anomalies(anomalies)
return anomalies
def _check_status_file(self, dir_path: Path, phase_num: int, phase_name: str) -> list[Anomaly]:
"""Check STATUS.md file for issues"""
anomalies = []
status_file = dir_path / "STATUS.md"
if not status_file.exists():
anomalies.append(Anomaly(
id="",
type=AnomalyType.MISSING_ARTIFACT,
severity=Severity.LOW,
phase=phase_num,
phase_name=phase_name,
directory=str(dir_path.relative_to(self.base_path)),
message=f"Missing STATUS.md in {dir_path.name}",
status_file=None
))
return anomalies
try:
content = status_file.read_text()
# Check for blocked status
if "BLOCKED" in content.upper() or "" in content:
anomalies.append(Anomaly(
id="",
type=AnomalyType.STATE_INCONSISTENCY,
severity=Severity.HIGH,
phase=phase_num,
phase_name=phase_name,
directory=str(dir_path.relative_to(self.base_path)),
message=f"Directory {dir_path.name} is BLOCKED",
status_file=str(status_file),
details={"content_preview": content[:500]}
))
# Check for stale status (not updated in 7 days)
if "Last updated:" in content:
try:
# Extract date from "Last updated: YYYY-MM-DD"
for line in content.split('\n'):
if 'Last updated:' in line or 'last_updated' in line.lower():
# Try to find a date pattern
import re
date_match = re.search(r'(\d{4}-\d{2}-\d{2})', line)
if date_match:
last_update = datetime.fromisoformat(date_match.group(1))
if datetime.now() - last_update > timedelta(days=7):
anomalies.append(Anomaly(
id="",
type=AnomalyType.STATE_INCONSISTENCY,
severity=Severity.LOW,
phase=phase_num,
phase_name=phase_name,
directory=str(dir_path.relative_to(self.base_path)),
message=f"Stale STATUS.md - last updated {date_match.group(1)}",
status_file=str(status_file)
))
break
except Exception:
pass
except Exception as e:
anomalies.append(Anomaly(
id="",
type=AnomalyType.UNHANDLED_ERROR,
severity=Severity.MEDIUM,
phase=phase_num,
phase_name=phase_name,
directory=str(dir_path.relative_to(self.base_path)),
message=f"Error reading STATUS.md: {e}",
status_file=str(status_file)
))
return anomalies
def _check_ledger_errors(self, phase_num: int, phase_name: str) -> list[Anomaly]:
"""Check governance ledger for recent errors"""
anomalies = []
if not self.ledger_db.exists():
return anomalies
try:
conn = sqlite3.connect(self.ledger_db)
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
# Check violations table
cursor.execute("""
SELECT * FROM violations
WHERE severity IN ('critical', 'high')
AND acknowledged = 0
ORDER BY timestamp DESC LIMIT 10
""")
for row in cursor.fetchall():
anomalies.append(Anomaly(
id="",
type=AnomalyType.SECURITY_VIOLATION,
severity=Severity.CRITICAL if row['severity'] == 'critical' else Severity.HIGH,
phase=phase_num,
phase_name=phase_name,
directory="ledger",
message=f"Unacknowledged {row['severity']} violation: {row['violation_type']}",
details={
"violation_id": row['id'],
"agent_id": row['agent_id'],
"description": row['description'],
"timestamp": row['timestamp']
}
))
conn.close()
except Exception as e:
self.state.error_count += 1
return anomalies
def _check_dependencies(self, phase_num: int, phase_name: str) -> list[Anomaly]:
"""Check dependency availability"""
anomalies = []
# Check Vault
try:
result = subprocess.run(
["docker", "exec", "vault", "vault", "status", "-format=json"],
capture_output=True, text=True, timeout=5
)
if result.returncode != 0:
anomalies.append(Anomaly(
id="",
type=AnomalyType.DEPENDENCY_UNAVAILABLE,
severity=Severity.CRITICAL,
phase=phase_num,
phase_name=phase_name,
directory="infrastructure",
message="Vault is unavailable or sealed",
details={"stderr": result.stderr[:500] if result.stderr else ""}
))
except Exception as e:
anomalies.append(Anomaly(
id="",
type=AnomalyType.DEPENDENCY_UNAVAILABLE,
severity=Severity.CRITICAL,
phase=phase_num,
phase_name=phase_name,
directory="infrastructure",
message=f"Cannot check Vault status: {e}"
))
# Check DragonflyDB
if self._redis:
try:
self._redis.ping()
except Exception:
anomalies.append(Anomaly(
id="",
type=AnomalyType.DEPENDENCY_UNAVAILABLE,
severity=Severity.HIGH,
phase=phase_num,
phase_name=phase_name,
directory="infrastructure",
message="DragonflyDB is unavailable"
))
# Check Ledger DB
if not self.ledger_db.exists():
anomalies.append(Anomaly(
id="",
type=AnomalyType.DEPENDENCY_UNAVAILABLE,
severity=Severity.HIGH,
phase=phase_num,
phase_name=phase_name,
directory="ledger",
message="Governance ledger database not found"
))
return anomalies
def _check_checkpoint_consistency(self, phase_num: int, phase_name: str) -> list[Anomaly]:
"""Check checkpoint data for consistency issues"""
anomalies = []
if not self.checkpoint_dir.exists():
return anomalies
checkpoints = sorted(self.checkpoint_dir.glob("ckpt-*.json"), reverse=True)
if not checkpoints:
anomalies.append(Anomaly(
id="",
type=AnomalyType.MISSING_ARTIFACT,
severity=Severity.MEDIUM,
phase=phase_num,
phase_name=phase_name,
directory="checkpoint",
message="No checkpoints found"
))
return anomalies
# Check latest checkpoint
try:
latest = json.loads(checkpoints[0].read_text())
# Verify content hash
if 'content_hash' in latest:
# Could verify hash here
pass
# Check for stale checkpoint (older than 1 hour)
if 'created_at' in latest:
created = datetime.fromisoformat(latest['created_at'].replace('Z', '+00:00'))
if datetime.now(timezone.utc) - created > timedelta(hours=1):
anomalies.append(Anomaly(
id="",
type=AnomalyType.STATE_INCONSISTENCY,
severity=Severity.LOW,
phase=phase_num,
phase_name=phase_name,
directory="checkpoint",
message=f"Last checkpoint is stale: {latest['created_at']}",
checkpoint_id=latest.get('checkpoint_id')
))
except Exception as e:
anomalies.append(Anomaly(
id="",
type=AnomalyType.UNHANDLED_ERROR,
severity=Severity.MEDIUM,
phase=phase_num,
phase_name=phase_name,
directory="checkpoint",
message=f"Error reading checkpoint: {e}"
))
return anomalies
def _run_phase_specific_checks(self, phase_num: int, phase_name: str) -> list[Anomaly]:
"""Run checks specific to each phase"""
anomalies = []
if phase_num == 3: # Execution Pipeline
# Check preflight module
preflight_path = self.base_path / "preflight" / "preflight.py"
if not preflight_path.exists():
anomalies.append(Anomaly(
id="",
type=AnomalyType.MISSING_ARTIFACT,
severity=Severity.HIGH,
phase=phase_num,
phase_name=phase_name,
directory="preflight",
message="Preflight module missing"
))
elif phase_num == 4: # Promotion/Revocation
# Check for agents with high violation counts
if self._redis:
try:
keys = self._redis.keys("agent:*:errors")
for key in keys[:10]: # Limit check
errors = self._redis.hgetall(key)
total = int(errors.get('total_errors', 0))
if total > 5:
agent_id = key.split(':')[1]
anomalies.append(Anomaly(
id="",
type=AnomalyType.REGRESSION,
severity=Severity.MEDIUM,
phase=phase_num,
phase_name=phase_name,
directory="runtime",
message=f"Agent {agent_id} has {total} errors",
details=errors
))
except Exception:
pass
elif phase_num == 5: # Agent Bootstrapping - SPECIAL ATTENTION
# Check tier0-agent
tier0_config = self.base_path / "agents" / "tier0-agent" / "config" / "agent.json"
if not tier0_config.exists():
anomalies.append(Anomaly(
id="",
type=AnomalyType.MISSING_ARTIFACT,
severity=Severity.HIGH,
phase=phase_num,
phase_name=phase_name,
directory="agents/tier0-agent",
message="Tier 0 agent config missing"
))
# Check orchestrator
model_controller = self.base_path / "orchestrator" / "model_controller.py"
if not model_controller.exists():
anomalies.append(Anomaly(
id="",
type=AnomalyType.MISSING_ARTIFACT,
severity=Severity.MEDIUM,
phase=phase_num,
phase_name=phase_name,
directory="orchestrator",
message="Model controller missing"
))
elif phase_num == 8: # Production Hardening
# Check if health_manager exists
health_manager = self.base_path / "runtime" / "health_manager.py"
if not health_manager.exists():
anomalies.append(Anomaly(
id="",
type=AnomalyType.MISSING_ARTIFACT,
severity=Severity.HIGH,
phase=phase_num,
phase_name=phase_name,
directory="runtime",
message="health_manager.py not implemented - Phase 8 blocked"
))
return anomalies
def _persist_anomalies(self, anomalies: list[Anomaly]):
"""Persist anomalies to storage"""
if not self._redis:
return
for anomaly in anomalies:
# Store in Redis list
self._redis.lpush(
"oversight:anomalies",
json.dumps(asdict(anomaly))
)
# Keep only last 1000
self._redis.ltrim("oversight:anomalies", 0, 999)
# Index by severity
self._redis.sadd(f"oversight:anomalies:{anomaly.severity.value}", anomaly.id)
# Index by phase
self._redis.sadd(f"oversight:anomalies:phase:{anomaly.phase}", anomaly.id)
def get_anomalies(
self,
severity: Optional[Severity] = None,
phase: Optional[int] = None,
limit: int = 50
) -> list[Anomaly]:
"""Retrieve anomalies with optional filters"""
if not self._redis:
# Return in-memory anomalies
filtered = self.anomalies
if severity:
filtered = [a for a in filtered if a.severity == severity]
if phase:
filtered = [a for a in filtered if a.phase == phase]
return filtered[:limit]
# Get from Redis
raw = self._redis.lrange("oversight:anomalies", 0, limit - 1)
anomalies = []
for item in raw:
try:
data = json.loads(item)
anomaly = Anomaly(**data)
if severity and anomaly.severity != severity:
continue
if phase and anomaly.phase != phase:
continue
anomalies.append(anomaly)
except Exception:
continue
return anomalies
def acknowledge_anomaly(self, anomaly_id: str, notes: str = "") -> bool:
"""Mark an anomaly as resolved"""
if not self._redis:
for anomaly in self.anomalies:
if anomaly.id == anomaly_id:
anomaly.resolved = True
anomaly.resolution_notes = notes
return True
return False
# Update in Redis
self._redis.hset(f"oversight:anomaly:{anomaly_id}", mapping={
"resolved": "true",
"resolution_notes": notes,
"resolved_at": self._now()
})
return True
def get_summary(self) -> dict:
"""Get summary of watcher state and anomalies"""
anomalies = self.get_anomalies(limit=1000)
by_severity = {s.value: 0 for s in Severity}
by_phase = {p: 0 for p in self.PHASES}
by_type = {t.value: 0 for t in AnomalyType}
for a in anomalies:
# Handle both enum and string values
sev_val = a.severity.value if hasattr(a.severity, 'value') else a.severity
type_val = a.type.value if hasattr(a.type, 'value') else a.type
by_severity[sev_val] = by_severity.get(sev_val, 0) + 1
by_phase[a.phase] = by_phase.get(a.phase, 0) + 1
by_type[type_val] = by_type.get(type_val, 0) + 1
return {
"state": asdict(self.state),
"total_anomalies": len(anomalies),
"unresolved": len([a for a in anomalies if not a.resolved]),
"by_severity": by_severity,
"by_phase": by_phase,
"by_type": by_type,
"phases": self.PHASES
}
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Bug Window Watcher")
parser.add_argument("command", choices=["scan", "status", "list"], help="Command to run")
parser.add_argument("--phase", type=int, help="Specific phase to scan")
parser.add_argument("--severity", choices=["critical", "high", "medium", "low", "info"])
parser.add_argument("--json", action="store_true", help="Output as JSON")
args = parser.parse_args()
watcher = BugWindowWatcher()
watcher.start()
if args.command == "scan":
if args.phase:
anomalies = watcher.scan_phase(args.phase)
else:
anomalies = watcher.scan_all_phases()
if args.json:
print(json.dumps([asdict(a) for a in anomalies], indent=2))
else:
print(f"\n{'='*60}")
print(f"BUG WINDOW WATCHER - Scan Results")
print(f"{'='*60}")
print(f"Anomalies found: {len(anomalies)}")
print()
for a in anomalies:
icon = {"critical": "🔴", "high": "🟠", "medium": "🟡", "low": "🔵", "info": ""}.get(a.severity.value, "")
print(f"{icon} [{a.severity.value.upper()}] Phase {a.phase}: {a.message}")
print(f" Directory: {a.directory}")
if a.status_file:
print(f" Status: {a.status_file}")
print()
elif args.command == "status":
summary = watcher.get_summary()
if args.json:
print(json.dumps(summary, indent=2))
else:
print(f"\n{'='*60}")
print(f"BUG WINDOW WATCHER - Status")
print(f"{'='*60}")
print(f"Active: {summary['state']['active']}")
print(f"Total Anomalies: {summary['total_anomalies']}")
print(f"Unresolved: {summary['unresolved']}")
print()
print("By Severity:")
for sev, count in summary['by_severity'].items():
if count > 0:
print(f" {sev}: {count}")
elif args.command == "list":
severity = Severity(args.severity) if args.severity else None
anomalies = watcher.get_anomalies(severity=severity, phase=args.phase)
if args.json:
print(json.dumps([asdict(a) for a in anomalies], indent=2))
else:
for a in anomalies:
print(f"[{a.id}] {a.severity.value}: {a.message}")