""" Bug Window Watcher ================== Real-time monitoring of every pipeline stage with anomaly detection. Features: - Monitors all stages in real-time - Surfaces anomalies, regressions, unhandled errors - Links findings to phase, directory, STATUS, and checkpoint entries """ import json import time import hashlib import sqlite3 import subprocess from datetime import datetime, timezone, timedelta from dataclasses import dataclass, field, asdict from enum import Enum from pathlib import Path from typing import Any, Optional import redis class AnomalyType(str, Enum): """Types of anomalies the watcher can detect""" UNHANDLED_ERROR = "unhandled_error" REGRESSION = "regression" PERFORMANCE_DEGRADATION = "performance_degradation" MISSING_ARTIFACT = "missing_artifact" STATE_INCONSISTENCY = "state_inconsistency" HEALTH_CHECK_FAILURE = "health_check_failure" DEPENDENCY_UNAVAILABLE = "dependency_unavailable" TIMEOUT = "timeout" UNEXPECTED_OUTPUT = "unexpected_output" SECURITY_VIOLATION = "security_violation" class Severity(str, Enum): """Severity levels for anomalies""" CRITICAL = "critical" # System compromised, immediate action HIGH = "high" # Major functionality impacted MEDIUM = "medium" # Degraded but functional LOW = "low" # Minor issue, informational INFO = "info" # Tracking only @dataclass class Anomaly: """Represents a detected anomaly""" id: str type: AnomalyType severity: Severity phase: int phase_name: str directory: str message: str details: dict = field(default_factory=dict) stack_trace: Optional[str] = None checkpoint_id: Optional[str] = None status_file: Optional[str] = None detected_at: str = "" resolved: bool = False resolution_notes: Optional[str] = None def __post_init__(self): if not self.detected_at: self.detected_at = datetime.now(timezone.utc).isoformat() if not self.id: self.id = f"anom-{hashlib.sha256(f'{self.type}{self.phase}{self.message}{self.detected_at}'.encode()).hexdigest()[:12]}" @dataclass class WatcherState: """Current state of the bug watcher""" active: bool = False started_at: Optional[str] = None anomalies_detected: int = 0 phases_watched: list = field(default_factory=list) last_scan_at: Optional[str] = None error_count: int = 0 class BugWindowWatcher: """ Real-time anomaly detection across all pipeline stages. Monitors: - Phase transitions and state changes - Error logs and stack traces - Performance metrics and timeouts - Dependency availability - Artifact integrity - Security boundaries """ # Phase definitions PHASES = { 1: "Foundation (Vault + Basic Infrastructure)", 2: "Vault Policy Engine", 3: "Execution Pipeline", 4: "Promotion and Revocation Engine", 5: "Agent Bootstrapping", 6: "Pipeline DSL, Agent Templates, Testing Framework", 7: "Hierarchical Teams & Learning System", 8: "Production Hardening", 9: "External Integrations", 10: "Multi-Tenant Support", 11: "Agent Marketplace", 12: "Observability", } # Phase -> key directories mapping PHASE_DIRECTORIES = { 1: ["ledger", "bin"], 2: ["runtime"], 3: ["preflight", "wrappers", "evidence"], 4: ["runtime"], 5: ["agents", "checkpoint", "orchestrator"], 6: ["pipeline", "tests"], 7: ["teams", "analytics", "memory"], 8: ["runtime", "testing/oversight"], 9: ["integrations"], 10: ["teams"], 11: ["agents"], 12: ["analytics", "ui"], } def __init__(self, base_path: str = "/opt/agent-governance"): self.base_path = Path(base_path) self.ledger_db = self.base_path / "ledger" / "governance.db" self.checkpoint_dir = self.base_path / "checkpoint" / "storage" self.state = WatcherState() self.anomalies: list[Anomaly] = [] self._redis: Optional[redis.Redis] = None self._setup_redis() def _setup_redis(self): """Connect to DragonflyDB for real-time state""" try: self._redis = redis.Redis( host='127.0.0.1', port=6379, password='governance2026', decode_responses=True ) self._redis.ping() except Exception: self._redis = None def _now(self) -> str: return datetime.now(timezone.utc).isoformat() def start(self) -> WatcherState: """Start the bug watcher""" self.state.active = True self.state.started_at = self._now() self.state.phases_watched = list(self.PHASES.keys()) if self._redis: self._redis.hset("oversight:watcher", mapping={ "active": "true", "started_at": self.state.started_at, "phases": json.dumps(self.state.phases_watched) }) return self.state def stop(self) -> WatcherState: """Stop the bug watcher""" self.state.active = False if self._redis: self._redis.hset("oversight:watcher", "active", "false") return self.state def scan_all_phases(self) -> list[Anomaly]: """Scan all phases for anomalies""" all_anomalies = [] for phase_num in self.PHASES: anomalies = self.scan_phase(phase_num) all_anomalies.extend(anomalies) self.state.last_scan_at = self._now() self.state.anomalies_detected = len(all_anomalies) return all_anomalies def scan_phase(self, phase_num: int) -> list[Anomaly]: """Scan a specific phase for anomalies""" anomalies = [] phase_name = self.PHASES.get(phase_num, f"Phase {phase_num}") directories = self.PHASE_DIRECTORIES.get(phase_num, []) # 1. Check STATUS.md files for issues for dir_name in directories: dir_path = self.base_path / dir_name if dir_path.exists(): status_anomalies = self._check_status_file(dir_path, phase_num, phase_name) anomalies.extend(status_anomalies) # 2. Check for recent errors in ledger ledger_anomalies = self._check_ledger_errors(phase_num, phase_name) anomalies.extend(ledger_anomalies) # 3. Check dependency health dep_anomalies = self._check_dependencies(phase_num, phase_name) anomalies.extend(dep_anomalies) # 4. Check checkpoint consistency ckpt_anomalies = self._check_checkpoint_consistency(phase_num, phase_name) anomalies.extend(ckpt_anomalies) # 5. Phase-specific checks specific_anomalies = self._run_phase_specific_checks(phase_num, phase_name) anomalies.extend(specific_anomalies) # Store anomalies self.anomalies.extend(anomalies) self._persist_anomalies(anomalies) return anomalies def _check_status_file(self, dir_path: Path, phase_num: int, phase_name: str) -> list[Anomaly]: """Check STATUS.md file for issues""" anomalies = [] status_file = dir_path / "STATUS.md" if not status_file.exists(): anomalies.append(Anomaly( id="", type=AnomalyType.MISSING_ARTIFACT, severity=Severity.LOW, phase=phase_num, phase_name=phase_name, directory=str(dir_path.relative_to(self.base_path)), message=f"Missing STATUS.md in {dir_path.name}", status_file=None )) return anomalies try: content = status_file.read_text() # Check for blocked status if "BLOCKED" in content.upper() or "❗" in content: anomalies.append(Anomaly( id="", type=AnomalyType.STATE_INCONSISTENCY, severity=Severity.HIGH, phase=phase_num, phase_name=phase_name, directory=str(dir_path.relative_to(self.base_path)), message=f"Directory {dir_path.name} is BLOCKED", status_file=str(status_file), details={"content_preview": content[:500]} )) # Check for stale status (not updated in 7 days) if "Last updated:" in content: try: # Extract date from "Last updated: YYYY-MM-DD" for line in content.split('\n'): if 'Last updated:' in line or 'last_updated' in line.lower(): # Try to find a date pattern import re date_match = re.search(r'(\d{4}-\d{2}-\d{2})', line) if date_match: last_update = datetime.fromisoformat(date_match.group(1)) if datetime.now() - last_update > timedelta(days=7): anomalies.append(Anomaly( id="", type=AnomalyType.STATE_INCONSISTENCY, severity=Severity.LOW, phase=phase_num, phase_name=phase_name, directory=str(dir_path.relative_to(self.base_path)), message=f"Stale STATUS.md - last updated {date_match.group(1)}", status_file=str(status_file) )) break except Exception: pass except Exception as e: anomalies.append(Anomaly( id="", type=AnomalyType.UNHANDLED_ERROR, severity=Severity.MEDIUM, phase=phase_num, phase_name=phase_name, directory=str(dir_path.relative_to(self.base_path)), message=f"Error reading STATUS.md: {e}", status_file=str(status_file) )) return anomalies def _check_ledger_errors(self, phase_num: int, phase_name: str) -> list[Anomaly]: """Check governance ledger for recent errors""" anomalies = [] if not self.ledger_db.exists(): return anomalies try: conn = sqlite3.connect(self.ledger_db) conn.row_factory = sqlite3.Row cursor = conn.cursor() # Check violations table cursor.execute(""" SELECT * FROM violations WHERE severity IN ('critical', 'high') AND acknowledged = 0 ORDER BY timestamp DESC LIMIT 10 """) for row in cursor.fetchall(): anomalies.append(Anomaly( id="", type=AnomalyType.SECURITY_VIOLATION, severity=Severity.CRITICAL if row['severity'] == 'critical' else Severity.HIGH, phase=phase_num, phase_name=phase_name, directory="ledger", message=f"Unacknowledged {row['severity']} violation: {row['violation_type']}", details={ "violation_id": row['id'], "agent_id": row['agent_id'], "description": row['description'], "timestamp": row['timestamp'] } )) conn.close() except Exception as e: self.state.error_count += 1 return anomalies def _check_dependencies(self, phase_num: int, phase_name: str) -> list[Anomaly]: """Check dependency availability""" anomalies = [] # Check Vault try: result = subprocess.run( ["docker", "exec", "vault", "vault", "status", "-format=json"], capture_output=True, text=True, timeout=5 ) if result.returncode != 0: anomalies.append(Anomaly( id="", type=AnomalyType.DEPENDENCY_UNAVAILABLE, severity=Severity.CRITICAL, phase=phase_num, phase_name=phase_name, directory="infrastructure", message="Vault is unavailable or sealed", details={"stderr": result.stderr[:500] if result.stderr else ""} )) except Exception as e: anomalies.append(Anomaly( id="", type=AnomalyType.DEPENDENCY_UNAVAILABLE, severity=Severity.CRITICAL, phase=phase_num, phase_name=phase_name, directory="infrastructure", message=f"Cannot check Vault status: {e}" )) # Check DragonflyDB if self._redis: try: self._redis.ping() except Exception: anomalies.append(Anomaly( id="", type=AnomalyType.DEPENDENCY_UNAVAILABLE, severity=Severity.HIGH, phase=phase_num, phase_name=phase_name, directory="infrastructure", message="DragonflyDB is unavailable" )) # Check Ledger DB if not self.ledger_db.exists(): anomalies.append(Anomaly( id="", type=AnomalyType.DEPENDENCY_UNAVAILABLE, severity=Severity.HIGH, phase=phase_num, phase_name=phase_name, directory="ledger", message="Governance ledger database not found" )) return anomalies def _check_checkpoint_consistency(self, phase_num: int, phase_name: str) -> list[Anomaly]: """Check checkpoint data for consistency issues""" anomalies = [] if not self.checkpoint_dir.exists(): return anomalies checkpoints = sorted(self.checkpoint_dir.glob("ckpt-*.json"), reverse=True) if not checkpoints: anomalies.append(Anomaly( id="", type=AnomalyType.MISSING_ARTIFACT, severity=Severity.MEDIUM, phase=phase_num, phase_name=phase_name, directory="checkpoint", message="No checkpoints found" )) return anomalies # Check latest checkpoint try: latest = json.loads(checkpoints[0].read_text()) # Verify content hash if 'content_hash' in latest: # Could verify hash here pass # Check for stale checkpoint (older than 1 hour) if 'created_at' in latest: created = datetime.fromisoformat(latest['created_at'].replace('Z', '+00:00')) if datetime.now(timezone.utc) - created > timedelta(hours=1): anomalies.append(Anomaly( id="", type=AnomalyType.STATE_INCONSISTENCY, severity=Severity.LOW, phase=phase_num, phase_name=phase_name, directory="checkpoint", message=f"Last checkpoint is stale: {latest['created_at']}", checkpoint_id=latest.get('checkpoint_id') )) except Exception as e: anomalies.append(Anomaly( id="", type=AnomalyType.UNHANDLED_ERROR, severity=Severity.MEDIUM, phase=phase_num, phase_name=phase_name, directory="checkpoint", message=f"Error reading checkpoint: {e}" )) return anomalies def _run_phase_specific_checks(self, phase_num: int, phase_name: str) -> list[Anomaly]: """Run checks specific to each phase""" anomalies = [] if phase_num == 3: # Execution Pipeline # Check preflight module preflight_path = self.base_path / "preflight" / "preflight.py" if not preflight_path.exists(): anomalies.append(Anomaly( id="", type=AnomalyType.MISSING_ARTIFACT, severity=Severity.HIGH, phase=phase_num, phase_name=phase_name, directory="preflight", message="Preflight module missing" )) elif phase_num == 4: # Promotion/Revocation # Check for agents with high violation counts if self._redis: try: keys = self._redis.keys("agent:*:errors") for key in keys[:10]: # Limit check errors = self._redis.hgetall(key) total = int(errors.get('total_errors', 0)) if total > 5: agent_id = key.split(':')[1] anomalies.append(Anomaly( id="", type=AnomalyType.REGRESSION, severity=Severity.MEDIUM, phase=phase_num, phase_name=phase_name, directory="runtime", message=f"Agent {agent_id} has {total} errors", details=errors )) except Exception: pass elif phase_num == 5: # Agent Bootstrapping - SPECIAL ATTENTION # Check tier0-agent tier0_config = self.base_path / "agents" / "tier0-agent" / "config" / "agent.json" if not tier0_config.exists(): anomalies.append(Anomaly( id="", type=AnomalyType.MISSING_ARTIFACT, severity=Severity.HIGH, phase=phase_num, phase_name=phase_name, directory="agents/tier0-agent", message="Tier 0 agent config missing" )) # Check orchestrator model_controller = self.base_path / "orchestrator" / "model_controller.py" if not model_controller.exists(): anomalies.append(Anomaly( id="", type=AnomalyType.MISSING_ARTIFACT, severity=Severity.MEDIUM, phase=phase_num, phase_name=phase_name, directory="orchestrator", message="Model controller missing" )) elif phase_num == 8: # Production Hardening # Check if health_manager exists health_manager = self.base_path / "runtime" / "health_manager.py" if not health_manager.exists(): anomalies.append(Anomaly( id="", type=AnomalyType.MISSING_ARTIFACT, severity=Severity.HIGH, phase=phase_num, phase_name=phase_name, directory="runtime", message="health_manager.py not implemented - Phase 8 blocked" )) return anomalies def _persist_anomalies(self, anomalies: list[Anomaly]): """Persist anomalies to storage""" if not self._redis: return for anomaly in anomalies: # Store in Redis list self._redis.lpush( "oversight:anomalies", json.dumps(asdict(anomaly)) ) # Keep only last 1000 self._redis.ltrim("oversight:anomalies", 0, 999) # Index by severity self._redis.sadd(f"oversight:anomalies:{anomaly.severity.value}", anomaly.id) # Index by phase self._redis.sadd(f"oversight:anomalies:phase:{anomaly.phase}", anomaly.id) def get_anomalies( self, severity: Optional[Severity] = None, phase: Optional[int] = None, limit: int = 50 ) -> list[Anomaly]: """Retrieve anomalies with optional filters""" if not self._redis: # Return in-memory anomalies filtered = self.anomalies if severity: filtered = [a for a in filtered if a.severity == severity] if phase: filtered = [a for a in filtered if a.phase == phase] return filtered[:limit] # Get from Redis raw = self._redis.lrange("oversight:anomalies", 0, limit - 1) anomalies = [] for item in raw: try: data = json.loads(item) anomaly = Anomaly(**data) if severity and anomaly.severity != severity: continue if phase and anomaly.phase != phase: continue anomalies.append(anomaly) except Exception: continue return anomalies def acknowledge_anomaly(self, anomaly_id: str, notes: str = "") -> bool: """Mark an anomaly as resolved""" if not self._redis: for anomaly in self.anomalies: if anomaly.id == anomaly_id: anomaly.resolved = True anomaly.resolution_notes = notes return True return False # Update in Redis self._redis.hset(f"oversight:anomaly:{anomaly_id}", mapping={ "resolved": "true", "resolution_notes": notes, "resolved_at": self._now() }) return True def get_summary(self) -> dict: """Get summary of watcher state and anomalies""" anomalies = self.get_anomalies(limit=1000) by_severity = {s.value: 0 for s in Severity} by_phase = {p: 0 for p in self.PHASES} by_type = {t.value: 0 for t in AnomalyType} for a in anomalies: # Handle both enum and string values sev_val = a.severity.value if hasattr(a.severity, 'value') else a.severity type_val = a.type.value if hasattr(a.type, 'value') else a.type by_severity[sev_val] = by_severity.get(sev_val, 0) + 1 by_phase[a.phase] = by_phase.get(a.phase, 0) + 1 by_type[type_val] = by_type.get(type_val, 0) + 1 return { "state": asdict(self.state), "total_anomalies": len(anomalies), "unresolved": len([a for a in anomalies if not a.resolved]), "by_severity": by_severity, "by_phase": by_phase, "by_type": by_type, "phases": self.PHASES } if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Bug Window Watcher") parser.add_argument("command", choices=["scan", "status", "list"], help="Command to run") parser.add_argument("--phase", type=int, help="Specific phase to scan") parser.add_argument("--severity", choices=["critical", "high", "medium", "low", "info"]) parser.add_argument("--json", action="store_true", help="Output as JSON") args = parser.parse_args() watcher = BugWindowWatcher() watcher.start() if args.command == "scan": if args.phase: anomalies = watcher.scan_phase(args.phase) else: anomalies = watcher.scan_all_phases() if args.json: print(json.dumps([asdict(a) for a in anomalies], indent=2)) else: print(f"\n{'='*60}") print(f"BUG WINDOW WATCHER - Scan Results") print(f"{'='*60}") print(f"Anomalies found: {len(anomalies)}") print() for a in anomalies: icon = {"critical": "🔴", "high": "🟠", "medium": "🟡", "low": "🔵", "info": "⚪"}.get(a.severity.value, "⚪") print(f"{icon} [{a.severity.value.upper()}] Phase {a.phase}: {a.message}") print(f" Directory: {a.directory}") if a.status_file: print(f" Status: {a.status_file}") print() elif args.command == "status": summary = watcher.get_summary() if args.json: print(json.dumps(summary, indent=2)) else: print(f"\n{'='*60}") print(f"BUG WINDOW WATCHER - Status") print(f"{'='*60}") print(f"Active: {summary['state']['active']}") print(f"Total Anomalies: {summary['total_anomalies']}") print(f"Unresolved: {summary['unresolved']}") print() print("By Severity:") for sev, count in summary['by_severity'].items(): if count > 0: print(f" {sev}: {count}") elif args.command == "list": severity = Severity(args.severity) if args.severity else None anomalies = watcher.get_anomalies(severity=severity, phase=args.phase) if args.json: print(json.dumps([asdict(a) for a in anomalies], indent=2)) else: for a in anomalies: print(f"[{a.id}] {a.severity.value}: {a.message}")