Major additions: - marketplace/: Agent template registry with FTS5 search, ratings, versioning - observability/: Prometheus metrics, distributed tracing, structured logging - ledger/migrations/: Database migration scripts for multi-tenant support - tests/governance/: 15 new test files for phases 6-12 (295 total tests) - bin/validate-phases: Full 12-phase validation script New features: - Multi-tenant support with tenant isolation and quota enforcement - Agent marketplace with semantic versioning and search - Observability with metrics, tracing, and log correlation - Tier-1 agent bootstrap scripts Updated components: - ledger/api.py: Extended API for tenants, marketplace, observability - ledger/schema.sql: Added tenant, project, marketplace tables - testing/framework.ts: Enhanced test framework - checkpoint/checkpoint.py: Improved checkpoint management Archived: - External integrations (Slack/GitHub/PagerDuty) moved to .archive/ - Old checkpoint files cleaned up Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
585 lines
20 KiB
Python
585 lines
20 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Prometheus Metrics Module
|
|
=========================
|
|
Exposes metrics in Prometheus format for monitoring agent governance system.
|
|
|
|
Metrics:
|
|
- agent_executions_total - Total agent executions by tier and status
|
|
- agent_execution_duration_seconds - Execution duration histogram
|
|
- agent_violations_total - Violations by type and severity
|
|
- agent_promotions_total - Promotions by tier transition
|
|
- api_requests_total - API requests by endpoint and status
|
|
- tenant_quota_usage - Quota usage by tenant and resource
|
|
- system_health - Component health status
|
|
"""
|
|
|
|
import re
import sqlite3
import threading
import time
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
|
|
|
|
# =============================================================================
|
|
# Configuration
|
|
# =============================================================================
|
|
|
|
# Location of the shared governance SQLite database read by the metrics
# collectors below.
# NOTE(review): hardcoded absolute path — consider an env-var override for
# test/dev environments.
DB_PATH = Path("/opt/agent-governance/ledger/governance.db")
|
|
|
|
# =============================================================================
|
|
# Metric Types
|
|
# =============================================================================
|
|
|
|
@dataclass
class Counter:
    """Monotonically increasing Prometheus counter.

    ``values`` maps a tuple of label values — in the order given by
    ``labels`` — to the running total. Callers must pass label dicts whose
    insertion order matches ``labels``: the key is built from the dict's
    value order, not its keys. (``help`` intentionally mirrors Prometheus
    terminology even though it shadows the builtin.)
    """
    name: str
    help: str
    labels: List[str] = field(default_factory=list)
    values: Dict[tuple, float] = field(default_factory=dict)
    _lock: threading.Lock = field(default_factory=threading.Lock)

    def inc(self, labels: dict = None, value: float = 1):
        """Increment the cell identified by ``labels`` by ``value`` (default 1)."""
        key = tuple(labels.values()) if labels else ()
        with self._lock:
            self.values[key] = self.values.get(key, 0) + value

    def to_prometheus(self) -> str:
        """Render the counter in Prometheus text exposition format."""
        lines = [f"# HELP {self.name} {self.help}", f"# TYPE {self.name} counter"]
        # Snapshot under the lock: the original iterated the live dict, so a
        # concurrent inc() adding a new label-set could raise
        # "dictionary changed size during iteration".
        with self._lock:
            snapshot = dict(self.values)
        for key, value in snapshot.items():
            if self.labels and key:
                label_str = ",".join(f'{l}="{v}"' for l, v in zip(self.labels, key))
                lines.append(f"{self.name}{{{label_str}}} {value}")
            else:
                lines.append(f"{self.name} {value}")
        return "\n".join(lines)
|
|
|
|
|
|
@dataclass
class Gauge:
    """Prometheus gauge: a value that can go up, down, or be set directly.

    ``values`` maps a tuple of label values — in ``labels`` order — to the
    current reading. Label dicts must be passed with insertion order
    matching ``labels`` (keys are built from dict value order).
    """
    name: str
    help: str
    labels: List[str] = field(default_factory=list)
    values: Dict[tuple, float] = field(default_factory=dict)
    _lock: threading.Lock = field(default_factory=threading.Lock)

    def set(self, value: float, labels: dict = None):
        """Replace the cell for ``labels`` with ``value``."""
        key = tuple(labels.values()) if labels else ()
        with self._lock:
            self.values[key] = value

    def inc(self, labels: dict = None, value: float = 1):
        """Increase the cell for ``labels`` by ``value`` (default 1)."""
        key = tuple(labels.values()) if labels else ()
        with self._lock:
            self.values[key] = self.values.get(key, 0) + value

    def dec(self, labels: dict = None, value: float = 1):
        """Decrease the cell for ``labels`` by ``value`` (default 1)."""
        key = tuple(labels.values()) if labels else ()
        with self._lock:
            self.values[key] = self.values.get(key, 0) - value

    def to_prometheus(self) -> str:
        """Render the gauge in Prometheus text exposition format."""
        lines = [f"# HELP {self.name} {self.help}", f"# TYPE {self.name} gauge"]
        # Snapshot under the lock — the original iterated the live dict,
        # which can raise RuntimeError if a writer adds a label-set mid-render.
        with self._lock:
            snapshot = dict(self.values)
        for key, value in snapshot.items():
            if self.labels and key:
                label_str = ",".join(f'{l}="{v}"' for l, v in zip(self.labels, key))
                lines.append(f"{self.name}{{{label_str}}} {value}")
            else:
                lines.append(f"{self.name} {value}")
        return "\n".join(lines)
|
|
|
|
|
|
@dataclass
class Histogram:
    """Prometheus histogram with cumulative buckets.

    Raw observations are retained per label-set in ``observations``; bucket
    counts, ``_sum``, and ``_count`` are computed at render time. Memory
    grows with the number of observations — acceptable for the scrape
    volumes here, but worth noting.
    """
    name: str
    help: str
    labels: List[str] = field(default_factory=list)
    buckets: List[float] = field(default_factory=lambda: [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 60])
    observations: Dict[tuple, List[float]] = field(default_factory=dict)
    _lock: threading.Lock = field(default_factory=threading.Lock)

    def observe(self, value: float, labels: dict = None):
        """Record a single observation for the given label values."""
        key = tuple(labels.values()) if labels else ()
        with self._lock:
            # setdefault replaces the original's explicit membership check.
            self.observations.setdefault(key, []).append(value)

    def to_prometheus(self) -> str:
        """Render cumulative buckets (incl. +Inf), _sum, and _count per label-set."""
        lines = [f"# HELP {self.name} {self.help}", f"# TYPE {self.name} histogram"]

        # Snapshot under the lock so concurrent observe() calls can't mutate
        # the dict or its lists mid-render. (Also removed the original's
        # unused `prefix` variable.)
        with self._lock:
            snapshot = {k: list(v) for k, v in self.observations.items()}

        for key, values in snapshot.items():
            label_str = ",".join(f'{l}="{v}"' for l, v in zip(self.labels, key)) if self.labels and key else ""

            # Cumulative bucket counts: each bucket counts all observations <= le.
            for bucket in self.buckets:
                count = sum(1 for v in values if v <= bucket)
                if label_str:
                    lines.append(f'{self.name}_bucket{{{label_str},le="{bucket}"}} {count}')
                else:
                    lines.append(f'{self.name}_bucket{{le="{bucket}"}} {count}')

            # +Inf bucket always equals the total observation count.
            if label_str:
                lines.append(f'{self.name}_bucket{{{label_str},le="+Inf"}} {len(values)}')
            else:
                lines.append(f'{self.name}_bucket{{le="+Inf"}} {len(values)}')

            # _sum and _count companion series.
            total = sum(values)
            if label_str:
                lines.append(f"{self.name}_sum{{{label_str}}} {total}")
                lines.append(f"{self.name}_count{{{label_str}}} {len(values)}")
            else:
                lines.append(f"{self.name}_sum {total}")
                lines.append(f"{self.name}_count {len(values)}")

        return "\n".join(lines)
|
|
|
|
|
|
# =============================================================================
|
|
# Metrics Registry
|
|
# =============================================================================
|
|
|
|
class MetricsRegistry:
    """Central registry that owns every metric exported at /metrics.

    Metrics fall into two groups:
      * in-process event metrics (counters/histograms) updated via the
        record_* helpers and the API middleware, and
      * snapshot gauges refreshed from the governance database and component
        probes on every scrape (collect_from_db / check_health).
    """

    def __init__(self):
        # Agent execution metrics
        self.agent_executions_total = Counter(
            name="agent_executions_total",
            help="Total number of agent executions",
            labels=["tier", "action", "status"]
        )

        self.agent_execution_duration = Histogram(
            name="agent_execution_duration_seconds",
            help="Agent execution duration in seconds",
            labels=["tier", "action"],
            buckets=[0.1, 0.5, 1, 2, 5, 10, 30, 60, 120, 300]
        )

        self.agent_active = Gauge(
            name="agent_active_count",
            help="Number of currently active agents",
            labels=["tier"]
        )

        # Violation metrics
        self.violations_total = Counter(
            name="agent_violations_total",
            help="Total number of violations",
            labels=["type", "severity"]
        )

        # Promotion metrics
        self.promotions_total = Counter(
            name="agent_promotions_total",
            help="Total number of promotions",
            labels=["from_tier", "to_tier"]
        )

        # API metrics
        self.api_requests_total = Counter(
            name="api_requests_total",
            help="Total API requests",
            labels=["method", "endpoint", "status"]
        )

        self.api_request_duration = Histogram(
            name="api_request_duration_seconds",
            help="API request duration in seconds",
            labels=["method", "endpoint"],
            buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10]
        )

        # Tenant/quota metrics
        self.tenant_quota_usage = Gauge(
            name="tenant_quota_usage_ratio",
            help="Tenant quota usage as ratio (0-1)",
            labels=["tenant_id", "resource"]
        )

        self.tenant_api_calls = Counter(
            name="tenant_api_calls_total",
            help="API calls by tenant",
            labels=["tenant_id"]
        )

        # System health metrics
        self.component_health = Gauge(
            name="component_health",
            help="Component health status (1=healthy, 0=unhealthy)",
            labels=["component"]
        )

        self.db_connections = Gauge(
            name="db_connections_active",
            help="Active database connections"
        )

        # Marketplace metrics
        self.template_downloads = Counter(
            name="marketplace_template_downloads_total",
            help="Template downloads",
            labels=["template_id", "category"]
        )

        self.template_installs = Gauge(
            name="marketplace_template_installs_active",
            help="Active template installations",
            labels=["category"]
        )

        # Orchestration metrics
        self.orchestration_requests = Counter(
            name="orchestration_requests_total",
            help="Orchestration requests by model",
            labels=["model", "status"]
        )

        self.orchestration_tokens = Counter(
            name="orchestration_tokens_total",
            help="Tokens used in orchestration",
            labels=["model"]
        )

        # Uptime
        self._start_time = time.time()
        self.uptime = Gauge(
            name="governance_uptime_seconds",
            help="System uptime in seconds"
        )

    def collect_from_db(self):
        """Refresh DB-derived metrics from the governance database.

        Best-effort: any failure (missing DB file, schema drift) is printed
        and swallowed so a scrape never fails because of the database. The
        connection is now closed even when a query raises (the original
        leaked it on mid-query failure).
        """
        try:
            conn = sqlite3.connect(DB_PATH)
            try:
                conn.row_factory = sqlite3.Row

                # Agent counts by tier
                cursor = conn.execute("""
                    SELECT current_tier, COUNT(*) as count
                    FROM agent_metrics GROUP BY current_tier
                """)
                for row in cursor.fetchall():
                    self.agent_active.set(row["count"], {"tier": str(row["current_tier"])})

                # Recent violations by type/severity. Writes counter cells
                # directly (bypassing inc) so the exported value reflects the
                # last-hour DB snapshot, not process lifetime.
                cursor = conn.execute("""
                    SELECT violation_type, severity, COUNT(*) as count
                    FROM violations
                    WHERE timestamp > datetime('now', '-1 hour')
                    GROUP BY violation_type, severity
                """)
                for row in cursor.fetchall():
                    self.violations_total.values[(row["violation_type"], row["severity"])] = row["count"]

                # Promotions (all-time totals, also snapshot-assigned).
                cursor = conn.execute("""
                    SELECT from_tier, to_tier, COUNT(*) as count
                    FROM promotions GROUP BY from_tier, to_tier
                """)
                for row in cursor.fetchall():
                    self.promotions_total.values[(str(row["from_tier"]), str(row["to_tier"]))] = row["count"]

                # Tenant usage vs. quota for today's period.
                cursor = conn.execute("""
                    SELECT tu.tenant_id, tu.api_calls, tu.tokens_used,
                           tq.max_api_calls_per_day, tq.max_tokens_per_day
                    FROM tenant_usage tu
                    JOIN tenant_quotas tq ON tu.tenant_id = tq.tenant_id
                    WHERE tu.period_start = date('now')
                """)
                for row in cursor.fetchall():
                    if row["max_api_calls_per_day"] > 0:
                        ratio = row["api_calls"] / row["max_api_calls_per_day"]
                        self.tenant_quota_usage.set(ratio, {"tenant_id": row["tenant_id"], "resource": "api_calls"})
                    if row["max_tokens_per_day"] > 0:
                        ratio = row["tokens_used"] / row["max_tokens_per_day"]
                        self.tenant_quota_usage.set(ratio, {"tenant_id": row["tenant_id"], "resource": "tokens"})

                # Marketplace stats
                cursor = conn.execute("""
                    SELECT category, SUM(active_installs) as installs
                    FROM template_stats ts
                    JOIN agent_templates t ON ts.template_id = t.template_id
                    GROUP BY category
                """)
                for row in cursor.fetchall():
                    self.template_installs.set(row["installs"] or 0, {"category": row["category"]})
            finally:
                conn.close()
        except Exception as e:
            print(f"Error collecting DB metrics: {e}")

    def check_health(self):
        """Probe each dependent component and set its component_health gauge.

        Probes are independent: a failing probe marks only that component
        unhealthy. Bare `except:` clauses from the original were narrowed to
        `except Exception` so Ctrl-C / SystemExit are not swallowed.
        """
        import subprocess

        # Database: a trivial query proves the file exists and is readable.
        try:
            conn = sqlite3.connect(DB_PATH)
            try:
                conn.execute("SELECT 1")
            finally:
                conn.close()
            self.component_health.set(1, {"component": "database"})
        except Exception:
            self.component_health.set(0, {"component": "database"})

        # Vault: shell out to the vault CLI inside its container.
        try:
            result = subprocess.run(
                ["docker", "exec", "vault", "vault", "status", "-format=json"],
                capture_output=True, timeout=5
            )
            self.component_health.set(1 if result.returncode == 0 else 0, {"component": "vault"})
        except Exception:
            self.component_health.set(0, {"component": "vault"})

        # DragonflyDB (Redis-compatible) liveness ping.
        # NOTE(review): hardcoded credentials — should come from config/secrets.
        try:
            import redis
            r = redis.Redis(host='127.0.0.1', port=6379, password='governance2026')
            r.ping()
            self.component_health.set(1, {"component": "dragonfly"})
        except Exception:
            self.component_health.set(0, {"component": "dragonfly"})

        # Uptime
        self.uptime.set(time.time() - self._start_time)

    def to_prometheus(self) -> str:
        """Export all metrics in Prometheus text exposition format.

        Refreshes DB-derived gauges and component health first, then renders
        every registered metric. Metrics with no samples yet are still
        emitted as HELP/TYPE stanzas so dashboards can discover them.
        """
        self.collect_from_db()
        self.check_health()

        metrics = [
            self.agent_executions_total,
            self.agent_execution_duration,
            self.agent_active,
            self.violations_total,
            self.promotions_total,
            self.api_requests_total,
            self.api_request_duration,
            self.tenant_quota_usage,
            self.tenant_api_calls,
            self.component_health,
            self.db_connections,
            self.template_downloads,
            self.template_installs,
            self.orchestration_requests,
            self.orchestration_tokens,
            self.uptime,
        ]

        output = []
        for metric in metrics:
            # BUG FIX: the original tested `metric.values or isinstance(...)`,
            # but Histogram has no `values` attribute, so exporting any
            # histogram raised AttributeError. Dispatch on type instead.
            if isinstance(metric, Histogram):
                has_samples = bool(metric.observations)
            else:
                has_samples = bool(metric.values)

            if has_samples:
                output.append(metric.to_prometheus())
            else:
                # Emit HELP/TYPE as a single stanza (the original appended
                # them separately, so the "\n\n" join split them apart).
                output.append(
                    f"# HELP {metric.name} {metric.help}\n"
                    f"# TYPE {metric.name} {type(metric).__name__.lower()}"
                )

        return "\n\n".join(output)
|
|
|
|
|
|
# =============================================================================
|
|
# Global Registry
|
|
# =============================================================================
|
|
|
|
# Module-level singleton: the router, middleware, and record_* helpers below
# all share this one registry instance.
registry = MetricsRegistry()
|
|
|
|
|
|
# =============================================================================
|
|
# FastAPI Router
|
|
# =============================================================================
|
|
|
|
# NOTE: deliberately imported here (after the registry is created) rather
# than at the top of the file with the stdlib imports.
from fastapi import APIRouter, Response

# Router exposing /metrics and /health/detailed; mounted by the main API app.
router = APIRouter(tags=["Observability"])
|
|
|
|
|
|
@router.get("/metrics")
async def get_metrics():
    """Serve every registered metric in Prometheus text exposition format."""
    payload = registry.to_prometheus()
    return Response(
        media_type="text/plain; version=0.0.4; charset=utf-8",
        content=payload,
    )
|
|
|
|
|
|
@router.get("/health/detailed")
async def detailed_health():
    """Detailed health check covering database, Vault, and DragonflyDB.

    Returns a JSON document with an overall ``status`` ("healthy" or
    "degraded" — any single component failure degrades the whole report)
    plus a per-component breakdown with either diagnostics or the error
    string that made the probe fail.
    """
    # Imported lazily so merely importing this module never pulls subprocess.
    import subprocess

    health = {
        "status": "healthy",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "uptime_seconds": time.time() - registry._start_time,
        "components": {}
    }

    # Database: row count doubles as both a liveness probe and a diagnostic.
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.execute("SELECT COUNT(*) FROM agent_metrics")
        count = cursor.fetchone()[0]
        conn.close()
        health["components"]["database"] = {"status": "healthy", "agent_count": count}
    except Exception as e:
        health["components"]["database"] = {"status": "unhealthy", "error": str(e)}
        health["status"] = "degraded"

    # Vault: query status via the CLI inside its container.
    try:
        result = subprocess.run(
            ["docker", "exec", "vault", "vault", "status", "-format=json"],
            capture_output=True, text=True, timeout=5
        )
        if result.returncode == 0:
            import json
            vault_status = json.loads(result.stdout)
            # NOTE(review): a sealed vault is still reported "healthy" here
            # (only `sealed` is surfaced) — confirm that is intended.
            health["components"]["vault"] = {
                "status": "healthy",
                "sealed": vault_status.get("sealed", True),
                "version": vault_status.get("version", "unknown")
            }
        else:
            health["components"]["vault"] = {"status": "unhealthy", "error": "non-zero exit"}
            health["status"] = "degraded"
    except Exception as e:
        health["components"]["vault"] = {"status": "unhealthy", "error": str(e)}
        health["status"] = "degraded"

    # DragonflyDB (Redis-compatible): INFO provides both liveness and stats.
    # NOTE(review): hardcoded credentials — move to config/secret store.
    try:
        import redis
        r = redis.Redis(host='127.0.0.1', port=6379, password='governance2026')
        info = r.info()
        health["components"]["dragonfly"] = {
            "status": "healthy",
            "connected_clients": info.get("connected_clients", 0),
            "used_memory_human": info.get("used_memory_human", "unknown")
        }
    except Exception as e:
        health["components"]["dragonfly"] = {"status": "unhealthy", "error": str(e)}
        health["status"] = "degraded"

    return health
|
|
|
|
|
|
# =============================================================================
|
|
# Middleware for request metrics
|
|
# =============================================================================
|
|
|
|
class MetricsMiddleware:
    """ASGI middleware that records per-request count and latency metrics.

    Wraps the downstream ASGI app. For every HTTP request it increments
    ``registry.api_requests_total`` and observes the wall-clock duration in
    ``registry.api_request_duration``; non-HTTP scopes (websocket, lifespan)
    pass through untouched.
    """

    # Compiled once at class-creation time; the original re-imported `re`
    # and rebuilt both patterns on every single request.
    _HEX_ID_RE = re.compile(r'/[a-f0-9-]{8,}')  # UUID-ish path segments
    _NUM_ID_RE = re.compile(r'/\d+')            # purely numeric segments

    def __init__(self, app):
        self.app = app  # downstream ASGI application

    async def __call__(self, scope, receive, send):
        # Instrument HTTP traffic only.
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return

        start_time = time.time()
        path = scope.get("path", "unknown")
        method = scope.get("method", "unknown")

        # Normalize the path (replace resource IDs) so per-object URLs don't
        # explode metric label cardinality.
        normalized_path = self._HEX_ID_RE.sub('/{id}', path)
        normalized_path = self._NUM_ID_RE.sub('/{id}', normalized_path)

        # Default to 500 so an exception raised before the response starts
        # is still counted as a server error.
        status_code = 500

        async def send_wrapper(message):
            nonlocal status_code
            if message["type"] == "http.response.start":
                status_code = message["status"]
            await send(message)

        try:
            await self.app(scope, receive, send_wrapper)
        finally:
            # Record metrics even when the downstream app raises.
            duration = time.time() - start_time
            registry.api_requests_total.inc({
                "method": method,
                "endpoint": normalized_path,
                "status": str(status_code)
            })
            registry.api_request_duration.observe(duration, {
                "method": method,
                "endpoint": normalized_path
            })
|
|
|
|
|
|
# =============================================================================
|
|
# Helper functions for use in other modules
|
|
# =============================================================================
|
|
|
|
def record_agent_execution(tier: int, action: str, success: bool, duration: float):
    """Record one agent execution: bump the outcome counter and its duration."""
    tier_label = str(tier)
    outcome = "success" if success else "error"
    registry.agent_executions_total.inc(
        {"tier": tier_label, "action": action, "status": outcome}
    )
    registry.agent_execution_duration.observe(
        duration, {"tier": tier_label, "action": action}
    )
|
|
|
|
|
|
def record_violation(violation_type: str, severity: str):
    """Count a single governance violation by type and severity."""
    labels = {"type": violation_type, "severity": severity}
    registry.violations_total.inc(labels)
|
|
|
|
|
|
def record_promotion(from_tier: int, to_tier: int):
    """Count a single tier promotion (from_tier -> to_tier)."""
    transition = {"from_tier": str(from_tier), "to_tier": str(to_tier)}
    registry.promotions_total.inc(transition)
|
|
|
|
|
|
def record_orchestration(model: str, success: bool, tokens: int = 0):
    """Count an orchestration request and, when known, its token usage."""
    outcome = "success" if success else "error"
    registry.orchestration_requests.inc({"model": model, "status": outcome})
    # Token usage is optional; only record it when the caller supplied a count.
    if tokens > 0:
        registry.orchestration_tokens.inc({"model": model}, tokens)
|
|
|
|
def record_template_download(template_id: str, category: str):
    """Count one marketplace template download."""
    labels = {"template_id": template_id, "category": category}
    registry.template_downloads.inc(labels)
|