Major additions: - marketplace/: Agent template registry with FTS5 search, ratings, versioning - observability/: Prometheus metrics, distributed tracing, structured logging - ledger/migrations/: Database migration scripts for multi-tenant support - tests/governance/: 15 new test files for phases 6-12 (295 total tests) - bin/validate-phases: Full 12-phase validation script New features: - Multi-tenant support with tenant isolation and quota enforcement - Agent marketplace with semantic versioning and search - Observability with metrics, tracing, and log correlation - Tier-1 agent bootstrap scripts Updated components: - ledger/api.py: Extended API for tenants, marketplace, observability - ledger/schema.sql: Added tenant, project, marketplace tables - testing/framework.ts: Enhanced test framework - checkpoint/checkpoint.py: Improved checkpoint management Archived: - External integrations (Slack/GitHub/PagerDuty) moved to .archive/ - Old checkpoint files cleaned up Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
413 lines
13 KiB
Python
413 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Structured Logging Module
|
|
=========================
|
|
Provides JSON-structured logging with trace correlation.
|
|
|
|
Usage:
|
|
from observability.logging import get_logger
|
|
|
|
logger = get_logger("my_module")
|
|
logger.info("Operation completed", user_id="123", duration_ms=45.2)
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import sys
|
|
import sqlite3
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional, Any, Dict
|
|
from dataclasses import dataclass
|
|
import threading
|
|
|
|
from .tracing import get_current_trace_id, get_current_span
|
|
|
|
# =============================================================================
|
|
# Configuration
|
|
# =============================================================================
|
|
|
|
# Shared SQLite database file; LogStorage below creates/uses the `logs` table in it.
DB_PATH = Path("/opt/agent-governance/ledger/governance.db")

# Minimum level emitted by StructuredLogger instances.
LOG_LEVEL = logging.INFO

# When False, LogStorage.save() becomes a no-op (stdout JSON logging still happens).
LOG_TO_DB = True

# Default retention window used by LogStorage.cleanup().
LOG_RETENTION_DAYS = 7
|
|
|
|
# =============================================================================
|
|
# Structured Log Entry
|
|
# =============================================================================
|
|
|
|
@dataclass
class LogEntry:
    """A single structured log record.

    `trace_id` / `span_id` correlate the entry with distributed tracing;
    `attributes` carries arbitrary extra key/value pairs that are flattened
    into the top level of the JSON output.
    """
    timestamp: str                 # ISO-8601 timestamp string
    level: str                     # e.g. "INFO", "ERROR"
    logger: str                    # name of the logger that produced the entry
    message: str
    trace_id: Optional[str] = None
    span_id: Optional[str] = None
    tenant_id: str = "default"
    # Fixed annotation: the default is None, so the type is Optional.
    # (Previously declared as `Dict[str, Any] = None`, which mis-stated it.)
    attributes: Optional[Dict[str, Any]] = None

    def to_dict(self) -> dict:
        """Return a flat dict representation.

        Optional fields are included only when set; `attributes` entries are
        merged into the top level of the result.
        """
        result = {
            "timestamp": self.timestamp,
            "level": self.level,
            "logger": self.logger,
            "message": self.message
        }
        if self.trace_id:
            result["trace_id"] = self.trace_id
        if self.span_id:
            result["span_id"] = self.span_id
        if self.tenant_id:
            result["tenant_id"] = self.tenant_id
        if self.attributes:
            # Flatten custom attributes into the record itself.
            result.update(self.attributes)
        return result

    def to_json(self) -> str:
        """Serialize the entry as a single JSON line."""
        return json.dumps(self.to_dict())
|
|
|
|
|
|
# =============================================================================
|
|
# Log Storage
|
|
# =============================================================================
|
|
|
|
class LogStorage:
    """Stores logs in SQLite for querying.

    Process-wide singleton: construction is guarded by double-checked
    locking, and the schema is ensured exactly once. All DB errors are
    reported to stderr rather than raised, so logging never crashes the
    application.
    """

    _instance = None
    _lock = threading.Lock()

    def __new__(cls):
        # Double-checked locking so concurrent first calls create one instance.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        # __init__ runs on every LogStorage() call; only initialize once.
        if self._initialized:
            return
        self._ensure_table()
        self._initialized = True

    def _ensure_table(self):
        """Create logs table and indexes if not exists."""
        try:
            conn = sqlite3.connect(DB_PATH)
            try:
                conn.execute("""
                    CREATE TABLE IF NOT EXISTS logs (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        timestamp TEXT NOT NULL,
                        level TEXT NOT NULL,
                        logger TEXT NOT NULL,
                        message TEXT NOT NULL,
                        trace_id TEXT,
                        span_id TEXT,
                        tenant_id TEXT DEFAULT 'default',
                        attributes TEXT,  -- JSON
                        created_at TEXT DEFAULT CURRENT_TIMESTAMP
                    )
                """)
                conn.execute("CREATE INDEX IF NOT EXISTS idx_logs_timestamp ON logs(timestamp)")
                conn.execute("CREATE INDEX IF NOT EXISTS idx_logs_level ON logs(level)")
                conn.execute("CREATE INDEX IF NOT EXISTS idx_logs_trace ON logs(trace_id)")
                conn.execute("CREATE INDEX IF NOT EXISTS idx_logs_tenant ON logs(tenant_id)")
                conn.execute("CREATE INDEX IF NOT EXISTS idx_logs_logger ON logs(logger)")
                conn.commit()
            finally:
                # Close even if a statement fails, so the handle never leaks.
                conn.close()
        except Exception as e:
            print(f"Error creating logs table: {e}", file=sys.stderr)

    def save(self, entry: LogEntry):
        """Save log entry to database (no-op when LOG_TO_DB is False)."""
        if not LOG_TO_DB:
            return

        try:
            conn = sqlite3.connect(DB_PATH)
            try:
                conn.execute("""
                    INSERT INTO logs (timestamp, level, logger, message, trace_id, span_id, tenant_id, attributes)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    entry.timestamp, entry.level, entry.logger, entry.message,
                    entry.trace_id, entry.span_id, entry.tenant_id,
                    json.dumps(entry.attributes) if entry.attributes else None
                ))
                conn.commit()
            finally:
                conn.close()
        except Exception as e:
            print(f"Error saving log: {e}", file=sys.stderr)

    def search(
        self,
        level: str = None,
        logger: str = None,
        trace_id: str = None,
        tenant_id: str = None,
        message_contains: str = None,
        start_time: str = None,
        end_time: str = None,
        limit: int = 100
    ) -> list:
        """Search logs, newest first.

        Each provided filter appends a parameterized clause; `attributes`
        JSON is decoded into a dict ({} when absent). Returns [] on error.
        """
        try:
            conn = sqlite3.connect(DB_PATH)
            try:
                conn.row_factory = sqlite3.Row

                query = "SELECT * FROM logs WHERE 1=1"
                params = []

                if level:
                    query += " AND level = ?"
                    params.append(level)
                if logger:
                    query += " AND logger = ?"
                    params.append(logger)
                if trace_id:
                    query += " AND trace_id = ?"
                    params.append(trace_id)
                if tenant_id:
                    query += " AND tenant_id = ?"
                    params.append(tenant_id)
                if message_contains:
                    query += " AND message LIKE ?"
                    params.append(f"%{message_contains}%")
                if start_time:
                    query += " AND timestamp >= ?"
                    params.append(start_time)
                if end_time:
                    query += " AND timestamp <= ?"
                    params.append(end_time)

                query += " ORDER BY timestamp DESC LIMIT ?"
                params.append(limit)

                cursor = conn.execute(query, params)
                results = []
                for row in cursor.fetchall():
                    entry = dict(row)
                    entry["attributes"] = json.loads(entry["attributes"]) if entry.get("attributes") else {}
                    results.append(entry)
                return results
            finally:
                conn.close()
        except Exception as e:
            print(f"Error searching logs: {e}", file=sys.stderr)
            return []

    def cleanup(self, days: int = LOG_RETENTION_DAYS):
        """Remove logs older than `days` days; returns the number deleted (0 on error)."""
        try:
            conn = sqlite3.connect(DB_PATH)
            try:
                # Pass the modifier as a bound parameter instead of building
                # the SQL string with an f-string.
                conn.execute(
                    "DELETE FROM logs WHERE timestamp < datetime('now', ?)",
                    (f"-{days} days",),
                )
                conn.commit()
                # Fresh connection per call, so total_changes == rows deleted here.
                deleted = conn.total_changes
            finally:
                conn.close()
            return deleted
        except Exception as e:
            print(f"Error cleaning up logs: {e}", file=sys.stderr)
            return 0
|
|
|
|
|
|
# =============================================================================
|
|
# JSON Formatter
|
|
# =============================================================================
|
|
|
|
class JSONFormatter(logging.Formatter):
    """Render each log record as a single JSON line, tagged with the
    currently-active trace/span context and persisted to LogStorage."""

    def format(self, record: logging.LogRecord) -> str:
        # Capture the active tracing context for correlation.
        active_span = get_current_span()
        entry = LogEntry(
            timestamp=datetime.now(timezone.utc).isoformat(),
            level=record.levelname,
            logger=record.name,
            message=record.getMessage(),
            trace_id=get_current_trace_id(),
            span_id=active_span.span_id if active_span else None,
            tenant_id=getattr(record, 'tenant_id', 'default'),
            attributes=getattr(record, 'extra_attrs', None),
        )

        # Persist the structured entry before emitting it to the stream.
        LogStorage().save(entry)

        return entry.to_json()
|
|
|
|
|
|
# =============================================================================
|
|
# Custom Logger
|
|
# =============================================================================
|
|
|
|
class StructuredLogger:
    """Logger with structured JSON output and trace correlation.

    Wraps a stdlib logger; extra keyword arguments on each call become
    structured attributes in the emitted record.
    """

    def __init__(self, name: str, tenant_id: str = "default"):
        self.name = name
        self.tenant_id = tenant_id
        self._logger = logging.getLogger(name)
        self._logger.setLevel(LOG_LEVEL)

        # Attach a JSON stdout handler at most once per underlying logger.
        already_attached = any(
            isinstance(existing, logging.StreamHandler)
            and isinstance(existing.formatter, JSONFormatter)
            for existing in self._logger.handlers
        )
        if not already_attached:
            json_handler = logging.StreamHandler(sys.stdout)
            json_handler.setFormatter(JSONFormatter())
            self._logger.addHandler(json_handler)

    def _log(self, level: int, message: str, **kwargs):
        """Forward to the stdlib logger, carrying tenant and attributes via `extra`."""
        tenant = kwargs.pop('tenant_id', self.tenant_id)
        self._logger.log(level, message, extra={
            'tenant_id': tenant,
            'extra_attrs': kwargs or None,
        })

    def debug(self, message: str, **kwargs):
        """Log a DEBUG-level message."""
        self._log(logging.DEBUG, message, **kwargs)

    def info(self, message: str, **kwargs):
        """Log an INFO-level message."""
        self._log(logging.INFO, message, **kwargs)

    def warning(self, message: str, **kwargs):
        """Log a WARNING-level message."""
        self._log(logging.WARNING, message, **kwargs)

    def error(self, message: str, **kwargs):
        """Log an ERROR-level message."""
        self._log(logging.ERROR, message, **kwargs)

    def critical(self, message: str, **kwargs):
        """Log a CRITICAL-level message."""
        self._log(logging.CRITICAL, message, **kwargs)

    def exception(self, message: str, **kwargs):
        """Log the current exception at ERROR level, attaching its traceback."""
        import traceback
        kwargs['traceback'] = traceback.format_exc()
        self._log(logging.ERROR, message, **kwargs)
|
|
|
|
|
|
# =============================================================================
|
|
# Logger Factory
|
|
# =============================================================================
|
|
|
|
# Cache of StructuredLogger instances, keyed by "name:tenant_id".
_loggers: Dict[str, StructuredLogger] = {}


def get_logger(name: str, tenant_id: str = "default") -> StructuredLogger:
    """Return the cached structured logger for (name, tenant), creating it on first use."""
    key = f"{name}:{tenant_id}"
    try:
        return _loggers[key]
    except KeyError:
        created = StructuredLogger(name, tenant_id)
        _loggers[key] = created
        return created
|
|
|
|
|
|
# =============================================================================
|
|
# FastAPI Router
|
|
# =============================================================================
|
|
|
|
# NOTE(review): third-party import placed mid-file, next to the routes that
# use it; importing this module still requires fastapi to be installed —
# consider moving the import to the top of the file.
from fastapi import APIRouter, Query, HTTPException

# HTTP endpoints for querying and maintaining the stored logs.
router = APIRouter(prefix="/logs", tags=["Logging"])
|
|
|
|
|
|
@router.get("")
|
|
async def search_logs(
|
|
level: Optional[str] = None,
|
|
logger: Optional[str] = None,
|
|
trace_id: Optional[str] = None,
|
|
message: Optional[str] = None,
|
|
start_time: Optional[str] = None,
|
|
end_time: Optional[str] = None,
|
|
limit: int = Query(100, le=500)
|
|
):
|
|
"""Search logs"""
|
|
storage = LogStorage()
|
|
logs = storage.search(
|
|
level=level,
|
|
logger=logger,
|
|
trace_id=trace_id,
|
|
message_contains=message,
|
|
start_time=start_time,
|
|
end_time=end_time,
|
|
limit=limit
|
|
)
|
|
return {"logs": logs, "count": len(logs)}
|
|
|
|
|
|
@router.get("/trace/{trace_id}")
|
|
async def get_logs_by_trace(trace_id: str, limit: int = Query(100, le=500)):
|
|
"""Get all logs for a trace"""
|
|
storage = LogStorage()
|
|
logs = storage.search(trace_id=trace_id, limit=limit)
|
|
return {"logs": logs, "count": len(logs), "trace_id": trace_id}
|
|
|
|
|
|
@router.get("/stats")
|
|
async def get_log_stats():
|
|
"""Get log statistics"""
|
|
try:
|
|
conn = sqlite3.connect(DB_PATH)
|
|
conn.row_factory = sqlite3.Row
|
|
|
|
stats = {}
|
|
|
|
# Count by level
|
|
cursor = conn.execute("""
|
|
SELECT level, COUNT(*) as count
|
|
FROM logs
|
|
WHERE timestamp > datetime('now', '-1 hour')
|
|
GROUP BY level
|
|
""")
|
|
stats["last_hour_by_level"] = {row["level"]: row["count"] for row in cursor.fetchall()}
|
|
|
|
# Count by logger
|
|
cursor = conn.execute("""
|
|
SELECT logger, COUNT(*) as count
|
|
FROM logs
|
|
WHERE timestamp > datetime('now', '-1 hour')
|
|
GROUP BY logger
|
|
ORDER BY count DESC
|
|
LIMIT 10
|
|
""")
|
|
stats["last_hour_by_logger"] = {row["logger"]: row["count"] for row in cursor.fetchall()}
|
|
|
|
# Total count
|
|
cursor = conn.execute("SELECT COUNT(*) as total FROM logs")
|
|
stats["total_logs"] = cursor.fetchone()["total"]
|
|
|
|
# Error count last 24h
|
|
cursor = conn.execute("""
|
|
SELECT COUNT(*) as count FROM logs
|
|
WHERE level IN ('ERROR', 'CRITICAL')
|
|
AND timestamp > datetime('now', '-1 day')
|
|
""")
|
|
stats["errors_last_24h"] = cursor.fetchone()["count"]
|
|
|
|
conn.close()
|
|
return stats
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
@router.post("/cleanup")
|
|
async def cleanup_logs(days: int = Query(7, ge=1, le=365)):
|
|
"""Clean up old logs"""
|
|
storage = LogStorage()
|
|
deleted = storage.cleanup(days)
|
|
return {"deleted": deleted, "retention_days": days}
|