profit 8c6e7831e9 Add Phase 10-12 implementation: multi-tenant, marketplace, observability
Major additions:
- marketplace/: Agent template registry with FTS5 search, ratings, versioning
- observability/: Prometheus metrics, distributed tracing, structured logging
- ledger/migrations/: Database migration scripts for multi-tenant support
- tests/governance/: 15 new test files for phases 6-12 (295 total tests)
- bin/validate-phases: Full 12-phase validation script

New features:
- Multi-tenant support with tenant isolation and quota enforcement
- Agent marketplace with semantic versioning and search
- Observability with metrics, tracing, and log correlation
- Tier-1 agent bootstrap scripts

Updated components:
- ledger/api.py: Extended API for tenants, marketplace, observability
- ledger/schema.sql: Added tenant, project, marketplace tables
- testing/framework.ts: Enhanced test framework
- checkpoint/checkpoint.py: Improved checkpoint management

Archived:
- External integrations (Slack/GitHub/PagerDuty) moved to .archive/
- Old checkpoint files cleaned up

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 18:39:47 -05:00

observability/logging.py

#!/usr/bin/env python3
"""
Structured Logging Module
=========================

Provides JSON-structured logging with trace correlation.

Usage:
    from observability.logging import get_logger

    logger = get_logger("my_module")
    logger.info("Operation completed", user_id="123", duration_ms=45.2)
"""
import json
import logging
import sys
import sqlite3
import threading
import traceback
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Any, Dict
from dataclasses import dataclass

from .tracing import get_current_trace_id, get_current_span

# =============================================================================
# Configuration
# =============================================================================
DB_PATH = Path("/opt/agent-governance/ledger/governance.db")
LOG_LEVEL = logging.INFO
LOG_TO_DB = True
LOG_RETENTION_DAYS = 7
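# Note: setting LOG_TO_DB = False keeps the JSON stream on stdout but skips
# SQLite persistence (LogStorage.save() returns early when the flag is off).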

# =============================================================================
# Structured Log Entry
# =============================================================================
@dataclass
class LogEntry:
    """Structured log entry"""
    timestamp: str
    level: str
    logger: str
    message: str
    trace_id: Optional[str] = None
    span_id: Optional[str] = None
    tenant_id: str = "default"
    attributes: Optional[Dict[str, Any]] = None

    def to_dict(self) -> dict:
        result = {
            "timestamp": self.timestamp,
            "level": self.level,
            "logger": self.logger,
            "message": self.message,
        }
        if self.trace_id:
            result["trace_id"] = self.trace_id
        if self.span_id:
            result["span_id"] = self.span_id
        if self.tenant_id:
            result["tenant_id"] = self.tenant_id
        if self.attributes:
            result.update(self.attributes)
        return result

    def to_json(self) -> str:
        return json.dumps(self.to_dict())
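
# Example (illustrative values): attributes are merged flat into the JSON
# object, so keyword arguments passed to the logger become top-level fields:
#   LogEntry(timestamp="2026-01-24 23:39:47+00:00", level="INFO",
#            logger="my_module", message="Operation completed",
#            attributes={"user_id": "123"}).to_json()
#   -> '{"timestamp": "...", "level": "INFO", "logger": "my_module",
#        "message": "Operation completed", "tenant_id": "default",
#        "user_id": "123"}'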

# =============================================================================
# Log Storage
# =============================================================================
class LogStorage:
    """Stores logs in SQLite for querying"""
    _instance = None
    _lock = threading.Lock()

    def __new__(cls):
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        if self._initialized:
            return
        self._ensure_table()
        self._initialized = True

    def _ensure_table(self):
        """Create logs table if not exists"""
        try:
            conn = sqlite3.connect(DB_PATH)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS logs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp TEXT NOT NULL,
                    level TEXT NOT NULL,
                    logger TEXT NOT NULL,
                    message TEXT NOT NULL,
                    trace_id TEXT,
                    span_id TEXT,
                    tenant_id TEXT DEFAULT 'default',
                    attributes TEXT,  -- JSON
                    created_at TEXT DEFAULT CURRENT_TIMESTAMP
                )
            """)
            conn.execute("CREATE INDEX IF NOT EXISTS idx_logs_timestamp ON logs(timestamp)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_logs_level ON logs(level)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_logs_trace ON logs(trace_id)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_logs_tenant ON logs(tenant_id)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_logs_logger ON logs(logger)")
            conn.commit()
            conn.close()
        except Exception as e:
            print(f"Error creating logs table: {e}", file=sys.stderr)

    def save(self, entry: LogEntry):
        """Save log entry to database"""
        if not LOG_TO_DB:
            return
        try:
            conn = sqlite3.connect(DB_PATH)
            conn.execute("""
                INSERT INTO logs (timestamp, level, logger, message, trace_id, span_id, tenant_id, attributes)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                entry.timestamp, entry.level, entry.logger, entry.message,
                entry.trace_id, entry.span_id, entry.tenant_id,
                json.dumps(entry.attributes) if entry.attributes else None
            ))
            conn.commit()
            conn.close()
        except Exception as e:
            print(f"Error saving log: {e}", file=sys.stderr)

    def search(
        self,
        level: Optional[str] = None,
        logger: Optional[str] = None,
        trace_id: Optional[str] = None,
        tenant_id: Optional[str] = None,
        message_contains: Optional[str] = None,
        start_time: Optional[str] = None,
        end_time: Optional[str] = None,
        limit: int = 100
    ) -> list:
        """Search logs"""
        try:
            conn = sqlite3.connect(DB_PATH)
            conn.row_factory = sqlite3.Row
            query = "SELECT * FROM logs WHERE 1=1"
            params = []
            if level:
                query += " AND level = ?"
                params.append(level)
            if logger:
                query += " AND logger = ?"
                params.append(logger)
            if trace_id:
                query += " AND trace_id = ?"
                params.append(trace_id)
            if tenant_id:
                query += " AND tenant_id = ?"
                params.append(tenant_id)
            if message_contains:
                query += " AND message LIKE ?"
                params.append(f"%{message_contains}%")
            if start_time:
                query += " AND timestamp >= ?"
                params.append(start_time)
            if end_time:
                query += " AND timestamp <= ?"
                params.append(end_time)
            query += " ORDER BY timestamp DESC LIMIT ?"
            params.append(limit)
            cursor = conn.execute(query, params)
            results = []
            for row in cursor.fetchall():
                entry = dict(row)
                entry["attributes"] = json.loads(entry["attributes"]) if entry.get("attributes") else {}
                results.append(entry)
            conn.close()
            return results
        except Exception as e:
            print(f"Error searching logs: {e}", file=sys.stderr)
            return []

    def cleanup(self, days: int = LOG_RETENTION_DAYS):
        """Remove old logs"""
        try:
            conn = sqlite3.connect(DB_PATH)
            # Bind the interval as a parameter instead of interpolating it
            # into the SQL string.
            conn.execute(
                "DELETE FROM logs WHERE timestamp < datetime('now', ?)",
                (f"-{days} days",)
            )
            conn.commit()
            deleted = conn.total_changes
            conn.close()
            return deleted
        except Exception as e:
            print(f"Error cleaning up logs: {e}", file=sys.stderr)
            return 0
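
# Example (illustrative): query recent errors for a single trace directly
# through the storage layer:
#   storage = LogStorage()
#   errors = storage.search(level="ERROR", trace_id="abc123", limit=50)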

# =============================================================================
# JSON Formatter
# =============================================================================
class JSONFormatter(logging.Formatter):
    """Format log records as JSON"""
    def format(self, record: logging.LogRecord) -> str:
        # Get trace context
        trace_id = get_current_trace_id()
        span = get_current_span()
        span_id = span.span_id if span else None
        # Build entry. Use a space-separated UTC timestamp so lexicographic
        # comparisons against SQLite's datetime('now', ...) work in queries;
        # the default "T" separator sorts after the space SQLite emits, which
        # would break the time-window filters below.
        entry = LogEntry(
            timestamp=datetime.now(timezone.utc).isoformat(sep=" "),
            level=record.levelname,
            logger=record.name,
            message=record.getMessage(),
            trace_id=trace_id,
            span_id=span_id,
            tenant_id=getattr(record, 'tenant_id', 'default'),
            attributes=getattr(record, 'extra_attrs', None)
        )
        # Store in DB
        storage = LogStorage()
        storage.save(entry)
        return entry.to_json()
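
# Each formatted record is one JSON line (emitted on stdout by the stream
# handler below) and, when LOG_TO_DB is enabled, is also persisted to the
# logs table as a side effect of formatting.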

# =============================================================================
# Custom Logger
# =============================================================================
class StructuredLogger:
    """Logger with structured output and trace correlation"""
    def __init__(self, name: str, tenant_id: str = "default"):
        self.name = name
        self.tenant_id = tenant_id
        self._logger = logging.getLogger(name)
        self._logger.setLevel(LOG_LEVEL)
        # Don't bubble records up to the root logger, which would duplicate
        # output if the root has its own handlers.
        self._logger.propagate = False
        # Add JSON handler if not already present
        if not any(isinstance(h, logging.StreamHandler) and isinstance(h.formatter, JSONFormatter)
                   for h in self._logger.handlers):
            handler = logging.StreamHandler(sys.stdout)
            handler.setFormatter(JSONFormatter())
            self._logger.addHandler(handler)

    def _log(self, level: int, message: str, **kwargs):
        """Internal log method"""
        extra = {
            'tenant_id': kwargs.pop('tenant_id', self.tenant_id),
            'extra_attrs': kwargs if kwargs else None
        }
        self._logger.log(level, message, extra=extra)

    def debug(self, message: str, **kwargs):
        """Log debug message"""
        self._log(logging.DEBUG, message, **kwargs)

    def info(self, message: str, **kwargs):
        """Log info message"""
        self._log(logging.INFO, message, **kwargs)

    def warning(self, message: str, **kwargs):
        """Log warning message"""
        self._log(logging.WARNING, message, **kwargs)

    def error(self, message: str, **kwargs):
        """Log error message"""
        self._log(logging.ERROR, message, **kwargs)

    def critical(self, message: str, **kwargs):
        """Log critical message"""
        self._log(logging.CRITICAL, message, **kwargs)

    def exception(self, message: str, **kwargs):
        """Log exception with traceback"""
        kwargs['traceback'] = traceback.format_exc()
        self._log(logging.ERROR, message, **kwargs)
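
# Example (illustrative): every keyword argument becomes a structured field,
# and tenant_id can be overridden per call:
#   log = StructuredLogger("billing", tenant_id="acme")
#   log.error("Quota exceeded", quota=1000, used=1042, tenant_id="acme-staging")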

# =============================================================================
# Logger Factory
# =============================================================================
_loggers: Dict[str, StructuredLogger] = {}

def get_logger(name: str, tenant_id: str = "default") -> StructuredLogger:
    """Get or create a structured logger"""
    key = f"{name}:{tenant_id}"
    if key not in _loggers:
        _loggers[key] = StructuredLogger(name, tenant_id)
    return _loggers[key]

# =============================================================================
# FastAPI Router
# =============================================================================
from fastapi import APIRouter, Query, HTTPException

router = APIRouter(prefix="/logs", tags=["Logging"])

@router.get("")
async def search_logs(
    level: Optional[str] = None,
    logger: Optional[str] = None,
    trace_id: Optional[str] = None,
    tenant_id: Optional[str] = None,
    message: Optional[str] = None,
    start_time: Optional[str] = None,
    end_time: Optional[str] = None,
    limit: int = Query(100, le=500)
):
    """Search logs"""
    storage = LogStorage()
    logs = storage.search(
        level=level,
        logger=logger,
        trace_id=trace_id,
        tenant_id=tenant_id,
        message_contains=message,
        start_time=start_time,
        end_time=end_time,
        limit=limit
    )
    return {"logs": logs, "count": len(logs)}

@router.get("/trace/{trace_id}")
async def get_logs_by_trace(trace_id: str, limit: int = Query(100, le=500)):
    """Get all logs for a trace"""
    storage = LogStorage()
    logs = storage.search(trace_id=trace_id, limit=limit)
    return {"logs": logs, "count": len(logs), "trace_id": trace_id}

@router.get("/stats")
async def get_log_stats():
    """Get log statistics"""
    try:
        conn = sqlite3.connect(DB_PATH)
        conn.row_factory = sqlite3.Row
        stats = {}
        # Count by level
        cursor = conn.execute("""
            SELECT level, COUNT(*) as count
            FROM logs
            WHERE timestamp > datetime('now', '-1 hour')
            GROUP BY level
        """)
        stats["last_hour_by_level"] = {row["level"]: row["count"] for row in cursor.fetchall()}
        # Count by logger
        cursor = conn.execute("""
            SELECT logger, COUNT(*) as count
            FROM logs
            WHERE timestamp > datetime('now', '-1 hour')
            GROUP BY logger
            ORDER BY count DESC
            LIMIT 10
        """)
        stats["last_hour_by_logger"] = {row["logger"]: row["count"] for row in cursor.fetchall()}
        # Total count
        cursor = conn.execute("SELECT COUNT(*) as total FROM logs")
        stats["total_logs"] = cursor.fetchone()["total"]
        # Error count last 24h
        cursor = conn.execute("""
            SELECT COUNT(*) as count FROM logs
            WHERE level IN ('ERROR', 'CRITICAL')
            AND timestamp > datetime('now', '-1 day')
        """)
        stats["errors_last_24h"] = cursor.fetchone()["count"]
        conn.close()
        return stats
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
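
# A sample /logs/stats response (values are illustrative):
#   {"last_hour_by_level": {"INFO": 240, "ERROR": 3},
#    "last_hour_by_logger": {"ledger.api": 180, "marketplace": 60},
#    "total_logs": 51234, "errors_last_24h": 17}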

@router.post("/cleanup")
async def cleanup_logs(days: int = Query(7, ge=1, le=365)):
    """Clean up old logs"""
    storage = LogStorage()
    deleted = storage.cleanup(days)
    return {"deleted": deleted, "retention_days": days}