profit 77655c298c Initial commit: Agent Governance System Phase 8
Phase 8 Production Hardening with complete governance infrastructure:

- Vault integration with tiered policies (T0-T4)
- DragonflyDB state management
- SQLite audit ledger
- Pipeline DSL and templates
- Promotion/revocation engine
- Checkpoint system for session persistence
- Health manager and circuit breaker for fault tolerance
- GitHub/Slack integrations
- Architectural test pipeline with bug watcher, suggestion engine, council review
- Multi-agent chaos testing framework

Test Results:
- Governance tests: 68/68 passing
- E2E workflow: 16/16 passing
- Phase 2 Vault: 14/14 passing
- Integration tests: 27/27 passing

Coverage: 57.6% average across 12 phases

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 22:07:06 -05:00

738 lines
23 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Ledger API - FastAPI Service
============================
RESTful API for agent governance ledger access.
Endpoints:
- /health - Health check
- /agents - Agent metrics CRUD
- /actions - Agent action logs
- /violations - Violation records
- /promotions - Promotion history
- /orchestration - Orchestration logs
- /stats - Overall ledger statistics
Authentication: Vault token in X-Vault-Token header
"""
import json
import os
import sqlite3
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, List, Any
from contextlib import contextmanager
from fastapi import FastAPI, HTTPException, Header, Query, Depends
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
import uvicorn
# =============================================================================
# Configuration
# =============================================================================
# Filesystem location of the SQLite ledger database opened by get_db().
DB_PATH = Path("/opt/agent-governance/ledger/governance.db")
# Base URL of the Vault server used for token lookups; overridable via the
# VAULT_ADDR environment variable.
VAULT_ADDR = os.environ.get("VAULT_ADDR", "https://127.0.0.1:8200")
# TCP port the API listens on; LEDGER_API_PORT must parse as an integer.
API_PORT = int(os.environ.get("LEDGER_API_PORT", "8080"))
# Token verification is enforced only when LEDGER_API_AUTH equals "true"
# (case-insensitive, and the default); any other value disables it.
REQUIRE_AUTH = os.environ.get("LEDGER_API_AUTH", "true").lower() == "true"
# =============================================================================
# Database
# =============================================================================
@contextmanager
def get_db():
    """Yield a SQLite connection to the ledger and close it on exit.

    Rows fetched through the connection are ``sqlite3.Row`` objects, so
    columns can be read by name. Nothing is committed automatically; callers
    that write must call ``commit()`` themselves.
    """
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    try:
        yield connection
    finally:
        # Always release the handle, even if the caller's block raised.
        connection.close()
def row_to_dict(row: sqlite3.Row) -> dict:
    """Return a plain ``dict`` mapping each column name of *row* to its value."""
    return {column: row[column] for column in row.keys()}
def rows_to_list(rows: list) -> list:
    """Convert each ``sqlite3.Row`` in *rows* to a dict, preserving order."""
    return list(map(row_to_dict, rows))
# =============================================================================
# Authentication
# =============================================================================
async def verify_vault_token(x_vault_token: Optional[str] = Header(None)) -> dict:
    """Validate the caller's Vault token and describe the authenticated identity.

    Used as a FastAPI dependency. The token from the ``X-Vault-Token`` header
    is checked by calling Vault's ``auth/token/lookup-self`` endpoint through
    ``curl``.

    Args:
        x_vault_token: Value of the ``X-Vault-Token`` request header, if any.

    Returns:
        A dict with ``authenticated`` and ``policies`` keys; for a verified
        token it also carries the token's ``accessor`` and ``display_name``.
        When ``REQUIRE_AUTH`` is false the lookup is skipped and the wildcard
        policy list ``["*"]`` is returned.

    Raises:
        HTTPException: 401 when the header is missing, Vault reports errors,
            or Vault's reply is not valid JSON; 503 when the lookup times
            out; 500 for any other failure.
    """
    if not REQUIRE_AUTH:
        return {"authenticated": False, "policies": ["*"]}
    if not x_vault_token:
        raise HTTPException(status_code=401, detail="Missing X-Vault-Token header")
    try:
        # NOTE(review): "-k" disables TLS certificate verification and the
        # token is passed on the curl command line, where it is visible in
        # the process list. Consider an in-process HTTPS client.
        result = subprocess.run([
            "curl", "-sk",
            "-H", f"X-Vault-Token: {x_vault_token}",
            f"{VAULT_ADDR}/v1/auth/token/lookup-self"
        ], capture_output=True, text=True, timeout=10)
        data = json.loads(result.stdout)
        if "errors" in data:
            raise HTTPException(status_code=401, detail="Invalid Vault token")
        token_info = data.get("data", {})
        return {
            "authenticated": True,
            "policies": token_info.get("policies", []),
            "accessor": token_info.get("accessor"),
            "display_name": token_info.get("display_name")
        }
    except HTTPException:
        # HTTPException is an Exception subclass; without this clause the 401
        # raised above would be caught below and turned into a 500.
        raise
    except subprocess.TimeoutExpired:
        raise HTTPException(status_code=503, detail="Vault timeout")
    except json.JSONDecodeError:
        raise HTTPException(status_code=401, detail="Invalid Vault response")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Auth error: {str(e)}")
# =============================================================================
# Models
# =============================================================================
class AgentMetrics(BaseModel):
    """Request body for ``POST /agents``: one full ``agent_metrics`` row."""

    # Identifier of the agent the metrics belong to.
    agent_id: str

    # Tier and run counters; every counter defaults to zero.
    current_tier: int = 0
    compliant_runs: int = 0
    consecutive_compliant: int = 0
    total_runs: int = 0

    # Optional timestamps, stored as the strings supplied by the caller.
    last_violation_at: Optional[str] = None
    last_active_at: Optional[str] = None

    # Stored as an integer; presumably a 0/1 flag -- TODO confirm.
    promotion_eligible: int = 0
class AgentMetricsUpdate(BaseModel):
    """Request body for ``PATCH /agents/{agent_id}``.

    Every field is optional; a field left as ``None`` is not modified by the
    update endpoint.
    """

    current_tier: Optional[int] = None
    compliant_runs: Optional[int] = None
    consecutive_compliant: Optional[int] = None
    total_runs: Optional[int] = None
    promotion_eligible: Optional[int] = None
class AgentAction(BaseModel):
    """Request body for ``POST /actions``: one entry of the agent action log."""

    # Who acted, and at which tier.
    agent_id: str
    agent_version: str = "1.0"
    tier: int

    # What was done and what was decided.
    action: str
    decision: str
    confidence: float  # range is not validated here
    target: Optional[str] = None
    side_effects: Optional[str] = None

    # Outcome. Stored as an integer; presumably 0/1 -- TODO confirm.
    success: int
    error_type: Optional[str] = None
    error_message: Optional[str] = None

    # Optional correlation data.
    vault_token_accessor: Optional[str] = None
    session_id: Optional[str] = None
class Violation(BaseModel):
    """Request body for ``POST /violations``: a reported violation."""

    agent_id: str
    violation_type: str
    # Expected values: low, medium, high, critical (not enforced by the model).
    severity: str
    description: str

    # Optional supporting details.
    triggering_action: Optional[str] = None
    evidence: Optional[str] = None
    remediation: Optional[str] = None
class ViolationAcknowledge(BaseModel):
    """Request body for ``POST /violations/{violation_id}/acknowledge``."""

    # Written to the violation's ``acknowledged_by`` column.
    acknowledged_by: str
class Promotion(BaseModel):
    """Request body for ``POST /promotions``: a tier change for one agent."""

    agent_id: str
    from_tier: int
    # Becomes the agent's ``current_tier`` when the promotion is recorded.
    to_tier: int
    approved_by: str

    # Optional justification.
    rationale: Optional[str] = None
    evidence: Optional[str] = None
class OrchestrationLog(BaseModel):
    """Request body for ``POST /orchestration``: one orchestration event."""

    # Optional correlation identifiers.
    session_id: Optional[str] = None
    agent_id: Optional[str] = None

    # How the command was dispatched.
    orchestration_mode: str
    model_id: Optional[str] = None

    # The command and its result.
    command_type: str
    command: str
    response: Optional[str] = None
    tokens_used: Optional[int] = None

    # Stored as an integer; presumably 0/1 -- TODO confirm.
    success: int
# =============================================================================
# FastAPI App
# =============================================================================
# ASGI application; interactive documentation is served at /docs and /redoc.
app = FastAPI(
    title="Agent Governance Ledger API",
    description="RESTful API for agent governance ledger access",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc"
)
# Cross-origin access is open to every origin, method and header.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    # Credentialed CORS must not be combined with wildcard origins: it makes
    # the middleware reflect any requesting origin with credentials allowed.
    # The API authenticates via the X-Vault-Token header rather than cookies,
    # so credential support is not needed.
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# =============================================================================
# Health Endpoints
# =============================================================================
@app.get("/health")
async def health_check():
    """Report whether the ledger database file and Vault are available.

    The database is considered present when ``DB_PATH`` exists. Vault is
    probed with ``docker exec vault vault status``; it counts as available
    only when that command exits with status 0.

    Returns:
        A dict with ``status`` ("healthy" when the database file exists,
        otherwise "degraded"), ``database``, ``vault`` and a UTC ISO-8601
        ``timestamp``. Vault availability does not affect ``status``.
    """
    db_ok = DB_PATH.exists()
    vault_ok = False
    try:
        result = subprocess.run(
            ["docker", "exec", "vault", "vault", "status", "-format=json"],
            capture_output=True, text=True, timeout=5
        )
    except (OSError, subprocess.SubprocessError):
        # docker could not be started or the probe timed out: report Vault as
        # unavailable. Unrelated exceptions are no longer swallowed.
        pass
    else:
        vault_ok = result.returncode == 0
    return {
        "status": "healthy" if db_ok else "degraded",
        "database": "ok" if db_ok else "missing",
        "vault": "ok" if vault_ok else "unavailable",
        "timestamp": datetime.now(timezone.utc).isoformat()
    }
@app.get("/")
async def root():
    """Return the service name, version and a map of top-level endpoints."""
    resources = (
        "health",
        "agents",
        "actions",
        "violations",
        "promotions",
        "orchestration",
    )
    # Each resource is served under a path equal to its name.
    endpoint_map = {name: f"/{name}" for name in resources}
    return {
        "service": "Agent Governance Ledger API",
        "version": "1.0.0",
        "endpoints": endpoint_map,
        "docs": "/docs"
    }
# =============================================================================
# Agent Metrics Endpoints
# =============================================================================
@app.get("/agents")
async def list_agents(
    limit: int = Query(100, ge=0, le=1000),
    offset: int = Query(0, ge=0),
    tier: Optional[int] = None,
    auth: dict = Depends(verify_vault_token)
):
    """List agent metrics rows, most recently updated first.

    Args:
        limit: Maximum number of rows to return (0-1000). Negative values are
            rejected because SQLite treats a negative LIMIT as "no limit",
            which would bypass the 1000-row cap.
        offset: Number of rows to skip.
        tier: When given, only agents whose ``current_tier`` equals it.
        auth: Result of ``verify_vault_token``; present to enforce
            authentication.

    Returns:
        ``{"agents": [...], "count": n}`` where ``count`` is the number of
        rows in the returned page.
    """
    with get_db() as conn:
        query = "SELECT * FROM agent_metrics"
        params = []
        if tier is not None:
            query += " WHERE current_tier = ?"
            params.append(tier)
        query += " ORDER BY updated_at DESC LIMIT ? OFFSET ?"
        params.extend([limit, offset])
        cursor = conn.execute(query, params)
        agents = rows_to_list(cursor.fetchall())
        return {"agents": agents, "count": len(agents)}
@app.get("/agents/{agent_id}")
async def get_agent(agent_id: str, auth: dict = Depends(verify_vault_token)):
    """Return the metrics row of one agent.

    Raises:
        HTTPException: 404 when no row has the given ``agent_id``.
    """
    lookup_sql = "SELECT * FROM agent_metrics WHERE agent_id = ?"
    with get_db() as conn:
        record = conn.execute(lookup_sql, (agent_id,)).fetchone()
        if not record:
            raise HTTPException(status_code=404, detail="Agent not found")
        return row_to_dict(record)
@app.post("/agents")
async def create_agent(agent: AgentMetrics, auth: dict = Depends(verify_vault_token)):
    """Insert an agent's metrics row, replacing a conflicting existing row.

    ``updated_at`` is set to the database's current time. The response status
    is always "created", whether the row was inserted or replaced.
    """
    # Values in the same order as the column list of the statement below.
    record = (
        agent.agent_id,
        agent.current_tier,
        agent.compliant_runs,
        agent.consecutive_compliant,
        agent.total_runs,
        agent.last_violation_at,
        agent.last_active_at,
        agent.promotion_eligible,
    )
    with get_db() as conn:
        conn.execute("""
INSERT OR REPLACE INTO agent_metrics
(agent_id, current_tier, compliant_runs, consecutive_compliant,
total_runs, last_violation_at, last_active_at, promotion_eligible, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, datetime('now'))
""", record)
        conn.commit()
        return {"status": "created", "agent_id": agent.agent_id}
@app.patch("/agents/{agent_id}")
async def update_agent(
    agent_id: str,
    update: AgentMetricsUpdate,
    auth: dict = Depends(verify_vault_token)
):
    """Update only the metrics fields supplied in the request body.

    Fields left as ``None`` are untouched; ``updated_at`` is always refreshed.

    Raises:
        HTTPException: 400 when the body supplies no field, 404 when no row
            has the given ``agent_id``.
    """
    # Updatable columns, in the order they appear in the SET clause.
    column_names = (
        "current_tier",
        "compliant_runs",
        "consecutive_compliant",
        "total_runs",
        "promotion_eligible",
    )
    supplied = {
        name: getattr(update, name)
        for name in column_names
        if getattr(update, name) is not None
    }
    if not supplied:
        raise HTTPException(status_code=400, detail="No fields to update")

    # Column names come from the fixed tuple above; values are bound as
    # parameters.
    assignments = [f"{name} = ?" for name in supplied]
    assignments.append("updated_at = datetime('now')")
    bindings = [*supplied.values(), agent_id]

    with get_db() as conn:
        cursor = conn.execute(
            f"UPDATE agent_metrics SET {', '.join(assignments)} WHERE agent_id = ?",
            bindings
        )
        conn.commit()
        if cursor.rowcount == 0:
            raise HTTPException(status_code=404, detail="Agent not found")
        return {"status": "updated", "agent_id": agent_id}
# =============================================================================
# Actions Endpoints
# =============================================================================
@app.get("/actions")
async def list_actions(
    limit: int = Query(100, ge=0, le=1000),
    offset: int = Query(0, ge=0),
    agent_id: Optional[str] = None,
    success: Optional[int] = None,
    auth: dict = Depends(verify_vault_token)
):
    """List logged agent actions, newest first.

    Args:
        limit: Maximum number of rows to return (0-1000). Negative values are
            rejected because SQLite treats a negative LIMIT as "no limit",
            which would bypass the 1000-row cap.
        offset: Number of rows to skip.
        agent_id: When non-empty, only actions of this agent.
        success: When given, only actions whose ``success`` column equals it.
        auth: Result of ``verify_vault_token``; present to enforce
            authentication.

    Returns:
        ``{"actions": [...], "count": n}`` where ``count`` is the number of
        rows in the returned page.
    """
    with get_db() as conn:
        # "WHERE 1=1" lets every optional filter be appended with AND.
        query = "SELECT * FROM agent_actions WHERE 1=1"
        params = []
        if agent_id:
            query += " AND agent_id = ?"
            params.append(agent_id)
        if success is not None:
            query += " AND success = ?"
            params.append(success)
        query += " ORDER BY timestamp DESC LIMIT ? OFFSET ?"
        params.extend([limit, offset])
        cursor = conn.execute(query, params)
        actions = rows_to_list(cursor.fetchall())
        return {"actions": actions, "count": len(actions)}
@app.post("/actions")
async def create_action(action: AgentAction, auth: dict = Depends(verify_vault_token)):
    """Append one entry to the agent action log.

    The entry is stamped with the current UTC time in ISO-8601 form.

    Returns:
        ``{"status": "created", "id": <new row id>, "timestamp": <stamp>}``.
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    # Values in the same order as the column list of the statement below.
    record = (
        timestamp,
        action.agent_id,
        action.agent_version,
        action.tier,
        action.action,
        action.decision,
        action.confidence,
        action.target,
        action.side_effects,
        action.success,
        action.error_type,
        action.error_message,
        action.vault_token_accessor,
        action.session_id,
    )
    with get_db() as conn:
        inserted = conn.execute("""
INSERT INTO agent_actions
(timestamp, agent_id, agent_version, tier, action, decision, confidence,
target, side_effects, success, error_type, error_message,
vault_token_accessor, session_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", record)
        conn.commit()
        return {"status": "created", "id": inserted.lastrowid, "timestamp": timestamp}
# =============================================================================
# Violations Endpoints
# =============================================================================
@app.get("/violations")
async def list_violations(
    limit: int = Query(100, ge=0, le=1000),
    offset: int = Query(0, ge=0),
    agent_id: Optional[str] = None,
    severity: Optional[str] = None,
    acknowledged: Optional[int] = None,
    auth: dict = Depends(verify_vault_token)
):
    """List violation records, newest first.

    Args:
        limit: Maximum number of rows to return (0-1000). Negative values are
            rejected because SQLite treats a negative LIMIT as "no limit",
            which would bypass the 1000-row cap.
        offset: Number of rows to skip.
        agent_id: When non-empty, only violations of this agent.
        severity: When non-empty, only violations with this severity.
        acknowledged: When given, only violations whose ``acknowledged``
            column equals it.
        auth: Result of ``verify_vault_token``; present to enforce
            authentication.

    Returns:
        ``{"violations": [...], "count": n}`` where ``count`` is the number
        of rows in the returned page.
    """
    with get_db() as conn:
        # "WHERE 1=1" lets every optional filter be appended with AND.
        query = "SELECT * FROM violations WHERE 1=1"
        params = []
        if agent_id:
            query += " AND agent_id = ?"
            params.append(agent_id)
        if severity:
            query += " AND severity = ?"
            params.append(severity)
        if acknowledged is not None:
            query += " AND acknowledged = ?"
            params.append(acknowledged)
        query += " ORDER BY timestamp DESC LIMIT ? OFFSET ?"
        params.extend([limit, offset])
        cursor = conn.execute(query, params)
        violations = rows_to_list(cursor.fetchall())
        return {"violations": violations, "count": len(violations)}
@app.get("/violations/{violation_id}")
async def get_violation(violation_id: int, auth: dict = Depends(verify_vault_token)):
    """Return one violation record.

    Raises:
        HTTPException: 404 when no violation has the given ``id``.
    """
    lookup_sql = "SELECT * FROM violations WHERE id = ?"
    with get_db() as conn:
        record = conn.execute(lookup_sql, (violation_id,)).fetchone()
        if not record:
            raise HTTPException(status_code=404, detail="Violation not found")
        return row_to_dict(record)
@app.post("/violations")
async def create_violation(
    violation: Violation,
    auth: dict = Depends(verify_vault_token)
):
    """Store a reported violation.

    The record is stamped with the current UTC time in ISO-8601 form.

    Returns:
        ``{"status": "created", "id": <new row id>, "timestamp": <stamp>}``.
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    # Values in the same order as the column list of the statement below.
    record = (
        timestamp,
        violation.agent_id,
        violation.violation_type,
        violation.severity,
        violation.description,
        violation.triggering_action,
        violation.evidence,
        violation.remediation,
    )
    with get_db() as conn:
        inserted = conn.execute("""
INSERT INTO violations
(timestamp, agent_id, violation_type, severity, description,
triggering_action, evidence, remediation)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""", record)
        conn.commit()
        return {"status": "created", "id": inserted.lastrowid, "timestamp": timestamp}
@app.post("/violations/{violation_id}/acknowledge")
async def acknowledge_violation(
    violation_id: int,
    ack: ViolationAcknowledge,
    auth: dict = Depends(verify_vault_token)
):
    """Mark a violation as acknowledged and record who acknowledged it.

    Raises:
        HTTPException: 404 when no violation has the given ``id``.
    """
    bindings = (ack.acknowledged_by, violation_id)
    with get_db() as conn:
        outcome = conn.execute("""
UPDATE violations
SET acknowledged = 1, acknowledged_by = ?
WHERE id = ?
""", bindings)
        conn.commit()
        # No affected row means the id does not exist.
        if outcome.rowcount == 0:
            raise HTTPException(status_code=404, detail="Violation not found")
        return {"status": "acknowledged", "id": violation_id}
# =============================================================================
# Promotions Endpoints
# =============================================================================
@app.get("/promotions")
async def list_promotions(
    limit: int = Query(100, ge=0, le=1000),
    offset: int = Query(0, ge=0),
    agent_id: Optional[str] = None,
    auth: dict = Depends(verify_vault_token)
):
    """List promotion records, newest first.

    Args:
        limit: Maximum number of rows to return (0-1000). Negative values are
            rejected because SQLite treats a negative LIMIT as "no limit",
            which would bypass the 1000-row cap.
        offset: Number of rows to skip.
        agent_id: When non-empty, only promotions of this agent.
        auth: Result of ``verify_vault_token``; present to enforce
            authentication.

    Returns:
        ``{"promotions": [...], "count": n}`` where ``count`` is the number
        of rows in the returned page.
    """
    with get_db() as conn:
        query = "SELECT * FROM promotions"
        params = []
        if agent_id:
            query += " WHERE agent_id = ?"
            params.append(agent_id)
        query += " ORDER BY timestamp DESC LIMIT ? OFFSET ?"
        params.extend([limit, offset])
        cursor = conn.execute(query, params)
        promotions = rows_to_list(cursor.fetchall())
        return {"promotions": promotions, "count": len(promotions)}
@app.post("/promotions")
async def create_promotion(
    promotion: Promotion,
    auth: dict = Depends(verify_vault_token)
):
    """Record a promotion and apply the new tier to the agent's metrics.

    The promotion row and the ``agent_metrics`` tier change are written in a
    single transaction, so a failure in either statement leaves neither
    applied. An unknown ``agent_id`` still yields a promotion record; the
    metrics update then simply matches no row.

    Returns:
        ``{"status": "created", "id": <new row id>, "timestamp": <stamp>}``.
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    with get_db() as conn:
        cursor = conn.execute("""
INSERT INTO promotions
(timestamp, agent_id, from_tier, to_tier, approved_by, rationale, evidence)
VALUES (?, ?, ?, ?, ?, ?, ?)
""", (
            timestamp, promotion.agent_id, promotion.from_tier, promotion.to_tier,
            promotion.approved_by, promotion.rationale, promotion.evidence
        ))
        promotion_id = cursor.lastrowid
        # Update agent metrics
        conn.execute("""
UPDATE agent_metrics
SET current_tier = ?, updated_at = datetime('now')
WHERE agent_id = ?
""", (promotion.to_tier, promotion.agent_id))
        # Commit once, after both statements: if the update raises, the
        # connection is closed without a commit and the insert is discarded.
        conn.commit()
        return {"status": "created", "id": promotion_id, "timestamp": timestamp}
# =============================================================================
# Orchestration Endpoints
# =============================================================================
@app.get("/orchestration")
async def list_orchestration_logs(
    limit: int = Query(100, ge=0, le=1000),
    offset: int = Query(0, ge=0),
    mode: Optional[str] = None,
    success: Optional[int] = None,
    auth: dict = Depends(verify_vault_token)
):
    """List orchestration log entries, newest first.

    Args:
        limit: Maximum number of rows to return (0-1000). Negative values are
            rejected because SQLite treats a negative LIMIT as "no limit",
            which would bypass the 1000-row cap.
        offset: Number of rows to skip.
        mode: When non-empty, only entries with this ``orchestration_mode``.
        success: When given, only entries whose ``success`` column equals it.
        auth: Result of ``verify_vault_token``; present to enforce
            authentication.

    Returns:
        ``{"logs": [...], "count": n}`` where ``count`` is the number of rows
        in the returned page.
    """
    with get_db() as conn:
        # "WHERE 1=1" lets every optional filter be appended with AND.
        query = "SELECT * FROM orchestration_log WHERE 1=1"
        params = []
        if mode:
            query += " AND orchestration_mode = ?"
            params.append(mode)
        if success is not None:
            query += " AND success = ?"
            params.append(success)
        query += " ORDER BY timestamp DESC LIMIT ? OFFSET ?"
        params.extend([limit, offset])
        cursor = conn.execute(query, params)
        logs = rows_to_list(cursor.fetchall())
        return {"logs": logs, "count": len(logs)}
@app.get("/orchestration/summary")
async def get_orchestration_summary(auth: dict = Depends(verify_vault_token)):
    """Return every row of ``orchestration_summary`` as a list of dicts."""
    with get_db() as conn:
        summary_rows = conn.execute("SELECT * FROM orchestration_summary").fetchall()
        return {"summary": rows_to_list(summary_rows)}
@app.post("/orchestration")
async def create_orchestration_log(
    log: OrchestrationLog,
    auth: dict = Depends(verify_vault_token)
):
    """Append one orchestration event to the log.

    The entry is stamped with the current UTC time in ISO-8601 form.

    Returns:
        ``{"status": "created", "id": <new row id>, "timestamp": <stamp>}``.
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    # Values in the same order as the column list of the statement below.
    record = (
        timestamp,
        log.session_id,
        log.agent_id,
        log.orchestration_mode,
        log.model_id,
        log.command_type,
        log.command,
        log.response,
        log.tokens_used,
        log.success,
    )
    with get_db() as conn:
        inserted = conn.execute("""
INSERT INTO orchestration_log
(timestamp, session_id, agent_id, orchestration_mode, model_id,
command_type, command, response, tokens_used, success)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", record)
        conn.commit()
        return {"status": "created", "id": inserted.lastrowid, "timestamp": timestamp}
# =============================================================================
# Stats Endpoint
# =============================================================================
@app.get("/stats")
async def get_stats(auth: dict = Depends(verify_vault_token)):
    """Return aggregate counts over the whole ledger.

    Returns:
        A dict with these keys:

        - ``agents_by_tier``: agent count per ``current_tier``.
        - ``total_agents``: number of ``agent_metrics`` rows.
        - ``violations_by_severity``: violation count per severity.
        - ``unacknowledged_violations``: violations with ``acknowledged = 0``.
        - ``total_promotions``: number of promotion records.
        - ``actions_last_24h``: actions stamped within the last day.
        - ``total_tokens_used``: sum of ``tokens_used`` over the
          orchestration log (0 when there is nothing to sum).
    """
    with get_db() as conn:
        stats = {}
        # Agent counts by tier
        cursor = conn.execute("""
SELECT current_tier, COUNT(*) as count
FROM agent_metrics
GROUP BY current_tier
""")
        stats["agents_by_tier"] = {row["current_tier"]: row["count"] for row in cursor.fetchall()}
        # Total agents
        cursor = conn.execute("SELECT COUNT(*) as count FROM agent_metrics")
        stats["total_agents"] = cursor.fetchone()["count"]
        # Violation counts by severity
        cursor = conn.execute("""
SELECT severity, COUNT(*) as count
FROM violations
GROUP BY severity
""")
        stats["violations_by_severity"] = {row["severity"]: row["count"] for row in cursor.fetchall()}
        # Unacknowledged violations
        cursor = conn.execute("""
SELECT COUNT(*) as count FROM violations WHERE acknowledged = 0
""")
        stats["unacknowledged_violations"] = cursor.fetchone()["count"]
        # Total promotions
        cursor = conn.execute("SELECT COUNT(*) as count FROM promotions")
        stats["total_promotions"] = cursor.fetchone()["count"]
        # Recent activity (last 24h).
        # Action timestamps are written as ISO-8601 with a "T" separator and a
        # UTC offset, whereas datetime('now', ...) yields "YYYY-MM-DD HH:MM:SS".
        # Comparing the raw strings would match every row from the cutoff's
        # calendar day ("T" sorts after " "), so the stored value is
        # normalised with datetime() before comparing.
        cursor = conn.execute("""
SELECT COUNT(*) as count FROM agent_actions
WHERE datetime(timestamp) > datetime('now', '-1 day')
""")
        stats["actions_last_24h"] = cursor.fetchone()["count"]
        # Orchestration token usage
        cursor = conn.execute("""
SELECT SUM(tokens_used) as total FROM orchestration_log
""")
        row = cursor.fetchone()
        # SUM() is NULL when there are no non-NULL values; report 0 instead.
        stats["total_tokens_used"] = row["total"] or 0
        return stats
# =============================================================================
# Main
# =============================================================================
if __name__ == "__main__":
    # Print a startup banner with the effective configuration and the route
    # table, then serve the app with uvicorn.
    print(f"""
╔══════════════════════════════════════════════════════════╗
║ AGENT GOVERNANCE LEDGER API ║
╚══════════════════════════════════════════════════════════╝
Database: {DB_PATH}
Auth Required: {REQUIRE_AUTH}
Port: {API_PORT}
Endpoints:
GET /health Health check
GET / API info
GET /docs Swagger UI
GET /agents List agents
GET /agents/:id Get agent
POST /agents Create/update agent
PATCH /agents/:id Update agent
GET /actions List actions
POST /actions Log action
GET /violations List violations
GET /violations/:id Get violation
POST /violations Report violation
POST /violations/:id/acknowledge Acknowledge
GET /promotions List promotions
POST /promotions Record promotion
GET /orchestration List orchestration logs
GET /orchestration/summary Summary by mode
POST /orchestration Log orchestration
GET /stats Overall statistics
""")
    # NOTE(review): host 0.0.0.0 listens on every network interface.
    uvicorn.run(app, host="0.0.0.0", port=API_PORT)