agent-governance/testing/framework.ts
profit 77655c298c Initial commit: Agent Governance System Phase 8
Phase 8 Production Hardening with complete governance infrastructure:

- Vault integration with tiered policies (T0-T4)
- DragonflyDB state management
- SQLite audit ledger
- Pipeline DSL and templates
- Promotion/revocation engine
- Checkpoint system for session persistence
- Health manager and circuit breaker for fault tolerance
- GitHub/Slack integrations
- Architectural test pipeline with bug watcher, suggestion engine, council review
- Multi-agent chaos testing framework

Test Results:
- Governance tests: 68/68 passing
- E2E workflow: 16/16 passing
- Phase 2 Vault: 14/14 passing
- Integration tests: 27/27 passing

Coverage: 57.6% average across 12 phases

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 22:07:06 -05:00

657 lines
21 KiB
TypeScript

/**
* Agent Testing Framework
* =======================
* Provides mocks, utilities, and test harnesses for agent development.
*/
import { describe, it, expect, beforeAll, afterAll, beforeEach, mock } from "bun:test";
// =============================================================================
// Type Definitions
// =============================================================================
/**
 * Bundle of identifiers and mock services handed to a scenario's
 * `execute` and `assertions` callbacks.
 */
export interface TestContext {
/** Identifier of the task under test. */
taskId: string;
/** Identifier of the agent under test. */
agentId: string;
/** Moment the context was built; `createTestContext` fills it with `Date.now()` (epoch milliseconds). */
startTime: number;
/** In-memory replacement for the Vault secret store. */
mockVault: MockVault;
/** In-memory replacement for the DragonflyDB key/value store. */
mockDragonfly: MockDragonfly;
/** Scriptable replacement for the LLM completion client. */
mockLLM: MockLLM;
}
/**
 * A single scenario runnable by `TestHarness`.
 *
 * `TestHarness.runAll` awaits `setup`, `execute`, `assertions` and `cleanup`
 * in that order; `cleanup` is also invoked when an earlier step throws.
 */
export interface TestScenario {
/** Scenario name; used in log lines and as the key of the harness result map. */
name: string;
/** Human-readable explanation of what the scenario covers. */
description: string;
/** Preparation step run before `execute`; receives no context. */
setup: () => Promise<void>;
/** Drives the behaviour under test against the supplied context. */
execute: (ctx: TestContext) => Promise<void>;
/** Verifies the outcome; throwing marks the scenario as failed. */
assertions: (ctx: TestContext) => Promise<void>;
/** Teardown step; receives no context. */
cleanup: () => Promise<void>;
}
/** Aggregate outcome returned by `TestHarness.runAll`. */
export interface TestMetrics {
/** Number of scenarios that completed without throwing. */
passed: number;
/** Number of scenarios that threw. */
failed: number;
/** Number of skipped scenarios; `TestHarness.runAll` always reports 0. */
skipped: number;
/** Wall-clock time of the whole run in milliseconds. */
duration: number;
/**
 * Percentage of scenarios that passed (`passed / total * 100`), or 0 when no
 * scenarios were registered. This is a pass rate, not code coverage.
 */
coverage: number;
}
// =============================================================================
// Mock Vault
// =============================================================================
/**
 * In-memory stand-in for the Vault secret store used by agent tests.
 *
 * Holds secrets, named policies (lists of glob rules) and short-lived tokens,
 * and records every secret read in an access log.
 */
export class MockVault {
  private secrets: Map<string, any> = new Map();
  private policies: Map<string, string[]> = new Map();
  private tokens: Map<string, { policy: string; ttl: number; created: number }> = new Map();
  private accessLog: Array<{ path: string; action: string; timestamp: number }> = [];

  /** Seeds the default test secrets and the `t0-observer` / `t1-operator` policies. */
  constructor() {
    // Initialize with default test secrets
    this.secrets.set("api-keys/openrouter", { api_key: "test-key" });
    this.secrets.set("services/dragonfly", {
      host: "127.0.0.1",
      port: 6379,
      password: "test-password",
    });
    // Default policies
    this.policies.set("t0-observer", ["read:secret/data/docs/*", "read:secret/data/inventory/*"]);
    this.policies.set("t1-operator", ["read:ssh/creds/sandbox-*", "read:proxmox/creds/sandbox"]);
  }

  /**
   * Reads a secret and records the read in the access log.
   *
   * @param path Secret path exactly as passed to `setSecret`.
   * @returns The stored value, or `null` when nothing is stored at `path`.
   */
  async getSecret(path: string): Promise<any> {
    this.accessLog.push({ path, action: "read", timestamp: Date.now() });
    // `??` rather than `||` so falsy secrets (0, "", false) are returned as stored.
    return this.secrets.get(path) ?? null;
  }

  /** Stores (or overwrites) the secret at `path`. Not recorded in the access log. */
  setSecret(path: string, value: any): void {
    this.secrets.set(path, value);
  }

  /**
   * Issues a new token bound to a policy name.
   *
   * @param policy Policy name looked up by `checkAccess`.
   * @param ttl Lifetime in seconds (default 3600).
   * @returns A token string prefixed with `hvs.test-`.
   */
  async createToken(policy: string, ttl: number = 3600): Promise<string> {
    const token = "hvs.test-" + Math.random().toString(36).slice(2);
    this.tokens.set(token, { policy, ttl, created: Date.now() });
    return token;
  }

  /** @returns `true` when the token exists and its TTL has not elapsed. */
  async validateToken(token: string): Promise<boolean> {
    const tokenData = this.tokens.get(token);
    return tokenData !== undefined && MockVault.isLive(tokenData);
  }

  /** Removes the token; revoking an unknown token is a no-op. */
  async revokeToken(token: string): Promise<void> {
    this.tokens.delete(token);
  }

  /**
   * Decides whether a token may access `path`.
   *
   * `path` is compared against the policy entries as whole strings, so it must
   * be written in the same form as they are, capability prefix included
   * (for example `read:secret/data/docs/readme`). `*` in an entry matches any
   * run of characters; every other character is literal.
   *
   * @returns `false` for unknown or expired tokens, for tokens whose policy is
   *          not registered, and when no entry matches.
   */
  checkAccess(token: string, path: string): boolean {
    const tokenData = this.tokens.get(token);
    // An expired token must not grant access, mirroring validateToken().
    if (!tokenData || !MockVault.isLive(tokenData)) return false;
    const allowedPaths = this.policies.get(tokenData.policy) ?? [];
    return allowedPaths.some(rule => MockVault.globToRegExp(rule).test(path));
  }

  /** @returns The live access-log array (not a copy), oldest entry first. */
  getAccessLog(): Array<{ path: string; action: string; timestamp: number }> {
    return this.accessLog;
  }

  /** Clears the access log and all tokens; secrets and policies are kept. */
  reset(): void {
    this.accessLog = [];
    this.tokens.clear();
  }

  /** True while fewer than `ttl` seconds have passed since the token was created. */
  private static isLive(tokenData: { ttl: number; created: number }): boolean {
    const elapsedSeconds = (Date.now() - tokenData.created) / 1000;
    return elapsedSeconds < tokenData.ttl;
  }

  /** Compiles a glob rule into an anchored RegExp in which only `*` is a wildcard. */
  private static globToRegExp(glob: string): RegExp {
    const body = glob
      .split("*")
      .map(literal => literal.replace(/[.+?^${}()|[\]\\]/g, "\\$&"))
      .join(".*");
    return new RegExp(`^${body}$`);
  }
}
// =============================================================================
// Mock DragonflyDB
// =============================================================================
/**
 * In-memory stand-in for the DragonflyDB client, exposing a subset of
 * Redis-style string, hash, list and pub/sub commands.
 *
 * Strings, hashes and lists live in separate maps. Expiry (`EX`, `expire`)
 * and the key-level commands `del` / `exists` apply to string keys only.
 */
export class MockDragonfly {
  private store: Map<string, any> = new Map();
  private hashes: Map<string, Map<string, any>> = new Map();
  private lists: Map<string, any[]> = new Map();
  private expirations: Map<string, number> = new Map();
  private subscribers: Map<string, Array<(msg: string) => void>> = new Map();

  // String operations

  /**
   * Stores a string-keyed value.
   *
   * @param options `NX` - only write when the key is absent (an expired key
   *                counts as absent); `EX` - time to live in seconds.
   * @returns `"OK"` on write, `null` when `NX` prevented it.
   */
  async set(key: string, value: any, options?: { EX?: number; NX?: boolean }): Promise<string | null> {
    // Drop an expired entry first so NX sees the key as absent.
    this.purgeIfExpired(key);
    if (options?.NX && this.store.has(key)) return null;
    this.store.set(key, value);
    if (options?.EX) {
      this.expirations.set(key, Date.now() + options.EX * 1000);
    } else {
      // A plain write replaces the old value and its TTL; otherwise a stale
      // deadline would make the fresh value disappear.
      this.expirations.delete(key);
    }
    return "OK";
  }

  /** @returns The stored value, or `null` when the key is missing or expired. */
  async get(key: string): Promise<any> {
    this.purgeIfExpired(key);
    // `??` keeps falsy stored values (0, "", false) intact.
    return this.store.get(key) ?? null;
  }

  /** Deletes a string key and its TTL. @returns 1 if a live key was removed, else 0. */
  async del(key: string): Promise<number> {
    this.purgeIfExpired(key);
    this.expirations.delete(key);
    return this.store.delete(key) ? 1 : 0;
  }

  /** @returns 1 when a live (non-expired) string key exists, else 0. */
  async exists(key: string): Promise<number> {
    this.purgeIfExpired(key);
    return this.store.has(key) ? 1 : 0;
  }

  /**
   * Sets the time to live of an existing string key.
   *
   * @returns `false` when the key is missing or already expired.
   */
  async expire(key: string, seconds: number): Promise<boolean> {
    this.purgeIfExpired(key);
    if (!this.store.has(key)) return false;
    this.expirations.set(key, Date.now() + seconds * 1000);
    return true;
  }

  // Hash operations

  /** Sets one hash field. @returns 1 when the field is new, 0 when it was overwritten. */
  async hSet(key: string, field: string, value: any): Promise<number> {
    const bucket = this.hashFor(key);
    const existed = bucket.has(field);
    bucket.set(field, value);
    return existed ? 0 : 1;
  }

  /** @returns The field value, or `null` when the hash or field is missing. */
  async hGet(key: string, field: string): Promise<any> {
    return this.hashes.get(key)?.get(field) ?? null;
  }

  /** @returns A plain-object snapshot of the hash; `{}` when the hash is missing. */
  async hGetAll(key: string): Promise<Record<string, any>> {
    const hash = this.hashes.get(key);
    if (!hash) return {};
    const result: Record<string, any> = {};
    for (const [k, v] of hash) {
      result[k] = v;
    }
    return result;
  }

  /**
   * Adds `increment` to an integer hash field, creating hash and field as needed.
   * The new value is stored as a decimal string.
   *
   * @returns The value after incrementing.
   */
  async hIncrBy(key: string, field: string, increment: number): Promise<number> {
    const bucket = this.hashFor(key);
    // Missing or empty fields count as 0; explicit radix avoids hex/octal surprises.
    const current = parseInt(String(bucket.get(field) || "0"), 10);
    const newValue = current + increment;
    bucket.set(field, newValue.toString());
    return newValue;
  }

  // List operations

  /** Appends values to the tail of a list. @returns The list length afterwards. */
  async rPush(key: string, ...values: any[]): Promise<number> {
    let list = this.lists.get(key);
    if (!list) {
      list = [];
      this.lists.set(key, list);
    }
    list.push(...values);
    return list.length;
  }

  /**
   * Returns the elements between `start` and `stop`, both inclusive, following
   * the Redis LRANGE convention. Negative indexes count from the tail
   * (-1 is the last element); out-of-range indexes are clamped.
   */
  async lRange(key: string, start: number, stop: number): Promise<any[]> {
    const list = this.lists.get(key) ?? [];
    const length = list.length;
    const from = start < 0 ? Math.max(length + start, 0) : start;
    const to = stop < 0 ? length + stop : stop;
    if (from > to) return [];
    // slice() is end-exclusive, hence the +1 for the inclusive stop.
    return list.slice(from, to + 1);
  }

  // Pub/Sub

  /** Registers a handler that is called synchronously on every `publish` to `channel`. */
  async subscribe(channel: string, handler: (msg: string) => void): Promise<void> {
    if (!this.subscribers.has(channel)) this.subscribers.set(channel, []);
    this.subscribers.get(channel)!.push(handler);
  }

  /** Delivers `message` to every handler of `channel`. @returns The number of handlers called. */
  async publish(channel: string, message: string): Promise<number> {
    const handlers = this.subscribers.get(channel) ?? [];
    handlers.forEach(h => h(message));
    return handlers.length;
  }

  /** Drops all handlers of `channel`, or of every channel when none is given. */
  async unsubscribe(channel?: string): Promise<void> {
    if (channel) {
      this.subscribers.delete(channel);
    } else {
      this.subscribers.clear();
    }
  }

  // Utility

  /** True when the key has a recorded deadline that lies in the past. */
  private isExpired(key: string): boolean {
    const expiry = this.expirations.get(key);
    if (!expiry) return false;
    return Date.now() > expiry;
  }

  /** Removes an expired string key together with its deadline. */
  private purgeIfExpired(key: string): void {
    if (this.isExpired(key)) {
      this.store.delete(key);
      this.expirations.delete(key);
    }
  }

  /** Returns the hash stored at `key`, creating an empty one when absent. */
  private hashFor(key: string): Map<string, any> {
    let bucket = this.hashes.get(key);
    if (!bucket) {
      bucket = new Map();
      this.hashes.set(key, bucket);
    }
    return bucket;
  }

  /** Wipes all data, deadlines and subscriptions. */
  reset(): void {
    this.store.clear();
    this.hashes.clear();
    this.lists.clear();
    this.expirations.clear();
    this.subscribers.clear();
  }

  // Test helpers

  /** @returns The live internal maps (not copies) for direct inspection in tests. */
  getState(): { store: Map<string, any>; hashes: Map<string, Map<string, any>>; lists: Map<string, any[]> } {
    return { store: this.store, hashes: this.hashes, lists: this.lists };
  }
}
// =============================================================================
// Mock LLM
// =============================================================================
/**
 * Scriptable stand-in for an LLM completion client.
 *
 * Canned responses are selected by case-insensitive substring match on the
 * prompt; latency and random failures can be injected for resilience tests.
 */
export class MockLLM {
  private responses = new Map<string, string>();
  private callLog: Array<{ prompt: string; response: string; timestamp: number }> = [];
  private defaultResponse = '{"confidence": 0.5, "steps": [], "assumptions": []}';
  private latencyMs = 0;
  private failureRate = 0;

  /** Registers `response` for any prompt containing `pattern` (case-insensitive). */
  setResponse(pattern: string, response: string): void {
    this.responses.set(pattern, response);
  }

  /** Replaces the response returned when no registered pattern matches. */
  setDefaultResponse(response: string): void {
    this.defaultResponse = response;
  }

  /** Sets the artificial delay, in milliseconds, applied to each `complete` call. */
  setLatency(ms: number): void {
    this.latencyMs = ms;
  }

  /** Sets the probability (compared against `Math.random()`) that `complete` throws. */
  setFailureRate(rate: number): void {
    this.failureRate = rate;
  }

  /**
   * Produces a canned completion for `prompt`.
   *
   * The first registered pattern (in insertion order) found in the prompt
   * wins; otherwise the default response is used. Successful calls are
   * appended to the call log; simulated failures are not.
   *
   * @param options Accepted for signature compatibility; not used.
   * @throws Error with message `LLM_ERROR: Simulated failure` when a failure is simulated.
   */
  async complete(prompt: string, options?: { maxTokens?: number; temperature?: number }): Promise<string> {
    const delay = this.latencyMs;
    if (delay > 0) {
      await new Promise(resolve => setTimeout(resolve, delay));
    }

    const roll = Math.random();
    if (roll < this.failureRate) {
      throw new Error("LLM_ERROR: Simulated failure");
    }

    const haystack = prompt.toLowerCase();
    const match = [...this.responses].find(([pattern]) => haystack.includes(pattern.toLowerCase()));
    const response = match ? match[1] : this.defaultResponse;

    this.callLog.push({ prompt, response, timestamp: Date.now() });
    return response;
  }

  /** @returns The live call-log array, oldest call first. */
  getCallLog(): Array<{ prompt: string; response: string; timestamp: number }> {
    return this.callLog;
  }

  /** @returns How many `complete` calls have succeeded since the last reset. */
  getCallCount(): number {
    return this.callLog.length;
  }

  /** Clears the call log and disables latency and failures; registered responses are kept. */
  reset(): void {
    this.failureRate = 0;
    this.latencyMs = 0;
    this.callLog = [];
  }
}
// =============================================================================
// Test Utilities
// =============================================================================
/**
 * Builds a fresh test context with random task/agent ids and new mock services.
 *
 * @param overrides Fields that replace the generated defaults.
 * @returns A context whose defaults are overlaid with `overrides`.
 */
export function createTestContext(overrides?: Partial<TestContext>): TestContext {
  // Six base-36 characters taken from a random fraction.
  const randomSuffix = (): string => Math.random().toString(36).slice(2, 8);

  const defaults: TestContext = {
    taskId: "test-task-" + randomSuffix(),
    agentId: "test-agent-" + randomSuffix(),
    startTime: Date.now(),
    mockVault: new MockVault(),
    mockDragonfly: new MockDragonfly(),
    mockLLM: new MockLLM(),
  };

  return { ...defaults, ...overrides };
}
/**
 * Races `promise` against a timer.
 *
 * The timer is cancelled as soon as the race settles, so a promise that
 * finishes early leaves no pending timer behind to keep the event loop alive.
 *
 * @param promise Work to wait for.
 * @param ms Time allowed, in milliseconds.
 * @param message Message of the error raised on timeout (default "Timeout").
 * @returns The value `promise` resolves with.
 * @throws Error carrying `message` when `ms` elapses first; otherwise whatever
 *         `promise` rejects with.
 */
export async function withTimeout<T>(promise: Promise<T>, ms: number, message: string = "Timeout"): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeout = new Promise<never>((_, reject) => {
    timer = setTimeout(() => reject(new Error(message)), ms);
  });
  try {
    return await Promise.race([promise, timeout]);
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Builds a sandbox-only instruction packet for tests.
 *
 * Everything except the ids, the objective and the creation timestamp is a
 * fixed fixture; every call returns fresh objects that are safe to mutate.
 *
 * @param taskId Value for `task_id`.
 * @param agentId Value for `agent_id`.
 * @param objective Value for `objective`.
 * @returns The packet, with `created_at` set to the current time in ISO-8601 form.
 */
export function generateInstructionPacket(taskId: string, agentId: string, objective: string) {
  const constraints = {
    scope: ["sandbox only"],
    forbidden: ["no prod access"],
    required_steps: ["plan before execute"],
  };

  const errorBudget = {
    max_total_errors: 5,
    max_same_error_repeats: 2,
    max_procedure_violations: 1,
  };

  const createdAt = new Date().toISOString();

  // Key order is kept stable because packets are serialised with JSON.stringify.
  return {
    agent_id: agentId,
    task_id: taskId,
    created_for: "Test Task",
    objective,
    deliverables: ["plan", "report"],
    constraints,
    success_criteria: ["plan generated"],
    error_budget: errorBudget,
    escalation_rules: ["If confidence < 0.7 -> escalate"],
    created_at: createdAt,
  };
}
// =============================================================================
// Test Harness
// =============================================================================
/**
 * Sequential runner for `TestScenario`s.
 *
 * Each scenario gets its own fresh `TestContext`. Outcomes are logged to the
 * console and kept in a result map keyed by scenario name, so scenarios that
 * share a name overwrite each other's entry.
 */
export class TestHarness {
  private scenarios: TestScenario[] = [];
  private results: Map<string, { passed: boolean; error?: string; duration: number }> = new Map();

  /** Queues a scenario; scenarios run in the order they were added. */
  addScenario(scenario: TestScenario): void {
    this.scenarios.push(scenario);
  }

  /**
   * Runs every queued scenario one after another.
   *
   * For each scenario `setup`, `execute` and `assertions` are awaited in order;
   * `cleanup` is then awaited exactly once, whether or not an earlier step
   * threw. A scenario fails when any step throws, including a `cleanup` that
   * throws after otherwise passing. When both the scenario and its cleanup
   * fail, the scenario's error is the one recorded and the cleanup error is
   * logged as a warning.
   *
   * @returns Aggregate counts, total duration in milliseconds, and the pass
   *          rate reported in the `coverage` field.
   */
  async runAll(): Promise<TestMetrics> {
    const startTime = Date.now();
    let passed = 0;
    let failed = 0;

    for (const scenario of this.scenarios) {
      const scenarioStart = Date.now();
      const ctx = createTestContext();
      let failure: string | undefined;

      console.log(`\n[TEST] Running: ${scenario.name}`);
      try {
        await scenario.setup();
        await scenario.execute(ctx);
        await scenario.assertions(ctx);
      } catch (error: unknown) {
        failure = TestHarness.describeError(error);
      }

      // Cleanup runs once, outside the try above, so a throwing cleanup is
      // never invoked a second time from the failure path.
      try {
        await scenario.cleanup();
      } catch (error: unknown) {
        const cleanupMessage = TestHarness.describeError(error);
        if (failure === undefined) {
          failure = cleanupMessage;
        } else {
          console.log(`[WARN] ${scenario.name}: cleanup failed: ${cleanupMessage}`);
        }
      }

      const duration = Date.now() - scenarioStart;
      if (failure === undefined) {
        this.results.set(scenario.name, { passed: true, duration });
        passed++;
        console.log(`[PASS] ${scenario.name} (${duration}ms)`);
      } else {
        this.results.set(scenario.name, { passed: false, error: failure, duration });
        failed++;
        console.log(`[FAIL] ${scenario.name}: ${failure}`);
      }
    }

    return {
      passed,
      failed,
      skipped: 0,
      duration: Date.now() - startTime,
      coverage: this.scenarios.length > 0 ? (passed / this.scenarios.length) * 100 : 0,
    };
  }

  /** @returns The live result map, keyed by scenario name. */
  getResults(): Map<string, { passed: boolean; error?: string; duration: number }> {
    return this.results;
  }

  /** Turns any thrown value into a printable message. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}
// =============================================================================
// Pre-built Test Scenarios
// =============================================================================
/**
 * Factory functions for ready-made scenarios.
 *
 * The scenarios only seed and inspect the mock stores; no agent is started
 * here. Assertions that read an agent state key are skipped when that key
 * was never written.
 */
export const CommonScenarios = {
  /**
   * Happy path - agent completes successfully.
   *
   * @param AgentClass Accepted for API symmetry; not used by the scenario.
   */
  happyPath(AgentClass: any): TestScenario {
    return {
      name: "Happy Path - Successful Completion",
      description: "Agent completes all phases without errors",
      async setup() {},
      async execute(ctx) {
        // Canned planning response returned for any prompt mentioning "plan".
        const planResponse = JSON.stringify({
          title: "Test Plan",
          confidence: 0.85,
          steps: [{ step: 1, action: "Test action" }],
        });
        ctx.mockLLM.setResponse("plan", planResponse);

        // Store the instruction packet under the agent's packet key.
        const packet = generateInstructionPacket(ctx.taskId, ctx.agentId, "Test objective");
        const packetKey = `agent:${ctx.agentId}:packet`;
        await ctx.mockDragonfly.set(packetKey, JSON.stringify(packet));
      },
      async assertions(ctx) {
        // When a state was recorded it must have reached EXIT / COMPLETED.
        const rawState = await ctx.mockDragonfly.get(`agent:${ctx.agentId}:state`);
        if (!rawState) return;
        const state = JSON.parse(rawState);
        expect(state.phase).toBe("EXIT");
        expect(state.status).toBe("COMPLETED");
      },
      async cleanup() {},
    };
  },

  /**
   * Error budget exceeded.
   *
   * @param AgentClass Accepted for API symmetry; not used by the scenario.
   */
  errorBudgetExceeded(AgentClass: any): TestScenario {
    return {
      name: "Error Budget Exceeded - Revocation",
      description: "Agent is revoked when error budget is exceeded",
      async setup() {},
      async execute(ctx) {
        // Every LLM call fails.
        ctx.mockLLM.setFailureRate(1.0);

        const packet = generateInstructionPacket(ctx.taskId, ctx.agentId, "Test objective");
        packet.error_budget.max_total_errors = 2;
        const packetKey = `agent:${ctx.agentId}:packet`;
        await ctx.mockDragonfly.set(packetKey, JSON.stringify(packet));

        // Record three errors, one more than the budget of two allows.
        const errorsKey = `agent:${ctx.agentId}:errors`;
        await ctx.mockDragonfly.hIncrBy(errorsKey, "total_errors", 3);
      },
      async assertions(ctx) {
        const rawState = await ctx.mockDragonfly.get(`agent:${ctx.agentId}:state`);
        if (!rawState) return;
        const state = JSON.parse(rawState);
        expect(state.status).toBe("REVOKED");
      },
      async cleanup() {},
    };
  },

  /** Stuck detection. */
  stuckDetection(): TestScenario {
    return {
      name: "Stuck Detection - GAMMA Spawn",
      description: "GAMMA is spawned when agents are stuck",
      async setup() {},
      async execute(ctx) {
        // An ALPHA agent whose last activity was 60 seconds ago.
        const sixtySecondsAgo = new Date(Date.now() - 60000).toISOString();
        const stuckState = {
          agent_id: ctx.agentId,
          role: "ALPHA",
          status: "WORKING",
          last_activity: sixtySecondsAgo,
        };
        await ctx.mockDragonfly.hSet(`agents:${ctx.taskId}`, "ALPHA", JSON.stringify(stuckState));
      },
      async assertions(ctx) {
        // The recorded inactivity must exceed 30 seconds.
        const rawState = await ctx.mockDragonfly.hGet(`agents:${ctx.taskId}`, "ALPHA");
        if (!rawState) return;
        const state = JSON.parse(rawState);
        const lastActivityMs = new Date(state.last_activity).getTime();
        const inactivity = (Date.now() - lastActivityMs) / 1000;
        expect(inactivity).toBeGreaterThan(30);
      },
      async cleanup() {},
    };
  },

  /** Conflict resolution. */
  conflictResolution(): TestScenario {
    return {
      name: "Conflict Resolution",
      description: "Multiple proposals lead to conflict detection",
      async setup() {},
      async execute(ctx) {
        // Two competing proposals from the same author.
        const firstProposal = JSON.stringify({
          author: "ALPHA",
          value: { approach: "Approach A", confidence: 0.8 },
        });
        await ctx.mockDragonfly.hSet(`blackboard:${ctx.taskId}:solutions`, "proposal_1", firstProposal);

        const secondProposal = JSON.stringify({
          author: "ALPHA",
          value: { approach: "Approach B", confidence: 0.7 },
        });
        await ctx.mockDragonfly.hSet(`blackboard:${ctx.taskId}:solutions`, "proposal_2", secondProposal);

        // BETA rejects proposal_2.
        const rejection = JSON.stringify({
          accepted: false,
          score: 0.5,
        });
        await ctx.mockDragonfly.hSet(`blackboard:${ctx.taskId}:progress`, "eval_proposal_2", rejection);

        await ctx.mockDragonfly.hIncrBy(`metrics:${ctx.taskId}`, "conflicts_detected", 1);
      },
      async assertions(ctx) {
        const conflicts = await ctx.mockDragonfly.hGet(`metrics:${ctx.taskId}`, "conflicts_detected");
        const conflictCount = parseInt(conflicts || "0");
        expect(conflictCount).toBeGreaterThan(0);
      },
      async cleanup() {},
    };
  },
};
// =============================================================================
// Example Test Suite
// =============================================================================
/**
 * Registers an example `bun:test` suite that exercises the mocks and the
 * instruction-packet generator. Each test gets a fresh `TestContext`.
 *
 * Call this from a test file so that `describe` / `it` run under the Bun
 * test runner.
 */
export function runExampleTests() {
  describe("Agent Governance Tests", () => {
    let ctx: TestContext;

    beforeEach(() => {
      ctx = createTestContext();
    });

    describe("MockVault", () => {
      it("should store and retrieve secrets", async () => {
        ctx.mockVault.setSecret("test/secret", { key: "value" });
        const secret = await ctx.mockVault.getSecret("test/secret");
        expect(secret.key).toBe("value");
      });

      it("should create and validate tokens", async () => {
        const token = await ctx.mockVault.createToken("t0-observer", 60);
        expect(token).toStartWith("hvs.test-");
        expect(await ctx.mockVault.validateToken(token)).toBe(true);
      });

      it("should revoke tokens", async () => {
        const token = await ctx.mockVault.createToken("t0-observer");
        await ctx.mockVault.revokeToken(token);
        expect(await ctx.mockVault.validateToken(token)).toBe(false);
      });
    });

    describe("MockDragonfly", () => {
      it("should handle string operations", async () => {
        await ctx.mockDragonfly.set("key", "value");
        expect(await ctx.mockDragonfly.get("key")).toBe("value");
      });

      it("should handle hash operations", async () => {
        await ctx.mockDragonfly.hSet("hash", "field", "value");
        expect(await ctx.mockDragonfly.hGet("hash", "field")).toBe("value");
      });

      it("should handle list operations", async () => {
        await ctx.mockDragonfly.rPush("list", "a", "b", "c");
        const items = await ctx.mockDragonfly.lRange("list", 0, -1);
        expect(items).toEqual(["a", "b", "c"]);
      });

      it("should handle expiration", async () => {
        await ctx.mockDragonfly.set("expiring", "value", { EX: 1 });
        expect(await ctx.mockDragonfly.get("expiring")).toBe("value");
        // Note: In real tests, we'd wait for expiration
      });

      it("should handle NX option", async () => {
        await ctx.mockDragonfly.set("existing", "first");
        const result = await ctx.mockDragonfly.set("existing", "second", { NX: true });
        expect(result).toBeNull();
        expect(await ctx.mockDragonfly.get("existing")).toBe("first");
      });
    });

    describe("MockLLM", () => {
      it("should return default response", async () => {
        const response = await ctx.mockLLM.complete("test prompt");
        expect(response).toContain("confidence");
      });

      it("should match patterns", async () => {
        ctx.mockLLM.setResponse("terraform", '{"tool": "terraform"}');
        const response = await ctx.mockLLM.complete("Create a terraform plan");
        expect(response).toContain("terraform");
      });

      it("should simulate failures", async () => {
        ctx.mockLLM.setFailureRate(1.0);
        // `.rejects` assertions are promises themselves; without `await` the
        // test would finish before the expectation is evaluated.
        await expect(ctx.mockLLM.complete("test")).rejects.toThrow("LLM_ERROR");
      });

      it("should track call count", async () => {
        await ctx.mockLLM.complete("prompt 1");
        await ctx.mockLLM.complete("prompt 2");
        expect(ctx.mockLLM.getCallCount()).toBe(2);
      });
    });

    describe("Instruction Packets", () => {
      it("should generate valid packets", () => {
        const packet = generateInstructionPacket("task-1", "agent-1", "Test objective");
        expect(packet.agent_id).toBe("agent-1");
        expect(packet.task_id).toBe("task-1");
        expect(packet.error_budget.max_total_errors).toBe(5);
      });
    });
  });
}
// =============================================================================
// CLI
// =============================================================================
// Entry point when the file is executed directly: runs the pre-built
// scenarios, prints a summary, and exits non-zero if any scenario failed or
// the run itself crashed.
if (import.meta.main) {
  console.log("Agent Testing Framework");
  console.log("=======================\n");

  const harness = new TestHarness();

  // Add pre-built scenarios
  harness.addScenario(CommonScenarios.happyPath(null));
  harness.addScenario(CommonScenarios.errorBudgetExceeded(null));
  harness.addScenario(CommonScenarios.stuckDetection());
  harness.addScenario(CommonScenarios.conflictResolution());

  harness
    .runAll()
    .then(metrics => {
      console.log("\n" + "=".repeat(50));
      console.log("TEST RESULTS");
      console.log("=".repeat(50));
      console.log(`Passed: ${metrics.passed}`);
      console.log(`Failed: ${metrics.failed}`);
      console.log(`Skipped: ${metrics.skipped}`);
      console.log(`Duration: ${metrics.duration}ms`);
      console.log(`Coverage: ${metrics.coverage.toFixed(1)}%`);
      console.log("=".repeat(50));
      process.exit(metrics.failed > 0 ? 1 : 0);
    })
    .catch((error: unknown) => {
      // The harness catches scenario errors itself, so reaching this point
      // means the runner broke; report it instead of leaving the promise
      // rejection unhandled.
      console.error("Test run aborted:", error);
      process.exit(1);
    });
}