# Commit context (extraction residue, preserved as a comment):
# Phase 8 Production Hardening with complete governance infrastructure:
# Vault integration with tiered policies (T0-T4), DragonflyDB state management,
# SQLite audit ledger, Pipeline DSL and templates, promotion/revocation engine,
# checkpoint system, health manager/circuit breaker, GitHub/Slack integrations,
# architectural test pipeline, multi-agent chaos testing framework.
# Test results: governance 68/68, E2E 16/16, Phase 2 Vault 14/14, integration 27/27.
# Coverage: 57.6% average across 12 phases.
"""
|
|
MockLLM - Simulates LLM responses for deterministic testing.
|
|
|
|
Provides pattern-based response matching for testing agent behavior
|
|
without requiring actual LLM API calls.
|
|
"""
import json
import re
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any, Callable, Dict, List, Optional, Tuple
@dataclass
|
|
class MockResponse:
|
|
"""A configured mock response"""
|
|
pattern: str
|
|
response: str
|
|
confidence: float = 0.9
|
|
delay: float = 0.0 # Simulated latency
|
|
tokens_used: int = 100
|
|
call_count: int = 0
|
|
|
|
|
|
class MockLLM:
    """
    Mock LLM implementation for testing.

    Features:
    - Pattern-based response matching
    - Configurable confidence levels
    - Token counting simulation
    - Call tracking for assertions
    - Error injection for failure testing
    """

    def __init__(self) -> None:
        # Patterns are checked in list order; add_response() inserts at the
        # front, so caller-added patterns take priority over the defaults.
        self._responses: List[MockResponse] = []
        self._default_response: str = '{"decision": "EXECUTE", "confidence": 0.5, "steps": []}'
        self._call_history: List[Dict[str, Any]] = []
        self._total_tokens: int = 0
        self._error_mode: Optional[str] = None
        self._error_after: int = 0
        self._call_count: int = 0

        # Set up some default patterns
        self._setup_defaults()

    def _setup_defaults(self) -> None:
        """Set up default response patterns (plan, read, execute, error)."""
        # Plan generation
        self.add_response(
            pattern="generate.*plan|create.*plan|plan.*for",
            response=json.dumps({
                "decision": "PLAN",
                "confidence": 0.85,
                "plan": {
                    "title": "Generated Plan",
                    "steps": [
                        {"action": "analyze", "description": "Analyze requirements"},
                        {"action": "implement", "description": "Implement solution"},
                        {"action": "verify", "description": "Verify results"}
                    ]
                },
                "assumptions": [],
                "risks": []
            }),
            confidence=0.85
        )

        # Read operations
        self.add_response(
            pattern="read|view|show|list|get",
            response=json.dumps({
                "decision": "EXECUTE",
                "confidence": 0.95,
                "action": "read",
                "result": "Operation completed successfully"
            }),
            confidence=0.95
        )

        # Execute operations
        self.add_response(
            pattern="execute|run|deploy|apply",
            response=json.dumps({
                "decision": "EXECUTE",
                "confidence": 0.80,
                "action": "execute",
                "steps": [
                    {"command": "example_command", "status": "pending"}
                ],
                "requires_approval": True
            }),
            confidence=0.80
        )

        # Error/unknown
        self.add_response(
            pattern="error|fail|invalid",
            response=json.dumps({
                "decision": "ERROR",
                "confidence": 0.1,
                "error": "Simulated error response",
                "recommendations": ["Check input parameters", "Verify permissions"]
            }),
            confidence=0.1
        )

    def add_response(self, pattern: str, response: str, confidence: float = 0.9,
                     delay: float = 0.0, tokens: int = 100) -> None:
        """
        Add a response pattern.

        New patterns are inserted at the front of the list so they take
        precedence over the built-in defaults.

        Args:
            pattern: Regex pattern to match against prompt
            response: Response to return (usually JSON string)
            confidence: Confidence score for this response
            delay: Simulated response latency (seconds)
            tokens: Simulated token usage
        """
        self._responses.insert(0, MockResponse(
            pattern=pattern,
            response=response,
            confidence=confidence,
            delay=delay,
            tokens_used=tokens
        ))

    def set_default_response(self, response: str) -> None:
        """Set the default response when no pattern matches"""
        self._default_response = response

    def set_error_mode(self, error_type: str, after_calls: int = 0) -> None:
        """
        Configure error injection.

        Args:
            error_type: Type of error ("timeout", "rate_limit", "api_error", None)
            after_calls: Number of successful calls before error
        """
        self._error_mode = error_type
        self._error_after = after_calls

    def complete(self, prompt: str, max_tokens: int = 1000,
                 temperature: float = 0.7) -> Tuple[str, Dict[str, Any]]:
        """
        Generate a completion for the prompt.

        Records the call (with a truncated prompt — assert_called_with()
        therefore only sees the first 500 characters), optionally raises an
        injected error, and returns the first matching configured response.

        Returns: (response_text, metadata)

        Raises:
            TimeoutError / Exception: when error injection is active and the
            configured number of successful calls has been exceeded.
        """
        self._call_count += 1

        # Record call
        call_record: Dict[str, Any] = {
            "prompt": prompt[:500],  # Truncate for storage
            "max_tokens": max_tokens,
            "temperature": temperature,
            # Timezone-aware timestamp; datetime.utcnow() is deprecated
            # since Python 3.12 and the old __import__ hack was unreadable.
            "timestamp": datetime.now(timezone.utc).isoformat()
        }

        # Check for error injection
        if self._error_mode and self._call_count > self._error_after:
            call_record["error"] = self._error_mode
            self._call_history.append(call_record)
            raise self._create_error(self._error_mode)

        # Find matching response. re.IGNORECASE already makes matching
        # case-insensitive, so lower-casing the prompt first is redundant.
        matched_response: Optional[MockResponse] = None
        for mock_resp in self._responses:
            if re.search(mock_resp.pattern, prompt, re.IGNORECASE):
                matched_response = mock_resp
                matched_response.call_count += 1
                break

        if matched_response:
            response = matched_response.response
            tokens = matched_response.tokens_used
            confidence = matched_response.confidence

            # Simulate delay
            if matched_response.delay > 0:
                time.sleep(matched_response.delay)
        else:
            response = self._default_response
            tokens = 50
            confidence = 0.5

        self._total_tokens += tokens

        metadata = {
            "tokens_used": tokens,
            "confidence": confidence,
            "pattern_matched": matched_response.pattern if matched_response else None,
            "total_tokens": self._total_tokens
        }

        call_record["response"] = response[:500]
        call_record["metadata"] = metadata
        self._call_history.append(call_record)

        return response, metadata

    def _create_error(self, error_type: str) -> Exception:
        """Create an appropriate error for testing"""
        if error_type == "timeout":
            return TimeoutError("LLM request timed out")
        elif error_type == "rate_limit":
            return Exception("Rate limit exceeded")
        elif error_type == "api_error":
            return Exception("API error: 500 Internal Server Error")
        else:
            return Exception(f"Unknown error: {error_type}")

    def chat(self, messages: List[Dict[str, str]], **kwargs) -> Tuple[str, Dict[str, Any]]:
        """
        Chat completion interface.

        Flattens the conversation into a single "role: content" prompt and
        delegates to complete().

        Args:
            messages: List of {"role": "user/assistant/system", "content": "..."}
        """
        # Combine messages into a single prompt
        prompt = "\n".join([
            f"{m['role']}: {m['content']}"
            for m in messages
        ])
        return self.complete(prompt, **kwargs)

    # === Test Helpers ===

    def reset(self) -> None:
        """Reset all state for testing (default patterns are re-installed)."""
        self._responses.clear()
        self._call_history.clear()
        self._total_tokens = 0
        self._error_mode = None
        self._error_after = 0
        self._call_count = 0
        self._setup_defaults()

    def get_call_history(self) -> List[Dict[str, Any]]:
        """Get call history for assertions (shallow copy)."""
        return self._call_history.copy()

    def get_call_count(self) -> int:
        """Get total number of calls"""
        return self._call_count

    def get_total_tokens(self) -> int:
        """Get total tokens used"""
        return self._total_tokens

    def assert_called_with(self, pattern: str) -> bool:
        """Check if any recorded call's (truncated) prompt matched a pattern."""
        for call in self._call_history:
            if re.search(pattern, call["prompt"], re.IGNORECASE):
                return True
        return False

    def get_response_stats(self) -> Dict[str, int]:
        """Get call counts per response pattern (only patterns that matched)."""
        return {
            resp.pattern: resp.call_count
            for resp in self._responses
            if resp.call_count > 0
        }
|
|
|
|
|
|
class MockLLMBuilder:
    """Fluent builder that assembles a pre-configured MockLLM instance."""

    def __init__(self):
        self._product = MockLLM()

    def _set_all_confidence(self, level: float) -> 'MockLLMBuilder':
        """Apply a single confidence score to every configured response."""
        for mock_response in self._product._responses:
            mock_response.confidence = level
        return self

    def with_response(self, pattern: str, response: str, **kwargs) -> 'MockLLMBuilder':
        """Register a raw response for prompts matching *pattern*."""
        self._product.add_response(pattern, response, **kwargs)
        return self

    def with_json_response(self, pattern: str, data: Dict[str, Any],
                           **kwargs) -> 'MockLLMBuilder':
        """Register a response whose body is *data* serialized as JSON."""
        return self.with_response(pattern, json.dumps(data), **kwargs)

    def with_error_after(self, calls: int, error_type: str = "api_error") -> 'MockLLMBuilder':
        """Inject *error_type* once *calls* calls have succeeded."""
        self._product.set_error_mode(error_type, calls)
        return self

    def with_high_confidence(self) -> 'MockLLMBuilder':
        """Force every configured response to report confidence 0.95."""
        return self._set_all_confidence(0.95)

    def with_low_confidence(self) -> 'MockLLMBuilder':
        """Force every configured response to report confidence 0.3."""
        return self._set_all_confidence(0.3)

    def build(self) -> MockLLM:
        """Return the fully configured MockLLM."""
        return self._product