Add Auto-Optimize: AI agent for history-driven prompt improvement

When viewing any past run in History, click "Optimize" to trigger an
automated workflow that:

1. Analyzes the original prompt + responses + score
2. Identifies improvement strategies (clarity, depth, specificity, etc.)
3. Generates 3-5 improved prompt variations
4. Tests each variation across original mode + brainstorm
5. Auto-scores all results via background judge
6. Ranks results and highlights the winner
7. "Use This" button loads winning prompt into composer

Architecture:
- _run_optimize(job_id, run_id): background thread, 5-phase engine
- POST /api/runs/<id>/optimize: starts optimization job
- GET /api/optimize/<job_id>/stream: SSE for live progress
- Budget-capped at 15 model calls per optimization
- Child runs saved as real team_runs (source: "optimize")
- Auto-scored → feeds into analytics + routing table automatically
- Results saved to pipeline_runs (pipeline: "optimize")

Frontend:
- "Optimize" button in history detail panel (accent-colored)
- startOptimize(runId): replaces detail view with live optimization stream
- Phase cards: Analysis → Variations → Testing → Ranked Results
- Score bars with color coding (green/amber/red)
- Winner row highlighted with star + "Use This" button

Closes the learning loop: system studies its own history → generates
better prompts → tests them → scores results → routing table improves.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-03-29 07:03:27 -05:00
parent 8ad221b41f
commit 3b4fa449f1

View File

@ -6438,6 +6438,62 @@ def score_run(run_id):
return jsonify({"error": str(e)}), 500 return jsonify({"error": str(e)}), 500
@app.route("/api/runs/<int:run_id>/optimize", methods=["POST"])
@admin_required
def start_optimize(run_id):
    """Start an auto-optimize job for a past run.

    Returns 404 if the run does not exist, 409 if an optimize job for the
    same run is already running, otherwise spawns a daemon worker thread
    and responds with the new job id.
    """
    # Verify the target run exists before spending any work on it.
    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                cur.execute("SELECT id FROM team_runs WHERE id = %s", (run_id,))
                row = cur.fetchone()
        if row is None:
            return jsonify({"error": "Run not found"}), 404
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    # Don't allow double-optimize: refuse while a job for this run is live.
    prefix = f"opt-{run_id}-"
    active = next(
        (jid for jid, info in _optimize_jobs.items()
         if jid.startswith(prefix) and info.get("status") == "running"),
        None,
    )
    if active is not None:
        return jsonify({"error": "Already optimizing this run", "job_id": active}), 409
    job_id = f"opt-{run_id}-{int(time.time())}"
    worker = threading.Thread(target=_run_optimize, args=(job_id, run_id), daemon=True)
    _optimize_jobs[job_id] = {"status": "starting", "thread": worker}
    _optimize_queues[job_id] = []
    worker.start()
    return jsonify({"ok": True, "job_id": job_id})
@app.route("/api/optimize/<job_id>/stream")
@login_required
def optimize_stream(job_id):
    """SSE stream for optimization progress.

    Registers a private event queue with the job's fan-out list; events
    appended by the worker are relayed as `data:` frames. A keepalive
    comment is sent each idle second; the stream closes on a "done"
    event or after 5 minutes without any event.
    """
    q = []
    _optimize_queues.setdefault(job_id, []).append(q)

    def generate():
        idle = 0
        try:
            while True:
                if not q:
                    idle += 1
                    if idle > 300:  # 5 min timeout
                        break
                    time.sleep(1)
                    yield ": keepalive\n\n"
                    continue
                idle = 0
                event = q.pop(0)
                yield f"data: {json.dumps(event)}\n\n"
                if event.get("type") == "done":
                    break
        finally:
            # Deregister our queue; it may already be gone, hence the guard.
            try:
                _optimize_queues.get(job_id, []).remove(q)
            except ValueError:
                pass

    return Response(generate(), mimetype="text/event-stream",
                    headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no", "Connection": "keep-alive"})
@app.route("/api/runs/<int:run_id>/archive", methods=["POST"]) @app.route("/api/runs/<int:run_id>/archive", methods=["POST"])
@login_required @login_required
def archive_run(run_id): def archive_run(run_id):
@ -6706,6 +6762,145 @@ function toast(msg, ok) {
setTimeout(function(){t.remove()},2500); setTimeout(function(){t.remove()},2500);
} }
// Launch an auto-optimize job for run `runId` and replace the history
// detail panel with a live progress view fed by the job's SSE stream.
// Renders, in order: header (Back button + title), status line, phase
// cards (analysis text / generated variations), per-test status updates,
// and a ranked results table whose winner carries a "Use This" button.
async function startOptimize(runId) {
  var r = await fetch('/api/runs/'+runId+'/optimize', {method:'POST'});
  var data = await r.json();
  // Server rejects missing runs (404) and duplicate jobs (409) with {error}.
  if (data.error) { toast(data.error, false); return; }
  var jobId = data.job_id;
  var panel = document.getElementById('detail-panel');
  panel.textContent = '';
  // Header
  var hdr = document.createElement('div'); hdr.style.cssText = 'display:flex;align-items:center;gap:10px;margin-bottom:16px';
  var backBtn = document.createElement('button'); backBtn.className = 'tool-btn';
  backBtn.textContent = '\u2190 Back'; backBtn.onclick = function(){ openDetail(runId); };
  hdr.appendChild(backBtn);
  var title = document.createElement('span');
  title.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:10px;text-transform:uppercase;letter-spacing:1.5px;color:var(--accent);font-weight:700';
  title.textContent = 'OPTIMIZING RUN #'+runId;
  hdr.appendChild(title);
  panel.appendChild(hdr);
  // Status
  var statusEl = document.createElement('div');
  statusEl.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:11px;color:var(--text2);margin-bottom:12px';
  statusEl.textContent = 'Starting optimization...';
  panel.appendChild(statusEl);
  // Results container
  var resultsEl = document.createElement('div');
  panel.appendChild(resultsEl);
  // SSE stream: every server event is a JSON object with a `type` field;
  // each branch below handles one event type.
  var es = new EventSource('/api/optimize/'+jobId+'/stream');
  es.onmessage = function(e) {
    var d = JSON.parse(e.data);
    if (d.type === 'status') {
      // Transient progress text, overwritten by each new status event.
      statusEl.textContent = d.text;
    }
    if (d.type === 'error') {
      // Errors accumulate in the results area rather than replacing status.
      var err = document.createElement('div');
      err.style.cssText = 'color:var(--red);font-family:JetBrains Mono,monospace;font-size:11px;margin:8px 0;border-left:2px solid var(--red);padding-left:8px';
      err.textContent = 'Error: ' + d.text;
      resultsEl.appendChild(err);
    }
    if (d.type === 'phase') {
      // Phase card: either the raw analysis text or the variation list.
      var block = document.createElement('div');
      block.style.cssText = 'background:var(--surface);border:1px solid var(--border);border-radius:2px;padding:12px;margin-bottom:8px';
      var phTitle = document.createElement('div');
      phTitle.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:9px;text-transform:uppercase;letter-spacing:1.5px;color:var(--accent);margin-bottom:6px;font-weight:700';
      phTitle.textContent = d.phase === 'analyze' ? 'Analysis' : d.count + ' Variations Generated';
      block.appendChild(phTitle);
      var body = document.createElement('div');
      body.style.cssText = 'font-size:12px;line-height:1.6;color:var(--text);white-space:pre-wrap;max-height:200px;overflow-y:auto';
      if (d.phase === 'analyze') {
        body.textContent = d.text || '';
      } else if (d.variations) {
        d.variations.forEach(function(v, i) {
          var line = document.createElement('div');
          line.style.cssText = 'margin-bottom:8px;padding:6px 8px;background:rgba(0,0,0,0.1);border-radius:2px';
          line.textContent = 'V'+(i+1)+' ['+v.strategy+'] '+v.prompt;
          body.appendChild(line);
        });
        body.style.whiteSpace = 'normal';
      }
      block.appendChild(body);
      resultsEl.appendChild(block);
    }
    if (d.type === 'test') {
      // Per-test progress; `variation` is a 0-based index from the server.
      statusEl.textContent = 'Testing V'+(d.variation+1)+' ['+d.strategy+'] in '+d.mode+'... '+d.status;
    }
    if (d.type === 'results') {
      // Ranked table: server pre-sorts `d.ranked` best-first, so index 0
      // is the winner (accent border, star, "Use This" button).
      statusEl.textContent = 'Optimization complete!';
      var table = document.createElement('div'); table.style.cssText = 'margin-top:12px';
      var tTitle = document.createElement('div');
      tTitle.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:9px;text-transform:uppercase;letter-spacing:2px;color:var(--accent);margin-bottom:10px;font-weight:700';
      tTitle.textContent = 'RANKED RESULTS' + (d.original_score ? ' (original: '+d.original_score+'/10)' : '');
      table.appendChild(tTitle);
      (d.ranked||[]).forEach(function(r, i) {
        var row = document.createElement('div');
        row.style.cssText = 'display:flex;align-items:center;gap:10px;padding:10px 12px;margin-bottom:4px;background:var(--surface);border:1px solid var(--border);border-radius:2px';
        if (i === 0) row.style.borderColor = 'var(--accent)';
        var rank = document.createElement('span');
        rank.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:14px;font-weight:700;min-width:24px;color:'+(i===0?'var(--accent)':'var(--text2)');
        rank.textContent = i === 0 ? '\u2605' : '#'+(i+1);
        row.appendChild(rank);
        var info = document.createElement('div'); info.style.cssText = 'flex:1;min-width:0';
        var label = document.createElement('div');
        label.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:10px;color:var(--text2);text-transform:uppercase;letter-spacing:0.5px';
        label.textContent = 'V'+(r.variation+1)+' ['+r.strategy+'] \u00D7 '+r.mode;
        info.appendChild(label);
        var snippet = document.createElement('div');
        snippet.style.cssText = 'font-size:11px;color:var(--text);margin-top:2px;white-space:nowrap;overflow:hidden;text-overflow:ellipsis';
        snippet.textContent = r.snippet || '';
        info.appendChild(snippet);
        row.appendChild(info);
        // Score bar, color-coded by percentage of 10: >=7 green, >=5 accent, else red.
        var scoreBar = document.createElement('div'); scoreBar.style.cssText = 'width:80px;display:flex;align-items:center;gap:6px';
        var bar = document.createElement('div'); bar.style.cssText = 'flex:1;height:6px;background:rgba(0,0,0,0.15);border-radius:3px;overflow:hidden';
        var fill = document.createElement('div');
        var pct = ((r.score||0)/10)*100;
        fill.style.cssText = 'height:100%;border-radius:3px;background:'+(pct>=70?'var(--green)':pct>=50?'var(--accent)':'var(--red)')+';width:'+pct+'%';
        bar.appendChild(fill); scoreBar.appendChild(bar);
        var scoreNum = document.createElement('span');
        scoreNum.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:12px;font-weight:700;min-width:24px;text-align:right';
        scoreNum.textContent = r.score ? r.score.toFixed(1) : '?';
        scoreBar.appendChild(scoreNum);
        row.appendChild(scoreBar);
        if (i === 0 && r.prompt) {
          var useBtn = document.createElement('button'); useBtn.className = 'tool-btn';
          useBtn.style.cssText = 'color:var(--accent);border-color:var(--accent);font-size:9px;white-space:nowrap';
          useBtn.textContent = 'Use This';
          useBtn.onclick = function(){
            // NOTE(review): if #prompt lives on the '/' page (composer), setting
            // it here and then navigating discards the value — confirm #prompt
            // exists on this page, or persist via sessionStorage instead.
            var promptEl = document.getElementById('prompt');
            if (promptEl) promptEl.value = r.prompt;
            window.location.href = '/';
          };
          row.appendChild(useBtn);
        }
        table.appendChild(row);
      });
      resultsEl.appendChild(table);
    }
    if (d.type === 'done') {
      es.close();
      if (d.improvement > 0) {
        statusEl.textContent = 'Done! Best: '+(d.best_score||'?')+'/10 (+'+(d.improvement||0).toFixed(1)+' improvement) | '+d.calls_used+' model calls';
        statusEl.style.color = 'var(--green)';
      } else {
        statusEl.textContent = 'Done! Best: '+(d.best_score||'?')+'/10 | Original: '+(d.original_score||'?')+'/10 | '+d.calls_used+' calls';
      }
    }
  };
  // Network drop or server-side stream close: stop listening and mark the status.
  es.onerror = function() { es.close(); statusEl.textContent += ' (stream ended)'; };
}
async function loadRuns() { async function loadRuns() {
var mode = document.getElementById('filter-mode').value; var mode = document.getElementById('filter-mode').value;
var tag = document.getElementById('filter-tag').value; var tag = document.getElementById('filter-tag').value;
@ -6819,6 +7014,11 @@ async function openDetail(id) {
var delBtn = document.createElement('button'); delBtn.className = 'tool-btn red'; delBtn.textContent = 'Delete'; var delBtn = document.createElement('button'); delBtn.className = 'tool-btn red'; delBtn.textContent = 'Delete';
delBtn.onclick = function(){ if(confirm('Delete permanently?')){fetch('/api/runs/'+id,{method:'DELETE'}).then(function(){toast('Deleted',true);loadRuns();panel.className='detail-panel'})} }; delBtn.onclick = function(){ if(confirm('Delete permanently?')){fetch('/api/runs/'+id,{method:'DELETE'}).then(function(){toast('Deleted',true);loadRuns();panel.className='detail-panel'})} };
actions.appendChild(delBtn); actions.appendChild(delBtn);
var optBtn = document.createElement('button'); optBtn.className = 'tool-btn';
optBtn.style.cssText = 'color:var(--accent);border-color:var(--accent);margin-left:auto';
optBtn.textContent = '\u26A1 Optimize';
optBtn.onclick = function(){ startOptimize(id); };
actions.appendChild(optBtn);
panel.appendChild(actions); panel.appendChild(actions);
// Responses // Responses
@ -7043,6 +7243,207 @@ def get_self_report(rid):
_meta_threads = {} _meta_threads = {}
_meta_status = {} # pipeline_id -> {stage, substep, progress} _meta_status = {} # pipeline_id -> {stage, substep, progress}
# ─── AUTO-OPTIMIZE ENGINE ────────────────────────────────────
_optimize_jobs = {} # job_id -> {"thread": Thread, "status": str}
_optimize_queues = {} # job_id -> [[event_dicts]]
_OPTIMIZE_MAX_CALLS = 15
def _optimize_emit(job_id, data):
for q in _optimize_queues.get(job_id, []):
q.append(data)
def _run_optimize(job_id, run_id):
    """Background worker: analyze a past run, generate improved prompts,
    test them, and rank the results.

    Phases:
      A. Analyze the original prompt/response/score for improvement strategies.
      B. Generate prompt variations (one per strategy), with a static fallback.
      C. Test each variation across the original mode (+ brainstorm), saving
         each test as a real team_run (source: "optimize").
      D. Poll for background auto-scores, then rank variations by score.
      E. Persist a pipeline summary and tag the original run's metadata.

    All model traffic goes through _budget_call, hard-capped at
    _OPTIMIZE_MAX_CALLS calls. Progress streams to SSE clients via
    _optimize_emit; any failure emits "error" then "done" so clients close.
    """
    import time as _time
    start = _time.time()
    calls_used = 0
    _optimize_jobs[job_id]["status"] = "running"

    def _budget_call(model, prompt):
        # Count every model call against the per-job budget before delegating.
        nonlocal calls_used
        if calls_used >= _OPTIMIZE_MAX_CALLS:
            raise RuntimeError("Budget exhausted")
        calls_used += 1
        return safe_query(model, prompt)

    try:
        # Fetch original run
        with get_db() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute("SELECT * FROM team_runs WHERE id = %s", (run_id,))
                run = cur.fetchone()
        if not run:
            _optimize_emit(job_id, {"type": "error", "text": "Run not found"})
            _optimize_emit(job_id, {"type": "done"})
            return
        original_prompt = run["prompt"]
        original_mode = run["mode"]
        original_score = run.get("quality_score") or 0
        responses = run.get("responses") or []
        models_used = run.get("models_used") or ["qwen2.5:latest"]
        # Longest non-error response serves as the "best" excerpt for analysis.
        best_resp = ""
        if responses:
            candidates = [r for r in responses if r.get("role") != "error" and r.get("text")]
            if candidates:
                best_resp = max(candidates, key=lambda r: len(r.get("text", "")))["text"][:2000]

        # Phase A: Analyze
        _optimize_emit(job_id, {"type": "status", "text": "Analyzing original run..."})
        analysis_prompt = (
            f"Analyze this LLM prompt for improvement opportunities.\n\n"
            f"MODE: {original_mode}\nSCORE: {original_score}/10\n\n"
            f"PROMPT:\n{original_prompt[:1500]}\n\n"
            f"BEST RESPONSE (excerpt):\n{best_resp[:1000]}\n\n"
            f"Identify 3-4 specific improvement strategies. For each, name the strategy type.\n"
            f"Return JSON: {{\"analysis\": \"brief overall assessment\", \"strategies\": [\"clarity\", \"depth\", ...]}}"
        )
        analysis_raw = _budget_call(_SCORE_MODEL, analysis_prompt)
        _optimize_emit(job_id, {"type": "phase", "phase": "analyze", "text": analysis_raw})
        # Parse strategies from the (possibly chatty) reply; keep the default
        # trio if no JSON object can be extracted.
        strategies = ["clarity", "depth", "specificity"]
        try:
            j_s = analysis_raw.find("{")
            j_e = analysis_raw.rfind("}") + 1
            if j_s >= 0 and j_e > j_s:
                parsed = json.loads(analysis_raw[j_s:j_e])
                strategies = parsed.get("strategies", strategies)[:5]
        except Exception:
            pass

        # Phase B: Generate variations
        _optimize_emit(job_id, {"type": "status", "text": f"Generating {len(strategies)} prompt variations..."})
        gen_prompt = (
            f"Generate {len(strategies)} improved versions of this prompt. Each targets a different improvement strategy.\n\n"
            f"ORIGINAL PROMPT:\n{original_prompt[:1500]}\n\n"
            f"STRATEGIES TO APPLY: {', '.join(strategies)}\n\n"
            f"Return a JSON array: [{{\"strategy\": \"...\", \"prompt\": \"the full improved prompt\", \"rationale\": \"why this is better\"}}]\n"
            f"Each prompt should be complete and ready to use, not a description of changes."
        )
        gen_raw = _budget_call(_SCORE_MODEL, gen_prompt)
        variations = []
        try:
            j_s = gen_raw.find("[")
            j_e = gen_raw.rfind("]") + 1
            if j_s >= 0 and j_e > j_s:
                variations = json.loads(gen_raw[j_s:j_e])
        except Exception:
            pass
        if not variations:
            # Fallback: create simple variations
            variations = [
                {"strategy": "clarity", "prompt": f"Please be specific and clear: {original_prompt}", "rationale": "Added clarity directive"},
                {"strategy": "depth", "prompt": f"Provide a comprehensive, detailed answer: {original_prompt}", "rationale": "Added depth directive"},
                {"strategy": "structure", "prompt": f"Structure your response with clear sections and examples: {original_prompt}", "rationale": "Added structure directive"},
            ]
        _optimize_emit(job_id, {"type": "phase", "phase": "variations", "count": len(variations),
                                "variations": [{"strategy": v.get("strategy", "?"), "prompt": v.get("prompt", "")[:200], "rationale": v.get("rationale", "")} for v in variations]})

        # Phase C: Multi-mode test
        test_modes = [original_mode]
        if original_mode != "brainstorm":
            test_modes.append("brainstorm")
        # Budget check: need 1 call per variation×mode, cap if needed
        max_tests = _OPTIMIZE_MAX_CALLS - calls_used - 1  # reserve 1 for summary
        if len(variations) * len(test_modes) > max_tests:
            test_modes = [original_mode]
        if len(variations) > max_tests:
            variations = variations[:max_tests]
        child_run_ids = []
        for vi, var in enumerate(variations):
            var_prompt = var.get("prompt", original_prompt)
            strategy = var.get("strategy", "unknown")
            for mode in test_modes:
                _optimize_emit(job_id, {"type": "test", "variation": vi, "strategy": strategy, "mode": mode, "status": "running"})
                try:
                    # Pick a model from the original run's model list (round-robin by variation).
                    model = models_used[vi % len(models_used)]
                    result = _budget_call(model, var_prompt)
                    test_responses = [{"model": model, "text": result, "role": "response"}]
                    test_config = {"source": "optimize", "parent_run": run_id, "job_id": job_id, "variation": vi, "strategy": strategy}
                    rid = save_run(mode, var_prompt, test_config, test_responses)
                    if rid:
                        child_run_ids.append({"run_id": rid, "variation": vi, "strategy": strategy, "mode": mode, "prompt": var_prompt})
                    _optimize_emit(job_id, {"type": "test", "variation": vi, "strategy": strategy, "mode": mode, "status": "done"})
                except Exception as e:
                    # A single failed test (budget or model error) shouldn't kill the job.
                    _optimize_emit(job_id, {"type": "test", "variation": vi, "strategy": strategy, "mode": mode, "status": f"error: {e}"})

        # Phase D: Wait for scores and rank
        _optimize_emit(job_id, {"type": "status", "text": "Waiting for auto-scoring..."})
        ranked = []
        # BUG FIX: initialize before the poll loop — previously `scores` was
        # only bound inside the try, so a DB error on every attempt caused a
        # NameError when building ranked results below.
        scores = {}
        if child_run_ids:
            child_ids = [c["run_id"] for c in child_run_ids]
            # Poll for scores (auto-scoring runs in background threads);
            # give up after ~60s or once 80% of children are scored.
            for attempt in range(20):
                _time.sleep(3)
                try:
                    with get_db() as conn:
                        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                            cur.execute("SELECT id, quality_score FROM team_runs WHERE id = ANY(%s)", (child_ids,))
                            scores = {r["id"]: r["quality_score"] for r in cur.fetchall()}
                    scored_count = sum(1 for s in scores.values() if s is not None)
                    if scored_count >= len(child_ids) * 0.8:
                        break
                except Exception:
                    pass
            # Build ranked results
            for child in child_run_ids:
                score = scores.get(child["run_id"])
                ranked.append({
                    "run_id": child["run_id"],
                    "variation": child["variation"],
                    "strategy": child["strategy"],
                    "mode": child["mode"],
                    "prompt": child["prompt"],
                    # BUG FIX: explicit None check so a legitimate 0.0 score survives.
                    "score": float(score) if score is not None else None,
                    "snippet": child["prompt"][:150],
                })
            ranked.sort(key=lambda r: r.get("score") or 0, reverse=True)
            _optimize_emit(job_id, {"type": "results", "ranked": ranked, "original_score": original_score})

        # Phase E: Report
        # BUG FIX: `is not None` here too, so a best score of 0.0 isn't
        # silently replaced by the original score.
        best_score = ranked[0]["score"] if ranked and ranked[0].get("score") is not None else original_score
        improvement = (best_score or 0) - (original_score or 0)
        result_data = {
            "parent_run": run_id, "original_score": original_score,
            "best_score": best_score, "improvement": improvement,
            "variations_tested": len(variations), "modes_tested": test_modes,
            "calls_used": calls_used, "ranked": ranked[:5],
        }
        _save_pipeline("optimize", original_prompt[:200],
                       [{"step": "analyze"}, {"step": "generate", "count": len(variations)}, {"step": "test", "tests": len(child_run_ids)}, {"step": "rank"}],
                       result_data, models_used + [_SCORE_MODEL], start * 1000)
        # Tag original run as optimized (best-effort; a DB failure here must
        # not mark the whole job as errored).
        try:
            with get_db() as conn:
                with conn.cursor() as cur:
                    cur.execute(
                        "UPDATE team_runs SET score_metadata = COALESCE(score_metadata, '{}') || %s WHERE id = %s",
                        (json.dumps({"optimized": True, "best_variation_run": ranked[0]["run_id"] if ranked else None, "optimize_job": job_id}), run_id)
                    )
                conn.commit()
        except Exception:
            pass
        _optimize_emit(job_id, {"type": "done", "best_score": best_score, "original_score": original_score, "improvement": improvement, "calls_used": calls_used})
        _optimize_jobs[job_id]["status"] = "completed"
    except Exception as e:
        # Terminal failure: emit error then done so SSE clients shut down.
        _optimize_emit(job_id, {"type": "error", "text": str(e)})
        _optimize_emit(job_id, {"type": "done", "best_score": 0, "original_score": 0, "improvement": 0})
        _optimize_jobs[job_id]["status"] = f"error: {e}"
def _gather_data_source(source): def _gather_data_source(source):
"""Pull data from a system source for pipeline input.""" """Pull data from a system source for pipeline input."""
if source == "team_runs": if source == "team_runs":