Add Auto-Optimize: AI agent for history-driven prompt improvement
When viewing any past run in History, click "Optimize" to trigger an automated workflow that: 1. Analyzes the original prompt + responses + score 2. Identifies improvement strategies (clarity, depth, specificity, etc.) 3. Generates 3-5 improved prompt variations 4. Tests each variation across original mode + brainstorm 5. Auto-scores all results via background judge 6. Ranks results and highlights the winner 7. "Use This" button loads winning prompt into composer Architecture: - _run_optimize(job_id, run_id): background thread, 5-phase engine - POST /api/runs/<id>/optimize: starts optimization job - GET /api/optimize/<job_id>/stream: SSE for live progress - Budget-capped at 15 model calls per optimization - Child runs saved as real team_runs (source: "optimize") - Auto-scored → feeds into analytics + routing table automatically - Results saved to pipeline_runs (pipeline: "optimize") Frontend: - "Optimize" button in history detail panel (accent-colored) - startOptimize(runId): replaces detail view with live optimization stream - Phase cards: Analysis → Variations → Testing → Ranked Results - Score bars with color coding (green/amber/red) - Winner row highlighted with star + "Use This" button Closes the learning loop: system studies its own history → generates better prompts → tests them → scores results → routing table improves. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8ad221b41f
commit
3b4fa449f1
401
llm_team_ui.py
401
llm_team_ui.py
@ -6438,6 +6438,62 @@ def score_run(run_id):
|
|||||||
return jsonify({"error": str(e)}), 500
|
return jsonify({"error": str(e)}), 500
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/runs/<int:run_id>/optimize", methods=["POST"])
@admin_required
def start_optimize(run_id):
    """Start a background auto-optimize job for a past run.

    Verifies the run exists, rejects a second concurrent optimization of
    the same run, then spawns a daemon thread running ``_run_optimize``.

    Returns:
        JSON ``{"ok": True, "job_id": ...}`` on success, or an error
        payload with 404 (run missing), 409 (already optimizing) or
        500 (database failure).
    """
    job_id = f"opt-{run_id}-{int(time.time())}"
    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                cur.execute("SELECT id FROM team_runs WHERE id = %s", (run_id,))
                if not cur.fetchone():
                    return jsonify({"error": "Run not found"}), 404
    except Exception as e:
        return jsonify({"error": str(e)}), 500

    # Don't allow double-optimize.  Snapshot items() into a list first:
    # worker threads mutate _optimize_jobs concurrently, and iterating
    # the live dict can raise "dictionary changed size during iteration".
    for jid, info in list(_optimize_jobs.items()):
        if jid.startswith(f"opt-{run_id}-") and info.get("status") == "running":
            return jsonify({"error": "Already optimizing this run", "job_id": jid}), 409

    # Register job state BEFORE starting the thread so the SSE endpoint
    # and the duplicate check above can observe it immediately.
    _optimize_jobs[job_id] = {"status": "starting"}
    _optimize_queues[job_id] = []
    t = threading.Thread(target=_run_optimize, args=(job_id, run_id), daemon=True)
    _optimize_jobs[job_id]["thread"] = t
    t.start()
    return jsonify({"ok": True, "job_id": job_id})
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/optimize/<job_id>/stream")
@login_required
def optimize_stream(job_id):
    """SSE stream for optimization progress.

    Registers a private event list with the job's fan-out table
    (``_optimize_queues``) and drains it into Server-Sent Events.  The
    worker thread appends event dicts via ``_optimize_emit``; this
    generator polls once per second, emits each event as a ``data:``
    frame, and stops on the ``{"type": "done"}`` sentinel or after
    roughly 5 minutes with no events.
    """
    # This client's private event list; _optimize_emit appends a copy of
    # every event to each list registered for the job.
    q = []
    _optimize_queues.setdefault(job_id, []).append(q)
    def generate():
        try:
            idle_count = 0
            while True:
                if q:
                    idle_count = 0
                    data = q.pop(0)
                    yield f"data: {json.dumps(data)}\n\n"
                    # "done" is the terminal event — close the stream after it.
                    if data.get("type") == "done":
                        break
                else:
                    idle_count += 1
                    if idle_count > 300: # 5 min timeout
                        break
                    time.sleep(1)
                    # SSE comment line; keeps proxies/browsers from
                    # dropping an otherwise-idle connection.
                    yield ": keepalive\n\n"
        finally:
            # Deregister this client's list on disconnect or completion;
            # ValueError means it was already removed.
            try:
                _optimize_queues.get(job_id, []).remove(q)
            except ValueError:
                pass
    # X-Accel-Buffering: no — disables nginx response buffering so each
    # event flushes to the browser immediately.
    return Response(generate(), mimetype="text/event-stream",
                    headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no", "Connection": "keep-alive"})
|
||||||
|
|
||||||
|
|
||||||
@app.route("/api/runs/<int:run_id>/archive", methods=["POST"])
|
@app.route("/api/runs/<int:run_id>/archive", methods=["POST"])
|
||||||
@login_required
|
@login_required
|
||||||
def archive_run(run_id):
|
def archive_run(run_id):
|
||||||
@ -6706,6 +6762,145 @@ function toast(msg, ok) {
|
|||||||
setTimeout(function(){t.remove()},2500);
|
setTimeout(function(){t.remove()},2500);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Kick off an auto-optimize job for a history run and render its live
// SSE progress into the detail panel: status line, phase cards
// (analysis / variations), per-test status, then a ranked results table
// with score bars and a "Use This" button on the winner.
async function startOptimize(runId) {
  var r = await fetch('/api/runs/'+runId+'/optimize', {method:'POST'});
  var data = await r.json();
  if (data.error) { toast(data.error, false); return; }
  var jobId = data.job_id;

  // Replace the run-detail view with the optimization view.
  var panel = document.getElementById('detail-panel');
  panel.textContent = '';

  // Header: back button + title.
  var hdr = document.createElement('div'); hdr.style.cssText = 'display:flex;align-items:center;gap:10px;margin-bottom:16px';
  var backBtn = document.createElement('button'); backBtn.className = 'tool-btn';
  backBtn.textContent = '\u2190 Back'; backBtn.onclick = function(){ openDetail(runId); };
  hdr.appendChild(backBtn);
  var title = document.createElement('span');
  title.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:10px;text-transform:uppercase;letter-spacing:1.5px;color:var(--accent);font-weight:700';
  title.textContent = 'OPTIMIZING RUN #'+runId;
  hdr.appendChild(title);
  panel.appendChild(hdr);

  // Status line, continuously rewritten by status/test/done events.
  var statusEl = document.createElement('div');
  statusEl.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:11px;color:var(--text2);margin-bottom:12px';
  statusEl.textContent = 'Starting optimization...';
  panel.appendChild(statusEl);

  // Results container: phase cards and the ranked table accumulate here.
  var resultsEl = document.createElement('div');
  panel.appendChild(resultsEl);

  // SSE stream of progress events from the background engine.
  var es = new EventSource('/api/optimize/'+jobId+'/stream');
  es.onmessage = function(e) {
    var d = JSON.parse(e.data);

    // Plain status text update.
    if (d.type === 'status') {
      statusEl.textContent = d.text;
    }

    // Error events are appended (not replacing), so earlier output stays visible.
    if (d.type === 'error') {
      var err = document.createElement('div');
      err.style.cssText = 'color:var(--red);font-family:JetBrains Mono,monospace;font-size:11px;margin:8px 0;border-left:2px solid var(--red);padding-left:8px';
      err.textContent = 'Error: ' + d.text;
      resultsEl.appendChild(err);
    }

    // Phase card: either the analysis text or the generated variation list.
    if (d.type === 'phase') {
      var block = document.createElement('div');
      block.style.cssText = 'background:var(--surface);border:1px solid var(--border);border-radius:2px;padding:12px;margin-bottom:8px';
      var phTitle = document.createElement('div');
      phTitle.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:9px;text-transform:uppercase;letter-spacing:1.5px;color:var(--accent);margin-bottom:6px;font-weight:700';
      phTitle.textContent = d.phase === 'analyze' ? 'Analysis' : d.count + ' Variations Generated';
      block.appendChild(phTitle);
      var body = document.createElement('div');
      body.style.cssText = 'font-size:12px;line-height:1.6;color:var(--text);white-space:pre-wrap;max-height:200px;overflow-y:auto';
      if (d.phase === 'analyze') {
        // Raw judge-model analysis text, shown verbatim.
        body.textContent = d.text || '';
      } else if (d.variations) {
        // One row per generated prompt variation: "V1 [strategy] prompt…".
        d.variations.forEach(function(v, i) {
          var line = document.createElement('div');
          line.style.cssText = 'margin-bottom:8px;padding:6px 8px;background:rgba(0,0,0,0.1);border-radius:2px';
          line.textContent = 'V'+(i+1)+' ['+v.strategy+'] '+v.prompt;
          body.appendChild(line);
        });
        body.style.whiteSpace = 'normal';
      }
      block.appendChild(body);
      resultsEl.appendChild(block);
    }

    // Per-test progress goes to the status line only (no card per test).
    if (d.type === 'test') {
      statusEl.textContent = 'Testing V'+(d.variation+1)+' ['+d.strategy+'] in '+d.mode+'... '+d.status;
    }

    // Final ranked results table.
    if (d.type === 'results') {
      statusEl.textContent = 'Optimization complete!';
      var table = document.createElement('div'); table.style.cssText = 'margin-top:12px';
      var tTitle = document.createElement('div');
      tTitle.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:9px;text-transform:uppercase;letter-spacing:2px;color:var(--accent);margin-bottom:10px;font-weight:700';
      tTitle.textContent = 'RANKED RESULTS' + (d.original_score ? ' (original: '+d.original_score+'/10)' : '');
      table.appendChild(tTitle);
      (d.ranked||[]).forEach(function(r, i) {
        var row = document.createElement('div');
        row.style.cssText = 'display:flex;align-items:center;gap:10px;padding:10px 12px;margin-bottom:4px;background:var(--surface);border:1px solid var(--border);border-radius:2px';
        // Winner row (index 0) gets accent border + star instead of rank number.
        if (i === 0) row.style.borderColor = 'var(--accent)';
        var rank = document.createElement('span');
        rank.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:14px;font-weight:700;min-width:24px;color:'+(i===0?'var(--accent)':'var(--text2)');
        rank.textContent = i === 0 ? '\u2605' : '#'+(i+1);
        row.appendChild(rank);
        var info = document.createElement('div'); info.style.cssText = 'flex:1;min-width:0';
        var label = document.createElement('div');
        label.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:10px;color:var(--text2);text-transform:uppercase;letter-spacing:0.5px';
        label.textContent = 'V'+(r.variation+1)+' ['+r.strategy+'] \u00D7 '+r.mode;
        info.appendChild(label);
        var snippet = document.createElement('div');
        snippet.style.cssText = 'font-size:11px;color:var(--text);margin-top:2px;white-space:nowrap;overflow:hidden;text-overflow:ellipsis';
        snippet.textContent = r.snippet || '';
        info.appendChild(snippet);
        row.appendChild(info);
        // Score bar: green >= 7, accent >= 5, red below (pct of 10).
        var scoreBar = document.createElement('div'); scoreBar.style.cssText = 'width:80px;display:flex;align-items:center;gap:6px';
        var bar = document.createElement('div'); bar.style.cssText = 'flex:1;height:6px;background:rgba(0,0,0,0.15);border-radius:3px;overflow:hidden';
        var fill = document.createElement('div');
        var pct = ((r.score||0)/10)*100;
        fill.style.cssText = 'height:100%;border-radius:3px;background:'+(pct>=70?'var(--green)':pct>=50?'var(--accent)':'var(--red)')+';width:'+pct+'%';
        bar.appendChild(fill); scoreBar.appendChild(bar);
        var scoreNum = document.createElement('span');
        scoreNum.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:12px;font-weight:700;min-width:24px;text-align:right';
        scoreNum.textContent = r.score ? r.score.toFixed(1) : '?';
        scoreBar.appendChild(scoreNum);
        row.appendChild(scoreBar);
        // "Use This" only on the winner, and only when its prompt was returned.
        if (i === 0 && r.prompt) {
          var useBtn = document.createElement('button'); useBtn.className = 'tool-btn';
          useBtn.style.cssText = 'color:var(--accent);border-color:var(--accent);font-size:9px;white-space:nowrap';
          useBtn.textContent = 'Use This';
          useBtn.onclick = function(){
            // NOTE(review): setting #prompt and then navigating to '/'
            // reloads the document, which discards the value unless the
            // composer lives in this same page — verify; a
            // sessionStorage handoff read by the composer would be robust.
            var promptEl = document.getElementById('prompt');
            if (promptEl) promptEl.value = r.prompt;
            window.location.href = '/';
          };
          row.appendChild(useBtn);
        }
        table.appendChild(row);
      });
      resultsEl.appendChild(table);
    }

    // Terminal event: close the stream and summarize the outcome.
    if (d.type === 'done') {
      es.close();
      if (d.improvement > 0) {
        statusEl.textContent = 'Done! Best: '+(d.best_score||'?')+'/10 (+'+(d.improvement||0).toFixed(1)+' improvement) | '+d.calls_used+' model calls';
        statusEl.style.color = 'var(--green)';
      } else {
        statusEl.textContent = 'Done! Best: '+(d.best_score||'?')+'/10 | Original: '+(d.original_score||'?')+'/10 | '+d.calls_used+' calls';
      }
    }
  };
  // Server closed (or network dropped) without a "done" event.
  es.onerror = function() { es.close(); statusEl.textContent += ' (stream ended)'; };
}
|
||||||
|
|
||||||
async function loadRuns() {
|
async function loadRuns() {
|
||||||
var mode = document.getElementById('filter-mode').value;
|
var mode = document.getElementById('filter-mode').value;
|
||||||
var tag = document.getElementById('filter-tag').value;
|
var tag = document.getElementById('filter-tag').value;
|
||||||
@ -6819,6 +7014,11 @@ async function openDetail(id) {
|
|||||||
var delBtn = document.createElement('button'); delBtn.className = 'tool-btn red'; delBtn.textContent = 'Delete';
|
var delBtn = document.createElement('button'); delBtn.className = 'tool-btn red'; delBtn.textContent = 'Delete';
|
||||||
delBtn.onclick = function(){ if(confirm('Delete permanently?')){fetch('/api/runs/'+id,{method:'DELETE'}).then(function(){toast('Deleted',true);loadRuns();panel.className='detail-panel'})} };
|
delBtn.onclick = function(){ if(confirm('Delete permanently?')){fetch('/api/runs/'+id,{method:'DELETE'}).then(function(){toast('Deleted',true);loadRuns();panel.className='detail-panel'})} };
|
||||||
actions.appendChild(delBtn);
|
actions.appendChild(delBtn);
|
||||||
|
var optBtn = document.createElement('button'); optBtn.className = 'tool-btn';
|
||||||
|
optBtn.style.cssText = 'color:var(--accent);border-color:var(--accent);margin-left:auto';
|
||||||
|
optBtn.textContent = '\u26A1 Optimize';
|
||||||
|
optBtn.onclick = function(){ startOptimize(id); };
|
||||||
|
actions.appendChild(optBtn);
|
||||||
panel.appendChild(actions);
|
panel.appendChild(actions);
|
||||||
|
|
||||||
// Responses
|
// Responses
|
||||||
@ -7043,6 +7243,207 @@ def get_self_report(rid):
|
|||||||
_meta_threads = {}
|
_meta_threads = {}
|
||||||
_meta_status = {} # pipeline_id -> {stage, substep, progress}
|
_meta_status = {} # pipeline_id -> {stage, substep, progress}
|
||||||
|
|
||||||
|
# ─── AUTO-OPTIMIZE ENGINE ────────────────────────────────────
|
||||||
|
_optimize_jobs = {} # job_id -> {"thread": Thread, "status": str}
|
||||||
|
_optimize_queues = {} # job_id -> [[event_dicts]]
|
||||||
|
_OPTIMIZE_MAX_CALLS = 15
|
||||||
|
|
||||||
|
def _optimize_emit(job_id, data):
|
||||||
|
for q in _optimize_queues.get(job_id, []):
|
||||||
|
q.append(data)
|
||||||
|
|
||||||
|
|
||||||
|
def _run_optimize(job_id, run_id):
    """Background: analyze a past run, generate improved prompts, test them, rank results.

    Five phases, each reported to SSE listeners via ``_optimize_emit``:
      A. Analyze the original prompt/response/score with the judge model.
      B. Generate improved prompt variations (one per suggested strategy).
      C. Test each variation (optionally also in brainstorm mode), saving
         each result as a real child team_run (source: "optimize").
      D. Poll for the background auto-scorer's scores, then rank results.
      E. Persist a pipeline record and tag the original run as optimized.

    All model traffic goes through ``_budget_call``, hard-capped at
    ``_OPTIMIZE_MAX_CALLS`` per job.  Failures are reported as events and
    the function never propagates an exception to its thread.
    """
    import time as _time
    start = _time.time()
    calls_used = 0
    _optimize_jobs[job_id]["status"] = "running"

    def _budget_call(model, prompt):
        # Enforce the per-job model-call budget before delegating to safe_query.
        nonlocal calls_used
        if calls_used >= _OPTIMIZE_MAX_CALLS:
            raise RuntimeError("Budget exhausted")
        calls_used += 1
        return safe_query(model, prompt)

    try:
        # Fetch original run
        with get_db() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute("SELECT * FROM team_runs WHERE id = %s", (run_id,))
                run = cur.fetchone()
                if not run:
                    _optimize_emit(job_id, {"type": "error", "text": "Run not found"})
                    _optimize_emit(job_id, {"type": "done"})
                    return

        original_prompt = run["prompt"]
        original_mode = run["mode"]
        original_score = run.get("quality_score") or 0
        responses = run.get("responses") or []
        models_used = run.get("models_used") or ["qwen2.5:latest"]
        # The longest non-error response serves as the "best" excerpt for analysis.
        best_resp = ""
        if responses:
            candidates = [r for r in responses if r.get("role") != "error" and r.get("text")]
            if candidates:
                best_resp = max(candidates, key=lambda r: len(r.get("text", "")))["text"][:2000]

        # Phase A: Analyze the original run with the judge model.
        _optimize_emit(job_id, {"type": "status", "text": "Analyzing original run..."})
        analysis_prompt = (
            f"Analyze this LLM prompt for improvement opportunities.\n\n"
            f"MODE: {original_mode}\nSCORE: {original_score}/10\n\n"
            f"PROMPT:\n{original_prompt[:1500]}\n\n"
            f"BEST RESPONSE (excerpt):\n{best_resp[:1000]}\n\n"
            f"Identify 3-4 specific improvement strategies. For each, name the strategy type.\n"
            f"Return JSON: {{\"analysis\": \"brief overall assessment\", \"strategies\": [\"clarity\", \"depth\", ...]}}"
        )
        analysis_raw = _budget_call(_SCORE_MODEL, analysis_prompt)
        _optimize_emit(job_id, {"type": "phase", "phase": "analyze", "text": analysis_raw})

        # Parse strategies from the first {...} span; defaults survive any failure.
        strategies = ["clarity", "depth", "specificity"]
        try:
            j_s = analysis_raw.find("{")
            j_e = analysis_raw.rfind("}") + 1
            if j_s >= 0 and j_e > j_s:
                parsed = json.loads(analysis_raw[j_s:j_e])
                strategies = parsed.get("strategies", strategies)[:5]
        except Exception:
            pass

        # Phase B: Generate one improved prompt per strategy.
        _optimize_emit(job_id, {"type": "status", "text": f"Generating {len(strategies)} prompt variations..."})
        gen_prompt = (
            f"Generate {len(strategies)} improved versions of this prompt. Each targets a different improvement strategy.\n\n"
            f"ORIGINAL PROMPT:\n{original_prompt[:1500]}\n\n"
            f"STRATEGIES TO APPLY: {', '.join(strategies)}\n\n"
            f"Return a JSON array: [{{\"strategy\": \"...\", \"prompt\": \"the full improved prompt\", \"rationale\": \"why this is better\"}}]\n"
            f"Each prompt should be complete and ready to use, not a description of changes."
        )
        gen_raw = _budget_call(_SCORE_MODEL, gen_prompt)

        variations = []
        try:
            j_s = gen_raw.find("[")
            j_e = gen_raw.rfind("]") + 1
            if j_s >= 0 and j_e > j_s:
                variations = json.loads(gen_raw[j_s:j_e])
        except Exception:
            pass
        if not variations:
            # Fallback: create simple directive-prefixed variations so the
            # pipeline still produces testable candidates.
            variations = [
                {"strategy": "clarity", "prompt": f"Please be specific and clear: {original_prompt}", "rationale": "Added clarity directive"},
                {"strategy": "depth", "prompt": f"Provide a comprehensive, detailed answer: {original_prompt}", "rationale": "Added depth directive"},
                {"strategy": "structure", "prompt": f"Structure your response with clear sections and examples: {original_prompt}", "rationale": "Added structure directive"},
            ]

        _optimize_emit(job_id, {"type": "phase", "phase": "variations", "count": len(variations),
                       "variations": [{"strategy": v.get("strategy", "?"), "prompt": v.get("prompt", "")[:200], "rationale": v.get("rationale", "")} for v in variations]})

        # Phase C: Test each variation, in the original mode plus brainstorm.
        test_modes = [original_mode]
        if original_mode != "brainstorm":
            test_modes.append("brainstorm")
        # Budget check: need 1 call per variation×mode, cap if needed
        max_tests = _OPTIMIZE_MAX_CALLS - calls_used - 1  # reserve 1 for summary
        if len(variations) * len(test_modes) > max_tests:
            test_modes = [original_mode]
        if len(variations) > max_tests:
            variations = variations[:max_tests]

        child_run_ids = []
        for vi, var in enumerate(variations):
            var_prompt = var.get("prompt", original_prompt)
            strategy = var.get("strategy", "unknown")
            for mode in test_modes:
                _optimize_emit(job_id, {"type": "test", "variation": vi, "strategy": strategy, "mode": mode, "status": "running"})
                try:
                    # Pick a model from the original run's model list (round-robin).
                    model = models_used[vi % len(models_used)]
                    result = _budget_call(model, var_prompt)
                    test_responses = [{"model": model, "text": result, "role": "response"}]
                    test_config = {"source": "optimize", "parent_run": run_id, "job_id": job_id, "variation": vi, "strategy": strategy}
                    # Child runs are real team_runs, so they flow into
                    # auto-scoring, analytics and the routing table.
                    rid = save_run(mode, var_prompt, test_config, test_responses)
                    if rid:
                        child_run_ids.append({"run_id": rid, "variation": vi, "strategy": strategy, "mode": mode, "prompt": var_prompt})
                    _optimize_emit(job_id, {"type": "test", "variation": vi, "strategy": strategy, "mode": mode, "status": "done"})
                except Exception as e:
                    _optimize_emit(job_id, {"type": "test", "variation": vi, "strategy": strategy, "mode": mode, "status": f"error: {e}"})

        # Phase D: Wait for the background auto-scorer, then rank.
        _optimize_emit(job_id, {"type": "status", "text": "Waiting for auto-scoring..."})
        ranked = []
        if child_run_ids:
            child_ids = [c["run_id"] for c in child_run_ids]
            # BUGFIX: start from an empty score map so a total polling
            # failure below cannot leave `scores` undefined (NameError
            # at the ranking step).
            scores = {}
            # Poll for scores (auto-scoring runs in background threads)
            for attempt in range(20):
                _time.sleep(3)
                try:
                    with get_db() as conn:
                        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                            cur.execute("SELECT id, quality_score FROM team_runs WHERE id = ANY(%s)", (child_ids,))
                            scores = {r["id"]: r["quality_score"] for r in cur.fetchall()}
                    # Stop early once ~80% of children are scored.
                    scored_count = sum(1 for s in scores.values() if s is not None)
                    if scored_count >= len(child_ids) * 0.8:
                        break
                except Exception:
                    pass

            # Build ranked results
            for child in child_run_ids:
                score = scores.get(child["run_id"])
                ranked.append({
                    "run_id": child["run_id"],
                    "variation": child["variation"],
                    "strategy": child["strategy"],
                    "mode": child["mode"],
                    "prompt": child["prompt"],
                    # `is not None` so a legitimate score of 0 is kept.
                    "score": float(score) if score is not None else None,
                    "snippet": child["prompt"][:150],
                })
            ranked.sort(key=lambda r: r.get("score") or 0, reverse=True)

        _optimize_emit(job_id, {"type": "results", "ranked": ranked, "original_score": original_score})

        # Phase E: Report and persist.
        best_score = ranked[0]["score"] if ranked and ranked[0].get("score") is not None else original_score
        improvement = (best_score or 0) - (original_score or 0)

        duration = int((_time.time() - start) * 1000)
        result_data = {
            "parent_run": run_id, "original_score": original_score,
            "best_score": best_score, "improvement": improvement,
            "variations_tested": len(variations), "modes_tested": test_modes,
            "calls_used": calls_used, "ranked": ranked[:5],
        }
        _save_pipeline("optimize", original_prompt[:200],
                       [{"step": "analyze"}, {"step": "generate", "count": len(variations)}, {"step": "test", "tests": len(child_run_ids)}, {"step": "rank"}],
                       result_data, models_used + [_SCORE_MODEL], start * 1000)

        # Tag original run as optimized (best-effort; never fails the job).
        try:
            with get_db() as conn:
                with conn.cursor() as cur:
                    cur.execute(
                        "UPDATE team_runs SET score_metadata = COALESCE(score_metadata, '{}') || %s WHERE id = %s",
                        (json.dumps({"optimized": True, "best_variation_run": ranked[0]["run_id"] if ranked else None, "optimize_job": job_id}), run_id)
                    )
                    conn.commit()
        except Exception:
            pass

        _optimize_emit(job_id, {"type": "done", "best_score": best_score, "original_score": original_score, "improvement": improvement, "calls_used": calls_used})
        _optimize_jobs[job_id]["status"] = "completed"

    except Exception as e:
        # Surface the failure to listeners and always emit the terminal
        # "done" sentinel so SSE clients don't hang until timeout.
        _optimize_emit(job_id, {"type": "error", "text": str(e)})
        _optimize_emit(job_id, {"type": "done", "best_score": 0, "original_score": 0, "improvement": 0})
        _optimize_jobs[job_id]["status"] = f"error: {e}"
|
||||||
|
|
||||||
|
|
||||||
def _gather_data_source(source):
|
def _gather_data_source(source):
|
||||||
"""Pull data from a system source for pipeline input."""
|
"""Pull data from a system source for pipeline input."""
|
||||||
if source == "team_runs":
|
if source == "team_runs":
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user