diff --git a/llm_team_ui.py b/llm_team_ui.py index aadae10..3424a27 100644 --- a/llm_team_ui.py +++ b/llm_team_ui.py @@ -6438,6 +6438,62 @@ def score_run(run_id): return jsonify({"error": str(e)}), 500 +@app.route("/api/runs//optimize", methods=["POST"]) +@admin_required +def start_optimize(run_id): + """Start an auto-optimize job for a past run.""" + job_id = f"opt-{run_id}-{int(time.time())}" + try: + with get_db() as conn: + with conn.cursor() as cur: + cur.execute("SELECT id FROM team_runs WHERE id = %s", (run_id,)) + if not cur.fetchone(): + return jsonify({"error": "Run not found"}), 404 + except Exception as e: + return jsonify({"error": str(e)}), 500 + # Don't allow double-optimize + for jid, info in _optimize_jobs.items(): + if jid.startswith(f"opt-{run_id}-") and info.get("status") == "running": + return jsonify({"error": "Already optimizing this run", "job_id": jid}), 409 + _optimize_jobs[job_id] = {"status": "starting"} + _optimize_queues[job_id] = [] + t = threading.Thread(target=_run_optimize, args=(job_id, run_id), daemon=True) + _optimize_jobs[job_id]["thread"] = t + t.start() + return jsonify({"ok": True, "job_id": job_id}) + + +@app.route("/api/optimize//stream") +@login_required +def optimize_stream(job_id): + """SSE stream for optimization progress.""" + q = [] + _optimize_queues.setdefault(job_id, []).append(q) + def generate(): + try: + idle_count = 0 + while True: + if q: + idle_count = 0 + data = q.pop(0) + yield f"data: {json.dumps(data)}\n\n" + if data.get("type") == "done": + break + else: + idle_count += 1 + if idle_count > 300: # 5 min timeout + break + time.sleep(1) + yield ": keepalive\n\n" + finally: + try: + _optimize_queues.get(job_id, []).remove(q) + except ValueError: + pass + return Response(generate(), mimetype="text/event-stream", + headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no", "Connection": "keep-alive"}) + + @app.route("/api/runs//archive", methods=["POST"]) @login_required def archive_run(run_id): @@ 
// Launch an optimize job for a past run and render live progress via SSE.
// FIX: fetch()/r.json() were un-guarded — a network failure or an HTML error
// page produced an unhandled promise rejection with no user feedback.
async function startOptimize(runId) {
  var r, data;
  try {
    r = await fetch('/api/runs/'+runId+'/optimize', {method:'POST'});
    data = await r.json();
  } catch (e) {
    toast('Optimize request failed', false);
    return;
  }
  if (data.error) { toast(data.error, false); return; }
  var jobId = data.job_id;

  var panel = document.getElementById('detail-panel');
  panel.textContent = '';

  // Header: back button + title
  var hdr = document.createElement('div'); hdr.style.cssText = 'display:flex;align-items:center;gap:10px;margin-bottom:16px';
  var backBtn = document.createElement('button'); backBtn.className = 'tool-btn';
  backBtn.textContent = '\u2190 Back'; backBtn.onclick = function(){ openDetail(runId); };
  hdr.appendChild(backBtn);
  var title = document.createElement('span');
  title.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:10px;text-transform:uppercase;letter-spacing:1.5px;color:var(--accent);font-weight:700';
  title.textContent = 'OPTIMIZING RUN #'+runId;
  hdr.appendChild(title);
  panel.appendChild(hdr);

  // Live status line, updated by 'status'/'test'/'done' events.
  var statusEl = document.createElement('div');
  statusEl.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:11px;color:var(--text2);margin-bottom:12px';
  statusEl.textContent = 'Starting optimization...';
  panel.appendChild(statusEl);

  // Container for phase blocks / errors / ranked results.
  var resultsEl = document.createElement('div');
  panel.appendChild(resultsEl);

  // SSE stream of job events (keepalive comment lines never reach onmessage).
  var es = new EventSource('/api/optimize/'+jobId+'/stream');
  es.onmessage = function(e) {
    var d = JSON.parse(e.data);

    if (d.type === 'status') {
      statusEl.textContent = d.text;
    }

    if (d.type === 'error') {
      var err = document.createElement('div');
      err.style.cssText = 'color:var(--red);font-family:JetBrains Mono,monospace;font-size:11px;margin:8px 0;border-left:2px solid var(--red);padding-left:8px';
      err.textContent = 'Error: ' + d.text;
      resultsEl.appendChild(err);
    }

    if (d.type === 'phase') {
      // Either the analysis text, or the list of generated prompt variations.
      var block = document.createElement('div');
      block.style.cssText = 'background:var(--surface);border:1px solid var(--border);border-radius:2px;padding:12px;margin-bottom:8px';
      var phTitle = document.createElement('div');
      phTitle.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:9px;text-transform:uppercase;letter-spacing:1.5px;color:var(--accent);margin-bottom:6px;font-weight:700';
      phTitle.textContent = d.phase === 'analyze' ? 'Analysis' : d.count + ' Variations Generated';
      block.appendChild(phTitle);
      var body = document.createElement('div');
      body.style.cssText = 'font-size:12px;line-height:1.6;color:var(--text);white-space:pre-wrap;max-height:200px;overflow-y:auto';
      if (d.phase === 'analyze') {
        body.textContent = d.text || '';
      } else if (d.variations) {
        d.variations.forEach(function(v, i) {
          var line = document.createElement('div');
          line.style.cssText = 'margin-bottom:8px;padding:6px 8px;background:rgba(0,0,0,0.1);border-radius:2px';
          line.textContent = 'V'+(i+1)+' ['+v.strategy+'] '+v.prompt;
          body.appendChild(line);
        });
        body.style.whiteSpace = 'normal';
      }
      block.appendChild(body);
      resultsEl.appendChild(block);
    }

    if (d.type === 'test') {
      statusEl.textContent = 'Testing V'+(d.variation+1)+' ['+d.strategy+'] in '+d.mode+'... '+d.status;
    }

    if (d.type === 'results') {
      statusEl.textContent = 'Optimization complete!';
      var table = document.createElement('div'); table.style.cssText = 'margin-top:12px';
      var tTitle = document.createElement('div');
      tTitle.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:9px;text-transform:uppercase;letter-spacing:2px;color:var(--accent);margin-bottom:10px;font-weight:700';
      tTitle.textContent = 'RANKED RESULTS' + (d.original_score ? ' (original: '+d.original_score+'/10)' : '');
      table.appendChild(tTitle);
      (d.ranked||[]).forEach(function(r, i) {
        var row = document.createElement('div');
        row.style.cssText = 'display:flex;align-items:center;gap:10px;padding:10px 12px;margin-bottom:4px;background:var(--surface);border:1px solid var(--border);border-radius:2px';
        if (i === 0) row.style.borderColor = 'var(--accent)';
        var rank = document.createElement('span');
        rank.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:14px;font-weight:700;min-width:24px;color:'+(i===0?'var(--accent)':'var(--text2)');
        rank.textContent = i === 0 ? '\u2605' : '#'+(i+1);
        row.appendChild(rank);
        var info = document.createElement('div'); info.style.cssText = 'flex:1;min-width:0';
        var label = document.createElement('div');
        label.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:10px;color:var(--text2);text-transform:uppercase;letter-spacing:0.5px';
        label.textContent = 'V'+(r.variation+1)+' ['+r.strategy+'] \u00D7 '+r.mode;
        info.appendChild(label);
        var snippet = document.createElement('div');
        snippet.style.cssText = 'font-size:11px;color:var(--text);margin-top:2px;white-space:nowrap;overflow:hidden;text-overflow:ellipsis';
        snippet.textContent = r.snippet || '';
        info.appendChild(snippet);
        row.appendChild(info);
        var scoreBar = document.createElement('div'); scoreBar.style.cssText = 'width:80px;display:flex;align-items:center;gap:6px';
        var bar = document.createElement('div'); bar.style.cssText = 'flex:1;height:6px;background:rgba(0,0,0,0.15);border-radius:3px;overflow:hidden';
        var fill = document.createElement('div');
        var pct = ((r.score||0)/10)*100;
        fill.style.cssText = 'height:100%;border-radius:3px;background:'+(pct>=70?'var(--green)':pct>=50?'var(--accent)':'var(--red)')+';width:'+pct+'%';
        bar.appendChild(fill); scoreBar.appendChild(bar);
        var scoreNum = document.createElement('span');
        scoreNum.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:12px;font-weight:700;min-width:24px;text-align:right';
        scoreNum.textContent = r.score ? r.score.toFixed(1) : '?';
        scoreBar.appendChild(scoreNum);
        row.appendChild(scoreBar);
        if (i === 0 && r.prompt) {
          var useBtn = document.createElement('button'); useBtn.className = 'tool-btn';
          useBtn.style.cssText = 'color:var(--accent);border-color:var(--accent);font-size:9px;white-space:nowrap';
          useBtn.textContent = 'Use This';
          useBtn.onclick = function(){
            // NOTE(review): navigating reloads the page, so a value set on the
            // current document is lost unless '/' restores it server-side —
            // confirm, or persist via localStorage before navigating.
            var promptEl = document.getElementById('prompt');
            if (promptEl) promptEl.value = r.prompt;
            window.location.href = '/';
          };
          row.appendChild(useBtn);
        }
        table.appendChild(row);
      });
      resultsEl.appendChild(table);
    }

    if (d.type === 'done') {
      es.close();
      if (d.improvement > 0) {
        statusEl.textContent = 'Done! Best: '+(d.best_score||'?')+'/10 (+'+(d.improvement||0).toFixed(1)+' improvement) | '+d.calls_used+' model calls';
        statusEl.style.color = 'var(--green)';
      } else {
        statusEl.textContent = 'Done! Best: '+(d.best_score||'?')+'/10 | Original: '+(d.original_score||'?')+'/10 | '+d.calls_used+' calls';
      }
    }
  };
  es.onerror = function() { es.close(); statusEl.textContent += ' (stream ended)'; };
}
Best: '+(d.best_score||'?')+'/10 | Original: '+(d.original_score||'?')+'/10 | '+d.calls_used+' calls'; + } + } + }; + es.onerror = function() { es.close(); statusEl.textContent += ' (stream ended)'; }; +} + async function loadRuns() { var mode = document.getElementById('filter-mode').value; var tag = document.getElementById('filter-tag').value; @@ -6819,6 +7014,11 @@ async function openDetail(id) { var delBtn = document.createElement('button'); delBtn.className = 'tool-btn red'; delBtn.textContent = 'Delete'; delBtn.onclick = function(){ if(confirm('Delete permanently?')){fetch('/api/runs/'+id,{method:'DELETE'}).then(function(){toast('Deleted',true);loadRuns();panel.className='detail-panel'})} }; actions.appendChild(delBtn); + var optBtn = document.createElement('button'); optBtn.className = 'tool-btn'; + optBtn.style.cssText = 'color:var(--accent);border-color:var(--accent);margin-left:auto'; + optBtn.textContent = '\u26A1 Optimize'; + optBtn.onclick = function(){ startOptimize(id); }; + actions.appendChild(optBtn); panel.appendChild(actions); // Responses @@ -7043,6 +7243,207 @@ def get_self_report(rid): _meta_threads = {} _meta_status = {} # pipeline_id -> {stage, substep, progress} +# ─── AUTO-OPTIMIZE ENGINE ──────────────────────────────────── +_optimize_jobs = {} # job_id -> {"thread": Thread, "status": str} +_optimize_queues = {} # job_id -> [[event_dicts]] +_OPTIMIZE_MAX_CALLS = 15 + +def _optimize_emit(job_id, data): + for q in _optimize_queues.get(job_id, []): + q.append(data) + + +def _run_optimize(job_id, run_id): + """Background: analyze a past run, generate improved prompts, test them, rank results.""" + import time as _time + start = _time.time() + calls_used = 0 + _optimize_jobs[job_id]["status"] = "running" + + def _budget_call(model, prompt): + nonlocal calls_used + if calls_used >= _OPTIMIZE_MAX_CALLS: + raise RuntimeError("Budget exhausted") + calls_used += 1 + return safe_query(model, prompt) + + try: + # Fetch original run + with get_db() as 
conn: + with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute("SELECT * FROM team_runs WHERE id = %s", (run_id,)) + run = cur.fetchone() + if not run: + _optimize_emit(job_id, {"type": "error", "text": "Run not found"}) + _optimize_emit(job_id, {"type": "done"}) + return + + original_prompt = run["prompt"] + original_mode = run["mode"] + original_score = run.get("quality_score") or 0 + responses = run.get("responses") or [] + models_used = run.get("models_used") or ["qwen2.5:latest"] + best_resp = "" + if responses: + candidates = [r for r in responses if r.get("role") != "error" and r.get("text")] + if candidates: + best_resp = max(candidates, key=lambda r: len(r.get("text", "")))["text"][:2000] + + # Phase A: Analyze + _optimize_emit(job_id, {"type": "status", "text": "Analyzing original run..."}) + analysis_prompt = ( + f"Analyze this LLM prompt for improvement opportunities.\n\n" + f"MODE: {original_mode}\nSCORE: {original_score}/10\n\n" + f"PROMPT:\n{original_prompt[:1500]}\n\n" + f"BEST RESPONSE (excerpt):\n{best_resp[:1000]}\n\n" + f"Identify 3-4 specific improvement strategies. For each, name the strategy type.\n" + f"Return JSON: {{\"analysis\": \"brief overall assessment\", \"strategies\": [\"clarity\", \"depth\", ...]}}" + ) + analysis_raw = _budget_call(_SCORE_MODEL, analysis_prompt) + _optimize_emit(job_id, {"type": "phase", "phase": "analyze", "text": analysis_raw}) + + # Parse strategies + strategies = ["clarity", "depth", "specificity"] + try: + j_s = analysis_raw.find("{") + j_e = analysis_raw.rfind("}") + 1 + if j_s >= 0 and j_e > j_s: + parsed = json.loads(analysis_raw[j_s:j_e]) + strategies = parsed.get("strategies", strategies)[:5] + except Exception: + pass + + # Phase B: Generate variations + _optimize_emit(job_id, {"type": "status", "text": f"Generating {len(strategies)} prompt variations..."}) + gen_prompt = ( + f"Generate {len(strategies)} improved versions of this prompt. 
Each targets a different improvement strategy.\n\n" + f"ORIGINAL PROMPT:\n{original_prompt[:1500]}\n\n" + f"STRATEGIES TO APPLY: {', '.join(strategies)}\n\n" + f"Return a JSON array: [{{\"strategy\": \"...\", \"prompt\": \"the full improved prompt\", \"rationale\": \"why this is better\"}}]\n" + f"Each prompt should be complete and ready to use, not a description of changes." + ) + gen_raw = _budget_call(_SCORE_MODEL, gen_prompt) + + variations = [] + try: + j_s = gen_raw.find("[") + j_e = gen_raw.rfind("]") + 1 + if j_s >= 0 and j_e > j_s: + variations = json.loads(gen_raw[j_s:j_e]) + except Exception: + pass + if not variations: + # Fallback: create simple variations + variations = [ + {"strategy": "clarity", "prompt": f"Please be specific and clear: {original_prompt}", "rationale": "Added clarity directive"}, + {"strategy": "depth", "prompt": f"Provide a comprehensive, detailed answer: {original_prompt}", "rationale": "Added depth directive"}, + {"strategy": "structure", "prompt": f"Structure your response with clear sections and examples: {original_prompt}", "rationale": "Added structure directive"}, + ] + + _optimize_emit(job_id, {"type": "phase", "phase": "variations", "count": len(variations), + "variations": [{"strategy": v.get("strategy", "?"), "prompt": v.get("prompt", "")[:200], "rationale": v.get("rationale", "")} for v in variations]}) + + # Phase C: Multi-mode test + test_modes = [original_mode] + if original_mode != "brainstorm": + test_modes.append("brainstorm") + # Budget check: need 1 call per variation×mode, cap if needed + max_tests = _OPTIMIZE_MAX_CALLS - calls_used - 1 # reserve 1 for summary + if len(variations) * len(test_modes) > max_tests: + test_modes = [original_mode] + if len(variations) > max_tests: + variations = variations[:max_tests] + + child_run_ids = [] + for vi, var in enumerate(variations): + var_prompt = var.get("prompt", original_prompt) + strategy = var.get("strategy", "unknown") + for mode in test_modes: + 
_optimize_emit(job_id, {"type": "test", "variation": vi, "strategy": strategy, "mode": mode, "status": "running"}) + try: + # Pick a model from the original run's model list + model = models_used[vi % len(models_used)] + result = _budget_call(model, var_prompt) + test_responses = [{"model": model, "text": result, "role": "response"}] + test_config = {"source": "optimize", "parent_run": run_id, "job_id": job_id, "variation": vi, "strategy": strategy} + rid = save_run(mode, var_prompt, test_config, test_responses) + if rid: + child_run_ids.append({"run_id": rid, "variation": vi, "strategy": strategy, "mode": mode, "prompt": var_prompt}) + _optimize_emit(job_id, {"type": "test", "variation": vi, "strategy": strategy, "mode": mode, "status": "done"}) + except Exception as e: + _optimize_emit(job_id, {"type": "test", "variation": vi, "strategy": strategy, "mode": mode, "status": f"error: {e}"}) + + # Phase D: Wait for scores and rank + _optimize_emit(job_id, {"type": "status", "text": "Waiting for auto-scoring..."}) + ranked = [] + if child_run_ids: + child_ids = [c["run_id"] for c in child_run_ids] + # Poll for scores (auto-scoring runs in background threads) + for attempt in range(20): + _time.sleep(3) + try: + with get_db() as conn: + with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute("SELECT id, quality_score FROM team_runs WHERE id = ANY(%s)", (child_ids,)) + scores = {r["id"]: r["quality_score"] for r in cur.fetchall()} + scored_count = sum(1 for s in scores.values() if s is not None) + if scored_count >= len(child_ids) * 0.8: + break + except Exception: + pass + + # Build ranked results + for child in child_run_ids: + score = scores.get(child["run_id"]) + ranked.append({ + "run_id": child["run_id"], + "variation": child["variation"], + "strategy": child["strategy"], + "mode": child["mode"], + "prompt": child["prompt"], + "score": float(score) if score else None, + "snippet": child["prompt"][:150], + }) + ranked.sort(key=lambda 
r: r.get("score") or 0, reverse=True) + + _optimize_emit(job_id, {"type": "results", "ranked": ranked, "original_score": original_score}) + + # Phase E: Report + best_score = ranked[0]["score"] if ranked and ranked[0].get("score") else original_score + improvement = (best_score or 0) - (original_score or 0) + + duration = int((_time.time() - start) * 1000) + result_data = { + "parent_run": run_id, "original_score": original_score, + "best_score": best_score, "improvement": improvement, + "variations_tested": len(variations), "modes_tested": test_modes, + "calls_used": calls_used, "ranked": ranked[:5], + } + _save_pipeline("optimize", original_prompt[:200], + [{"step": "analyze"}, {"step": "generate", "count": len(variations)}, {"step": "test", "tests": len(child_run_ids)}, {"step": "rank"}], + result_data, models_used + [_SCORE_MODEL], start * 1000) + + # Tag original run as optimized + try: + with get_db() as conn: + with conn.cursor() as cur: + cur.execute( + "UPDATE team_runs SET score_metadata = COALESCE(score_metadata, '{}') || %s WHERE id = %s", + (json.dumps({"optimized": True, "best_variation_run": ranked[0]["run_id"] if ranked else None, "optimize_job": job_id}), run_id) + ) + conn.commit() + except Exception: + pass + + _optimize_emit(job_id, {"type": "done", "best_score": best_score, "original_score": original_score, "improvement": improvement, "calls_used": calls_used}) + _optimize_jobs[job_id]["status"] = "completed" + + except Exception as e: + _optimize_emit(job_id, {"type": "error", "text": str(e)}) + _optimize_emit(job_id, {"type": "done", "best_score": 0, "original_score": 0, "improvement": 0}) + _optimize_jobs[job_id]["status"] = f"error: {e}" + + def _gather_data_source(source): """Pull data from a system source for pipeline input.""" if source == "team_runs":