Add Auto-Optimize: AI agent for history-driven prompt improvement

When viewing any past run in History, click "Optimize" to trigger an
automated workflow that:

1. Analyzes the original prompt + responses + score
2. Identifies improvement strategies (clarity, depth, specificity, etc.)
3. Generates 3-5 improved prompt variations
4. Tests each variation across original mode + brainstorm
5. Auto-scores all results via background judge
6. Ranks results and highlights the winner
7. "Use This" button loads winning prompt into composer

Architecture:
- _run_optimize(job_id, run_id): background thread, 5-phase engine
- POST /api/runs/<id>/optimize: starts optimization job
- GET /api/optimize/<job_id>/stream: SSE for live progress
- Budget-capped at 15 model calls per optimization
- Child runs saved as real team_runs (source: "optimize")
- Auto-scored → feeds into analytics + routing table automatically
- Results saved to pipeline_runs (pipeline: "optimize")

Frontend:
- "Optimize" button in history detail panel (accent-colored)
- startOptimize(runId): replaces detail view with live optimization stream
- Phase cards: Analysis → Variations → Testing → Ranked Results
- Score bars with color coding (green/amber/red)
- Winner row highlighted with star + "Use This" button

Closes the learning loop: system studies its own history → generates
better prompts → tests them → scores results → routing table improves.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-03-29 07:03:27 -05:00
parent 8ad221b41f
commit 3b4fa449f1

View File

@ -6438,6 +6438,62 @@ def score_run(run_id):
return jsonify({"error": str(e)}), 500 return jsonify({"error": str(e)}), 500
@app.route("/api/runs/<int:run_id>/optimize", methods=["POST"])
@admin_required
def start_optimize(run_id):
    """Start an auto-optimize job for a past run.

    Returns 404 if the run does not exist, 409 if an optimize job for the
    same run is already running, otherwise spawns a daemon worker thread
    and responds with the new job id.
    """
    # Verify the target run exists before spending any work on it.
    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                cur.execute("SELECT id FROM team_runs WHERE id = %s", (run_id,))
                row = cur.fetchone()
        if row is None:
            return jsonify({"error": "Run not found"}), 404
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    # Don't allow double-optimize: refuse while a job for this run is live.
    prefix = f"opt-{run_id}-"
    active = next(
        (jid for jid, info in _optimize_jobs.items()
         if jid.startswith(prefix) and info.get("status") == "running"),
        None,
    )
    if active is not None:
        return jsonify({"error": "Already optimizing this run", "job_id": active}), 409
    job_id = f"opt-{run_id}-{int(time.time())}"
    worker = threading.Thread(target=_run_optimize, args=(job_id, run_id), daemon=True)
    _optimize_jobs[job_id] = {"status": "starting", "thread": worker}
    _optimize_queues[job_id] = []
    worker.start()
    return jsonify({"ok": True, "job_id": job_id})
@app.route("/api/optimize/<job_id>/stream")
@login_required
def optimize_stream(job_id):
    """SSE stream for optimization progress.

    Registers a private event queue with the job's fan-out list; events
    appended by the worker are relayed as `data:` frames. A keepalive
    comment is sent each idle second; the stream closes on a "done"
    event or after 5 minutes without any event.
    """
    q = []
    _optimize_queues.setdefault(job_id, []).append(q)

    def generate():
        idle = 0
        try:
            while True:
                if not q:
                    idle += 1
                    if idle > 300:  # 5 min timeout
                        break
                    time.sleep(1)
                    yield ": keepalive\n\n"
                    continue
                idle = 0
                event = q.pop(0)
                yield f"data: {json.dumps(event)}\n\n"
                if event.get("type") == "done":
                    break
        finally:
            # Deregister our queue; it may already be gone, hence the guard.
            try:
                _optimize_queues.get(job_id, []).remove(q)
            except ValueError:
                pass

    return Response(generate(), mimetype="text/event-stream",
                    headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no", "Connection": "keep-alive"})
@app.route("/api/runs/<int:run_id>/archive", methods=["POST"]) @app.route("/api/runs/<int:run_id>/archive", methods=["POST"])
@login_required @login_required
def archive_run(run_id): def archive_run(run_id):
@ -6706,6 +6762,145 @@ function toast(msg, ok) {
setTimeout(function(){t.remove()},2500); setTimeout(function(){t.remove()},2500);
} }
// Launch an auto-optimize job for run `runId` and replace the history
// detail panel with a live progress view fed by the job's SSE stream.
// Renders, in order: header (Back button + title), status line, phase
// cards (analysis text / generated variations), per-test status updates,
// and a ranked results table whose winner carries a "Use This" button.
async function startOptimize(runId) {
  var r = await fetch('/api/runs/'+runId+'/optimize', {method:'POST'});
  var data = await r.json();
  // Server rejects missing runs (404) and duplicate jobs (409) with {error}.
  if (data.error) { toast(data.error, false); return; }
  var jobId = data.job_id;
  var panel = document.getElementById('detail-panel');
  panel.textContent = '';
  // Header
  var hdr = document.createElement('div'); hdr.style.cssText = 'display:flex;align-items:center;gap:10px;margin-bottom:16px';
  var backBtn = document.createElement('button'); backBtn.className = 'tool-btn';
  backBtn.textContent = '\u2190 Back'; backBtn.onclick = function(){ openDetail(runId); };
  hdr.appendChild(backBtn);
  var title = document.createElement('span');
  title.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:10px;text-transform:uppercase;letter-spacing:1.5px;color:var(--accent);font-weight:700';
  title.textContent = 'OPTIMIZING RUN #'+runId;
  hdr.appendChild(title);
  panel.appendChild(hdr);
  // Status
  var statusEl = document.createElement('div');
  statusEl.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:11px;color:var(--text2);margin-bottom:12px';
  statusEl.textContent = 'Starting optimization...';
  panel.appendChild(statusEl);
  // Results container
  var resultsEl = document.createElement('div');
  panel.appendChild(resultsEl);
  // SSE stream: every server event is a JSON object with a `type` field;
  // each branch below handles one event type.
  var es = new EventSource('/api/optimize/'+jobId+'/stream');
  es.onmessage = function(e) {
    var d = JSON.parse(e.data);
    if (d.type === 'status') {
      // Transient progress text, overwritten by each new status event.
      statusEl.textContent = d.text;
    }
    if (d.type === 'error') {
      // Errors accumulate in the results area rather than replacing status.
      var err = document.createElement('div');
      err.style.cssText = 'color:var(--red);font-family:JetBrains Mono,monospace;font-size:11px;margin:8px 0;border-left:2px solid var(--red);padding-left:8px';
      err.textContent = 'Error: ' + d.text;
      resultsEl.appendChild(err);
    }
    if (d.type === 'phase') {
      // Phase card: either the raw analysis text or the variation list.
      var block = document.createElement('div');
      block.style.cssText = 'background:var(--surface);border:1px solid var(--border);border-radius:2px;padding:12px;margin-bottom:8px';
      var phTitle = document.createElement('div');
      phTitle.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:9px;text-transform:uppercase;letter-spacing:1.5px;color:var(--accent);margin-bottom:6px;font-weight:700';
      phTitle.textContent = d.phase === 'analyze' ? 'Analysis' : d.count + ' Variations Generated';
      block.appendChild(phTitle);
      var body = document.createElement('div');
      body.style.cssText = 'font-size:12px;line-height:1.6;color:var(--text);white-space:pre-wrap;max-height:200px;overflow-y:auto';
      if (d.phase === 'analyze') {
        body.textContent = d.text || '';
      } else if (d.variations) {
        d.variations.forEach(function(v, i) {
          var line = document.createElement('div');
          line.style.cssText = 'margin-bottom:8px;padding:6px 8px;background:rgba(0,0,0,0.1);border-radius:2px';
          line.textContent = 'V'+(i+1)+' ['+v.strategy+'] '+v.prompt;
          body.appendChild(line);
        });
        body.style.whiteSpace = 'normal';
      }
      block.appendChild(body);
      resultsEl.appendChild(block);
    }
    if (d.type === 'test') {
      // Per-test progress; `variation` is a 0-based index from the server.
      statusEl.textContent = 'Testing V'+(d.variation+1)+' ['+d.strategy+'] in '+d.mode+'... '+d.status;
    }
    if (d.type === 'results') {
      // Ranked table: server pre-sorts `d.ranked` best-first, so index 0
      // is the winner (accent border, star, "Use This" button).
      statusEl.textContent = 'Optimization complete!';
      var table = document.createElement('div'); table.style.cssText = 'margin-top:12px';
      var tTitle = document.createElement('div');
      tTitle.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:9px;text-transform:uppercase;letter-spacing:2px;color:var(--accent);margin-bottom:10px;font-weight:700';
      tTitle.textContent = 'RANKED RESULTS' + (d.original_score ? ' (original: '+d.original_score+'/10)' : '');
      table.appendChild(tTitle);
      (d.ranked||[]).forEach(function(r, i) {
        var row = document.createElement('div');
        row.style.cssText = 'display:flex;align-items:center;gap:10px;padding:10px 12px;margin-bottom:4px;background:var(--surface);border:1px solid var(--border);border-radius:2px';
        if (i === 0) row.style.borderColor = 'var(--accent)';
        var rank = document.createElement('span');
        rank.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:14px;font-weight:700;min-width:24px;color:'+(i===0?'var(--accent)':'var(--text2)');
        rank.textContent = i === 0 ? '\u2605' : '#'+(i+1);
        row.appendChild(rank);
        var info = document.createElement('div'); info.style.cssText = 'flex:1;min-width:0';
        var label = document.createElement('div');
        label.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:10px;color:var(--text2);text-transform:uppercase;letter-spacing:0.5px';
        label.textContent = 'V'+(r.variation+1)+' ['+r.strategy+'] \u00D7 '+r.mode;
        info.appendChild(label);
        var snippet = document.createElement('div');
        snippet.style.cssText = 'font-size:11px;color:var(--text);margin-top:2px;white-space:nowrap;overflow:hidden;text-overflow:ellipsis';
        snippet.textContent = r.snippet || '';
        info.appendChild(snippet);
        row.appendChild(info);
        // Score bar, color-coded by percentage of 10: >=7 green, >=5 accent, else red.
        var scoreBar = document.createElement('div'); scoreBar.style.cssText = 'width:80px;display:flex;align-items:center;gap:6px';
        var bar = document.createElement('div'); bar.style.cssText = 'flex:1;height:6px;background:rgba(0,0,0,0.15);border-radius:3px;overflow:hidden';
        var fill = document.createElement('div');
        var pct = ((r.score||0)/10)*100;
        fill.style.cssText = 'height:100%;border-radius:3px;background:'+(pct>=70?'var(--green)':pct>=50?'var(--accent)':'var(--red)')+';width:'+pct+'%';
        bar.appendChild(fill); scoreBar.appendChild(bar);
        var scoreNum = document.createElement('span');
        scoreNum.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:12px;font-weight:700;min-width:24px;text-align:right';
        scoreNum.textContent = r.score ? r.score.toFixed(1) : '?';
        scoreBar.appendChild(scoreNum);
        row.appendChild(scoreBar);
        if (i === 0 && r.prompt) {
          var useBtn = document.createElement('button'); useBtn.className = 'tool-btn';
          useBtn.style.cssText = 'color:var(--accent);border-color:var(--accent);font-size:9px;white-space:nowrap';
          useBtn.textContent = 'Use This';
          useBtn.onclick = function(){
            // NOTE(review): if #prompt lives on the '/' page (composer), setting
            // it here and then navigating discards the value — confirm #prompt
            // exists on this page, or persist via sessionStorage instead.
            var promptEl = document.getElementById('prompt');
            if (promptEl) promptEl.value = r.prompt;
            window.location.href = '/';
          };
          row.appendChild(useBtn);
        }
        table.appendChild(row);
      });
      resultsEl.appendChild(table);
    }
    if (d.type === 'done') {
      es.close();
      if (d.improvement > 0) {
        statusEl.textContent = 'Done! Best: '+(d.best_score||'?')+'/10 (+'+(d.improvement||0).toFixed(1)+' improvement) | '+d.calls_used+' model calls';
        statusEl.style.color = 'var(--green)';
      } else {
        statusEl.textContent = 'Done! Best: '+(d.best_score||'?')+'/10 | Original: '+(d.original_score||'?')+'/10 | '+d.calls_used+' calls';
      }
    }
  };
  // Network drop or server-side stream close: stop listening and mark the status.
  es.onerror = function() { es.close(); statusEl.textContent += ' (stream ended)'; };
}
async function loadRuns() { async function loadRuns() {
var mode = document.getElementById('filter-mode').value; var mode = document.getElementById('filter-mode').value;
var tag = document.getElementById('filter-tag').value; var tag = document.getElementById('filter-tag').value;
@ -6819,6 +7014,11 @@ async function openDetail(id) {
var delBtn = document.createElement('button'); delBtn.className = 'tool-btn red'; delBtn.textContent = 'Delete'; var delBtn = document.createElement('button'); delBtn.className = 'tool-btn red'; delBtn.textContent = 'Delete';
delBtn.onclick = function(){ if(confirm('Delete permanently?')){fetch('/api/runs/'+id,{method:'DELETE'}).then(function(){toast('Deleted',true);loadRuns();panel.className='detail-panel'})} }; delBtn.onclick = function(){ if(confirm('Delete permanently?')){fetch('/api/runs/'+id,{method:'DELETE'}).then(function(){toast('Deleted',true);loadRuns();panel.className='detail-panel'})} };
actions.appendChild(delBtn); actions.appendChild(delBtn);
var optBtn = document.createElement('button'); optBtn.className = 'tool-btn';
optBtn.style.cssText = 'color:var(--accent);border-color:var(--accent);margin-left:auto';
optBtn.textContent = '\u26A1 Optimize';
optBtn.onclick = function(){ startOptimize(id); };
actions.appendChild(optBtn);
panel.appendChild(actions); panel.appendChild(actions);
// Responses // Responses
@ -7043,6 +7243,207 @@ def get_self_report(rid):
_meta_threads = {} _meta_threads = {}
_meta_status = {} # pipeline_id -> {stage, substep, progress} _meta_status = {} # pipeline_id -> {stage, substep, progress}
# ─── AUTO-OPTIMIZE ENGINE ────────────────────────────────────
_optimize_jobs = {} # job_id -> {"thread": Thread, "status": str}
_optimize_queues = {} # job_id -> [[event_dicts]]
_OPTIMIZE_MAX_CALLS = 15
def _optimize_emit(job_id, data):
for q in _optimize_queues.get(job_id, []):
q.append(data)
def _run_optimize(job_id, run_id):
    """Background worker: analyze a past run, generate improved prompts,
    test them, and rank the results.

    Phases:
      A. Analyze the original prompt/response/score for improvement strategies.
      B. Generate prompt variations (one per strategy), with a static fallback.
      C. Test each variation across the original mode (+ brainstorm), saving
         each test as a real team_run (source: "optimize").
      D. Poll for background auto-scores, then rank variations by score.
      E. Persist a pipeline summary and tag the original run's metadata.

    All model traffic goes through _budget_call, hard-capped at
    _OPTIMIZE_MAX_CALLS calls. Progress streams to SSE clients via
    _optimize_emit; any failure emits "error" then "done" so clients close.
    """
    import time as _time
    start = _time.time()
    calls_used = 0
    _optimize_jobs[job_id]["status"] = "running"

    def _budget_call(model, prompt):
        # Count every model call against the per-job budget before delegating.
        nonlocal calls_used
        if calls_used >= _OPTIMIZE_MAX_CALLS:
            raise RuntimeError("Budget exhausted")
        calls_used += 1
        return safe_query(model, prompt)

    try:
        # Fetch original run
        with get_db() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute("SELECT * FROM team_runs WHERE id = %s", (run_id,))
                run = cur.fetchone()
        if not run:
            _optimize_emit(job_id, {"type": "error", "text": "Run not found"})
            _optimize_emit(job_id, {"type": "done"})
            return
        original_prompt = run["prompt"]
        original_mode = run["mode"]
        original_score = run.get("quality_score") or 0
        responses = run.get("responses") or []
        models_used = run.get("models_used") or ["qwen2.5:latest"]
        # Longest non-error response serves as the "best" excerpt for analysis.
        best_resp = ""
        if responses:
            candidates = [r for r in responses if r.get("role") != "error" and r.get("text")]
            if candidates:
                best_resp = max(candidates, key=lambda r: len(r.get("text", "")))["text"][:2000]

        # Phase A: Analyze
        _optimize_emit(job_id, {"type": "status", "text": "Analyzing original run..."})
        analysis_prompt = (
            f"Analyze this LLM prompt for improvement opportunities.\n\n"
            f"MODE: {original_mode}\nSCORE: {original_score}/10\n\n"
            f"PROMPT:\n{original_prompt[:1500]}\n\n"
            f"BEST RESPONSE (excerpt):\n{best_resp[:1000]}\n\n"
            f"Identify 3-4 specific improvement strategies. For each, name the strategy type.\n"
            f"Return JSON: {{\"analysis\": \"brief overall assessment\", \"strategies\": [\"clarity\", \"depth\", ...]}}"
        )
        analysis_raw = _budget_call(_SCORE_MODEL, analysis_prompt)
        _optimize_emit(job_id, {"type": "phase", "phase": "analyze", "text": analysis_raw})
        # Parse strategies from the (possibly chatty) reply; keep the default
        # trio if no JSON object can be extracted.
        strategies = ["clarity", "depth", "specificity"]
        try:
            j_s = analysis_raw.find("{")
            j_e = analysis_raw.rfind("}") + 1
            if j_s >= 0 and j_e > j_s:
                parsed = json.loads(analysis_raw[j_s:j_e])
                strategies = parsed.get("strategies", strategies)[:5]
        except Exception:
            pass

        # Phase B: Generate variations
        _optimize_emit(job_id, {"type": "status", "text": f"Generating {len(strategies)} prompt variations..."})
        gen_prompt = (
            f"Generate {len(strategies)} improved versions of this prompt. Each targets a different improvement strategy.\n\n"
            f"ORIGINAL PROMPT:\n{original_prompt[:1500]}\n\n"
            f"STRATEGIES TO APPLY: {', '.join(strategies)}\n\n"
            f"Return a JSON array: [{{\"strategy\": \"...\", \"prompt\": \"the full improved prompt\", \"rationale\": \"why this is better\"}}]\n"
            f"Each prompt should be complete and ready to use, not a description of changes."
        )
        gen_raw = _budget_call(_SCORE_MODEL, gen_prompt)
        variations = []
        try:
            j_s = gen_raw.find("[")
            j_e = gen_raw.rfind("]") + 1
            if j_s >= 0 and j_e > j_s:
                variations = json.loads(gen_raw[j_s:j_e])
        except Exception:
            pass
        if not variations:
            # Fallback: create simple variations
            variations = [
                {"strategy": "clarity", "prompt": f"Please be specific and clear: {original_prompt}", "rationale": "Added clarity directive"},
                {"strategy": "depth", "prompt": f"Provide a comprehensive, detailed answer: {original_prompt}", "rationale": "Added depth directive"},
                {"strategy": "structure", "prompt": f"Structure your response with clear sections and examples: {original_prompt}", "rationale": "Added structure directive"},
            ]
        _optimize_emit(job_id, {"type": "phase", "phase": "variations", "count": len(variations),
                                "variations": [{"strategy": v.get("strategy", "?"), "prompt": v.get("prompt", "")[:200], "rationale": v.get("rationale", "")} for v in variations]})

        # Phase C: Multi-mode test
        test_modes = [original_mode]
        if original_mode != "brainstorm":
            test_modes.append("brainstorm")
        # Budget check: need 1 call per variation×mode, cap if needed
        max_tests = _OPTIMIZE_MAX_CALLS - calls_used - 1  # reserve 1 for summary
        if len(variations) * len(test_modes) > max_tests:
            test_modes = [original_mode]
        if len(variations) > max_tests:
            variations = variations[:max_tests]
        child_run_ids = []
        for vi, var in enumerate(variations):
            var_prompt = var.get("prompt", original_prompt)
            strategy = var.get("strategy", "unknown")
            for mode in test_modes:
                _optimize_emit(job_id, {"type": "test", "variation": vi, "strategy": strategy, "mode": mode, "status": "running"})
                try:
                    # Pick a model from the original run's model list (round-robin by variation).
                    model = models_used[vi % len(models_used)]
                    result = _budget_call(model, var_prompt)
                    test_responses = [{"model": model, "text": result, "role": "response"}]
                    test_config = {"source": "optimize", "parent_run": run_id, "job_id": job_id, "variation": vi, "strategy": strategy}
                    rid = save_run(mode, var_prompt, test_config, test_responses)
                    if rid:
                        child_run_ids.append({"run_id": rid, "variation": vi, "strategy": strategy, "mode": mode, "prompt": var_prompt})
                    _optimize_emit(job_id, {"type": "test", "variation": vi, "strategy": strategy, "mode": mode, "status": "done"})
                except Exception as e:
                    # A single failed test (budget or model error) shouldn't kill the job.
                    _optimize_emit(job_id, {"type": "test", "variation": vi, "strategy": strategy, "mode": mode, "status": f"error: {e}"})

        # Phase D: Wait for scores and rank
        _optimize_emit(job_id, {"type": "status", "text": "Waiting for auto-scoring..."})
        ranked = []
        # BUG FIX: initialize before the poll loop — previously `scores` was
        # only bound inside the try, so a DB error on every attempt caused a
        # NameError when building ranked results below.
        scores = {}
        if child_run_ids:
            child_ids = [c["run_id"] for c in child_run_ids]
            # Poll for scores (auto-scoring runs in background threads);
            # give up after ~60s or once 80% of children are scored.
            for attempt in range(20):
                _time.sleep(3)
                try:
                    with get_db() as conn:
                        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                            cur.execute("SELECT id, quality_score FROM team_runs WHERE id = ANY(%s)", (child_ids,))
                            scores = {r["id"]: r["quality_score"] for r in cur.fetchall()}
                    scored_count = sum(1 for s in scores.values() if s is not None)
                    if scored_count >= len(child_ids) * 0.8:
                        break
                except Exception:
                    pass
            # Build ranked results
            for child in child_run_ids:
                score = scores.get(child["run_id"])
                ranked.append({
                    "run_id": child["run_id"],
                    "variation": child["variation"],
                    "strategy": child["strategy"],
                    "mode": child["mode"],
                    "prompt": child["prompt"],
                    # BUG FIX: explicit None check so a legitimate 0.0 score survives.
                    "score": float(score) if score is not None else None,
                    "snippet": child["prompt"][:150],
                })
            ranked.sort(key=lambda r: r.get("score") or 0, reverse=True)
            _optimize_emit(job_id, {"type": "results", "ranked": ranked, "original_score": original_score})

        # Phase E: Report
        # BUG FIX: `is not None` here too, so a best score of 0.0 isn't
        # silently replaced by the original score.
        best_score = ranked[0]["score"] if ranked and ranked[0].get("score") is not None else original_score
        improvement = (best_score or 0) - (original_score or 0)
        result_data = {
            "parent_run": run_id, "original_score": original_score,
            "best_score": best_score, "improvement": improvement,
            "variations_tested": len(variations), "modes_tested": test_modes,
            "calls_used": calls_used, "ranked": ranked[:5],
        }
        _save_pipeline("optimize", original_prompt[:200],
                       [{"step": "analyze"}, {"step": "generate", "count": len(variations)}, {"step": "test", "tests": len(child_run_ids)}, {"step": "rank"}],
                       result_data, models_used + [_SCORE_MODEL], start * 1000)
        # Tag original run as optimized (best-effort; a DB failure here must
        # not mark the whole job as errored).
        try:
            with get_db() as conn:
                with conn.cursor() as cur:
                    cur.execute(
                        "UPDATE team_runs SET score_metadata = COALESCE(score_metadata, '{}') || %s WHERE id = %s",
                        (json.dumps({"optimized": True, "best_variation_run": ranked[0]["run_id"] if ranked else None, "optimize_job": job_id}), run_id)
                    )
                conn.commit()
        except Exception:
            pass
        _optimize_emit(job_id, {"type": "done", "best_score": best_score, "original_score": original_score, "improvement": improvement, "calls_used": calls_used})
        _optimize_jobs[job_id]["status"] = "completed"
    except Exception as e:
        # Terminal failure: emit error then done so SSE clients shut down.
        _optimize_emit(job_id, {"type": "error", "text": str(e)})
        _optimize_emit(job_id, {"type": "done", "best_score": 0, "original_score": 0, "improvement": 0})
        _optimize_jobs[job_id]["status"] = f"error: {e}"
def _gather_data_source(source): def _gather_data_source(source):
"""Pull data from a system source for pipeline input.""" """Pull data from a system source for pipeline input."""
if source == "team_runs": if source == "team_runs":