diff --git a/llm_team_ui.py b/llm_team_ui.py index 12595c5..aadae10 100644 --- a/llm_team_ui.py +++ b/llm_team_ui.py @@ -1776,16 +1776,73 @@ def get_db(): def save_run(mode, prompt, config_data, responses): models = list({r.get("model", "") for r in responses if r.get("model")}) + run_id = None try: with get_db() as conn: with conn.cursor() as cur: cur.execute( - "INSERT INTO team_runs (mode, prompt, config, responses, models_used) VALUES (%s, %s, %s, %s, %s)", + "INSERT INTO team_runs (mode, prompt, config, responses, models_used) VALUES (%s, %s, %s, %s, %s) RETURNING id", (mode, prompt, json.dumps(config_data), json.dumps(responses), models) ) + run_id = cur.fetchone()[0] conn.commit() except Exception as e: print(f"[DB] save_run error: {e}") + if run_id and responses: + threading.Thread(target=_auto_score_run, args=(run_id, mode, prompt, responses), daemon=True).start() + return run_id + + +# ─── AUTO-SCORING ENGINE ───────────────────────────────────── +_SCORE_MODEL = "qwen2.5:latest" + +def _auto_score_run(run_id, mode, prompt, responses): + """Background: auto-score a completed run via judge model.""" + try: + # Pick the longest non-error response as representative + candidates = [r for r in responses if r.get("role") != "error" and r.get("text")] + if not candidates: + return + best = max(candidates, key=lambda r: len(r.get("text", ""))) + text = best["text"][:3000] + + judge_prompt = ( + f"Rate the quality of this AI response on a scale of 1-10.\n" + f"Consider: relevance to the prompt, completeness, accuracy, clarity, usefulness.\n\n" + f"PROMPT: {prompt[:500]}\n\n" + f"MODE: {mode}\n\n" + f"RESPONSE:\n{text}\n\n" + f"Return ONLY a JSON object: {{\"score\": N, \"reason\": \"one sentence\"}}" + ) + judgment = query_model(_SCORE_MODEL, judge_prompt) + + # Parse score + score = None + try: + j_start = judgment.find("{") + j_end = judgment.rfind("}") + 1 + if j_start >= 0 and j_end > j_start: + parsed = json.loads(judgment[j_start:j_end]) + score = 
float(parsed.get("score", 0)) + except Exception: + pass + if score is None: + m = re.search(r'\b([1-9]|10)\b', judgment) + score = float(m.group(1)) if m else None + if score is None or score < 1 or score > 10: + return + + with get_db() as conn: + with conn.cursor() as cur: + cur.execute( + "UPDATE team_runs SET quality_score = %s, score_method = 'auto', score_metadata = %s WHERE id = %s AND (score_method IS NULL OR score_method = 'auto')", + (score, json.dumps({"judge": _SCORE_MODEL, "judgment": judgment[:500], "scored_model": best.get("model", ""), "reason": judgment[:200]}), run_id) + ) + conn.commit() + print(f"[SCORE] run {run_id} scored {score}/10 by {_SCORE_MODEL}") + except Exception as e: + print(f"[SCORE] auto-score error for run {run_id}: {e}") + HTML = r""" @@ -2881,6 +2938,7 @@ function buildConfig() { let _runStartTime = 0; let _runTimer = null; +let _lastRunId = null; let _runEventCount = 0; let _runResponseCount = 0; let _runTotalChars = 0; @@ -3132,6 +3190,11 @@ function handleEvent(evt) { return; } if (evt.type === 'done') { const bar = output.querySelector('.status-bar'); if (bar) bar.remove(); return; } + if (evt.type === 'run_saved') { + _lastRunId = evt.run_id; + document.querySelectorAll('.vote-btn').forEach(function(b) { b.disabled = false; }); + return; + } if (evt.type === 'response') { _runResponseCount++; _runTotalChars += (evt.text || '').length; @@ -3150,6 +3213,14 @@ function handleEvent(evt) { label.textContent = phaseName; output.appendChild(label); } + // Reactive pipeline notification — not a full card + if (evt.role === 'reactive') { + var note = document.createElement('div'); + note.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:10px;color:var(--accent);border:1px dashed var(--accent);border-radius:2px;padding:8px 12px;margin:4px 0;opacity:0.8;font-style:italic'; + note.textContent = '\u26A1 ' + evt.text; + output.appendChild(note); + return; + } const mi = availableModels.findIndex(m => m.name === evt.model); 
const color = COLORS[(mi >= 0 ? mi : 0) % COLORS.length]; const displayName = mi >= 0 ? (availableModels[mi].display_name || evt.model) : evt.model; @@ -3161,7 +3232,7 @@ function handleEvent(evt) { const roleTag = evt.role ? `${evt.role}` : ''; const uid = 'resp-' + Date.now() + '-' + Math.random().toString(36).substr(2,4); const errorLink = isError ? `View error details in monitor →` : ''; - card.innerHTML = `
${displayName}${roleTag}
${escapeHtml(evt.text)}
${errorLink}
`; + card.innerHTML = `
${displayName}${roleTag}
${escapeHtml(evt.text)}
${errorLink}
`; card.dataset.model = evt.model; card.dataset.role = evt.role || ''; card.dataset.displayName = displayName; @@ -3174,6 +3245,22 @@ function handleEvent(evt) { function escapeHtml(t) { return t.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;'); } // ─── CARD ACTIONS ──────────────────────────────────── +async function voteRun(btn, vote) { + if (!_lastRunId) return; + try { + const r = await fetch('/api/runs/' + _lastRunId + '/score', { + method: 'POST', headers: {'Content-Type': 'application/json'}, + body: JSON.stringify({vote: vote}) + }); + if (r.ok) { + btn.closest('.card-actions').querySelectorAll('.vote-btn').forEach(function(b) { b.style.opacity = '0.3'; b.disabled = true; }); + btn.style.opacity = '1'; + btn.style.borderColor = vote === 'up' ? 'var(--green)' : 'var(--red)'; + btn.style.color = vote === 'up' ? 'var(--green)' : 'var(--red)'; + } + } catch(e) { console.error('Vote error:', e); } +} + function copyCard(uid, btn) { const el = document.getElementById(uid); if (!el) return; @@ -3584,6 +3671,7 @@ ADMIN_HTML = r"""
OpenRouter
Timeouts
Security
+
Analytics
@@ -3687,6 +3775,32 @@ ADMIN_HTML = r"""
+ + +
+
+

Scoring Coverage

+
Loading...
+
+
+
+

Score by Mode

+
Loading...
+
+
+

Score by Model

+
Loading...
+
+
+
+

Model × Mode Heatmap

+
Loading...
+
+
+

Score Trend (30 days)

+
Loading...
+
+