Add self-improving pipeline: auto-scoring, analytics, reactive refine, routing intelligence
Phase 1 — Run Quality Scoring: - Auto-score every run in background via qwen2.5 judge (1-10) - Thumbs up/down vote buttons on output cards - POST /api/runs/<id>/score for user feedback - run_saved SSE event enables vote buttons after run completes - User votes override auto-scores (race-condition safe) - DB: quality_score, score_method, score_metadata on team_runs Phase 1 — Analytics Dashboard: - GET /api/admin/analytics: score-by-mode, score-by-model, heatmap, trend - New Analytics tab on Admin page with bar charts, heatmap table, trend sparkline - Scoring coverage tracker (scored vs total runs) - Model × Mode heatmap with color-coded cells Phase 2 — Reactive Pipeline: - _assess_stage(): orchestrator evaluates each stage's output mid-run - _reactive_decide(): can insert/skip stages based on assessment - Dynamic stage loop replaces fixed iteration in run_refine() - Budget tracking prevents infinite loops (max_stages hard cap) - Reactive decisions render as dashed notification bars between cards - Pipeline adjusts in real-time: "Inserting VALIDATE — high severity gaps found" Phase 3 — Cross-Run Learning: - _build_routing_table(): queries historical scores for model×mode performance - Best stage sequences per content_type from pipeline_runs - Routing table cached with 30-min TTL - Auto-Refine strategist prompt augmented with historical data - GET /api/suggest-models?mode=X returns top 3 models for that mode Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c2cc211f21
commit
8ad221b41f
428
llm_team_ui.py
428
llm_team_ui.py
@ -1776,16 +1776,73 @@ def get_db():
|
|||||||
|
|
||||||
def save_run(mode, prompt, config_data, responses):
    """Persist a completed team run and kick off background auto-scoring.

    Inserts the run into team_runs and, on success, spawns a daemon thread
    that scores the run with the judge model so the caller is never blocked.

    Returns the new team_runs row id, or None if the insert failed.
    """
    # Distinct, non-empty model names that produced these responses.
    models = list({r.get("model", "") for r in responses if r.get("model")})
    run_id = None
    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                cur.execute(
                    "INSERT INTO team_runs (mode, prompt, config, responses, models_used) VALUES (%s, %s, %s, %s, %s) RETURNING id",
                    (mode, prompt, json.dumps(config_data), json.dumps(responses), models),
                )
                run_id = cur.fetchone()[0]
            conn.commit()
    except Exception as exc:
        # Best-effort persistence: a failed save must not break the run itself.
        print(f"[DB] save_run error: {exc}")
    if run_id and responses:
        # Score asynchronously so the SSE stream finishes without waiting on the judge.
        threading.Thread(
            target=_auto_score_run,
            args=(run_id, mode, prompt, responses),
            daemon=True,
        ).start()
    return run_id
|
||||||
|
|
||||||
|
|
||||||
|
# ─── AUTO-SCORING ENGINE ─────────────────────────────────────
# Judge model used to grade run outputs on a 1-10 scale.
_SCORE_MODEL = "qwen2.5:latest"


def _auto_score_run(run_id, mode, prompt, responses):
    """Background worker: auto-score a completed run via the judge model.

    Picks the longest non-error response as representative, asks the judge
    for a 1-10 JSON verdict, and writes it to team_runs.  The UPDATE is
    guarded so a user vote (score_method 'user_*') is never overwritten —
    this is what makes auto-scoring race-safe against the vote endpoint.
    """
    try:
        # Representative output: longest non-error response.
        candidates = [r for r in responses if r.get("role") != "error" and r.get("text")]
        if not candidates:
            return
        best = max(candidates, key=lambda r: len(r.get("text", "")))
        text = best["text"][:3000]

        judge_prompt = (
            f"Rate the quality of this AI response on a scale of 1-10.\n"
            f"Consider: relevance to the prompt, completeness, accuracy, clarity, usefulness.\n\n"
            f"PROMPT: {prompt[:500]}\n\n"
            f"MODE: {mode}\n\n"
            f"RESPONSE:\n{text}\n\n"
            f"Return ONLY a JSON object: {{\"score\": N, \"reason\": \"one sentence\"}}"
        )
        judgment = query_model(_SCORE_MODEL, judge_prompt)

        # Parse the score: try the embedded JSON object first, then fall back
        # to the first bare 1-10 integer anywhere in the raw judgment text.
        score = None
        try:
            j_start = judgment.find("{")
            j_end = judgment.rfind("}") + 1
            if j_start >= 0 and j_end > j_start:
                parsed = json.loads(judgment[j_start:j_end])
                # BUGFIX: only accept an explicitly present score.  Previously a
                # missing "score" key defaulted to 0.0, which both skipped the
                # regex fallback and then failed the 1-10 range check below,
                # leaving the run permanently unscored.
                if parsed.get("score") is not None:
                    score = float(parsed["score"])
        except Exception:
            pass
        if score is None:
            m = re.search(r'\b([1-9]|10)\b', judgment)
            score = float(m.group(1)) if m else None
        if score is None or score < 1 or score > 10:
            return

        with get_db() as conn:
            with conn.cursor() as cur:
                # WHERE guard: never clobber a user vote with an auto-score.
                cur.execute(
                    "UPDATE team_runs SET quality_score = %s, score_method = 'auto', score_metadata = %s WHERE id = %s AND (score_method IS NULL OR score_method = 'auto')",
                    (score, json.dumps({"judge": _SCORE_MODEL, "judgment": judgment[:500], "scored_model": best.get("model", ""), "reason": judgment[:200]}), run_id)
                )
            conn.commit()
        print(f"[SCORE] run {run_id} scored {score}/10 by {_SCORE_MODEL}")
    except Exception as e:
        print(f"[SCORE] auto-score error for run {run_id}: {e}")
|
||||||
|
|
||||||
|
|
||||||
HTML = r"""
|
HTML = r"""
|
||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
@ -2881,6 +2938,7 @@ function buildConfig() {
|
|||||||
|
|
||||||
let _runStartTime = 0;
|
let _runStartTime = 0;
|
||||||
let _runTimer = null;
|
let _runTimer = null;
|
||||||
|
let _lastRunId = null;
|
||||||
let _runEventCount = 0;
|
let _runEventCount = 0;
|
||||||
let _runResponseCount = 0;
|
let _runResponseCount = 0;
|
||||||
let _runTotalChars = 0;
|
let _runTotalChars = 0;
|
||||||
@ -3132,6 +3190,11 @@ function handleEvent(evt) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (evt.type === 'done') { const bar = output.querySelector('.status-bar'); if (bar) bar.remove(); return; }
|
if (evt.type === 'done') { const bar = output.querySelector('.status-bar'); if (bar) bar.remove(); return; }
|
||||||
|
if (evt.type === 'run_saved') {
|
||||||
|
_lastRunId = evt.run_id;
|
||||||
|
document.querySelectorAll('.vote-btn').forEach(function(b) { b.disabled = false; });
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (evt.type === 'response') {
|
if (evt.type === 'response') {
|
||||||
_runResponseCount++;
|
_runResponseCount++;
|
||||||
_runTotalChars += (evt.text || '').length;
|
_runTotalChars += (evt.text || '').length;
|
||||||
@ -3150,6 +3213,14 @@ function handleEvent(evt) {
|
|||||||
label.textContent = phaseName;
|
label.textContent = phaseName;
|
||||||
output.appendChild(label);
|
output.appendChild(label);
|
||||||
}
|
}
|
||||||
|
// Reactive pipeline notification — not a full card
|
||||||
|
if (evt.role === 'reactive') {
|
||||||
|
var note = document.createElement('div');
|
||||||
|
note.style.cssText = 'font-family:JetBrains Mono,monospace;font-size:10px;color:var(--accent);border:1px dashed var(--accent);border-radius:2px;padding:8px 12px;margin:4px 0;opacity:0.8;font-style:italic';
|
||||||
|
note.textContent = '\u26A1 ' + evt.text;
|
||||||
|
output.appendChild(note);
|
||||||
|
return;
|
||||||
|
}
|
||||||
const mi = availableModels.findIndex(m => m.name === evt.model);
|
const mi = availableModels.findIndex(m => m.name === evt.model);
|
||||||
const color = COLORS[(mi >= 0 ? mi : 0) % COLORS.length];
|
const color = COLORS[(mi >= 0 ? mi : 0) % COLORS.length];
|
||||||
const displayName = mi >= 0 ? (availableModels[mi].display_name || evt.model) : evt.model;
|
const displayName = mi >= 0 ? (availableModels[mi].display_name || evt.model) : evt.model;
|
||||||
@ -3161,7 +3232,7 @@ function handleEvent(evt) {
|
|||||||
const roleTag = evt.role ? `<span class="role-tag">${evt.role}</span>` : '';
|
const roleTag = evt.role ? `<span class="role-tag">${evt.role}</span>` : '';
|
||||||
const uid = 'resp-' + Date.now() + '-' + Math.random().toString(36).substr(2,4);
|
const uid = 'resp-' + Date.now() + '-' + Math.random().toString(36).substr(2,4);
|
||||||
const errorLink = isError ? `<a class="error-link" href="/admin/monitor">View error details in monitor →</a>` : '';
|
const errorLink = isError ? `<a class="error-link" href="/admin/monitor">View error details in monitor →</a>` : '';
|
||||||
card.innerHTML = `<div class="card-header" style="cursor:pointer" onclick="openRepipe('${uid}')"><div class="dot" style="background:${isError ? 'var(--red)' : color}"></div>${displayName}${roleTag}</div><div class="card-body" id="${uid}">${escapeHtml(evt.text)}</div>${errorLink}<div class="card-actions"><button class="card-act" onclick="event.stopPropagation();copyCard('${uid}',this)">Copy</button><button class="card-act" onclick="event.stopPropagation();useAsPrompt('${uid}')">Use as Prompt</button><button class="card-act" onclick="event.stopPropagation();openRepipe('${uid}')">Iterate</button></div>`;
|
card.innerHTML = `<div class="card-header" style="cursor:pointer" onclick="openRepipe('${uid}')"><div class="dot" style="background:${isError ? 'var(--red)' : color}"></div>${displayName}${roleTag}</div><div class="card-body" id="${uid}">${escapeHtml(evt.text)}</div>${errorLink}<div class="card-actions"><button class="card-act" onclick="event.stopPropagation();copyCard('${uid}',this)">Copy</button><button class="card-act" onclick="event.stopPropagation();useAsPrompt('${uid}')">Use as Prompt</button><button class="card-act" onclick="event.stopPropagation();openRepipe('${uid}')">Iterate</button><span style="flex:1"></span><button class="card-act vote-btn" disabled onclick="event.stopPropagation();voteRun(this,'up')" title="Good output">\u{1F44D}</button><button class="card-act vote-btn" disabled onclick="event.stopPropagation();voteRun(this,'down')" title="Bad output">\u{1F44E}</button></div>`;
|
||||||
card.dataset.model = evt.model;
|
card.dataset.model = evt.model;
|
||||||
card.dataset.role = evt.role || '';
|
card.dataset.role = evt.role || '';
|
||||||
card.dataset.displayName = displayName;
|
card.dataset.displayName = displayName;
|
||||||
@ -3174,6 +3245,22 @@ function handleEvent(evt) {
|
|||||||
function escapeHtml(t) { return t.replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>'); }
|
function escapeHtml(t) { return t.replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>'); }
|
||||||
|
|
||||||
// ─── CARD ACTIONS ────────────────────────────────────
|
// ─── CARD ACTIONS ────────────────────────────────────
|
||||||
|
// POST the user's thumbs up/down for the most recently saved run.
// Vote buttons stay disabled until the run_saved SSE event sets _lastRunId,
// so a vote can never target a run that was not persisted.
async function voteRun(btn, vote) {
  if (!_lastRunId) return;
  try {
    const r = await fetch('/api/runs/' + _lastRunId + '/score', {
      method: 'POST', headers: {'Content-Type': 'application/json'},
      body: JSON.stringify({vote: vote})
    });
    if (r.ok) {
      // Lock in the vote: dim + disable both buttons on this card,
      // then highlight only the one that was clicked.
      btn.closest('.card-actions').querySelectorAll('.vote-btn').forEach(function(b) { b.style.opacity = '0.3'; b.disabled = true; });
      btn.style.opacity = '1';
      btn.style.borderColor = vote === 'up' ? 'var(--green)' : 'var(--red)';
      btn.style.color = vote === 'up' ? 'var(--green)' : 'var(--red)';
    }
  } catch(e) { console.error('Vote error:', e); }
}
|
||||||
|
|
||||||
function copyCard(uid, btn) {
|
function copyCard(uid, btn) {
|
||||||
const el = document.getElementById(uid);
|
const el = document.getElementById(uid);
|
||||||
if (!el) return;
|
if (!el) return;
|
||||||
@ -3584,6 +3671,7 @@ ADMIN_HTML = r"""
|
|||||||
<div class="tab" onclick="switchTab('openrouter')">OpenRouter</div>
|
<div class="tab" onclick="switchTab('openrouter')">OpenRouter</div>
|
||||||
<div class="tab" onclick="switchTab('timeouts')">Timeouts</div>
|
<div class="tab" onclick="switchTab('timeouts')">Timeouts</div>
|
||||||
<div class="tab" onclick="switchTab('security')">Security</div>
|
<div class="tab" onclick="switchTab('security')">Security</div>
|
||||||
|
<div class="tab" onclick="switchTab('analytics')">Analytics</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- PROVIDERS TAB -->
|
<!-- PROVIDERS TAB -->
|
||||||
@ -3687,6 +3775,32 @@ ADMIN_HTML = r"""
|
|||||||
<div id="allowlist"></div>
|
<div id="allowlist"></div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- ANALYTICS TAB -->
|
||||||
|
<div id="tab-analytics" class="tab-content">
|
||||||
|
<div class="card" id="ana-coverage-card">
|
||||||
|
<h3>Scoring Coverage</h3>
|
||||||
|
<div id="ana-coverage" style="font-size:13px;color:var(--text2)">Loading...</div>
|
||||||
|
</div>
|
||||||
|
<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px">
|
||||||
|
<div class="card">
|
||||||
|
<h3>Score by Mode</h3>
|
||||||
|
<div id="ana-by-mode" style="font-size:12px">Loading...</div>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<h3>Score by Model</h3>
|
||||||
|
<div id="ana-by-model" style="font-size:12px">Loading...</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<h3>Model × Mode Heatmap</h3>
|
||||||
|
<div id="ana-heatmap" style="font-size:11px;overflow-x:auto">Loading...</div>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<h3>Score Trend (30 days)</h3>
|
||||||
|
<div id="ana-trend" style="font-size:12px">Loading...</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
@ -3918,6 +4032,91 @@ function switchTab(name) {
|
|||||||
if (name === 'timeouts') renderTimeouts();
|
if (name === 'timeouts') renderTimeouts();
|
||||||
if (name === 'models') { loadOllamaModels(); renderCloudModels(); }
|
if (name === 'models') { loadOllamaModels(); renderCloudModels(); }
|
||||||
if (name === 'security') { loadDemoStatus(); loadAllowlist(); }
|
if (name === 'security') { loadDemoStatus(); loadAllowlist(); }
|
||||||
|
if (name === 'analytics') loadAnalytics();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fetch /api/admin/analytics and render the four Analytics-tab widgets:
// coverage counter, per-mode bars, per-model bars, model×mode heatmap,
// and the 30-day trend sparkline.  Server payload fields are assumed to be
// {coverage, by_mode, by_model, heatmap, trend} as produced by admin_analytics.
async function loadAnalytics() {
  try {
    var r = await fetch('/api/admin/analytics');
    var d = await r.json();
    if (d.error) { document.getElementById('ana-coverage').textContent = 'Error: ' + d.error; return; }

    // Coverage: scored runs vs total non-archived runs.
    var cov = d.coverage || {};
    document.getElementById('ana-coverage').innerHTML =
      '<strong>' + (cov.scored||0) + '</strong> / ' + (cov.total||0) + ' runs scored (' +
      (cov.total ? Math.round((cov.scored||0)/(cov.total)*100) : 0) + '%)';

    // Score by Mode - horizontal bars.
    var modeHtml = '';
    (d.by_mode||[]).forEach(function(m) {
      // avg_score is 1-10, so *10 maps it onto a 0-100% bar width.
      var pct = Math.round((parseFloat(m.avg_score)||0) * 10);
      modeHtml += '<div style="display:flex;align-items:center;gap:8px;margin-bottom:4px">' +
        '<span style="min-width:100px;font-family:JetBrains Mono,monospace;font-size:10px;text-transform:uppercase">' + m.mode + '</span>' +
        '<div style="flex:1;height:14px;background:rgba(0,0,0,0.15);border-radius:2px;overflow:hidden">' +
        '<div style="width:' + pct + '%;height:100%;background:var(--accent);border-radius:2px"></div></div>' +
        '<span style="font-family:JetBrains Mono,monospace;font-size:11px;font-weight:700;min-width:30px">' + m.avg_score + '</span>' +
        '<span style="font-size:9px;color:var(--text2)">' + m.runs + ' runs</span></div>';
    });
    document.getElementById('ana-by-mode').innerHTML = modeHtml || '<span style="color:var(--text2)">No scored runs yet</span>';

    // Score by Model - same bar layout, truncated model names.
    var modelHtml = '';
    (d.by_model||[]).forEach(function(m) {
      var pct = Math.round((parseFloat(m.avg_score)||0) * 10);
      var name = m.model.length > 20 ? m.model.substring(0,18) + '..' : m.model;
      modelHtml += '<div style="display:flex;align-items:center;gap:8px;margin-bottom:4px">' +
        '<span style="min-width:120px;font-family:JetBrains Mono,monospace;font-size:10px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap">' + name + '</span>' +
        '<div style="flex:1;height:14px;background:rgba(0,0,0,0.15);border-radius:2px;overflow:hidden">' +
        '<div style="width:' + pct + '%;height:100%;background:var(--green);border-radius:2px"></div></div>' +
        '<span style="font-family:JetBrains Mono,monospace;font-size:11px;font-weight:700;min-width:30px">' + m.avg_score + '</span>' +
        '<span style="font-size:9px;color:var(--text2)">' + m.runs + '</span></div>';
    });
    document.getElementById('ana-by-model').innerHTML = modelHtml || '<span style="color:var(--text2)">No data yet</span>';

    // Heatmap - table of mode rows × model columns.
    var hm = d.heatmap || [];
    if (hm.length) {
      var models = [...new Set(hm.map(function(h){return h.model}))];
      var modes = [...new Set(hm.map(function(h){return h.mode}))];
      var lookup = {};
      hm.forEach(function(h) { lookup[h.mode+'|'+h.model] = h.avg_score; });
      var tbl = '<table style="width:100%;border-collapse:collapse;font-family:JetBrains Mono,monospace;font-size:10px"><tr><th style="text-align:left;padding:4px">Mode</th>';
      models.forEach(function(m) { tbl += '<th style="padding:4px;text-align:center">' + (m.length>12?m.substring(0,10)+'..':m) + '</th>'; });
      tbl += '</tr>';
      modes.forEach(function(mode) {
        tbl += '<tr><td style="padding:4px;text-transform:uppercase;letter-spacing:0.5px">' + mode + '</td>';
        models.forEach(function(model) {
          var score = lookup[mode+'|'+model];
          // Cell tint: green >=7, amber >=5, red otherwise; alpha = score/15
          // so even a perfect 10 stays translucent (~0.67).
          var bg = score ? 'rgba(' + (score >= 7 ? '74,222,128' : score >= 5 ? '226,181,90' : '224,82,82') + ',' + (parseFloat(score)/15) + ')' : 'transparent';
          tbl += '<td style="padding:4px;text-align:center;background:' + bg + ';font-weight:700">' + (score || '-') + '</td>';
        });
        tbl += '</tr>';
      });
      tbl += '</table>';
      document.getElementById('ana-heatmap').innerHTML = tbl;
    } else {
      document.getElementById('ana-heatmap').innerHTML = '<span style="color:var(--text2)">Need 2+ scored runs per model/mode combination</span>';
    }

    // Trend: one bar per day; bar HEIGHT encodes run count, COLOR encodes avg score.
    var trend = d.trend || [];
    if (trend.length) {
      var trendHtml = '<div style="display:flex;align-items:flex-end;gap:2px;height:80px">';
      var maxRuns = Math.max(...trend.map(function(t){return t.runs}));
      trend.forEach(function(t) {
        var h = Math.max(4, Math.round((t.runs/maxRuns)*70));
        var color = parseFloat(t.avg_score) >= 7 ? 'var(--green)' : parseFloat(t.avg_score) >= 5 ? 'var(--accent)' : 'var(--red)';
        trendHtml += '<div title="' + t.day + ': ' + t.avg_score + ' avg (' + t.runs + ' runs)" style="flex:1;height:' + h + 'px;background:' + color + ';border-radius:1px;min-width:4px"></div>';
      });
      trendHtml += '</div><div style="display:flex;justify-content:space-between;font-size:8px;color:var(--text2);margin-top:4px"><span>' + trend[0].day + '</span><span>' + trend[trend.length-1].day + '</span></div>';
      document.getElementById('ana-trend').innerHTML = trendHtml;
    } else {
      document.getElementById('ana-trend').innerHTML = '<span style="color:var(--text2)">No data in last 30 days</span>';
    }
  } catch(e) {
    document.getElementById('ana-coverage').textContent = 'Error loading analytics: ' + e.message;
  }
}
|
}
|
||||||
|
|
||||||
async function loadDemoStatus() {
|
async function loadDemoStatus() {
|
||||||
@ -5795,6 +5994,63 @@ def admin_mass_ban():
|
|||||||
|
|
||||||
# ─── ADMIN MONITOR ─────────────────────────────────────────────
|
# ─── ADMIN MONITOR ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
@app.route("/api/admin/analytics")
@admin_required
def admin_analytics():
    """Analytics: score-by-mode, score-by-model, heatmap, trend.

    Returns JSON with:
      by_mode  — avg/stddev quality per mode (scored runs only)
      by_model — avg quality per model; models_used is an array column
      heatmap  — avg quality per (mode, model) pair with >= 2 runs
      trend    — daily avg quality over the last 30 days
      coverage — scored vs total counts over non-archived runs
    On any failure, a 500 with {"error": ...} is returned.
    """
    try:
        with get_db() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute("""
                    SELECT mode, COUNT(*) as runs, ROUND(AVG(quality_score)::numeric, 2) as avg_score,
                           ROUND(STDDEV(quality_score)::numeric, 2) as std_score
                    FROM team_runs WHERE quality_score IS NOT NULL
                    GROUP BY mode ORDER BY avg_score DESC
                """)
                by_mode = [dict(r) for r in cur.fetchall()]

                # unnest() expands the models_used array so every model that
                # participated in a run is credited with that run's score.
                cur.execute("""
                    SELECT m as model, COUNT(*) as runs, ROUND(AVG(quality_score)::numeric, 2) as avg_score
                    FROM team_runs, unnest(models_used) as m
                    WHERE quality_score IS NOT NULL
                    GROUP BY m ORDER BY avg_score DESC
                """)
                by_model = [dict(r) for r in cur.fetchall()]

                # HAVING COUNT(*) >= 2 keeps noisy single-run cells out of the heatmap.
                cur.execute("""
                    SELECT mode, m as model, COUNT(*) as runs, ROUND(AVG(quality_score)::numeric, 2) as avg_score
                    FROM team_runs, unnest(models_used) as m
                    WHERE quality_score IS NOT NULL
                    GROUP BY mode, m HAVING COUNT(*) >= 2
                    ORDER BY avg_score DESC
                """)
                heatmap = [dict(r) for r in cur.fetchall()]

                cur.execute("""
                    SELECT DATE(created_at) as day, COUNT(*) as runs, ROUND(AVG(quality_score)::numeric, 2) as avg_score
                    FROM team_runs WHERE quality_score IS NOT NULL AND created_at > NOW() - INTERVAL '30 days'
                    GROUP BY DATE(created_at) ORDER BY day
                """)
                # day comes back as a date object and avg_score as Decimal —
                # convert both so jsonify can serialize them.
                trend = [{"day": str(r["day"]), "runs": r["runs"], "avg_score": float(r["avg_score"])} for r in cur.fetchall()]

                # COUNT(quality_score) counts only non-NULL scores, giving
                # "scored" vs "total" in a single query.
                cur.execute("SELECT COUNT(*) as total, COUNT(quality_score) as scored FROM team_runs WHERE archived = false")
                coverage = dict(cur.fetchone())

        return jsonify({"by_mode": by_mode, "by_model": by_model, "heatmap": heatmap, "trend": trend, "coverage": coverage})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/suggest-models")
@login_required
def suggest_models():
    """Return top-performing models for a given mode based on historical scores."""
    mode = request.args.get("mode", "")
    # The routing table is cached with a 30-minute TTL, so this stays cheap.
    per_mode = _build_routing_table().get("model_perf", {})
    top_three = per_mode.get(mode, [])[:3]
    return jsonify({"mode": mode, "suggestions": top_three})
|
||||||
|
|
||||||
|
|
||||||
@app.route("/admin/monitor")
|
@app.route("/admin/monitor")
|
||||||
@admin_required
|
@admin_required
|
||||||
def monitor_page():
|
def monitor_page():
|
||||||
@ -6159,6 +6415,29 @@ def delete_run(run_id):
|
|||||||
return jsonify({"error": str(e)}), 500
|
return jsonify({"error": str(e)}), 500
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/runs/<int:run_id>/score", methods=["POST"])
@login_required
def score_run(run_id):
    """User thumbs up/down on a run — overrides any auto-score.

    Maps 'up' -> 8.0 and 'down' -> 3.0, records who voted and when in
    score_metadata.  Returns 400 for an invalid vote, 404 if the run id
    does not exist, 500 on database failure.
    """
    data = request.json or {}
    vote = data.get("vote")
    if vote not in ("up", "down"):
        return jsonify({"error": "vote must be 'up' or 'down'"}), 400
    score = 8.0 if vote == "up" else 3.0
    method = f"user_{vote}"
    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                # BUGFIX: NULL || x is NULL in SQL, so concatenating vote
                # metadata onto a never-auto-scored run silently wiped it;
                # coalesce to an empty object first.  (Assumes score_metadata
                # is a json/jsonb or text column — the untyped '{}' literal
                # coerces either way; confirm against the migration.)
                cur.execute(
                    "UPDATE team_runs SET quality_score = %s, score_method = %s, score_metadata = COALESCE(score_metadata, '{}') || %s WHERE id = %s",
                    (score, method, json.dumps({"user": session.get("username", "unknown"), "voted_at": time.time()}), run_id)
                )
                updated = cur.rowcount
            conn.commit()
        if not updated:
            # No row matched: the run id is unknown (or was deleted).
            return jsonify({"error": "run not found"}), 404
        return jsonify({"ok": True, "score": score, "method": method})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
||||||
|
|
||||||
|
|
||||||
@app.route("/api/runs/<int:run_id>/archive", methods=["POST"])
|
@app.route("/api/runs/<int:run_id>/archive", methods=["POST"])
|
||||||
@login_required
|
@login_required
|
||||||
def archive_run(run_id):
|
def archive_run(run_id):
|
||||||
@ -7537,7 +7816,9 @@ def run_team():
|
|||||||
_log_run(dict(run, run_id=run_id))
|
_log_run(dict(run, run_id=run_id))
|
||||||
_active_runs.pop(run_id, None)
|
_active_runs.pop(run_id, None)
|
||||||
if collected:
|
if collected:
|
||||||
save_run(mode, config.get("prompt", ""), config, collected)
|
rid = save_run(mode, config.get("prompt", ""), config, collected)
|
||||||
|
if rid:
|
||||||
|
yield sse({"type": "run_saved", "run_id": rid})
|
||||||
|
|
||||||
return Response(generate(), mimetype="text/event-stream",
|
return Response(generate(), mimetype="text/event-stream",
|
||||||
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no", "Connection": "keep-alive"})
|
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no", "Connection": "keep-alive"})
|
||||||
@ -8533,6 +8814,105 @@ def run_extract(config):
|
|||||||
_save_pipeline("extract", prompt or source, steps, result_data, all_models, start)
|
_save_pipeline("extract", prompt or source, steps, result_data, all_models, start)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── CROSS-RUN LEARNING ───────────────────────────────────────
# Module-level cache of routing intelligence derived from scored runs.
_routing_table = {}
_routing_table_ts = 0  # epoch seconds of the last successful build
_ROUTING_TTL = 1800  # 30 minutes

def _build_routing_table():
    """Build routing intelligence from historical scored runs.

    Returns {"model_perf": {mode: [{model, avg_score, runs}, ...] best-first},
             "stage_perf": {content_type: {"stages": ..., "runs": ...}}}.
    Cached at module level with a 30-minute TTL; on query failure the
    previous (possibly empty) table is returned unchanged.
    NOTE(review): concurrent callers may rebuild simultaneously — benign,
    last writer wins; confirm that is acceptable under load.
    """
    global _routing_table, _routing_table_ts
    now = time.time()
    # Serve the cached table while it is still fresh.
    if _routing_table and (now - _routing_table_ts) < _ROUTING_TTL:
        return _routing_table
    try:
        with get_db() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                # Best model per mode: only runs scoring >= 5 and pairs seen
                # at least twice count, to filter out noise.
                cur.execute("""
                    SELECT mode, m as model, ROUND(AVG(quality_score)::numeric, 2) as avg_score, COUNT(*) as runs
                    FROM team_runs, unnest(models_used) as m
                    WHERE quality_score IS NOT NULL AND quality_score >= 5
                    GROUP BY mode, m HAVING COUNT(*) >= 2
                    ORDER BY mode, avg_score DESC
                """)
                model_perf = {}
                # Rows arrive ordered avg_score DESC within each mode, so each
                # mode's list is already best-first for suggest_models.
                for r in cur.fetchall():
                    mode = r["mode"]
                    if mode not in model_perf:
                        model_perf[mode] = []
                    model_perf[mode].append({"model": r["model"], "avg_score": float(r["avg_score"]), "runs": r["runs"]})

                # Best stage sequences for refine pipelines.
                cur.execute("""
                    SELECT result->>'content_type' as content_type,
                           result->'stages_run' as stages,
                           COUNT(*) as runs
                    FROM pipeline_runs
                    WHERE pipeline = 'refine' AND result->>'content_type' IS NOT NULL
                    GROUP BY result->>'content_type', result->'stages_run'
                    ORDER BY runs DESC
                """)
                stage_perf = {}
                # ORDER BY runs DESC + first-seen-wins keeps only the most
                # frequently used sequence per content type.  NOTE(review):
                # frequency, not quality_score, ranks sequences here — confirm
                # that is intended.
                for r in cur.fetchall():
                    ct = r["content_type"]
                    if ct and ct not in stage_perf:
                        stage_perf[ct] = {"stages": r["stages"], "runs": r["runs"]}

        # Publish atomically only after both queries succeeded.
        _routing_table = {"model_perf": model_perf, "stage_perf": stage_perf}
        _routing_table_ts = now
    except Exception as e:
        print(f"[ROUTING] build error: {e}")
    return _routing_table
|
||||||
|
|
||||||
|
|
||||||
|
def _assess_stage(orchestrator, stage_name, stage_output, content_type):
    """Assess a stage's output — returns structured metadata for reactive decisions.

    Asks the orchestrator model for a JSON verdict on the stage output; any
    failure (query error, unparseable reply) falls back to a neutral,
    low-severity assessment so the pipeline never stalls on assessment.
    """
    assess_prompt = (
        f"You just reviewed the output of a {stage_name} stage on a {content_type}.\n"
        f"Assess the output briefly. Return ONLY a JSON object:\n"
        f'{{"confidence": 0.0-1.0, "gaps": ["gap1", "gap2"], "severity": "low|medium|high", "suggest_stage": null}}\n\n'
        f"If the output reveals a critical problem that needs a specific follow-up stage, set suggest_stage to one of: "
        f"VALIDATE, CRITIQUE, EXPAND, STRUCTURE, STAKEHOLDER, CLARITY, EDGE_CASES, ALIGN\n"
        f"Otherwise leave suggest_stage as null.\n\n"
        f"OUTPUT TO ASSESS:\n{stage_output[:2000]}"
    )
    try:
        reply = safe_query(orchestrator, assess_prompt).strip()
        # Extract the outermost {...} span from the reply and parse it.
        start = reply.find("{")
        stop = reply.rfind("}") + 1
        if start >= 0 and stop > start:
            return json.loads(reply[start:stop])
    except Exception:
        pass
    # Neutral default: keep the pipeline moving when assessment fails.
    return {"confidence": 0.5, "gaps": [], "severity": "low", "suggest_stage": None}
|
||||||
|
|
||||||
|
|
||||||
|
def _reactive_decide(assessment, remaining_stages, stages_executed, max_stages):
|
||||||
|
"""Decide whether to insert, skip, or continue based on assessment."""
|
||||||
|
budget_left = max_stages - stages_executed
|
||||||
|
if budget_left <= 1:
|
||||||
|
return "continue", None, "budget exhausted"
|
||||||
|
|
||||||
|
suggested = assessment.get("suggest_stage")
|
||||||
|
severity = assessment.get("severity", "low")
|
||||||
|
confidence = assessment.get("confidence", 0.5)
|
||||||
|
|
||||||
|
# Insert a stage if the assessment suggests one and it's not already planned
|
||||||
|
if suggested and suggested not in remaining_stages and severity in ("medium", "high") and budget_left >= 2:
|
||||||
|
return "insert", suggested, f"{severity} severity — {', '.join(assessment.get('gaps', [])[:2])}"
|
||||||
|
|
||||||
|
# Skip next stage if confidence is very high and remaining stage seems redundant
|
||||||
|
if confidence > 0.9 and len(remaining_stages) > 1 and severity == "low":
|
||||||
|
next_stage = remaining_stages[0]
|
||||||
|
# Don't skip synthesis-oriented stages
|
||||||
|
if next_stage in ("EXPAND", "CLARITY"):
|
||||||
|
return "skip", next_stage, f"high confidence ({confidence:.0%}) — {next_stage} likely unnecessary"
|
||||||
|
|
||||||
|
return "continue", None, None
|
||||||
|
|
||||||
|
|
||||||
def run_refine(config):
|
def run_refine(config):
|
||||||
"""Auto-Refine: AI analyzes content, selects the best sequence of modes, executes them, synthesizes final version."""
|
"""Auto-Refine: AI analyzes content, selects the best sequence of modes, executes them, synthesizes final version."""
|
||||||
import time
|
import time
|
||||||
@ -8549,10 +8929,20 @@ def run_refine(config):
|
|||||||
yield sse({"type": "status", "message": "Analyzing content and planning refinement pipeline..."})
|
yield sse({"type": "status", "message": "Analyzing content and planning refinement pipeline..."})
|
||||||
yield sse({"type": "progress", "current": 0, "total": 3, "label": "analyzing"})
|
yield sse({"type": "progress", "current": 0, "total": 3, "label": "analyzing"})
|
||||||
|
|
||||||
|
# Inject routing intelligence from historical data
|
||||||
|
routing = _build_routing_table()
|
||||||
|
routing_context = ""
|
||||||
|
if routing.get("stage_perf"):
|
||||||
|
routing_context = "\nHISTORICAL DATA (from past successful runs):\n"
|
||||||
|
for ct, data in list(routing["stage_perf"].items())[:5]:
|
||||||
|
routing_context += f"- For '{ct}' content, sequence {data['stages']} was used {data['runs']} times\n"
|
||||||
|
routing_context += "Use this as guidance but adapt to the specific content.\n"
|
||||||
|
|
||||||
plan_prompt = f"""You are a refinement strategist. Analyze this content and determine the optimal sequence of refinement stages to improve it.
|
plan_prompt = f"""You are a refinement strategist. Analyze this content and determine the optimal sequence of refinement stages to improve it.
|
||||||
|
|
||||||
CONTENT TO REFINE:
|
CONTENT TO REFINE:
|
||||||
{prompt[:8000]}
|
{prompt[:8000]}
|
||||||
|
{routing_context}
|
||||||
|
|
||||||
AVAILABLE REFINEMENT STAGES (pick 3-{max_stages} in the best order):
|
AVAILABLE REFINEMENT STAGES (pick 3-{max_stages} in the best order):
|
||||||
- VALIDATE: Fact-check claims, verify accuracy, flag unsupported statements
|
- VALIDATE: Fact-check claims, verify accuracy, flag unsupported statements
|
||||||
@ -8617,11 +9007,18 @@ Pick ONLY the stages that will meaningfully improve THIS specific content. Not e
|
|||||||
}
|
}
|
||||||
|
|
||||||
prev_output = ""
|
prev_output = ""
|
||||||
for si, stage in enumerate(stages):
|
remaining = list(stages)
|
||||||
stage_num = si + 1
|
stages_executed = 0
|
||||||
worker = workers[si % len(workers)]
|
worker_idx = 0
|
||||||
yield sse({"type": "progress", "current": stage_num, "total": total_stages, "label": stage.lower()})
|
|
||||||
yield sse({"type": "status", "message": f"Stage {stage_num}/{total_stages}: {stage} ({worker})..."})
|
while remaining and stages_executed < max_stages:
|
||||||
|
stage = remaining.pop(0)
|
||||||
|
stages_executed += 1
|
||||||
|
worker = workers[worker_idx % len(workers)]
|
||||||
|
worker_idx += 1
|
||||||
|
total_stages = stages_executed + len(remaining) + 1 # +1 for synthesis
|
||||||
|
yield sse({"type": "progress", "current": stages_executed, "total": total_stages, "label": stage.lower()})
|
||||||
|
yield sse({"type": "status", "message": f"Stage {stages_executed}: {stage} ({worker})..."})
|
||||||
|
|
||||||
template = stage_prompts.get(stage, "Analyze and improve this {type}:\n\n{content}")
|
template = stage_prompts.get(stage, "Analyze and improve this {type}:\n\n{content}")
|
||||||
stage_prompt = template.format(
|
stage_prompt = template.format(
|
||||||
@ -8637,13 +9034,26 @@ Pick ONLY the stages that will meaningfully improve THIS specific content. Not e
|
|||||||
prev_output = result
|
prev_output = result
|
||||||
steps.append({"step": stage, "model": worker, "output": result[:1000]})
|
steps.append({"step": stage, "model": worker, "output": result[:1000]})
|
||||||
|
|
||||||
# For STRUCTURE and CLARITY stages, the output replaces the working content
|
|
||||||
if stage in ("STRUCTURE", "CLARITY"):
|
if stage in ("STRUCTURE", "CLARITY"):
|
||||||
current_content = result
|
current_content = result
|
||||||
|
|
||||||
|
# Reactive assessment — should we adjust the pipeline?
|
||||||
|
if remaining and stages_executed < max_stages - 1:
|
||||||
|
assessment = _assess_stage(orchestrator, stage, result, content_type)
|
||||||
|
decision, target, reason = _reactive_decide(assessment, remaining, stages_executed, max_stages)
|
||||||
|
if decision == "insert" and target:
|
||||||
|
remaining.insert(0, target)
|
||||||
|
yield sse({"type": "response", "model": "system", "text": f"Reactive: inserting {target} stage — {reason}", "role": "reactive"})
|
||||||
|
steps.append({"step": "reactive_insert", "target": target, "reason": reason})
|
||||||
|
elif decision == "skip" and target:
|
||||||
|
remaining.remove(target)
|
||||||
|
yield sse({"type": "response", "model": "system", "text": f"Reactive: skipping {target} — {reason}", "role": "reactive"})
|
||||||
|
steps.append({"step": "reactive_skip", "target": target, "reason": reason})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
yield sse({"type": "response", "model": worker, "text": f"{stage} failed: {e}", "role": "error"})
|
yield sse({"type": "response", "model": worker, "text": f"{stage} failed: {e}", "role": "error"})
|
||||||
stage_outputs[stage] = f"Error: {e}"
|
stage_outputs[stage] = f"Error: {e}"
|
||||||
|
|
||||||
|
total_stages = stages_executed + 1
|
||||||
# Stage 3: Final synthesis — combine all insights into the definitive refined version
|
# Stage 3: Final synthesis — combine all insights into the definitive refined version
|
||||||
yield sse({"type": "progress", "current": total_stages, "total": total_stages, "label": "synthesize"})
|
yield sse({"type": "progress", "current": total_stages, "total": total_stages, "label": "synthesize"})
|
||||||
yield sse({"type": "status", "message": f"Final synthesis with {orchestrator}..."})
|
yield sse({"type": "status", "message": f"Final synthesis with {orchestrator}..."})
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user