#!/usr/bin/env bash
# A/B test of T3 overseer: does it actually make subsequent runs better?
# Chains Run B (T3 seed) → Run C (T3 + read-back) → Run D (T3 cloud).
# Run A is assumed already complete (launched separately). Aggregates
# metrics at the end into ab_scorecard.json.
set -euo pipefail
cd "$(dirname "$0")/.."

# Pull the Ollama cloud key out of the shared team config for the cloud run (D).
OLLAMA_CLOUD_KEY="$(python3 -c "import json; print(json.load(open('/root/llm_team_config.json'))['providers']['ollama_cloud']['api_key'])")"
export OLLAMA_CLOUD_KEY

echo "▶ A/B test start at $(date -Iseconds)"
echo "▶ prior lessons dir: $(ls data/_playbook_lessons 2>/dev/null | wc -l) files"

#######################################
# Run one scenario and report its REAL exit code.
# (The old `cmd || true; echo $?` pattern always printed 0 — `$?` was the
# status of `true`, not of bun. Capture the status before suppressing it.)
# Arguments: $1 label, $2 log file, $3.. optional ENV=VAL pairs for the run
#######################################
run_scenario() {
  local label=$1 log=$2
  shift 2
  local rv=0
  env "$@" bun tests/multi-agent/scenario.ts > "$log" 2>&1 || rv=$?
  echo "  ${label} exit=${rv}"
}

# Run B — T3 enabled local, no prior lessons should exist yet
echo "──── RUN B: T3 local, seeds first lesson ────"
run_scenario B /tmp/lakehouse_ab_B.log
# Show what B seeded; `|| true` because with pipefail an empty glob makes ls fail.
ls data/_playbook_lessons/*.json 2>/dev/null | head -5 || true

# Run C — T3 enabled local, B's lesson should load
echo "──── RUN C: T3 local, reads B's lesson ────"
run_scenario C /tmp/lakehouse_ab_C.log

# Run D — T3 enabled CLOUD (gpt-oss:120b), reads B+C lessons
echo "──── RUN D: T3 cloud, reads B+C lessons ────"
run_scenario D /tmp/lakehouse_ab_D.log LH_OVERVIEW_CLOUD=1

echo "▶ all runs done at $(date -Iseconds)"
echo "▶ scorecard:"
# Aggregate the four most recent runs into ab_scorecard.json.
# NOTE: the old version piped `ls | head | tac` into python AND re-listed via
# subprocess inside python; the stdin data was read into `runs` and never used.
# The subprocess listing is the one that was actually live, so only it remains.
python3 - <<'PYEOF'
import datetime, json, os, subprocess

labels = ['A(no-T3)', 'B(T3-seed)', 'C(T3-read)', 'D(T3-cloud)']
# This script just ran B/C/D, so the newest 3 playbook dirs are D, C, B.
# Run A (launched separately, before this script) is the 4th-newest.
all_runs = subprocess.check_output(
    ['bash', '-c', 'ls -1dt tests/multi-agent/playbooks/scenario-* | head -8']
).decode().strip().split('\n')
top4 = list(reversed(all_runs[:4]))  # oldest first → A, B, C, D

rows = []
for i, path in enumerate(top4):
    try:
        results = json.load(open(os.path.join(path, 'results.json')))
    except FileNotFoundError:
        # A run dir without results.json (e.g. crashed mid-run) is skipped.
        continue
    ok = sum(1 for r in results if r.get('ok'))
    turns = sum(r.get('turns', 0) for r in results)
    gaps = sum(len(r.get('gap_signals', [])) for r in results)
    # `playbook_citations` may be present-but-null, hence `or []`.
    cites = sum(len(r.get('playbook_citations') or []) for r in results)
    prior = []
    try:
        prior = json.load(open(os.path.join(path, 'prior_lessons.json')))
    except FileNotFoundError:
        pass
    rows.append({
        'label': labels[i] if i < len(labels) else f'run{i}',
        'path': path,
        'ok_events': ok,
        'total_events': len(results),
        'total_turns': turns,
        'total_gaps': gaps,
        'total_citations': cites,
        'prior_lessons_loaded': len(prior),
    })

# Timezone-aware replacement for the deprecated datetime.utcnow();
# output format is unchanged (ISO-8601 with a trailing Z).
now_utc = datetime.datetime.now(datetime.timezone.utc)
scorecard = {
    'generated_at': now_utc.isoformat().replace('+00:00', 'Z'),
    'runs': rows,
}
with open('tests/multi-agent/playbooks/ab_scorecard.json', 'w') as fh:
    fh.write(json.dumps(scorecard, indent=2))
print(json.dumps(scorecard, indent=2))
PYEOF
echo "▶ saved: tests/multi-agent/playbooks/ab_scorecard.json"