profit ac01fffd9a checkpoint: matrix-agent-validated (2026-04-25)
Architectural snapshot of the lakehouse codebase at the point where the
full matrix-driven agent loop with Mem0 versioning + deletion was
validated end-to-end.

WHAT THIS REPO IS
A clean single-commit snapshot of the lakehouse code. Heavy test data
(.parquet datasets, vector indexes) excluded — see REPLICATION.md for
regen path. Full lakehouse history at git.agentview.dev/profit/lakehouse.

WHAT WAS PROVEN
- Vector retrieval across a multi-corpus matrix (chicago_permits + entity
  briefs + sec_tickers + distilled procedural + llm_team runs)
- Observer hand-review (cloud + heuristic fallback) gating each candidate
- Local-model agent loop (qwen3.5:latest) with tool use + scratchpad
- Playbook seal on success → next-iter retrieval surfaces it as preamble
- Mem0 versioning + deletion in pathway_memory:
    * UPSERT: ADD on new workflow, UPDATE bumps replay_count on identical
    * REVISE: chains versions, parent.superseded_at + superseded_by stamped
    * RETIRE: marks specific trace retired with reason, excluded from retrieval
    * HISTORY: walks chain root→tip, cycle-safe
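The UPSERT/REVISE/RETIRE/HISTORY semantics above can be sketched as a small in-memory model. This is a hypothetical Python illustration only; the actual implementation is the Rust code in crates/vectord/src/pathway_memory.rs, and the class and field names here are invented for the sketch:

```python
from dataclasses import dataclass
from typing import Optional
import itertools
import time

_ids = itertools.count(1)

@dataclass
class Trace:
    id: int
    workflow: str
    replay_count: int = 1
    parent: Optional[int] = None          # previous version in the chain
    superseded_by: Optional[int] = None   # next version, stamped on REVISE
    superseded_at: Optional[float] = None
    retired: bool = False
    retire_reason: Optional[str] = None

class PathwayMemory:
    def __init__(self) -> None:
        self.traces: dict[int, Trace] = {}

    def upsert(self, workflow: str) -> Trace:
        # UPDATE bumps replay_count on an identical live workflow, else ADD
        for t in self.traces.values():
            if t.workflow == workflow and not t.retired and t.superseded_by is None:
                t.replay_count += 1
                return t
        t = Trace(id=next(_ids), workflow=workflow)
        self.traces[t.id] = t
        return t

    def revise(self, old_id: int, workflow: str) -> Trace:
        # REVISE chains versions; the parent gets superseded_at/by stamped
        parent = self.traces[old_id]
        child = Trace(id=next(_ids), workflow=workflow, parent=old_id)
        parent.superseded_by = child.id
        parent.superseded_at = time.time()
        self.traces[child.id] = child
        return child

    def retire(self, trace_id: int, reason: str) -> None:
        # RETIRE: record the reason; retrieval filters on the flag
        t = self.traces[trace_id]
        t.retired = True
        t.retire_reason = reason

    def history(self, trace_id: int) -> list[int]:
        # HISTORY walks the chain root→tip; the seen set makes it cycle-safe
        t = self.traces[trace_id]
        while t.parent is not None:
            t = self.traces[t.parent]
        chain: list[int] = []
        seen: set[int] = set()
        while t is not None and t.id not in seen:
            seen.add(t.id)
            chain.append(t.id)
            t = self.traces.get(t.superseded_by) if t.superseded_by else None
        return chain
```

The key design point the snapshot validated: revision never mutates the old trace's payload, it only stamps supersession metadata, so HISTORY can always replay the full root→tip chain.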

KEY DIRECTORIES
- crates/vectord/src/pathway_memory.rs — Mem0 ops live here
- crates/vectord/src/playbook_memory.rs — original Mem0 reference
- tests/agent_test/ — local-model agent harness + PRD + session archives
- scripts/dump_raw_corpus.sh — MinIO bucket dump (raw test corpus)
- scripts/vectorize_raw_corpus.ts — corpus → vector indexes
- scripts/analyze_chicago_contracts.ts — real inference pipeline
- scripts/seal_agent_playbook.ts — Mem0 upsert from agent traces

Replication: see REPLICATION.md for Debian 13 clean install + cloud-only
adaptation (no local Ollama).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-25 19:43:27 -05:00


#!/usr/bin/env bash
# A/B test of T3 overseer: does it actually make subsequent runs better?
# Chains Run B (T3 seed) → Run C (T3 + read-back) → Run D (T3 cloud).
# Run A is assumed already complete (launched separately). Aggregates
# metrics at the end into ab_scorecard.json.
set -e
cd "$(dirname "$0")/.."
export OLLAMA_CLOUD_KEY="$(python3 -c "import json; print(json.load(open('/root/llm_team_config.json'))['providers']['ollama_cloud']['api_key'])")"
echo "▶ A/B test start at $(date -Iseconds)"
echo "▶ prior lessons dir: $(ls data/_playbook_lessons 2>/dev/null | wc -l) files"
# Run B — T3 enabled local, no prior lessons should exist yet
echo "──── RUN B: T3 local, seeds first lesson ────"
bun tests/multi-agent/scenario.ts > /tmp/lakehouse_ab_B.log 2>&1 && rc=0 || rc=$?
echo " B exit=$rc"
ls data/_playbook_lessons/*.json 2>/dev/null | head -5
# Run C — T3 enabled local, B's lesson should load
echo "──── RUN C: T3 local, reads B's lesson ────"
bun tests/multi-agent/scenario.ts > /tmp/lakehouse_ab_C.log 2>&1 && rc=0 || rc=$?
echo " C exit=$rc"
# Run D — T3 enabled CLOUD (gpt-oss:120b), reads B+C lessons
echo "──── RUN D: T3 cloud, reads B+C lessons ────"
LH_OVERVIEW_CLOUD=1 bun tests/multi-agent/scenario.ts > /tmp/lakehouse_ab_D.log 2>&1 && rc=0 || rc=$?
echo " D exit=$rc"
echo "▶ all runs done at $(date -Iseconds)"
echo "▶ scorecard:"
ls -1dt tests/multi-agent/playbooks/scenario-* | head -4 | tac | python3 -c "
import sys, os, json, datetime
# stdin is the 4 most recent run dirs, oldest first: A, B, C, D
# (Run A was launched separately before this script; B/C/D ran above.)
runs = [l.strip() for l in sys.stdin if l.strip()]
labels = ['A(no-T3)', 'B(T3-seed)', 'C(T3-read)', 'D(T3-cloud)']
rows = []
for i, path in enumerate(runs):
    try:
        results = json.load(open(os.path.join(path, 'results.json')))
    except FileNotFoundError:
        continue
    ok = sum(1 for r in results if r.get('ok'))
    turns = sum(r.get('turns', 0) for r in results)
    gaps = sum(len(r.get('gap_signals', [])) for r in results)
    cites = sum(len(r.get('playbook_citations') or []) for r in results)
    prior = []
    try:
        prior = json.load(open(os.path.join(path, 'prior_lessons.json')))
    except FileNotFoundError:
        pass
    rows.append({
        'label': labels[i] if i < len(labels) else f'run{i}',
        'path': path,
        'ok_events': ok,
        'total_events': len(results),
        'total_turns': turns,
        'total_gaps': gaps,
        'total_citations': cites,
        'prior_lessons_loaded': len(prior),
    })
scorecard = {'generated_at': datetime.datetime.utcnow().isoformat() + 'Z', 'runs': rows}
with open('tests/multi-agent/playbooks/ab_scorecard.json', 'w') as f:
    f.write(json.dumps(scorecard, indent=2))
print(json.dumps(scorecard, indent=2))
"
echo "▶ saved: tests/multi-agent/playbooks/ab_scorecard.json"
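After the script finishes, the per-run rows in ab_scorecard.json can be compared side by side. A minimal sketch of such a summary, assuming the scorecard has the shape written by the script above (the `summarize` helper name is invented here):

```python
import json

def summarize(scorecard: dict) -> list[str]:
    """One summary line per run, in the order the scorecard lists them."""
    lines = []
    for run in scorecard['runs']:
        # guard against an empty results.json so the rate never divides by zero
        rate = run['ok_events'] / max(run['total_events'], 1)
        lines.append(
            f"{run['label']:<12} ok={rate:.0%} turns={run['total_turns']} "
            f"gaps={run['total_gaps']} cites={run['total_citations']} "
            f"lessons={run['prior_lessons_loaded']}"
        )
    return lines

if __name__ == '__main__':
    with open('tests/multi-agent/playbooks/ab_scorecard.json') as f:
        print('\n'.join(summarize(json.load(f))))
```

If T3 is helping, the expectation is that C and D show fewer gap signals and more playbook citations than B, with A as the no-T3 baseline.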