matrix-agent-validated/scripts/kb_staffer_report.py
profit ac01fffd9a checkpoint: matrix-agent-validated (2026-04-25)
Architectural snapshot of the lakehouse codebase at the point where the
full matrix-driven agent loop with Mem0 versioning + deletion was
validated end-to-end.

WHAT THIS REPO IS
A clean single-commit snapshot of the lakehouse code. Heavy test data
(.parquet datasets, vector indexes) excluded — see REPLICATION.md for
regen path. Full lakehouse history at git.agentview.dev/profit/lakehouse.

WHAT WAS PROVEN
- Vector retrieval across multi-corpora matrix (chicago_permits + entity
  briefs + sec_tickers + distilled procedural + llm_team runs)
- Observer hand-review (cloud + heuristic fallback) gating each candidate
- Local-model agent loop (qwen3.5:latest) with tool use + scratchpad
- Playbook seal on success → next-iter retrieval surfaces it as preamble
- Mem0 versioning + deletion in pathway_memory:
    * UPSERT: ADD on new workflow, UPDATE bumps replay_count on identical
    * REVISE: chains versions, parent.superseded_at + superseded_by stamped
    * RETIRE: marks specific trace retired with reason, excluded from retrieval
    * HISTORY: walks chain root→tip, cycle-safe
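The HISTORY op above (walking a version chain root→tip over `superseded_by` links, cycle-safe) can be sketched in a few lines. This is an illustrative Python sketch, not the Rust implementation in crates/vectord/src/pathway_memory.rs; the in-memory store shape and field names are assumptions:

```python
def walk_history(store, root_id):
    """Walk a version chain root->tip via superseded_by links, cycle-safe.

    `store` is assumed to be a dict of id -> record; each record may carry
    a `superseded_by` id set by a REVISE. A `seen` set guards against cycles.
    """
    chain, seen = [], set()
    cur = root_id
    while cur is not None and cur not in seen:
        seen.add(cur)
        rec = store.get(cur)
        if rec is None:
            break
        chain.append(rec)
        cur = rec.get("superseded_by")
    return chain


# Example with a two-version chain (ids are made up):
store = {
    "v1": {"id": "v1", "superseded_by": "v2"},
    "v2": {"id": "v2", "superseded_by": None},
}
history = walk_history(store, "v1")  # [v1 record, v2 record]
```

Even a malformed chain that loops back on itself terminates, since every visited id is recorded before following its link.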

KEY DIRECTORIES
- crates/vectord/src/pathway_memory.rs — Mem0 ops live here
- crates/vectord/src/playbook_memory.rs — original Mem0 reference
- tests/agent_test/ — local-model agent harness + PRD + session archives
- scripts/dump_raw_corpus.sh — MinIO bucket dump (raw test corpus)
- scripts/vectorize_raw_corpus.ts — corpus → vector indexes
- scripts/analyze_chicago_contracts.ts — real inference pipeline
- scripts/seal_agent_playbook.ts — Mem0 upsert from agent traces

Replication: see REPLICATION.md for Debian 13 clean install + cloud-only
adaptation (no local Ollama).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-25 19:43:27 -05:00


#!/usr/bin/env python3
"""Phase 23 staffer leaderboard + cross-staffer pattern finder.
Reads data/_kb/staffers.jsonl + outcomes.jsonl + signatures.jsonl and
emits:
- Leaderboard sorted by competence_score
- Per-staffer breakdown (fill rate, turns, citations, rescue rate)
- Cross-staffer common workers: names endorsed by multiple top
  staffers on similar signatures — candidates for "auto-discovered
  high-value worker" labels.
Run after scripts/run_staffer_demo.sh completes.
"""
import json
from collections import defaultdict
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
KB = ROOT / "data" / "_kb"
PLAYBOOKS = ROOT / "tests" / "multi-agent" / "playbooks"


def load_jsonl(p):
    """Read a .jsonl file, skipping blank and malformed lines."""
    if not p.exists():
        return []
    out = []
    for line in p.read_text().splitlines():
        if line.strip():
            try:
                out.append(json.loads(line))
            except json.JSONDecodeError:
                pass
    return out
def main():
    staffers = load_jsonl(KB / "staffers.jsonl")
    outcomes = load_jsonl(KB / "outcomes.jsonl")
    if not staffers:
        print("(no staffer stats yet — run scripts/run_staffer_demo.sh first)")
        return

    staffers.sort(key=lambda s: -s["competence_score"])
    print("=== STAFFER LEADERBOARD ===")
    print(f"{'rank':4s} {'id':7s} {'name':16s} {'role':8s} {'mo':>4s} {'runs':>4s} "
          f"{'fill':>6s} {'turns':>6s} {'cites':>6s} {'rescue':>7s} {'score':>6s}")
    print("-" * 100)
    for i, s in enumerate(staffers):
        print(f"{i + 1:4d} {s['id']:7s} {s['name']:16s} {s['role']:8s} "
              f"{s['tenure_months']:>4d} {s['total_runs']:>4d} "
              f"{s['fill_rate'] * 100:>5.1f}% {s['avg_turns_per_event']:>6.1f} "
              f"{s['avg_citations_per_run']:>6.2f} {s['rescue_rate'] * 100:>6.1f}% "
              f"{s['competence_score']:>6.3f}")
    print()
    # Cross-staffer pattern — workers endorsed on the same sig_hash
    # across multiple staffers. Auto-discovered "reliable performers".
    print("=== CROSS-STAFFER WORKER OVERLAP ===")
    print("(workers endorsed on same sig_hash by ≥2 staffers)")
    worker_touches = defaultdict(lambda: {"staffers": set(), "sigs": set(), "endorsements": 0})
    for o in outcomes:
        run_dir = PLAYBOOKS / o["run_id"]
        results_file = run_dir / "results.json"
        if not results_file.exists():
            continue
        try:
            results = json.loads(results_file.read_text())
        except Exception:
            continue
        staffer_id = o.get("staffer", {}).get("id")
        if not staffer_id:
            continue
        for r in results:
            if not r.get("ok"):
                continue
            for f in r.get("fills", []):
                name = f.get("name")
                if not name or name.startswith("Candidate "):
                    continue
                key = (name, r["event"]["role"], r["event"]["city"], r["event"]["state"])
                worker_touches[key]["staffers"].add(staffer_id)
                worker_touches[key]["sigs"].add(o["sig_hash"])
                worker_touches[key]["endorsements"] += 1
    shared = sorted(
        [(k, v) for k, v in worker_touches.items() if len(v["staffers"]) >= 2],
        key=lambda x: -x[1]["endorsements"],
    )
    if shared:
        for (name, role, city, state), v in shared[:15]:
            print(f"  {name:25s} {role:22s} {city:15s} {state}: "
                  f"{v['endorsements']} endorsements across {len(v['staffers'])} staffers")
    else:
        print("  (none yet — needs ≥2 staffers on overlapping scenarios)")
    print()
    # Competence differential — how much does top vs bottom differ?
    if len(staffers) >= 2:
        top = staffers[0]
        bot = staffers[-1]
        print("=== TOP vs BOTTOM DIFFERENTIAL ===")
        print(f"{top['name']} (top) vs {bot['name']} (bottom):")
        print(f"  fill rate:     {top['fill_rate'] * 100:.1f}% vs {bot['fill_rate'] * 100:.1f}% "
              f"(Δ {(top['fill_rate'] - bot['fill_rate']) * 100:+.1f}pt)")
        print(f"  avg turns:     {top['avg_turns_per_event']:.1f} vs {bot['avg_turns_per_event']:.1f} "
              f"(Δ {top['avg_turns_per_event'] - bot['avg_turns_per_event']:+.1f})")
        print(f"  avg citations: {top['avg_citations_per_run']:.2f} vs {bot['avg_citations_per_run']:.2f} "
              f"(Δ {top['avg_citations_per_run'] - bot['avg_citations_per_run']:+.2f})")
        print(f"  rescue rate:   {top['rescue_rate'] * 100:.1f}% vs {bot['rescue_rate'] * 100:.1f}%")
        print(f"  competence:    {top['competence_score']:.3f} vs {bot['competence_score']:.3f}")


if __name__ == "__main__":
    main()
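For reference, each line of data/_kb/staffers.jsonl carries at least the fields the report reads. The record below is synthetic (values and the meaning of `rescue_rate` are illustrative assumptions, not real output), showing the one-object-per-line round-trip the loader relies on:

```python
import json

# Hypothetical staffer record — field names match what the report reads;
# every value here is made up for illustration.
record = {
    "id": "st-001",
    "name": "Ada Example",
    "role": "lead",
    "tenure_months": 18,
    "total_runs": 42,
    "fill_rate": 0.87,            # fraction of events filled (assumed meaning)
    "avg_turns_per_event": 3.2,
    "avg_citations_per_run": 1.75,
    "rescue_rate": 0.12,          # fraction of runs rescued (assumed meaning)
    "competence_score": 0.731,
}

line = json.dumps(record)         # one JSON object per line in the .jsonl file
parsed = json.loads(line)
```

A record missing any of these keys would raise `KeyError` in the leaderboard loop, so the demo writer is expected to emit all of them.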