#!/usr/bin/env python3
"""Aggregate KB state for item 3 decision.

Reads data/_kb/*.jsonl and tests/multi-agent/playbooks/*/results.json to answer:
- How many distinct signatures exist?
- Total runs, avg ok rate, avg citations per event?
- Which (role, city) combos have NEVER gotten a citation?
- Recommender confidence progression (cold → medium → high)?
- Mean turn count trend across runs (proxy for efficiency).

Run after `scripts/run_kb_batch.sh` completes. Writes a markdown summary to
tests/multi-agent/playbooks/kb_measurement.md and prints to stdout.
"""

import json
import os
import sys
from collections import Counter, defaultdict
from pathlib import Path

# Repo root is one level above this script's directory.
ROOT = Path(__file__).resolve().parents[1]
KB = ROOT / "data" / "_kb"
PLAYBOOKS = ROOT / "tests" / "multi-agent" / "playbooks"


def load_jsonl(p):
    """Read a JSON-lines file, returning a list of parsed objects.

    Best-effort: a missing file yields [], and blank or unparseable lines
    are skipped silently (KB files may contain partial writes).
    """
    if not p.exists():
        return []
    out = []
    for line in p.read_text(encoding="utf-8").splitlines():
        if line.strip():
            try:
                out.append(json.loads(line))
            except json.JSONDecodeError:
                # Tolerate corrupt/partial lines rather than aborting the report.
                pass
    return out


def main():
    """Aggregate KB jsonl files + per-run results.json into a console summary
    and a markdown report at PLAYBOOKS/kb_measurement.md."""
    sigs = load_jsonl(KB / "signatures.jsonl")
    outcomes = load_jsonl(KB / "outcomes.jsonl")
    recs = load_jsonl(KB / "pathway_recommendations.jsonl")
    corrections = load_jsonl(KB / "error_corrections.jsonl")

    # --- Basic counts ---
    print(f"Signatures: {len(sigs)}")
    print(f"Outcomes: {len(outcomes)}")
    print(f"Recommendations: {len(recs)}")
    print(f"Error corrections: {len(corrections)}")
    print()

    # --- Recommender confidence progression ---
    conf_counts = Counter(r.get("confidence", "?") for r in recs)
    print("Recommender confidence distribution:")
    for c in ("high", "medium", "low"):
        print(f" {c:8s}: {conf_counts.get(c, 0)}")
    print()

    # Time-ordered confidence: does the recommender consult more neighbors as
    # the KB warms up? Sort by generation timestamp (ISO strings sort lexically).
    recs_sorted = sorted(recs, key=lambda r: r.get("generated_at", ""))
    neighbor_counts = [len(r.get("neighbors_consulted", [])) for r in recs_sorted]
    if neighbor_counts:
        print("Neighbors consulted over time (first → last):")
        print(f" first 3: {neighbor_counts[:3]}")
        print(f" last 3: {neighbor_counts[-3:]}")
        print(f" max: {max(neighbor_counts)}")
        print()

    # --- Fill rate + citation density per run ---
    if outcomes:
        total_ok = sum(o["ok_events"] for o in outcomes)
        total_events = sum(o["total_events"] for o in outcomes)
        total_cites = sum(o.get("total_citations", 0) for o in outcomes)
        total_turns = sum(o.get("total_turns", 0) for o in outcomes)
        print(f"Fill rate: {total_ok}/{total_events} = {100*total_ok/max(1,total_events):.1f}%")
        print(f"Avg citations per run: {total_cites/len(outcomes):.2f}")
        print(f"Avg turns per run: {total_turns/len(outcomes):.1f}")
        print()

        # First 5 runs vs last 5 — does it get better?
        sorted_out = sorted(outcomes, key=lambda o: o.get("created_at", ""))
        if len(sorted_out) >= 10:
            first = sorted_out[:5]
            last = sorted_out[-5:]
            # max(1, ...) guards against runs that recorded zero events,
            # consistent with the overall fill-rate line above.
            fok = sum(o["ok_events"] for o in first) / max(1, sum(o["total_events"] for o in first))
            lok = sum(o["ok_events"] for o in last) / max(1, sum(o["total_events"] for o in last))
            fcit = sum(o.get("total_citations", 0) for o in first) / 5
            lcit = sum(o.get("total_citations", 0) for o in last) / 5
            print(f"First 5 runs ok rate: {100*fok:.1f}% avg cites: {fcit:.2f}")
            print(f"Last 5 runs ok rate: {100*lok:.1f}% avg cites: {lcit:.2f}")
            print()

    # --- Per-(role, city) citation coverage ---
    # The per_event summaries in outcomes.jsonl carry role but not city, so we
    # read each run's playbook dir results.json for full event detail instead.
    cites_by_role_city = defaultdict(lambda: {"attempts": 0, "citations": 0, "ok": 0})
    for o in outcomes:
        run_dir = PLAYBOOKS / o["run_id"]
        results_file = run_dir / "results.json"
        if not results_file.exists():
            continue
        try:
            results = json.loads(results_file.read_text(encoding="utf-8"))
        except Exception:
            # Best-effort aggregation: skip unreadable/partial results files.
            continue
        for r in results:
            e = r.get("event", {})
            key = (e.get("role"), e.get("city"), e.get("state"))
            cites_by_role_city[key]["attempts"] += 1
            # playbook_citations may be absent or explicitly null.
            cites_by_role_city[key]["citations"] += len(r.get("playbook_citations") or [])
            if r.get("ok"):
                cites_by_role_city[key]["ok"] += 1

    combos_with_cites = [(k, v) for k, v in cites_by_role_city.items() if v["citations"] > 0]
    # The item 3 signal: combos that filled successfully yet never cited a playbook.
    combos_zero_cites = [
        (k, v) for k, v in cites_by_role_city.items() if v["citations"] == 0 and v["ok"] > 0
    ]
    print(f"(role, city, state) combos with any citation: {len(combos_with_cites)}")
    print(f"(role, city, state) combos with ok fills but 0 cites: {len(combos_zero_cites)}")
    print()
    if combos_with_cites:
        print("Top 10 combos by citation count:")
        for (role, city, state), v in sorted(combos_with_cites, key=lambda x: -x[1]["citations"])[:10]:
            # `or "?"` guards the width format specs against missing (None) fields,
            # which would otherwise raise TypeError in the f-string.
            print(f" {role or '?':25s} {city or '?':15s} {state}: {v['citations']} cites across {v['attempts']} attempts ({v['ok']} ok)")
        print()

    # --- Write markdown report ---
    lines = ["# KB Measurement Report", ""]
    lines.append(f"Generated from {len(outcomes)} runs across {len(sigs)} distinct signatures.")
    lines.append("")
    lines.append("## Recommender confidence")
    for c in ("high", "medium", "low"):
        lines.append(f"- {c}: {conf_counts.get(c, 0)}")
    lines.append("")
    lines.append("## Overall fill + citation")
    if outcomes:
        # total_* are only defined when outcomes is non-empty (computed above).
        lines.append(f"- Fill rate: **{total_ok}/{total_events}** ({100*total_ok/max(1,total_events):.1f}%)")
        lines.append(f"- Avg citations per run: **{total_cites/len(outcomes):.2f}**")
        lines.append(f"- Avg turns per run: {total_turns/len(outcomes):.1f}")
    lines.append("")
    lines.append("## Citation coverage by (role, city, state)")
    lines.append(f"- Combos with ≥1 citation: {len(combos_with_cites)}")
    lines.append(f"- Combos with ok fills but 0 citations: {len(combos_zero_cites)}")
    lines.append("")
    lines.append("## Item 3 decision signal")
    if combos_zero_cites:
        lines.append("Non-zero: there are **combos that succeeded but never triggered playbook_memory boost**. Candidates for item 3 investigation:")
        for (role, city, state), v in combos_zero_cites[:5]:
            lines.append(f"- {role} in {city}, {state}: {v['ok']}/{v['attempts']} ok, 0 cites")
    else:
        lines.append("All ok combos got at least some citation firing. Boost mechanism is healthy; raising the cap may help but isn't forced.")
    lines.append("")

    out = PLAYBOOKS / "kb_measurement.md"
    # Explicit UTF-8: the report contains non-ASCII (→, ≥, ✓) and must not
    # depend on the locale's default encoding.
    out.write_text("\n".join(lines), encoding="utf-8")
    print(f"✓ markdown report → {out}")


if __name__ == "__main__":
    main()