Architectural snapshot of the lakehouse codebase at the point where the
full matrix-driven agent loop with Mem0 versioning + deletion was
validated end-to-end.
WHAT THIS REPO IS
A clean single-commit snapshot of the lakehouse code. Heavy test data
(.parquet datasets, vector indexes) excluded — see REPLICATION.md for
regen path. Full lakehouse history at git.agentview.dev/profit/lakehouse.
WHAT WAS PROVEN
- Vector retrieval across the multi-corpus matrix (chicago_permits + entity
  briefs + sec_tickers + distilled procedural + llm_team runs)
- Observer hand-review (cloud + heuristic fallback) gating each candidate
- Local-model agent loop (qwen3.5:latest) with tool use + scratchpad
- Playbook seal on success → next-iter retrieval surfaces it as preamble
- Mem0 versioning + deletion in pathway_memory (sketched after this list):
  * UPSERT: ADD on a new workflow; UPDATE bumps replay_count on an identical one
  * REVISE: chains versions; parent.superseded_at + superseded_by stamped
  * RETIRE: marks a specific trace retired with a reason, excluded from retrieval
  * HISTORY: walks the chain root→tip, cycle-safe
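A minimal Python sketch of those versioning semantics (the real implementation
is the Rust module crates/vectord/src/pathway_memory.rs; every field and
function name below is an illustrative assumption, not the actual schema):

    from datetime import datetime, timezone

    def _now():
        return datetime.now(timezone.utc).isoformat()

    def upsert(store, trace):
        # ADD on a new workflow; UPDATE bumps replay_count on an identical one.
        for existing in store.values():
            if existing["workflow_hash"] == trace["workflow_hash"] and not existing.get("retired"):
                existing["replay_count"] += 1
                return existing["id"]
        trace["replay_count"] = 1
        store[trace["id"]] = trace
        return trace["id"]

    def revise(store, parent_id, new_trace):
        # Chain versions: store the new trace, stamp the parent as superseded.
        store[new_trace["id"]] = new_trace
        store[parent_id]["superseded_at"] = _now()
        store[parent_id]["superseded_by"] = new_trace["id"]

    def retire(store, trace_id, reason):
        # Mark a specific trace retired with a reason; retrieval skips these.
        store[trace_id]["retired"] = {"at": _now(), "reason": reason}

    def history(store, root_id):
        # Walk the version chain root→tip, guarding against cycles.
        chain, seen, cur = [], set(), store.get(root_id)
        while cur and cur["id"] not in seen:
            seen.add(cur["id"])
            chain.append(cur)
            cur = store.get(cur.get("superseded_by"))
        return chain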
KEY DIRECTORIES
- crates/vectord/src/pathway_memory.rs — Mem0 ops live here
- crates/vectord/src/playbook_memory.rs — original Mem0 reference
- tests/agent_test/ — local-model agent harness + PRD + session archives
- scripts/dump_raw_corpus.sh — MinIO bucket dump (raw test corpus)
- scripts/vectorize_raw_corpus.ts — corpus → vector indexes
- scripts/analyze_chicago_contracts.ts — real inference pipeline
- scripts/seal_agent_playbook.ts — Mem0 upsert from agent traces
Replication: see REPLICATION.md for Debian 13 clean install + cloud-only
adaptation (no local Ollama).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
#!/usr/bin/env python3
"""Aggregate KB state for item 3 decision.

Reads data/_kb/*.jsonl and tests/multi-agent/playbooks/*/results.json
to answer:
- How many distinct signatures exist?
- Total runs, avg ok rate, avg citations per event?
- Which (role, city) combos have NEVER gotten a citation?
- Recommender confidence progression (cold → medium → high)?
- Mean turn count trend across runs (proxy for efficiency).

Run after `scripts/run_kb_batch.sh` completes. Writes a markdown
summary to tests/multi-agent/playbooks/kb_measurement.md and prints
to stdout.
"""
import json
import os
import sys
from collections import Counter, defaultdict
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
KB = ROOT / "data" / "_kb"
PLAYBOOKS = ROOT / "tests" / "multi-agent" / "playbooks"
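
# Assumed record shapes (illustrative only; field names are inferred from the
# reads below, not an authoritative schema):
#   signatures.jsonl               one object per distinct signature
#   error_corrections.jsonl        one object per correction (only counted here)
#   outcomes.jsonl                 {"run_id", "created_at", "ok_events", "total_events",
#                                   "total_citations", "total_turns", "per_event": [...]}
#   pathway_recommendations.jsonl  {"confidence": "high|medium|low", "generated_at",
#                                   "neighbors_consulted": [...]}
#   playbooks/<run_id>/results.json
#                                  [{"ok", "playbook_citations": [...],
#                                    "event": {"role", "city", "state"}}, ...]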


def load_jsonl(p):
    """Parse a JSONL file into a list of dicts; blank and malformed lines are skipped."""
    if not p.exists():
        return []
    out = []
    for line in p.read_text().splitlines():
        if line.strip():
            try:
                out.append(json.loads(line))
            except json.JSONDecodeError:
                pass
    return out


def main():
    sigs = load_jsonl(KB / "signatures.jsonl")
    outcomes = load_jsonl(KB / "outcomes.jsonl")
    recs = load_jsonl(KB / "pathway_recommendations.jsonl")
    corrections = load_jsonl(KB / "error_corrections.jsonl")

    # --- Basic counts ---
    print(f"Signatures: {len(sigs)}")
    print(f"Outcomes: {len(outcomes)}")
    print(f"Recommendations: {len(recs)}")
    print(f"Error corrections: {len(corrections)}")
    print()

    # --- Recommender confidence progression ---
    conf_counts = Counter(r.get("confidence", "?") for r in recs)
    print("Recommender confidence distribution:")
    for c in ("high", "medium", "low"):
        print(f" {c:8s}: {conf_counts.get(c, 0)}")
    print()

    # Time-ordered confidence
    recs_sorted = sorted(recs, key=lambda r: r.get("generated_at", ""))
    neighbor_counts = [len(r.get("neighbors_consulted", [])) for r in recs_sorted]
    if neighbor_counts:
        print("Neighbors consulted over time (first → last):")
        print(f" first 3: {neighbor_counts[:3]}")
        print(f" last 3: {neighbor_counts[-3:]}")
        print(f" max: {max(neighbor_counts)}")
        print()

    # --- Fill rate + citation density per run ---
    if outcomes:
        total_ok = sum(o["ok_events"] for o in outcomes)
        total_events = sum(o["total_events"] for o in outcomes)
        total_cites = sum(o.get("total_citations", 0) for o in outcomes)
        total_turns = sum(o.get("total_turns", 0) for o in outcomes)
        print(f"Fill rate: {total_ok}/{total_events} = {100*total_ok/max(1,total_events):.1f}%")
        print(f"Avg citations per run: {total_cites/len(outcomes):.2f}")
        print(f"Avg turns per run: {total_turns/len(outcomes):.1f}")
        print()

        # First 5 runs vs last 5 — does it get better?
        sorted_out = sorted(outcomes, key=lambda o: o.get("created_at", ""))
        if len(sorted_out) >= 10:
            first = sorted_out[:5]
            last = sorted_out[-5:]
            fok = sum(o["ok_events"] for o in first) / sum(o["total_events"] for o in first)
            lok = sum(o["ok_events"] for o in last) / sum(o["total_events"] for o in last)
            fcit = sum(o.get("total_citations", 0) for o in first) / 5
            lcit = sum(o.get("total_citations", 0) for o in last) / 5
            print(f"First 5 runs ok rate: {100*fok:.1f}% avg cites: {fcit:.2f}")
            print(f"Last 5 runs ok rate: {100*lok:.1f}% avg cites: {lcit:.2f}")
            print()

    # --- Per-(role, city) citation coverage ---
    cite_by_combo = Counter()
    combo_attempts = Counter()
    for o in outcomes:
        for ev in o.get("per_event", []):
            key = (ev.get("role", "?"), "?")  # city not in per_event summary
            combo_attempts[key] += 1
    # Read the playbook dirs for full event detail (has city)
    cites_by_role_city = defaultdict(lambda: {"attempts": 0, "citations": 0, "ok": 0})
    for o in outcomes:
        run_dir = PLAYBOOKS / o["run_id"]
        results_file = run_dir / "results.json"
        if not results_file.exists():
            continue
        try:
            results = json.loads(results_file.read_text())
        except Exception:
            continue
        for r in results:
            e = r.get("event", {})
            key = (e.get("role"), e.get("city"), e.get("state"))
            cites_by_role_city[key]["attempts"] += 1
            cites_by_role_city[key]["citations"] += len(r.get("playbook_citations") or [])
            if r.get("ok"):
                cites_by_role_city[key]["ok"] += 1

    combos_with_cites = [(k, v) for k, v in cites_by_role_city.items() if v["citations"] > 0]
    combos_zero_cites = [(k, v) for k, v in cites_by_role_city.items() if v["citations"] == 0 and v["ok"] > 0]
    print(f"(role, city, state) combos with any citation: {len(combos_with_cites)}")
    print(f"(role, city, state) combos with ok fills but 0 cites: {len(combos_zero_cites)}")
    print()
    if combos_with_cites:
        print("Top 10 combos by citation count:")
        for (role, city, state), v in sorted(combos_with_cites, key=lambda x: -x[1]["citations"])[:10]:
            print(f" {role:25s} {city:15s} {state}: {v['citations']} cites across {v['attempts']} attempts ({v['ok']} ok)")
        print()

    # --- Write markdown report ---
    lines = ["# KB Measurement Report", ""]
    lines.append(f"Generated from {len(outcomes)} runs across {len(sigs)} distinct signatures.")
    lines.append("")
    lines.append("## Recommender confidence")
    for c in ("high", "medium", "low"):
        lines.append(f"- {c}: {conf_counts.get(c, 0)}")
    lines.append("")
    lines.append("## Overall fill + citation")
    if outcomes:
        lines.append(f"- Fill rate: **{total_ok}/{total_events}** ({100*total_ok/max(1,total_events):.1f}%)")
        lines.append(f"- Avg citations per run: **{total_cites/len(outcomes):.2f}**")
        lines.append(f"- Avg turns per run: {total_turns/len(outcomes):.1f}")
    lines.append("")
    lines.append("## Citation coverage by (role, city, state)")
    lines.append(f"- Combos with ≥1 citation: {len(combos_with_cites)}")
    lines.append(f"- Combos with ok fills but 0 citations: {len(combos_zero_cites)}")
    lines.append("")
    lines.append("## Item 3 decision signal")
    if combos_zero_cites:
        lines.append("Non-zero: there are **combos that succeeded but never triggered playbook_memory boost**. Candidates for item 3 investigation:")
        for (role, city, state), v in combos_zero_cites[:5]:
            lines.append(f"- {role} in {city}, {state}: {v['ok']}/{v['attempts']} ok, 0 cites")
    else:
        lines.append("All ok combos got at least some citation firing. Boost mechanism is healthy; raising the cap may help but isn't forced.")
    lines.append("")
    out = PLAYBOOKS / "kb_measurement.md"
    out.write_text("\n".join(lines))
    print(f"✓ markdown report → {out}")


if __name__ == "__main__":
    main()