lakehouse/scripts/kb_measure.py
root a663698571 Item 3 — geo-filtered playbook boost; diagnostic logging
ROOT CAUSE (found via instrumentation, not a hunch):
After a 20-scenario corpus batch, only 6/40 successful (role, city)
combos ever triggered playbook_memory citations on subsequent runs.
Added a `playbook_boost:` tracing::info! line in vectord::service to
log boost-map size vs candidate-pool size vs match count. One query
revealed:

  boosts=170 sources=50 parsed=50 matched=0

170 endorsed workers came back from compute_boost_for — but zero were
in the 50-candidate Toledo pool. The boost map was pulling globally-
ranked semantic neighbors (top-100 playbooks across ALL cities),
dominated by Kansas City / Chicago / Detroit forklift playbooks the
Toledo SQL filter would never admit. The mechanism was correct at the
per-playbook level; the problem was pool intersection.
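
In other words, the matched counter is the overlap between the boost
map and the SQL-admitted pool. A minimal sketch of that check in
Python (identifiers are illustrative; the real instrumentation is the
tracing::info! line above):

  boost_map = {"w1": 0.09, "w2": 0.11, "w3": 0.08}         # from compute_boost_for
  candidates = [{"worker_id": "w3"}, {"worker_id": "w9"}]  # SQL-admitted pool
  candidate_ids = {c["worker_id"] for c in candidates}
  matched = len(boost_map.keys() & candidate_ids)
  print(f"boosts={len(boost_map)} sources={len(candidates)} matched={matched}")
  # -> boosts=3 sources=2 matched=1; the failing query above logged matched=0.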

FIX (surgical, not cap-tuning):
- playbook_memory::compute_boost_for_filtered(): accepts an optional
  (city, state) filter. When set, it skips playbooks from other geos
  BEFORE cosine-ranking, so the top-k stays within the target city.
- Backwards-compatible: compute_boost_for() calls the filtered variant
  with None, so existing callers are unchanged.
- service::hybrid_search(): extracts the target (city, state) from the
  executor's SQL filter via a small parser (extract_target_geo) and
  passes it to compute_boost_for_filtered. Both pieces are sketched
  below.
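
A sketch of both pieces in Python, under assumed shapes for playbook
records and the SQL filter string (the real implementation is Rust in
playbook_memory and vectord::service):

  import math
  import re

  def cosine(a, b):
      # Plain cosine similarity over two equal-length vectors.
      dot = sum(x * y for x, y in zip(a, b))
      na = math.sqrt(sum(x * x for x in a))
      nb = math.sqrt(sum(x * x for x in b))
      return dot / (na * nb) if na and nb else 0.0

  def compute_boost_for_filtered(playbooks, query_vec, geo=None, top_k=100):
      # Geo-filter BEFORE cosine-ranking, so the top-k is drawn from the
      # target city rather than from globally-ranked neighbors.
      pool = [p for p in playbooks
              if geo is None or (p["city"], p["state"]) == geo]
      ranked = sorted(pool, key=lambda p: cosine(query_vec, p["embedding"]),
                      reverse=True)
      return {p["worker_id"]: p["boost"] for p in ranked[:top_k]}

  def compute_boost_for(playbooks, query_vec):
      # Backwards-compatible wrapper: existing callers stay unchanged.
      return compute_boost_for_filtered(playbooks, query_vec, geo=None)

  def extract_target_geo(sql_filter):
      # Pull (city, state) out of a filter like
      # "... city = 'Toledo' AND state = 'OH' ..." (shape assumed here).
      # Returns None if either half is missing, so the caller can fall
      # back to the unfiltered global behavior.
      city = re.search(r"city\s*=\s*'([^']+)'", sql_filter)
      state = re.search(r"state\s*=\s*'([^']+)'", sql_filter)
      return (city.group(1), state.group(1)) if city and state else None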

VERIFIED:
  Before fix: boosts=170 sources=50 parsed=50 matched=0   (0% hit)
  After fix:  boosts=36  sources=50 parsed=50 matched=11  (22% hit)
Top-k=10 now has 7/10 boosted workers with 2-3 citations each.
Boost values 0.075-0.113 on cosine scores 0.67-0.74 — meaningful
reorder without saturation.
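
Assuming the boost is simply added onto the cosine score (the
combination rule is not stated here, but the numbers are consistent
with it), a ~0.11 boost on a 0.68 candidate overtakes an unboosted
0.74 while staying far from saturation:

  candidates = [("boosted_toledo", 0.68, 0.11),   # (worker, cosine, boost)
                ("unboosted",      0.74, 0.00)]
  for name, cos, boost in sorted(candidates, key=lambda c: c[1] + c[2], reverse=True):
      print(f"{name}: {cos + boost:.3f}")   # boosted_toledo 0.790 beats unboosted 0.740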

scripts/kb_measure.py:
Aggregator that reads data/_kb/*.jsonl and playbooks/*/results.json,
reports fill rate, citation density, recommender confidence trend,
and zero-citation-ok combos (item 3 target signal). Used to measure
before/after on bigger batches.

Diagnostic logging stays — this class of "boosts computed but not
matched" bug can recur if the SQL filter format ever drifts, and
without the counter it is invisible. Every hybrid_search with
use_playbook_memory=true now logs its boost stats.
2026-04-20 21:35:04 -05:00

#!/usr/bin/env python3
"""Aggregate KB state for item 3 decision.
Reads data/_kb/*.jsonl and tests/multi-agent/playbooks/*/results.json
to answer:
- How many distinct signatures exist?
- Total runs, avg ok rate, avg citations per event?
- Which (role, city) combos have NEVER gotten a citation?
- Recommender confidence progression (cold → medium → high)?
- Mean turn count trend across runs (proxy for efficiency).
Run after `scripts/run_kb_batch.sh` completes. Writes a markdown
summary to tests/multi-agent/playbooks/kb_measurement.md and prints
to stdout.
"""
import json
from collections import Counter, defaultdict
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
KB = ROOT / "data" / "_kb"
PLAYBOOKS = ROOT / "tests" / "multi-agent" / "playbooks"
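
# Record shapes assumed below, inferred from the fields this script reads
# (extra keys are ignored):
#   outcomes.jsonl: {"run_id", "ok_events", "total_events", "total_citations",
#                    "total_turns", "created_at", "per_event": [{"role", ...}]}
#   pathway_recommendations.jsonl: {"confidence": "high"/"medium"/"low",
#                                   "generated_at", "neighbors_consulted": [...]}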


def load_jsonl(p):
    if not p.exists():
        return []
    out = []
    for line in p.read_text().splitlines():
        if line.strip():
            try:
                out.append(json.loads(line))
            except json.JSONDecodeError:
                pass
    return out


def main():
    sigs = load_jsonl(KB / "signatures.jsonl")
    outcomes = load_jsonl(KB / "outcomes.jsonl")
    recs = load_jsonl(KB / "pathway_recommendations.jsonl")
    corrections = load_jsonl(KB / "error_corrections.jsonl")

    # --- Basic counts ---
    print(f"Signatures: {len(sigs)}")
    print(f"Outcomes: {len(outcomes)}")
    print(f"Recommendations: {len(recs)}")
    print(f"Error corrections: {len(corrections)}")
    print()

    # --- Recommender confidence progression ---
    conf_counts = Counter(r.get("confidence", "?") for r in recs)
    print("Recommender confidence distribution:")
    for c in ("high", "medium", "low"):
        print(f"  {c:8s}: {conf_counts.get(c, 0)}")
    print()

    # Time-ordered confidence
    recs_sorted = sorted(recs, key=lambda r: r.get("generated_at", ""))
    neighbor_counts = [len(r.get("neighbors_consulted", [])) for r in recs_sorted]
    if neighbor_counts:
        print("Neighbors consulted over time (first → last):")
        print(f"  first 3: {neighbor_counts[:3]}")
        print(f"  last 3:  {neighbor_counts[-3:]}")
        print(f"  max:     {max(neighbor_counts)}")
        print()

    # --- Fill rate + citation density per run ---
    if outcomes:
        total_ok = sum(o["ok_events"] for o in outcomes)
        total_events = sum(o["total_events"] for o in outcomes)
        total_cites = sum(o.get("total_citations", 0) for o in outcomes)
        total_turns = sum(o.get("total_turns", 0) for o in outcomes)
        print(f"Fill rate: {total_ok}/{total_events} = {100*total_ok/max(1, total_events):.1f}%")
        print(f"Avg citations per run: {total_cites/len(outcomes):.2f}")
        print(f"Avg turns per run: {total_turns/len(outcomes):.1f}")
        print()

        # First 5 runs vs last 5 — does it get better?
        sorted_out = sorted(outcomes, key=lambda o: o.get("created_at", ""))
        if len(sorted_out) >= 10:
            first = sorted_out[:5]
            last = sorted_out[-5:]
            fok = sum(o["ok_events"] for o in first) / sum(o["total_events"] for o in first)
            lok = sum(o["ok_events"] for o in last) / sum(o["total_events"] for o in last)
            fcit = sum(o.get("total_citations", 0) for o in first) / 5
            lcit = sum(o.get("total_citations", 0) for o in last) / 5
            print(f"First 5 runs ok rate: {100*fok:.1f}%  avg cites: {fcit:.2f}")
            print(f"Last 5 runs ok rate:  {100*lok:.1f}%  avg cites: {lcit:.2f}")
            print()

    # --- Per-(role, city, state) citation coverage ---
    # per_event summaries don't carry city, so read each run's playbook dir
    # (results.json) for full event detail instead.
    cites_by_role_city = defaultdict(lambda: {"attempts": 0, "citations": 0, "ok": 0})
    for o in outcomes:
        run_dir = PLAYBOOKS / o["run_id"]
        results_file = run_dir / "results.json"
        if not results_file.exists():
            continue
        try:
            results = json.loads(results_file.read_text())
        except Exception:
            continue
        for r in results:
            e = r.get("event", {})
            key = (e.get("role", "?"), e.get("city", "?"), e.get("state", "?"))
            cites_by_role_city[key]["attempts"] += 1
            cites_by_role_city[key]["citations"] += len(r.get("playbook_citations") or [])
            if r.get("ok"):
                cites_by_role_city[key]["ok"] += 1

    combos_with_cites = [(k, v) for k, v in cites_by_role_city.items() if v["citations"] > 0]
    combos_zero_cites = [(k, v) for k, v in cites_by_role_city.items()
                         if v["citations"] == 0 and v["ok"] > 0]
    print(f"(role, city, state) combos with any citation: {len(combos_with_cites)}")
    print(f"(role, city, state) combos with ok fills but 0 cites: {len(combos_zero_cites)}")
    print()
    if combos_with_cites:
        print("Top 10 combos by citation count:")
        for (role, city, state), v in sorted(combos_with_cites, key=lambda x: -x[1]["citations"])[:10]:
            print(f"  {role:25s} {city:15s} {state}: {v['citations']} cites "
                  f"across {v['attempts']} attempts ({v['ok']} ok)")
        print()

    # --- Write markdown report ---
    lines = ["# KB Measurement Report", ""]
    lines.append(f"Generated from {len(outcomes)} runs across {len(sigs)} distinct signatures.")
    lines.append("")
    lines.append("## Recommender confidence")
    for c in ("high", "medium", "low"):
        lines.append(f"- {c}: {conf_counts.get(c, 0)}")
    lines.append("")
    lines.append("## Overall fill + citation")
    if outcomes:
        lines.append(f"- Fill rate: **{total_ok}/{total_events}** ({100*total_ok/max(1, total_events):.1f}%)")
        lines.append(f"- Avg citations per run: **{total_cites/len(outcomes):.2f}**")
        lines.append(f"- Avg turns per run: {total_turns/len(outcomes):.1f}")
    lines.append("")
    lines.append("## Citation coverage by (role, city, state)")
    lines.append(f"- Combos with ≥1 citation: {len(combos_with_cites)}")
    lines.append(f"- Combos with ok fills but 0 citations: {len(combos_zero_cites)}")
    lines.append("")
    lines.append("## Item 3 decision signal")
    if combos_zero_cites:
        lines.append("Non-zero: there are **combos that succeeded but never triggered "
                     "playbook_memory boost**. Candidates for item 3 investigation:")
        for (role, city, state), v in combos_zero_cites[:5]:
            lines.append(f"- {role} in {city}, {state}: {v['ok']}/{v['attempts']} ok, 0 cites")
    else:
        lines.append("All ok combos got at least some citation firing. Boost mechanism "
                     "is healthy; raising the cap may help but isn't forced.")
    lines.append("")

    out = PLAYBOOKS / "kb_measurement.md"
    out.write_text("\n".join(lines))
    print(f"✓ markdown report → {out}")


if __name__ == "__main__":
    main()