From d56f08e74008c0a8e411c6e5395b5e9c5bf155b2 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Apr 2026 10:45:29 -0500 Subject: [PATCH] =?UTF-8?q?staffing:=20A=20=E2=80=94=20fill=5Fevents.parqu?= =?UTF-8?q?et=20from=2044=20scenarios=20+=2064=20lessons=20(deterministic)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Decision A from reports/staffing/synthetic-data-gap-report.md §7. Walks tests/multi-agent/scenarios/scen_*.json and data/_playbook_lessons/*.json, normalizes to a single fill_events.parquet at data/datasets/fill_events.parquet. One row per scenario event, lesson outcomes joined by (client, date) where the tuple matches. rows: 123 scenarios contributing: 40 events with outcome data: 62 unique (client, date) tuples: 40 Reproducibility: event_id is SHA1(client|date|role|at|city) truncated to 16 hex chars; rows sorted by event_id before write so re-runs produce bit-identical output. Verified. Pure normalization — no LLM, no new data, no distillation substrate mutation. 
--- data/datasets/fill_events.parquet | Bin 0 -> 13343 bytes scripts/staffing/build_fill_events.py | 157 ++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 data/datasets/fill_events.parquet create mode 100644 scripts/staffing/build_fill_events.py diff --git a/data/datasets/fill_events.parquet b/data/datasets/fill_events.parquet new file mode 100644 index 0000000000000000000000000000000000000000..15cf4ff68bf96083d403cac2d677adc7243980b1 GIT binary patch literal 13343 zcmd6O4RBP~eeb!eU7-aDOQ2n^K+qLvH^SlL-0yp-LRo!Ftb~w-5XANR+%IX-YFF&8 zKxiBnsBwT0f(2G6C3zTQ*RK6i$Hgge-Km4!1RUZ=2ytwOq{L10{PNOP&3Kq}>UR1+ zceT<=TEH`J=1n!+yZ4-X&j0)W`=4{J-5Kg)Jx*9B*1CM_lEKqUczcd7iMEB*;4Ss92XcS+FeC5+z=9mHRZ# zP%SJng~hULnJ`4xu_(w`)_C0#MV=|IWqA%;ro`!(w`5|n?s6;uFGUm;R*@vm)d@os zgQ!@tWJ}OlNp;mI0&nrA%$kO3N;1aoa*#mbG+igGh!xp#L5C=@GB0te$`Y)*jw4fOAAWK+PU7ZRiiL605ET|&36vcJ37J39$=M+wnR87%j1_mjzsmq4SVU<^P(RHDg zvqYVf3Fa`-IgV4Anp&R4nxPTPWrb>#W3+_&ZXF+^P6?w24Ckd_* zg%@;*C>G1AK!qi=N5rDZgJLpPbpv{8c^PwtVd@|s5p+Xx8H%j(vIOQ;IdBZGfTo6} zNFo903{%xWQ>VaI{i|O#Slu)&3sZB691IdomQ{4qU;~dTg3PKCZ?dwY5KGhCH3Ez` zMOnuh#~ZQ$))6s_u_hQiE5RHia7z)vJUrHU%MdwHQ^4v9sKfzR2?k3_V6`F=9@GGv zX_jhYF>n!jB$n3*R*1qHn(8*3x?Hu&PgvdpQ;7z16Wpw6CNJt1D{+FQG(Hd32R1qp zppyr;xh_Kw$7>viMH%R^EC@^_L&35l3B05kVxYfPBB~4)H7r9HHId_7hCrZ86*vX+ zrXq`t4T@xNrfEnLM5-uSqU)?8SrRxHf{>SpsK~CHU>af)!%`u1IZax1UXW$T23;2W%93wQ^*1;sQ~URPNjJkA@d9u-s=4iq3@c(B{5^9l&B zVpXs}XI>Rd@S+O2X%SU~fX0f18-J;&M3EuWWX_N@h0r)uHDU-X@$6!cZAOJ5klSP08Sz!f( zp(y{FueAmFV1AG>P>O~4{&5x^~sP4@6_1Wi^f4MI-C;34Di8AStB={n}J0>plBu*1PBH23%HdJToiPklb{b^h?gY6 zeKfSThO;CMK+50^{@`!?x(KG{B+d#wS0pQ#F%%tqsE7(!K#(L$b$1G)&T0^!gtsI~ zQ35_iS9NU43a84lpji|abWOshChCF+p~Y#9MHI0GlW>B`auzR|;2%Q}Wr%M}WJMXG zc4M_}N`UAl){TR&K@fwH70x6gKrtb#05NG<77JMi2G>QEZ#*d&fbWn|ys1mPs=RPk zF=QRU9RdN8LS|XlWn{SCuOu*2;E3fmiVV|w`mIW^sRAAoA^Z#ra2pa-07|^d2KoU8 zCCgBCT{8g#)aXkXDwuAFk_sl&$d}&&6(q|9eJv1JBv;N05P1Tm0xxKqAd^)W1yf{Y zQ^uNw4I;`{9)%8#6L>=<5PZD0>Z$;_!~>crBEYXgqC@4-p{g=tm8yjBI-rM1z?dda 
zAZ&=Dww{C(2h=ePD4mi5Xa*o;8W`%Lu8S-I$QPcy3_Y@mC6+*8VF3=zApy(<>1hF) z!Z7^8L=AKh87iD)kkIyk@9D!e0v14XuncbnZk1XrD9jX;>rfj(^BamKDu9AK zShb$51%v6P#z9q=0o}+8XOUI)Y_-U02;4ptR=WF*Mb)f6~NAt zz@dOm9_0BgR#wwd+Mn5=KmSlaQ1kA!vlZP@%+i1f4B4755IgW*7V~0@r!5T z-}}G+_ShR|hgw!I`O4+<$t$t-|Kr;8>9+s5<1gNOY4`X4Y4mSzy*v_L`=|fwqYI-y zi2wE9Ph5Po<1c>tFTZBu&+H*UJVj_Sw0j+~h7 zSj&{Q0Rhwfk7&bKGwNp9NQ;?_ra_}fQzBucB2BSq++Bz2#_C{#+Hn|FG@*^-O{hVN zbb=VBnT&)|DRWSZ?OoP@+Q%A@lNB|c+0f)b4P#r-5XO<_q)F0|Fc~mYk#?&O)Q{`Y zc$}o8ftOJ8cnccgB0VJDH%yWSNIy6PbmQq%By5g=(LfK^rqd@-ixjDcZZjT_k{B3Q zA0U0?GwRUz&m7%YiFAN|{V@_ZBH=_I7%Hl#nttX$jpM(7wJk_f7c3Zm*U`^Pk*?@& zGifCg@pR64CbF%lXwN5E1^vp+GMG9&ddbGOO-Qy_*q#@|Iq z(K;Q@=46#p+C_$jd!6+G4kN>;9xXe@>bMN8i zqRtpu^=&EaK%W~s24>)HL>GFu2eacX9v0 zoc1$F?P)`${ouvWEZH`^>@!RkywEeDN{cUFac%hS3+cr8jteiK# z31pU=F3zi4Q83=@fZ=W9XzgFQA$(j8C+3^R(Arm+(_co2A3a<+{%;H&-HiTb>FJ|j z*bUFSlg2kTp_mi@^8$5*~qGUENp<3-PWllj%~ z$o1qTBKbU(VUNScqvF*?(3u4vFqqe~WG2cMWncQHo@|*H)J%%Z+-4}u-FA9@?hJdD z+hqURF|idkhbRkT{wWY!2)!A=@=pTRV(7_0ls^}u?uYIiEb>oc5wnzOP_iiFw^62) ziA`+Ipou?=CilT?8#w$naQFc@D4#4&_@{BAobs7M27d+_qH)){hAB+&XEDJwSUa8m zemnh{6GJrP{ZkoFD^4bv{WN%(_njohc#0v1Q?BbYiCvvctuUEL{!Ai)-NzWusab)# zIFGcxg|oO(cC8aFlfH0Pm2!;Jqkl2l^4gp2{0c|=r9Usc^^FsF<@@2}-(BT;&R_b# zUEk5O@*9QDz|xIxKEt07p6R&J@Y9vW3+F%eWtUyBK5}{vIl`XRwkIz*g!#aN!<@7R zi)%2?fd>_O6Dt-L3u|Br3C~=`!XkLNjP~%jqLRLlGksDMLTp845gj^&kjhme>3Jk| zBX}Xbg^*Cu1XI~%rzrtRL(&6CVRKos9GK@7#&dC&dFE$jVmaUBGw@*T;M_jZJ82PD zmH$v#3;UU~=ihDn(yPB%UOM!=FHm~$RcXKa@|OFTd0yOF_LTc8?)9IkKk~VkS3LE_ zXa8Y-!zyp`?V_$r>q{Hbtg!s#OIJ9w=sT4Yzufi6GB5?i`k4aHxq`4~g>9jCn2B8r zpg)5Tep;R9L36SU7sF$2v89^PVhRBR#k3Gwb_q>`=@G4#X*ysU$K24GDr7WOC#y}0 zsqovSh1M*_oM>xK!h_!~Dk(|}>PI5(ga=w8D4R@eppwZFEn>DbFb^H}jjj zFhjuSvMJ}a@~Qwz9l?!YYBK2m-#5sc{*O2AcZI%JUVaJv>e&467ya(c#>*F0`h?C? 
z=bA!Kc)FVAzb)X-g3cnmgS~WWY+>os8?SZm`ZJo6VC8O4XV|mQw($ey4l3`F5#hZ1l-!a;4f!-?coCUX6+9G|( zT%(9GmG=~S5_*)RO`4M6;(=JyO8YktnMsmPB&iF+31XP8`K6sveZX|jM}(O_5vCrV zL@hH((ec6l=+F?XT&RDO12lf>JL+f9*=EKJGf9u#+JPM{PW~3~J`8u4m(sG>n3 zZuOR75`$p$cbj^@yV?)21xw7h4##RO6dt=w-Hh&@Vw4|)3NyTX?kN#%JI+8HM}6b& zt}UnPpA|pvG)IpIUOau^t4zhpo)?8P{pCOWaoNGozjuE3i4ym72fpw%XA3(1ho|?W zAOC(gDyrUD4+>fDQre8SQ1gk4MtN#F`~nu zczTWsu!C_=k@A(ym^?>`Ct3`y|BqjGFUEtHUx|_9HSSBRD^5j!baDTZ&&+p>4{mAN zr5{>-bG(1S_o9AViH{tfcR-06+o~5BKeZQD*JI4MVU1npMqHV@NM6somm2qN5kyD;r$%%kEwG9a*7$J{HZwNTp+M^h`zZ>uND z_Ln){c(6F=xo_n@Jbd)@1K<3~b4ypa17GZa(bxZT_vQ`|1j6Z7f#=$k>JyB;#IaTP zhuWE9(0*Al;+7Rt?LF0quP6rXp-X`W7W$FAtOW_nd_WqQuyim3=>#T-SX_rMcpx`X0jy1dm&q% zp}-5a_WWgvkyT3;b?HwU)_W~~dG-sQmDP;%zEZ?>d$)e^h4bHEapXtOs_8Fue)osZ zE&j`=wq_jL>FEx8%I%1*Ipc@EAY4AUR=`yO*AlqszPlQ(O1P@vnj@XhX-{*~iC&O2 zr+Ihj0GAvE1>Kq^@9#POh@-9*^T%~G@SH{2eWbmuuKRod3n?H3HlpWd^X!RrazMB1|^IJIQk}+q?z6SJMULh!B`bR^B z3X?|&`pQRd$O}SDw`|4lD8$BUvb1FpsxDp0lvM`X%l7YgH0*OUQXxt`&Y)HQIX52AdYL4+?z@f`4Ex``4bmY$)$+5X$}o zX~*vOTy&meG+ z%PW#1+dKXi^u#6v2Sxdg<=o*)_>_m?w0kWRKJZmU_feenRpf(@oACXEcWpxTM?OTg z+^mDPk})5A>VN^NtNF+na`M<3Lp})(ojI0+NsW(+ez1ku(GT$?VJRLmjdY5q1^W70%^V>}or(irGl((~Kjo z?O!Wb;k%O{yP6NU96%PcahIbMzA{U)`^15tQBGLVeum0r_zH#D==Tn6c2J)Q4?kK^ zF2aXIRyus{7yzyE@F@q9LYtkc5U4qppa*hiovUT|b|F9|riZx@xQ}@rpRvast!cSO z%)#MczA(xJot#nMrtWLFE2VtSjAW?Lk6-WsWBRb_4<F9bI}hTyjgMX`RMT{J}{JkO=l#> zawXZL_@|q3~P|FW;A+ z!DN{{?mZD4DOURa|=j@ z&4knYje_7Sj|8thTuC=aciwx?<@M1#s>~Q;OaffiwRbH?8MYT3-q#($f7XKkNxhCOI<#f=iTdDhUtGY-X|Qv17`3w1DH;4nCI@FKPA(96wD0Q8xHSHNAOx-@C^&N zPVax`Zinx|H5+|*uD}_Fzh=C@Wr7pY;F+lH(mU;U=dklWeg}uzhK2OQobj^Mik z!D9n6Q+M`GcMdn|$5DEqKx<<*bmlG&a@pJ(44xVUHq*P;IeXbp$L5Y}jSRQ98Six_ zcs&t3mjG_lTUN?#x&<4;yK~DrY-)jGFI_S$-eJ7&GQn4N1wS6DyvLq(4ioQ<`84sS zjE5=$g{WfII%v{v5b{j$?PTqoy=(=X@Y(B?+8K_>%A9WjHA!-*_fkpl^)CdkrKsfp z{q`Ai?e&}bDSLTOy~e;2FIeO~vY$Z}mA>LoSJ&ok>ofZa-@b+G=-+5xQz#VL0vG+; z6bkJL)%As-uP;>JXZO|Bg+lH0sXi2HrN7Y5{z5~Co`&tv7aF8wn)@2zQK9Wd+TOgS 
z0cHw7HnfwHX$nC*G(g+yD4F&>K(>A>unRp%+nsx0YW*HMRR;?22TOl*N7@Y4QPJym8@*jSHI9w-aw9RF@7S%y)IHmKyAq++ zQJBB3^|NrJ1MB+fBr44$efkgzbc)?$nx9b_eYY+w+B~`CYX0p7YZh z*A>&~$jx-Vl33T7;UCKIxAnF4qhms$?evz*pEUREBz)a&vQ5OjT*qz$e(4ynhnlfR zsLpm$^QbaupZfgvp}o2G>EU(&EuB?&6Yx=}qhTh!O*?1NvrUy_Z)*R%^svTv_1ii_ zysg`9>r0&ue+^Um*JbQCTYuZsVEbXCrLBKE-(~F>jHR?Db?=VeCYJQJ#k6fb!v-a_ z*_Ll$oi)RrRFh1+(1%dSq_Qk9dGAD zj=XmK5iT}t2TBP11M#(MO1@TGC_5jfpbq%a-sb7t0j2;JeWAKN8GNHMOxmmO)9}{^ z)zl7~Q$sr)Uk^E*YoBJDpgq8TviY}AyL829hf3ugZGcHRcm`k$f7AJ8%U`#X^md8# zMW}8R#;e=mmcg$~{^YgKj^-wuk*}_ij_fqpPgDc^3l|- z8>R8Lo$IkOKB>3mV`uTZL%cvvwCx0p9VFXEVxjhK%5MvG7WfBDsM`gE2Fn!F$@rV? zUpt7JGkDg^#q1cTr7k-a`TX!vWY@5nUZW2Uts%)|VvisGyG8m1+t2Z9@mdxRPvDU4 R|NS3&2mUjNPWWGg{tMCbUY`H} literal 0 HcmV?d00001 diff --git a/scripts/staffing/build_fill_events.py b/scripts/staffing/build_fill_events.py new file mode 100644 index 0000000..91eec14 --- /dev/null +++ b/scripts/staffing/build_fill_events.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +""" +build_fill_events.py — Decision A from the synthetic-data gap report. + +Walks tests/multi-agent/scenarios/*.json (43 client-day scenarios) and +data/_playbook_lessons/*.json (64 retrospective outcomes) and emits a +single normalized fill_events.parquet at data/datasets/fill_events.parquet. + +Pure deterministic normalization — no LLM, no new data. Each scenario +event becomes one row. Lesson outcomes augment scenario events with +success/fail counts where (client, date, city, state) matches. + +Reproducibility: identical inputs → bit-identical output. event_id is +SHA1(client|date|role|at|city) truncated to 16 hex chars; rows are +sorted by event_id before write so re-runs produce the same parquet. 
import hashlib
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import pyarrow as pa
import pyarrow.parquet as pq

REPO = Path(__file__).resolve().parents[2]
SCENARIO_DIR = REPO / "tests" / "multi-agent" / "scenarios"
LESSONS_DIR = REPO / "data" / "_playbook_lessons"
OUT_PATH = REPO / "data" / "datasets" / "fill_events.parquet"


def event_id(client: str, date: str, role: str, at: str, city: str) -> str:
    """Return a stable 16-hex-character identifier for one scenario event.

    The id is the SHA-1 digest of ``client|date|role|at|city`` (UTF-8
    encoded), truncated to its first 16 hex characters.  Only these five
    fields participate, so two events that differ solely in other fields
    receive the same id.
    """
    digest = hashlib.sha1(
        f"{client}|{date}|{role}|{at}|{city}".encode("utf-8")
    ).hexdigest()
    return digest[:16]


def _read_json_object(path: Path) -> Optional[dict]:
    """Parse *path* as a UTF-8 JSON object.

    Returns the decoded dict, or ``None`` when the file is not valid UTF-8
    JSON or its top-level value is not an object.  Skipped files are
    reported on stderr so a broken input is never dropped silently.
    """
    try:
        # Explicit encoding: the platform default varies with locale, which
        # would make the output depend on the host rather than the inputs.
        data = json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        print(f"warning: skipping {path}: {exc}", file=sys.stderr)
        return None
    if not isinstance(data, dict):
        print(
            f"warning: skipping {path}: top-level JSON value is not an object",
            file=sys.stderr,
        )
        return None
    return data


def load_lessons() -> dict:
    """Return a map of (client, date) → outcome dict built from LESSONS_DIR.

    Every ``*.json`` file is visited in sorted path order.  Files lacking a
    truthy ``client`` or ``date`` are ignored.  When several lessons share
    the same (client, date), the one whose path sorts last wins.
    """
    out: dict = {}
    for path in sorted(LESSONS_DIR.glob("*.json")):
        d = _read_json_object(path)
        if d is None:
            continue
        client = d.get("client")
        date = d.get("date")
        if not client or not date:
            continue
        out[(client, date)] = {
            "outcome_events_total": d.get("events_total"),
            "outcome_events_ok": d.get("events_ok"),
            "outcome_checkpoint_count": d.get("checkpoint_count"),
            "outcome_model": d.get("model"),
            "outcome_cloud": d.get("cloud"),
            "outcome_lesson_path": str(path.relative_to(REPO)),
        }
    return out


def load_scenarios(lessons: dict) -> list[dict]:
    """Flatten every ``scen_*.json`` in SCENARIO_DIR into one row per event.

    Args:
        lessons: Map of (client, date) → outcome dict, as produced by
            ``load_lessons``.

    Returns:
        A list of row dicts in load order (sorted scenario path, then event
        order within the file).  Scenarios without a truthy ``client``,
        ``date`` or ``events`` contribute nothing.  Outcome columns are
        joined on (client, date) only and are ``None`` when no lesson
        matches.
    """
    rows: list[dict] = []
    for path in sorted(SCENARIO_DIR.glob("scen_*.json")):
        d = _read_json_object(path)
        if d is None:
            continue
        client = d.get("client")
        date = d.get("date")
        contract = d.get("contract") or {}
        events = d.get("events") or []
        if not client or not date or not events:
            continue
        outcome = lessons.get((client, date), {})
        source_file = str(path.relative_to(REPO))
        for event in events:
            # Missing/null string fields are normalized to "" so they hash
            # and serialize consistently.
            role = event.get("role") or ""
            at = event.get("at") or ""
            city = event.get("city") or ""
            state = event.get("state") or ""
            rows.append({
                "event_id": event_id(client, date, role, at, city),
                "source_file": source_file,
                "source_kind": "scenario",
                "client": client,
                "date": date,
                "city": city,
                "state": state,
                "role": role,
                "count": int(event.get("count") or 0),
                "kind": event.get("kind") or "",
                "at": at,
                "shift_start": event.get("shift_start") or "",
                "contract_deadline": contract.get("deadline"),
                "contract_budget_per_hour_max": contract.get("budget_per_hour_max"),
                "contract_local_bonus_per_hour": contract.get("local_bonus_per_hour"),
                "contract_local_bonus_radius_mi": contract.get("local_bonus_radius_mi"),
                "contract_fill_requirement": contract.get("fill_requirement"),
                "outcome_events_total": outcome.get("outcome_events_total"),
                "outcome_events_ok": outcome.get("outcome_events_ok"),
                "outcome_checkpoint_count": outcome.get("outcome_checkpoint_count"),
                "outcome_model": outcome.get("outcome_model"),
                "outcome_cloud": outcome.get("outcome_cloud"),
                "outcome_lesson_path": outcome.get("outcome_lesson_path"),
            })
    return rows


def main() -> int:
    """Build fill_events.parquet at OUT_PATH and print summary statistics.

    Returns:
        0 on success, 1 when no rows could be produced.
    """
    lessons = load_lessons()
    rows = load_scenarios(lessons)
    if not rows:
        print("no rows produced — scenario dir empty?", file=sys.stderr)
        return 1

    # list.sort is stable, so rows sharing an event_id keep their load order
    # and the written file stays deterministic even when ids collide.
    rows.sort(key=lambda r: r["event_id"])

    duplicate_ids = len(rows) - len({r["event_id"] for r in rows})
    if duplicate_ids:
        print(
            f"warning: {duplicate_ids} row(s) share an event_id with another row",
            file=sys.stderr,
        )

    schema = pa.schema([
        ("event_id", pa.string()),
        ("source_file", pa.string()),
        ("source_kind", pa.string()),
        ("client", pa.string()),
        ("date", pa.string()),
        ("city", pa.string()),
        ("state", pa.string()),
        ("role", pa.string()),
        ("count", pa.int32()),
        ("kind", pa.string()),
        ("at", pa.string()),
        ("shift_start", pa.string()),
        ("contract_deadline", pa.string()),
        ("contract_budget_per_hour_max", pa.int32()),
        ("contract_local_bonus_per_hour", pa.int32()),
        ("contract_local_bonus_radius_mi", pa.int32()),
        ("contract_fill_requirement", pa.string()),
        ("outcome_events_total", pa.int32()),
        ("outcome_events_ok", pa.int32()),
        ("outcome_checkpoint_count", pa.int32()),
        ("outcome_model", pa.string()),
        ("outcome_cloud", pa.bool_()),
        ("outcome_lesson_path", pa.string()),
    ])
    table = pa.Table.from_pylist(rows, schema=schema)

    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    pq.write_table(table, OUT_PATH, compression="snappy")

    matched = sum(1 for r in rows if r["outcome_events_total"] is not None)
    print(f"fill_events.parquet written: {OUT_PATH.relative_to(REPO)}")
    print(f"  rows: {len(rows)}")
    print(f"  scenarios: {len({r['source_file'] for r in rows})}")
    print(f"  with outcome: {matched}")
    print(f"  unique (client,date): {len({(r['client'], r['date']) for r in rows})}")
    # Console-only timestamp; it is not written into the parquet file.
    print(f"  generated_at: {datetime.now(timezone.utc).isoformat(timespec='seconds')}")
    return 0


if __name__ == "__main__":
    sys.exit(main())