golangLAKEHOUSE/scripts/cutover/parity/extract_json_parity.sh

#!/usr/bin/env bash
# extract_json_parity — feed identical model-output strings through
# both Rust extract_json AND Go ExtractJSON; diff outputs.
#
# Why: the iteration loop's correctness hinges on extract_json finding
# the same JSON object in the same model output regardless of runtime.
# A divergence here means a model output that one runtime accepts and
# the other rejects (or worse, both accept but parse differently).
#
# Approach:
#   1. Run cargo test -p gateway extract_json to assert the LIVE Rust
#      function still passes its own unit tests (substrate gate)
#   2. For each fixture (input, label) tuple:
#        Rust: ./target/release/parity_extract_json < fixture
#        Go:   ./bin/parity_extract_json_go < fixture
#      Compare {matched, value} JSON outputs
#   3. Emit a markdown report with per-fixture matches/diffs
#
# Outputs: reports/cutover/gauntlet_2026-05-02/parity/extract_json_parity.md
#
# Env overrides:
#   RUST_REPO=/home/profit/lakehouse
#   RUST_BIN=$RUST_REPO/target/release/parity_extract_json
#   GO_BIN=./bin/parity_extract_json_go

# Strict-ish mode: unset variables are errors and a pipeline fails if any
# stage fails. `-e` is intentionally not enabled; the script decides per
# step whether a failure is fatal.
set -uo pipefail

# Every relative path below (Go package path, ./bin, reports/) is resolved
# against the repository root, three levels above this script. Abort if we
# cannot get there — continuing would build and write reports relative to
# whatever directory the caller happened to be in.
cd "$(dirname "$0")/../../.." || {
  echo "[extract-json-parity] cannot cd to repo root from $0" >&2
  exit 1
}

# Helper locations; each may be overridden from the environment.
RUST_REPO="${RUST_REPO:-/home/profit/lakehouse}"
RUST_BIN="${RUST_BIN:-$RUST_REPO/target/release/parity_extract_json}"
GO_BIN="${GO_BIN:-./bin/parity_extract_json_go}"

# Report destination (created on demand).
OUT_DIR="reports/cutover/gauntlet_2026-05-02/parity"
mkdir -p "$OUT_DIR"
OUT="$OUT_DIR/extract_json_parity.md"

# Make a default Go toolchain install reachable without displacing any
# `go` already on PATH (appended, not prepended).
export PATH="$PATH:/usr/local/go/bin"

# ── Build / verify both sides ───────────────────────────────────────
# Build the Rust helper only when no executable is present yet. Only the
# last few lines of cargo output are shown to keep the console readable.
if [ ! -x "$RUST_BIN" ]; then
  echo "[extract-json-parity] building Rust helper..."
  (cd "$RUST_REPO" && cargo build -p gateway --bin parity_extract_json --release 2>&1 | tail -3)
fi
# Without a Rust helper there is nothing to compare against: skip the
# probe (exit 0) rather than fail it.
if [ ! -x "$RUST_BIN" ]; then
  echo "[extract-json-parity] SKIP: $RUST_BIN missing"
  exit 0
fi

# Run live Rust unit tests (substrate gate) — ensures our helper
# matches the production extract_json behavior.
#
# The gate passes only when BOTH hold:
#   * cargo itself exits 0 (a failure in any test binary makes cargo exit
#     non-zero, even if a later binary prints "test result: ok"), and
#   * at least one test binary reports a non-zero number of passed tests,
#     so a filter that no longer matches any test cannot pass vacuously.
# Output is captured in a variable instead of a fixed /tmp file so that
# concurrent runs cannot clobber each other's log.
echo "[extract-json-parity] running cargo test extract_json (substrate gate)..."
rust_test_log=$(cd "$RUST_REPO" && cargo test -p gateway --release extract_json 2>&1)
rust_test_rc=$?
if [ "$rust_test_rc" -ne 0 ] ||
  ! grep -Eq 'test result: ok\. [1-9][0-9]* passed' <<<"$rust_test_log"; then
  echo "[extract-json-parity] live Rust tests FAILED — aborting probe"
  printf '%s\n' "$rust_test_log" | tail -8
  exit 1
fi
echo "  ✓ live Rust extract_json tests PASS"

# Build Go-side helper from internal/validator.ExtractJSON. A failed build
# is fatal: otherwise a stale binary from an earlier run could be compared
# silently, or every fixture would be reported as a spurious divergence.
if ! go build -o "$GO_BIN" ./scripts/cutover/parity/extract_json_helper; then
  echo "[extract-json-parity] Go helper build FAILED — aborting probe" >&2
  exit 1
fi

# ── Fixture set ─────────────────────────────────────────────────────
# Inline as label||raw pairs: the text before the first "||" is the
# fixture label, the text after it is the raw model output fed to both
# helpers. Entries may span several lines (the fenced fixtures do), so
# the raw half can contain newlines. Curated to exercise every
# documented branch:
#   - fenced ```json``` block
#   - fenced unlabeled ``` block
#   - bare-braces with stray prose
#   - first-balanced-of-many
#   - nested object
#   - unicode in string values
#   - escaped quotes
#   - empty object
#   - top-level array (both runtimes return first inner object)
#   - no JSON at all
#   - malformed JSON-shaped text (depth balanced but invalid syntax)
#   - trailing garbage with stray closing braces after a valid object
#   - very-large input (~10KB of prose around a tiny object)
#     NOTE(review): no entry in the array below covers this case yet.
declare -a FIXTURES=(
  "fenced_json_block||Here's my answer:
\`\`\`json
{\"fills\":[{\"candidate_id\":\"W-1\"}]}
\`\`\`
Done."
  "fenced_unlabeled||result:
\`\`\`
{\"k\":\"v\"}
\`\`\`"
  "bare_braces||Here you go: {\"fills\":[{\"candidate_id\":\"W-2\"}]}"
  "first_of_many||{\"a\":1} then {\"b\":2}"
  "nested||prefix {\"outer\":{\"inner\":[1,2,3]},\"x\":\"y\"} suffix"
  "unicode||{\"name\":\"Café résumé\",\"emoji\":\"⭐\"}"
  "escaped_quotes||{\"msg\":\"she said \\\"hello\\\"\"}"
  "empty_object||{}"
  "array_of_objects||[{\"a\":1},{\"b\":2}]"
  "no_json||just prose, no json"
  "depth_balanced_invalid||{not a key: still not}"
  "trailing_garbage||{\"k\":\"v\"} and then 5} more } stuff"
)

# Running tally plus the accumulated markdown for every divergent fixture.
TOTAL=0; MATCH=0; DIFF=0
DIFF_DETAIL=""

# Split a "label||raw" fixture entry ($1) into the globals `label` and `raw`.
# Only the FIRST "||" separates the two halves; the raw half is kept
# verbatim, including newlines and any further "|" characters.
# (`read` is unsuitable here: it stops at the first newline, and IFS is a
# set of single characters, so IFS='||' splits on every lone "|".)
split_fixture() {
  label=${1%%||*}
  raw=${1#*||}
}

# Print $1 re-serialized by jq (compact, keys sorted) so that key order
# does not count as a difference. If jq is unavailable or rejects the
# text, print $1 unchanged so the raw outputs are still compared.
normalize_json() {
  printf '%s\n' "$1" | jq -cS . 2>/dev/null || printf '%s\n' "$1"
}

for entry in "${FIXTURES[@]}"; do
  split_fixture "$entry"
  TOTAL=$((TOTAL+1))
  # Feed the identical bytes (no trailing newline added) to both helpers.
  # A non-zero exit appends a per-runtime sentinel to whatever the helper
  # printed, so a crash on one side surfaces as a divergence.
  rust_out=$(printf '%s' "$raw" | "$RUST_BIN" 2>/dev/null || echo "RUST_ERROR")
  go_out=$(printf '%s' "$raw" | "$GO_BIN" 2>/dev/null || echo "GO_ERROR")
  # Normalize JSON serialization (key order) before comparing.
  rust_norm=$(normalize_json "$rust_out")
  go_norm=$(normalize_json "$go_out")
  if [ "$rust_norm" = "$go_norm" ]; then
    MATCH=$((MATCH+1))
  else
    DIFF=$((DIFF+1))
    # Single-line preview of the input for the report. Substring expansion
    # counts characters in the current locale, so a multi-byte character is
    # never cut in half.
    raw_short=${raw:0:120}
    raw_short=${raw_short//$'\n'/ }
    printf -v diff_section \
      '\n\n### %s\n**Input (first 120 chars):** `%s`\n\n**Rust:**\n```json\n%s\n```\n\n**Go:**\n```json\n%s\n```' \
      "$label" "$raw_short" "$rust_norm" "$go_norm"
    DIFF_DETAIL+=$diff_section
  fi
done

# ── Report ──────────────────────────────────────────────────────────
# Render the markdown summary into $OUT: a fixed preamble (timestamp,
# helper paths, tally) followed by either the collected divergence
# sections or a one-line all-clear.
{
  cat <<EOF
# extract_json parity probe — Rust vs Go

**Date:** $(date -u +%Y-%m-%dT%H:%M:%SZ)
**Rust helper:** \`$RUST_BIN\` (links live \`gateway::v1::iterate::extract_json\`)
**Go helper:** \`$GO_BIN\` (links live \`internal/validator.ExtractJSON\`)

Identical model-output strings → both runtimes' \`extract_json\`.
Match = identical \`{matched, value}\` JSON output.

**Substrate gate:** \`cargo test -p gateway extract_json\` PASS before probe.

**Tally:** $MATCH match · $DIFF diff (out of $TOTAL fixtures)

EOF
  if [ -z "$DIFF_DETAIL" ]; then
    printf '%s\n' "_No divergences — extract_json parity holds across all fixtures._"
  else
    printf '%s\n' "## Divergences"
    printf '%s\n' "$DIFF_DETAIL"
  fi
} > "$OUT"

# One-line console summary; the script's exit status is 0 only when no
# fixture diverged.
printf '%s\n' "[parity] extract_json: $MATCH match / $DIFF diff (out of $TOTAL) → $OUT"
test "$DIFF" -eq 0