diff --git a/reports/cutover/gauntlet_2026-05-02/parity/embed_parity.md b/reports/cutover/gauntlet_2026-05-02/parity/embed_parity.md new file mode 100644 index 0000000..03c7482 --- /dev/null +++ b/reports/cutover/gauntlet_2026-05-02/parity/embed_parity.md @@ -0,0 +1,17 @@ +# /v1/embed cross-runtime parity probe + +**Date:** 2026-05-02T11:28:19Z +**Rust:** `http://127.0.0.1:3100/ai/embed` · **Go:** `http://127.0.0.1:4110/v1/embed` +**Model:** `nomic-embed-text` (forced — overrides each side's default) +**Match metric:** cosine similarity ≥ `0.99999` + +Identical text → both endpoints. Cosine compares vector +DIRECTION (the operationally-meaningful property for HNSW +retrieval); byte-equal isn't expected because Go round-trips +through float32 internally while Rust stays at f64. + +**Tally:** 8 match · 0 diff (out of 8 fixtures) + +_Cosine ≥ 0.99999 on every fixture — embed parity holds_ +_post-sidecar-drop. Rust and Go produce vectors that point in_ +_the same direction in 768-dim space._ diff --git a/scripts/cutover/parity/embed_parity.sh b/scripts/cutover/parity/embed_parity.sh new file mode 100755 index 0000000..614781e --- /dev/null +++ b/scripts/cutover/parity/embed_parity.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash +# embed_parity — verify Rust /ai/embed (now direct Ollama post-sidecar +# drop) and Go /v1/embed (via embedd) produce equivalent 768-dim +# vectors for identical input. +# +# Why: today's sidecar drop (lakehouse commit ba928b1) changed Rust's +# embed transport. Go's embedd has always been direct. A drift would +# break retrieval semantics — same query, different vector → different +# top-K matches → different staffing recommendations. This probe is +# the regression gate. +# +# Match metric: cosine similarity ≥ 0.99999. Byte-equal comparison +# isn't realistic — Go represents embeddings as []float32 and Rust +# as Vec<f64>, so JSON round-trip introduces small float drift. 
What +# matters for retrieval correctness is cosine direction, which both +# runtimes preserve when calling the same Ollama with the same model. +# +# Forces both endpoints to use `nomic-embed-text` so the v1-vs-v2-moe +# default difference doesn't pollute the comparison. +# +# Outputs: reports/cutover/gauntlet_2026-05-02/parity/embed_parity.md + +set -uo pipefail +cd "$(dirname "$0")/../../.." + +RUST_GW="${RUST_GW:-http://127.0.0.1:3100}" +GO_GW="${GO_GW:-http://127.0.0.1:4110}" +MODEL="${EMBED_MODEL:-nomic-embed-text}" +THRESHOLD="${EMBED_COSINE_THRESHOLD:-0.99999}" +OUT_DIR="reports/cutover/gauntlet_2026-05-02/parity" +mkdir -p "$OUT_DIR" +OUT="$OUT_DIR/embed_parity.md" + +# Fixtures cover the staffing-domain text shapes plus stress shapes +# (unicode, very short, very long). +FIXTURES=( + "forklift operator" + "Welder, Toledo OH, 2nd shift" + "OSHA-30 certified driver" + "dental hygienist with 3 years experience" + "CNC machine operator graveyard" + "Café résumé ⭐ 你好" + "x" + "$(printf 'long fixture: %.0s' {1..200})end" +) + +probe() { + local gw="$1" path="$2" text="$3" + curl -sf -m 15 -X POST "$gw$path" \ + -H 'Content-Type: application/json' \ + -d "$(jq -nc --arg t "$text" --arg m "$MODEL" '{texts:[$t], model:$m}')" 2>/dev/null +} + +# Cosine via inline python3 (bash arithmetic doesn't have sqrt). +cosine() { + local rust_json="$1" go_json="$2" + python3 - <= t)}'; then + MATCH=$((MATCH+1)) + else + DIFF=$((DIFF+1)) + DIFF_DETAIL="$DIFF_DETAIL"$'\n'"- \`$label\` → cos=$cos (below threshold $THRESHOLD)" + fi + ;; + esac +done + +{ + echo "# /v1/embed cross-runtime parity probe" + echo + echo "**Date:** $(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "**Rust:** \`$RUST_GW/ai/embed\` · **Go:** \`$GO_GW/v1/embed\`" + echo "**Model:** \`$MODEL\` (forced — overrides each side's default)" + echo "**Match metric:** cosine similarity ≥ \`$THRESHOLD\`" + echo + echo "Identical text → both endpoints. 
Cosine compares vector" + echo "DIRECTION (the operationally-meaningful property for HNSW" + echo "retrieval); byte-equal isn't expected because Go round-trips" + echo "through float32 internally while Rust stays at f64." + echo + echo "**Tally:** $MATCH match · $DIFF diff (out of $TOTAL fixtures)" + if [ -n "$DIFF_DETAIL" ]; then + echo + echo "## Divergences" + echo "$DIFF_DETAIL" + else + echo + echo "_Cosine ≥ $THRESHOLD on every fixture — embed parity holds_" + echo "_post-sidecar-drop. Rust and Go produce vectors that point in_" + echo "_the same direction in 768-dim space._" + fi +} > "$OUT" + +echo "[parity] embed: $MATCH match / $DIFF diff (out of $TOTAL) → $OUT" +[ "$DIFF" -eq 0 ]