#!/usr/bin/env bash
# 10_perf_baseline.sh — GOLAKE-100.
# Multi-sample performance baseline. Every metric is stored as
# {value: median, mad: median absolute deviation}; the regression
# threshold is max(3*MAD, PERCENT_FLOOR) so the noise floor does
# not generate false positives.
#
# Workload sample counts:
#   ingest      n=3 runs (1000-row CSV each, fresh dataset name)
#   vector_add  n=3 runs (200 vectors each, fresh index)
#   query       n=20 samples
#   search      n=20 samples
#   rss         n=1 (steady-state in our G0 workloads; promote to
#               multi-sample if it becomes noisy)
#
# First run (or --regenerate-baseline) writes tests/proof/baseline.json.
# Subsequent runs diff against it; a regression beyond the threshold
# emits a SKIP record with REGRESSION detail. The perf claim is
# required:false in claims.yaml so the gate stays green; the human
# summary surfaces the regression by name.
#
# Skipped with a loud reason if any earlier case in this run failed,
# per spec: "performance mode runs only after contract+integration pass."

# No -e on purpose: HTTP call results are checked explicitly via
# proof_status_of, and the script decides itself when to bail.
set -uo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Shared proof-harness helpers; env.sh must come first.
for _proof_lib in env http assert metrics; do
  # shellcheck source=/dev/null
  source "${SCRIPT_DIR}/../lib/${_proof_lib}.sh"
done

CASE_ID="GOLAKE-100"
CASE_NAME="Performance baseline — multi-sample + warmup + MAD"
CASE_TYPE="performance"

# Harness probes case metadata without running the workload.
# `return` covers the sourced path, `exit` the executed path.
case "${1:-}" in
  --metadata-only) return 0 2>/dev/null || exit 0 ;;
esac

BASELINE_FILE="${PROOF_REPO_ROOT}/tests/proof/baseline.json"

# Warmup counts tuned empirically to drop inter-run variance below
# the noise floor. Each fresh bootstrap brings up cold queryd/vectord
# whose first 10–30 ops hit cold paths (cgo init, view registration,
# DuckDB connection priming, HNSW graph allocation). Warmups absorb
# that; subsequent measurements see warm paths.
INGEST_WARMUP=3
INGEST_RUNS=3
VECTOR_ADD_WARMUP=3
VECTOR_ADD_RUNS=3
QUERY_WARMUP=50
QUERY_SAMPLES=20
SEARCH_WARMUP=50
SEARCH_SAMPLES=20

# Threshold floor for noise-aware regression detection.
# Even with aggressive warmup, single-host benchmarks on a busy box
# show roughly 50% inter-run variance on bootstrap-cold metrics. The
# 75% floor catches real >75% regressions while letting normal jitter
# pass; it pairs with 3*MAD so high-variance metrics don't false-fail.
PERCENT_FLOOR="0.75"

# ── pre-flight: skip loudly if any earlier case failed ──────────
# Per spec, performance mode only runs after contract+integration
# are green; a fail record in any prior case's JSONL aborts us.
earlier_failure=0
for case_log in "${PROOF_REPORT_DIR}/raw/cases/"*.jsonl; do
  [ -e "$case_log" ] || continue
  if grep -q '"result":"fail"' "$case_log" 2>/dev/null; then
    earlier_failure=1
    break
  fi
done
if [ "$earlier_failure" = 1 ]; then
  proof_skip "$CASE_ID" "Performance baseline — earlier case failed" \
    "perf measurements are only meaningful after contract+integration green; see prior cases for failures"
  return 0 2>/dev/null || exit 0
fi

# ── deterministic 1000-row CSV shared by every ingest run ───────
PERF_CSV="${PROOF_REPORT_DIR}/raw/outputs/${CASE_ID}_perf.csv"
mkdir -p "$(dirname "$PERF_CSV")"
{
  printf '%s\n' "id,name,role,city,score"
  awk 'BEGIN{
    roles[0]="welder"; roles[1]="electrician"; roles[2]="operator"; roles[3]="pipefitter"; roles[4]="safety";
    cities[0]="Chicago"; cities[1]="Detroit"; cities[2]="Houston"; cities[3]="Cleveland"; cities[4]="St Louis";
    for (i=1; i<=1000; i++) {
      r = roles[(i-1)%5]
      c = cities[(i-1)%5]
      s = 50 + (i*7) % 50
      printf "%d,Worker%04d,%s,%s,%d\n", i, i, r, c, s
    }
  }'
} > "$PERF_CSV"
# ── ingest: warmup pass(es) discarded, then n=3 measured runs ───
# Warmup absorbs cgo init / disk-cache priming / first-write FS
# overhead that would otherwise skew the first measurement.
for (( run = 1; run <= INGEST_WARMUP; run++ )); do
  DATASET="proof_warmup_${PROOF_RUN_ID}_${run}"
  proof_call "$CASE_ID" "warmup_ingest_${run}" POST \
    "${PROOF_GATEWAY_URL}/v1/ingest?name=${DATASET}" \
    -F "file=@${PERF_CSV}" >/dev/null
done

INGEST_RPS_FILE="${PROOF_REPORT_DIR}/raw/metrics/_ingest_rps"
: > "$INGEST_RPS_FILE"
for (( run = 1; run <= INGEST_RUNS; run++ )); do
  DATASET="proof_perf_${PROOF_RUN_ID}_${run}"
  proof_metric_start "$CASE_ID" "ingest_${run}"
  proof_call "$CASE_ID" "perf_ingest_${run}" POST \
    "${PROOF_GATEWAY_URL}/v1/ingest?name=${DATASET}" \
    -F "file=@${PERF_CSV}" >/dev/null
  ms=$(proof_metric_stop "$CASE_ID" "ingest_${run}")
  status=$(proof_status_of "$CASE_ID" "perf_ingest_${run}")
  if [ "$status" != "200" ]; then
    # Downstream query metrics depend on these datasets existing.
    proof_skip "$CASE_ID" "Performance baseline — perf ingest failed run ${run}" \
      "ingest of 1000-row CSV returned ${status}; cannot baseline downstream metrics"
    return 0 2>/dev/null || exit 0
  fi
  # rows/sec for this run; a 0ms reading is clamped to 1ms so the
  # division never blows up.
  awk -v ms="$ms" -v rows=1000 \
    'BEGIN{ if (ms == 0) ms = 1; printf "%.0f\n", rows * 1000 / ms }' \
    >> "$INGEST_RPS_FILE"
done

ingest_rps_median=$(proof_compute_percentile "$INGEST_RPS_FILE" 50)
ingest_rps_mad=$(proof_compute_mad "$INGEST_RPS_FILE")
proof_metric_value "$CASE_ID" "ingest_rows_per_sec_median" "$ingest_rps_median" "rows/s"
proof_metric_value "$CASE_ID" "ingest_rows_per_sec_mad" "$ingest_rps_mad" "rows/s"

# Use the first dataset for query benchmarks.
QUERY_DATASET="proof_perf_${PROOF_RUN_ID}_1"

# ── query: warmup samples discarded, then n=20 measured ─────────
QUERY_LATENCIES="${PROOF_REPORT_DIR}/raw/metrics/_query_latencies"
: > "$QUERY_LATENCIES"
sql_body=$(jq -nc --arg s "SELECT count(*) AS n FROM ${QUERY_DATASET}" '{sql:$s}')

for (( sample = 1; sample <= QUERY_WARMUP; sample++ )); do
  proof_post "$CASE_ID" "query_warmup_${sample}" "${PROOF_GATEWAY_URL}/v1/sql" \
    "application/json" "$sql_body" >/dev/null
done
for (( sample = 1; sample <= QUERY_SAMPLES; sample++ )); do
  proof_post "$CASE_ID" "query_${sample}" "${PROOF_GATEWAY_URL}/v1/sql" \
    "application/json" "$sql_body" >/dev/null
  proof_latency_of "$CASE_ID" "query_${sample}" >> "$QUERY_LATENCIES"
done

query_median=$(proof_compute_percentile "$QUERY_LATENCIES" 50)
query_mad=$(proof_compute_mad "$QUERY_LATENCIES")
query_p95=$(proof_compute_percentile "$QUERY_LATENCIES" 95)
proof_metric_value "$CASE_ID" "query_median_ms" "$query_median" "ms"
proof_metric_value "$CASE_ID" "query_mad_ms" "$query_mad" "ms"
proof_metric_value "$CASE_ID" "query_p95_ms" "$query_p95" "ms"

# ── vector_add: n=3 runs — collect vectors/sec per run ──────────
# 200 deterministic 4-d vectors per request body.
add_body=$(jq -nc '
  {items: [range(0; 200) | {
    id: ("perf-" + (. | tostring)),
    vector: [(. * 0.01), (. * 0.01 + 1), (. * 0.01 + 2), (. * 0.01 + 3)]
  }]}
')
VEC_VPS_FILE="${PROOF_REPORT_DIR}/raw/metrics/_vector_vps"
: > "$VEC_VPS_FILE"
declare -a perf_indexes=()

# Warmup pass(es): create + add to a throwaway index, discard timing.
for (( i = 1; i <= VECTOR_ADD_WARMUP; i++ )); do
  WIDX="proof_warmup_idx_${PROOF_RUN_ID}_${i}"
  proof_post "$CASE_ID" "warmup_create_${i}" \
    "${PROOF_GATEWAY_URL}/v1/vectors/index" \
    "application/json" "{\"name\":\"${WIDX}\",\"dimension\":4}" >/dev/null
  proof_post "$CASE_ID" "warmup_add_${i}" \
    "${PROOF_GATEWAY_URL}/v1/vectors/index/${WIDX}/add" \
    "application/json" "$add_body" >/dev/null
  proof_delete "$CASE_ID" "warmup_clean_${i}" \
    "${PROOF_GATEWAY_URL}/v1/vectors/index/${WIDX}" >/dev/null
done

for (( i = 1; i <= VECTOR_ADD_RUNS; i++ )); do
  INDEX="proof_perf_idx_${PROOF_RUN_ID}_${i}"
  perf_indexes+=("$INDEX")
  proof_post "$CASE_ID" "perf_create_${i}" \
    "${PROOF_GATEWAY_URL}/v1/vectors/index" \
    "application/json" "{\"name\":\"${INDEX}\",\"dimension\":4}" >/dev/null
  proof_metric_start "$CASE_ID" "vector_add_${i}"
  proof_post "$CASE_ID" "perf_add_${i}" \
    "${PROOF_GATEWAY_URL}/v1/vectors/index/${INDEX}/add" \
    "application/json" "$add_body" >/dev/null
  ms=$(proof_metric_stop "$CASE_ID" "vector_add_${i}")
  # Only successful adds contribute a vectors/sec sample; 0ms clamps to 1ms.
  if [ "$(proof_status_of "$CASE_ID" "perf_add_${i}")" = "200" ]; then
    awk -v ms="$ms" -v n=200 \
      'BEGIN{ if (ms == 0) ms = 1; printf "%.0f\n", n * 1000 / ms }' \
      >> "$VEC_VPS_FILE"
  fi
done
vec_vps_median=$(proof_compute_percentile "$VEC_VPS_FILE" 50)
vec_vps_mad=$(proof_compute_mad "$VEC_VPS_FILE")
proof_metric_value "$CASE_ID" "vectors_per_sec_add_median" "$vec_vps_median" "vec/s"
proof_metric_value "$CASE_ID" "vectors_per_sec_add_mad" "$vec_vps_mad" "vec/s"

# ── search: warmup samples discarded, then n=20 measurement ──────
SEARCH_INDEX="${perf_indexes[0]}"
SEARCH_LATENCIES="${PROOF_REPORT_DIR}/raw/metrics/_search_latencies"
: > "$SEARCH_LATENCIES"
search_body='{"vector":[1,2,3,4],"k":5}'
for (( i = 1; i <= SEARCH_WARMUP; i++ )); do
  proof_post "$CASE_ID" "search_warmup_${i}" \
    "${PROOF_GATEWAY_URL}/v1/vectors/index/${SEARCH_INDEX}/search" \
    "application/json" "$search_body" >/dev/null
done
for (( i = 1; i <= SEARCH_SAMPLES; i++ )); do
  proof_post "$CASE_ID" "search_${i}" \
    "${PROOF_GATEWAY_URL}/v1/vectors/index/${SEARCH_INDEX}/search" \
    "application/json" "$search_body" >/dev/null
  proof_latency_of "$CASE_ID" "search_${i}" >> "$SEARCH_LATENCIES"
done
search_median=$(proof_compute_percentile "$SEARCH_LATENCIES" 50)
search_mad=$(proof_compute_mad "$SEARCH_LATENCIES")
search_p95=$(proof_compute_percentile "$SEARCH_LATENCIES" 95)
proof_metric_value "$CASE_ID" "search_median_ms" "$search_median" "ms"
proof_metric_value "$CASE_ID" "search_mad_ms" "$search_mad" "ms"
proof_metric_value "$CASE_ID" "search_p95_ms" "$search_p95" "ms"

# ── per-service RSS (single sample — steady-state in G0) ─────────
# Metric names say _mb, so proof_sample_rss presumably reports MB —
# its contract is in lib/metrics.sh; confirm there.
declare -A rss_now
for svc in storaged catalogd ingestd queryd vectord embedd gateway; do
  rss=$(proof_sample_rss "$CASE_ID" "bin/${svc}" 2>/dev/null || echo 0)
  rss_now[$svc]="${rss:-0}"
done

# Cleanup the perf indexes. Datasets stay — small, harmless.
for idx in "${perf_indexes[@]}"; do
  proof_delete "$CASE_ID" "perf_clean_${idx}" \
    "${PROOF_GATEWAY_URL}/v1/vectors/index/${idx}" >/dev/null
done

# ── baseline write or diff ──────────────────────────────────────
# Persist this run's medians/MADs as the new baseline.
# NOTE(review): the original heredoc body of this function was
# corrupted in-tree ("cat > $BASELINE_FILE < threshold AND ...");
# reconstructed here from the exact metric names diff_metric and the
# dispatch below read — confirm against git history. RSS metrics are
# single-sample, so their MAD is recorded as 0.
write_baseline() {
  mkdir -p "$(dirname "$BASELINE_FILE")"
  cat > "$BASELINE_FILE" <<EOF
{
  "metrics": {
    "ingest_rows_per_sec": {"value": ${ingest_rps_median:-0}, "mad": ${ingest_rps_mad:-0}},
    "query_ms": {"value": ${query_median:-0}, "mad": ${query_mad:-0}},
    "vectors_per_sec_add": {"value": ${vec_vps_median:-0}, "mad": ${vec_vps_mad:-0}},
    "search_ms": {"value": ${search_median:-0}, "mad": ${search_mad:-0}},
    "rss_vectord_mb": {"value": ${rss_now[vectord]:-0}, "mad": 0},
    "rss_queryd_mb": {"value": ${rss_now[queryd]:-0}, "mad": 0}
  }
}
EOF
}

# Diff one metric against the baseline. A regression is flagged when
# the delta exceeds the threshold in the "worse" direction (lower
# throughput, or higher latency/RSS).
#   $1 name       metric key in baseline.json
#   $2 actual     this run's measured value
#   $3 direction  higher_is_better | lower_is_better
diff_metric() {
  local name="$1" actual="$2" direction="$3"
  local base_val base_mad
  base_val=$(jq -r ".metrics.\"${name}\".value // 0" "$BASELINE_FILE")
  base_mad=$(jq -r ".metrics.\"${name}\".mad // 0" "$BASELINE_FILE")
  if awk -v b="$base_val" 'BEGIN{exit !(b == 0)}'; then
    proof_skip "$CASE_ID" "${name}: baseline missing or zero" \
      "actual=${actual}; baseline.json has no value to compare"
    return
  fi
  # threshold = max(3*MAD, PERCENT_FLOOR * value). MAD alone would
  # give near-zero tolerance for low-variance metrics (RSS, sub-ms
  # latency); the percent floor absorbs inter-run wobble that
  # within-run sampling can't see (cold queryd / fresh GC / disk
  # cache priming on bootstrap). The floor is PERCENT_FLOOR (0.75);
  # a stale comment here previously said 50%.
  local threshold
  threshold=$(awk -v m="$base_mad" -v v="$base_val" -v pf="$PERCENT_FLOOR" \
    'BEGIN { tm = m * 3; pfv = v * pf; print (tm > pfv ? tm : pfv) }')
  # Signed percent delta for the human-readable detail string.
  # (A previously computed absolute delta was dead code — removed.)
  local pct
  pct=$(awk -v a="$actual" -v b="$base_val" \
    'BEGIN { if (b == 0) { print "n/a"; exit } printf "%.1f", (a - b) * 100.0 / b }')
  local detail="actual=${actual} baseline=${base_val} mad=${base_mad} threshold=${threshold} delta_pct=${pct}%"
  local regression=0
  if [ "$direction" = "higher_is_better" ]; then
    # Throughput: worse if actual is MORE than threshold below baseline.
    if awk -v a="$actual" -v b="$base_val" -v t="$threshold" \
      'BEGIN{exit !(b - a > t)}'; then
      regression=1
    fi
  else
    # Latency / RSS: worse if actual is MORE than threshold above baseline.
    if awk -v a="$actual" -v b="$base_val" -v t="$threshold" \
      'BEGIN{exit !(a - b > t)}'; then
      regression=1
    fi
  fi
  if [ "$regression" = "1" ]; then
    proof_skip "$CASE_ID" "REGRESSION: ${name}" "$detail"
  else
    local floor_pct
    floor_pct=$(awk -v pf="$PERCENT_FLOOR" 'BEGIN{printf "%.0f", pf*100}')
    _proof_record "$CASE_ID" "${name}: within max(3*MAD, ${floor_pct}%) of baseline" \
      pass "noise-floor-bounded" "$actual" "$detail"
  fi
}

# First run (or explicit regeneration) writes the baseline and skips
# comparison; every other run diffs the six tracked metrics.
if [ ! -f "$BASELINE_FILE" ] || [ "${PROOF_REGENERATE_BASELINE:-0}" = "1" ]; then
  write_baseline
  proof_skip "$CASE_ID" "baseline.json regenerated — re-run to verify regressions" \
    "wrote ${BASELINE_FILE} from this run; comparison skipped this turn"
else
  diff_metric "ingest_rows_per_sec" "${ingest_rps_median:-0}" "higher_is_better"
  diff_metric "query_ms" "${query_median:-0}" "lower_is_better"
  diff_metric "vectors_per_sec_add" "${vec_vps_median:-0}" "higher_is_better"
  diff_metric "search_ms" "${search_median:-0}" "lower_is_better"
  diff_metric "rss_vectord_mb" "${rss_now[vectord]:-0}" "lower_is_better"
  diff_metric "rss_queryd_mb" "${rss_now[queryd]:-0}" "lower_is_better"
fi