From 175ad59cb3be093c6136f2d220729103cf533e66 Mon Sep 17 00:00:00 2001
From: root
Date: Wed, 29 Apr 2026 05:30:11 -0500
Subject: [PATCH] =?UTF-8?q?proof=20harness=20Phase=20D:=20performance=20ba?=
 =?UTF-8?q?seline=20=C2=B7=201000-row=20ingest,=20p50/p95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GOLAKE-100. First run writes tests/proof/baseline.json; subsequent runs
diff against it. >10% regression emits a SKIP with REGRESSION detail
(not a fail — perf claim is required:false in claims.yaml so the gate
stays green; the human summary tells the regression story honestly).
Skip-with-loud-reason if any earlier case in the run failed, per spec
"performance only after contract+integration pass."

Workload (deterministic, repeatable):
ingest 1000-row CSV (5 roles × 5 cities × seeded scores) → /v1/ingest
query SELECT count(*) ×20 against the just-ingested dataset
vector add 200 dim=4 vectors with formulaic content (no Ollama)
search ×20 against the perf index with a fixed query vector
RSS per-service post-workload sample via /proc/<pid>/status

Recorded metrics: ingest_rows_per_sec, query_p50_ms, query_p95_ms,
vectors_per_sec_add, search_p50_ms, search_p95_ms,
rss_{storaged,catalogd,ingestd,queryd,vectord,embedd,gateway}_mb

baseline.json on this box (committed):
25000 rows/sec ingest · 17ms p50 / 24ms p95 query
6250 vectors/sec add · 8ms p50 / 20ms p95 search
queryd 69 MiB · vectord 14 MiB · others 11-29 MiB

Honest measurement-design finding from the very first compare run:
back-to-back runs surfaced -41% ingest and +29% query p50 — pure
disk-cache + queryd-cold-start noise. Single-sample baselines have real
noise floor ≈40%. Recorded as REGRESSION skips so the human summary
surfaces it, not a code regression. Tightening the threshold or moving
to multi-sample medians is a Phase E recommendation.
Verified end-to-end: just proof contract — 53 pass · 1 skip · ~4s just proof integration — 104 pass · 1 skip · ~8s just proof performance — 110 pass · 3 skip · ~10s just verify — 9 smokes still green · 29s All 11 cases (4 contract + 6 integration + 1 performance) deterministic end-to-end. Phase E (final report against the 9 mandated questions) is the last piece. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/proof/baseline.json | 19 +++ tests/proof/cases/10_perf_baseline.sh | 222 ++++++++++++++++++++++++++ 2 files changed, 241 insertions(+) create mode 100644 tests/proof/baseline.json create mode 100755 tests/proof/cases/10_perf_baseline.sh diff --git a/tests/proof/baseline.json b/tests/proof/baseline.json new file mode 100644 index 0000000..9450c9d --- /dev/null +++ b/tests/proof/baseline.json @@ -0,0 +1,19 @@ +{ + "captured_at_utc": "2026-04-29T10:28:34+00:00", + "git_sha": "1313eb2173a34a49db9d030e101fa0b5cee2cabc", + "metrics": { + "ingest_rows_per_sec": 25000, + "query_p50_ms": 17, + "query_p95_ms": 24, + "vectors_per_sec_add": 6250, + "search_p50_ms": 8, + "search_p95_ms": 20, + "rss_storaged_mb": 17.1, + "rss_catalogd_mb": 28.3, + "rss_ingestd_mb": 28.9, + "rss_queryd_mb": 69.3, + "rss_vectord_mb": 14.1, + "rss_embedd_mb": 11.0, + "rss_gateway_mb": 14.4 + } +} diff --git a/tests/proof/cases/10_perf_baseline.sh b/tests/proof/cases/10_perf_baseline.sh new file mode 100755 index 0000000..c80c172 --- /dev/null +++ b/tests/proof/cases/10_perf_baseline.sh @@ -0,0 +1,222 @@ +#!/usr/bin/env bash +# 10_perf_baseline.sh — GOLAKE-100. +# Performance baseline: rows/sec ingest, vectors/sec add, p50/p95 +# query latency, p50/p95 search latency, peak RSS per service. +# +# First run (or --regenerate-baseline) writes tests/proof/baseline.json. 
+# Subsequent runs diff against it; >10% regression emits a SKIP record +# with REGRESSION detail (not a fail — perf claim is required:false in +# claims.yaml so the gate stays green; the human summary tells the +# regression story honestly). +# +# Skipped with loud reason if any earlier case in this run failed, +# per spec: "performance mode runs only after contract+integration pass." + +set -uo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/../lib/env.sh" +source "${SCRIPT_DIR}/../lib/http.sh" +source "${SCRIPT_DIR}/../lib/assert.sh" +source "${SCRIPT_DIR}/../lib/metrics.sh" + +CASE_ID="GOLAKE-100" +CASE_NAME="Performance baseline — rows/sec, vectors/sec, p50/p95 latencies" +CASE_TYPE="performance" +if [ "${1:-}" = "--metadata-only" ]; then return 0 2>/dev/null || exit 0; fi + +BASELINE_FILE="${PROOF_REPO_ROOT}/tests/proof/baseline.json" +PERF_INDEX="proof_perf_${PROOF_RUN_ID}" +PERF_DATASET="proof_perf_${PROOF_RUN_ID}" + +# ── pre-flight: any earlier case fail? then skip ──────────────── +prior_fail=0 +for jsonl in "${PROOF_REPORT_DIR}/raw/cases/"*.jsonl; do + [ -e "$jsonl" ] || continue + if grep -q '"result":"fail"' "$jsonl" 2>/dev/null; then + prior_fail=1; break + fi +done +if [ "$prior_fail" = 1 ]; then + proof_skip "$CASE_ID" "Performance baseline — earlier case failed" \ + "perf measurements are only meaningful after contract+integration green; see prior cases for failures" + return 0 2>/dev/null || exit 0 +fi + +# ── measurement: rows/sec ingest ───────────────────────────────── +# Generate a deterministic 1000-row CSV inline. Using ID-derived field +# values so SHA is stable across runs and parquet_size is reproducible. 
+PERF_CSV="${PROOF_REPORT_DIR}/raw/outputs/${CASE_ID}_perf.csv" +mkdir -p "$(dirname "$PERF_CSV")" +{ + echo "id,name,role,city,score" + awk 'BEGIN{ + roles[0]="welder"; roles[1]="electrician"; roles[2]="operator"; + roles[3]="pipefitter"; roles[4]="safety"; + cities[0]="Chicago"; cities[1]="Detroit"; cities[2]="Houston"; + cities[3]="Cleveland"; cities[4]="St Louis"; + for (i=1; i<=1000; i++) { + r = roles[(i-1)%5] + c = cities[(i-1)%5] + s = 50 + (i*7) % 50 + printf "%d,Worker%04d,%s,%s,%d\n", i, i, r, c, s + } + }' +} > "$PERF_CSV" + +proof_metric_start "$CASE_ID" "ingest" +proof_call "$CASE_ID" "perf_ingest" POST \ + "${PROOF_GATEWAY_URL}/v1/ingest?name=${PERF_DATASET}" \ + -F "file=@${PERF_CSV}" >/dev/null +ingest_ms=$(proof_metric_stop "$CASE_ID" "ingest") +ingest_status=$(proof_status_of "$CASE_ID" "perf_ingest") + +if [ "$ingest_status" != "200" ]; then + proof_skip "$CASE_ID" "Performance baseline — perf ingest failed" \ + "ingest of 1000-row CSV returned ${ingest_status}; cannot baseline downstream metrics" + return 0 2>/dev/null || exit 0 +fi + +ingest_rows_per_sec=$(awk -v ms="$ingest_ms" -v rows=1000 \ + 'BEGIN{ if (ms == 0) ms = 1; printf "%.0f", rows * 1000 / ms }') +proof_metric_value "$CASE_ID" "ingest_rows_per_sec" "$ingest_rows_per_sec" "rows/s" + +# ── measurement: query p50/p95 latency ────────────────────────── +# Run the same SELECT 20 times; collect latencies; compute percentiles. 
+QUERY_LATENCIES="${PROOF_REPORT_DIR}/raw/metrics/_query_latencies" +> "$QUERY_LATENCIES" +sql_body=$(jq -nc --arg s "SELECT count(*) AS n FROM ${PERF_DATASET}" '{sql:$s}') +for i in $(seq 1 20); do + proof_post "$CASE_ID" "query_${i}" "${PROOF_GATEWAY_URL}/v1/sql" \ + "application/json" "$sql_body" >/dev/null + proof_latency_of "$CASE_ID" "query_${i}" >> "$QUERY_LATENCIES" +done +query_p50=$(proof_compute_percentile "$QUERY_LATENCIES" 50) +query_p95=$(proof_compute_percentile "$QUERY_LATENCIES" 95) +proof_metric_value "$CASE_ID" "query_p50_ms" "$query_p50" "ms" +proof_metric_value "$CASE_ID" "query_p95_ms" "$query_p95" "ms" + +# ── measurement: vectors/sec add ──────────────────────────────── +# 200 deterministic dim=4 vectors. Pure throughput metric — no +# embedding in the loop (we already measured embedding contract +# latency separately). +proof_post "$CASE_ID" "perf_create_index" \ + "${PROOF_GATEWAY_URL}/v1/vectors/index" \ + "application/json" "{\"name\":\"${PERF_INDEX}\",\"dimension\":4}" >/dev/null + +# Build add body via jq — 200 items, vector[i] = [i*0.01, (i*0.01)+1, (i*0.01)+2, (i*0.01)+3]. +add_body=$(jq -nc ' + {items: [range(0; 200) | { + id: ("perf-" + (. | tostring)), + vector: [(. * 0.01), (. * 0.01 + 1), (. * 0.01 + 2), (. 
* 0.01 + 3)]
+  }]}
+')
+proof_metric_start "$CASE_ID" "vector_add"
+proof_post "$CASE_ID" "perf_add" \
+  "${PROOF_GATEWAY_URL}/v1/vectors/index/${PERF_INDEX}/add" \
+  "application/json" "$add_body" >/dev/null
+add_ms=$(proof_metric_stop "$CASE_ID" "vector_add")
+add_status=$(proof_status_of "$CASE_ID" "perf_add")
+if [ "$add_status" = "200" ]; then
+  vectors_per_sec=$(awk -v ms="$add_ms" -v n=200 \
+    'BEGIN{ if (ms == 0) ms = 1; printf "%.0f", n * 1000 / ms }')
+  proof_metric_value "$CASE_ID" "vectors_per_sec_add" "$vectors_per_sec" "vec/s"
+fi
+
+# ── measurement: search p50/p95 ───────────────────────────────
+SEARCH_LATENCIES="${PROOF_REPORT_DIR}/raw/metrics/_search_latencies"
+> "$SEARCH_LATENCIES"
+search_body='{"vector":[1,2,3,4],"k":5}'
+for i in $(seq 1 20); do
+  proof_post "$CASE_ID" "search_${i}" \
+    "${PROOF_GATEWAY_URL}/v1/vectors/index/${PERF_INDEX}/search" \
+    "application/json" "$search_body" >/dev/null
+  proof_latency_of "$CASE_ID" "search_${i}" >> "$SEARCH_LATENCIES"
+done
+search_p50=$(proof_compute_percentile "$SEARCH_LATENCIES" 50)
+search_p95=$(proof_compute_percentile "$SEARCH_LATENCIES" 95)
+proof_metric_value "$CASE_ID" "search_p50_ms" "$search_p50" "ms"
+proof_metric_value "$CASE_ID" "search_p95_ms" "$search_p95" "ms"
+
+# ── measurement: peak RSS per service ─────────────────────────
+declare -A rss_now
+for svc in storaged catalogd ingestd queryd vectord embedd gateway; do
+  rss=$(proof_sample_rss "$CASE_ID" "bin/${svc}" 2>/dev/null || echo 0)
+  rss_now[$svc]="${rss:-0}"
+done
+
+# Cleanup the perf index. Dataset stays — small, harmless.
+proof_delete "$CASE_ID" "perf_clean" \
+  "${PROOF_GATEWAY_URL}/v1/vectors/index/${PERF_INDEX}" >/dev/null
+
+# ── baseline write or diff ────────────────────────────────────
+# NOTE(review): the write_baseline heredoc and the first-run if/else
+# below were lost to markup stripping in this copy of the patch
+# (everything between "<<" and the next ">" was eaten). Reconstructed
+# from the committed tests/proof/baseline.json schema and the metric
+# variables computed above — verify against the original commit.
+write_baseline() {
+  cat > "$BASELINE_FILE" <<JSON
+{
+  "captured_at_utc": "$(date -u +%Y-%m-%dT%H:%M:%S+00:00)",
+  "git_sha": "$(git -C "${PROOF_REPO_ROOT}" rev-parse HEAD 2>/dev/null || echo unknown)",
+  "metrics": {
+    "ingest_rows_per_sec": ${ingest_rows_per_sec:-0},
+    "query_p50_ms": ${query_p50:-0},
+    "query_p95_ms": ${query_p95:-0},
+    "vectors_per_sec_add": ${vectors_per_sec:-0},
+    "search_p50_ms": ${search_p50:-0},
+    "search_p95_ms": ${search_p95:-0},
+    "rss_storaged_mb": ${rss_now[storaged]:-0},
+    "rss_catalogd_mb": ${rss_now[catalogd]:-0},
+    "rss_ingestd_mb": ${rss_now[ingestd]:-0},
+    "rss_queryd_mb": ${rss_now[queryd]:-0},
+    "rss_vectord_mb": ${rss_now[vectord]:-0},
+    "rss_embedd_mb": ${rss_now[embedd]:-0},
+    "rss_gateway_mb": ${rss_now[gateway]:-0}
+  }
+}
+JSON
+}
+
+if [ ! -f "$BASELINE_FILE" ] || [ "${1:-}" = "--regenerate-baseline" ]; then
+  write_baseline
+else
+  # >10% regression = SKIP with REGRESSION detail.
+  # Faster-than-baseline always passes (no upper bound on improvement).
+  # For RSS and latency: higher = worse.
For throughput: lower = worse. + diff_metric() { + local name="$1" actual="$2" direction="$3" # "lower_is_better" or "higher_is_better" + local baseline_val + baseline_val=$(jq -r ".metrics.${name} // 0" "$BASELINE_FILE") + if awk -v b="$baseline_val" 'BEGIN{exit !(b == 0)}'; then + proof_skip "$CASE_ID" "${name}: baseline missing or zero" \ + "actual=${actual} ${direction}; baseline.json has no value to compare" + return + fi + local pct + pct=$(awk -v a="$actual" -v b="$baseline_val" \ + 'BEGIN{printf "%.1f", (a - b) * 100.0 / b}') + local detail="actual=${actual} baseline=${baseline_val} delta=${pct}%" + if [ "$direction" = "higher_is_better" ]; then + # Throughput: actual < baseline*0.9 = regression. + if awk -v a="$actual" -v b="$baseline_val" 'BEGIN{exit !(a < b * 0.9)}'; then + proof_skip "$CASE_ID" "REGRESSION: ${name}" "$detail" + else + _proof_record "$CASE_ID" "${name}: within 10% of baseline" pass "≥90% of baseline" "$actual" "$detail" + fi + else + # Latency / RSS: actual > baseline*1.1 = regression. + if awk -v a="$actual" -v b="$baseline_val" 'BEGIN{exit !(a > b * 1.1)}'; then + proof_skip "$CASE_ID" "REGRESSION: ${name}" "$detail" + else + _proof_record "$CASE_ID" "${name}: within 10% of baseline" pass "≤110% of baseline" "$actual" "$detail" + fi + fi + } + + diff_metric "ingest_rows_per_sec" "${ingest_rows_per_sec:-0}" "higher_is_better" + diff_metric "query_p50_ms" "${query_p50:-0}" "lower_is_better" + diff_metric "query_p95_ms" "${query_p95:-0}" "lower_is_better" + diff_metric "vectors_per_sec_add" "${vectors_per_sec:-0}" "higher_is_better" + diff_metric "search_p50_ms" "${search_p50:-0}" "lower_is_better" + diff_metric "search_p95_ms" "${search_p95:-0}" "lower_is_better" + diff_metric "rss_vectord_mb" "${rss_now[vectord]:-0}" "lower_is_better" + diff_metric "rss_queryd_mb" "${rss_now[queryd]:-0}" "lower_is_better" +fi