#!/usr/bin/env bash
# 06_vector_add_search.sh — GOLAKE-060 + GOLAKE-061.
#
# Contract-layer round trip for vector add + search. Works on synthetic
# dim=4 unit basis vectors, so nothing in this tier depends on embedd.
#
#   GOLAKE-060: add succeeds and the index GET reflects the inserted
#               vectors (verified through the reported length)
#   GOLAKE-061: nearest-neighbor search — an inserted vector ranks #1
#               when it is used as the query

# NOTE(review): -e is intentionally not set here; presumably so a failed
# step does not abort the remaining assertions — confirm.
set -uo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/../lib/env.sh"
source "${SCRIPT_DIR}/../lib/http.sh"
source "${SCRIPT_DIR}/../lib/assert.sh"

CASE_ID="GOLAKE-060-061"
CASE_NAME="Vector add + lookup + nearest-neighbor"
CASE_TYPE="contract"

# Metadata-only invocation stops right after the CASE_* declarations:
# `return` succeeds when the file is sourced, `exit` covers direct runs.
if [[ "${1:-}" == "--metadata-only" ]]; then
  return 0 2>/dev/null || exit 0
fi

INDEX_NAME="proof_contract_idx"
index_root="${PROOF_GATEWAY_URL}/v1/vectors/index"

# Idempotent prelude — drop whatever a previous run left behind.
# The status is not asserted; a 404 here is fine.
proof_delete "$CASE_ID" "pre_clean" "${index_root}/${INDEX_NAME}" >/dev/null

# Create the index. vectord answers 201.
create_payload="{\"name\":\"${INDEX_NAME}\",\"dimension\":4}"
proof_post "$CASE_ID" "create_index" "$index_root" \
  "application/json" "$create_payload" >/dev/null
create_status="$(proof_status_of "$CASE_ID" "create_index")"
proof_assert_eq "$CASE_ID" "create index → 201" "201" "$create_status"

# Insert three deterministic vectors. They are unit basis vectors, which
# makes recall unambiguous: a query of [1,0,0,0] must rank v1 first.
# vectord expects {"items": [...]}, NOT {"vectors": [...]}.
add_body='{"items":[ {"id":"v1","vector":[1,0,0,0]}, {"id":"v2","vector":[0,1,0,0]}, {"id":"v3","vector":[0,0,1,0]} ]}'
proof_post "$CASE_ID" "add_vectors" "${index_root}/${INDEX_NAME}/add" \
  "application/json" "$add_body" >/dev/null
add_status="$(proof_status_of "$CASE_ID" "add_vectors")"
proof_assert_eq "$CASE_ID" "add vectors → 200" "200" "$add_status"

# Lookup-by-id (GOLAKE-060 evidence). The /index/{name} GET returns
# {"params": {...}, "length": N}.
contract_index_url="${PROOF_GATEWAY_URL}/v1/vectors/index/${INDEX_NAME}"
contract_raw_dir="${PROOF_REPORT_DIR}/raw/http/${CASE_ID}"

proof_get "$CASE_ID" "get_index" "$contract_index_url" >/dev/null
get_status="$(proof_status_of "$CASE_ID" "get_index")"
proof_assert_eq "$CASE_ID" "get index → 200" "200" "$get_status"

# The recorded response body must report all three inserted vectors.
length="$(jq -r '.length' "${contract_raw_dir}/get_index.body")"
proof_assert_eq "$CASE_ID" "index length = 3 after add" "3" "$length"

# Search — the query equals v1, so v1 is expected at rank 1 with a
# distance of approximately 0.
search_body='{"vector":[1,0,0,0],"k":3}'
proof_post "$CASE_ID" "search" "${contract_index_url}/search" \
  "application/json" "$search_body" >/dev/null
search_status="$(proof_status_of "$CASE_ID" "search")"
proof_assert_eq "$CASE_ID" "search → 200" "200" "$search_status"

# Search response shape: {"results": [{"id","distance","metadata?"}]}.
search_body_path="${contract_raw_dir}/search.body"
top1_id="$(jq -r '.results[0].id' "$search_body_path")"
top1_dist="$(jq -r '.results[0].distance' "$search_body_path")"
proof_assert_eq "$CASE_ID" "top-1 id = v1 (self-recall)" "v1" "$top1_id"
proof_assert_lt "$CASE_ID" "top-1 distance < 0.001 (cosine self ≈ 0)" \
  "$top1_dist" "0.001"

# Cleanup — vectord replies 204 No Content when a delete succeeds;
# 200 is accepted as well.
proof_delete "$CASE_ID" "post_clean" "$contract_index_url" >/dev/null
proof_assert_status_in "$CASE_ID" "delete index → 200 or 204" "200 204" "post_clean"

# ── integration tier — text → embed → add → search top-K ──────────
# Contract mode stops here; the full pipeline below only runs when
# integration or performance is the active mode.
if [[ "$PROOF_MODE" == "contract" ]]; then
  return 0 2>/dev/null || exit 0
fi

# Switch CASE_ID for the integration claim — assertions land under
# GOLAKE-051 in their own JSONL so the per-case-id table tracks them
# distinctly from the contract claims above.
CASE_ID="GOLAKE-051"

# Fixture inputs and a per-run index name for the integration tier.
DOCS_FILE="${PROOF_REPO_ROOT}/tests/proof/fixtures/text/docs.txt"
RANKINGS_FILE="${PROOF_REPO_ROOT}/tests/proof/fixtures/expected/rankings.json"
SEM_INDEX="proof_sem_${PROOF_RUN_ID}"

# Pre-flight: skip the integration block cleanly if Ollama is down so
# we don't get a wall of "502" failures and so spec rule "skipped !=
# passed" stays honest.
proof_post "$CASE_ID" "embed_health" "${PROOF_GATEWAY_URL}/v1/embed" \
  "application/json" '{"texts":["health probe"]}' >/dev/null
embed_status=$(proof_status_of "$CASE_ID" "embed_health")
if [ "$embed_status" != "200" ]; then
  proof_skip "$CASE_ID" "Embedding integration — Ollama unreachable" \
    "POST /v1/embed returned ${embed_status}; cannot exercise top-K ranking"
  return 0 2>/dev/null || exit 0
fi

# Load the docs fixture: one "id<TAB>text" row per line; rows with an
# empty id are ignored. `|| [ -n "$id" ]` keeps a final row that has no
# trailing newline — `read` returns non-zero there but still fills the
# variables, so without it that row would be silently dropped.
ids=()
texts=()
if [ -r "$DOCS_FILE" ]; then
  while IFS=$'\t' read -r id text || [ -n "$id" ]; do
    [ -z "$id" ] && continue
    ids+=("$id")
    texts+=("$text")
  done < "$DOCS_FILE"
fi

# With no rows there is nothing to rank. Bail out the same way as the
# Ollama pre-flight instead of embedding a bogus empty document (and
# instead of tripping `set -u` on an empty array in older bash).
if [ "${#ids[@]}" -eq 0 ]; then
  proof_skip "$CASE_ID" "Embedding integration — docs fixture missing or empty" \
    "no id<TAB>text rows could be read from ${DOCS_FILE}"
  return 0 2>/dev/null || exit 0
fi

# Embed every loaded doc in one batch — single round trip. The
# assertion labels below assume the fixture holds 4 docs.
texts_json=$(printf '%s\n' "${texts[@]}" | jq -R . | jq -s .)
embed_body=$(jq -nc --argjson texts "$texts_json" '{texts:$texts}')
proof_post "$CASE_ID" "embed_docs" "${PROOF_GATEWAY_URL}/v1/embed" \
  "application/json" "$embed_body" >/dev/null
embed_resp="${PROOF_REPORT_DIR}/raw/http/${CASE_ID}/embed_docs.body"
proof_assert_eq "$CASE_ID" "embed 4 docs → 200" "200" \
  "$(proof_status_of "$CASE_ID" "embed_docs")"

# Create the dim=768 index.
# NOTE(review): 768 is presumably the embedding model's vector width —
# confirm it matches what /v1/embed returns.
proof_post "$CASE_ID" "sem_create" "${PROOF_GATEWAY_URL}/v1/vectors/index" \
  "application/json" "{\"name\":\"${SEM_INDEX}\",\"dimension\":768}" >/dev/null
proof_assert_eq "$CASE_ID" "create dim=768 index → 201" "201" \
  "$(proof_status_of "$CASE_ID" "sem_create")"

# JSON array of the doc ids, in fixture order; the add body pairs
# ids[i] with vectors[i] from the embed response.
ids_json=$(printf '%s\n' "${ids[@]}" | jq -R . | jq -s .)
# Build the add body: pair ids[i] with vectors[i] from the recorded
# embed response and wrap the list as {"items": [...]}.
add_body=$(jq -nc --argjson ids "$ids_json" --slurpfile e "$embed_resp" '
  [range(0; ($ids | length)) | {id: $ids[.], vector: $e[0].vectors[.]}]
  | {items: .}
')
proof_post "$CASE_ID" "sem_add" \
  "${PROOF_GATEWAY_URL}/v1/vectors/index/${SEM_INDEX}/add" \
  "application/json" "$add_body" >/dev/null
proof_assert_eq "$CASE_ID" "add 4 docs to index → 200" "200" \
  "$(proof_status_of "$CASE_ID" "sem_add")"

# Test queries. Each must return its corresponding doc as top-1.
# query_keys[i] names the query; query_texts[i] is the text embedded.
declare -a query_keys=("welder_chicago" "warehouse_safety" "detroit_electrical" "houston_pipefitter")
declare -a query_texts=(
  "welder needed in Chicago"
  "warehouse safety crew"
  "Detroit electrical contractor"
  "Houston pipefitter"
)

# Capture top-1 per query: embed the query text, search the semantic
# index with k=1, and remember the id of the first result.
declare -A actual_top1
for i in "${!query_keys[@]}"; do
  key="${query_keys[$i]}"
  query="${query_texts[$i]}"

  qbody=$(jq -nc --arg q "$query" '{texts:[$q]}')
  proof_post "$CASE_ID" "embed_q_${key}" "${PROOF_GATEWAY_URL}/v1/embed" \
    "application/json" "$qbody" >/dev/null
  qvec=$(jq -c '.vectors[0]' \
    "${PROOF_REPORT_DIR}/raw/http/${CASE_ID}/embed_q_${key}.body")

  sbody=$(jq -nc --argjson v "$qvec" '{vector:$v,k:1}')
  proof_post "$CASE_ID" "search_${key}" \
    "${PROOF_GATEWAY_URL}/v1/vectors/index/${SEM_INDEX}/search" \
    "application/json" "$sbody" >/dev/null
  top1=$(jq -r '.results[0].id' \
    "${PROOF_REPORT_DIR}/raw/http/${CASE_ID}/search_${key}.body")
  actual_top1[$key]="$top1"
done

# Assert against stored rankings — or (re)write the fixture when it does
# not exist yet or PROOF_REGENERATE_RANKINGS=1.
# NOTE(review): the original comment mentions an explicit
# --regenerate-rankings flag; presumably the runner maps that flag to
# this environment variable — confirm.
need_regen=0
[ ! -f "$RANKINGS_FILE" ] && need_regen=1
[ "${PROOF_REGENERATE_RANKINGS:-0}" = "1" ] && need_regen=1

if [ "$need_regen" = "1" ]; then
  # Build {query_key: top1_id, ...} with jq so ids are JSON-escaped
  # properly. A query with no usable top-1 (empty output, or the literal
  # "null" that `jq -r` prints for a missing field) aborts regeneration:
  # writing it would bake a failed search into the expected values.
  regen_problem=""
  rankings_json='{}'
  for k in "${query_keys[@]}"; do
    regen_id="${actual_top1[$k]}"
    if [ -z "$regen_id" ] || [ "$regen_id" = "null" ]; then
      regen_problem="query '${k}' returned no top-1 id"
      break
    fi
    if ! rankings_json=$(jq -c --arg k "$k" --arg v "$regen_id" \
      '. + {($k): $v}' <<<"$rankings_json"); then
      regen_problem="jq could not encode the entry for query '${k}'"
      break
    fi
  done

  # Write to a sibling temp file and rename it into place, so a failed
  # write never leaves a truncated fixture for later runs to trust.
  if [ -z "$regen_problem" ]; then
    rankings_tmp="${RANKINGS_FILE}.tmp.$$"
    if mkdir -p "$(dirname "$RANKINGS_FILE")" \
      && jq . <<<"$rankings_json" >"$rankings_tmp" \
      && mv -f "$rankings_tmp" "$RANKINGS_FILE"; then
      :
    else
      rm -f "$rankings_tmp"
      regen_problem="could not write ${RANKINGS_FILE}"
    fi
  fi

  if [ -z "$regen_problem" ]; then
    proof_skip "$CASE_ID" "rankings fixture regenerated — re-run to verify" \
      "wrote ${RANKINGS_FILE} from this run; assertions skipped this turn"
  else
    # Deliberately mismatching values: records a failure with the reason
    # instead of claiming that a fixture was written.
    proof_assert_eq "$CASE_ID" "rankings fixture regenerated" \
      "written" "not written: ${regen_problem}"
  fi
else
  for k in "${query_keys[@]}"; do
    # Pass the key as data rather than splicing it into the jq program.
    expected=$(jq -r --arg k "$k" '.[$k]' "$RANKINGS_FILE")
    proof_assert_eq "$CASE_ID" "top-1 for query '${k}' matches fixture" \
      "$expected" "${actual_top1[$k]}"
  done
fi

# Cleanup the semantic index. The delete status is not asserted.
proof_delete "$CASE_ID" "sem_clean" \
  "${PROOF_GATEWAY_URL}/v1/vectors/index/${SEM_INDEX}" >/dev/null