lakehouse/scripts/production_smoke.sh

#!/usr/bin/env bash
# Production substrate smoke — single command that verifies every
# production-critical surface end-to-end. Exits non-zero on the first
# failure so an operator can run this before:
#   - Swapping workers_500k.parquet → real Chicago contractor data
#   - Spinning up the Asterisk voice agent against /v1/chat
#   - Running staffing inference loops via /v1/iterate
#   - Wiring the assistant against the gateway
#
# Usage:
#   ./scripts/production_smoke.sh
#
# Tunable via env:
#   GATEWAY=http://localhost:3100   # gateway base URL
#   FAIL_FAST=1                     # exit on first failure (default 1)
#   VERBOSE=1                       # print full responses on success too

# Abort the run as soon as an unguarded command fails.
set -o errexit

# Tunables: keep a non-empty value supplied by the caller, otherwise fall
# back to the documented default.
: "${GATEWAY:=http://localhost:3100}"
: "${FAIL_FAST:=1}"
: "${VERBOSE:=0}"

# Running tallies, updated as each check completes and read by the summary.
PASS=0 FAIL=0
declare -a FAILURES=()

# Run one smoke check and record its outcome.
#
# Arguments:
#   $1 - human-readable check name (progress line and failure summary)
#   $2 - expected status, compared as text with whatever precedes the first
#        "|||" in the command output
#   $3 - command string, executed through eval; expected to print
#        "status|||body" (the record format curl_with_status produces)
# Globals:
#   PASS, FAIL, FAILURES are updated; FAIL_FAST and VERBOSE are read.
# Returns:
#   Always 0. A failed check is reported through FAIL/FAILURES, never through
#   the return status: the script runs under `set -e`, so a non-zero return
#   would abort the whole run at the call site even when FAIL_FAST is off.
#   With FAIL_FAST=1 a failed check prints the summary and exits 1.
check() {
    local name="$1"
    local expected_status="$2"
    local cmd="$3"
    printf '  [%s] %s ... ' "$((PASS + FAIL + 1))" "$name"
    local resp
    # eval is deliberate: callers hand over a quoted command line whose
    # variables and escapes are resolved here. Only pass trusted,
    # script-authored strings. A failing command is tolerated so that the
    # mismatch is reported by the status comparison below.
    resp=$(eval "$cmd" 2>&1) || true
    # Without a "|||" separator both expansions yield the full output.
    local status="${resp%%|||*}"
    local body="${resp#*|||}"
    if [ "$status" = "$expected_status" ]; then
        PASS=$((PASS + 1))
        echo "✓ ($status)"
        if [ "$VERBOSE" = "1" ]; then
            printf '      %s\n' "$body" | head -3 | sed 's/^/      /'
        fi
    else
        FAIL=$((FAIL + 1))
        FAILURES+=("$name: expected $expected_status, got $status")
        echo "✗ (got $status, expected $expected_status)"
        printf '      %s\n' "$body" | head -3 | sed 's/^/      /'
        if [ "$FAIL_FAST" = "1" ]; then
            print_summary
            exit 1
        fi
    fi
    return 0
}

curl_with_status() {
    # Invoke curl with the caller's arguments and reshape its output into a
    # single "status|||body" record. The -w format appends the HTTP code on
    # a line of its own after the body, and stderr is merged into the same
    # stream. awk holds back one line at a time: every line except the last
    # is joined (newline-separated) into the body, and the last line is
    # printed in front of the separator as the status.
    curl -sS -w "\n%{http_code}" "$@" 2>&1 | awk '
        NR > 1 { body = body sep held; sep = "\n" }
        { held = $0 }
        END { print held "|||" body }
    '
}

# Print the pass/fail totals and, when any were recorded, one line per
# failure, framed by horizontal rules.
#
# Globals: PASS, FAIL, FAILURES (read only).
# Outputs: the summary on stdout.
print_summary() {
    local rule="═══════════════════════════════════════════════════════════════"
    echo ""
    echo "$rule"
    echo "  $PASS passed · $FAIL failed"
    if [ "${#FAILURES[@]}" -gt 0 ]; then
        echo "  failures:"
        # printf repeats its format for every array element, so no loop
        # variable is needed and none can leak into the caller's scope.
        printf '    - %s\n' "${FAILURES[@]}"
    fi
    echo "$rule"
}

# Banner: name the gateway under test, followed by a blank line.
printf 'Production substrate smoke test against %s\n\n' "$GATEWAY"

# ─── 1. Liveness ─────────────────────────────────────────────────────
# First probe: GET /health with a 5-second curl timeout, expecting 200.
printf '%s\n' "▶ Liveness"
check "gateway /health" "200" 'curl_with_status -m 5 "$GATEWAY/health"'

# ─── 2. Operational health ──────────────────────────────────────────
# Fetch /v1/health once and pull two figures out of its JSON: workers_count
# and the number of truthy entries in providers_configured. A failed request
# or unparsable body degrades to 0 for both rather than aborting the script.
echo "▶ Operational state"
HEALTH_RESP=$(curl -sS -m 10 "$GATEWAY/v1/health" 2>&1) || HEALTH_RESP="{}"
WORKERS_COUNT=$(printf '%s\n' "$HEALTH_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('workers_count',0))" 2>/dev/null || echo 0)
PROVIDERS_OK=$(printf '%s\n' "$HEALTH_RESP" | python3 -c "import sys,json; d=json.load(sys.stdin).get('providers_configured',{}); print(sum(1 for v in d.values() if v))" 2>/dev/null || echo 0)
# A null, string or float workers_count prints as something that is not a
# plain integer (e.g. "None"). The numeric test below would then error out,
# and an erroring test lands in the else branch, i.e. a false "workers
# loaded". Treat anything that is not all digits as 0.
case "$WORKERS_COUNT" in
    '' | *[!0-9]*) WORKERS_COUNT=0 ;;
esac
echo "  workers_count: $WORKERS_COUNT"
echo "  providers_configured (count): $PROVIDERS_OK"
if [ "$WORKERS_COUNT" -lt 1 ]; then
    FAIL=$((FAIL + 1))
    FAILURES+=("workers_count=0 — parquet load failed or empty")
    echo "  ✗ workers not loaded"
    if [ "$FAIL_FAST" = "1" ]; then
        print_summary
        exit 1
    fi
else
    PASS=$((PASS + 1))
    echo "  ✓ workers loaded"
fi

# NOTE on the command strings in this section: each one is single-quoted, so
# $GATEWAY and the backslash-escaped quotes are not expanded here. They are
# resolved when check() runs the string through eval. check() compares only
# the HTTP status; response bodies are printed on failure (or with VERBOSE=1)
# but never inspected. The -m value is curl's per-request time limit in
# seconds.

# ─── 3. Truth Layer ──────────────────────────────────────────────────
# Only the 200 status is asserted; the check name says "returns rules", but
# the body content is not verified here.
echo "▶ Truth Layer"
check "/v1/context returns rules" "200" \
    'curl_with_status -m 10 "$GATEWAY/v1/context"'

# ─── 4. /v1/chat (provider=ollama) ──────────────────────────────────
# Small chat request (max_tokens 30, temperature 0) with a 60-second limit.
echo "▶ /v1/chat (provider=ollama, fast model)"
check "/v1/chat ping" "200" \
    'curl_with_status -m 60 -X POST "$GATEWAY/v1/chat" \
        -H "content-type: application/json" \
        -d "{\"provider\":\"ollama\",\"model\":\"qwen3.5:latest\",\"messages\":[{\"role\":\"user\",\"content\":\"reply: PONG\"}],\"max_tokens\":30,\"temperature\":0,\"think\":false}"'

# ─── 5. /v1/validate (negative + positive) ──────────────────────────
# Three probes of the same endpoint: two payloads expected to be rejected
# with 422 and one expected to be accepted with 200.
echo "▶ /v1/validate"
# Negative case: a candidate_id (W-FAKE-0) that is expected not to exist.
check "phantom candidate_id → 422 Consistency" "422" \
    'curl_with_status -m 10 -X POST "$GATEWAY/v1/validate" \
        -H "content-type: application/json" \
        -d "{\"kind\":\"fill\",\"artifact\":{\"fills\":[{\"candidate_id\":\"W-FAKE-0\",\"name\":\"Fake\"}]},\"context\":{\"target_count\":1}}"'

# Positive case: presumably relies on worker W-1 being present in the loaded
# dataset (TODO confirm this still holds after swapping in real data).
check "real worker (W-1) → 200 OK" "200" \
    'curl_with_status -m 10 -X POST "$GATEWAY/v1/validate" \
        -H "content-type: application/json" \
        -d "{\"kind\":\"fill\",\"artifact\":{\"fills\":[{\"candidate_id\":\"W-1\",\"name\":\"Anyone\"}]},\"context\":{\"target_count\":1}}"'

# Negative case: an email artifact whose body contains an SSN-shaped string.
check "SSN in body → 422 Policy" "422" \
    'curl_with_status -m 10 -X POST "$GATEWAY/v1/validate" \
        -H "content-type: application/json" \
        -d "{\"kind\":\"email\",\"artifact\":{\"to\":\"a@b.com\",\"body\":\"Your SSN 123-45-6789 is on file.\"}}"'

# ─── 6. /v1/iterate (bounded retry loop) ───────────────────────────
# Phantom worker → expect 422 IterateFailure with history (not 200)
# The request sets max_iterations to 1 and allows up to 240 seconds. The
# nested \\\" sequences pass through two unescaping steps (the shell at eval
# time, then JSON) so that the "system" value carries a JSON snippet.
echo "▶ /v1/iterate (bounded retry)"
check "/v1/iterate phantom → bounded fail" "422" \
    'curl_with_status -m 240 -X POST "$GATEWAY/v1/iterate" \
        -H "content-type: application/json" \
        -d "{\"kind\":\"fill\",\"provider\":\"ollama\",\"model\":\"qwen3.5:latest\",\"system\":\"Reply with ONLY: {\\\"fills\\\":[{\\\"candidate_id\\\":\\\"W-99999999\\\",\\\"name\\\":\\\"X\\\"}]}\",\"prompt\":\"emit it\",\"context\":{\"target_count\":1},\"max_iterations\":1,\"max_tokens\":200,\"temperature\":0}"'

# ─── 7. Doc-drift batch ─────────────────────────────────────────────
# POST with no request body; only the 200 status is asserted.
echo "▶ Doc-drift scan"
check "/vectors/playbook_memory/doc_drift/scan" "200" \
    'curl_with_status -m 60 -X POST "$GATEWAY/vectors/playbook_memory/doc_drift/scan"'

# ─── 8. Usage tracking ──────────────────────────────────────────────
# Read the request counter from /v1/usage; it should be at least 1 once the
# earlier /v1/chat check has fired.
echo "▶ Usage tracking"
# The script runs under `set -e`: a failing curl in a bare assignment would
# end the run right here, silently (stderr is captured into the variable)
# and before the summary prints. Fall back to empty JSON so the failure is
# counted and reported like any other.
USAGE=$(curl -sS -m 10 "$GATEWAY/v1/usage" 2>&1) || USAGE="{}"
USAGE_REQS=$(printf '%s\n' "$USAGE" | python3 -c "import sys,json; print(json.load(sys.stdin).get('requests',0))" 2>/dev/null || echo 0)
# Anything that is not a plain non-negative integer (null → "None", strings,
# floats) counts as 0 so the numeric comparison below cannot error out.
case "$USAGE_REQS" in
    '' | *[!0-9]*) USAGE_REQS=0 ;;
esac
echo "  usage.requests: $USAGE_REQS (should be > 0 if /v1/chat fired)"
if [ "$USAGE_REQS" -ge 1 ]; then
    PASS=$((PASS + 1))
    echo "  ✓ /v1/usage tracking"
else
    FAIL=$((FAIL + 1))
    FAILURES+=("/v1/usage didn't increment after /v1/chat call")
    echo "  ✗ /v1/usage didn't increment"
fi

print_summary

# Exit status mirrors the tally: 0 only when no check failed.
if [ "$FAIL" -eq 0 ]; then
    exit 0
fi
exit 1