golangLAKEHOUSE/scripts/cutover/start_go_stack.sh
root 0e530f4436 drift fix: validatord in start_go_stack + parity refresh
Two anchor-vs-reality drifts found during /read-mem audit:

1. start_go_stack.sh never started validatord :3221, even though
   it shipped 2026-05-02 (f9e7241) and STATE_OF_PLAY claims it as
   part of the persistent stack. Cold-boot quietly omitted it,
   leaving /v1/iterate unreachable on the persistent gateway.
   Fix: factored chatd's conditional-start block into a start_shared
   helper, called for both chatd :3220 and validatord :3221. Same
   shared-with-smokes posture as chatd (no S3 / JSONL-only state,
   no temp-toml override needed).

2. STATE_OF_PLAY header claimed 3 parity probes / 32 assertions.
   Reality is 6 probes / 38 assertions since subject_audit landed
   in 262a77a (2026-05-03). Header refreshed; cross-references
   the three runtime-divergence classes documented at
   lakehouse/STATE_OF_PLAY.md lines 36-39.

Parity reports regenerated as verification artifact (all 6 still
green: 8+12+2+4+1+6). Same pattern as c0a55b1.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 03:27:47 -05:00

182 lines
7.5 KiB
Bash
Executable File

#!/usr/bin/env bash
# scripts/cutover/start_go_stack.sh
#
# Bring up the full Go stack persistently — alongside the live Rust
# gateway on :3100 + alongside the harness-transient stacks the
# smokes spin up. Persistent Go daemons land on their own port range
# :4110 + :4211-:4219 (no collision with Rust on :3100 or with the
# smoke harness on :3110 + :3211-:3219); chatd :3220 and validatord
# :3221 are shared with the smokes. Persistent
# daemons run under DIFFERENT BINARY NAMES (bin/persistent-*) and
# write to a SEPARATE MinIO BUCKET (lakehouse-go-persistent) so the
# pre-push smoke chain — which uses anchored `pkill -f "bin/(name)$"`
# teardown + reads from `lakehouse-go-primary` — can run without
# tearing down or polluting our long-running state.
#
# Three isolation layers (the third was added 2026-05-01 after the
# first push test exposed a port-collision bug — smoke catalogd
# failed to bind :3212 because persistent catalogd already had it,
# but smoke's poll_health 3212 succeeded responding to the
# persistent daemon, and the smoke happily proceeded talking to
# the persistent stack with the wrong bucket expectations):
#
# 1. BINARY NAMES — persistent stack runs via symlinks
# bin/persistent-<name> → bin/<name>. Smoke pkill pattern
# `bin/(storaged|...|gateway)$` matches `bin/<name>$` substrings;
# `bin/persistent-<name>` doesn't match because the slash is
# followed by 'p', not the daemon-name first letter.
# 2. MINIO BUCKETS — persistent stack uses lakehouse-go-persistent;
# smoke harnesses use lakehouse-go-primary. Different buckets
# mean rehydrate paths can't see each other's `_vectors/*`
# persistence files.
# 3. PORTS — persistent stack uses :4110 + :4211-:4219 (gateway +
# upstreams). Smoke harness uses :3110 + :3211-:3219. Both
# reach for the SAME chatd at :3220 and validatord at :3221:
# chatd is read-mostly (LLM dispatch, no persistent state to
# clobber) and operators don't want to maintain two LLM provider
# key sets; validatord keeps its state in JSONL files, not MinIO.
# The temp toml at /tmp/lakehouse-persistent.toml overrides bucket
# + bind ports + upstream URLs (except chatd and validatord).
#
# Logs land in /tmp/gostack-logs/<bin>.log (one per daemon).
#
# Used to bring up the persistent stack 2026-05-01 — the first time
# the Go side has run as long-running daemons rather than per-harness
# transient processes. The 2-isolation-layer split was added the
# same day after the pre-push gate caught a smoke-vs-persistent
# collision (g1p_smoke saw count=2 when expecting count=1 because
# vectord's MinIO bucket had both the smoke's persist_demo AND the
# persistent stack's workers index).
# Strict mode: abort on unhandled errors, unset variables, and failures
# anywhere in a pipeline.
set -euo pipefail

# Work from two directories above this script so the relative bin/ and
# lakehouse.toml paths used below resolve.
cd "$(dirname "$0")/../.."

# Everything below launches binaries out of bin/; bail out early with a
# build hint when that directory is absent.
[[ -d bin ]] || {
  echo "[gostack] bin/ missing — run 'just build' first" >&2
  exit 1
}
# ── Layer 1: symlink-based binary names ─────────────────────────────
# Every daemon gets a bin/persistent-<name> alias pointing at its
# sibling bin/<name>, giving the persistent stack command lines that
# the smoke harness's pkill pattern does not match. Safe to re-run:
# anything already present at the alias path (including a dangling
# symlink, which -e alone would report as missing) is left untouched.
DAEMONS=(storaged catalogd ingestd queryd embedd vectord pathwayd observerd matrixd gateway)
for name in "${DAEMONS[@]}"; do
  link="bin/persistent-$name"
  if [[ -L "$link" || -e "$link" ]]; then
    continue
  fi
  # Relative link target: resolves inside bin/ itself.
  ln -s "$name" "$link"
done
# ── Layer 2: separate MinIO bucket via temp config ──────────────────
# Derive /tmp/lakehouse-persistent.toml from the canonical
# lakehouse.toml, swapping in the persistent bucket and the :4xxx
# port range. LH_PERSISTENT_BUCKET overrides the default bucket name.
PERSISTENT_BUCKET="${LH_PERSISTENT_BUCKET:-lakehouse-go-persistent}"
TEMP_TOML=/tmp/lakehouse-persistent.toml

# Best-effort bucket creation: skipped when mc is not installed, and a
# failing mc is ignored (--ignore-existing makes re-runs harmless).
if command -v mc >/dev/null 2>&1; then
  mc mb --ignore-existing "local/$PERSISTENT_BUCKET" >/dev/null 2>&1 || true
fi

# Build the sed program: first the bucket rename, then one rewrite per
# smoke-range address 127.0.0.1:3NNN → 127.0.0.1:4NNN. Only the listed
# ports are rewritten, so the shared :3220 address is left as-is.
sed_args=(-e "s/lakehouse-go-primary/$PERSISTENT_BUCKET/g")
for port in 3110 3211 3212 3213 3214 3215 3216 3217 3218 3219; do
  # ${port:1} drops the leading "3" so it can be re-prefixed with "4".
  sed_args+=(-e "s|127\.0\.0\.1:${port}|127.0.0.1:4${port:1}|g")
done
sed "${sed_args[@]}" lakehouse.toml > "$TEMP_TOML"

echo "[gostack] config: $TEMP_TOML (bucket=$PERSISTENT_BUCKET, ports=4110+4211-4219)"
# ── Cleanup any prior persistent daemons ────────────────────────────
# Match on the persistent- prefix so smoke processes (bin/<name>) are
# untouched. pkill -f matches against the FULL command line, and the
# persistent daemons are launched below as
# `./bin/persistent-<name> -config <toml>`, so the daemon name is
# followed by a space rather than end-of-line. The pattern therefore
# anchors on "( |$)"; a bare "$" anchor never matches these processes
# and would leave stale daemons holding the ports. The alternation is
# built from DAEMONS so the kill list cannot drift from the symlink list.
persistent_pattern="bin/persistent-($(IFS='|'; printf '%s' "${DAEMONS[*]}"))( |\$)"
echo "[gostack] killing any stale persistent Go daemons (anchored on persistent-)"
# Best-effort: pkill exits non-zero when nothing matched.
pkill -f "$persistent_pattern" 2>/dev/null || true
# Wait (bounded, ~3s) for signalled daemons to actually exit and release
# their ports instead of sleeping a fixed interval. Falls straight
# through when nothing matches or pgrep is unavailable.
for ((_wait = 0; _wait < 30; _wait++)); do
  pgrep -f "$persistent_pattern" >/dev/null 2>&1 || break
  sleep 0.1
done
mkdir -p /tmp/gostack-logs

#######################################
# Launch one persistent daemon and wait for its /health endpoint.
# Globals:   TEMP_TOML (read) — config file passed via -config
# Arguments: $1 - daemon name; runs ./bin/persistent-$1
#            $2 - port probed at http://127.0.0.1:$2/health
# Outputs:   "up" status line on stdout; failure message and the last
#            20 log lines on stderr. Daemon output goes to
#            /tmp/gostack-logs/persistent-$1.log.
# Returns:   0 once /health answers while the launched process is still
#            alive; 1 if the port already answers before launch, the
#            process exits during startup, or nothing answers within
#            50 probes spaced 0.1s apart.
#######################################
start() {
  local bin="$1"
  local port="$2"
  local log="/tmp/gostack-logs/persistent-$bin.log"
  local url="http://127.0.0.1:$port/health"
  local pid attempt

  # A /health answer BEFORE launch can only come from some other
  # process. Launching anyway would let that process satisfy the probe
  # below and report a daemon as "up" that never bound its port.
  if curl -sSf -m 1 "$url" >/dev/null 2>&1; then
    echo " persistent-$bin :$port FAILED — port already answers /health before launch (stale daemon?)" >&2
    return 1
  fi

  nohup ./bin/persistent-"$bin" -config "$TEMP_TOML" > "$log" 2>&1 &
  pid=$!
  disown

  for ((attempt = 0; attempt < 50; attempt++)); do
    # Fail fast when the launched process has exited instead of polling
    # out the full timeout. NOTE(review): assumes the daemons stay in
    # the foreground (do not fork and exit) — confirm.
    if ! kill -0 "$pid" 2>/dev/null; then
      echo " persistent-$bin :$port exited during startup" >&2
      break
    fi
    if curl -sSf -m 1 "$url" >/dev/null 2>&1; then
      echo " persistent-$bin :$port up (log: $log)"
      return 0
    fi
    sleep 0.1
  done

  echo " persistent-$bin :$port FAILED — log tail:" >&2
  # Guarded so an unreadable log cannot mask the return code under errexit.
  tail -20 "$log" >&2 || true
  return 1
}
echo "[gostack] starting in dependency order (port range :4xxx)"
# name:port pairs, launched strictly in the order listed (note embedd
# :4216 precedes vectord :4215, and observerd :4219 precedes matrixd
# :4218). A failing start returns non-zero, which aborts the script
# under errexit.
launch_order=(
  storaged:4211
  catalogd:4212
  ingestd:4213
  queryd:4214
  embedd:4216
  vectord:4215
  pathwayd:4217
  observerd:4219
  matrixd:4218
  gateway:4110
)
for entry in "${launch_order[@]}"; do
  start "${entry%%:*}" "${entry##*:}"
done
# Shared daemons (chatd, validatord) live OUTSIDE the persistent :4xxx
# range and are used by the smoke harnesses as well: chatd carries no
# S3 state and validatord keeps its state in JSONL files rather than
# MinIO, so there is no bucket or port conflict and the canonical
# lakehouse.toml is used unmodified (no temp-toml override). Provider
# key files come from
# /etc/lakehouse/{ollama_cloud,openrouter,opencode,kimi}.env.
# The matching smokes (chatd_smoke / validatord_smoke) run
# `pkill -f "bin/<name>$"` on teardown; re-running this script brings
# the daemons back.
#
# start_shared NAME PORT
#   Leaves an instance that already answers /health on PORT alone.
#   Otherwise launches ./bin/NAME (log: /tmp/gostack-logs/NAME.log) and
#   probes /health up to 50 times, 0.1s apart.
#   Returns 0 when the endpoint answers, 1 after printing the log tail
#   when it never does.
start_shared() {
  local bin="$1"
  local port="$2"
  local health_url="http://127.0.0.1:$port/health"
  local logfile="/tmp/gostack-logs/$bin.log"
  local tries_left=50

  # Already running (e.g. left over from a prior session): don't touch it.
  if curl -sSf -m 1 "$health_url" >/dev/null 2>&1; then
    echo " $bin :$port already up (skipping)"
    return 0
  fi

  echo "[gostack] $bin :$port not up; starting"
  nohup ./bin/"$bin" -config lakehouse.toml > "$logfile" 2>&1 &
  disown

  while ((tries_left > 0)); do
    if curl -sSf -m 1 "$health_url" >/dev/null 2>&1; then
      echo " $bin :$port up"
      return 0
    fi
    sleep 0.1
    tries_left=$((tries_left - 1))
  done

  echo " $bin :$port FAILED — log tail:"
  tail -20 "$logfile"
  return 1
}
start_shared chatd 3220
start_shared validatord 3221

# Final health sweep — informational only. Each probe is captured inside
# an `if` so that a daemon that stopped answering (or took longer than
# the 1s curl timeout) is reported on its own line rather than aborting
# the script under `set -e -o pipefail` before the summary and teardown
# hint are printed. At most the first 80 characters of each response
# are shown.
echo
echo "[gostack] ready · sweep:"
for p in 4110 4211 4212 4213 4214 4215 4216 4217 4218 4219 3220 3221; do
  if body="$(curl -sSf -m 1 "http://127.0.0.1:$p/health" 2>/dev/null)"; then
    printf '%s\n' "${body:0:80}"
  else
    printf '(:%s no response)\n' "$p"
  fi
done
echo
echo "[gostack] persistent stack: ports :4110+:4211-:4219 · bucket=$PERSISTENT_BUCKET"
echo "[gostack] smoke harnesses: ports :3110+:3211-:3219 · bucket=lakehouse-go-primary"
echo "[gostack] shared: chatd :3220 + validatord :3221 (no S3 / JSONL-only state)"
echo "[gostack] tear down via: pkill -f 'bin/persistent-'"