diff --git a/scripts/cutover/start_go_stack.sh b/scripts/cutover/start_go_stack.sh index 25092a0..d95de99 100755 --- a/scripts/cutover/start_go_stack.sh +++ b/scripts/cutover/start_go_stack.sh @@ -2,31 +2,37 @@ # scripts/cutover/start_go_stack.sh # # Bring up the full Go stack persistently — alongside the live Rust -# gateway on :3100. All Go daemons land on the parallel port range -# :3110 + :3211-:3220 so there's no port collision. +# gateway on :3100 + alongside the harness-transient stacks the +# smokes spin up. All Go daemons land on the parallel port range +# :3110 + :3211-:3220 (no collision with Rust on :3100). Persistent +# daemons run under DIFFERENT BINARY NAMES (bin/persistent-*) and +# write to a SEPARATE MinIO BUCKET (lakehouse-go-persistent) so the +# pre-push smoke chain — which uses anchored `pkill -f "bin/(name)$"` +# teardown + reads from `lakehouse-go-primary` — can run without +# tearing down or polluting our long-running state. # -# Unlike playbook_lift.sh's transient harness boot (which kills the -# stack on exit), this script starts every daemon detached via nohup -# + disown. Operators run it once at boot or after a restart; the -# stack stays up until a `pkill -f "bin/(name)"` or reboot. +# Two isolation layers: +# 1. BINARY NAMES — persistent stack runs via symlinks +# bin/persistent-<name> → bin/<name>. Smoke pkill pattern +# `bin/(storaged|...|gateway)` matches `bin/<name>` substrings; +# `bin/persistent-<name>` doesn't match because the slash is +# followed by 'p', not the daemon-name first letter. +# 2. MINIO BUCKETS — persistent stack uses lakehouse-go-persistent; +# smoke harnesses use lakehouse-go-primary. Different buckets +# mean rehydrate paths can't see each other's `_vectors/*` +# persistence files. The temp toml at /tmp/lakehouse-persistent.toml +# overrides only [s3].bucket; everything else inherits from +# lakehouse.toml. # # Logs land in /tmp/gostack-logs/<name>.log (one per daemon). 
# # Used to bring up the persistent stack 2026-05-01 — the first time # the Go side has run as long-running daemons rather than per-harness -# transient processes. -# -# KNOWN CONSTRAINT: the pre-push smoke chain (`just verify` → -# scripts/{d,g}*_smoke.sh) uses the SAME anchored `pkill -f -# "bin/(name)$"` pattern this script does, and ALSO matches our -# persistent daemons by name. Pushing while the persistent stack -# is up will kill 7 of 11 daemons (gateway, storaged, catalogd, -# ingestd, queryd, embedd, vectord; the smokes don't reach for -# pathwayd/observerd/matrixd/chatd). Workaround: re-run this -# script after every push. A proper fix is to give the persistent -# stack a different binary name (e.g. via build tags or a -# wrapper symlink) so smoke-side pkill doesn't see it; deferred -# until the trigger fires (i.e. when an operator gets bitten). +# transient processes. The 2-isolation-layer split was added the +# same day after the pre-push gate caught a smoke-vs-persistent +# collision (g1p_smoke saw count=2 when expecting count=1 because +# vectord's MinIO bucket had both the smoke's persist_demo AND the +# persistent stack's workers index). set -euo pipefail @@ -37,10 +43,42 @@ if [ ! -d bin ]; then exit 1 fi -# Ensure no leftover from a transient harness run. Anchored pattern -# per feedback_pkill_scope; never bare `bin/`. -echo "[gostack] killing any stale Go daemons (anchored pkill)" -pkill -f "bin/(storaged|catalogd|ingestd|queryd|embedd|vectord|pathwayd|observerd|matrixd|gateway)$" 2>/dev/null || true +# ── Layer 1: symlink-based binary names ───────────────────────────── +# Create bin/persistent-* symlinks to bin/* so the persistent stack +# has distinct cmdline strings that smoke pkill won't match. Idempotent +# (existing symlinks are left alone). + +DAEMONS=(storaged catalogd ingestd queryd embedd vectord pathwayd observerd matrixd gateway) + +for d in "${DAEMONS[@]}"; do + target="bin/persistent-$d" + if [ ! -L "$target" ] && [ ! 
-e "$target" ]; then + ln -s "$d" "$target" + fi +done + +# ── Layer 2: separate MinIO bucket via temp config ────────────────── +# Generate /tmp/lakehouse-persistent.toml from the canonical +# lakehouse.toml with [s3].bucket overridden. Caller can override the +# bucket name via LH_PERSISTENT_BUCKET env var. + +PERSISTENT_BUCKET="${LH_PERSISTENT_BUCKET:-lakehouse-go-persistent}" +TEMP_TOML=/tmp/lakehouse-persistent.toml + +# Create the bucket if missing. mc is idempotent with --ignore-existing. +if command -v mc >/dev/null 2>&1; then + mc mb --ignore-existing "local/$PERSISTENT_BUCKET" >/dev/null 2>&1 || true +fi + +# sed-replace the bucket line. Anchored to "lakehouse-go-primary" so +# no other accidental "primary" mention gets touched. +sed "s/lakehouse-go-primary/$PERSISTENT_BUCKET/g" lakehouse.toml > "$TEMP_TOML" +echo "[gostack] config: $TEMP_TOML (bucket=$PERSISTENT_BUCKET)" + +# ── Cleanup any prior persistent daemons ──────────────────────────── +# Match by the persistent- prefix so smoke processes are untouched. 
+echo "[gostack] killing any stale persistent Go daemons (anchored on persistent-)" +pkill -f "bin/persistent-(storaged|catalogd|ingestd|queryd|embedd|vectord|pathwayd|observerd|matrixd|gateway)$" 2>/dev/null || true sleep 0.5 mkdir -p /tmp/gostack-logs @@ -48,16 +86,16 @@ mkdir -p /tmp/gostack-logs start() { local bin="$1" local port="$2" - local log="/tmp/gostack-logs/$bin.log" - nohup ./bin/"$bin" -config lakehouse.toml > "$log" 2>&1 & disown + local log="/tmp/gostack-logs/persistent-$bin.log" + nohup ./bin/persistent-"$bin" -config "$TEMP_TOML" > "$log" 2>&1 & disown for _ in $(seq 1 50); do if curl -sSf -m 1 "http://127.0.0.1:$port/health" >/dev/null 2>&1; then - echo " $bin :$port up (log: $log)" + echo " persistent-$bin :$port up (log: $log)" return 0 fi sleep 0.1 done - echo " $bin :$port FAILED — log tail:" + echo " persistent-$bin :$port FAILED — log tail:" tail -20 "$log" return 1 } @@ -77,10 +115,17 @@ start gateway 3110 # chatd is started independently — its provider key files come from # /etc/lakehouse/{ollama_cloud,openrouter,opencode,kimi}.env; if # chatd is already up (long-running from a prior session) we don't -# touch it. +# touch it. chatd uses no S3, so no temp-toml override needed. if ! 
curl -sSf -m 1 http://127.0.0.1:3220/health >/dev/null 2>&1; then echo "[gostack] chatd :3220 not up; starting" - start chatd 3220 + nohup ./bin/chatd -config lakehouse.toml > /tmp/gostack-logs/chatd.log 2>&1 & disown + for _ in $(seq 1 50); do + if curl -sSf -m 1 "http://127.0.0.1:3220/health" >/dev/null 2>&1; then + echo " chatd :3220 up" + break + fi + sleep 0.1 + done else echo " chatd :3220 already up (skipping)" fi @@ -91,3 +136,7 @@ for p in 3110 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220; do curl -sSf -m 1 "http://127.0.0.1:$p/health" 2>/dev/null | head -c 80 echo done +echo +echo "[gostack] persistent stack uses bucket: $PERSISTENT_BUCKET" +echo "[gostack] smoke harnesses use bucket: lakehouse-go-primary" +echo "[gostack] tear down via: pkill -f 'bin/persistent-'"