#!/bin/bash
# One-shot dump of all testing data into the `raw` MinIO bucket.
# Persistent test corpus so we don't re-extract every run.
#
# Layout:
#   raw/
#     staffing/      — workers_500k.parquet, resumes.parquet
#     entities/      — entities.jsonl, sec_company_tickers.json
#     llm_team/      — *.jsonl extracts from knowledge_base PG tables
#     chicago/       — permits_<YYYY-MM-DD>.json (last 30 days)
#     MANIFEST.json  — documents what's here + when
#
# Requires: mc (MinIO client) configured with the `local` alias,
# sudo access to the postgres user, curl, python3.
set -euo pipefail

REPO=/home/profit/lakehouse
BUCKET=raw
ALIAS=local

# Scratch dir for extracts; cleaned up on any exit path via the trap.
STAGE=$(mktemp -d /tmp/raw_dump.XXXXX)
trap 'rm -rf "$STAGE"' EXIT
DATE=$(date -u +%Y-%m-%d)

# Timestamped progress line to stdout.
log() { echo "[dump $(date -u +%H:%M:%S)] $*"; }

log "creating bucket ${ALIAS}/${BUCKET} (idempotent)"
mc mb --ignore-existing "${ALIAS}/${BUCKET}"

# ─── 1. STAFFING ───
log "staffing/ — workers_500k.parquet (323 MB) + resumes.parquet"
mc cp -q "${REPO}/data/datasets/workers_500k.parquet" "${ALIAS}/${BUCKET}/staffing/workers_500k.parquet"
mc cp -q "${REPO}/data/datasets/resumes.parquet"      "${ALIAS}/${BUCKET}/staffing/resumes.parquet"

# ─── 2. ENTITIES + SEC + GEO ───
# NOTE: sec_company_tickers.json deliberately lands under sec/, not entities/.
log "entities/ — contractor entities cache + SEC tickers + svep + tif districts"
mc cp -q "${REPO}/data/_entity_cache/entities.jsonl"           "${ALIAS}/${BUCKET}/entities/entities.jsonl"
mc cp -q "${REPO}/data/_entity_cache/sec_company_tickers.json" "${ALIAS}/${BUCKET}/sec/company_tickers.json"
mc cp -q "${REPO}/data/_entity_cache/svep_log.json"            "${ALIAS}/${BUCKET}/entities/svep_log.json"
mc cp -q "${REPO}/data/_entity_cache/tif_districts.geojson"    "${ALIAS}/${BUCKET}/chicago/tif_districts.geojson"
LLM TEAM HISTORY (Postgres → JSONL → S3) ─── log "llm_team/ — extracting from knowledge_base PG tables" LLM_TABLES=(team_runs pipeline_runs lab_experiments lab_trials meta_pipelines meta_runs conversations response_cache memory_entries adaptive_runs) for tbl in "${LLM_TABLES[@]}"; do out=${STAGE}/${tbl}.jsonl rows=$(sudo -u postgres psql -d knowledge_base -At -c "SELECT COUNT(*) FROM ${tbl};" 2>/dev/null || echo 0) if [ "$rows" -eq 0 ]; then log " · ${tbl}: 0 rows, skipping" continue fi sudo -u postgres psql -d knowledge_base -At -c "COPY (SELECT row_to_json(t) FROM ${tbl} t) TO STDOUT;" > "$out" 2>/dev/null size=$(du -h "$out" | awk '{print $1}') log " · ${tbl}: ${rows} rows (${size})" mc cp -q "$out" ${ALIAS}/${BUCKET}/llm_team/${tbl}.jsonl done # ─── 4. CHICAGO PERMITS (last 30 days, paginated) ─── log "chicago/ — pulling last 30 days of permits from data.cityofchicago.org" since=$(date -u -d '30 days ago' +%Y-%m-%d) out=${STAGE}/permits_${DATE}.json url="https://data.cityofchicago.org/resource/ydr8-5enu.json?\$where=issue_date%3E='${since}'&\$limit=10000&\$order=issue_date%20DESC" if curl -sf --max-time 60 "$url" -o "$out"; then count=$(python3 -c "import json; print(len(json.load(open('${out}'))))") size=$(du -h "$out" | awk '{print $1}') log " · permits since ${since}: ${count} records (${size})" mc cp -q "$out" ${ALIAS}/${BUCKET}/chicago/permits_${DATE}.json else log " · WARN: chicago permits fetch failed; skipping" fi # ─── 5. MANIFEST ─── log "writing MANIFEST.json" manifest=${STAGE}/MANIFEST.json python3 - <