diff --git a/.gitignore b/.gitignore
index 2f0b818..6b1db00 100644
--- a/.gitignore
+++ b/.gitignore
@@ -44,6 +44,11 @@ vendor/
 /reports/scrum/_evidence/*
 !/reports/scrum/_evidence/.gitkeep
 
+# Proof harness runtime output — same pattern as reports/scrum/_evidence.
+# Track the directory but ignore per-run subdirs.
+/tests/proof/reports/*
+!/tests/proof/reports/.gitkeep
+
 # Secrets — never commit. Resolved via SecretsProvider per ADR-001 §1.x.
 *.env
 secrets.toml
diff --git a/docs/CLAUDE_REFACTOR_GUARDRAILS.md b/docs/CLAUDE_REFACTOR_GUARDRAILS.md
new file mode 100644
index 0000000..efd2a1e
--- /dev/null
+++ b/docs/CLAUDE_REFACTOR_GUARDRAILS.md
@@ -0,0 +1,281 @@
# Claude Refactor Guardrails — Go Lakehouse

## Mission

Continue the Go refactor without recreating the Rust-era complexity.

The Go rewrite exists to make the lakehouse operationally legible:

- small binaries
- clear service boundaries
- gateway-fronted APIs
- smoke-testable behavior
- fast build/run/debug loop
- no accidental framework bureaucracy

The Rust repo had maturity, but also accumulated control-plane weight: validator, auditor, provider routing, iteration loops, UI, truth layers, and agent-era scaffolding. Do not blindly port all of that back.

## Prime Directive

Preserve the Go spine:

```text
gateway
  → storaged
  → catalogd
  → ingestd
  → queryd
  → vectord
  → embedd
```

Only add complexity when there is measured evidence, a failing smoke, or a documented feature-parity requirement.

## Refactor Rules

### 1. No silent behavior changes

Before changing any service behavior, identify:

- current route
- current request schema
- current response schema
- current status-code behavior
- current smoke test covering it

If no smoke exists, add one before or with the refactor.

### 2. Keep service boundaries hard

Do not let services reach into each other’s internals.

Allowed:

- HTTP client calls
- shared request/response structs when stable
- small internal packages for config/secrets/logging

Avoid:

- importing another service’s implementation package
- hidden global state
- “just for now” shared mutable registries
- circular service knowledge

### 3. Go is not Rust

Do not imitate Rust patterns mechanically.

Prefer Go-native clarity:

- simple structs
- explicit errors
- small interfaces at the consumer side
- context-aware HTTP handlers
- table-driven tests
- boring package names
- no abstraction tax unless repeated 3+ times

### 4. Validation replaces the borrow checker

Rust caught many problems at compile time. Go will not.

Therefore every refactor must preserve or improve:

- input validation
- dimension checks
- duplicate handling
- restart persistence checks
- schema drift detection
- error status mapping
- smoke coverage

### 5. Performance work must be measured

Do not optimize by vibes.

For each performance change, record:

- baseline command
- baseline result
- changed code path
- new result
- regression risk
- rollback plan

Current known bottleneck:

vectord Add is RWMutex-serialized.
500K vectors: ~35m36s, ~234/sec avg.
GPU around 65%, so embedding is not the only bottleneck.

Do not claim concurrency improvements unless the HNSW library thread-safety is audited or writes are safely batched/sharded. A rough baseline command is sketched below.
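A minimal baseline sketch, assuming the vectord add route shape recorded in the proof-harness claims (`POST /vectors/index/<name>/add`) and hypothetical payload field names (`id`, `vector`); align both with the real contract before trusting the numbers:

```bash
#!/usr/bin/env bash
# Rough vectors/sec baseline for the Add path. N, the index name, and
# the payload shape are illustrative assumptions, not the shipped API.
set -euo pipefail
N=1000
vec=$(jq -nc '[range(768) | 0.1]')   # fixed 768-dim vector
start=$(date +%s)
for i in $(seq 1 "$N"); do
  curl -sS -X POST "http://127.0.0.1:3215/vectors/index/bench/add" \
    -H 'Content-Type: application/json' \
    -d "{\"id\":\"bench-${i}\",\"vector\":${vec}}" >/dev/null
done
elapsed=$(( $(date +%s) - start ))
[ "$elapsed" -gt 0 ] || elapsed=1
echo "baseline: ${N} adds in ${elapsed}s → $(( N / elapsed )) vectors/sec"
```

Record the exact command and its result in the refactor note, per rule 5.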
## File/Package Expectations

### cmd/

One binary per service. Keep main files thin.

Main should only:

- load config
- construct dependencies
- wire routes
- start server
- handle shutdown

### internal/

Shared code belongs here only when it is genuinely shared.

Good internal packages:

- config
- secrets
- storeclient
- catalogclient
- gateway routing helpers
- logging
- request/response contracts

Bad internal packages:

- vague “utils”
- giant “common”
- cross-service god objects
- hidden dependency containers

### scripts/

Every major behavior needs a runnable smoke.

Smokes are not decoration. They are the replacement nervous system.

The existing smoke pattern must remain:

- d1 skeleton/health/gateway
- d2 storaged
- d3 catalogd
- d4 ingestd
- d5 queryd
- d6 full ingest/query
- g1 vectord
- g1p vectord persistence
- g2 embed → vector add → search

New functionality needs a new smoke or an extension to the closest existing one.

## Refactor Checklist

Before editing:

- [ ] Read README.md
- [ ] Read docs/PRD.md
- [ ] Read docs/SPEC.md
- [ ] Read docs/DECISIONS.md
- [ ] Read docs/PHASE_G0_KICKOFF.md
- [ ] Identify affected services
- [ ] Identify affected smokes

During editing:

- [ ] Keep public API stable unless explicitly changing it
- [ ] Keep errors explicit
- [ ] Keep logs useful but not noisy
- [ ] Avoid package sprawl
- [ ] Avoid premature generic interfaces
- [ ] Preserve restart behavior
- [ ] Preserve gateway-only acceptance path

After editing:

- [ ] Run go test ./...
- [ ] Run the relevant smoke script
- [ ] Run the full smoke loop when service contracts changed
- [ ] Record evidence in a short refactor note

Full smoke loop:

```bash
for s in scripts/{d1,d2,d3,d4,d5,d6,g1,g1p,g2}_smoke.sh; do
  "$s" || break
done
```

## Refactor Note Format

Create or update:

docs/refactor-notes/YYYYMMDD-<topic>.md

Use this structure:

```md
# Refactor Note: <topic>

## Goal

## Files changed

## Behavior changed

## Behavior preserved

## Tests run

## Smoke results

## Performance before/after

## Risks

## Rollback
```

## Anti-Patterns To Reject

Reject these unless specifically requested:

- porting Rust modules 1:1
- adding orchestration before service parity
- adding AI/agent logic inside core services
- making gateway business-aware
- hiding failures behind retries
- swallowing errors
- “temporary” global maps
- changing route contracts without smoke updates
- adding dependencies for trivial code
- optimizing vector ingestion without measurement
- rebuilding the Rust bureaucracy in Go clothing

## Preferred Next Targets

Prioritize in this order:

1. Stability of existing Go service contracts
2. Better smoke coverage
3. Persistence limits and large object handling
4. vectord ingestion bottleneck analysis
5. gateway observability
6. feature parity with Rust only where needed
7. UI/agent/auditor layers later, not now

## Architectural Position

The Go rewrite should remain the production spine.

The Rust system remains a historical reference and a possible source for:

- validation ideas
- audit semantics
- provider-routing lessons
- prior acceptance criteria
- edge cases

But Rust is not the shape to copy.

Go owns the clean operational path.
Rust owns historical scar tissue and high-performance lessons.

Do not confuse the two.

Also include this shorter command prompt when handing the doc to Claude Code:

```md
Read `docs/CLAUDE_REFACTOR_GUARDRAILS.md` first.

Then inspect the current Go lakehouse repo and produce a refactor plan only. Do not edit code yet.

Your plan must identify:
1. affected services
2. affected routes
3. affected request/response contracts
4. affected smoke scripts
5. risks of accidentally reintroducing Rust-era complexity
6. exact tests/smokes you will run after changes

Do not port Rust structure blindly. Preserve the Go service spine.
```
diff --git a/docs/TEST_PROOF_SCOPE.md b/docs/TEST_PROOF_SCOPE.md
new file mode 100644
index 0000000..177ede4
--- /dev/null
+++ b/docs/TEST_PROOF_SCOPE.md
@@ -0,0 +1,258 @@
Create `docs/TEST_PROOF_SCOPE.md`.

Purpose: design a serious proof harness for the Go lakehouse refactor.

You are not writing production features yet. You are designing and implementing a claims-verification test suite that proves or disproves what this system currently claims.

## System Claims To Prove

The Go lakehouse claims:

1. Gateway-fronted services work as a coherent system.
2. CSV data can ingest into Parquet.
3. Catalog manifests remain consistent.
4. DuckDB query path returns correct results.
5. Embedding path works through Ollama or a configured embedding backend.
6. Vector add/search works.
7. Vector persistence survives restart.
8. Service contracts are stable.
9. The refactor preserved behavior.
10. Performance claims are measurable, not vibes.

## Required Output

Create a proof harness under:

```text
tests/proof/
  README.md
  claims.yaml
  run_proof.sh
  lib/
    env.sh
    http.sh
    assert.sh
    metrics.sh
  cases/
    00_health.sh
    01_storage_roundtrip.sh
    02_catalog_manifest.sh
    03_ingest_csv_to_parquet.sh
    04_query_correctness.sh
    05_embedding_contract.sh
    06_vector_add_search.sh
    07_vector_persistence_restart.sh
    08_gateway_contracts.sh
    09_failure_modes.sh
    10_perf_baseline.sh
  fixtures/
    csv/
    expected/
  reports/
    .gitkeep
```

## Test Design Requirements

Each test must produce evidence, not just pass/fail.

For every case, record:

- claim tested
- service routes called
- input fixture hash
- output hash
- expected result
- actual result
- pass/fail
- latency
- status codes
- logs location
- timestamp
- git commit hash

Write results to:

tests/proof/reports/proof-YYYYMMDD-HHMMSS/

Each run must produce:

```text
summary.md
summary.json
raw/
  http/
  logs/
  outputs/
  metrics/
```

## Claims File

Create tests/proof/claims.yaml.

Each claim should have:

```yaml
id: GOLAKE-001
name: Gateway health routes respond
type: contract
services:
  - gateway
routes:
  - GET /health
evidence:
  - status_code
  - response_body
  - latency_ms
required: true
```

Include claims for:

- gateway health
- each service health
- storage put/get/list/delete if supported
- catalog create/read/update/list if supported
- ingest job creation
- Parquet output existence
- query correctness against a known CSV fixture
- embedding vector dimension
- vector add/search nearest-neighbor correctness
- vector restart persistence
- invalid request rejection
- missing object behavior
- duplicate vector ID behavior
- malformed CSV behavior
- unavailable downstream service behavior
- latency baseline
- throughput baseline

## Fixtures

Create deterministic fixtures.

Minimum CSV fixture:

```csv
id,name,role,city,score
1,Ada,welder,Chicago,91
2,Grace,electrician,Detroit,88
3,Linus,operator,Chicago,77
4,Ken,pipefitter,Houston,84
5,Barbara,safety,Houston,95
```

Expected query assertions:

- count rows = 5
- city Chicago = 2
- max score = 95
- role safety belongs to Barbara
- Houston average score = 89.5

For vector tests, use deterministic text fixtures:

- doc-001: industrial staffing for welders in Chicago
- doc-002: safety compliance for warehouse crews
- doc-003: electrical contractors assigned to Detroit
- doc-004: pipefitters and heavy equipment operators in Houston

Search assertions should verify that semantically related queries return expected top candidates where embeddings are enabled.

If embeddings are not deterministic enough, support a contract-only mode that verifies:

- vector dimension
- non-empty vector
- add succeeds
- search returns known inserted IDs
- persistence survives restart

A contract-only check can stay small, as sketched below.
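A minimal sketch of that contract-only check, assuming the route shapes enumerated in claims.yaml and hypothetical JSON field names (`text`, `vector`, `id`, `k`, `results`); swap in the real contracts:

```bash
#!/usr/bin/env bash
# Contract-only embedding/vector check: dimension, non-emptiness,
# add, and search-returns-inserted-ID. No semantic assertions.
set -euo pipefail

vec=$(curl -sS -X POST http://127.0.0.1:3216/embed \
  -H 'Content-Type: application/json' \
  -d '{"text":"industrial staffing for welders in Chicago"}' | jq -c '.vector')

dim=$(jq -n --argjson v "$vec" '$v | length')
[ "$dim" -eq 768 ] || { echo "FAIL: dim=${dim}, expected 768"; exit 1; }

curl -sS -X POST http://127.0.0.1:3215/vectors/index/proof/add \
  -H 'Content-Type: application/json' \
  -d "{\"id\":\"doc-001\",\"vector\":${vec}}" >/dev/null

top=$(curl -sS -X POST http://127.0.0.1:3215/vectors/index/proof/search \
  -H 'Content-Type: application/json' \
  -d "{\"vector\":${vec},\"k\":1}" | jq -r '.results[0].id')

[ "$top" = "doc-001" ] || { echo "FAIL: top hit ${top}, expected doc-001"; exit 1; }
echo "contract-only vector path OK"
```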
## Modes

Support three modes:

```bash
tests/proof/run_proof.sh --mode contract
tests/proof/run_proof.sh --mode integration
tests/proof/run_proof.sh --mode performance
```

### Contract mode

Fast. No massive data. Verifies APIs, schemas, status codes, basic correctness.

### Integration mode

Runs the full gateway → service chain.

Must prove:

- CSV fixture → storaged → ingestd → catalogd → queryd
- text fixture → embedd → vectord → search

### Performance mode

Measures baseline only. Do not fake claims.

Record:

- rows ingested/sec
- vectors added/sec
- p50/p95 query latency
- p50/p95 vector search latency
- memory usage if available
- CPU usage if available
- service restart time if available

## Failure-Mode Tests

Add tests proving the system fails cleanly.

Required:

- malformed JSON
- missing required field
- invalid vector dimension
- missing object
- bad SQL query
- duplicate vector ID
- downstream service unavailable, if easy to simulate
- restart before persistence load completes, if relevant

Do not hide failures behind retries unless the system explicitly documents retry behavior.

## Hard Rules

- Do not add production features unless needed to expose testable behavior.
- Do not change public route contracts without documenting it.
- Do not write tests that merely check “HTTP 200” unless the claim is health-only.
- Do not use random data unless seeded and recorded.
- Do not make performance claims without before/after metrics.
- Do not assume Ollama is available; detect it and mark embedding tests skipped or degraded with an explanation.
- Do not let skipped tests appear as passed.
- Do not silently ignore missing services.
- Do not make the proof harness depend on external cloud services.

## Final Deliverables

After implementation, produce:

- tests/proof/README.md
- tests/proof/claims.yaml
- tests/proof/run_proof.sh
- tests/proof/cases/*.sh
- tests/proof/reports/<run>/summary.md
- tests/proof/reports/<run>/summary.json

## Final Report Must Answer

At the end, write a clear report:

- Which claims are proven?
- Which claims are partially proven?
- Which claims failed?
- Which claims were skipped, and why?
- What evidence supports each claim?
- What bottlenecks were measured?
- What contract drift was found?
- What refactor risks remain?
- What should be fixed first?

## Execution Plan

First inspect the repo.

Then produce a short implementation plan.

Then build the proof harness.

Then run contract mode.

Then run integration mode if services can be started.

Then run performance mode only if contract and integration pass.

Do not declare success without evidence files.
diff --git a/justfile b/justfile
index 560a9a5..3ae07f5 100644
--- a/justfile
+++ b/justfile
@@ -75,6 +75,14 @@ smoke-all:
 doctor *args:
     @bash scripts/doctor.sh {{args}}
 
+# Proof harness — claims-verification tier above the smoke chain.
+# See tests/proof/README.md and docs/TEST_PROOF_SCOPE.md.
+#   just proof contract     — fast: APIs + status codes + dim/nonempty
+#   just proof integration  — full: CSV→Parquet→SQL, text→vector→search
+#   just proof performance  — measurements; runs only after contract+integration
+proof mode *flags:
+    @bash tests/proof/run_proof.sh --mode {{mode}} {{flags}}
+
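+# Flags after the mode pass straight through {{flags}} to run_proof.sh,
+# e.g. (illustrative): `just proof integration --no-bootstrap` or
+# `just proof performance --regenerate-baseline`.
+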
 # Install pre-push hook so `git push` runs `just verify` first.
 install-hooks:
     #!/usr/bin/env bash
diff --git a/tests/proof/README.md b/tests/proof/README.md
new file mode 100644
index 0000000..845e008
--- /dev/null
+++ b/tests/proof/README.md
@@ -0,0 +1,91 @@
# tests/proof — claims-verification harness

Per `docs/TEST_PROOF_SCOPE.md`. The 9 smokes prove that the system *runs*; this harness proves that the system *does what it claims*.

## Why this exists

Smokes verify that services boot, talk, and pass deterministic round-trips.
They do not verify:

- contract drift (a route silently changes its response shape)
- semantic correctness (the SQL query says what we claim it says)
- failure-mode discipline (a malformed request returns 4xx, not a silent 200)
- performance regressions (vectors/sec drops 30% on a refactor)

The proof harness produces evidence, not just pass/fail. Each case writes
input/output hashes, latencies, status codes, log paths, and the git SHA, so a
future auditor can re-run and diff.

## Layout

```text
tests/proof/
  README.md                  ← you are here
  claims.yaml                ← enumeration of every claim, with id + type + routes
  run_proof.sh               ← orchestrator (--mode contract|integration|performance)
  lib/
    env.sh                   ← service URLs, report dir, mode, git context
    http.sh                  ← curl wrappers (latency + status + body capture)
    assert.sh                ← structured assertions writing JSONL evidence
    metrics.sh               ← rss/cpu/timing capture for performance mode
  cases/
    00_health.sh
    01_storage_roundtrip.sh
    …
    10_perf_baseline.sh
  fixtures/
    csv/workers.csv          ← canonical 5-row fixture (sha-pinned)
    text/docs.txt            ← 4 deterministic vector docs
    expected/queries.json    ← expected results for the 5 SQL assertions
    expected/rankings.json   ← stored top-K rankings for vector search
  reports/
    proof-YYYYMMDD-HHMMSSZ/  ← per-run; gitignored
      summary.md
      summary.json
      raw/
        context.json               ← git_sha, hostname, timestamp, mode
        cases/<case_id>.jsonl      ← one JSONL line per assertion
        http/<case_id>/*.{json,body,headers}
        logs/<service>.log         ← captured stdout+stderr from booted services
        metrics/<case_id>.jsonl
```

## Modes

```bash
just proof contract     # APIs, schemas, status codes; no big data; ~30s
just proof integration  # full chain CSV→storaged→…→queryd, text→embedd→vectord
just proof performance  # measurements only; runs after contract+integration
```

The `just` recipes wrap `tests/proof/run_proof.sh` with `--mode <mode>`. Use the script directly for advanced flags (`--no-bootstrap`, `--regenerate-rankings`, `--regenerate-baseline`).

## Hard rules (from TEST_PROOF_SCOPE.md)

- Don't claim performance without before/after metrics
- Detect Ollama unavailability; mark embedding tests skipped or degraded with explanation
- Skipped tests do not appear as passed
- No silent ignoring of missing services
- No external cloud dependencies
- No "HTTP 200" assertions unless the claim is health-only
- No random data without a seed

## How to read a report

After `just proof integration`:

1. Open `tests/proof/reports/proof-<timestamp>/summary.md` for the human view.
2. `summary.json` is the machine-readable counterpart.
3. To investigate a single failed assertion:
   - find its `case_id` in `summary.md`
   - read `raw/cases/<case_id>.jsonl` (each line is one assertion)
   - cross-reference `raw/http/<case_id>/<probe>.{json,body,headers}` for the underlying HTTP round-trip

Every record cites the git SHA at run time; a clean re-run of the same SHA against the same fixtures must produce identical evidence (modulo timestamps and non-deterministic embedding noise). A scripted gate over `summary.json` is sketched at the end of this README.

## Reading order for new contributors

1. `docs/TEST_PROOF_SCOPE.md` — the spec this harness implements.
2. `docs/CLAUDE_REFACTOR_GUARDRAILS.md` — process discipline this harness must obey when extended.
3. `tests/proof/claims.yaml` — what's claimed.
4. `tests/proof/cases/00_health.sh` — canonical case shape; copy-paste to add new cases.
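A small sketch of that scripted gate, using the summary.json fields run_proof.sh actually emits (`mode`, `git_sha`, `counts.pass/fail/skip`); the report path lookup is illustrative:

```bash
#!/usr/bin/env bash
# Gate on the latest proof report: fail if any assertion failed.
set -euo pipefail
latest=$(ls -d tests/proof/reports/proof-* | sort | tail -1)
jq -r '"mode=\(.mode) sha=\(.git_sha) pass=\(.counts.pass) fail=\(.counts.fail) skip=\(.counts.skip)"' \
  "${latest}/summary.json"
jq -e '.counts.fail == 0' "${latest}/summary.json" >/dev/null \
  || { echo "proof failures, see ${latest}/summary.md"; exit 1; }
```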
diff --git a/tests/proof/cases/00_health.sh b/tests/proof/cases/00_health.sh
new file mode 100755
index 0000000..65a3326
--- /dev/null
+++ b/tests/proof/cases/00_health.sh
@@ -0,0 +1,51 @@
#!/usr/bin/env bash
# 00_health.sh — GOLAKE-001 + GOLAKE-002.
# Verifies that gateway and each backing service answer GET /health
# with 200 and a body that includes the service name. Canonical case
# shape — copy this file when adding new cases.

set -uo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=../lib/env.sh
source "${SCRIPT_DIR}/../lib/env.sh"
# shellcheck source=../lib/http.sh
source "${SCRIPT_DIR}/../lib/http.sh"
# shellcheck source=../lib/assert.sh
source "${SCRIPT_DIR}/../lib/assert.sh"

CASE_ID="GOLAKE-001-002"
CASE_NAME="health endpoints respond"
CASE_TYPE="contract"

# Allow run_proof.sh to read metadata without executing.
if [ "${1:-}" = "--metadata-only" ]; then return 0 2>/dev/null || exit 0; fi

# Each row: <name>:<port>. The service name in the /health body must match.
SERVICES=(
  "gateway:3110"
  "storaged:3211"
  "catalogd:3212"
  "ingestd:3213"
  "queryd:3214"
  "vectord:3215"
  "embedd:3216"
)

for spec in "${SERVICES[@]}"; do
  name="${spec%:*}"
  port="${spec#*:}"
  probe="${name}_health"

  # Probe — captures status, body, latency to raw/http/<case>/<probe>.json
  proof_get "$CASE_ID" "$probe" "http://127.0.0.1:${port}/health" >/dev/null

  status=$(proof_status_of "$CASE_ID" "$probe")
  body=$(proof_body_of "$CASE_ID" "$probe")
  latency=$(proof_latency_of "$CASE_ID" "$probe")

  proof_assert_eq "$CASE_ID" "${name} /health → 200" "200" "$status"
  proof_assert_contains "$CASE_ID" "${name} body identifies service" "$name" "$body"
  # Latency budget — generous so we don't get spurious failures from
  # cold-start or system jitter; tighten if a real budget emerges.
  proof_assert_lt "$CASE_ID" "${name} health latency < 500ms" "$latency" "500"
done
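Because lib/env.sh defaults `PROOF_MODE` and `PROOF_REPORT_DIR` when they are unset, a single case can be exercised standalone while debugging. A sketch, assuming services are already up (the case and its CASE_ID are the ones above):

```bash
# Run one case directly; env.sh creates a fresh report dir as fallback.
bash tests/proof/cases/00_health.sh
latest=$(ls -d tests/proof/reports/proof-* | sort | tail -1)
jq . "${latest}/raw/cases/GOLAKE-001-002.jsonl"
```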
diff --git a/tests/proof/claims.yaml b/tests/proof/claims.yaml
new file mode 100644
index 0000000..bbcdc41
--- /dev/null
+++ b/tests/proof/claims.yaml
@@ -0,0 +1,214 @@
# claims.yaml — what the Go lakehouse claims, enumerated.
#
# Each claim has an id, name, type, the services + routes it touches,
# the evidence shape, and whether failure is fatal (required: true) or
# advisory (required: false).
#
# The source of truth for what cases/*.sh actually verify is the case
# scripts themselves; this file is the human-readable enumeration that
# the spec mandates as a deliverable. run_proof.sh validates that every
# claim here has a matching case with the same CASE_ID at startup.
#
# Modes:
#   contract     — fast; APIs + schemas + status codes; no big data
#   integration  — full chain CSV→storaged→ingestd→catalogd→queryd, text→embedd→vectord
#   performance  — measurements only; runs after contract+integration green

claims:
  - id: GOLAKE-001
    name: Gateway health route responds
    type: contract
    services: [gateway]
    routes: [GET /health]
    evidence: [status_code, response_body, latency_ms]
    required: true

  - id: GOLAKE-002
    name: Each backing service health route responds
    type: contract
    services: [storaged, catalogd, ingestd, queryd, vectord, embedd]
    routes: [GET /health]
    evidence: [status_code, response_body, latency_ms, service_field_match]
    required: true

  - id: GOLAKE-003
    name: Gateway proxies /v1/* to the right upstream and preserves status codes
    type: contract
    services: [gateway]
    routes: [GET /v1/storage/list, GET /v1/catalog/list, POST /v1/sql with empty body]
    evidence: [upstream_match, status_passthrough, latency_ms]
    required: true

  - id: GOLAKE-010
    name: Storage put/get round-trip preserves bytes
    type: integration
    services: [storaged]
    routes: [PUT /storage/put/*, GET /storage/get/*]
    evidence: [input_sha256, output_sha256, status_code, latency_ms]
    required: true

  - id: GOLAKE-011
    name: Storage list returns the just-put key
    type: integration
    services: [storaged]
    routes: [PUT /storage/put/*, GET /storage/list]
    evidence: [list_contains_key, latency_ms]
    required: true

  - id: GOLAKE-012
    name: Storage delete removes the key (subsequent GET 404)
    type: integration
    services: [storaged]
    routes: [DELETE /storage/delete/*, GET /storage/get/*]
    evidence: [delete_status, get_after_status]
    required: true

  - id: GOLAKE-020
    name: Catalog register is idempotent on identical fingerprint
    type: integration
    services: [catalogd]
    routes: [POST /catalog/register]
    evidence: [first_existing_false, second_existing_true, dataset_id_stable]
    required: true

  - id: GOLAKE-021
    name: Catalog manifest read matches what was registered
    type: integration
    services: [catalogd]
    routes: [POST /catalog/register, GET /catalog/manifest/*]
    evidence: [manifest_equality, schema_fingerprint_match]
    required: true

  - id: GOLAKE-022
    name: Catalog list contains the registered dataset
    type: integration
    services: [catalogd]
    routes: [GET /catalog/list]
    evidence: [list_contains_dataset_id]
    required: true

  - id: GOLAKE-030
    name: Ingest CSV → Parquet writes a parquet object that catalogd manifests
    type: integration
    services: [ingestd, storaged, catalogd]
    routes: [POST /ingest, GET /storage/list, GET /catalog/manifest/*]
    evidence: [parquet_object_exists, manifest_row_count, content_addressed_key]
    required: true

  - id: GOLAKE-040
    name: Query correctness — 5 SQL assertions against the workers CSV fixture
    type: integration
    services: [queryd]
    routes: [POST /sql]
    evidence: [Q1_count_5, Q2_chicago_2, Q3_max_95, Q4_safety_barbara, Q5_houston_avg_89_5]
    required: true

  - id: GOLAKE-050
    name: Embedding contract — request returns dim=768, non-empty vector
    type: contract
    services: [embedd]
    routes: [POST /embed]
    evidence: [dimension, vector_nonempty, model_echoed]
    required: true

  - id: GOLAKE-051
    name: Embedding integration — top-K ranking matches stored fixture
    type: integration
    services: [embedd, vectord]
    routes: [POST /embed, POST /vectors/index/<name>/add, POST /vectors/index/<name>/search]
    evidence: [top_k_id_set, top_1_id_match]
    required: true
    notes: |
      Ollama embeddings are stable-but-not-bit-identical across runs.
      Ranking-by-cosine is deterministic at our scale; this case asserts
      the top-K ID set matches expected/rankings.json. Regenerable via
      run_proof.sh --regenerate-rankings.

  - id: GOLAKE-060
    name: Vector add + lookup-by-ID round-trip
    type: contract
    services: [vectord]
    routes: [POST /vectors/index, POST /vectors/index/<name>/add, GET /vectors/index/<name>]
    evidence: [add_status, lookup_returns_inserted_ids]
    required: true

  - id: GOLAKE-061
    name: Vector search nearest-neighbor — inserted vector ranks #1 against itself
    type: contract
    services: [vectord]
    routes: [POST /vectors/index/<name>/add, POST /vectors/index/<name>/search]
    evidence: [top_1_id, top_1_distance]
    required: true

  - id: GOLAKE-070
    name: Vector persistence — kill+restart preserves index state
    type: integration
    services: [vectord, storaged]
    routes: [POST /vectors/index/<name>/add, POST /vectors/index/<name>/search]
    evidence: [pre_restart_search, post_restart_search_dist_zero]
    required: true

  - id: GOLAKE-080
    name: Failure mode — malformed JSON returns 4xx, never 5xx, never silent 200
    type: contract
    services: [storaged, catalogd, ingestd, queryd, vectord, embedd]
    routes: [POST endpoints with invalid body]
    evidence: [per_service_status_codes, error_body_shape]
    required: true

  - id: GOLAKE-081
    name: Failure mode — missing required field rejected with structured 400
    type: contract
    services: [catalogd, vectord, embedd]
    routes: [POST endpoints with valid JSON but missing fields]
    evidence: [per_service_status_codes]
    required: true

  - id: GOLAKE-082
    name: Failure mode — bad SQL returns 4xx, error message present
    type: contract
    services: [queryd]
    routes: [POST /sql with syntax error]
    evidence: [status_code, error_body_present]
    required: true

  - id: GOLAKE-083
    name: Failure mode — vector dim mismatch returns 4xx
    type: contract
    services: [vectord]
    routes: [POST /vectors/index/<name>/add with wrong dim]
    evidence: [status_code]
    required: true

  - id: GOLAKE-084
    name: Failure mode — missing storage object returns 404
    type: contract
    services: [storaged]
    routes: [GET /storage/get/<nonexistent-key>]
    evidence: [status_code]
    required: true

  - id: GOLAKE-085
    name: Failure mode — duplicate vector ID — record actual behavior (informational)
    type: contract
    services: [vectord]
    routes: [POST /vectors/index/<name>/add with the same id twice]
    evidence: [first_status, second_status, search_returns_count]
    required: false
    notes: |
      The spec asks us to verify duplicate-ID handling. Current behavior is
      not yet documented; this case records what happens so we can
      decide the contract. required: false → does not fail the gate.

  - id: GOLAKE-100
    name: Performance baseline — rows/sec ingest, vectors/sec add, query latency
    type: performance
    services: [ingestd, vectord, queryd, embedd]
    routes: [POST /ingest, POST /vectors/index/<name>/add, POST /sql, POST /embed]
    evidence: [rows_per_sec, vectors_per_sec, query_p50_ms, query_p95_ms,
               vector_search_p50_ms, vector_search_p95_ms, rss_peak_mb, cpu_peak_pct]
    required: false
    notes: |
      First run writes tests/proof/baseline.json. Subsequent runs diff
      against it; a regression ≥10% on any metric warns but does not
      fail the gate. Use --regenerate-baseline to overwrite.
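The startup cross-check described in the header comment could be implemented roughly like this. A sketch, not the shipped code; it assumes the `CASE_ID="GOLAKE-…"` convention from cases/00_health.sh and plain-grep parsing of claims.yaml:

```bash
#!/usr/bin/env bash
# Fail fast if any claim id lacks a covering case. Compound CASE_IDs
# like GOLAKE-001-002 expand to GOLAKE-001 and GOLAKE-002.
set -uo pipefail
covered=$(grep -h '^CASE_ID=' tests/proof/cases/*.sh \
  | grep -oE '[0-9]{3}' | sed 's/^/GOLAKE-/' | sort -u)
missing=0
while read -r id; do
  grep -qx "$id" <<<"$covered" || { echo "claim ${id} has no case" >&2; missing=1; }
done < <(grep -oE 'GOLAKE-[0-9]{3}' tests/proof/claims.yaml | sort -u)
exit "$missing"
```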
diff --git a/tests/proof/fixtures/csv/workers.csv b/tests/proof/fixtures/csv/workers.csv
new file mode 100644
index 0000000..f3e15ff
--- /dev/null
+++ b/tests/proof/fixtures/csv/workers.csv
@@ -0,0 +1,6 @@
id,name,role,city,score
1,Ada,welder,Chicago,91
2,Grace,electrician,Detroit,88
3,Linus,operator,Chicago,77
4,Ken,pipefitter,Houston,84
5,Barbara,safety,Houston,95
diff --git a/tests/proof/fixtures/expected/queries.json b/tests/proof/fixtures/expected/queries.json
new file mode 100644
index 0000000..ff2b17e
--- /dev/null
+++ b/tests/proof/fixtures/expected/queries.json
@@ -0,0 +1,36 @@
{
  "_comment": "Expected results for the 5 SQL assertions in 04_query_correctness against fixtures/csv/workers.csv. The CSV is content-addressed; if its hash changes, this file must be re-derived.",
  "fixture_sha256": "computed at runtime by 03_ingest_csv_to_parquet — see actual.fixture_sha in evidence",
  "queries": [
    {
      "id": "Q1",
      "claim": "row count = 5",
      "sql": "SELECT count(*) AS n FROM workers",
      "expected": {"n": 5}
    },
    {
      "id": "Q2",
      "claim": "Chicago row count = 2",
      "sql": "SELECT count(*) AS n FROM workers WHERE city = 'Chicago'",
      "expected": {"n": 2}
    },
    {
      "id": "Q3",
      "claim": "max score = 95",
      "sql": "SELECT max(score) AS m FROM workers",
      "expected": {"m": 95}
    },
    {
      "id": "Q4",
      "claim": "role = safety belongs to Barbara",
      "sql": "SELECT name FROM workers WHERE role = 'safety'",
      "expected": {"name": "Barbara"}
    },
    {
      "id": "Q5",
      "claim": "Houston average score = 89.5",
      "sql": "SELECT avg(score) AS avg FROM workers WHERE city = 'Houston'",
      "expected": {"avg": 89.5}
    }
  ]
}
diff --git a/tests/proof/fixtures/text/docs.txt b/tests/proof/fixtures/text/docs.txt
new file mode 100644
index 0000000..7cffb3e
--- /dev/null
+++ b/tests/proof/fixtures/text/docs.txt
@@ -0,0 +1,4 @@
doc-001 industrial staffing for welders in Chicago
doc-002 safety compliance for warehouse crews
doc-003 electrical contractors assigned to Detroit
doc-004 pipefitters and heavy equipment operators in Houston
diff --git a/tests/proof/lib/assert.sh b/tests/proof/lib/assert.sh
new file mode 100644
index 0000000..02e7a59
--- /dev/null
+++ b/tests/proof/lib/assert.sh
@@ -0,0 +1,118 @@
#!/usr/bin/env bash
# lib/assert.sh — assertions that record evidence per the spec.
#
# Each assertion appends one record to:
#   $PROOF_REPORT_DIR/raw/cases/<case_id>.jsonl
#
# Each line is a self-describing JSON object — case ID, claim, expected,
# actual, result {pass|fail|skip}, optional evidence pointers. Cases
# call multiple assertions; run_proof.sh aggregates JSONL → summary.
#
# Functions:
#   proof_assert_eq       <case_id> <claim> <expected> <actual>
#   proof_assert_ne       <case_id> <claim> <unexpected> <actual>
#   proof_assert_contains <case_id> <claim> <needle> <haystack>
#   proof_assert_lt       <case_id> <claim> <a> <b>    # passes if a < b
#   proof_assert_gt       <case_id> <claim> <a> <b>    # passes if a > b
#   proof_assert_status   <case_id> <claim> <expected> <probe_name>
#   proof_assert_json_eq  <case_id> <claim> <jq_path> <expected> <body_or_file>
#   proof_skip            <case_id> <claim> <reason>
#
# All return 0 (case scripts decide their own halt-on-fail policy).
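# An illustrative evidence line (field layout per _proof_record below;
# the values are made up):
#
#   {"ts":"2025-01-01T00:00:00+00:00","case_id":"GOLAKE-001-002",
#    "claim":"gateway /health → 200","result":"pass",
#    "expected":"200","actual":"200","detail":""}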
_proof_record() {
  local case_id="$1" claim="$2" result="$3" expected="$4" actual="$5" detail="$6"
  local out="${PROOF_REPORT_DIR}/raw/cases/${case_id}.jsonl"
  mkdir -p "$(dirname "$out")"
  # JSON-escape the variable inputs.
  local e_claim e_expected e_actual e_detail
  e_claim=$(printf '%s' "$claim" | jq -Rs .)
  e_expected=$(printf '%s' "$expected" | jq -Rs .)
  e_actual=$(printf '%s' "$actual" | jq -Rs .)
  e_detail=$(printf '%s' "$detail" | jq -Rs .)
  local ts
  ts=$(date -u -Iseconds)
  cat >> "$out" <<EOF
{"ts":"${ts}","case_id":"${case_id}","claim":${e_claim},"result":"${result}","expected":${e_expected},"actual":${e_actual},"detail":${e_detail}}
EOF
}

proof_assert_eq() {
  local case_id="$1" claim="$2" expected="$3" actual="$4"
  if [ "$expected" = "$actual" ]; then
    _proof_record "$case_id" "$claim" pass "$expected" "$actual" ""
  else
    _proof_record "$case_id" "$claim" fail "$expected" "$actual" "values differ"
  fi
  return 0
}

proof_assert_ne() {
  local case_id="$1" claim="$2" unexpected="$3" actual="$4"
  if [ "$unexpected" != "$actual" ]; then
    _proof_record "$case_id" "$claim" pass "!= ${unexpected}" "$actual" ""
  else
    _proof_record "$case_id" "$claim" fail "!= ${unexpected}" "$actual" "values are equal"
  fi
  return 0
}

proof_assert_contains() {
  local case_id="$1" claim="$2" needle="$3" haystack="$4"
  if printf '%s' "$haystack" | grep -qF -- "$needle"; then
    _proof_record "$case_id" "$claim" pass "contains ${needle}" "$needle" ""
  else
    _proof_record "$case_id" "$claim" fail "contains ${needle}" "$haystack" "needle not found"
  fi
  return 0
}

proof_assert_lt() {
  local case_id="$1" claim="$2" a="$3" b="$4"
  if jq -e -n --argjson a "$a" --argjson b "$b" '($a < $b)' >/dev/null; then
    _proof_record "$case_id" "$claim" pass "${a} < ${b}" "${a}" ""
    return 0
  fi
  _proof_record "$case_id" "$claim" fail "${a} < ${b}" "${a}" "${a} is not less than ${b}"
  return 0
}

proof_assert_gt() {
  local case_id="$1" claim="$2" a="$3" b="$4"
  if jq -e -n --argjson a "$a" --argjson b "$b" '($a > $b)' >/dev/null; then
    _proof_record "$case_id" "$claim" pass "${a} > ${b}" "${a}" ""
    return 0
  fi
  _proof_record "$case_id" "$claim" fail "${a} > ${b}" "${a}" "${a} is not greater than ${b}"
  return 0
}

# proof_assert_status compares the status from a previously-recorded
# probe against an expected value. The probe must have run via lib/http.sh.
proof_assert_status() {
  local case_id="$1" claim="$2" expected="$3" probe_name="$4"
  local actual
  actual=$(proof_status_of "$case_id" "$probe_name" 2>/dev/null || echo missing)
  proof_assert_eq "$case_id" "$claim" "$expected" "$actual"
}

# proof_assert_json_eq: jq-based equality on a response body or a file.
# body_or_file: if it starts with @, read from the file; otherwise treat
# it as a literal JSON string.
proof_assert_json_eq() {
  local case_id="$1" claim="$2" jq_path="$3" expected="$4" source="$5"
  local actual
  if [[ "$source" == @* ]]; then
    actual=$(jq -r "$jq_path" "${source#@}" 2>/dev/null || echo "")
  else
    actual=$(printf '%s' "$source" | jq -r "$jq_path" 2>/dev/null || echo "")
  fi
  proof_assert_eq "$case_id" "$claim" "$expected" "$actual"
}

proof_skip() {
  local case_id="$1" claim="$2" reason="$3"
  _proof_record "$case_id" "$claim" skip "" "" "$reason"
  return 0
}
diff --git a/tests/proof/lib/env.sh b/tests/proof/lib/env.sh
new file mode 100644
index 0000000..2480e8a
--- /dev/null
+++ b/tests/proof/lib/env.sh
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
# lib/env.sh — proof harness environment.
#
# Sourced once by run_proof.sh and by every case script. Establishes:
#   - service URLs (gateway and direct ports)
#   - report directory paths
#   - run context (git SHA, hostname, timestamp)
#   - mode (contract|integration|performance)
#
# Cases read from these vars; never re-set them.

# Repo root — every path the harness emits is anchored here so report
# JSON is portable across reviewer machines.
PROOF_REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
export PROOF_REPO_ROOT

# Service endpoints. Match internal/shared/config.go defaults; every
# binary binds 127.0.0.1 per the audit's R-007 mitigation.
export PROOF_GATEWAY_URL="${PROOF_GATEWAY_URL:-http://127.0.0.1:3110}"
export PROOF_STORAGED_URL="${PROOF_STORAGED_URL:-http://127.0.0.1:3211}"
export PROOF_CATALOGD_URL="${PROOF_CATALOGD_URL:-http://127.0.0.1:3212}"
export PROOF_INGESTD_URL="${PROOF_INGESTD_URL:-http://127.0.0.1:3213}"
export PROOF_QUERYD_URL="${PROOF_QUERYD_URL:-http://127.0.0.1:3214}"
export PROOF_VECTORD_URL="${PROOF_VECTORD_URL:-http://127.0.0.1:3215}"
export PROOF_EMBEDD_URL="${PROOF_EMBEDD_URL:-http://127.0.0.1:3216}"

# Mode + report directory — set by run_proof.sh before sourcing cases.
# Defaulted here so cases sourced standalone for debug still work.
export PROOF_MODE="${PROOF_MODE:-contract}"

if [ -z "${PROOF_REPORT_DIR:-}" ]; then
  ts="$(date -u +%Y%m%d-%H%M%SZ)"
  export PROOF_REPORT_DIR="${PROOF_REPO_ROOT}/tests/proof/reports/proof-${ts}"
fi

mkdir -p \
  "${PROOF_REPORT_DIR}/raw/http" \
  "${PROOF_REPORT_DIR}/raw/logs" \
  "${PROOF_REPORT_DIR}/raw/outputs" \
  "${PROOF_REPORT_DIR}/raw/metrics" \
  "${PROOF_REPORT_DIR}/raw/cases"

# Run context — captured once per run by run_proof.sh, but recomputed
# here as a fallback if a case is invoked standalone.
if [ ! -f "${PROOF_REPORT_DIR}/raw/context.json" ]; then
  git_sha="$(cd "$PROOF_REPO_ROOT" && git rev-parse HEAD 2>/dev/null || echo unknown)"
  git_dirty="$(cd "$PROOF_REPO_ROOT" && [ -n "$(git status --porcelain 2>/dev/null)" ] && echo true || echo false)"
  cat > "${PROOF_REPORT_DIR}/raw/context.json" <<EOF
{"git_sha":"${git_sha}","git_dirty":${git_dirty},"hostname":"$(hostname)","timestamp_utc":"$(date -u -Iseconds)","mode":"${PROOF_MODE}"}
EOF
fi

export PROOF_GIT_SHA="$(cd "$PROOF_REPO_ROOT" && git rev-parse HEAD 2>/dev/null || echo unknown)"
diff --git a/tests/proof/lib/http.sh b/tests/proof/lib/http.sh
new file mode 100644
index 0000000..c6d6dd4
--- /dev/null
+++ b/tests/proof/lib/http.sh
@@ -0,0 +1,111 @@
#!/usr/bin/env bash
# lib/http.sh — curl wrappers that capture latency, status, body.
#
# Each request emits a JSON file under raw/http/<case_id>/<probe>.json
# describing the round-trip. Cases consume the JSON via assert.sh.
#
# Why JSON files instead of bash variables: gives the final report a
# diffable, replayable record. Future runs can compare on disk without
# re-executing the case.
#
# Functions:
#   proof_get    <case_id> <probe> <url> [extra-curl-args...]
#   proof_post   <case_id> <probe> <url> <content_type> <body> [extra-curl-args...]
#   proof_put    <case_id> <probe> <url> <content_type> <body|@file> [extra-curl-args...]
#   proof_delete <case_id> <probe> <url> [extra-curl-args...]
#
# Returns 0 always (capture is independent of HTTP outcome).
# Stores result at: $PROOF_REPORT_DIR/raw/http/<case_id>/<probe>.json
# Stores body at:   $PROOF_REPORT_DIR/raw/http/<case_id>/<probe>.body

_proof_http_emit() {
  local case_id="$1" probe="$2" method="$3" url="$4" status="$5" latency_ms="$6" body_path="$7" headers_path="$8"
  local dir="${PROOF_REPORT_DIR}/raw/http/${case_id}"
  mkdir -p "$dir"
  local body_sha=""
  [ -s "$body_path" ] && body_sha="$(sha256sum "$body_path" | awk '{print $1}')"
  cat > "${dir}/${probe}.json" <<EOF
{"case_id":"${case_id}","probe":"${probe}","method":"${method}","url":"${url}","status":"${status}","latency_ms":${latency_ms},"body_sha256":"${body_sha}","body_path":"${body_path}","headers_path":"${headers_path}"}
EOF
}

_proof_http_run() {
  local case_id="$1" probe="$2" method="$3" url="$4"; shift 4
  local dir="${PROOF_REPORT_DIR}/raw/http/${case_id}"
  mkdir -p "$dir"
  local body_path="${dir}/${probe}.body" headers_path="${dir}/${probe}.headers"
  local start_ms end_ms status
  start_ms=$(date +%s%3N)
  status=$(curl -sS -o "$body_path" -D "$headers_path" -w '%{http_code}' \
    -X "$method" "$url" "$@" 2>/dev/null || echo 0)
  end_ms=$(date +%s%3N)
  local latency_ms=$((end_ms - start_ms))
  _proof_http_emit "$case_id" "$probe" "$method" "$url" "$status" "$latency_ms" "$body_path" "$headers_path"
  cat "$body_path"
}

proof_get() {
  local case_id="$1" probe="$2" url="$3"; shift 3
  _proof_http_run "$case_id" "$probe" GET "$url" "$@"
}

proof_post() {
  local case_id="$1" probe="$2" url="$3" content_type="$4" body="$5"; shift 5
  _proof_http_run "$case_id" "$probe" POST "$url" \
    -H "Content-Type: ${content_type}" \
    --data "$body" \
    "$@"
}

# proof_put accepts either an inline body or an @-prefixed file path
# (curl --upload-file semantics for streaming).
proof_put() {
  local case_id="$1" probe="$2" url="$3" content_type="$4" body="$5"; shift 5
  if [[ "$body" == @* ]]; then
    local file="${body#@}"
    _proof_http_run "$case_id" "$probe" PUT "$url" \
      -H "Content-Type: ${content_type}" \
      --upload-file "$file" \
      "$@"
  else
    _proof_http_run "$case_id" "$probe" PUT "$url" \
      -H "Content-Type: ${content_type}" \
      --data "$body" \
      "$@"
  fi
}

proof_delete() {
  local case_id="$1" probe="$2" url="$3"; shift 3
  _proof_http_run "$case_id" "$probe" DELETE "$url" "$@"
}

# Helper accessors — read the per-probe JSON.
proof_status_of() {
  local case_id="$1" probe="$2"
  jq -r '.status' "${PROOF_REPORT_DIR}/raw/http/${case_id}/${probe}.json"
}

proof_body_of() {
  local case_id="$1" probe="$2"
  cat "${PROOF_REPORT_DIR}/raw/http/${case_id}/${probe}.body"
}

proof_latency_of() {
  local case_id="$1" probe="$2"
  jq -r '.latency_ms' "${PROOF_REPORT_DIR}/raw/http/${case_id}/${probe}.json"
}
diff --git a/tests/proof/lib/metrics.sh b/tests/proof/lib/metrics.sh
new file mode 100644
index 0000000..347d8d9
--- /dev/null
+++ b/tests/proof/lib/metrics.sh
@@ -0,0 +1,82 @@
#!/usr/bin/env bash
# lib/metrics.sh — performance measurements for performance mode.
#
# Functions:
#   proof_metric_start       <case_id> <name>
#   proof_metric_stop        <case_id> <name>             # writes <name>_ms
#   proof_metric_value       <case_id> <name> <value> [unit]
#   proof_sample_rss         <case_id> <process-pattern>  # MB
#   proof_compute_percentile <file> <pct>                 # 50, 95, 99
#
# All metrics emit to:
#   $PROOF_REPORT_DIR/raw/metrics/<case_id>.jsonl

_proof_metric_emit() {
  local case_id="$1" name="$2" value="$3" unit="$4" detail="$5"
  local out="${PROOF_REPORT_DIR}/raw/metrics/${case_id}.jsonl"
  mkdir -p "$(dirname "$out")"
  local e_detail
  e_detail=$(printf '%s' "$detail" | jq -Rs .)
  cat >> "$out" <<EOF
{"ts":"$(date -u -Iseconds)","case_id":"${case_id}","name":"${name}","value":${value},"unit":"${unit}","detail":${e_detail}}
EOF
}

proof_metric_start() {
  local case_id="$1" name="$2"
  local f="${PROOF_REPORT_DIR}/raw/metrics/_timer_${case_id}_${name}"
  date +%s%3N > "$f"
}

proof_metric_stop() {
  local case_id="$1" name="$2"
  local f="${PROOF_REPORT_DIR}/raw/metrics/_timer_${case_id}_${name}"
  if [ ! -f "$f" ]; then
    echo "[metrics] timer ${name} for case ${case_id} not started" >&2
    return 1
  fi
  local start_ms end_ms elapsed_ms
  start_ms=$(cat "$f")
  end_ms=$(date +%s%3N)
  elapsed_ms=$((end_ms - start_ms))
  rm -f "$f"
  _proof_metric_emit "$case_id" "${name}_ms" "$elapsed_ms" "ms" ""
  echo "$elapsed_ms"
}

proof_metric_value() {
  local case_id="$1" name="$2" value="$3" unit="${4:-}"
  _proof_metric_emit "$case_id" "$name" "$value" "$unit" ""
}

# Sample resident-set size (MB) for the first matching process.
proof_sample_rss() {
  local case_id="$1" pattern="$2"
  local pid rss_kb rss_mb
  pid=$(pgrep -f "$pattern" | head -1)
  if [ -z "$pid" ]; then
    _proof_metric_emit "$case_id" "rss_${pattern//\//_}_mb" 0 "MB" "process not found"
    return 1
  fi
  rss_kb=$(awk '/^VmRSS:/ {print $2}' "/proc/$pid/status" 2>/dev/null || echo 0)
  rss_mb=$(awk -v k="$rss_kb" 'BEGIN{printf "%.1f", k/1024}')
  _proof_metric_emit "$case_id" "rss_${pattern//\//_}_mb" "$rss_mb" "MB" "pid=${pid}"
  echo "$rss_mb"
}

# proof_compute_percentile: streams values from a file (one number per
# line), prints the requested percentile (nearest-rank).
proof_compute_percentile() {
  local file="$1" pct="$2"
  sort -n "$file" | awk -v pct="$pct" '
    { v[NR] = $1 }
    END {
      n = NR
      if (n == 0) { print "0"; exit }
      idx = int((pct/100.0) * n)
      if (idx < 1) idx = 1
      if (idx > n) idx = n
      print v[idx]
    }
  '
}
diff --git a/tests/proof/reports/.gitkeep b/tests/proof/reports/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tests/proof/run_proof.sh b/tests/proof/run_proof.sh
new file mode 100755
index 0000000..fca96c6
--- /dev/null
+++ b/tests/proof/run_proof.sh
@@ -0,0 +1,257 @@
#!/usr/bin/env bash
# run_proof.sh — orchestrator for the proof harness.
#
# Usage:
#   tests/proof/run_proof.sh --mode contract
#   tests/proof/run_proof.sh --mode integration
#   tests/proof/run_proof.sh --mode performance
#   tests/proof/run_proof.sh --mode integration --no-bootstrap  # assume services up
#   tests/proof/run_proof.sh --regenerate-rankings              # rebuild expected/rankings.json
#
# Bootstraps services (storaged → catalogd → ingestd → queryd →
# vectord → embedd → gateway) once at the start unless --no-bootstrap.
# Iterates matching cases in numerical order. Aggregates per-case JSONL
# evidence into summary.md + summary.json under tests/proof/reports/proof-<ts>/.
#
# Designed per CLAUDE_REFACTOR_GUARDRAILS.md: bash + curl + jq only,
# no Go test framework, no DSL. Each case is a thin shell script that
# sources lib/*.sh and writes evidence; this harness orchestrates them.
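# Illustrative console flow (line shapes match the echo statements
# below; values vary per run):
#
#   $ tests/proof/run_proof.sh --mode contract
#   [run_proof] mode=contract report=tests/proof/reports/proof-<ts>
#   [run_proof] git_sha=<sha>
#   [run_proof] bootstrap: building binaries...
#   [run_proof] bootstrap: launching services in dep order...
#     ✓ storaged (:3211) booted
#   [run_proof] cases for mode=contract: <n>
#   [run_proof] DONE — summary: tests/proof/reports/proof-<ts>/summary.md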
+ +set -uo pipefail + +# ── arg parsing ──────────────────────────────────────────────────────────── +MODE="contract" +NO_BOOTSTRAP=0 +REGENERATE_RANKINGS=0 +REGENERATE_BASELINE=0 + +while [ $# -gt 0 ]; do + case "$1" in + --mode) MODE="$2"; shift 2 ;; + --mode=*) MODE="${1#--mode=}"; shift ;; + --no-bootstrap) NO_BOOTSTRAP=1; shift ;; + --regenerate-rankings) REGENERATE_RANKINGS=1; shift ;; + --regenerate-baseline) REGENERATE_BASELINE=1; shift ;; + -h|--help) + sed -n '1,16p' "$0" | sed 's/^# *//' + exit 0 ;; + *) echo "unknown arg: $1" >&2; exit 2 ;; + esac +done + +case "$MODE" in + contract|integration|performance) ;; + *) echo "[run_proof] invalid --mode '$MODE' (must be contract|integration|performance)" >&2; exit 2 ;; +esac + +export PROOF_MODE="$MODE" +export PROOF_REGENERATE_RANKINGS="$REGENERATE_RANKINGS" +export PROOF_REGENERATE_BASELINE="$REGENERATE_BASELINE" + +# ── env setup ───────────────────────────────────────────────────────────── +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR/../.." + +# Establish the report directory before sourcing env.sh so cases see it. +ts="$(date -u +%Y%m%d-%H%M%SZ)" +export PROOF_REPORT_DIR="$(pwd)/tests/proof/reports/proof-${ts}" +mkdir -p "$PROOF_REPORT_DIR" + +# shellcheck source=lib/env.sh +source "${SCRIPT_DIR}/lib/env.sh" +# shellcheck source=lib/http.sh +source "${SCRIPT_DIR}/lib/http.sh" +# shellcheck source=lib/assert.sh +source "${SCRIPT_DIR}/lib/assert.sh" +# shellcheck source=lib/metrics.sh +source "${SCRIPT_DIR}/lib/metrics.sh" + +echo "[run_proof] mode=${MODE} report=${PROOF_REPORT_DIR}" +echo "[run_proof] git_sha=${PROOF_GIT_SHA}" + +# ── service lifecycle ──────────────────────────────────────────────────── +PIDS=() +WE_BOOTED=0 + +cleanup() { + if [ "$WE_BOOTED" -eq 1 ] && [ "${#PIDS[@]}" -gt 0 ]; then + echo "[run_proof] cleanup: killing ${#PIDS[@]} services we started" + kill "${PIDS[@]}" 2>/dev/null || true + wait 2>/dev/null || true + fi +} +trap cleanup EXIT INT TERM + +poll_health() { + local name="$1" port="$2" deadline=$(($(date +%s) + 8)) + while [ "$(date +%s)" -lt "$deadline" ]; do + if curl -sS --max-time 1 "http://127.0.0.1:${port}/health" >/dev/null 2>&1; then + return 0 + fi + sleep 0.1 + done + return 1 +} + +bootstrap_services() { + echo "[run_proof] bootstrap: building binaries..." + export PATH="/usr/local/go/bin:${PATH}" + if ! go build -o bin/ ./cmd/... > "${PROOF_REPORT_DIR}/raw/logs/build.log" 2>&1; then + echo "[run_proof] BUILD FAILED — see raw/logs/build.log" + return 1 + fi + + echo "[run_proof] bootstrap: launching services in dep order..." + for SPEC in "storaged:3211" "catalogd:3212" "ingestd:3213" "queryd:3214" "vectord:3215" "embedd:3216" "gateway:3110"; do + local NAME="${SPEC%:*}" PORT="${SPEC#*:}" + # Skip if already up. + if curl -sS --max-time 1 "http://127.0.0.1:${PORT}/health" >/dev/null 2>&1; then + echo " ✓ ${NAME} (:${PORT}) already up — leaving as-is" + continue + fi + ./bin/"$NAME" > "${PROOF_REPORT_DIR}/raw/logs/${NAME}.log" 2>&1 & + PIDS+=("$!") + if poll_health "$NAME" "$PORT"; then + echo " ✓ ${NAME} (:${PORT}) booted" + WE_BOOTED=1 + else + echo " ✗ ${NAME} (:${PORT}) failed to bind in 8s — see raw/logs/${NAME}.log" + tail -20 "${PROOF_REPORT_DIR}/raw/logs/${NAME}.log" | sed 's/^/ /' + return 1 + fi + done +} + +if [ "$NO_BOOTSTRAP" -eq 0 ]; then + if ! 
bootstrap_services; then + echo "[run_proof] FATAL — bootstrap failed" + exit 1 + fi +else + echo "[run_proof] --no-bootstrap — assuming services already up" +fi + +# ── case discovery + filtering ─────────────────────────────────────────── +discover_cases() { + # Returns case files matching the current mode, sorted by NN prefix. + # Each case declares CASE_TYPE; we re-source in a subshell to read it. + local f case_type + for f in "${SCRIPT_DIR}/cases/"*.sh; do + [ -e "$f" ] || continue + case_type=$(bash -c "source '$f' --metadata-only 2>/dev/null; echo \${CASE_TYPE:-}" 2>/dev/null || echo "") + # contract mode runs contract cases only + # integration mode runs contract + integration + # performance mode runs contract + integration + performance + case "$MODE:$case_type" in + contract:contract|\ + integration:contract|integration:integration|\ + performance:contract|performance:integration|performance:performance) + echo "$f" ;; + esac + done +} + +CASES=() +while IFS= read -r line; do CASES+=("$line"); done < <(discover_cases) + +echo "[run_proof] cases for mode=${MODE}: ${#CASES[@]}" + +# ── case execution ─────────────────────────────────────────────────────── +CASE_PASS=0 +CASE_FAIL=0 +CASE_SKIP=0 +REQUIRED_FAIL=0 + +for case_file in "${CASES[@]}"; do + case_name=$(basename "$case_file" .sh) + echo "" + echo "[run_proof] running ${case_name} ..." + SECONDS=0 + if bash "$case_file" >> "${PROOF_REPORT_DIR}/raw/logs/${case_name}.log" 2>&1; then + echo " → wrapper exit 0 (${SECONDS}s)" + else + echo " → wrapper exit non-zero (${SECONDS}s) — see raw/logs/${case_name}.log" + fi +done + +# ── aggregation ────────────────────────────────────────────────────────── +echo "" +echo "[run_proof] aggregating evidence..." + +ALL_RECORDS_FILE="${PROOF_REPORT_DIR}/raw/all_records.jsonl" +> "$ALL_RECORDS_FILE" +for f in "${PROOF_REPORT_DIR}/raw/cases/"*.jsonl; do + [ -e "$f" ] || continue + cat "$f" >> "$ALL_RECORDS_FILE" +done + +# grep -c exits 1 with output "0" when no matches; the `|| echo 0` form +# concatenates "0\n0" and breaks jq --argjson + arithmetic. Capture the +# count and force a clean integer fallback on non-zero exit. +_count() { + local pattern="$1" file="$2" n + n=$(grep -c "$pattern" "$file" 2>/dev/null) || n=0 + echo "$n" +} + +if [ -s "$ALL_RECORDS_FILE" ]; then + pass=$(_count '"result":"pass"' "$ALL_RECORDS_FILE") + fail=$(_count '"result":"fail"' "$ALL_RECORDS_FILE") + skip=$(_count '"result":"skip"' "$ALL_RECORDS_FILE") +else + pass=0; fail=0; skip=0 +fi + +# summary.json +jq -n \ + --arg mode "$MODE" \ + --arg ts "$(date -u -Iseconds)" \ + --arg sha "$PROOF_GIT_SHA" \ + --argjson pass "$pass" \ + --argjson fail "$fail" \ + --argjson skip "$skip" \ + --argjson cases "${#CASES[@]}" \ + '{mode: $mode, timestamp_utc: $ts, git_sha: $sha, + counts: {pass: $pass, fail: $fail, skip: $skip}, + cases_run: $cases, evidence_dir: "raw/"}' \ + > "${PROOF_REPORT_DIR}/summary.json" + +# summary.md +{ + echo "# proof-${ts} — ${MODE} mode" + echo "" + echo "- git_sha: \`${PROOF_GIT_SHA}\`" + echo "- timestamp: $(date -u -Iseconds)" + echo "- cases run: ${#CASES[@]}" + echo "- assertions: ${pass} pass · ${fail} fail · ${skip} skip" + echo "" + echo "## per-case-id" + echo "" + echo "| case_id | pass | fail | skip |" + echo "|---|---:|---:|---:|" + # Iterate JSONL files (one per CASE_ID), not case scripts — a single + # case file may emit under multiple CASE_IDs and this preserves the + # mapping faithfully. 
+ for jsonl in "${PROOF_REPORT_DIR}/raw/cases/"*.jsonl; do + [ -e "$jsonl" ] || continue + cid=$(basename "$jsonl" .jsonl) + cp=$(_count '"result":"pass"' "$jsonl") + cfl=$(_count '"result":"fail"' "$jsonl") + cs=$(_count '"result":"skip"' "$jsonl") + echo "| ${cid} | ${cp} | ${cfl} | ${cs} |" + done + echo "" + if [ "$fail" -gt 0 ]; then + echo "## failed assertions" + echo "" + grep '"result":"fail"' "$ALL_RECORDS_FILE" | jq -r '"- **\(.case_id)** — \(.claim) — expected: \(.expected) actual: \(.actual)"' + fi +} > "${PROOF_REPORT_DIR}/summary.md" + +# ── exit ───────────────────────────────────────────────────────────────── +echo "" +echo "[run_proof] DONE — summary: ${PROOF_REPORT_DIR}/summary.md" +echo " ${pass} pass · ${fail} fail · ${skip} skip" + +if [ "$fail" -gt 0 ]; then exit 1; fi +exit 0
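Since the README promises that a clean re-run of the same SHA yields identical evidence (modulo timestamps and embedding noise), a reviewer can diff two runs directly. A sketch with illustrative report paths:

```bash
# Compare two proof runs of the same git SHA; only the timestamp field
# and nondeterministic embedding noise should differ.
diff <(jq -S 'del(.timestamp_utc)' tests/proof/reports/proof-A/summary.json) \
     <(jq -S 'del(.timestamp_utc)' tests/proof/reports/proof-B/summary.json)
```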