From 73f242e3e41c2aa36b35fe9de54742b248915cb5 Mon Sep 17 00:00:00 2001 From: root Date: Sun, 26 Apr 2026 23:54:31 -0500 Subject: [PATCH] =?UTF-8?q?distillation:=20Phase=209=20=E2=80=94=20release?= =?UTF-8?q?=20freeze=20and=20operator=20handoff?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Final phase. Adds: scripts/distillation/release_freeze.ts ~330 lines, 6 release gates docs/distillation/operator-handoff.md durable cold-start operator doc docs/distillation/recovery-runbook.md failure-mode runbook by symptom scripts/distillation/distill.ts +release-freeze subcommand The release_freeze orchestrator runs every gate the system has: 1. Clean git state (tolerates auto-regenerated reports) 2. Full test suite (bun test tests/distillation auditor/schemas/distillation) 3. Phase commit verification (every Phase 0-8 commit resolves) 4. Acceptance gate (22-invariant fixture E2E) 5. audit-full (Phases 0-7 verified + drift detection) 6. Tag availability check (distillation-v1.0.0 not yet existing) Outputs: reports/distillation/release-freeze.md human-readable manifest reports/distillation/release-manifest.json machine-readable manifest Manifest captures: - git_head + git_branch + released_at - phase→commit map for all 9 commits (Phase 0+1+2 scaffold through Phase 8 audit) - dataset counts at freeze (RAG/SFT/Preference/evidence/scored/quarantined) - latest audit baseline row - per-gate pass/fail with detail Operator handoff doc covers: - phase map with commits + report locations - known-good commands - how to rerun audit-full + inspect drift - how to restore from last-good (git checkout distillation-v1.0.0) - how to add future phases without contaminating corpus - what NOT to modify casually (with file:reason mapping) - cumulative commits at v1.0.0 Recovery runbook covers, by symptom: - audit-full exit non-zero (per-phase diagnostics) - drift table flags warn (intentional vs regression) - acceptance fail vs audit-full pass 
divergence - run-all empty exports (counter-bisection order) - hash mismatch on identical input (determinism violation; CRITICAL) - replay logs growing unbounded (rotation guidance) - nuclear restore via git checkout distillation-v1.0.0 Spec constraints (per now.md Phase 9): - DO NOT add new intelligence features ✓ (zero new logic) - DO NOT change scoring/export logic ✓ (zero touches) - DO NOT weaken gates ✓ (gates only added, never relaxed beyond the auto-regen tolerance documented in checkCleanGit) - DO NOT retrain anything ✓ (no model touches) CLI: ./scripts/distill release-freeze # exit 0 = release-ready Tag creation deferred to operator confirmation (the release-freeze report prints the exact `git tag` command). Per CLAUDE.md guidance, destructive/visible operations like tags require explicit user authorization. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/distillation/operator-handoff.md | 191 +++++++++++++ docs/distillation/recovery-runbook.md | 208 ++++++++++++++ scripts/distillation/distill.ts | 8 + scripts/distillation/release_freeze.ts | 362 +++++++++++++++++++++++++ 4 files changed, 769 insertions(+) create mode 100644 docs/distillation/operator-handoff.md create mode 100644 docs/distillation/recovery-runbook.md create mode 100644 scripts/distillation/release_freeze.ts diff --git a/docs/distillation/operator-handoff.md b/docs/distillation/operator-handoff.md new file mode 100644 index 0000000..c63fcb6 --- /dev/null +++ b/docs/distillation/operator-handoff.md @@ -0,0 +1,191 @@ +# Distillation System — Operator Handoff + +**Version:** v1.0.0 +**Branch:** `scrum/auto-apply-19814` +**Tag:** `distillation-v1.0.0` +**Audit baseline:** `data/_kb/audit_baselines.jsonl` (auto-grown per audit-full run) + +This is the operator-level handoff for the distillation system. If you are picking this up cold, **read this doc first**, then `docs/recon/local-distillation-recon.md`. Skim the per-phase reports under `reports/distillation/` only when you need detail. 
+ +## What this system does + +Turns real Lakehouse execution traces (1052 records sampled at v1.0.0 freeze) into clean, gated training datasets: + +- **RAG corpus** — 446 grounded examples for retrieval-augmentation +- **SFT corpus** — 351 instruction→response pairs (strict accepted-only) +- **Preference corpus** — 83 chosen/rejected pairs (zero self-pairs, zero identical-text) + +It is **NOT** a model trainer. It is a **knowledge refinery** that produces training-safe substrate. The local-model "replay" runtime (Phase 7) demonstrates that retrieval against this substrate makes a 7B-class model behave like the system instead of fabricating audit verdicts. + +## Phase map + +| Phase | What it does | Commit | Report | +|---|---|---|---| +| 0 | Recon doc — inventory of source streams + integration plan | 27b1d27 | `docs/recon/local-distillation-recon.md` | +| 1 | 9 schemas + 51 schema tests + foundation types | 27b1d27 | (in commit body) | +| 2 | Materializer: 12 source jsonls → unified EvidenceRecord at `data/evidence/YYYY/MM/DD/` | 1ea8029 | (in commit body) | +| 3 | Deterministic Success Scorer: EvidenceRecord → ScoredRun (4 categories, no LLM) | c989253 | (in commit body) | +| 4 | RAG/SFT/Preference exports + quarantine system | 68b6697 | `reports/distillation/phase4-export-report.md` | +| 5 | Receipts harness — per-stage StageReceipt + RunSummary + DriftReport | 2cf359a | `reports/distillation/phase5-receipts-report.md` | +| 6 | Acceptance gate — fixture-driven 22-invariant E2E test | 1b433a9 | `reports/distillation/phase6-acceptance-report.md` | +| 7 | Replay layer — retrieval-driven local-model bootstrap | 681f39d | `reports/distillation/phase7-replay-report.md` | +| 8 | Full system audit + drift baseline | 5bdd159 | `reports/distillation/phase8-full-audit-report.md` | + +The auditor rebuild (commit 20a039c) is wired to use the Phase 5 substrate: it now calls `lakehouse_answers_v1` matrix retrieval instead of tree-split shard summaries. 
Per-audit cost: 50× fewer cloud calls, 17× faster wall-clock. + +## Known-good commands + +All commands run from `/home/profit/lakehouse`. Use `bun run scripts/distillation/distill.ts <command>` or `./scripts/distill <command>` if symlinked. + +```bash +# Build everything end-to-end with structured receipts +./scripts/distill run-all + +# Read a specific run's summary + drift +./scripts/distill receipts --run-id <run_id> + +# Verify the system end-to-end on a deterministic fixture +./scripts/distill acceptance + +# Audit Phases 0-7 + drift detection vs prior baseline +./scripts/distill audit-full + +# Test a task through the replay layer (local model with retrieval) +./scripts/distill replay --task "<task>" +./scripts/distill replay --task "<task>" --no-retrieval # baseline / A/B +./scripts/distill replay --task "<task>" --allow-escalation # try deepseek if local fails + +# Per-stage one-shot (rare — prefer run-all for receipts) +./scripts/distill build-evidence +./scripts/distill score +./scripts/distill export-rag +./scripts/distill export-sft # strict accepted-only +./scripts/distill export-sft --include-partial # opens to partially_accepted +./scripts/distill export-preference +./scripts/distill export-all +``` + +## How to rerun the full audit + +```bash +./scripts/distill audit-full +``` + +Reads: +- on-disk `data/evidence/`, `data/scored-runs/`, `exports/{rag,sft,preference}/*` +- the most recent run_id under `reports/distillation/` +- the prior audit baseline at `data/_kb/audit_baselines.jsonl` + +Writes: +- `reports/distillation/phase8-full-audit-report.md` +- a new row to `data/_kb/audit_baselines.jsonl` (auto-grown — never overwrite) + +Exit code 0 = pass (every required check held). Non-zero = at least one required check failed. + +## How to inspect drift + +Two levels: + +1. **Per-run drift** — every `run-all` writes `reports/distillation/<run_id>/drift.json`. Compares to the most recent prior run. Severity `ok | warn | alert`. + +2. 
**Cross-run baseline drift** — `audit-full` reads the latest baseline row from `data/_kb/audit_baselines.jsonl` and compares 10 tracked metrics (record counts, category distribution, export sizes, quarantine totals). Drift table appears in `reports/distillation/phase8-full-audit-report.md` with `>20%` flagged as `warn`. + +The baseline file is **append-only**. Don't truncate it — its value grows with the longitudinal record. If a metric flips `warn` after a code change, the row before that change is the diagnostic anchor. + +## How to restore from last good state + +```bash +git fetch --tags +git checkout distillation-v1.0.0 +./scripts/distill audit-full # confirm 16/16 required pass at v1.0.0 +``` + +If you've made changes that broke the system, hard reset to v1.0.0: + +```bash +git reset --hard distillation-v1.0.0 # destructive — loses uncommitted work +./scripts/distill acceptance # confirm 22/22 fixture invariants +./scripts/distill audit-full # confirm baseline match +``` + +## How to add future phases without contaminating the corpus + +The corpus = `exports/rag/playbooks.jsonl` + `exports/sft/instruction_response.jsonl` + `exports/preference/chosen_rejected.jsonl`. These are training-safe **only if** every gate held. To add Phase 10+: + +1. Add code under `scripts/distillation/.ts`. Do NOT modify Phases 0-8. +2. If your phase produces evidence, append to `data/_kb/.jsonl` and add a transform in `scripts/distillation/transforms.ts`. The materializer picks it up automatically. +3. If your phase needs a new schema, create `auditor/schemas/distillation/.ts` with `_SCHEMA_VERSION = 1` constant + validator + tests in `auditor/schemas/distillation/schemas.test.ts` (positive + negative fixtures). +4. Run `./scripts/distill audit-full` BEFORE merging. Confirm 16/16 still passes. +5. Run `./scripts/distill acceptance`. Confirm 22/22 still passes. +6. Re-run `./scripts/distill run-all`. Inspect drift in the new run's `drift.json`. 
Anything `>20%` in record counts means your phase moved the corpus — explain it in the commit. + +## What NOT to modify casually + +These have explicit firewalls. Touching them = potentially weakening contamination prevention: + +| File | Why fragile | +|---|---| +| `auditor/schemas/distillation/sft_sample.ts` | The `quality_score` enum literally enforces "no rejected/needs_human_review in SFT". Loosening it = silent leak | +| `scripts/distillation/export_sft.ts` `SFT_NEVER` constant | Second-layer defense. If schema fails, this catches it | +| `scripts/distillation/export_sft.ts` re-read validation | Third layer — re-reads on-disk SFT and fails LOUD if forbidden quality_score appears | +| `scripts/distillation/scorer.ts` category mapping | Changing rules → silent corpus shift. Run `audit-full` after any change to see drift | +| `tests/fixtures/distillation/acceptance/` | The fixture is the gate. Changing it = changing the bar | +| `data/_kb/audit_baselines.jsonl` | Append-only. Truncating loses longitudinal drift signal | + +If you must change one of these, run `audit-full` BEFORE and AFTER. The drift table will tell you exactly what your change moved. + +## Receipt-vs-drift quick reference + +If `audit-full` flags a metric: +- `>20%` swing in `p3_accepted` → scorer rules changed OR source data shifted +- `>20%` swing in `p4_sft_rows` → SFT eligibility changed (check exporter filter) +- `>20%` swing in `p4_total_quarantined` → either source data is dirtier OR a gate got tighter +- Hash mismatch on identical input → determinism violation; revert immediately + +If `acceptance` fails: +- 22 invariants are pinned in `scripts/distillation/acceptance.ts`. The failing one names what broke. +- Spec invariants (1-22) are documented in `reports/distillation/phase6-acceptance-report.md`. + +## Pointers to non-distillation systems + +The auditor (`auditor/`) and the gateway (`crates/gateway/`) are the consumers of the distillation substrate. 
They use it but are not part of it: + +- Auditor's `pr_audit` mode (`crates/gateway/src/v1/mode.rs`) retrieves from `lakehouse_answers_v1`. If you regenerate the RAG export, the auditor's context auto-improves on next call. +- The gateway's `/v1/chat` is the entry point all model calls flow through. Receipts capture provider, model, latency, prompt+completion tokens. + +## Provenance + +Every export row → traces to `data/scored-runs/.../.jsonl` line N → traces to `data/evidence/.../.jsonl` line N → traces to `data/_kb/.jsonl` line N. The `provenance.sig_hash` field (canonical sha256 of the source row, sorted keys) is the join key. + +If a downstream consumer asks "where did this SFT row come from", run: + +```bash +jq 'select(.id == "") | .provenance' exports/sft/instruction_response.jsonl +# returns {source_file, line_offset, sig_hash, recorded_at} +# Then: +sed -n "$(( + 1))p" data/scored-runs/ +# And so on back to data/_kb/.jsonl +``` + +## Test discipline + +```bash +bun test tests/distillation/ auditor/schemas/distillation/ +``` + +At v1.0.0: **145 tests, 0 fail, 372 expect() calls, ~600ms.** Any new phase must keep this at 0 fail. 
+ +## Cumulative commits at v1.0.0 + +``` +27b1d27 distillation: Phase 0 recon + Phase 1 schemas + Phase 2 transforms scaffold +1ea8029 distillation: Phase 2 — Evidence View materializer + health audit +c989253 distillation: Phase 3 — deterministic Success Scorer +68b6697 distillation: Phase 4 — dataset export layer +2cf359a distillation: Phase 5 — receipts harness (system-level observability) +1b433a9 distillation: Phase 6 — acceptance gate suite +20a039c auditor: rebuild on mode runner + drop tree-split (use distillation substrate) +681f39d distillation: Phase 7 — replay-driven local model bootstrapping +5bdd159 distillation: Phase 8 — full system audit + distillation: Phase 9 — release freeze and operator handoff +``` diff --git a/docs/distillation/recovery-runbook.md b/docs/distillation/recovery-runbook.md new file mode 100644 index 0000000..065d94a --- /dev/null +++ b/docs/distillation/recovery-runbook.md @@ -0,0 +1,208 @@ +# Distillation System — Recovery Runbook + +**Version:** v1.0.0 +**Audience:** Future operator (or future Claude session) inheriting this system in a broken state. + +This is the failure-mode runbook. Read top-down, stop at the first symptom that matches. + +## Symptom 1: `audit-full` exits non-zero + +A required check failed. The report at `reports/distillation/phase8-full-audit-report.md` will name the failing check verbatim. Map by phase: + +### P0 — recon doc missing +**Cause:** either the repo state is corrupted (the recon doc was deleted or moved), or the checked-out commit predates the recon doc, so it never existed at this commit. +**Fix:** +```bash +git checkout distillation-v1.0.0 -- docs/recon/local-distillation-recon.md +``` + +### P0 — tier-1 source stream missing +**Cause:** fresh-clone or post-rotation environment without source data. +**Severity:** informational only (audit-full reports as required=false). Pipeline will produce 0 rows but won't fail. +**Fix:** populate the source stream OR accept reduced output and note in the next report. 
+ +### P1 — schema validators fail +**Cause:** somebody modified a Phase 1 schema and broke a validator. +**Diagnostic:** +```bash +bun test auditor/schemas/distillation/ 2>&1 | grep -A2 "fail" +``` +**Fix:** revert the schema change. Phase 1 schemas are versioned (`_SCHEMA_VERSION = 1`); they bump deliberately, never silently. +```bash +git diff distillation-v1.0.0 -- auditor/schemas/distillation/ # see what changed +git checkout distillation-v1.0.0 -- auditor/schemas/distillation/ +``` + +### P2 — materializer dry-run fails / writes 0 +**Cause:** every source jsonl is empty OR every transform is broken. +**Diagnostic:** +```bash +ls -la data/_kb/*.jsonl # confirm sources have content +bun run scripts/distillation/build_evidence_index.ts --dry-run +# inspect skip reasons in data/_kb/distillation_skips.jsonl +``` +**Fix:** identify the broken transform via per-source row counts. If a real-data shape changed (e.g. a new field name in a source jsonl), update the matching transform in `scripts/distillation/transforms.ts`. Add a fixture row to `auditor/schemas/distillation/realdata.test.ts` covering the new shape. + +### P3 — scored-runs distribution empty +**Cause:** `data/scored-runs/` is missing OR the score categories are not landing. +**Fix:** +```bash +./scripts/distill score # re-runs scorer if data/evidence/ is populated +``` +If still empty, the scorer rules in `scripts/distillation/scorer.ts` may have changed. Re-run `bun test tests/distillation/scorer.test.ts` — if those fail, revert. + +### P4 — SFT contamination firewall caught a leak (CRITICAL) +**Severity:** alert. A `rejected` or `needs_human_review` row landed in SFT. This is a non-negotiable spec violation. +**Stop immediately:** +```bash +mv exports/sft/instruction_response.jsonl exports/sft/instruction_response.jsonl.QUARANTINED-$(date +%s) +``` +**Diagnostic:** the audit report's "P4 SFT contamination firewall" check shows the count. 
The forbidden row is in the file you just renamed: +```bash +jq 'select(.quality_score != "accepted" and .quality_score != "partially_accepted")' exports/sft/instruction_response.jsonl.QUARANTINED-* +``` +**Root cause is one of:** +1. SftSample schema validator was loosened — check `auditor/schemas/distillation/sft_sample.ts` against v1.0.0 +2. Exporter `SFT_NEVER` constant was loosened — check `scripts/distillation/export_sft.ts` +3. Schema validation was bypassed (someone called `appendFileSync` directly) — find the offending caller via `git log --oneline -p exports/sft/` + +**Recovery:** revert the offending change. Re-run `./scripts/distill export-sft`. The fresh output should pass the audit's leak check. + +### P4 — Preference self-pair leaked +**Cause:** export_preference's pairing logic produced `chosen_run_id == rejected_run_id`. Schema validator should catch this. +**Diagnostic:** +```bash +jq 'select(.chosen_run_id == .rejected_run_id) | .id' exports/preference/chosen_rejected.jsonl +``` +**Fix:** check `scripts/distillation/export_preference.ts::buildPair` — the equality guard at the top must remain. Revert if missing. + +### P5 — RunSummary fails to validate +**Cause:** receipts harness emitted a malformed summary. +**Diagnostic:** +```bash +LATEST=$(ls -1t reports/distillation/*/summary.json | head -1) +bun -e "import {validateRunSummary} from './auditor/schemas/distillation/run_summary'; const s = JSON.parse(await Bun.file('$LATEST').text()); console.log(validateRunSummary(s))" +``` +**Fix:** the field that failed validation is named in the validator output. Either fix the harness in `scripts/distillation/receipts.ts` or revert. + +### P6 — acceptance gate fails an invariant +**Cause:** something in Phases 1-5 changed in a way the fixture catches. 
+**Diagnostic:** run acceptance directly to see all 22 checks: +```bash +bun run scripts/distillation/acceptance.ts +# scan output; the failing check names what broke +cat reports/distillation/phase6-acceptance-report.md +``` +**Fix:** the report's "Failures" section names the invariant. The 22 invariants are documented at the top of `scripts/distillation/acceptance.ts`. Locate the affected phase code, revert. + +### P7 — replay validation regressed +**Cause:** local model output failed the structural validator OR retrieval found 0 playbooks. +**Diagnostic:** +```bash +./scripts/distill replay --task "" --local-only +# inspect the validation_result.reasons +``` +**Fix:** +- If validation reasons mention "filler/hedge phrase": the local model regressed (model swap?) — revert `LH_REPLAY_LOCAL_MODEL` to default +- If retrieval is empty: `exports/rag/playbooks.jsonl` is empty — re-run export-rag + +## Symptom 2: drift table flags `warn` + +A metric moved >20% from baseline. This is a SOFT alert — not a failure, but worth investigating before treating outputs as stable. + +**Diagnostic:** +```bash +jq -r '.metrics' data/_kb/audit_baselines.jsonl | tail -5 +# see the recent baseline trajectory +cat reports/distillation/phase8-full-audit-report.md +# read the drift table +``` + +**Common causes:** +- New source data → record counts grow → expected, not a regression +- Scorer rules changed → category distribution shifted → confirm intentional +- Exporter filter loosened → SFT/RAG counts grow → CHECK contamination firewall first + +**If the drift is intentional**, write a row to `data/_kb/audit_baselines.jsonl` documenting why (no schema for this — just append a JSON line with a `notes` field). Future audits will treat the new value as the new baseline. + +## Symptom 3: `acceptance` exits non-zero but `audit-full` doesn't + +This is rare — acceptance is stricter (22 invariants on a fixture vs audit-full's 16 required checks on real data). 
The failing acceptance check usually points to a broken assumption that real data hides. + +**Diagnostic:** +```bash +bun run scripts/distillation/acceptance.ts 2>&1 | head -50 +ls /tmp/distillation_phase6_acceptance/data/evidence/ # the failed run leaves its temp root +``` + +**Fix:** the acceptance script keeps the temp root on fail (cleans only on pass). Inspect `/tmp/distillation_phase6_acceptance/` to see what the pipeline produced vs expected. The fixture rows themselves are the contract — change the fixture deliberately, never to make a test pass. + +## Symptom 4: `run-all` produces empty exports + +**Cause:** Phase 2 or Phase 3 ran on empty input. + +**Diagnostic order:** +1. `ls data/_kb/*.jsonl` — sources present? +2. `find data/evidence -name "*.jsonl" | xargs wc -l` — any rows materialized? +3. `find data/scored-runs -name "*.jsonl" | xargs wc -l` — any rows scored? +4. `wc -l data/_kb/distillation_skips.jsonl data/_kb/scoring_skips.jsonl` — anything skipped? + +The first counter that's 0 names the broken phase. + +## Symptom 5: hash mismatch on identical input + +**Cause:** determinism violation. Same input → different output. This is a CRITICAL bug. + +**Diagnostic:** +```bash +# Wipe outputs but keep sources, run twice with same recorded_at, compare run_hash +rm -rf data/evidence data/scored-runs exports +RA="2026-04-27T00:00:00.000Z" # NOTE: RA is not consumed by the commands as written — pass it to BOTH run-all invocations via whatever option/env var pins recorded_at, otherwise the hashes may differ legitimately +./scripts/distill run-all # captures run_id_1 +LATEST_1=$(ls -1t reports/distillation/ | grep -v phase | head -1) + +rm -rf data/evidence data/scored-runs exports +./scripts/distill run-all +LATEST_2=$(ls -1t reports/distillation/ | grep -v phase | head -1) + +jq .run_hash reports/distillation/$LATEST_1/summary.json +jq .run_hash reports/distillation/$LATEST_2/summary.json +# These MUST match if recorded_at is fixed. +``` + +**If they don't match:** something in the pipeline introduced non-determinism. 
Common causes: +- A `Date.now()` baked into output (other than the explicit `recorded_at`) +- `Math.random()` or `randomUUID()` in a path that should be deterministic +- A `Map` iteration order issue (rare in V8) +- Concurrent writes to the same file + +Bisect against `distillation-v1.0.0` — find the commit that introduced the non-determinism, revert. + +## Symptom 6: replay logs growing unbounded + +Phase 7 replay appends to `data/_kb/replay_runs.jsonl` with no rotation. Acceptable until file >100MB, then: + +```bash +# Move and start fresh +mv data/_kb/replay_runs.jsonl data/_kb/replay_runs.archive.$(date +%s).jsonl +gzip data/_kb/replay_runs.archive.*.jsonl +``` + +This is documented as a Phase 7 carry-over. A future Phase 10+ could add rotation. + +## Last resort: nuclear restore + +If nothing works: + +```bash +git fetch --tags +git stash # save uncommitted work +git checkout distillation-v1.0.0 +./scripts/distill audit-full # confirm 16/16 pass at v1.0.0 +./scripts/distill acceptance # confirm 22/22 pass + +# Now diff against the broken state to see what changed +git diff distillation-v1.0.0..scrum/auto-apply-19814 -- scripts/distillation/ auditor/schemas/distillation/ +``` + +The diff is your bug. diff --git a/scripts/distillation/distill.ts b/scripts/distillation/distill.ts index 678d30c..74be358 100644 --- a/scripts/distillation/distill.ts +++ b/scripts/distillation/distill.ts @@ -112,6 +112,13 @@ async function main() { if (!r.validation_result.passed && !process.argv.includes("--allow-escalation")) process.exit(1); break; } + case "release-freeze": { + // Phase 9 — orchestrate every gate before declaring v1.0.0. + const r = spawnSync("bun", ["run", "scripts/distillation/release_freeze.ts"], { + cwd: DEFAULT_ROOT, stdio: "inherit", + }); + process.exit(r.status ?? 1); + } case "audit-full": { // Phase 8 — meta-audit across Phases 0-7. Spawns the script so // its non-zero exit propagates and the report path is shown. 
@@ -160,6 +167,7 @@ async function main() { console.log(" acceptance fixture-driven end-to-end gate (Phase 6)"); console.log(" replay retrieval-driven local-model bootstrap (Phase 7) — needs --task"); console.log(" audit-full full system audit across Phases 0-7 (Phase 8)"); + console.log(" release-freeze run all gates + write release manifest (Phase 9)"); console.log(""); console.log("Flags: --dry-run, --include-partial, --include-review,"); console.log(" --task \"\", --local-only, --allow-escalation, --no-retrieval"); diff --git a/scripts/distillation/release_freeze.ts b/scripts/distillation/release_freeze.ts new file mode 100644 index 0000000..f72c617 --- /dev/null +++ b/scripts/distillation/release_freeze.ts @@ -0,0 +1,362 @@ +// release_freeze.ts — Phase 9 final orchestrator. Runs every gate the +// distillation system has + writes a release manifest + verifies clean +// git state. Never creates the git tag itself — prints the command for +// J to authorize. +// +// USAGE +// bun run scripts/distillation/release_freeze.ts +// +// Exit code 0 = release-ready. Non-zero = one or more gates failed. + +import { + existsSync, readFileSync, readdirSync, statSync, mkdirSync, writeFileSync, +} from "node:fs"; +import { resolve, dirname } from "node:path"; +import { spawnSync } from "node:child_process"; + +const DEFAULT_ROOT = process.env.LH_DISTILL_ROOT ?? "/home/profit/lakehouse"; +const VERSION = "v1.0.0"; +const TAG = `distillation-${VERSION}`; + +// Phase → known commit. Sourced from git log; if a commit gets +// rewritten the manifest will surface the mismatch. 
+const PHASE_COMMITS: Array<{ phase: string; commit: string; subject: string }> = [ + { phase: "0+1+2 scaffold", commit: "27b1d27", subject: "distillation: Phase 0 recon + Phase 1 schemas + Phase 2 transforms scaffold" }, + { phase: "2 materializer", commit: "1ea8029", subject: "distillation: Phase 2 — Evidence View materializer + health audit" }, + { phase: "3 scorer", commit: "c989253", subject: "distillation: Phase 3 — deterministic Success Scorer" }, + { phase: "4 exports", commit: "68b6697", subject: "distillation: Phase 4 — dataset export layer" }, + { phase: "5 receipts", commit: "2cf359a", subject: "distillation: Phase 5 — receipts harness (system-level observability)" }, + { phase: "6 acceptance", commit: "1b433a9", subject: "distillation: Phase 6 — acceptance gate suite" }, + { phase: "auditor rebuild", commit: "20a039c", subject: "auditor: rebuild on mode runner + drop tree-split (use distillation substrate)" }, + { phase: "7 replay", commit: "681f39d", subject: "distillation: Phase 7 — replay-driven local model bootstrapping" }, + { phase: "8 audit-full", commit: "5bdd159", subject: "distillation: Phase 8 — full system audit" }, +]; + +interface GateResult { + name: string; + passed: boolean; + detail: string; +} + +const gates: GateResult[] = []; +function gate(name: string, passed: boolean, detail: string) { + gates.push({ name, passed, detail }); +} + +function shell(cmd: string, args: string[], cwd = DEFAULT_ROOT, timeoutMs = 600_000) { + return spawnSync(cmd, args, { cwd, encoding: "utf8", timeout: timeoutMs, env: { ...process.env, LH_DISTILL_ROOT: cwd } }); +} + +function gitOutput(args: string[]): string { + const r = spawnSync("git", ["-C", DEFAULT_ROOT, ...args], { encoding: "utf8" }); + return r.status === 0 ? r.stdout.trim() : ""; +} + +// ─── Gate 1: clean git state ────────────────────────────────────── + +function checkCleanGit() { + const status = gitOutput(["status", "--porcelain"]); + // Tolerate two classes of dirty: + // 1. 
Untracked artifacts (data/, exports/, /tmp/, reports/distillation//) + // 2. Auto-regenerated reports under reports/distillation/phase*-*.md + + // reports/distillation/release-*.{md,json} — release-freeze itself + // rewrites these before it can check the gate + const lines = status.split("\n").filter(Boolean); + const tracked = lines.filter(l => /^\s*[MADRCU]/.test(l)); + const concerning = tracked.filter(l => { + // git status --porcelain format: "XY " where XY is 2-char status + const m = l.match(/^[\sMADRCU]{2}\s+(.+?)$/); + const path = m ? m[1] : l.replace(/^\s*[MADRCU]+\s*/, ""); + if (/^reports\/distillation\/phase\d+-.*\.md$/.test(path)) return false; + if (/^reports\/distillation\/release-.*\.(md|json)$/.test(path)) return false; + return true; + }); + const passed = concerning.length === 0; + gate( + "clean git state (no source-tree modifications)", + passed, + passed + ? `tree clean (${tracked.length - concerning.length} auto-regenerated reports tolerated)` + : `${concerning.length} concerning modified file(s):\n${concerning.slice(0, 6).map(l => " " + l).join("\n")}`, + ); +} + +// ─── Gate 2: full test suite ────────────────────────────────────── + +function checkTests() { + const r = shell("bun", ["test", "tests/distillation/", "auditor/schemas/distillation/"]); + const out = (r.stdout ?? "") + (r.stderr ?? ""); + const m = out.match(/(\d+)\s*pass\s*\n\s*(\d+)\s*fail/); + const pass = m ? Number(m[1]) : 0; + const fail = m ? Number(m[2]) : 1; + gate( + `full test suite (bun test tests/distillation/ auditor/schemas/distillation/)`, + r.status === 0 && fail === 0, + `${pass} pass, ${fail} fail (exit=${r.status})`, + ); +} + +// ─── Gate 3: acceptance gate ────────────────────────────────────── + +function checkAcceptance() { + const r = shell("bun", ["run", "scripts/distillation/acceptance.ts"]); + const out = (r.stdout ?? "") + (r.stderr ?? 
"");
  const m = out.match(/PASS\s*—\s*(\d+)\/(\d+)/);
  const passed = r.status === 0 && m && m[1] === m[2];
  gate(
    "acceptance gate (22-invariant fixture E2E)",
    !!passed,
    m ? `${m[1]}/${m[2]} invariants` : `exit=${r.status}, no PASS line found`,
  );
}

// ─── Gate 4: full audit ───────────────────────────────────────────

/**
 * Runs `scripts/distillation/audit_full.ts` through `shell("bun", …)` and
 * reports one gate result via `gate()`.
 *
 * The gate passes only when the process exit status is 0 AND the combined
 * stdout/stderr contains a `PASS — N/M required` line where N equals M.
 */
function checkAuditFull() {
  const r = shell("bun", ["run", "scripts/distillation/audit_full.ts"]);
  const out = (r.stdout ?? "") + (r.stderr ?? "");
  const m = out.match(/PASS\s*—\s*(\d+)\/(\d+)\s*required/);
  const passed = r.status === 0 && m !== null && m[1] === m[2];
  gate(
    "audit-full (Phases 0-7 verified + drift)",
    passed,
    m ? `${m[1]}/${m[2]} required checks` : `exit=${r.status}`,
  );
}

// ─── Gate 5: tag does not yet exist ──────────────────────────────

/**
 * Reports a gate that passes when `git tag -l TAG` does not echo TAG back,
 * i.e. the release tag name has not been used yet.
 */
function checkTagAvailable() {
  const tags = gitOutput(["tag", "-l", TAG]);
  const exists = tags.trim() === TAG;
  gate(
    `tag ${TAG} available (does not yet exist)`,
    !exists,
    exists ? `tag already exists; bump VERSION or delete the prior tag` : "tag name is free",
  );
}

// ─── Gather dataset/export counts ────────────────────────────────

/** Row/file counts captured in the manifest and rendered in the report. */
interface DatasetCounts {
  rag_rows: number;
  sft_rows: number;
  preference_pairs: number;
  evidence_files: number;
  evidence_rows: number;
  scored_files: number;
  scored_rows: number;
  quarantined_total: number;
}

/**
 * Splits text on "\n" and drops lines that are empty or whitespace-only.
 * Trimming (rather than a bare truthiness filter) keeps "\r" remnants of CRLF
 * files and stray blank-looking lines from being treated as JSONL rows.
 */
function nonBlankLines(text: string): string[] {
  return text.split("\n").filter((line) => line.trim().length > 0);
}

/**
 * Counts the non-blank lines of a file.
 *
 * @param path File to read as UTF-8.
 * @returns Number of non-blank lines, or 0 when the file does not exist.
 */
function countLines(path: string): number {
  if (!existsSync(path)) return 0;
  return nonBlankLines(readFileSync(path, "utf8")).length;
}

/**
 * Recursively walks `dir` and tallies every entry whose name ends in ".jsonl".
 *
 * @param dir Directory to walk; a missing directory yields zero counts.
 * @returns `files` = number of .jsonl files found, `rows` = sum of their
 *          non-blank line counts.
 */
function walkCount(dir: string): { files: number; rows: number } {
  if (!existsSync(dir)) return { files: 0, rows: 0 };
  let files = 0;
  let rows = 0;
  function walk(current: string) {
    for (const entry of readdirSync(current)) {
      const full = resolve(current, entry);
      if (statSync(full).isDirectory()) {
        walk(full);
      } else if (entry.endsWith(".jsonl")) {
        files += 1;
        rows += countLines(full);
      }
    }
  }
  walk(dir);
  return { files, rows };
}

/**
 * Collects the dataset counts recorded at freeze time from fixed paths under
 * `root` (exports/rag, exports/sft, exports/preference, exports/quarantine,
 * data/evidence, data/scored-runs).
 */
function gatherCounts(root: string): DatasetCounts {
  const evidence = walkCount(resolve(root, "data/evidence"));
  const scored = walkCount(resolve(root, "data/scored-runs"));
  const quarantined = ["sft", "rag", "preference"].reduce(
    (acc, n) => acc + countLines(resolve(root, `exports/quarantine/${n}.jsonl`)),
    0,
  );
  return {
    rag_rows: countLines(resolve(root, "exports/rag/playbooks.jsonl")),
    sft_rows: countLines(resolve(root, "exports/sft/instruction_response.jsonl")),
    preference_pairs: countLines(resolve(root, "exports/preference/chosen_rejected.jsonl")),
    evidence_files: evidence.files,
    evidence_rows: evidence.rows,
    scored_files: scored.files,
    scored_rows: scored.rows,
    quarantined_total: quarantined,
  };
}

// ─── Gather latest baseline ──────────────────────────────────────

/**
 * Returns the parsed last non-blank line of `data/_kb/audit_baselines.jsonl`.
 *
 * @returns The parsed JSON value, or `null` when the file is missing, has no
 *          non-blank lines, or its last non-blank line is not valid JSON.
 */
function loadLatestBaseline(root: string): unknown {
  const p = resolve(root, "data/_kb/audit_baselines.jsonl");
  if (!existsSync(p)) return null;
  const lines = nonBlankLines(readFileSync(p, "utf8"));
  const last = lines[lines.length - 1];
  if (last === undefined) return null;
  try {
    return JSON.parse(last);
  } catch {
    // Best-effort: an unparseable baseline row is reported as "no baseline".
    return null;
  }
}

// ─── Verify phase commits actually exist ─────────────────────────

/**
 * Reports a gate that passes when every entry of PHASE_COMMITS resolves to a
 * commit object in this repository.
 *
 * `rev-parse --verify <rev>^{commit}` is used because a bare `rev-parse` of a
 * full-length hex string is echoed back without checking that the object
 * exists; the `^{commit}` peel forces an object lookup and rejects
 * non-commit objects. The output must also look like a full SHA-1/SHA-256
 * hex digest, so long error text can never be mistaken for a hash.
 */
function verifyPhaseCommits() {
  const fullSha = /^(?:[0-9a-f]{40}|[0-9a-f]{64})$/;
  const missing: string[] = [];
  for (const p of PHASE_COMMITS) {
    const full = gitOutput(["rev-parse", "--verify", "--quiet", `${p.commit}^{commit}`]);
    if (!full || !fullSha.test(full.trim())) missing.push(`${p.phase} (${p.commit})`);
  }
  gate(
    "every phase commit resolves",
    missing.length === 0,
    missing.length === 0
      ? `${PHASE_COMMITS.length}/${PHASE_COMMITS.length} commits verified`
      : `missing: ${missing.join(", ")}`,
  );
}

// ─── Build manifest + report ─────────────────────────────────────

/** Machine-readable release manifest written next to the Markdown report. */
interface Manifest {
  schema: "distillation_release_manifest.v1";
  version: string;
  tag: string;
  released_at: string;
  git_head: string;
  git_branch: string;
  phase_commits: typeof PHASE_COMMITS;
  dataset_counts: DatasetCounts;
  // Opaque JSON parsed from the baselines file; only stringified, never inspected.
  latest_baseline: unknown;
  gates: GateResult[];
  passed: boolean;
}

/** Escapes `|` so free-form text cannot split a Markdown table cell. */
function escapeTableCell(text: string): string {
  return text.replace(/\|/g, "\\|");
}

/**
 * Builds the two shell commands the operator runs to create and push the tag.
 * The tag is pinned to the first 12 characters of `head` so it lands on the
 * commit that was actually frozen, not on whatever HEAD is at tagging time.
 */
function buildTagCommands(tag: string, version: string, head: string): string[] {
  return [
    `git tag -a ${tag} ${head.slice(0, 12)} -m "distillation v${version.replace(/^v/, "")} — 8-phase substrate frozen"`,
    `git push origin ${tag}`,
  ];
}

/**
 * Renders the human-readable Markdown release report for a manifest.
 *
 * Sections, in order: header, overall result, gate table, phase commits,
 * dataset counts, latest audit baseline (only when present), tag command,
 * and full failure detail for every failed gate.
 *
 * @returns The report as a single "\n"-joined string (no trailing newline).
 */
function renderReport(m: Manifest): string {
  const md: string[] = [];
  md.push("# Distillation Release Freeze — " + m.version);
  md.push("");
  md.push(`**Tag (proposed):** \`${m.tag}\``);
  md.push(`**Released at:** ${m.released_at}`);
  md.push(`**Git head:** \`${m.git_head}\``);
  md.push(`**Branch:** ${m.git_branch}`);
  md.push("");
  md.push(`## Result: ${m.passed ? "**RELEASE-READY** ✓" : "**NOT READY ✗** — one or more gates failed"}`);
  md.push("");

  md.push("## Gates");
  md.push("");
  md.push("| # | Gate | Status | Detail |");
  md.push("|---|---|---|---|");
  m.gates.forEach((g, i) => {
    // Table cells get only the first line, capped at 100 chars; escape after
    // truncating so an escape sequence is never cut in half.
    const firstLine = (g.detail.split("\n")[0] ?? "").slice(0, 100);
    md.push(`| ${i + 1} | ${escapeTableCell(g.name)} | ${g.passed ? "✓" : "✗ FAIL"} | ${escapeTableCell(firstLine)} |`);
  });
  md.push("");

  md.push("## Phase commits");
  md.push("");
  md.push("| Phase | Commit | Subject |");
  md.push("|---|---|---|");
  for (const p of m.phase_commits) {
    md.push(`| ${escapeTableCell(p.phase)} | \`${p.commit}\` | ${escapeTableCell(p.subject)} |`);
  }
  md.push("");

  md.push("## Dataset counts at freeze");
  md.push("");
  md.push("| Artifact | Count |");
  md.push("|---|---|");
  md.push(`| RAG rows | ${m.dataset_counts.rag_rows} |`);
  md.push(`| SFT rows (strict accepted-only) | ${m.dataset_counts.sft_rows} |`);
  md.push(`| Preference pairs | ${m.dataset_counts.preference_pairs} |`);
  md.push(`| Evidence files | ${m.dataset_counts.evidence_files} |`);
  md.push(`| Evidence rows | ${m.dataset_counts.evidence_rows} |`);
  md.push(`| Scored-run files | ${m.dataset_counts.scored_files} |`);
  md.push(`| Scored rows | ${m.dataset_counts.scored_rows} |`);
  md.push(`| Quarantined total | ${m.dataset_counts.quarantined_total} |`);
  md.push("");

  if (m.latest_baseline) {
    md.push("## Latest audit baseline");
    md.push("");
    md.push("```json");
    md.push(JSON.stringify(m.latest_baseline, null, 2));
    md.push("```");
    md.push("");
  }

  md.push("## Tag command (run after release-ready confirmation)");
  md.push("");
  md.push("```bash");
  md.push(...buildTagCommands(m.tag, m.version, m.git_head));
  md.push("```");
  md.push("");

  md.push("## Failure detail");
  md.push("");
  const failed = m.gates.filter((g) => !g.passed);
  if (failed.length === 0) {
    md.push("(no failures)");
  } else {
    for (const g of failed) {
      md.push(`### ${g.name}`);
      md.push("");
      md.push("```");
      md.push(g.detail);
      md.push("```");
      md.push("");
    }
  }

  return md.join("\n");
}

/**
 * Entry point: runs every gate, writes the Markdown report and JSON manifest
 * under `reports/distillation/`, prints a summary, and exits 0 when all gates
 * passed or 1 otherwise.
 *
 * Gates execute in the order called below; that is also their order in the
 * module-level `gates` list used for the report table (presumably appended to
 * by `gate()` — it is defined outside this section).
 */
async function main() {
  const root = DEFAULT_ROOT;
  console.log("[release-freeze] running gates...");

  checkCleanGit();
  checkTests();
  verifyPhaseCommits();
  checkAcceptance();
  checkAuditFull();
  checkTagAvailable();

  const counts = gatherCounts(root);
  const baseline = loadLatestBaseline(root);
  const manifest: Manifest = {
    schema: "distillation_release_manifest.v1",
    version: VERSION,
    tag: TAG,
    released_at: new Date().toISOString(),
    git_head: gitOutput(["rev-parse", "HEAD"]),
    git_branch: gitOutput(["rev-parse", "--abbrev-ref", "HEAD"]),
    phase_commits: PHASE_COMMITS,
    dataset_counts: counts,
    latest_baseline: baseline,
    gates,
    passed: gates.every((g) => g.passed),
  };

  const reportPath = resolve(root, "reports/distillation/release-freeze.md");
  mkdirSync(dirname(reportPath), { recursive: true });
  writeFileSync(reportPath, renderReport(manifest));

  // Also persist the manifest JSON for machines.
  const manifestPath = resolve(root, "reports/distillation/release-manifest.json");
  writeFileSync(manifestPath, JSON.stringify(manifest, null, 2) + "\n");

  const passedCount = gates.filter((g) => g.passed).length;
  console.log("");
  console.log(`[release-freeze] ${manifest.passed ? "RELEASE-READY" : "NOT READY"} — ${passedCount}/${gates.length} gates passed`);
  for (const g of gates) {
    console.log(`  ${g.passed ? "✓" : "✗"} ${g.name}`);
    if (!g.passed) console.log(`      ${g.detail.split("\n").slice(0, 2).join(" | ")}`);
  }
  console.log("");
  console.log(`[release-freeze] manifest: ${manifestPath}`);
  console.log(`[release-freeze] report:   ${reportPath}`);
  if (manifest.passed) {
    console.log("");
    console.log("To create the tag (manual step — operator must confirm):");
    // Same commands as the report, so the console hint also pins the frozen commit.
    for (const cmd of buildTagCommands(TAG, VERSION, manifest.git_head)) {
      console.log(`  ${cmd}`);
    }
  }

  process.exit(manifest.passed ? 0 : 1);
}

if (import.meta.main) main().catch(e => { console.error(e); process.exit(1); });