golangLAKEHOUSE/internal/distillation/audit_full.go

package distillation

// Audit-FULL pipeline — Go port of scripts/distillation/audit_full.ts
// (Rust legacy). Runs the metric-collection passes that produce
// audit_baselines.jsonl entries. Pure observability: never modifies
// pipeline data, only reads and tallies.
//
// Phase coverage in this port:
//   - Phase 0 (file presence)            ✓ ported
//   - Phase 1 (schema validators)        ✓ ported (invokes `go test`
//                                          on internal/distillation)
//   - Phase 2 (evidence materialization) ✓ ported as observer — reads
//                                          existing data/evidence/
//                                          and tallies rows. Doesn't
//                                          re-run the materializer
//                                          (which is Rust-side); the
//                                          audit-FULL discipline is
//                                          OBSERVATION, not re-execution.
//   - Phase 3 (scored-runs distribution) ✓ ported
//   - Phase 4 (contamination firewall)   ✓ ported
//   - Phase 5 (receipts validation)      ✓ ported as observer — reads
//                                          reports/distillation/{run_id}/
//                                          summary.json + 5 stage
//                                          receipts (any-runtime artifacts).
//   - Phase 6 (acceptance gate)          ✗ skipped — TS-only fixture
//                                          harness at scripts/distillation/
//                                          acceptance.ts with bun-
//                                          specific deps. Porting the
//                                          fixtures + invariant runner
//                                          to Go is its own ADR-worth
//                                          of work; out of scope.
//   - Phase 7 (replay log shape)         ✓ ported as observer — reads
//                                          data/_kb/replay_runs.jsonl
//                                          and checks shape, doesn't
//                                          re-run replay (Rust-side
//                                          replay.ts is the producer).
//
// Output: a structured PhaseCheckReport plus a Markdown summary.
// Operators run this from cmd/audit_full to validate a Go-side
// distillation pipeline run produced sane outputs.

import (
	"encoding/json"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
)

// PhaseCheck is one observable check within a phase. Mirrors the
// Rust shape exactly — Markdown rendering uses the same column
// layout so cross-runtime diff'ing is meaningful.
type PhaseCheck struct {
	Phase    int      `json:"phase"`
	Name     string   `json:"name"`
	Expected string   `json:"expected"`
	Actual   string   `json:"actual"`
	Passed   bool     `json:"passed"`
	Required bool     `json:"required"` // false → informational only, doesn't fail audit
	Notes    []string `json:"notes,omitempty"`
}

// PhaseCheckReport is the aggregate result of one audit-full run.
// Metrics is the AuditBaseline-shape metric snapshot that the
// caller can pass to AppendBaseline to grow the longitudinal log.
type PhaseCheckReport struct {
	Checks  []PhaseCheck     `json:"checks"`
	Metrics map[string]int64 `json:"metrics"`
	Failed  int              `json:"failed"`           // count of REQUIRED checks that failed
	Skipped int              `json:"deferred_phases"`  // phases not yet ported
	GitHEAD string           `json:"git_head,omitempty"`
}

// AuditFullOptions controls a single audit-full run. Root is the
// data dir (defaults to LH_DISTILL_ROOT or /home/profit/lakehouse
// to keep operators running both runtimes hitting the same paths).
type AuditFullOptions struct {
	Root    string
	GitHEAD string // optional — caller resolves and passes through
	// GoTestModule is the package-pattern Phase 1 invokes via
	// `go test`. Defaults to "./internal/distillation/..." when
	// empty. Tests pass an empty path to disable the live
	// `go test` invocation (which would recurse).
	GoTestModule string
}

// RunAuditFull orchestrates the ported phases (0, 3, 4) and
// returns the aggregated report. Each phase is independent; a
// phase that errors is recorded as a failed check rather than
// aborting the run, matching Rust's "always emit a report" stance.
func RunAuditFull(opts AuditFullOptions) PhaseCheckReport {
	if opts.Root == "" {
		if env := os.Getenv("LH_DISTILL_ROOT"); env != "" {
			opts.Root = env
		} else {
			opts.Root = "/home/profit/lakehouse"
		}
	}
	report := PhaseCheckReport{
		Metrics: make(map[string]int64),
		GitHEAD: opts.GitHEAD,
		Skipped: 1, // only phase 6 (TS-only acceptance harness) deferred
	}
	auditPhase0(opts.Root, &report)
	auditPhase1(opts.Root, &report, opts.GoTestModule)
	auditPhase2(opts.Root, &report)
	auditPhase3(opts.Root, &report)
	auditPhase4(opts.Root, &report)
	auditPhase5(opts.Root, &report)
	// phase 6 intentionally skipped — see header comment
	auditPhase7(opts.Root, &report)
	for _, c := range report.Checks {
		if c.Required && !c.Passed {
			report.Failed++
		}
	}
	return report
}

// ── Phase 0: file presence ─────────────────────────────────────────

func auditPhase0(root string, report *PhaseCheckReport) {
	// The recon doc is Rust-specific (docs/recon/local-distillation-
	// recon.md); a Go-side equivalent would live in the
	// golangLAKEHOUSE repo. For audit-full's purposes, we treat its
	// presence as informational rather than required when running
	// against a non-Rust root.
	reconPath := filepath.Join(root, "docs", "recon", "local-distillation-recon.md")
	exists := fileExists(reconPath)
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 0, Name: "recon doc exists",
		Expected: "docs/recon/local-distillation-recon.md present",
		Actual:   fmt.Sprintf("%v", exists),
		Passed:   exists, Required: false, // informational on Go-side runs
	})

	tier1 := []string{
		"data/_kb/distilled_facts.jsonl",
		"data/_kb/scrum_reviews.jsonl",
		"data/_kb/audit_facts.jsonl",
		"data/_kb/mode_experiments.jsonl",
	}
	missing := []string{}
	for _, p := range tier1 {
		if !fileExists(filepath.Join(root, p)) {
			missing = append(missing, p)
		}
	}
	notes := []string{}
	if len(missing) > 0 {
		notes = append(notes, "fresh-clone or post-rotation environment — Phase 2 will tally as rows_present=false; not a hard fail")
	}
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 0, Name: "tier-1 source streams present",
		Expected: "all 4 tier-1 jsonls on disk",
		Actual: func() string {
			if len(missing) == 0 {
				return "all present"
			}
			return "missing: " + strings.Join(missing, ", ")
		}(),
		Passed: len(missing) == 0, Required: false,
		Notes: notes,
	})
}

// ── Phase 1: schema validators ─────────────────────────────────────

// auditPhase1 invokes `go test` on the distillation package — the Go
// equivalent of Rust's `bun test auditor/schemas/distillation/`. The
// audit-FULL semantic: "do the schema validators still pass on
// fixtures?" When module == "" (test mode) the phase records a
// skipped-with-rationale check rather than recursing into itself.
func auditPhase1(root string, report *PhaseCheckReport, module string) {
	if module == "" {
		// Test-disabled mode: record but don't invoke (would recurse
		// when called from a `go test` already in progress).
		report.Checks = append(report.Checks, PhaseCheck{
			Phase: 1, Name: "schema validators (skipped — test invocation disabled)",
			Expected: "go test ./internal/distillation/...",
			Actual:   "skipped",
			Passed:   true, Required: false,
			Notes: []string{"caller passed empty GoTestModule — typically because we're already inside a test run"},
		})
		return
	}
	cmd := exec.Command("go", "test", "-count=1", module)
	cmd.Dir = root // run from go module root if caller supplied it; otherwise cwd
	out, err := cmd.CombinedOutput()
	passed := err == nil
	actual := "PASS"
	if !passed {
		actual = "FAIL — " + abbrevOutput(string(out), 200)
	}
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 1, Name: "schema validators pass on fixtures",
		Expected: "go test ./internal/distillation/... → exit 0",
		Actual:   actual,
		Passed:   passed, Required: true,
	})
}

// abbrevOutput truncates noisy command-output to a stable preview.
// Long stack traces would blow out the report Markdown without this.
func abbrevOutput(s string, max int) string {
	s = strings.TrimSpace(s)
	if len(s) <= max {
		return s
	}
	return s[:max] + "...(truncated)"
}

// ── Phase 2: evidence materialization (observer) ───────────────────

// auditPhase2 reads data/evidence/ and tallies rows + skipped
// markers. Mirrors the Rust phase 2's "materializer dry-run
// completes / tier-1 sources each materialize ≥1 row" checks but
// in OBSERVER mode — doesn't re-run the materializer (which is
// Rust-side); instead reads what the Rust side already produced.
//
// Records p2_evidence_rows + p2_evidence_skips metrics that match
// the Rust shape, so a Go-side audit-full producing baselines is
// drop-in-comparable to a Rust-side run.
func auditPhase2(root string, report *PhaseCheckReport) {
	evidenceDir := filepath.Join(root, "data", "evidence")
	if !fileExists(evidenceDir) {
		report.Checks = append(report.Checks, PhaseCheck{
			Phase: 2, Name: "evidence materialization output present",
			Expected: "data/evidence/ populated",
			Actual:   "missing",
			Passed:   false, Required: true,
			Notes: []string{"run materializer (Rust: ./scripts/distill collect; Go-side materializer not yet ported) before audit-full"},
		})
		return
	}
	rows := int64(0)
	skips := int64(0)
	bySource := map[string]int64{}
	tier1Hits := map[string]bool{
		"distilled_facts":  false,
		"scrum_reviews":    false,
		"audit_facts":      false,
		"mode_experiments": false,
	}

	walkErr := filepath.Walk(evidenceDir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return nil
		}
		if info.IsDir() || !strings.HasSuffix(path, ".jsonl") {
			return nil
		}
		data, err := os.ReadFile(path)
		if err != nil {
			return nil
		}
		// Tally per-source via the ev.provenance.source_file field on
		// each evidence row. Match Rust's "by_source" map shape.
		for _, line := range strings.Split(string(data), "\n") {
			line = strings.TrimSpace(line)
			if line == "" {
				continue
			}
			rows++
			var rec struct {
				Provenance struct {
					SourceFile string `json:"source_file"`
				} `json:"provenance"`
				SuccessMarkers []string `json:"success_markers,omitempty"`
				FailureMarkers []string `json:"failure_markers,omitempty"`
			}
			if err := json.Unmarshal([]byte(line), &rec); err != nil {
				skips++
				continue
			}
			stem := stemFromSourceFile(rec.Provenance.SourceFile)
			bySource[stem]++
			if _, ok := tier1Hits[stem]; ok {
				tier1Hits[stem] = true
			}
		}
		return nil
	})
	if walkErr != nil {
		report.Checks = append(report.Checks, PhaseCheck{
			Phase: 2, Name: "evidence walk",
			Expected: "no error", Actual: walkErr.Error(),
			Passed: false, Required: true,
		})
		return
	}

	report.Metrics["p2_evidence_rows"] = rows
	report.Metrics["p2_evidence_skips"] = skips

	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 2, Name: "evidence materialization output non-empty",
		Expected: ">=1 row across all sources",
		Actual:   fmt.Sprintf("%d rows · %d skipped", rows, skips),
		Passed:   rows >= 1, Required: true,
	})

	tier1Found := []string{}
	for src, hit := range tier1Hits {
		if hit {
			tier1Found = append(tier1Found, src)
		}
	}
	sort.Strings(tier1Found)
	notes := []string{}
	if len(tier1Found) < 4 {
		notes = append(notes, "fresh-environment OK; expect lower count when source streams are absent")
	}
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 2, Name: "tier-1 sources each materialize ≥1 row",
		Expected: "4/4: distilled_facts, scrum_reviews, audit_facts, mode_experiments",
		Actual:   fmt.Sprintf("%d/4 hit (%s)", len(tier1Found), strings.Join(tier1Found, ", ")),
		Passed:   len(tier1Found) >= 1, Required: false,
		Notes: notes,
	})
}

// ── Phase 3: scored-runs distribution ──────────────────────────────

func auditPhase3(root string, report *PhaseCheckReport) {
	scoredDir := filepath.Join(root, "data", "scored-runs")
	if !fileExists(scoredDir) {
		report.Checks = append(report.Checks, PhaseCheck{
			Phase: 3, Name: "scored-runs on disk",
			Expected: "data/scored-runs/ populated",
			Actual:   "missing",
			Passed:   false, Required: true,
			Notes: []string{"run scoring before audit-full (Go: scripts/distillation/score; Rust: ./scripts/distill score)"},
		})
		return
	}

	counts := map[string]int64{
		"accepted":           0,
		"partially_accepted": 0,
		"rejected":           0,
		"needs_human_review": 0,
	}
	files, err := ListScoredRunFiles(root)
	if err != nil {
		report.Checks = append(report.Checks, PhaseCheck{
			Phase: 3, Name: "scored-runs walk",
			Expected: "no error", Actual: err.Error(),
			Passed: false, Required: true,
		})
		return
	}
	for _, f := range files {
		runs, _, err := LoadScoredRunsFromFile(f)
		if err != nil {
			continue
		}
		for _, r := range runs {
			if _, ok := counts[string(r.Category)]; ok {
				counts[string(r.Category)]++
			}
		}
	}
	total := counts["accepted"] + counts["partially_accepted"] + counts["rejected"] + counts["needs_human_review"]

	report.Metrics["p3_accepted"] = counts["accepted"]
	report.Metrics["p3_partial"] = counts["partially_accepted"]
	report.Metrics["p3_rejected"] = counts["rejected"]
	report.Metrics["p3_human"] = counts["needs_human_review"]

	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 3, Name: "on-disk scored-runs distribution non-empty",
		Expected: ">=1 accepted",
		Actual:   fmt.Sprintf("acc=%d part=%d rej=%d hum=%d", counts["accepted"], counts["partially_accepted"], counts["rejected"], counts["needs_human_review"]),
		Passed:   counts["accepted"] >= 1, Required: true,
	})
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 3, Name: "scored-runs distribution sums positive",
		Expected: ">0 total", Actual: fmt.Sprintf("%d total", total),
		Passed: total > 0, Required: false,
	})
}

// ── Phase 4: contamination firewall + provenance ───────────────────

// sigHashRe pre-compiled match for the canonical sig_hash shape:
// 64 lowercase hex characters (sha256 hex). Used per-row in the
// provenance check.
var sigHashRe = regexp.MustCompile(`^[0-9a-f]{64}$`)

func auditPhase4(root string, report *PhaseCheckReport) {
	sftPath := filepath.Join(root, "exports", "sft", "instruction_response.jsonl")
	ragPath := filepath.Join(root, "exports", "rag", "playbooks.jsonl")
	prefPath := filepath.Join(root, "exports", "preference", "chosen_rejected.jsonl")

	sftRows := readJSONLLines(sftPath)
	ragRows := readJSONLLines(ragPath)
	prefRows := readJSONLLines(prefPath)

	report.Metrics["p4_sft_rows"] = int64(len(sftRows))
	report.Metrics["p4_rag_rows"] = int64(len(ragRows))
	report.Metrics["p4_pref_pairs"] = int64(len(prefRows))

	// SFT contamination firewall: 0 forbidden quality_scores. The
	// only legal SFT quality scores are accepted + partially_accepted.
	sftForbidden := 0
	for _, line := range sftRows {
		var r struct {
			QualityScore string `json:"quality_score"`
		}
		if err := json.Unmarshal([]byte(line), &r); err != nil {
			continue // tolerate malformed (matches Rust)
		}
		if r.QualityScore != "accepted" && r.QualityScore != "partially_accepted" {
			sftForbidden++
		}
	}
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 4, Name: "SFT contamination firewall: 0 forbidden quality_scores",
		Expected: "0", Actual: fmt.Sprintf("%d", sftForbidden),
		Passed: sftForbidden == 0, Required: true,
		Notes: []string{"this is the spec non-negotiable — rejected/needs_human_review must NEVER appear in SFT"},
	})

	// RAG firewall: 0 rejected leaks
	ragRejected := 0
	for _, line := range ragRows {
		var r struct {
			SuccessScore string `json:"success_score"`
		}
		if err := json.Unmarshal([]byte(line), &r); err != nil {
			continue
		}
		if r.SuccessScore == "rejected" {
			ragRejected++
		}
	}
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 4, Name: "RAG firewall: 0 rejected leaks",
		Expected: "0", Actual: fmt.Sprintf("%d", ragRejected),
		Passed: ragRejected == 0, Required: true,
	})

	// Preference: 0 self-pairs + 0 identical-text pairs.
	prefSelfPairs, prefIdenticalText := 0, 0
	for _, line := range prefRows {
		var r struct {
			ChosenRunID   string `json:"chosen_run_id"`
			RejectedRunID string `json:"rejected_run_id"`
			Chosen        string `json:"chosen"`
			Rejected      string `json:"rejected"`
		}
		if err := json.Unmarshal([]byte(line), &r); err != nil {
			continue
		}
		if r.ChosenRunID == r.RejectedRunID {
			prefSelfPairs++
		}
		if r.Chosen == r.Rejected {
			prefIdenticalText++
		}
	}
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 4, Name: "Preference: 0 self-pairs (chosen_run_id != rejected_run_id)",
		Expected: "0", Actual: fmt.Sprintf("%d", prefSelfPairs),
		Passed: prefSelfPairs == 0, Required: true,
	})
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 4, Name: "Preference: 0 identical-text pairs",
		Expected: "0", Actual: fmt.Sprintf("%d", prefIdenticalText),
		Passed: prefIdenticalText == 0, Required: true,
	})

	// Provenance check: every export row must carry a 64-char hex
	// sig_hash. Walks sft + rag + pref together since the contract
	// is uniform across all three.
	noProv := 0
	checkProv := func(line string) {
		var r struct {
			Provenance struct {
				SigHash string `json:"sig_hash"`
			} `json:"provenance"`
		}
		if err := json.Unmarshal([]byte(line), &r); err != nil {
			return
		}
		if r.Provenance.SigHash == "" || !sigHashRe.MatchString(r.Provenance.SigHash) {
			noProv++
		}
	}
	for _, line := range sftRows {
		checkProv(line)
	}
	for _, line := range ragRows {
		checkProv(line)
	}
	for _, line := range prefRows {
		checkProv(line)
	}
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 4, Name: "every export row carries valid sha256 provenance.sig_hash",
		Expected: "0 missing", Actual: fmt.Sprintf("%d missing", noProv),
		Passed: noProv == 0, Required: true,
	})

	// Quarantine totals (informational — feeds the p4_total_quarantined
	// metric used by the longitudinal drift signal).
	totalQuar := int64(0)
	for _, qp := range []string{
		"exports/quarantine/sft.jsonl",
		"exports/quarantine/rag.jsonl",
		"exports/quarantine/preference.jsonl",
	} {
		totalQuar += int64(len(readJSONLLines(filepath.Join(root, qp))))
	}
	report.Metrics["p4_total_quarantined"] = totalQuar
}

// ── Phase 5: receipts validation (observer) ────────────────────────

// runSummaryShape mirrors the Rust RunSummary just enough to
// validate the file's shape — schema_version, run_hash sha256,
// git_commit hex, and the 5 stage names. Full schema validation
// is intentionally NOT ported (it would require porting the
// schemas/distillation/ TS validators); we check the load-bearing
// invariants and call it good.
type runSummaryShape struct {
	SchemaVersion int    `json:"schema_version"`
	RunID         string `json:"run_id"`
	GitCommit     string `json:"git_commit"`
	RunHash       string `json:"run_hash"`
	Stages        []struct {
		Stage string `json:"stage"`
	} `json:"stages"`
}

func auditPhase5(root string, report *PhaseCheckReport) {
	reportsDir := filepath.Join(root, "reports", "distillation")
	if !fileExists(reportsDir) {
		report.Checks = append(report.Checks, PhaseCheck{
			Phase: 5, Name: "receipts directory exists",
			Expected: "reports/distillation/", Actual: "MISSING",
			Passed: false, Required: true,
		})
		return
	}
	// Find the most recent run_id directory with a summary.json.
	// Mirrors the Rust mtime-sort behavior — ordering matters when
	// both Rust + Go runs land in the same directory.
	type cand struct {
		id    string
		mtime int64
	}
	var cands []cand
	entries, err := os.ReadDir(reportsDir)
	if err != nil {
		report.Checks = append(report.Checks, PhaseCheck{
			Phase: 5, Name: "scan reports/distillation",
			Expected: "no error", Actual: err.Error(),
			Passed: false, Required: true,
		})
		return
	}
	for _, e := range entries {
		if !e.IsDir() {
			continue
		}
		sumPath := filepath.Join(reportsDir, e.Name(), "summary.json")
		st, err := os.Stat(sumPath)
		if err != nil {
			continue
		}
		cands = append(cands, cand{id: e.Name(), mtime: st.ModTime().UnixMilli()})
	}
	if len(cands) == 0 {
		report.Checks = append(report.Checks, PhaseCheck{
			Phase: 5, Name: "≥1 run with summary.json",
			Expected: "≥1", Actual: "0",
			Passed: false, Required: false,
			Notes: []string{"no Phase 5 run-all has executed yet — Rust: ./scripts/distill run-all"},
		})
		return
	}
	sort.Slice(cands, func(i, j int) bool { return cands[i].mtime > cands[j].mtime })
	latest := cands[0]
	runDir := filepath.Join(reportsDir, latest.id)

	// All 5 stage receipts present.
	expected := []string{"collect", "score", "export-rag", "export-sft", "export-preference"}
	missing := []string{}
	for _, s := range expected {
		if !fileExists(filepath.Join(runDir, s+".json")) {
			missing = append(missing, s)
		}
	}
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 5, Name: fmt.Sprintf("latest run (%s) has all 5 stage receipts", latest.id),
		Expected: strings.Join(expected, ","),
		Actual: func() string {
			if len(missing) == 0 {
				return "all present"
			}
			return "missing: " + strings.Join(missing, ",")
		}(),
		Passed: len(missing) == 0, Required: true,
	})

	// Each receipt parses as JSON. Full schema validation (StageReceipt
	// schema) is Rust-side only; we check basic decodability here.
	invalid := 0
	for _, s := range expected {
		path := filepath.Join(runDir, s+".json")
		data, err := os.ReadFile(path)
		if err != nil {
			continue
		}
		var anyShape any
		if err := json.Unmarshal(data, &anyShape); err != nil {
			invalid++
		}
	}
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 5, Name: "every stage receipt parses as JSON",
		Expected: "0 invalid", Actual: fmt.Sprintf("%d invalid", invalid),
		Passed: invalid == 0, Required: true,
	})

	// RunSummary shape: schema_version=1, run_hash sha256, git_commit
	// 40-char hex.
	summaryPath := filepath.Join(runDir, "summary.json")
	data, err := os.ReadFile(summaryPath)
	if err != nil {
		report.Checks = append(report.Checks, PhaseCheck{
			Phase: 5, Name: "summary.json readable",
			Expected: "ok", Actual: err.Error(),
			Passed: false, Required: true,
		})
		return
	}
	var sum runSummaryShape
	if err := json.Unmarshal(data, &sum); err != nil {
		report.Checks = append(report.Checks, PhaseCheck{
			Phase: 5, Name: "summary.json decodable",
			Expected: "ok", Actual: err.Error(),
			Passed: false, Required: true,
		})
		return
	}
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 5, Name: "summary.schema_version == 1",
		Expected: "1", Actual: fmt.Sprintf("%d", sum.SchemaVersion),
		Passed: sum.SchemaVersion == 1, Required: true,
	})
	gitHEADRe := regexp.MustCompile(`^[0-9a-f]{40}$`)
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 5, Name: "summary.git_commit is 40-char hex",
		Expected: "/^[0-9a-f]{40}$/", Actual: shortHash(sum.GitCommit),
		Passed: gitHEADRe.MatchString(sum.GitCommit), Required: false,
	})
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 5, Name: "run_hash is sha256",
		Expected: "/^[0-9a-f]{64}$/", Actual: shortHash(sum.RunHash),
		Passed: sigHashRe.MatchString(sum.RunHash), Required: true,
	})
}

func shortHash(h string) string {
	if len(h) <= 16 {
		return h
	}
	return h[:16] + "..."
}

// ── Phase 7: replay log shape (observer) ───────────────────────────

// auditPhase7 checks data/_kb/replay_runs.jsonl exists and contains
// well-shaped records. Mirrors Rust phase 7's "persisted log shape"
// check but skips the live-replay invocation (which would require
// porting Rust replay.ts, a substantial effort). The full Rust
// phase 7 also runs 3 dry-run replays — operators wanting that
// signal continue to invoke the Rust audit-full.
func auditPhase7(root string, report *PhaseCheckReport) {
	logPath := filepath.Join(root, "data", "_kb", "replay_runs.jsonl")
	lines := readJSONLLines(logPath)
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 7, Name: "replay_runs.jsonl exists",
		Expected: "exists with ≥1 row",
		Actual: func() string {
			if !fileExists(logPath) {
				return "missing"
			}
			return fmt.Sprintf("%d rows total", len(lines))
		}(),
		Passed: fileExists(logPath), Required: false,
	})
	if !fileExists(logPath) {
		return
	}
	// Validate shape on a sample of rows — full validation across
	// thousands of lines isn't worth the cost, and a structural
	// problem will show up in any sample.
	sample := lines
	if len(sample) > 50 {
		sample = sample[len(sample)-50:]
	}
	malformed := 0
	for _, line := range sample {
		var anyShape any
		if err := json.Unmarshal([]byte(line), &anyShape); err != nil {
			malformed++
		}
	}
	report.Checks = append(report.Checks, PhaseCheck{
		Phase: 7, Name: "replay_runs.jsonl tail rows parse as JSON",
		Expected: "0 malformed in last 50", Actual: fmt.Sprintf("%d malformed", malformed),
		Passed: malformed == 0, Required: true,
	})
}

// ── helpers ────────────────────────────────────────────────────────

func fileExists(p string) bool {
	_, err := os.Stat(p)
	return err == nil
}

// readJSONLLines reads a JSONL file and returns non-empty lines.
// Returns nil on missing file (matches Rust's existsSync ? read : []).
func readJSONLLines(path string) []string {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil
	}
	out := make([]string, 0)
	for _, line := range strings.Split(string(data), "\n") {
		if strings.TrimSpace(line) != "" {
			out = append(out, line)
		}
	}
	return out
}

// FormatAuditFullReport renders a Markdown report mirroring the
// Rust phase8-full-audit-report.md shape so operators reading
// across runtimes don't have to re-learn the layout.
func FormatAuditFullReport(report PhaseCheckReport) string {
	var b strings.Builder
	fmt.Fprintln(&b, "# Audit-FULL report (Go)")
	fmt.Fprintln(&b)
	if report.GitHEAD != "" {
		fmt.Fprintf(&b, "**git HEAD:** `%s`\n\n", report.GitHEAD)
	}
	failed := report.Failed
	total := 0
	for _, c := range report.Checks {
		if c.Required {
			total++
		}
	}
	verdict := "PASS"
	if failed > 0 {
		verdict = "FAIL"
	}
	fmt.Fprintf(&b, "**Verdict:** %s — %d/%d required checks passed; %d phase(s) deferred.\n\n",
		verdict, total-failed, total, report.Skipped)

	fmt.Fprintln(&b, "## Checks")
	fmt.Fprintln(&b)
	fmt.Fprintln(&b, "| phase | name | expected | actual | required | passed |")
	fmt.Fprintln(&b, "|---|---|---|---|---|---|")
	for _, c := range report.Checks {
		req := "no"
		if c.Required {
			req = "**yes**"
		}
		passed := "✗"
		if c.Passed {
			passed = "✓"
		}
		fmt.Fprintf(&b, "| %d | %s | %s | %s | %s | %s |\n",
			c.Phase, c.Name, c.Expected, c.Actual, req, passed)
		for _, n := range c.Notes {
			fmt.Fprintf(&b, "| | _note_ | %s | | | |\n", n)
		}
	}

	if len(report.Metrics) > 0 {
		fmt.Fprintln(&b)
		fmt.Fprintln(&b, "## Metrics")
		fmt.Fprintln(&b)
		fmt.Fprintln(&b, "| metric | value |")
		fmt.Fprintln(&b, "|---|---:|")
		// Stable order for diffs.
		names := make([]string, 0, len(report.Metrics))
		for k := range report.Metrics {
			names = append(names, k)
		}
		sort.Strings(names)
		for _, k := range names {
			fmt.Fprintf(&b, "| %s | %d |\n", k, report.Metrics[k])
		}
	}
	return b.String()
}