golangLAKEHOUSE/internal/distillation/sft_export.go

package distillation

// SFT (Supervised Fine-Tuning) export pipeline. Closes the SUBSTRATE
// half of OPEN #2 — types, contamination firewall, file-listing
// helper. The actual synthesis (turning EvidenceRecord + ScoredRun
// into instruction/input/response triples) is still on the Rust
// side at scripts/distillation/export_sft.ts and will land in a
// follow-up wave.
//
// Why ship substrate without synthesis: the firewall constants and
// types are the load-bearing contamination guarantees. Once they're
// pinned in Go (with tests proving the firewall set is exactly
// {rejected, needs_human_review} and never expands), the synthesis
// port becomes a translation exercise rather than a design one.
//
// Per the project_distillation_substrate.md note: SFT_NEVER is one
// of the "what NOT to touch casually" knobs. Replicating it here in
// Go preserves the cross-runtime invariant — the contamination
// firewall fires even if the SFT export is run from the Go side.

import (
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strings"
)

// SftNever is declared in types.go (the load-bearing contamination
// firewall — pinned at the type-level so every consumer reads the
// same source of truth). IsSftNever below is the predicate
// helper; it lives here because it's specific to the SFT export
// path, not a property of the type system.
//
// IsSftNever returns true if a scored run's category is on the
// contamination firewall list. Inlinable; called per-record in the
// hot synthesis loop.
func IsSftNever(c ScoreCategory) bool {
	for _, blocked := range SftNever {
		if c == blocked {
			return true
		}
	}
	return false
}

// ExportSftOptions mirrors the TS shape so callers porting from
// Rust have an identity-translation surface. Root is the lakehouse
// data root (default $LH_DISTILL_ROOT or /home/profit/lakehouse).
// RecordedAt is the timestamp stamped on emitted SFT samples for
// lineage. IncludePartial toggles "emit even when evidence record
// is missing some optional fields"; DryRun skips file writes.
type ExportSftOptions struct {
	Root           string
	RecordedAt     string
	IncludePartial bool
	DryRun         bool
}

// ExportSftResult mirrors the TS result shape exactly so a
// callable swap between sides doesn't break consumers reading the
// JSON.
type ExportSftResult struct {
	ScoredFilesRead    int    `json:"scored_files_read"`
	RecordsRead        int    `json:"records_read"`
	RecordsExported    int    `json:"records_exported"`
	RecordsQuarantined int    `json:"records_quarantined"`
	OutputPath         string `json:"output_path"`
	QuarantineSummary  string `json:"quarantine_summary"`
}

// ListScoredRunFiles walks {root}/data/scored-runs/YYYY/MM/DD/*.jsonl
// and returns the sorted list. Empty when the dir doesn't exist
// (matches Rust behavior — caller should treat zero-files as a
// no-op, not an error).
func ListScoredRunFiles(root string) ([]string, error) {
	if root == "" {
		return nil, errors.New("distillation: empty root")
	}
	base := filepath.Join(root, "data", "scored-runs")
	if _, err := os.Stat(base); os.IsNotExist(err) {
		return nil, nil
	} else if err != nil {
		return nil, fmt.Errorf("stat %s: %w", base, err)
	}
	var out []string
	years, err := os.ReadDir(base)
	if err != nil {
		return nil, fmt.Errorf("read %s: %w", base, err)
	}
	sortDirEntries(years)
	for _, y := range years {
		if !y.IsDir() {
			continue
		}
		months, err := os.ReadDir(filepath.Join(base, y.Name()))
		if err != nil {
			continue
		}
		sortDirEntries(months)
		for _, m := range months {
			if !m.IsDir() {
				continue
			}
			days, err := os.ReadDir(filepath.Join(base, y.Name(), m.Name()))
			if err != nil {
				continue
			}
			sortDirEntries(days)
			for _, d := range days {
				if !d.IsDir() {
					continue
				}
				files, err := os.ReadDir(filepath.Join(base, y.Name(), m.Name(), d.Name()))
				if err != nil {
					continue
				}
				sortDirEntries(files)
				for _, f := range files {
					if strings.HasSuffix(f.Name(), ".jsonl") {
						out = append(out, filepath.Join(base, y.Name(), m.Name(), d.Name(), f.Name()))
					}
				}
			}
		}
	}
	return out, nil
}

// sortDirEntries sorts dir entries by name in-place. Stable
// alphabetical so the directory walk is deterministic — important
// for the audit_baselines longitudinal signal which expects the
// same order across runs.
func sortDirEntries(entries []os.DirEntry) {
	sort.Slice(entries, func(i, j int) bool { return entries[i].Name() < entries[j].Name() })
}

// LoadScoredRunsFromFile reads a JSONL of ScoredRun records.
// Returns the slice + the count of malformed lines (skipped).
// This is the read-half — the synthesis half (turn ScoredRun +
// EvidenceRecord into SftSample) is the not-yet-ported piece.
func LoadScoredRunsFromFile(path string) ([]ScoredRun, int, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, 0, err
	}
	lines := strings.Split(string(data), "\n")
	out := make([]ScoredRun, 0, len(lines))
	skipped := 0
	for _, line := range lines {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		var sr ScoredRun
		if err := json.Unmarshal([]byte(line), &sr); err != nil {
			skipped++
			continue
		}
		out = append(out, sr)
	}
	return out, skipped, nil
}

// ExportSft is the partial port. Lists scored-run files, loads
// each, applies the contamination firewall, and reports counts.
// What's NOT yet ported (deliberate, separate wave):
//   - Evidence-record loading + cache (loadEvidenceByRunId).
//   - synthesizeSft — the actual instruction/input/response
//     synthesis logic. ~80 lines of TS in scripts/distillation/export_sft.ts.
//   - Quarantine writer integration (write rejected records to
//     a quarantine JSONL for operator review).
//   - File output (write SFT JSONL to data/distilled/sft/).
//
// Returning a non-nil result with RecordsExported=0 is intentional
// pre-synthesis — operators calling this on the Go side will see
// the count of records that PASSED the firewall and would have
// been exported by a complete implementation. RecordsQuarantined
// reflects records BLOCKED by the firewall.
//
// Tests/contracts that synthesis port must preserve:
//   - SftNever firewall fires before any other validation
//   - Sort order matches Rust (file walk + record order within file)
//   - Empty root dir returns zero-counts, not error
func ExportSft(opts ExportSftOptions) (ExportSftResult, error) {
	res := ExportSftResult{
		OutputPath:        filepath.Join(opts.Root, "data", "distilled", "sft", "sft_partial.jsonl"),
		QuarantineSummary: "synthesis not yet ported — see internal/distillation/sft_export.go header",
	}
	files, err := ListScoredRunFiles(opts.Root)
	if err != nil {
		return res, fmt.Errorf("list scored runs: %w", err)
	}
	res.ScoredFilesRead = len(files)
	for _, f := range files {
		runs, _, err := LoadScoredRunsFromFile(f)
		if err != nil {
			continue
		}
		res.RecordsRead += len(runs)
		for _, r := range runs {
			if IsSftNever(r.Category) {
				res.RecordsQuarantined++
				continue
			}
			// Synthesis would happen here. Pre-port: count as
			// "would-export" for the firewall-passing records.
			res.RecordsExported++
		}
	}
	return res, nil
}