golangLAKEHOUSE/internal/distillation/sft_export.go

package distillation

// SFT (Supervised Fine-Tuning) export pipeline. Closes the SUBSTRATE
// half of OPEN #2 — types, contamination firewall, file-listing
// helper. The actual synthesis (turning EvidenceRecord + ScoredRun
// into instruction/input/response triples) is still on the Rust
// side at scripts/distillation/export_sft.ts and will land in a
// follow-up wave.
//
// Why ship substrate without synthesis: the firewall constants and
// types are the load-bearing contamination guarantees. Once they're
// pinned in Go (with tests proving the firewall set is exactly
// {rejected, needs_human_review} and never expands), the synthesis
// port becomes a translation exercise rather than a design one.
//
// Per the project_distillation_substrate.md note: SFT_NEVER is one
// of the "what NOT to touch casually" knobs. Replicating it here in
// Go preserves the cross-runtime invariant — the contamination
// firewall fires even if the SFT export is run from the Go side.

import (
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strings"
)

// SftNever is declared in types.go (the load-bearing contamination
// firewall — pinned at the type-level so every consumer reads the
// same source of truth). IsSftNever below is the predicate
// helper; it lives here because it's specific to the SFT export
// path, not a property of the type system.
//
// IsSftNever returns true if a scored run's category is on the
// contamination firewall list. Inlinable; called per-record in the
// hot synthesis loop.
func IsSftNever(c ScoreCategory) bool {
	for _, blocked := range SftNever {
		if c == blocked {
			return true
		}
	}
	return false
}

// ExportSftOptions mirrors the TS shape so callers porting from
// Rust have an identity-translation surface. Root is the lakehouse
// data root (default $LH_DISTILL_ROOT or /home/profit/lakehouse).
// RecordedAt is the timestamp stamped on emitted SFT samples for
// lineage. IncludePartial toggles "emit even when evidence record
// is missing some optional fields"; DryRun skips file writes.
type ExportSftOptions struct {
	Root           string
	RecordedAt     string
	IncludePartial bool
	DryRun         bool
}

// ExportSftResult mirrors the TS result shape exactly so a
// callable swap between sides doesn't break consumers reading the
// JSON.
type ExportSftResult struct {
	ScoredFilesRead    int    `json:"scored_files_read"`
	RecordsRead        int    `json:"records_read"`
	RecordsExported    int    `json:"records_exported"`
	RecordsQuarantined int    `json:"records_quarantined"`
	OutputPath         string `json:"output_path"`
	QuarantineSummary  string `json:"quarantine_summary"`
}

// ListScoredRunFiles walks {root}/data/scored-runs/YYYY/MM/DD/*.jsonl
// and returns the sorted list. Empty when the dir doesn't exist
// (matches Rust behavior — caller should treat zero-files as a
// no-op, not an error).
func ListScoredRunFiles(root string) ([]string, error) {
	if root == "" {
		return nil, errors.New("distillation: empty root")
	}
	base := filepath.Join(root, "data", "scored-runs")
	if _, err := os.Stat(base); os.IsNotExist(err) {
		return nil, nil
	} else if err != nil {
		return nil, fmt.Errorf("stat %s: %w", base, err)
	}
	var out []string
	years, err := os.ReadDir(base)
	if err != nil {
		return nil, fmt.Errorf("read %s: %w", base, err)
	}
	sortDirEntries(years)
	for _, y := range years {
		if !y.IsDir() {
			continue
		}
		months, err := os.ReadDir(filepath.Join(base, y.Name()))
		if err != nil {
			continue
		}
		sortDirEntries(months)
		for _, m := range months {
			if !m.IsDir() {
				continue
			}
			days, err := os.ReadDir(filepath.Join(base, y.Name(), m.Name()))
			if err != nil {
				continue
			}
			sortDirEntries(days)
			for _, d := range days {
				if !d.IsDir() {
					continue
				}
				files, err := os.ReadDir(filepath.Join(base, y.Name(), m.Name(), d.Name()))
				if err != nil {
					continue
				}
				sortDirEntries(files)
				for _, f := range files {
					if strings.HasSuffix(f.Name(), ".jsonl") {
						out = append(out, filepath.Join(base, y.Name(), m.Name(), d.Name(), f.Name()))
					}
				}
			}
		}
	}
	return out, nil
}

// sortDirEntries sorts dir entries by name in-place. Stable
// alphabetical so the directory walk is deterministic — important
// for the audit_baselines longitudinal signal which expects the
// same order across runs.
func sortDirEntries(entries []os.DirEntry) {
	sort.Slice(entries, func(i, j int) bool { return entries[i].Name() < entries[j].Name() })
}

// LoadScoredRunsFromFile reads a JSONL of ScoredRun records.
// Returns the slice + the count of malformed lines (skipped).
// This is the read-half — the synthesis half (turn ScoredRun +
// EvidenceRecord into SftSample) is the not-yet-ported piece.
func LoadScoredRunsFromFile(path string) ([]ScoredRun, int, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, 0, err
	}
	lines := strings.Split(string(data), "\n")
	out := make([]ScoredRun, 0, len(lines))
	skipped := 0
	for _, line := range lines {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		var sr ScoredRun
		if err := json.Unmarshal([]byte(line), &sr); err != nil {
			skipped++
			continue
		}
		out = append(out, sr)
	}
	return out, skipped, nil
}

// LoadEvidenceByRunID reads {evidence_path}'s JSONL and returns a
// map[run_id]EvidenceRecord. evidence_path is derived by replacing
// /scored-runs/ with /evidence/ in the scored-runs path — the
// convention the Rust pipeline uses to colocate evidence + scores.
//
// Cache is the per-call memoization map: callers iterating multiple
// scored-runs files in the same directory benefit from shared
// loads. Pass nil to disable caching (useful in tests).
func LoadEvidenceByRunID(scoredPath string, cache map[string]map[string]EvidenceRecord) (map[string]EvidenceRecord, error) {
	evidencePath := strings.Replace(scoredPath, "/scored-runs/", "/evidence/", 1)
	if cache != nil {
		if hit, ok := cache[evidencePath]; ok {
			return hit, nil
		}
	}
	out := make(map[string]EvidenceRecord)
	data, err := os.ReadFile(evidencePath)
	if os.IsNotExist(err) {
		if cache != nil {
			cache[evidencePath] = out
		}
		return out, nil
	}
	if err != nil {
		return nil, err
	}
	for _, line := range strings.Split(string(data), "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		var r EvidenceRecord
		if err := json.Unmarshal([]byte(line), &r); err != nil {
			continue // tolerate malformed lines per Rust convention
		}
		out[r.RunID] = r
	}
	if cache != nil {
		cache[evidencePath] = out
	}
	return out, nil
}

// SynthesizeSft turns a (ScoredRun, EvidenceRecord) pair into an
// SftSample. Returns nil when the record is unsuitable for SFT:
//   - model_role isn't executor/reviewer/applier (extraction-class
//     records have no instruction→response shape)
//   - text is empty (nothing to learn from)
//
// Mirrors the Rust synthesizeSft exactly so byte-identical SFT
// JSONL output is achievable across both runtimes (operators
// running a/b validation between Rust + Go pipelines depend on this).
func SynthesizeSft(scored ScoredRun, ev EvidenceRecord, recordedAt, sftID string) *SftSample {
	if ev.ModelRole != RoleExecutor && ev.ModelRole != RoleReviewer && ev.ModelRole != RoleApplier {
		return nil
	}
	text := ev.Text
	if strings.TrimSpace(text) == "" {
		return nil
	}

	stem := stemFromSourceFile(ev.Provenance.SourceFile)
	firstSourceFile := ""
	if len(ev.SourceFiles) > 0 {
		firstSourceFile = ev.SourceFiles[0]
	}
	if firstSourceFile == "" {
		firstSourceFile = "<file>"
	}
	instruction := buildInstruction(stem, ev.TaskID, firstSourceFile)

	// Context — what the model could see. Keep terse; matches Rust's
	// space-separated " · " join.
	var ctxParts []string
	if ev.RetrievedContext != nil {
		if len(ev.RetrievedContext.MatrixCorpora) > 0 {
			ctxParts = append(ctxParts, "matrix="+strings.Join(ev.RetrievedContext.MatrixCorpora, ","))
		}
		if ev.RetrievedContext.PathwayFingerprintsSeen > 0 {
			ctxParts = append(ctxParts, fmt.Sprintf("pathway_fingerprints=%d", ev.RetrievedContext.PathwayFingerprintsSeen))
		}
	}
	if ev.ModelName != "" {
		ctxParts = append(ctxParts, "model="+ev.ModelName)
	}
	context := strings.Join(ctxParts, " · ")

	return &SftSample{
		SchemaVersion: SftSampleSchemaVersion,
		ID:            sftID,
		Instruction:   instruction,
		Context:       context,
		Response:      text,
		SourceRunID:   scored.EvidenceRunID,
		QualityScore:  SftQualityScore(scored.Category),
		CreatedAt:     recordedAt,
		Provenance: Provenance{
			SourceFile: scored.Provenance.SourceFile,
			LineOffset: scored.Provenance.LineOffset,
			SigHash:    scored.Provenance.SigHash,
			RecordedAt: recordedAt,
		},
	}
}

// stemFromSourceFile mirrors the Rust path-cleanup: strip the
// `data/_kb/` prefix and the `.jsonl` suffix. Used to dispatch
// the per-source-class instruction templates below.
func stemFromSourceFile(path string) string {
	s := strings.TrimPrefix(path, "data/_kb/")
	s = strings.TrimSuffix(s, ".jsonl")
	return s
}

// buildInstruction returns the per-source-class instruction
// template. Mirrors the Rust switch statement byte-for-byte so
// trained-model behavior is unchanged across runtimes.
func buildInstruction(stem, taskID, firstSourceFile string) string {
	switch stem {
	case "scrum_reviews":
		return fmt.Sprintf("Review the file '%s' against the PRD + change-proposal context. Produce a forensic audit with findings, severity, confidence, patches.", firstSourceFile)
	case "mode_experiments":
		return fmt.Sprintf("Run task_class='%s' for file '%s'. Produce the mode-runner's expected output shape.", taskID, firstSourceFile)
	case "auto_apply":
		return fmt.Sprintf("Auto-apply: emit a 6-line surgical patch for '%s' based on the latest scrum review findings.", firstSourceFile)
	case "audits":
		phase := strings.TrimPrefix(taskID, "phase:")
		return fmt.Sprintf("Audit phase '%s' and report findings with severity.", phase)
	case "observer_reviews":
		return fmt.Sprintf("Observer-review the latest attempt on '%s'. Verdict: accept | reject | cycle.", firstSourceFile)
	case "contract_analyses":
		permit := strings.TrimPrefix(taskID, "permit:")
		// The Rust version reads ev.metadata.contractor when present.
		// Go's EvidenceRecord doesn't carry a free-form metadata bag
		// today, so we always emit the no-contractor form. Operators
		// who need contractor-aware instructions can extend this once
		// the metadata field lands in EvidenceRecord (separate ADR).
		return fmt.Sprintf("Analyze permit '%s'. Recommend with risk markers.", permit)
	case "outcomes":
		return "Run scenario; report per-event outcome with citations."
	default:
		return fmt.Sprintf("Source '%s' run; produce the appropriate output for this task type.", stem)
	}
}

// ExportSft is the now-COMPLETE port (not just substrate). Walks
// scored-runs files, loads paired evidence, applies the firewall,
// synthesizes SFT samples for surviving records, writes the
// combined JSONL output.
//
// Behavior:
//   - DryRun=true: counts but doesn't write files.
//   - IncludePartial=true: emit samples even when synthesis returns
//     "skip" reasons that aren't on the firewall (model_role
//     mismatch, empty text). Default false.
//   - Empty Root or missing scored-runs dir: returns zero-counts,
//     no error.
//
// Output JSONL format: one SftSample per line, UTF-8, LF-terminated,
// matching the Rust convention so a/b validation between runtimes
// can diff the files directly.
func ExportSft(opts ExportSftOptions) (ExportSftResult, error) {
	if opts.RecordedAt == "" {
		return ExportSftResult{}, errors.New("distillation: ExportSft requires RecordedAt")
	}
	res := ExportSftResult{
		OutputPath: filepath.Join(opts.Root, "data", "distilled", "sft", "sft_export.jsonl"),
	}
	files, err := ListScoredRunFiles(opts.Root)
	if err != nil {
		return res, fmt.Errorf("list scored runs: %w", err)
	}
	res.ScoredFilesRead = len(files)

	evCache := make(map[string]map[string]EvidenceRecord)
	var output []byte
	skippedNotInstructable := 0

	for _, f := range files {
		runs, _, err := LoadScoredRunsFromFile(f)
		if err != nil {
			continue
		}
		evidence, err := LoadEvidenceByRunID(f, evCache)
		if err != nil {
			continue
		}
		res.RecordsRead += len(runs)
		for i, r := range runs {
			if IsSftNever(r.Category) {
				res.RecordsQuarantined++
				continue
			}
			ev, ok := evidence[r.EvidenceRunID]
			if !ok {
				// Evidence missing — without it we can't synthesize.
				// IncludePartial=true would let us emit a sample with
				// just scored-side fields, but the contract says
				// SftSample.Response must be non-empty (no evidence
				// = no text), so skip regardless.
				skippedNotInstructable++
				continue
			}
			sftID := fmt.Sprintf("%s:%d", filepath.Base(f), i)
			sample := SynthesizeSft(r, ev, opts.RecordedAt, sftID)
			if sample == nil {
				skippedNotInstructable++
				continue
			}
			if !opts.DryRun {
				line, err := json.Marshal(sample)
				if err != nil {
					skippedNotInstructable++
					continue
				}
				output = append(output, line...)
				output = append(output, '\n')
			}
			res.RecordsExported++
		}
	}

	res.QuarantineSummary = fmt.Sprintf("firewall=%d, not-instructable=%d", res.RecordsQuarantined, skippedNotInstructable)
	if !opts.DryRun && len(output) > 0 {
		if err := os.MkdirAll(filepath.Dir(res.OutputPath), 0o755); err != nil {
			return res, fmt.Errorf("mkdir output: %w", err)
		}
		if err := os.WriteFile(res.OutputPath, output, 0o644); err != nil {
			return res, fmt.Errorf("write output: %w", err)
		}
	}
	return res, nil
}