package distillation // SFT (Supervised Fine-Tuning) export pipeline. Closes the SUBSTRATE // half of OPEN #2 — types, contamination firewall, file-listing // helper. The actual synthesis (turning EvidenceRecord + ScoredRun // into instruction/input/response triples) is still on the Rust // side at scripts/distillation/export_sft.ts and will land in a // follow-up wave. // // Why ship substrate without synthesis: the firewall constants and // types are the load-bearing contamination guarantees. Once they're // pinned in Go (with tests proving the firewall set is exactly // {rejected, needs_human_review} and never expands), the synthesis // port becomes a translation exercise rather than a design one. // // Per the project_distillation_substrate.md note: SFT_NEVER is one // of the "what NOT to touch casually" knobs. Replicating it here in // Go preserves the cross-runtime invariant — the contamination // firewall fires even if the SFT export is run from the Go side. import ( "encoding/json" "errors" "fmt" "os" "path/filepath" "sort" "strings" ) // SftNever is declared in types.go (the load-bearing contamination // firewall — pinned at the type-level so every consumer reads the // same source of truth). IsSftNever below is the predicate // helper; it lives here because it's specific to the SFT export // path, not a property of the type system. // // IsSftNever returns true if a scored run's category is on the // contamination firewall list. Inlinable; called per-record in the // hot synthesis loop. func IsSftNever(c ScoreCategory) bool { for _, blocked := range SftNever { if c == blocked { return true } } return false } // ExportSftOptions mirrors the TS shape so callers porting from // Rust have an identity-translation surface. Root is the lakehouse // data root (default $LH_DISTILL_ROOT or /home/profit/lakehouse). // RecordedAt is the timestamp stamped on emitted SFT samples for // lineage. IncludePartial toggles "emit even when evidence record // is missing some optional fields"; DryRun skips file writes. type ExportSftOptions struct { Root string RecordedAt string IncludePartial bool DryRun bool } // ExportSftResult mirrors the TS result shape exactly so a // callable swap between sides doesn't break consumers reading the // JSON. type ExportSftResult struct { ScoredFilesRead int `json:"scored_files_read"` RecordsRead int `json:"records_read"` RecordsExported int `json:"records_exported"` RecordsQuarantined int `json:"records_quarantined"` OutputPath string `json:"output_path"` QuarantineSummary string `json:"quarantine_summary"` } // ListScoredRunFiles walks {root}/data/scored-runs/YYYY/MM/DD/*.jsonl // and returns the sorted list. Empty when the dir doesn't exist // (matches Rust behavior — caller should treat zero-files as a // no-op, not an error). func ListScoredRunFiles(root string) ([]string, error) { if root == "" { return nil, errors.New("distillation: empty root") } base := filepath.Join(root, "data", "scored-runs") if _, err := os.Stat(base); os.IsNotExist(err) { return nil, nil } else if err != nil { return nil, fmt.Errorf("stat %s: %w", base, err) } var out []string years, err := os.ReadDir(base) if err != nil { return nil, fmt.Errorf("read %s: %w", base, err) } sortDirEntries(years) for _, y := range years { if !y.IsDir() { continue } months, err := os.ReadDir(filepath.Join(base, y.Name())) if err != nil { continue } sortDirEntries(months) for _, m := range months { if !m.IsDir() { continue } days, err := os.ReadDir(filepath.Join(base, y.Name(), m.Name())) if err != nil { continue } sortDirEntries(days) for _, d := range days { if !d.IsDir() { continue } files, err := os.ReadDir(filepath.Join(base, y.Name(), m.Name(), d.Name())) if err != nil { continue } sortDirEntries(files) for _, f := range files { if strings.HasSuffix(f.Name(), ".jsonl") { out = append(out, filepath.Join(base, y.Name(), m.Name(), d.Name(), f.Name())) } } } } } return out, nil } // sortDirEntries sorts dir entries by name in-place. Stable // alphabetical so the directory walk is deterministic — important // for the audit_baselines longitudinal signal which expects the // same order across runs. func sortDirEntries(entries []os.DirEntry) { sort.Slice(entries, func(i, j int) bool { return entries[i].Name() < entries[j].Name() }) } // LoadScoredRunsFromFile reads a JSONL of ScoredRun records. // Returns the slice + the count of malformed lines (skipped). // This is the read-half — the synthesis half (turn ScoredRun + // EvidenceRecord into SftSample) is the not-yet-ported piece. func LoadScoredRunsFromFile(path string) ([]ScoredRun, int, error) { data, err := os.ReadFile(path) if err != nil { return nil, 0, err } lines := strings.Split(string(data), "\n") out := make([]ScoredRun, 0, len(lines)) skipped := 0 for _, line := range lines { line = strings.TrimSpace(line) if line == "" { continue } var sr ScoredRun if err := json.Unmarshal([]byte(line), &sr); err != nil { skipped++ continue } out = append(out, sr) } return out, skipped, nil } // ExportSft is the partial port. Lists scored-run files, loads // each, applies the contamination firewall, and reports counts. // What's NOT yet ported (deliberate, separate wave): // - Evidence-record loading + cache (loadEvidenceByRunId). // - synthesizeSft — the actual instruction/input/response // synthesis logic. ~80 lines of TS in scripts/distillation/export_sft.ts. // - Quarantine writer integration (write rejected records to // a quarantine JSONL for operator review). // - File output (write SFT JSONL to data/distilled/sft/). // // Returning a non-nil result with RecordsExported=0 is intentional // pre-synthesis — operators calling this on the Go side will see // the count of records that PASSED the firewall and would have // been exported by a complete implementation. RecordsQuarantined // reflects records BLOCKED by the firewall. // // Tests/contracts that synthesis port must preserve: // - SftNever firewall fires before any other validation // - Sort order matches Rust (file walk + record order within file) // - Empty root dir returns zero-counts, not error func ExportSft(opts ExportSftOptions) (ExportSftResult, error) { res := ExportSftResult{ OutputPath: filepath.Join(opts.Root, "data", "distilled", "sft", "sft_partial.jsonl"), QuarantineSummary: "synthesis not yet ported — see internal/distillation/sft_export.go header", } files, err := ListScoredRunFiles(opts.Root) if err != nil { return res, fmt.Errorf("list scored runs: %w", err) } res.ScoredFilesRead = len(files) for _, f := range files { runs, _, err := LoadScoredRunsFromFile(f) if err != nil { continue } res.RecordsRead += len(runs) for _, r := range runs { if IsSftNever(r.Category) { res.RecordsQuarantined++ continue } // Synthesis would happen here. Pre-port: count as // "would-export" for the firewall-passing records. res.RecordsExported++ } } return res, nil }