package distillation // SFT (Supervised Fine-Tuning) export pipeline. Closes the SUBSTRATE // half of OPEN #2 — types, contamination firewall, file-listing // helper. The actual synthesis (turning EvidenceRecord + ScoredRun // into instruction/input/response triples) is still on the Rust // side at scripts/distillation/export_sft.ts and will land in a // follow-up wave. // // Why ship substrate without synthesis: the firewall constants and // types are the load-bearing contamination guarantees. Once they're // pinned in Go (with tests proving the firewall set is exactly // {rejected, needs_human_review} and never expands), the synthesis // port becomes a translation exercise rather than a design one. // // Per the project_distillation_substrate.md note: SFT_NEVER is one // of the "what NOT to touch casually" knobs. Replicating it here in // Go preserves the cross-runtime invariant — the contamination // firewall fires even if the SFT export is run from the Go side. import ( "encoding/json" "errors" "fmt" "os" "path/filepath" "sort" "strings" ) // SftNever is declared in types.go (the load-bearing contamination // firewall — pinned at the type-level so every consumer reads the // same source of truth). IsSftNever below is the predicate // helper; it lives here because it's specific to the SFT export // path, not a property of the type system. // // IsSftNever returns true if a scored run's category is on the // contamination firewall list. Inlinable; called per-record in the // hot synthesis loop. func IsSftNever(c ScoreCategory) bool { for _, blocked := range SftNever { if c == blocked { return true } } return false } // ExportSftOptions mirrors the TS shape so callers porting from // Rust have an identity-translation surface. Root is the lakehouse // data root (default $LH_DISTILL_ROOT or /home/profit/lakehouse). // RecordedAt is the timestamp stamped on emitted SFT samples for // lineage. IncludePartial toggles "emit even when evidence record // is missing some optional fields"; DryRun skips file writes. type ExportSftOptions struct { Root string RecordedAt string IncludePartial bool DryRun bool } // ExportSftResult mirrors the TS result shape exactly so a // callable swap between sides doesn't break consumers reading the // JSON. type ExportSftResult struct { ScoredFilesRead int `json:"scored_files_read"` RecordsRead int `json:"records_read"` RecordsExported int `json:"records_exported"` RecordsQuarantined int `json:"records_quarantined"` OutputPath string `json:"output_path"` QuarantineSummary string `json:"quarantine_summary"` } // ListScoredRunFiles walks {root}/data/scored-runs/YYYY/MM/DD/*.jsonl // and returns the sorted list. Empty when the dir doesn't exist // (matches Rust behavior — caller should treat zero-files as a // no-op, not an error). func ListScoredRunFiles(root string) ([]string, error) { if root == "" { return nil, errors.New("distillation: empty root") } base := filepath.Join(root, "data", "scored-runs") if _, err := os.Stat(base); os.IsNotExist(err) { return nil, nil } else if err != nil { return nil, fmt.Errorf("stat %s: %w", base, err) } var out []string years, err := os.ReadDir(base) if err != nil { return nil, fmt.Errorf("read %s: %w", base, err) } sortDirEntries(years) for _, y := range years { if !y.IsDir() { continue } months, err := os.ReadDir(filepath.Join(base, y.Name())) if err != nil { continue } sortDirEntries(months) for _, m := range months { if !m.IsDir() { continue } days, err := os.ReadDir(filepath.Join(base, y.Name(), m.Name())) if err != nil { continue } sortDirEntries(days) for _, d := range days { if !d.IsDir() { continue } files, err := os.ReadDir(filepath.Join(base, y.Name(), m.Name(), d.Name())) if err != nil { continue } sortDirEntries(files) for _, f := range files { if strings.HasSuffix(f.Name(), ".jsonl") { out = append(out, filepath.Join(base, y.Name(), m.Name(), d.Name(), f.Name())) } } } } } return out, nil } // sortDirEntries sorts dir entries by name in-place. Stable // alphabetical so the directory walk is deterministic — important // for the audit_baselines longitudinal signal which expects the // same order across runs. func sortDirEntries(entries []os.DirEntry) { sort.Slice(entries, func(i, j int) bool { return entries[i].Name() < entries[j].Name() }) } // LoadScoredRunsFromFile reads a JSONL of ScoredRun records. // Returns the slice + the count of malformed lines (skipped). // This is the read-half — the synthesis half (turn ScoredRun + // EvidenceRecord into SftSample) is the not-yet-ported piece. func LoadScoredRunsFromFile(path string) ([]ScoredRun, int, error) { data, err := os.ReadFile(path) if err != nil { return nil, 0, err } lines := strings.Split(string(data), "\n") out := make([]ScoredRun, 0, len(lines)) skipped := 0 for _, line := range lines { line = strings.TrimSpace(line) if line == "" { continue } var sr ScoredRun if err := json.Unmarshal([]byte(line), &sr); err != nil { skipped++ continue } out = append(out, sr) } return out, skipped, nil } // LoadEvidenceByRunID reads {evidence_path}'s JSONL and returns a // map[run_id]EvidenceRecord. evidence_path is derived by replacing // /scored-runs/ with /evidence/ in the scored-runs path — the // convention the Rust pipeline uses to colocate evidence + scores. // // Cache is the per-call memoization map: callers iterating multiple // scored-runs files in the same directory benefit from shared // loads. Pass nil to disable caching (useful in tests). func LoadEvidenceByRunID(scoredPath string, cache map[string]map[string]EvidenceRecord) (map[string]EvidenceRecord, error) { evidencePath := strings.Replace(scoredPath, "/scored-runs/", "/evidence/", 1) if cache != nil { if hit, ok := cache[evidencePath]; ok { return hit, nil } } out := make(map[string]EvidenceRecord) data, err := os.ReadFile(evidencePath) if os.IsNotExist(err) { if cache != nil { cache[evidencePath] = out } return out, nil } if err != nil { return nil, err } for _, line := range strings.Split(string(data), "\n") { line = strings.TrimSpace(line) if line == "" { continue } var r EvidenceRecord if err := json.Unmarshal([]byte(line), &r); err != nil { continue // tolerate malformed lines per Rust convention } out[r.RunID] = r } if cache != nil { cache[evidencePath] = out } return out, nil } // SynthesizeSft turns a (ScoredRun, EvidenceRecord) pair into an // SftSample. Returns nil when the record is unsuitable for SFT: // - model_role isn't executor/reviewer/applier (extraction-class // records have no instruction→response shape) // - text is empty (nothing to learn from) // // Mirrors the Rust synthesizeSft exactly so byte-identical SFT // JSONL output is achievable across both runtimes (operators // running a/b validation between Rust + Go pipelines depend on this). func SynthesizeSft(scored ScoredRun, ev EvidenceRecord, recordedAt, sftID string) *SftSample { if ev.ModelRole != RoleExecutor && ev.ModelRole != RoleReviewer && ev.ModelRole != RoleApplier { return nil } text := ev.Text if strings.TrimSpace(text) == "" { return nil } stem := stemFromSourceFile(ev.Provenance.SourceFile) firstSourceFile := "" if len(ev.SourceFiles) > 0 { firstSourceFile = ev.SourceFiles[0] } if firstSourceFile == "" { firstSourceFile = "" } instruction := buildInstruction(stem, ev.TaskID, firstSourceFile) // Context — what the model could see. Keep terse; matches Rust's // space-separated " · " join. var ctxParts []string if ev.RetrievedContext != nil { if len(ev.RetrievedContext.MatrixCorpora) > 0 { ctxParts = append(ctxParts, "matrix="+strings.Join(ev.RetrievedContext.MatrixCorpora, ",")) } if ev.RetrievedContext.PathwayFingerprintsSeen > 0 { ctxParts = append(ctxParts, fmt.Sprintf("pathway_fingerprints=%d", ev.RetrievedContext.PathwayFingerprintsSeen)) } } if ev.ModelName != "" { ctxParts = append(ctxParts, "model="+ev.ModelName) } context := strings.Join(ctxParts, " · ") return &SftSample{ SchemaVersion: SftSampleSchemaVersion, ID: sftID, Instruction: instruction, Context: context, Response: text, SourceRunID: scored.EvidenceRunID, QualityScore: SftQualityScore(scored.Category), CreatedAt: recordedAt, Provenance: Provenance{ SourceFile: scored.Provenance.SourceFile, LineOffset: scored.Provenance.LineOffset, SigHash: scored.Provenance.SigHash, RecordedAt: recordedAt, }, } } // stemFromSourceFile mirrors the Rust path-cleanup: strip the // `data/_kb/` prefix and the `.jsonl` suffix. Used to dispatch // the per-source-class instruction templates below. func stemFromSourceFile(path string) string { s := strings.TrimPrefix(path, "data/_kb/") s = strings.TrimSuffix(s, ".jsonl") return s } // buildInstruction returns the per-source-class instruction // template. Mirrors the Rust switch statement byte-for-byte so // trained-model behavior is unchanged across runtimes. func buildInstruction(stem, taskID, firstSourceFile string) string { switch stem { case "scrum_reviews": return fmt.Sprintf("Review the file '%s' against the PRD + change-proposal context. Produce a forensic audit with findings, severity, confidence, patches.", firstSourceFile) case "mode_experiments": return fmt.Sprintf("Run task_class='%s' for file '%s'. Produce the mode-runner's expected output shape.", taskID, firstSourceFile) case "auto_apply": return fmt.Sprintf("Auto-apply: emit a 6-line surgical patch for '%s' based on the latest scrum review findings.", firstSourceFile) case "audits": phase := strings.TrimPrefix(taskID, "phase:") return fmt.Sprintf("Audit phase '%s' and report findings with severity.", phase) case "observer_reviews": return fmt.Sprintf("Observer-review the latest attempt on '%s'. Verdict: accept | reject | cycle.", firstSourceFile) case "contract_analyses": permit := strings.TrimPrefix(taskID, "permit:") // The Rust version reads ev.metadata.contractor when present. // Go's EvidenceRecord doesn't carry a free-form metadata bag // today, so we always emit the no-contractor form. Operators // who need contractor-aware instructions can extend this once // the metadata field lands in EvidenceRecord (separate ADR). return fmt.Sprintf("Analyze permit '%s'. Recommend with risk markers.", permit) case "outcomes": return "Run scenario; report per-event outcome with citations." default: return fmt.Sprintf("Source '%s' run; produce the appropriate output for this task type.", stem) } } // ExportSft is the now-COMPLETE port (not just substrate). Walks // scored-runs files, loads paired evidence, applies the firewall, // synthesizes SFT samples for surviving records, writes the // combined JSONL output. // // Behavior: // - DryRun=true: counts but doesn't write files. // - IncludePartial=true: emit samples even when synthesis returns // "skip" reasons that aren't on the firewall (model_role // mismatch, empty text). Default false. // - Empty Root or missing scored-runs dir: returns zero-counts, // no error. // // Output JSONL format: one SftSample per line, UTF-8, LF-terminated, // matching the Rust convention so a/b validation between runtimes // can diff the files directly. func ExportSft(opts ExportSftOptions) (ExportSftResult, error) { if opts.RecordedAt == "" { return ExportSftResult{}, errors.New("distillation: ExportSft requires RecordedAt") } res := ExportSftResult{ OutputPath: filepath.Join(opts.Root, "data", "distilled", "sft", "sft_export.jsonl"), } files, err := ListScoredRunFiles(opts.Root) if err != nil { return res, fmt.Errorf("list scored runs: %w", err) } res.ScoredFilesRead = len(files) evCache := make(map[string]map[string]EvidenceRecord) var output []byte skippedNotInstructable := 0 for _, f := range files { runs, _, err := LoadScoredRunsFromFile(f) if err != nil { continue } evidence, err := LoadEvidenceByRunID(f, evCache) if err != nil { continue } res.RecordsRead += len(runs) for i, r := range runs { if IsSftNever(r.Category) { res.RecordsQuarantined++ continue } ev, ok := evidence[r.EvidenceRunID] if !ok { // Evidence missing — without it we can't synthesize. // IncludePartial=true would let us emit a sample with // just scored-side fields, but the contract says // SftSample.Response must be non-empty (no evidence // = no text), so skip regardless. skippedNotInstructable++ continue } sftID := fmt.Sprintf("%s:%d", filepath.Base(f), i) sample := SynthesizeSft(r, ev, opts.RecordedAt, sftID) if sample == nil { skippedNotInstructable++ continue } if !opts.DryRun { line, err := json.Marshal(sample) if err != nil { skippedNotInstructable++ continue } output = append(output, line...) output = append(output, '\n') } res.RecordsExported++ } } res.QuarantineSummary = fmt.Sprintf("firewall=%d, not-instructable=%d", res.RecordsQuarantined, skippedNotInstructable) if !opts.DryRun && len(output) > 0 { if err := os.MkdirAll(filepath.Dir(res.OutputPath), 0o755); err != nil { return res, fmt.Errorf("mkdir output: %w", err) } if err := os.WriteFile(res.OutputPath, output, 0o644); err != nil { return res, fmt.Errorf("write output: %w", err) } } return res, nil }