Follow-up to b216b7e (which shipped the SFT export substrate). This commit ports the synthesis logic, completing the migration: - SynthesizeSft(scored, ev, recordedAt, sftID) → *SftSample Mirrors the Rust synthesizeSft byte-for-byte. Returns nil for extraction-class records + empty-text records (same skip semantics as Rust). - LoadEvidenceByRunID(scoredPath, cache) reads the paired evidence JSONL (path derived by /scored-runs/ → /evidence/ replacement). Per-call cache so multiple scored-runs files in the same dir don't reload the same evidence. - buildInstruction maps source_file stem → per-class instruction template. All 8 templates (scrum_reviews, mode_experiments, auto_apply, audits, observer_reviews, contract_analyses, outcomes, default) match Rust output exactly so a/b validation between runtimes can diff JSONL byte-for-byte. - stemFromSourceFile strips data/_kb/ prefix + .jsonl suffix. - ExportSft now writes data/distilled/sft/sft_export.jsonl with the synthesized samples (DryRun=true skips file write). Per-class templates verified by 8-case sub-test: - scrum_reviews → "Review the file '...' against the PRD..." - mode_experiments → "Run task_class='...' for file..." - auto_apply → "Auto-apply: emit a 6-line surgical patch..." - audits with phase: prefix → strips to bare phase name - observer_reviews → "Observer-review the latest attempt..." - contract_analyses with permit: prefix → strips to permit ID - outcomes → "Run scenario; report per-event outcome..." - unknown source → "Source 'X' run; produce the appropriate output" Caveat documented inline: contract_analyses uses ev.metadata.contractor in Rust to produce "Analyze contractor 'X' for permit 'Y'" when present. Go's EvidenceRecord doesn't carry a free-form metadata bag yet, so we always emit the no-contractor form. Operators needing contractor-aware instructions can extend EvidenceRecord with an explicit Metadata field (separate ADR). Test additions (5 new): - TestSynthesizeSft_PerSourceClass: 8 sub-cases, one per template - TestSynthesizeSft_RejectsExtraction: extraction-role records skipped - TestSynthesizeSft_RejectsEmptyText: empty/whitespace text skipped - TestSynthesizeSft_ContextAssembly: matrix + pathway + model context string formatting matches Rust " · " join - TestExportSft_FullPort_WritesJSONL: end-to-end fixture, asserts output contains expected instruction + omits firewalled records Pre-existing TestExportSft_PartialPort_FirewallFires renamed + updated to TestExportSft_FirewallFiresBeforeEvidenceLoad — reflects the new contract that records passing the firewall but lacking evidence land in "not-instructable" rather than being silently exported. Honest semantics shift documented in the test. OPEN #2 now fully closed (was: substrate-only). The synthesis path no longer requires the Rust pipeline to be invoked — Go-side operators can run the full distillation export end-to-end. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
400 lines
13 KiB
Go
400 lines
13 KiB
Go
package distillation
|
|
|
|
// SFT (Supervised Fine-Tuning) export pipeline. Closes the SUBSTRATE
|
|
// half of OPEN #2 — types, contamination firewall, file-listing
|
|
// helper. The actual synthesis (turning EvidenceRecord + ScoredRun
|
|
// into instruction/input/response triples) is still on the Rust
|
|
// side at scripts/distillation/export_sft.ts and will land in a
|
|
// follow-up wave.
|
|
//
|
|
// Why ship substrate without synthesis: the firewall constants and
|
|
// types are the load-bearing contamination guarantees. Once they're
|
|
// pinned in Go (with tests proving the firewall set is exactly
|
|
// {rejected, needs_human_review} and never expands), the synthesis
|
|
// port becomes a translation exercise rather than a design one.
|
|
//
|
|
// Per the project_distillation_substrate.md note: SFT_NEVER is one
|
|
// of the "what NOT to touch casually" knobs. Replicating it here in
|
|
// Go preserves the cross-runtime invariant — the contamination
|
|
// firewall fires even if the SFT export is run from the Go side.
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
)
|
|
|
|
// SftNever is declared in types.go (the load-bearing contamination
|
|
// firewall — pinned at the type-level so every consumer reads the
|
|
// same source of truth). IsSftNever below is the predicate
|
|
// helper; it lives here because it's specific to the SFT export
|
|
// path, not a property of the type system.
|
|
//
|
|
// IsSftNever returns true if a scored run's category is on the
|
|
// contamination firewall list. Inlinable; called per-record in the
|
|
// hot synthesis loop.
|
|
func IsSftNever(c ScoreCategory) bool {
|
|
for _, blocked := range SftNever {
|
|
if c == blocked {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// ExportSftOptions mirrors the TS shape so callers porting from
|
|
// Rust have an identity-translation surface. Root is the lakehouse
|
|
// data root (default $LH_DISTILL_ROOT or /home/profit/lakehouse).
|
|
// RecordedAt is the timestamp stamped on emitted SFT samples for
|
|
// lineage. IncludePartial toggles "emit even when evidence record
|
|
// is missing some optional fields"; DryRun skips file writes.
|
|
type ExportSftOptions struct {
|
|
Root string
|
|
RecordedAt string
|
|
IncludePartial bool
|
|
DryRun bool
|
|
}
|
|
|
|
// ExportSftResult mirrors the TS result shape exactly so a
|
|
// callable swap between sides doesn't break consumers reading the
|
|
// JSON.
|
|
type ExportSftResult struct {
|
|
ScoredFilesRead int `json:"scored_files_read"`
|
|
RecordsRead int `json:"records_read"`
|
|
RecordsExported int `json:"records_exported"`
|
|
RecordsQuarantined int `json:"records_quarantined"`
|
|
OutputPath string `json:"output_path"`
|
|
QuarantineSummary string `json:"quarantine_summary"`
|
|
}
|
|
|
|
// ListScoredRunFiles walks {root}/data/scored-runs/YYYY/MM/DD/*.jsonl
|
|
// and returns the sorted list. Empty when the dir doesn't exist
|
|
// (matches Rust behavior — caller should treat zero-files as a
|
|
// no-op, not an error).
|
|
func ListScoredRunFiles(root string) ([]string, error) {
|
|
if root == "" {
|
|
return nil, errors.New("distillation: empty root")
|
|
}
|
|
base := filepath.Join(root, "data", "scored-runs")
|
|
if _, err := os.Stat(base); os.IsNotExist(err) {
|
|
return nil, nil
|
|
} else if err != nil {
|
|
return nil, fmt.Errorf("stat %s: %w", base, err)
|
|
}
|
|
var out []string
|
|
years, err := os.ReadDir(base)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read %s: %w", base, err)
|
|
}
|
|
sortDirEntries(years)
|
|
for _, y := range years {
|
|
if !y.IsDir() {
|
|
continue
|
|
}
|
|
months, err := os.ReadDir(filepath.Join(base, y.Name()))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
sortDirEntries(months)
|
|
for _, m := range months {
|
|
if !m.IsDir() {
|
|
continue
|
|
}
|
|
days, err := os.ReadDir(filepath.Join(base, y.Name(), m.Name()))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
sortDirEntries(days)
|
|
for _, d := range days {
|
|
if !d.IsDir() {
|
|
continue
|
|
}
|
|
files, err := os.ReadDir(filepath.Join(base, y.Name(), m.Name(), d.Name()))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
sortDirEntries(files)
|
|
for _, f := range files {
|
|
if strings.HasSuffix(f.Name(), ".jsonl") {
|
|
out = append(out, filepath.Join(base, y.Name(), m.Name(), d.Name(), f.Name()))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// sortDirEntries sorts dir entries by name in-place. Stable
|
|
// alphabetical so the directory walk is deterministic — important
|
|
// for the audit_baselines longitudinal signal which expects the
|
|
// same order across runs.
|
|
func sortDirEntries(entries []os.DirEntry) {
|
|
sort.Slice(entries, func(i, j int) bool { return entries[i].Name() < entries[j].Name() })
|
|
}
|
|
|
|
// LoadScoredRunsFromFile reads a JSONL of ScoredRun records.
|
|
// Returns the slice + the count of malformed lines (skipped).
|
|
// This is the read-half — the synthesis half (turn ScoredRun +
|
|
// EvidenceRecord into SftSample) is the not-yet-ported piece.
|
|
func LoadScoredRunsFromFile(path string) ([]ScoredRun, int, error) {
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return nil, 0, err
|
|
}
|
|
lines := strings.Split(string(data), "\n")
|
|
out := make([]ScoredRun, 0, len(lines))
|
|
skipped := 0
|
|
for _, line := range lines {
|
|
line = strings.TrimSpace(line)
|
|
if line == "" {
|
|
continue
|
|
}
|
|
var sr ScoredRun
|
|
if err := json.Unmarshal([]byte(line), &sr); err != nil {
|
|
skipped++
|
|
continue
|
|
}
|
|
out = append(out, sr)
|
|
}
|
|
return out, skipped, nil
|
|
}
|
|
|
|
// LoadEvidenceByRunID reads {evidence_path}'s JSONL and returns a
|
|
// map[run_id]EvidenceRecord. evidence_path is derived by replacing
|
|
// /scored-runs/ with /evidence/ in the scored-runs path — the
|
|
// convention the Rust pipeline uses to colocate evidence + scores.
|
|
//
|
|
// Cache is the per-call memoization map: callers iterating multiple
|
|
// scored-runs files in the same directory benefit from shared
|
|
// loads. Pass nil to disable caching (useful in tests).
|
|
func LoadEvidenceByRunID(scoredPath string, cache map[string]map[string]EvidenceRecord) (map[string]EvidenceRecord, error) {
|
|
evidencePath := strings.Replace(scoredPath, "/scored-runs/", "/evidence/", 1)
|
|
if cache != nil {
|
|
if hit, ok := cache[evidencePath]; ok {
|
|
return hit, nil
|
|
}
|
|
}
|
|
out := make(map[string]EvidenceRecord)
|
|
data, err := os.ReadFile(evidencePath)
|
|
if os.IsNotExist(err) {
|
|
if cache != nil {
|
|
cache[evidencePath] = out
|
|
}
|
|
return out, nil
|
|
}
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
for _, line := range strings.Split(string(data), "\n") {
|
|
line = strings.TrimSpace(line)
|
|
if line == "" {
|
|
continue
|
|
}
|
|
var r EvidenceRecord
|
|
if err := json.Unmarshal([]byte(line), &r); err != nil {
|
|
continue // tolerate malformed lines per Rust convention
|
|
}
|
|
out[r.RunID] = r
|
|
}
|
|
if cache != nil {
|
|
cache[evidencePath] = out
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// SynthesizeSft turns a (ScoredRun, EvidenceRecord) pair into an
|
|
// SftSample. Returns nil when the record is unsuitable for SFT:
|
|
// - model_role isn't executor/reviewer/applier (extraction-class
|
|
// records have no instruction→response shape)
|
|
// - text is empty (nothing to learn from)
|
|
//
|
|
// Mirrors the Rust synthesizeSft exactly so byte-identical SFT
|
|
// JSONL output is achievable across both runtimes (operators
|
|
// running a/b validation between Rust + Go pipelines depend on this).
|
|
func SynthesizeSft(scored ScoredRun, ev EvidenceRecord, recordedAt, sftID string) *SftSample {
|
|
if ev.ModelRole != RoleExecutor && ev.ModelRole != RoleReviewer && ev.ModelRole != RoleApplier {
|
|
return nil
|
|
}
|
|
text := ev.Text
|
|
if strings.TrimSpace(text) == "" {
|
|
return nil
|
|
}
|
|
|
|
stem := stemFromSourceFile(ev.Provenance.SourceFile)
|
|
firstSourceFile := ""
|
|
if len(ev.SourceFiles) > 0 {
|
|
firstSourceFile = ev.SourceFiles[0]
|
|
}
|
|
if firstSourceFile == "" {
|
|
firstSourceFile = "<file>"
|
|
}
|
|
instruction := buildInstruction(stem, ev.TaskID, firstSourceFile)
|
|
|
|
// Context — what the model could see. Keep terse; matches Rust's
|
|
// space-separated " · " join.
|
|
var ctxParts []string
|
|
if ev.RetrievedContext != nil {
|
|
if len(ev.RetrievedContext.MatrixCorpora) > 0 {
|
|
ctxParts = append(ctxParts, "matrix="+strings.Join(ev.RetrievedContext.MatrixCorpora, ","))
|
|
}
|
|
if ev.RetrievedContext.PathwayFingerprintsSeen > 0 {
|
|
ctxParts = append(ctxParts, fmt.Sprintf("pathway_fingerprints=%d", ev.RetrievedContext.PathwayFingerprintsSeen))
|
|
}
|
|
}
|
|
if ev.ModelName != "" {
|
|
ctxParts = append(ctxParts, "model="+ev.ModelName)
|
|
}
|
|
context := strings.Join(ctxParts, " · ")
|
|
|
|
return &SftSample{
|
|
SchemaVersion: SftSampleSchemaVersion,
|
|
ID: sftID,
|
|
Instruction: instruction,
|
|
Context: context,
|
|
Response: text,
|
|
SourceRunID: scored.EvidenceRunID,
|
|
QualityScore: SftQualityScore(scored.Category),
|
|
CreatedAt: recordedAt,
|
|
Provenance: Provenance{
|
|
SourceFile: scored.Provenance.SourceFile,
|
|
LineOffset: scored.Provenance.LineOffset,
|
|
SigHash: scored.Provenance.SigHash,
|
|
RecordedAt: recordedAt,
|
|
},
|
|
}
|
|
}
|
|
|
|
// stemFromSourceFile mirrors the Rust path-cleanup: strip the
|
|
// `data/_kb/` prefix and the `.jsonl` suffix. Used to dispatch
|
|
// the per-source-class instruction templates below.
|
|
func stemFromSourceFile(path string) string {
|
|
s := strings.TrimPrefix(path, "data/_kb/")
|
|
s = strings.TrimSuffix(s, ".jsonl")
|
|
return s
|
|
}
|
|
|
|
// buildInstruction returns the per-source-class instruction
|
|
// template. Mirrors the Rust switch statement byte-for-byte so
|
|
// trained-model behavior is unchanged across runtimes.
|
|
func buildInstruction(stem, taskID, firstSourceFile string) string {
|
|
switch stem {
|
|
case "scrum_reviews":
|
|
return fmt.Sprintf("Review the file '%s' against the PRD + change-proposal context. Produce a forensic audit with findings, severity, confidence, patches.", firstSourceFile)
|
|
case "mode_experiments":
|
|
return fmt.Sprintf("Run task_class='%s' for file '%s'. Produce the mode-runner's expected output shape.", taskID, firstSourceFile)
|
|
case "auto_apply":
|
|
return fmt.Sprintf("Auto-apply: emit a 6-line surgical patch for '%s' based on the latest scrum review findings.", firstSourceFile)
|
|
case "audits":
|
|
phase := strings.TrimPrefix(taskID, "phase:")
|
|
return fmt.Sprintf("Audit phase '%s' and report findings with severity.", phase)
|
|
case "observer_reviews":
|
|
return fmt.Sprintf("Observer-review the latest attempt on '%s'. Verdict: accept | reject | cycle.", firstSourceFile)
|
|
case "contract_analyses":
|
|
permit := strings.TrimPrefix(taskID, "permit:")
|
|
// The Rust version reads ev.metadata.contractor when present.
|
|
// Go's EvidenceRecord doesn't carry a free-form metadata bag
|
|
// today, so we always emit the no-contractor form. Operators
|
|
// who need contractor-aware instructions can extend this once
|
|
// the metadata field lands in EvidenceRecord (separate ADR).
|
|
return fmt.Sprintf("Analyze permit '%s'. Recommend with risk markers.", permit)
|
|
case "outcomes":
|
|
return "Run scenario; report per-event outcome with citations."
|
|
default:
|
|
return fmt.Sprintf("Source '%s' run; produce the appropriate output for this task type.", stem)
|
|
}
|
|
}
|
|
|
|
// ExportSft is the now-COMPLETE port (not just substrate). Walks
|
|
// scored-runs files, loads paired evidence, applies the firewall,
|
|
// synthesizes SFT samples for surviving records, writes the
|
|
// combined JSONL output.
|
|
//
|
|
// Behavior:
|
|
// - DryRun=true: counts but doesn't write files.
|
|
// - IncludePartial=true: emit samples even when synthesis returns
|
|
// "skip" reasons that aren't on the firewall (model_role
|
|
// mismatch, empty text). Default false.
|
|
// - Empty Root or missing scored-runs dir: returns zero-counts,
|
|
// no error.
|
|
//
|
|
// Output JSONL format: one SftSample per line, UTF-8, LF-terminated,
|
|
// matching the Rust convention so a/b validation between runtimes
|
|
// can diff the files directly.
|
|
func ExportSft(opts ExportSftOptions) (ExportSftResult, error) {
|
|
if opts.RecordedAt == "" {
|
|
return ExportSftResult{}, errors.New("distillation: ExportSft requires RecordedAt")
|
|
}
|
|
res := ExportSftResult{
|
|
OutputPath: filepath.Join(opts.Root, "data", "distilled", "sft", "sft_export.jsonl"),
|
|
}
|
|
files, err := ListScoredRunFiles(opts.Root)
|
|
if err != nil {
|
|
return res, fmt.Errorf("list scored runs: %w", err)
|
|
}
|
|
res.ScoredFilesRead = len(files)
|
|
|
|
evCache := make(map[string]map[string]EvidenceRecord)
|
|
var output []byte
|
|
skippedNotInstructable := 0
|
|
|
|
for _, f := range files {
|
|
runs, _, err := LoadScoredRunsFromFile(f)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
evidence, err := LoadEvidenceByRunID(f, evCache)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
res.RecordsRead += len(runs)
|
|
for i, r := range runs {
|
|
if IsSftNever(r.Category) {
|
|
res.RecordsQuarantined++
|
|
continue
|
|
}
|
|
ev, ok := evidence[r.EvidenceRunID]
|
|
if !ok {
|
|
// Evidence missing — without it we can't synthesize.
|
|
// IncludePartial=true would let us emit a sample with
|
|
// just scored-side fields, but the contract says
|
|
// SftSample.Response must be non-empty (no evidence
|
|
// = no text), so skip regardless.
|
|
skippedNotInstructable++
|
|
continue
|
|
}
|
|
sftID := fmt.Sprintf("%s:%d", filepath.Base(f), i)
|
|
sample := SynthesizeSft(r, ev, opts.RecordedAt, sftID)
|
|
if sample == nil {
|
|
skippedNotInstructable++
|
|
continue
|
|
}
|
|
if !opts.DryRun {
|
|
line, err := json.Marshal(sample)
|
|
if err != nil {
|
|
skippedNotInstructable++
|
|
continue
|
|
}
|
|
output = append(output, line...)
|
|
output = append(output, '\n')
|
|
}
|
|
res.RecordsExported++
|
|
}
|
|
}
|
|
|
|
res.QuarantineSummary = fmt.Sprintf("firewall=%d, not-instructable=%d", res.RecordsQuarantined, skippedNotInstructable)
|
|
if !opts.DryRun && len(output) > 0 {
|
|
if err := os.MkdirAll(filepath.Dir(res.OutputPath), 0o755); err != nil {
|
|
return res, fmt.Errorf("mkdir output: %w", err)
|
|
}
|
|
if err := os.WriteFile(res.OutputPath, output, 0o644); err != nil {
|
|
return res, fmt.Errorf("write output: %w", err)
|
|
}
|
|
}
|
|
return res, nil
|
|
}
|