E (partial): distillation port — scorer + contamination firewall

First slice of the Rust v1.0.0 distillation substrate (e7636f2)
ported to Go per ADR-001 #4 (port LOGIC, not bit-identical
reproducibility). This commit lands the LOAD-BEARING pieces named
in project_distillation_substrate.md memory:

  - The deterministic Success Scorer (8 sub-scorers + dispatch)
  - The contamination firewall on SFT samples (the "non-negotiable"
    spec property: rejected/needs_human_review NEVER ship to SFT)
  - All on-wire types + validators for ScoredRun, SftSample,
    EvidenceRecord with Provenance

Files:
  internal/distillation/types.go  — types + ScorerVersion + SftNever
                                    + ValidateScoredRun + ValidateSftSample
  internal/distillation/scorer.go — ScoreRecord + 8 class scorers +
                                    BuildScoredRun (deterministic)
  internal/distillation/scorer_test.go — ~40 test cases:
    - source-class dispatch (verdict / telemetry / extraction)
    - scrum_review (4 attempt cases)
    - observer_review (5 verdict cases)
    - audit (legacy + severity, 9 cases)
    - auto_apply (4 cases)
    - outcomes / mode_experiment / extraction
    - CONTAMINATION FIREWALL: ErrSftContamination sentinel fires
      on rejected/needs_human_review, distinct from typo errors
    - empty-pair guard (instruction/response trim != "")
    - reasons-required ScoredRun validation
    - deterministic sig_hash on identical input
    - purity check (input not mutated, repeatable output)
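The attempt-threshold rule those four scrum cases exercise can be sketched standalone. Category strings and the helper name here are illustrative stand-ins; the real logic lives in scorer.go's scoreScrumReview:

```go
package main

import "fmt"

// scrumCategory: stand-in for the scrum_review attempt rule.
// attempt 1 → accepted; 2-3 → partially_accepted; 4+ is still
// partially_accepted but flagged as the high-cost path.
func scrumCategory(attempt int) string {
	switch {
	case attempt == 1:
		return "accepted"
	case attempt <= 3:
		return "partially_accepted"
	default:
		return "partially_accepted" // high-cost path, still partial
	}
}

func main() {
	for _, a := range []int{1, 2, 4} {
		fmt.Printf("attempt %d: %s\n", a, scrumCategory(a))
	}
}
```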

Per the 2026-04-29 cross-lineage scrum's discipline: false-positive
findings are dismissed inline (none in this commit) and real
findings are addressed before merge. This is greenfield port code,
reviewed line-by-line against its Rust source; the test suite
encodes that review as truth tables.

Explicitly DEFERRED to follow-up commits:
  - Materialization layer (jsonl read/write, date-partitioned
    storage in data/scored-runs/YYYY/MM/DD/, evidence index)
  - SFT exporter (file iteration + filtering — the SCORING firewall
    is here; the EXPORT firewall is the next layer)
  - export_preference, export_rag (other export shapes)
  - Acceptance harness (16/16 acceptance gate that locks v1.0.0)
  - replay, receipts, build_evidence_index, transforms

The scorer + firewall validator are pure functions, so operational
tooling layers on top without changing the deterministic logic the
downstream learning loop depends on. The Go ScorerVersion stays at
v1.0.0 to match the Rust e7636f2 baseline; bumping it (in the Go
materialization commit or later) is reserved for the next
scoring-rule change, NOT for the port itself.
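The determinism that layering depends on is "hash the canonical JSON encoding of the struct", the approach scorer.go's canonicalSha256 takes. A minimal self-contained sketch of that property (the Record type here is an illustrative stand-in; Go's json.Marshal sorts map keys, which is what makes the encoding stable):

```go
package main

// Identical input → identical sig_hash, every run. Mirrors the
// shape of the package's canonicalSha256 helper.
import (
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
)

type Record struct {
	RunID   string         `json:"run_id"`
	Markers []string       `json:"markers,omitempty"`
	Results map[string]any `json:"results,omitempty"`
}

func canonicalSha256(v any) (string, error) {
	bs, err := json.Marshal(v) // map keys are sorted, so encoding is stable
	if err != nil {
		return "", err
	}
	sum := sha256.Sum256(bs)
	return hex.EncodeToString(sum[:]), nil
}

func main() {
	rec := Record{
		RunID:   "run-1",
		Markers: []string{"accepted_on_attempt_1"},
		Results: map[string]any{"gap_signals": 2, "all_ok": true},
	}
	h1, _ := canonicalSha256(rec)
	h2, _ := canonicalSha256(rec)
	fmt.Println(h1 == h2) // true: same input, same hash
}
```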

15-smoke regression all green. vet clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-04-29 20:04:29 -05:00
parent 7f42089521
commit 57d0df125d
3 changed files with 1269 additions and 0 deletions

internal/distillation/scorer.go

@@ -0,0 +1,410 @@
package distillation

// scorer.go — pure deterministic Success Scorer (port of Rust
// scripts/distillation/scorer.ts at e7636f2).
//
// Takes one EvidenceRecord, returns category + reasons + sub_scores.
// NO I/O, NO LLM, NO clock reads, NO mutable state. Identical input
// → identical output forever. Same contract as the Rust source —
// future scoring-rule changes bump ScorerVersion atomically with
// the logic.
//
// Three-class strategy mirrors the Rust source taxonomy
// (docs/recon/local-distillation-recon.md + data/_kb/evidence_health.md):
//
// CLASS A — verdict-bearing
//   scrum_reviews, observer_reviews, audits, contract_analyses
//   Direct scoring from existing markers / observer_verdict
//
// CLASS B — telemetry-rich
//   auto_apply, outcomes, mode_experiments
//   Markers exist but partial; needs_human_review fills the gap
//
// CLASS C — pure-extraction (no native scoring signal)
//   distilled_*, audit_facts, observer_escalations
//   Default needs_human_review; v2 will JOIN to parent verdict

import (
    "crypto/sha256"
    "encoding/hex"
    "encoding/json"
    "fmt"
    "strconv"
    "strings"
)
// sourceClass categorizes an EvidenceRecord's source_file for the
// scorer's three-class dispatch.
type sourceClass string

const (
    classVerdict    sourceClass = "verdict"
    classTelemetry  sourceClass = "telemetry"
    classExtraction sourceClass = "extraction"
)

// sourceClassFor maps a source_file (from provenance) to a class.
// Centralized so adding a new source is a one-line change. Mirrors
// the Rust switch on the stem (data/_kb/X.jsonl → X).
func sourceClassFor(sourceFile string) sourceClass {
    switch stemOf(sourceFile) {
    case "scrum_reviews", "observer_reviews", "audits", "contract_analyses":
        return classVerdict
    case "auto_apply", "outcomes", "mode_experiments":
        return classTelemetry
    case "distilled_facts", "distilled_procedures", "distilled_config_hints",
        "audit_facts", "observer_escalations":
        return classExtraction
    default:
        // Unknown source → most conservative path (forces
        // needs_human_review until a transform is added).
        return classExtraction
    }
}

// stemOf extracts the stable corpus identifier from a source_file.
// E.g. "data/_kb/scrum_reviews.jsonl" → "scrum_reviews".
func stemOf(sourceFile string) string {
    return strings.TrimSuffix(strings.TrimPrefix(sourceFile, "data/_kb/"), ".jsonl")
}

// ScoreOutput is the scorer's return shape — category + reasons +
// the captured sub-signals. Reasons is always non-empty (validator
// requires it).
type ScoreOutput struct {
    Category  ScoreCategory
    Reasons   []string
    SubScores *SubScores
}

// ScoreRecord dispatches an EvidenceRecord to the appropriate class
// scorer and returns the verdict + reasons + sub-scores. Pure
// function. Caller wraps the output in a ScoredRun via BuildScoredRun
// for the on-wire shape.
func ScoreRecord(rec EvidenceRecord) ScoreOutput {
    cls := sourceClassFor(rec.Provenance.SourceFile)
    stem := stemOf(rec.Provenance.SourceFile)
    switch cls {
    case classVerdict:
        switch stem {
        case "scrum_reviews":
            return scoreScrumReview(rec)
        case "observer_reviews":
            return scoreObserverReview(rec)
        case "audits":
            return scoreAudit(rec)
        case "contract_analyses":
            return scoreContractAnalysis(rec)
        }
    case classTelemetry:
        switch stem {
        case "auto_apply":
            return scoreAutoApply(rec)
        case "outcomes":
            return scoreOutcomes(rec)
        case "mode_experiments":
            return scoreModeExperiment(rec)
        }
    }
    return scoreExtraction()
}

// BuildScoredRun composes a complete ScoredRun for persistence.
// Caller supplies recorded_at + the source file path/line offset.
// SigHash is computed deterministically from the EvidenceRecord
// JSON; ScoredRun traces to the materialized evidence row.
func BuildScoredRun(rec EvidenceRecord, sourceFile string, lineOffset int64, recordedAt string) (ScoredRun, error) {
    out := ScoreRecord(rec)
    sig, err := canonicalSha256(rec)
    if err != nil {
        return ScoredRun{}, fmt.Errorf("scoredrun sig hash: %w", err)
    }
    return ScoredRun{
        SchemaVersion:  ScoredRunSchemaVersion,
        EvidenceRunID:  rec.RunID,
        EvidenceTaskID: rec.TaskID,
        Category:       out.Category,
        Reasons:        out.Reasons,
        ScoredAt:       recordedAt,
        ScorerVersion:  ScorerVersion,
        SubScores:      out.SubScores,
        Provenance: Provenance{
            SourceFile: sourceFile,
            LineOffset: lineOffset,
            SigHash:    sig,
            RecordedAt: recordedAt,
        },
    }, nil
}

// canonicalSha256 hashes a value's canonical JSON encoding. Used
// for ScoredRun.Provenance.SigHash. Matches the Rust pattern of
// "hash the structured object, not the raw source bytes" so
// re-materialization with same logic produces same hash.
func canonicalSha256(v any) (string, error) {
    bs, err := json.Marshal(v)
    if err != nil {
        return "", err
    }
    sum := sha256.Sum256(bs)
    return hex.EncodeToString(sum[:]), nil
}
// ─── Class A: verdict-bearing ────────────────────────────────────

func scoreScrumReview(r EvidenceRecord) ScoreOutput {
    subs := &SubScores{}
    successMarker := findPrefix(r.SuccessMarkers, "accepted_on_attempt_")
    if successMarker == "" {
        return ScoreOutput{
            Category:  CategoryNeedsHumanReview,
            Reasons:   []string{"scrum_review missing accepted_on_attempt_* success marker"},
            SubScores: subs,
        }
    }
    attemptStr := strings.TrimPrefix(successMarker, "accepted_on_attempt_")
    attempt, err := strconv.Atoi(attemptStr)
    if err != nil {
        return ScoreOutput{
            Category:  CategoryNeedsHumanReview,
            Reasons:   []string{"scrum_review accepted_on_attempt_* marker has non-integer suffix: " + attemptStr},
            SubScores: subs,
        }
    }
    subs.AcceptedOnAttempt = &attempt
    switch {
    case attempt == 1:
        return ScoreOutput{
            Category:  CategoryAccepted,
            Reasons:   []string{"scrum: accepted on first attempt"},
            SubScores: subs,
        }
    case attempt <= 3:
        return ScoreOutput{
            Category:  CategoryPartiallyAccepted,
            Reasons:   []string{fmt.Sprintf("scrum: accepted after %d attempts", attempt)},
            SubScores: subs,
        }
    default:
        return ScoreOutput{
            Category:  CategoryPartiallyAccepted,
            Reasons:   []string{fmt.Sprintf("scrum: accepted only after %d attempts (high-cost path)", attempt)},
            SubScores: subs,
        }
    }
}

func scoreObserverReview(r EvidenceRecord) ScoreOutput {
    subs := &SubScores{}
    switch r.ObserverVerdict {
    case VerdictAccept:
        subs.ObserverVerdict = VerdictAccept
        return ScoreOutput{
            Category:  CategoryAccepted,
            Reasons:   []string{"observer accepted the reviewed attempt"},
            SubScores: subs,
        }
    case VerdictReject:
        subs.ObserverVerdict = VerdictReject
        return ScoreOutput{
            Category:  CategoryRejected,
            Reasons:   []string{"observer rejected the reviewed attempt"},
            SubScores: subs,
        }
    case VerdictCycle:
        subs.ObserverVerdict = VerdictCycle
        return ScoreOutput{
            Category:  CategoryPartiallyAccepted,
            Reasons:   []string{"observer flagged the attempt as cycling — partial signal"},
            SubScores: subs,
        }
    default:
        return ScoreOutput{
            Category:  CategoryNeedsHumanReview,
            Reasons:   []string{fmt.Sprintf("observer_verdict missing or unrecognized: %q", r.ObserverVerdict)},
            SubScores: subs,
        }
    }
}

func scoreAudit(r EvidenceRecord) ScoreOutput {
    subs := &SubScores{}
    succ := r.SuccessMarkers
    fail := r.FailureMarkers
    // Legacy markers (back-compat with pre-fix materializations).
    if contains(succ, "approved") {
        return ScoreOutput{Category: CategoryAccepted,
            Reasons: []string{"audit overall=approved (legacy marker)"}, SubScores: subs}
    }
    if contains(fail, "blocked") {
        return ScoreOutput{Category: CategoryRejected,
            Reasons: []string{"audit overall=block (legacy marker)"}, SubScores: subs}
    }
    if contains(fail, "request_changes") {
        return ScoreOutput{Category: CategoryPartiallyAccepted,
            Reasons: []string{"audit overall=request_changes (legacy marker)"}, SubScores: subs}
    }
    // Severity-derived markers (Phase 2 transform).
    sevSucc := findPrefix(succ, "audit_severity_")
    sevFail := findPrefix(fail, "audit_severity_")
    if sevSucc != "" {
        return ScoreOutput{Category: CategoryAccepted,
            Reasons: []string{sevSucc + " → minor finding"}, SubScores: subs}
    }
    if sevFail == "audit_severity_medium" {
        return ScoreOutput{Category: CategoryPartiallyAccepted,
            Reasons: []string{"audit_severity_medium → finding warrants review"}, SubScores: subs}
    }
    if sevFail == "audit_severity_high" || sevFail == "audit_severity_critical" {
        return ScoreOutput{Category: CategoryRejected,
            Reasons: []string{sevFail + " → blocking finding"}, SubScores: subs}
    }
    return ScoreOutput{Category: CategoryNeedsHumanReview,
        Reasons: []string{"audit row has no severity or overall marker"}, SubScores: subs}
}

func scoreContractAnalysis(r EvidenceRecord) ScoreOutput {
    subs := &SubScores{}
    // failure_markers takes precedence: explicit rejection beats absent verdict.
    if contains(r.FailureMarkers, "observer_rejected") || r.ObserverVerdict == VerdictReject {
        subs.ObserverVerdict = VerdictReject
        return ScoreOutput{Category: CategoryRejected,
            Reasons: []string{"contract analysis: observer rejected"}, SubScores: subs}
    }
    switch r.ObserverVerdict {
    case VerdictAccept:
        subs.ObserverVerdict = VerdictAccept
        return ScoreOutput{Category: CategoryAccepted,
            Reasons: []string{"contract analysis: observer accepted"}, SubScores: subs}
    case VerdictCycle:
        subs.ObserverVerdict = VerdictCycle
        return ScoreOutput{Category: CategoryPartiallyAccepted,
            Reasons: []string{"contract analysis: observer cycled (partial)"}, SubScores: subs}
    }
    return ScoreOutput{Category: CategoryNeedsHumanReview,
        Reasons: []string{"contract analysis: no observer verdict signal"}, SubScores: subs}
}
// ─── Class B: telemetry-rich ─────────────────────────────────────

func scoreAutoApply(r EvidenceRecord) ScoreOutput {
    subs := &SubScores{}
    if contains(r.SuccessMarkers, "committed") {
        t := true
        subs.CargoGreen = &t
        return ScoreOutput{Category: CategoryAccepted,
            Reasons:   []string{"auto_apply: patch committed (cargo green + warning baseline + rationale alignment passed)"},
            SubScores: subs}
    }
    reverted := findContaining(r.FailureMarkers, "reverted")
    if reverted != "" {
        if strings.Contains(reverted, "build_red") {
            f := false
            subs.CargoGreen = &f
        }
        return ScoreOutput{Category: CategoryRejected,
            Reasons: []string{"auto_apply: " + reverted}, SubScores: subs}
    }
    return ScoreOutput{Category: CategoryNeedsHumanReview,
        Reasons:   []string{"auto_apply: no commit + no revert (likely no_patches or dry_run)"},
        SubScores: subs}
}

func scoreOutcomes(r EvidenceRecord) ScoreOutput {
    subs := &SubScores{}
    if contains(r.SuccessMarkers, "all_events_ok") {
        return ScoreOutput{Category: CategoryAccepted,
            Reasons: []string{"outcomes: all events ok"}, SubScores: subs}
    }
    if gap := numericFromMap(r.ValidationResults, "gap_signals"); gap > 0 {
        return ScoreOutput{Category: CategoryPartiallyAccepted,
            Reasons:   []string{fmt.Sprintf("outcomes: %d gap signal(s) detected", int(gap))},
            SubScores: subs}
    }
    return ScoreOutput{Category: CategoryNeedsHumanReview,
        Reasons:   []string{"outcomes: no decisive marker — defer to human"},
        SubScores: subs}
}

func scoreModeExperiment(r EvidenceRecord) ScoreOutput {
    subs := &SubScores{}
    if strings.TrimSpace(r.Text) == "" {
        return ScoreOutput{Category: CategoryRejected,
            Reasons: []string{"mode_experiment: empty response text"}, SubScores: subs}
    }
    if r.LatencyMs > 120_000 {
        return ScoreOutput{Category: CategoryPartiallyAccepted,
            Reasons:   []string{fmt.Sprintf("mode_experiment: latency %dms exceeds 2-minute soft cap", r.LatencyMs)},
            SubScores: subs}
    }
    return ScoreOutput{Category: CategoryNeedsHumanReview,
        Reasons:   []string{"mode_experiment: response present, latency within bounds; verdict not yet wired"},
        SubScores: subs}
}

// ─── Class C: pure-extraction ────────────────────────────────────

func scoreExtraction() ScoreOutput {
    return ScoreOutput{
        Category:  CategoryNeedsHumanReview,
        Reasons:   []string{"extraction-class source has no native scoring signal — JOIN to parent verdict pending Phase 3 v2"},
        SubScores: &SubScores{},
    }
}

// ─── Internal helpers ────────────────────────────────────────────

func contains(slice []string, want string) bool {
    for _, s := range slice {
        if s == want {
            return true
        }
    }
    return false
}

func findPrefix(slice []string, prefix string) string {
    for _, s := range slice {
        if strings.HasPrefix(s, prefix) {
            return s
        }
    }
    return ""
}

func findContaining(slice []string, sub string) string {
    for _, s := range slice {
        if strings.Contains(s, sub) {
            return s
        }
    }
    return ""
}

func numericFromMap(m map[string]any, key string) float64 {
    if m == nil {
        return 0
    }
    v, ok := m[key]
    if !ok {
        return 0
    }
    switch n := v.(type) {
    case int:
        return float64(n)
    case int64:
        return float64(n)
    case float32:
        return float64(n)
    case float64:
        return n
    case json.Number:
        f, _ := n.Float64()
        return f
    }
    return 0
}

internal/distillation/scorer_test.go

@@ -0,0 +1,375 @@
package distillation

import (
    "errors"
    "strings"
    "testing"
)

func mkRecord(sourceFile string) EvidenceRecord {
    return EvidenceRecord{
        RunID:         "run-1",
        TaskID:        "task-1",
        Timestamp:     "2026-04-29T12:00:00Z",
        SchemaVersion: EvidenceSchemaVersion,
        Provenance: Provenance{
            SourceFile: sourceFile,
            SigHash:    "deadbeef",
            RecordedAt: "2026-04-29T12:00:01Z",
        },
    }
}

func TestSourceClassFor(t *testing.T) {
    cases := []struct {
        path string
        want sourceClass
    }{
        {"data/_kb/scrum_reviews.jsonl", classVerdict},
        {"data/_kb/observer_reviews.jsonl", classVerdict},
        {"data/_kb/audits.jsonl", classVerdict},
        {"data/_kb/contract_analyses.jsonl", classVerdict},
        {"data/_kb/auto_apply.jsonl", classTelemetry},
        {"data/_kb/outcomes.jsonl", classTelemetry},
        {"data/_kb/mode_experiments.jsonl", classTelemetry},
        {"data/_kb/distilled_facts.jsonl", classExtraction},
        {"data/_kb/audit_facts.jsonl", classExtraction},
        {"data/_kb/observer_escalations.jsonl", classExtraction},
        {"data/_kb/wholly_unknown.jsonl", classExtraction}, // unknown → extraction (conservative)
    }
    for _, c := range cases {
        got := sourceClassFor(c.path)
        if got != c.want {
            t.Errorf("sourceClassFor(%q): want %q, got %q", c.path, c.want, got)
        }
    }
}
func TestScoreScrumReview(t *testing.T) {
    cases := []struct {
        name           string
        successMarkers []string
        wantCategory   ScoreCategory
        wantReasonSub  string
    }{
        {
            name:           "first attempt → accepted",
            successMarkers: []string{"accepted_on_attempt_1"},
            wantCategory:   CategoryAccepted,
            wantReasonSub:  "first attempt",
        },
        {
            name:           "second attempt → partial",
            successMarkers: []string{"accepted_on_attempt_2"},
            wantCategory:   CategoryPartiallyAccepted,
            wantReasonSub:  "after 2 attempts",
        },
        {
            name:           "fourth attempt → partial (high-cost)",
            successMarkers: []string{"accepted_on_attempt_4"},
            wantCategory:   CategoryPartiallyAccepted,
            wantReasonSub:  "high-cost",
        },
        {
            name:           "missing marker → needs_human_review",
            successMarkers: []string{},
            wantCategory:   CategoryNeedsHumanReview,
            wantReasonSub:  "missing accepted_on_attempt",
        },
    }
    for _, c := range cases {
        t.Run(c.name, func(t *testing.T) {
            rec := mkRecord("data/_kb/scrum_reviews.jsonl")
            rec.SuccessMarkers = c.successMarkers
            out := ScoreRecord(rec)
            if out.Category != c.wantCategory {
                t.Errorf("category: want %q, got %q (reasons=%v)", c.wantCategory, out.Category, out.Reasons)
            }
            if !reasonsContain(out.Reasons, c.wantReasonSub) {
                t.Errorf("reasons missing %q: %v", c.wantReasonSub, out.Reasons)
            }
        })
    }
}

func TestScoreObserverReview(t *testing.T) {
    cases := []struct {
        verdict ObserverVerdict
        want    ScoreCategory
    }{
        {VerdictAccept, CategoryAccepted},
        {VerdictReject, CategoryRejected},
        {VerdictCycle, CategoryPartiallyAccepted},
        {"", CategoryNeedsHumanReview},
        {"weird-verdict", CategoryNeedsHumanReview},
    }
    for _, c := range cases {
        rec := mkRecord("data/_kb/observer_reviews.jsonl")
        rec.ObserverVerdict = c.verdict
        out := ScoreRecord(rec)
        if out.Category != c.want {
            t.Errorf("verdict=%q: want %q, got %q", c.verdict, c.want, out.Category)
        }
    }
}

func TestScoreAudit_LegacyAndSeverityMarkers(t *testing.T) {
    cases := []struct {
        name string
        succ []string
        fail []string
        want ScoreCategory
    }{
        {"legacy approved", []string{"approved"}, nil, CategoryAccepted},
        {"legacy blocked", nil, []string{"blocked"}, CategoryRejected},
        {"legacy request_changes", nil, []string{"request_changes"}, CategoryPartiallyAccepted},
        {"severity_low → accepted", []string{"audit_severity_low"}, nil, CategoryAccepted},
        {"severity_info → accepted", []string{"audit_severity_info"}, nil, CategoryAccepted},
        {"severity_medium fail → partial", nil, []string{"audit_severity_medium"}, CategoryPartiallyAccepted},
        {"severity_high → rejected", nil, []string{"audit_severity_high"}, CategoryRejected},
        {"severity_critical → rejected", nil, []string{"audit_severity_critical"}, CategoryRejected},
        {"no markers", nil, nil, CategoryNeedsHumanReview},
    }
    for _, c := range cases {
        t.Run(c.name, func(t *testing.T) {
            rec := mkRecord("data/_kb/audits.jsonl")
            rec.SuccessMarkers = c.succ
            rec.FailureMarkers = c.fail
            out := ScoreRecord(rec)
            if out.Category != c.want {
                t.Errorf("want %q, got %q (reasons=%v)", c.want, out.Category, out.Reasons)
            }
        })
    }
}

func TestScoreAutoApply(t *testing.T) {
    cases := []struct {
        name string
        succ []string
        fail []string
        want ScoreCategory
    }{
        {"committed → accepted", []string{"committed"}, nil, CategoryAccepted},
        {"reverted_build_red → rejected", nil, []string{"reverted_build_red"}, CategoryRejected},
        {"reverted other → rejected", nil, []string{"reverted_warning_count_up"}, CategoryRejected},
        {"no signal → needs_human", nil, nil, CategoryNeedsHumanReview},
    }
    for _, c := range cases {
        t.Run(c.name, func(t *testing.T) {
            rec := mkRecord("data/_kb/auto_apply.jsonl")
            rec.SuccessMarkers = c.succ
            rec.FailureMarkers = c.fail
            out := ScoreRecord(rec)
            if out.Category != c.want {
                t.Errorf("want %q, got %q", c.want, out.Category)
            }
        })
    }
}
func TestScoreOutcomes(t *testing.T) {
    rec := mkRecord("data/_kb/outcomes.jsonl")
    rec.SuccessMarkers = []string{"all_events_ok"}
    if out := ScoreRecord(rec); out.Category != CategoryAccepted {
        t.Errorf("all_events_ok: want accepted, got %q", out.Category)
    }

    rec2 := mkRecord("data/_kb/outcomes.jsonl")
    rec2.ValidationResults = map[string]any{"gap_signals": float64(2)}
    if out := ScoreRecord(rec2); out.Category != CategoryPartiallyAccepted {
        t.Errorf("gap_signals=2: want partial, got %q (reasons=%v)", out.Category, out.Reasons)
    }

    rec3 := mkRecord("data/_kb/outcomes.jsonl")
    if out := ScoreRecord(rec3); out.Category != CategoryNeedsHumanReview {
        t.Errorf("no signal: want needs_human, got %q", out.Category)
    }
}

func TestScoreModeExperiment(t *testing.T) {
    rec := mkRecord("data/_kb/mode_experiments.jsonl")
    rec.Text = ""
    if out := ScoreRecord(rec); out.Category != CategoryRejected {
        t.Errorf("empty text: want rejected, got %q", out.Category)
    }

    rec.Text = "real response"
    rec.LatencyMs = 130_000
    if out := ScoreRecord(rec); out.Category != CategoryPartiallyAccepted {
        t.Errorf("over latency cap: want partial, got %q", out.Category)
    }

    rec.LatencyMs = 5000
    if out := ScoreRecord(rec); out.Category != CategoryNeedsHumanReview {
        t.Errorf("normal: want needs_human (verdict not yet wired), got %q", out.Category)
    }
}

func TestScoreExtraction_Defaults(t *testing.T) {
    for _, src := range []string{
        "data/_kb/distilled_facts.jsonl",
        "data/_kb/distilled_procedures.jsonl",
        "data/_kb/audit_facts.jsonl",
        "data/_kb/observer_escalations.jsonl",
    } {
        rec := mkRecord(src)
        out := ScoreRecord(rec)
        if out.Category != CategoryNeedsHumanReview {
            t.Errorf("%s: want needs_human_review, got %q", src, out.Category)
        }
    }
}

// ─── Contamination firewall — the safety-critical guarantee ───────

func TestValidateSftSample_RejectsContaminationCategories(t *testing.T) {
    for _, contaminated := range []SftQualityScore{
        SftQualityScore("rejected"),
        SftQualityScore("needs_human_review"),
    } {
        s := goodSftSample()
        s.QualityScore = contaminated
        err := ValidateSftSample(s)
        if err == nil {
            t.Errorf("contaminated quality_score=%q should fail validation", contaminated)
            continue
        }
        if !errors.Is(err, ErrSftContamination) {
            t.Errorf("contaminated %q: want errors.Is(err, ErrSftContamination), got %v", contaminated, err)
        }
    }
}

func TestValidateSftSample_AcceptsLegalCategories(t *testing.T) {
    for _, legal := range []SftQualityScore{SftQualityAccepted, SftQualityPartiallyAccepted} {
        s := goodSftSample()
        s.QualityScore = legal
        if err := ValidateSftSample(s); err != nil {
            t.Errorf("legal quality_score=%q failed: %v", legal, err)
        }
    }
}

func TestValidateSftSample_RejectsTypoCategory(t *testing.T) {
    s := goodSftSample()
    s.QualityScore = "approved" // close to "accepted" but wrong
    err := ValidateSftSample(s)
    if err == nil {
        t.Fatal("typo category should fail validation")
    }
    // Typo is NOT contamination — should be a regular ValidationError,
    // not the firewall sentinel. This distinguishes "you typo'd" from
    // "you broke the spec."
    if errors.Is(err, ErrSftContamination) {
        t.Error("typo should not surface as ErrSftContamination")
    }
}

func TestValidateSftSample_RejectsEmptyPair(t *testing.T) {
    s := goodSftSample()
    s.Instruction = "   "
    if err := ValidateSftSample(s); err == nil {
        t.Error("whitespace-only instruction should fail")
    }

    s2 := goodSftSample()
    s2.Response = ""
    if err := ValidateSftSample(s2); err == nil {
        t.Error("empty response should fail")
    }
}

func TestValidateScoredRun_ReasonsRequired(t *testing.T) {
    r := ScoredRun{
        SchemaVersion:  ScoredRunSchemaVersion,
        EvidenceRunID:  "x",
        EvidenceTaskID: "y",
        Category:       CategoryAccepted,
        Reasons:        nil, // empty — must fail
        ScoredAt:       "2026-04-29T12:00:00Z",
        ScorerVersion:  ScorerVersion,
        Provenance: Provenance{
            SourceFile: "data/_kb/scrum_reviews.jsonl",
            SigHash:    "abc",
            RecordedAt: "2026-04-29T12:00:00Z",
        },
    }
    err := ValidateScoredRun(r)
    if err == nil {
        t.Fatal("empty reasons should fail")
    }
    if !strings.Contains(err.Error(), "reasons") {
        t.Errorf("error should mention reasons: %v", err)
    }
}
func TestBuildScoredRun_DeterministicSigHash(t *testing.T) {
    rec := mkRecord("data/_kb/scrum_reviews.jsonl")
    rec.SuccessMarkers = []string{"accepted_on_attempt_1"}

    r1, err := BuildScoredRun(rec, "data/scored-runs/2026/04/29/x.jsonl", 0, "2026-04-29T12:00:00Z")
    if err != nil {
        t.Fatal(err)
    }
    r2, err := BuildScoredRun(rec, "data/scored-runs/2026/04/29/x.jsonl", 0, "2026-04-29T12:00:00Z")
    if err != nil {
        t.Fatal(err)
    }

    if r1.Provenance.SigHash != r2.Provenance.SigHash {
        t.Errorf("identical EvidenceRecord should produce identical sig_hash: %s vs %s",
            r1.Provenance.SigHash, r2.Provenance.SigHash)
    }
    if r1.Category != CategoryAccepted {
        t.Errorf("scored category: %q", r1.Category)
    }
    if r1.ScorerVersion != ScorerVersion {
        t.Errorf("scorer version stamped wrong: %q", r1.ScorerVersion)
    }
}

func TestScoreRecord_PureFunction_NoMutationOfInput(t *testing.T) {
    // Belt-and-braces: the contract says "NO mutable state, identical
    // input → identical output forever." Verify by scoring the same
    // record twice and ensuring the input hasn't been touched.
    rec := mkRecord("data/_kb/scrum_reviews.jsonl")
    rec.SuccessMarkers = []string{"accepted_on_attempt_2"}
    original := rec

    out1 := ScoreRecord(rec)
    out2 := ScoreRecord(rec)

    if rec.RunID != original.RunID || len(rec.SuccessMarkers) != 1 {
        t.Error("ScoreRecord mutated its input")
    }
    if out1.Category != out2.Category {
        t.Error("ScoreRecord is non-deterministic")
    }
}

// ─── Helpers ─────────────────────────────────────────────────────

func goodSftSample() SftSample {
    return SftSample{
        SchemaVersion: SftSampleSchemaVersion,
        ID:            "sft-1",
        Instruction:   "summarize the diff",
        Context:       "diff body...",
        Response:      "the diff adds a function",
        SourceRunID:   "run-1",
        QualityScore:  SftQualityAccepted,
        CreatedAt:     "2026-04-29T12:00:00Z",
        Provenance: Provenance{
            SourceFile: "data/scored-runs/2026/04/29/x.jsonl",
            SigHash:    "deadbeef",
            RecordedAt: "2026-04-29T12:00:01Z",
        },
    }
}

func reasonsContain(reasons []string, sub string) bool {
    for _, r := range reasons {
        if strings.Contains(r, sub) {
            return true
        }
    }
    return false
}

internal/distillation/types.go

@@ -0,0 +1,484 @@
// Package distillation is the Go port of the Rust v1.0.0 distillation
// substrate (frozen at e7636f2). Per ADR-001 #4: port LOGIC, not
// bit-identical reproducibility.
//
// What this package owns (this commit):
//   - The deterministic scorer: EvidenceRecord → ScoredRun
//   - Score categories + scorer version constant
//   - SftSample type + validator with the contamination firewall
//     (the safety-critical piece — rejected/needs_human_review must
//     NEVER ship to SFT)
//
// What's deferred to follow-up commits:
//   - Materialization layer (file iteration, jsonl read/write,
//     date-partitioned storage) — operational tooling on top of
//     the scorer logic
//   - export_preference, export_rag (other export shapes)
//   - acceptance harness (the gate that locks v1.0.0)
//   - replay, receipts, evidence-index builders
//
// The scorer + SftSample validator are the LOAD-BEARING pieces
// per project_distillation_substrate.md memory. The rest is plumbing
// that can land incrementally without changing the logic the
// downstream learning loop depends on.
package distillation

import (
    "encoding/json"
    "errors"
    "fmt"
    "strings"
    "time"
)
// ScoreCategory is one of the 4 deterministic verdicts. Matches Rust
// SCORE_CATEGORIES exactly.
type ScoreCategory string

const (
    CategoryAccepted          ScoreCategory = "accepted"
    CategoryPartiallyAccepted ScoreCategory = "partially_accepted"
    CategoryRejected          ScoreCategory = "rejected"
    CategoryNeedsHumanReview  ScoreCategory = "needs_human_review"
)

// AllScoreCategories lists every legal category — used by validators.
var AllScoreCategories = []ScoreCategory{
    CategoryAccepted,
    CategoryPartiallyAccepted,
    CategoryRejected,
    CategoryNeedsHumanReview,
}

// ScorerVersion is hardcoded — the deterministic-output contract
// requires this. Bump the literal in the same commit as any scoring-
// rule change so the version stamp moves atomically with logic.
// Mirrors the Rust SCORER_VERSION (also v1.0.0 at e7636f2).
const ScorerVersion = "v1.0.0"

// SftQualityScore enumerates the categories LEGAL in SFT exports.
// SftNever (defined below, mirroring the Rust SFT_NEVER) is the
// inverse — categories that NEVER ship to SFT under any flag
// combination. The contamination firewall is enforced at the schema
// layer (ValidateSftSample) AND by the exporter; defense in depth.
type SftQualityScore string

const (
    SftQualityAccepted          SftQualityScore = "accepted"
    SftQualityPartiallyAccepted SftQualityScore = "partially_accepted"
)

// SftQualityScores lists quality scores legal in SFT samples.
// Default is SftQualityAccepted only; --include-partial CLI flag
// expands to both. rejected and needs_human_review are NEVER legal.
var SftQualityScores = []SftQualityScore{
    SftQualityAccepted,
    SftQualityPartiallyAccepted,
}

// SftNever is the contamination firewall: ScoreCategories that NEVER
// ship to SFT under ANY caller flag. Enforced at the schema layer
// (ValidateSftSample) AND at the exporter layer. Per the Rust
// e7636f2 spec: "Hard non-negotiable: this set never expands. If you
// find yourself adding 'needs_human_review' or 'rejected' here, stop
// — that's the contamination the spec forbids."
//
// Exported so callers AND the validator share the same source of
// truth. Modifying this variable changes the contract; reviewers
// should treat any commit that touches it as a security review.
var SftNever = []ScoreCategory{
    CategoryRejected,
    CategoryNeedsHumanReview,
}

// SftSampleSchemaVersion bumps when the on-wire SftSample shape
// changes incompatibly. Matches the Rust SFT_SAMPLE_SCHEMA_VERSION.
const SftSampleSchemaVersion = 1

// ScoredRunSchemaVersion bumps when the on-wire ScoredRun shape
// changes incompatibly. Matches the Rust SCORED_RUN_SCHEMA_VERSION.
const ScoredRunSchemaVersion = 1

// EvidenceSchemaVersion mirrors the Rust EVIDENCE_SCHEMA_VERSION.
// This package consumes EvidenceRecord; producing it is a separate
// concern (the materialization layer not yet ported).
const EvidenceSchemaVersion = 1

// ModelRole categorizes the kind of model output represented by an
// EvidenceRecord. Used by the SFT exporter to filter "real model
// output" from pure-extraction rows.
type ModelRole string

const (
    RoleExecutor    ModelRole = "executor"
    RoleReviewer    ModelRole = "reviewer"
    RoleExtractor   ModelRole = "extractor"
    RoleVerifier    ModelRole = "verifier"
    RoleCategorizer ModelRole = "categorizer"
    RoleTiebreaker  ModelRole = "tiebreaker"
    RoleApplier     ModelRole = "applier"
    RoleEmbedder    ModelRole = "embedder"
    RoleOther       ModelRole = "other"
)
// Provenance is the source-linkage every distillation record carries.
// SourceFile is required (no record without source linkage); other
// fields are best-effort for de-duplication and trace-back.
type Provenance struct {
SourceFile string `json:"source_file"`
LineOffset int64 `json:"line_offset,omitempty"`
SigHash string `json:"sig_hash"`
RecordedAt string `json:"recorded_at"` // ISO 8601
}
// ObserverVerdict is what an observer returned for an executor's
// output. Matches the Rust enum but as a string type for JSON
// flexibility.
type ObserverVerdict string
const (
VerdictAccept ObserverVerdict = "accept"
VerdictReject ObserverVerdict = "reject"
VerdictCycle ObserverVerdict = "cycle"
)
// EvidenceRecord is one row in the canonical evidence stream.
// Producing it (transforms from raw KB streams) is a separate
// concern; this package consumes it.
//
// Fields mirror the Rust EvidenceRecord at e7636f2. Optional fields
// use Go pointers / slices so missing-vs-empty stays distinguishable
// for the scorer's heuristics.
type EvidenceRecord struct {
RunID string `json:"run_id"`
TaskID string `json:"task_id"`
Timestamp string `json:"timestamp"`
SchemaVersion int `json:"schema_version"`
Provenance Provenance `json:"provenance"`
ModelName string `json:"model_name,omitempty"`
ModelProvider string `json:"model_provider,omitempty"`
ModelRole ModelRole `json:"model_role,omitempty"`
InputHash string `json:"input_hash,omitempty"`
OutputHash string `json:"output_hash,omitempty"`
SourceFiles []string `json:"source_files,omitempty"`
CommandsRun []string `json:"commands_run,omitempty"`
RetrievedContext *RetrievedContext `json:"retrieved_context,omitempty"`
ObserverNotes []string `json:"observer_notes,omitempty"`
ObserverVerdict ObserverVerdict `json:"observer_verdict,omitempty"`
	ObserverConfidence float64 `json:"observer_confidence,omitempty"` // note: omitempty drops 0, so zero confidence reads as "missing"
ScratchpadSummary string `json:"scratchpad_summary,omitempty"`
SuccessMarkers []string `json:"success_markers,omitempty"`
FailureMarkers []string `json:"failure_markers,omitempty"`
ValidationResults map[string]any `json:"validation_results,omitempty"`
HumanOverride *HumanOverride `json:"human_override,omitempty"`
CostUSD float64 `json:"cost_usd,omitempty"`
LatencyMs int64 `json:"latency_ms,omitempty"`
Text string `json:"text,omitempty"`
}
// RetrievedContext captures what the model saw via retrieval. Matches
// the Rust shape exactly so the JSON round-trips byte-identical (per
// ADR-001 #4 "logic, not bit-identical" — but on-wire compatibility
// is desirable for tooling that consumes EvidenceRecord JSONL).
type RetrievedContext struct {
MatrixCorpora []string `json:"matrix_corpora,omitempty"`
MatrixHits int `json:"matrix_hits,omitempty"`
MatrixChunksKept int `json:"matrix_chunks_kept,omitempty"`
MatrixChunksDropped int `json:"matrix_chunks_dropped,omitempty"`
PathwayFingerprintsSeen int `json:"pathway_fingerprints_seen,omitempty"`
}
// HumanOverride captures a human-in-the-loop decision overriding the
// scorer's verdict. Recorded but doesn't change the scorer's output;
// downstream consumers (UI, distillation acceptance) decide how to
// treat it.
type HumanOverride struct {
Overrider string `json:"overrider"`
Decision string `json:"decision"` // accept|reject|needs_review
Reason string `json:"reason"`
OverriddenAt string `json:"overridden_at"`
}
// SubScores carries the deterministic scorer's intermediate signals
// alongside the final ScoreCategory. Persisted on every ScoredRun
// so a downstream UI can show "why" without re-running the scorer.
type SubScores struct {
CargoGreen *bool `json:"cargo_green,omitempty"`
AnchorGrounding *float64 `json:"anchor_grounding,omitempty"`
SchemaValid *bool `json:"schema_valid,omitempty"`
PathwayReplaySucceeded *bool `json:"pathway_replay_succeeded,omitempty"`
ObserverVerdict ObserverVerdict `json:"observer_verdict,omitempty"`
AcceptedOnAttempt *int `json:"accepted_on_attempt,omitempty"`
// Extra fields the Rust schema accepted as arbitrary additional keys.
// Captured here as a free-form map so future signals don't require
// type-system changes.
Extras map[string]any `json:"-"`
}
// ScoredRun is the deterministic scorer's output. One per
// EvidenceRecord. Provenance ties back to the materialized evidence
// row (not the raw source stream).
type ScoredRun struct {
SchemaVersion int `json:"schema_version"`
EvidenceRunID string `json:"evidence_run_id"`
EvidenceTaskID string `json:"evidence_task_id"`
Category ScoreCategory `json:"category"`
Reasons []string `json:"reasons"` // non-empty
ScoredAt string `json:"scored_at"`
ScorerVersion string `json:"scorer_version"`
SubScores *SubScores `json:"sub_scores,omitempty"`
Provenance Provenance `json:"provenance"`
}
// SftSample is one entry in exports/sft/instruction_response.jsonl.
// The contamination firewall lives in ValidateSftSample.
type SftSample struct {
SchemaVersion int `json:"schema_version"`
ID string `json:"id"`
Instruction string `json:"instruction"`
Context string `json:"context"` // empty allowed; null/missing not
Response string `json:"response"`
SourceRunID string `json:"source_run_id"`
QualityScore SftQualityScore `json:"quality_score"`
CreatedAt string `json:"created_at"`
Provenance Provenance `json:"provenance"`
}
// ─── Validators ──────────────────────────────────────────────────
// ValidationError is a single field-level violation.
type ValidationError struct {
Field string
Message string
}
func (e ValidationError) Error() string {
return fmt.Sprintf("%s: %s", e.Field, e.Message)
}
// ValidationErrors is the joinable error returned by the validators
// when one or more fields violate the schema.
type ValidationErrors []ValidationError
func (es ValidationErrors) Error() string {
if len(es) == 0 {
return "no errors"
}
parts := make([]string, len(es))
for i, e := range es {
parts[i] = e.Error()
}
return strings.Join(parts, "; ")
}
// HasErrors returns true when one or more errors are present.
func (es ValidationErrors) HasErrors() bool { return len(es) > 0 }
// ValidateScoredRun mirrors the Rust scored-run validator. Returns nil
// on success or a ValidationErrors listing the field-level violations.
func ValidateScoredRun(r ScoredRun) error {
var errs ValidationErrors
if r.SchemaVersion != ScoredRunSchemaVersion {
errs = append(errs, ValidationError{
"schema_version",
fmt.Sprintf("expected %d, got %d", ScoredRunSchemaVersion, r.SchemaVersion),
})
}
if r.EvidenceRunID == "" {
errs = append(errs, ValidationError{"evidence_run_id", "must be non-empty"})
}
if r.EvidenceTaskID == "" {
errs = append(errs, ValidationError{"evidence_task_id", "must be non-empty"})
}
if !validISOTimestamp(r.ScoredAt) {
errs = append(errs, ValidationError{"scored_at", "must be ISO 8601 timestamp"})
}
if r.ScorerVersion == "" {
errs = append(errs, ValidationError{"scorer_version", "must be non-empty"})
}
if len(r.Reasons) == 0 {
errs = append(errs, ValidationError{"reasons", "must be non-empty (every score needs a reason)"})
}
if !isValidCategory(r.Category) {
errs = append(errs, ValidationError{"category", fmt.Sprintf("must be one of %v, got %q", AllScoreCategories, r.Category)})
}
if err := validateProvenance(r.Provenance, "provenance"); err != nil {
errs = append(errs, err...)
}
if r.SubScores != nil && r.SubScores.AnchorGrounding != nil {
ag := *r.SubScores.AnchorGrounding
if ag < 0 || ag > 1 {
errs = append(errs, ValidationError{"sub_scores.anchor_grounding", "must be in [0, 1]"})
}
}
if errs.HasErrors() {
return errs
}
return nil
}
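
// A hypothetical caller sketch (not part of the port): field-level
// violations come back as a ValidationErrors slice, so a caller can
// unwrap and report each one rather than a single joined string:
//
//	if err := ValidateScoredRun(run); err != nil {
//		var verrs ValidationErrors
//		if errors.As(err, &verrs) {
//			for _, ve := range verrs {
//				log.Printf("scored run invalid: %s", ve)
//			}
//		}
//	}
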
// ValidateSftSample is the contamination firewall. Returns ErrSftContamination
// (wrapped) when quality_score is in SftNever — which is the safety-critical
// guarantee the spec calls non-negotiable.
//
// Other field violations come back as ValidationErrors.
func ValidateSftSample(s SftSample) error {
var errs ValidationErrors
if s.SchemaVersion != SftSampleSchemaVersion {
errs = append(errs, ValidationError{
"schema_version",
fmt.Sprintf("expected %d, got %d", SftSampleSchemaVersion, s.SchemaVersion),
})
}
if s.ID == "" {
errs = append(errs, ValidationError{"id", "must be non-empty"})
}
if strings.TrimSpace(s.Instruction) == "" {
errs = append(errs, ValidationError{"instruction", "must be non-whitespace (no empty pairs)"})
}
if strings.TrimSpace(s.Response) == "" {
errs = append(errs, ValidationError{"response", "must be non-whitespace (no empty pairs)"})
}
// Context is required but may be empty. The Go field is a plain
// string, so "missing" vs "empty" can only be distinguished at the
// JSON layer; by validation time an empty string is acceptable.
if s.SourceRunID == "" {
errs = append(errs, ValidationError{"source_run_id", "must be non-empty"})
}
if !validISOTimestamp(s.CreatedAt) {
errs = append(errs, ValidationError{"created_at", "must be ISO 8601 timestamp"})
}
if err := validateProvenance(s.Provenance, "provenance"); err != nil {
errs = append(errs, err...)
}
// Contamination firewall. Hard non-negotiable per the spec.
if !isLegalSftQualityScore(s.QualityScore) {
// If it's in SftNever, surface the firewall sentinel immediately;
// it takes priority over any field errors accumulated above. Callers
// can errors.Is(err, ErrSftContamination) to reliably distinguish
// "the spec said never" from "you typo'd a category."
if isContaminationCategory(s.QualityScore) {
return fmt.Errorf("%w: quality_score %q in SftNever (rejected/needs_human_review never legal in SFT)",
ErrSftContamination, s.QualityScore)
}
errs = append(errs, ValidationError{
"quality_score",
fmt.Sprintf("must be one of %v, got %q", SftQualityScores, s.QualityScore),
})
}
if errs.HasErrors() {
return errs
}
return nil
}
// ErrSftContamination is the firewall sentinel — when ValidateSftSample
// rejects a sample because its quality_score is in SftNever, callers
// can errors.Is(err, ErrSftContamination) to reliably distinguish
// "spec violation" from "typo'd category."
var ErrSftContamination = errors.New("distillation: SFT contamination — quality_score in SftNever")
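
// Exporter-side sketch (hypothetical; the SFT exporter itself is
// deferred to a follow-up commit). Contamination is detected with
// errors.Is, never by string-matching the message:
//
//	err := ValidateSftSample(sample)
//	switch {
//	case errors.Is(err, ErrSftContamination):
//		// firewall: rejected/needs_human_review never ships to SFT
//	case err != nil:
//		// malformed sample; surface for repair
//	default:
//		// safe to export
//	}
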
// ─── Internal helpers ────────────────────────────────────────────
func isValidCategory(c ScoreCategory) bool {
for _, v := range AllScoreCategories {
if c == v {
return true
}
}
return false
}
func isLegalSftQualityScore(q SftQualityScore) bool {
for _, v := range SftQualityScores {
if q == v {
return true
}
}
return false
}
func isContaminationCategory(q SftQualityScore) bool {
// SftNever holds ScoreCategory values while q is an SftQualityScore;
// the on-wire strings are identical, so compare through string. This
// deliberately catches inputs that are illegal quality scores yet
// match a banned category string-wise: those must trip the firewall,
// not the generic typo path.
for _, v := range SftNever {
if string(v) == string(q) {
return true
}
}
return false
}
func validISOTimestamp(s string) bool {
	if s == "" {
		return false
	}
	// time.Parse accepts an optional fractional-seconds field even when
	// the layout omits it, so the RFC3339 layout alone covers both the
	// plain and nano-precision timestamps the Rust producers emit.
	// (RFC 3339 is a profile of ISO 8601: date-only strings and exotic
	// offsets are rejected, which is what we want here.)
	_, err := time.Parse(time.RFC3339, s)
	return err == nil
}
func validateProvenance(p Provenance, field string) ValidationErrors {
var errs ValidationErrors
if p.SourceFile == "" {
errs = append(errs, ValidationError{field + ".source_file", "must be non-empty"})
}
if p.SigHash == "" {
errs = append(errs, ValidationError{field + ".sig_hash", "must be non-empty"})
}
if !validISOTimestamp(p.RecordedAt) {
errs = append(errs, ValidationError{field + ".recorded_at", "must be ISO 8601 timestamp"})
}
return errs
}
// MarshalSubScores is a shim: Go's encoding/json has no equivalent of
// serde's #[serde(flatten)], so a "rest" map can't be merged into the
// struct's JSON output declaratively. Callers that need Extras
// serialized into the same object can use this helper.
func MarshalSubScores(s *SubScores) ([]byte, error) {
if s == nil {
return []byte("null"), nil
}
// First marshal the typed fields normally.
type alias SubScores
base, err := json.Marshal((*alias)(s))
if err != nil {
return nil, err
}
if len(s.Extras) == 0 {
return base, nil
}
// Decode back to a map, merge Extras, re-encode. Less efficient
// but keeps the field semantics correct (typed fields override
// extras on collision — first-write-wins for known keys).
var combined map[string]any
if err := json.Unmarshal(base, &combined); err != nil {
return nil, err
}
for k, v := range s.Extras {
if _, exists := combined[k]; !exists {
combined[k] = v
}
}
return json.Marshal(combined)
}
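
// Usage sketch (hypothetical values). Extras keys that don't collide
// with a typed field are merged into the same JSON object; Go sorts
// map keys when encoding, so the merged output stays deterministic:
//
//	replay := true
//	ss := &SubScores{
//		PathwayReplaySucceeded: &replay,
//		Extras:                 map[string]any{"lint_warnings": 3},
//	}
//	b, _ := MarshalSubScores(ss)
//	// string(b) == `{"lint_warnings":3,"pathway_replay_succeeded":true}`
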