First slice of the Rust v1.0.0 distillation substrate (e7636f2) ported to Go per ADR-001 #4 (port LOGIC, not bit-identical reproducibility).

This commit lands the LOAD-BEARING pieces named in project_distillation_substrate.md memory:
- The deterministic Success Scorer (8 sub-scorers + dispatch)
- The contamination firewall on SFT samples (the "non-negotiable" spec property: rejected/needs_human_review NEVER ship to SFT)
- All on-wire types + validators for ScoredRun, SftSample, EvidenceRecord with Provenance

Files:
internal/distillation/types.go — types + ScorerVersion + SftNever + ValidateScoredRun + ValidateSftSample
internal/distillation/scorer.go — ScoreRecord + 8 class scorers + BuildScoredRun (deterministic)
internal/distillation/scorer_test.go — ~40 test cases:
- source-class dispatch (verdict / telemetry / extraction)
- scrum_review (4 attempt cases)
- observer_review (5 verdict cases)
- audit (legacy + severity, 9 cases)
- auto_apply (4 cases)
- outcomes / mode_experiment / extraction
- CONTAMINATION FIREWALL: ErrSftContamination sentinel fires on rejected/needs_human_review, distinct from typo errors
- empty-pair guard (instruction/response trim != "")
- reasons-required ScoredRun validation
- deterministic sig_hash on identical input
- purity check (input not mutated, repeatable output)

Per the 2026-04-29 cross-lineage scrum's discipline: false-positive findings would be dismissed inline (none in this commit). Real findings would be addressed before merge — but this is greenfield port code reviewed against its Rust source line-by-line, which the test suite encodes as truth tables.
Explicitly DEFERRED to follow-up commits:
- Materialization layer (jsonl read/write, date-partitioned storage in data/scored-runs/YYYY/MM/DD/, evidence index)
- SFT exporter (file iteration + filtering — the SCORING firewall is here; the EXPORT firewall is the next layer)
- export_preference, export_rag (other export shapes)
- Acceptance harness (16/16 acceptance gate that locks v1.0.0)
- replay, receipts, build_evidence_index, transforms

The scorer + firewall validator are pure functions — operational tooling layers on top without changing the deterministic logic the downstream learning loop depends on.

The Go ScorerVersion stays at v1.0.0 to match the Rust e7636f2 baseline; bumping in the Go materialization commit is reserved for the next scoring-rule change, NOT the port itself.

15-smoke regression all green. vet clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
376 lines
12 KiB
Go
376 lines
12 KiB
Go
package distillation
|
|
|
|
import (
|
|
"errors"
|
|
"strings"
|
|
"testing"
|
|
)
|
|
|
|
func mkRecord(sourceFile string) EvidenceRecord {
|
|
return EvidenceRecord{
|
|
RunID: "run-1",
|
|
TaskID: "task-1",
|
|
Timestamp: "2026-04-29T12:00:00Z",
|
|
SchemaVersion: EvidenceSchemaVersion,
|
|
Provenance: Provenance{
|
|
SourceFile: sourceFile,
|
|
SigHash: "deadbeef",
|
|
RecordedAt: "2026-04-29T12:00:01Z",
|
|
},
|
|
}
|
|
}
|
|
|
|
func TestSourceClassFor(t *testing.T) {
|
|
cases := []struct {
|
|
path string
|
|
want sourceClass
|
|
}{
|
|
{"data/_kb/scrum_reviews.jsonl", classVerdict},
|
|
{"data/_kb/observer_reviews.jsonl", classVerdict},
|
|
{"data/_kb/audits.jsonl", classVerdict},
|
|
{"data/_kb/contract_analyses.jsonl", classVerdict},
|
|
{"data/_kb/auto_apply.jsonl", classTelemetry},
|
|
{"data/_kb/outcomes.jsonl", classTelemetry},
|
|
{"data/_kb/mode_experiments.jsonl", classTelemetry},
|
|
{"data/_kb/distilled_facts.jsonl", classExtraction},
|
|
{"data/_kb/audit_facts.jsonl", classExtraction},
|
|
{"data/_kb/observer_escalations.jsonl", classExtraction},
|
|
{"data/_kb/wholly_unknown.jsonl", classExtraction}, // unknown → extraction (conservative)
|
|
}
|
|
for _, c := range cases {
|
|
got := sourceClassFor(c.path)
|
|
if got != c.want {
|
|
t.Errorf("sourceClassFor(%q): want %q, got %q", c.path, c.want, got)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestScoreScrumReview(t *testing.T) {
|
|
cases := []struct {
|
|
name string
|
|
successMarkers []string
|
|
wantCategory ScoreCategory
|
|
wantReasonSub string
|
|
}{
|
|
{
|
|
name: "first attempt → accepted",
|
|
successMarkers: []string{"accepted_on_attempt_1"},
|
|
wantCategory: CategoryAccepted,
|
|
wantReasonSub: "first attempt",
|
|
},
|
|
{
|
|
name: "second attempt → partial",
|
|
successMarkers: []string{"accepted_on_attempt_2"},
|
|
wantCategory: CategoryPartiallyAccepted,
|
|
wantReasonSub: "after 2 attempts",
|
|
},
|
|
{
|
|
name: "fourth attempt → partial (high-cost)",
|
|
successMarkers: []string{"accepted_on_attempt_4"},
|
|
wantCategory: CategoryPartiallyAccepted,
|
|
wantReasonSub: "high-cost",
|
|
},
|
|
{
|
|
name: "missing marker → needs_human_review",
|
|
successMarkers: []string{},
|
|
wantCategory: CategoryNeedsHumanReview,
|
|
wantReasonSub: "missing accepted_on_attempt",
|
|
},
|
|
}
|
|
for _, c := range cases {
|
|
t.Run(c.name, func(t *testing.T) {
|
|
rec := mkRecord("data/_kb/scrum_reviews.jsonl")
|
|
rec.SuccessMarkers = c.successMarkers
|
|
out := ScoreRecord(rec)
|
|
if out.Category != c.wantCategory {
|
|
t.Errorf("category: want %q, got %q (reasons=%v)", c.wantCategory, out.Category, out.Reasons)
|
|
}
|
|
if !reasonsContain(out.Reasons, c.wantReasonSub) {
|
|
t.Errorf("reasons missing %q: %v", c.wantReasonSub, out.Reasons)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestScoreObserverReview(t *testing.T) {
|
|
cases := []struct {
|
|
verdict ObserverVerdict
|
|
want ScoreCategory
|
|
}{
|
|
{VerdictAccept, CategoryAccepted},
|
|
{VerdictReject, CategoryRejected},
|
|
{VerdictCycle, CategoryPartiallyAccepted},
|
|
{"", CategoryNeedsHumanReview},
|
|
{"weird-verdict", CategoryNeedsHumanReview},
|
|
}
|
|
for _, c := range cases {
|
|
rec := mkRecord("data/_kb/observer_reviews.jsonl")
|
|
rec.ObserverVerdict = c.verdict
|
|
out := ScoreRecord(rec)
|
|
if out.Category != c.want {
|
|
t.Errorf("verdict=%q: want %q, got %q", c.verdict, c.want, out.Category)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestScoreAudit_LegacyAndSeverityMarkers(t *testing.T) {
|
|
cases := []struct {
|
|
name string
|
|
succ []string
|
|
fail []string
|
|
want ScoreCategory
|
|
}{
|
|
{"legacy approved", []string{"approved"}, nil, CategoryAccepted},
|
|
{"legacy blocked", nil, []string{"blocked"}, CategoryRejected},
|
|
{"legacy request_changes", nil, []string{"request_changes"}, CategoryPartiallyAccepted},
|
|
{"severity_low → accepted", []string{"audit_severity_low"}, nil, CategoryAccepted},
|
|
{"severity_info → accepted", []string{"audit_severity_info"}, nil, CategoryAccepted},
|
|
{"severity_medium fail → partial", nil, []string{"audit_severity_medium"}, CategoryPartiallyAccepted},
|
|
{"severity_high → rejected", nil, []string{"audit_severity_high"}, CategoryRejected},
|
|
{"severity_critical → rejected", nil, []string{"audit_severity_critical"}, CategoryRejected},
|
|
{"no markers", nil, nil, CategoryNeedsHumanReview},
|
|
}
|
|
for _, c := range cases {
|
|
t.Run(c.name, func(t *testing.T) {
|
|
rec := mkRecord("data/_kb/audits.jsonl")
|
|
rec.SuccessMarkers = c.succ
|
|
rec.FailureMarkers = c.fail
|
|
out := ScoreRecord(rec)
|
|
if out.Category != c.want {
|
|
t.Errorf("want %q, got %q (reasons=%v)", c.want, out.Category, out.Reasons)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestScoreAutoApply(t *testing.T) {
|
|
cases := []struct {
|
|
name string
|
|
succ []string
|
|
fail []string
|
|
want ScoreCategory
|
|
}{
|
|
{"committed → accepted", []string{"committed"}, nil, CategoryAccepted},
|
|
{"reverted_build_red → rejected", nil, []string{"reverted_build_red"}, CategoryRejected},
|
|
{"reverted other → rejected", nil, []string{"reverted_warning_count_up"}, CategoryRejected},
|
|
{"no signal → needs_human", nil, nil, CategoryNeedsHumanReview},
|
|
}
|
|
for _, c := range cases {
|
|
t.Run(c.name, func(t *testing.T) {
|
|
rec := mkRecord("data/_kb/auto_apply.jsonl")
|
|
rec.SuccessMarkers = c.succ
|
|
rec.FailureMarkers = c.fail
|
|
out := ScoreRecord(rec)
|
|
if out.Category != c.want {
|
|
t.Errorf("want %q, got %q", c.want, out.Category)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestScoreOutcomes(t *testing.T) {
|
|
rec := mkRecord("data/_kb/outcomes.jsonl")
|
|
rec.SuccessMarkers = []string{"all_events_ok"}
|
|
if out := ScoreRecord(rec); out.Category != CategoryAccepted {
|
|
t.Errorf("all_events_ok: want accepted, got %q", out.Category)
|
|
}
|
|
|
|
rec2 := mkRecord("data/_kb/outcomes.jsonl")
|
|
rec2.ValidationResults = map[string]any{"gap_signals": float64(2)}
|
|
if out := ScoreRecord(rec2); out.Category != CategoryPartiallyAccepted {
|
|
t.Errorf("gap_signals=2: want partial, got %q (reasons=%v)", out.Category, out.Reasons)
|
|
}
|
|
|
|
rec3 := mkRecord("data/_kb/outcomes.jsonl")
|
|
if out := ScoreRecord(rec3); out.Category != CategoryNeedsHumanReview {
|
|
t.Errorf("no signal: want needs_human, got %q", out.Category)
|
|
}
|
|
}
|
|
|
|
func TestScoreModeExperiment(t *testing.T) {
|
|
rec := mkRecord("data/_kb/mode_experiments.jsonl")
|
|
rec.Text = ""
|
|
if out := ScoreRecord(rec); out.Category != CategoryRejected {
|
|
t.Errorf("empty text: want rejected, got %q", out.Category)
|
|
}
|
|
|
|
rec.Text = "real response"
|
|
rec.LatencyMs = 130_000
|
|
if out := ScoreRecord(rec); out.Category != CategoryPartiallyAccepted {
|
|
t.Errorf("over latency cap: want partial, got %q", out.Category)
|
|
}
|
|
|
|
rec.LatencyMs = 5000
|
|
if out := ScoreRecord(rec); out.Category != CategoryNeedsHumanReview {
|
|
t.Errorf("normal: want needs_human (verdict not yet wired), got %q", out.Category)
|
|
}
|
|
}
|
|
|
|
func TestScoreExtraction_Defaults(t *testing.T) {
|
|
for _, src := range []string{
|
|
"data/_kb/distilled_facts.jsonl",
|
|
"data/_kb/distilled_procedures.jsonl",
|
|
"data/_kb/audit_facts.jsonl",
|
|
"data/_kb/observer_escalations.jsonl",
|
|
} {
|
|
rec := mkRecord(src)
|
|
out := ScoreRecord(rec)
|
|
if out.Category != CategoryNeedsHumanReview {
|
|
t.Errorf("%s: want needs_human_review, got %q", src, out.Category)
|
|
}
|
|
}
|
|
}
|
|
|
|
// ─── Contamination firewall — the safety-critical guarantee ───────
|
|
|
|
func TestValidateSftSample_RejectsContaminationCategories(t *testing.T) {
|
|
for _, contaminated := range []SftQualityScore{
|
|
SftQualityScore("rejected"),
|
|
SftQualityScore("needs_human_review"),
|
|
} {
|
|
s := goodSftSample()
|
|
s.QualityScore = contaminated
|
|
err := ValidateSftSample(s)
|
|
if err == nil {
|
|
t.Errorf("contaminated quality_score=%q should fail validation", contaminated)
|
|
continue
|
|
}
|
|
if !errors.Is(err, ErrSftContamination) {
|
|
t.Errorf("contaminated %q: want errors.Is(err, ErrSftContamination), got %v", contaminated, err)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestValidateSftSample_AcceptsLegalCategories(t *testing.T) {
|
|
for _, legal := range []SftQualityScore{SftQualityAccepted, SftQualityPartiallyAccepted} {
|
|
s := goodSftSample()
|
|
s.QualityScore = legal
|
|
if err := ValidateSftSample(s); err != nil {
|
|
t.Errorf("legal quality_score=%q failed: %v", legal, err)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestValidateSftSample_RejectsTypoCategory(t *testing.T) {
|
|
s := goodSftSample()
|
|
s.QualityScore = "approved" // close to "accepted" but wrong
|
|
err := ValidateSftSample(s)
|
|
if err == nil {
|
|
t.Fatal("typo category should fail validation")
|
|
}
|
|
// Typo is NOT contamination — should be a regular ValidationError,
|
|
// not the firewall sentinel. This distinguishes "you typo'd" from
|
|
// "you broke the spec."
|
|
if errors.Is(err, ErrSftContamination) {
|
|
t.Error("typo should not surface as ErrSftContamination")
|
|
}
|
|
}
|
|
|
|
func TestValidateSftSample_RejectsEmptyPair(t *testing.T) {
|
|
s := goodSftSample()
|
|
s.Instruction = " "
|
|
if err := ValidateSftSample(s); err == nil {
|
|
t.Error("whitespace-only instruction should fail")
|
|
}
|
|
|
|
s2 := goodSftSample()
|
|
s2.Response = ""
|
|
if err := ValidateSftSample(s2); err == nil {
|
|
t.Error("empty response should fail")
|
|
}
|
|
}
|
|
|
|
func TestValidateScoredRun_ReasonsRequired(t *testing.T) {
|
|
r := ScoredRun{
|
|
SchemaVersion: ScoredRunSchemaVersion,
|
|
EvidenceRunID: "x",
|
|
EvidenceTaskID: "y",
|
|
Category: CategoryAccepted,
|
|
Reasons: nil, // empty — must fail
|
|
ScoredAt: "2026-04-29T12:00:00Z",
|
|
ScorerVersion: ScorerVersion,
|
|
Provenance: Provenance{
|
|
SourceFile: "data/_kb/scrum_reviews.jsonl",
|
|
SigHash: "abc",
|
|
RecordedAt: "2026-04-29T12:00:00Z",
|
|
},
|
|
}
|
|
err := ValidateScoredRun(r)
|
|
if err == nil {
|
|
t.Fatal("empty reasons should fail")
|
|
}
|
|
if !strings.Contains(err.Error(), "reasons") {
|
|
t.Errorf("error should mention reasons: %v", err)
|
|
}
|
|
}
|
|
|
|
func TestBuildScoredRun_DeterministicSigHash(t *testing.T) {
|
|
rec := mkRecord("data/_kb/scrum_reviews.jsonl")
|
|
rec.SuccessMarkers = []string{"accepted_on_attempt_1"}
|
|
|
|
r1, err := BuildScoredRun(rec, "data/scored-runs/2026/04/29/x.jsonl", 0, "2026-04-29T12:00:00Z")
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
r2, err := BuildScoredRun(rec, "data/scored-runs/2026/04/29/x.jsonl", 0, "2026-04-29T12:00:00Z")
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if r1.Provenance.SigHash != r2.Provenance.SigHash {
|
|
t.Errorf("identical EvidenceRecord should produce identical sig_hash: %s vs %s",
|
|
r1.Provenance.SigHash, r2.Provenance.SigHash)
|
|
}
|
|
if r1.Category != CategoryAccepted {
|
|
t.Errorf("scored category: %q", r1.Category)
|
|
}
|
|
if r1.ScorerVersion != ScorerVersion {
|
|
t.Errorf("scorer version stamped wrong: %q", r1.ScorerVersion)
|
|
}
|
|
}
|
|
|
|
func TestScoreRecord_PureFunction_NoMutationOfInput(t *testing.T) {
|
|
// Belt-and-braces: the contract says "NO mutable state, identical
|
|
// input → identical output forever." Verify by scoring the same
|
|
// record twice and ensuring the input hasn't been touched.
|
|
rec := mkRecord("data/_kb/scrum_reviews.jsonl")
|
|
rec.SuccessMarkers = []string{"accepted_on_attempt_2"}
|
|
original := rec
|
|
out1 := ScoreRecord(rec)
|
|
out2 := ScoreRecord(rec)
|
|
if rec.RunID != original.RunID || len(rec.SuccessMarkers) != 1 {
|
|
t.Error("ScoreRecord mutated its input")
|
|
}
|
|
if out1.Category != out2.Category {
|
|
t.Error("ScoreRecord is non-deterministic")
|
|
}
|
|
}
|
|
|
|
// ─── Helpers ─────────────────────────────────────────────────────
|
|
|
|
func goodSftSample() SftSample {
|
|
return SftSample{
|
|
SchemaVersion: SftSampleSchemaVersion,
|
|
ID: "sft-1",
|
|
Instruction: "summarize the diff",
|
|
Context: "diff body...",
|
|
Response: "the diff adds a function",
|
|
SourceRunID: "run-1",
|
|
QualityScore: SftQualityAccepted,
|
|
CreatedAt: "2026-04-29T12:00:00Z",
|
|
Provenance: Provenance{
|
|
SourceFile: "data/scored-runs/2026/04/29/x.jsonl",
|
|
SigHash: "deadbeef",
|
|
RecordedAt: "2026-04-29T12:00:01Z",
|
|
},
|
|
}
|
|
}
|
|
|
|
// reasonsContain reports whether any element of reasons contains sub as a
// substring. It returns false for a nil or empty slice.
func reasonsContain(reasons []string, sub string) bool {
	for i := 0; i < len(reasons); i++ {
		if strings.Contains(reasons[i], sub) {
			return true
		}
	}
	return false
}
|