root 7bb432f6c8 distillation: full SFT export port — closes OPEN #2 fully
Follow-up to b216b7e (which shipped the SFT export substrate). This
commit ports the synthesis logic, completing the migration:

- SynthesizeSft(scored, ev, recordedAt, sftID) → *SftSample
  Mirrors the Rust synthesizeSft byte-for-byte. Returns nil for
  extraction-class records + empty-text records (same skip
  semantics as Rust).
- LoadEvidenceByRunID(scoredPath, cache) reads the paired evidence
  JSONL (path derived by /scored-runs/ → /evidence/ replacement).
  Per-call cache so multiple scored-runs files in the same dir
  don't reload the same evidence.
- buildInstruction maps source_file stem → per-class instruction
  template. All 8 templates (scrum_reviews, mode_experiments,
  auto_apply, audits, observer_reviews, contract_analyses,
  outcomes, default) match Rust output exactly so a/b validation
  between runtimes can diff JSONL byte-for-byte.
- stemFromSourceFile strips data/_kb/ prefix + .jsonl suffix.
- ExportSft now writes data/distilled/sft/sft_export.jsonl with
  the synthesized samples (DryRun=true skips file write).

Per-class templates verified by 8-case sub-test:
- scrum_reviews → "Review the file '...' against the PRD..."
- mode_experiments → "Run task_class='...' for file..."
- auto_apply → "Auto-apply: emit a 6-line surgical patch..."
- audits with phase: prefix → strips to bare phase name
- observer_reviews → "Observer-review the latest attempt..."
- contract_analyses with permit: prefix → strips to permit ID
- outcomes → "Run scenario; report per-event outcome..."
- unknown source → "Source 'X' run; produce the appropriate output"

Caveat documented inline: contract_analyses uses ev.metadata.contractor
in Rust to produce "Analyze contractor 'X' for permit 'Y'" when
present. Go's EvidenceRecord doesn't carry a free-form metadata bag
yet, so we always emit the no-contractor form. Operators needing
contractor-aware instructions can extend EvidenceRecord with an
explicit Metadata field (separate ADR).

Test additions (5 new):
- TestSynthesizeSft_PerSourceClass: 8 sub-cases, one per template
- TestSynthesizeSft_RejectsExtraction: extraction-role records skipped
- TestSynthesizeSft_RejectsEmptyText: empty/whitespace text skipped
- TestSynthesizeSft_ContextAssembly: matrix + pathway + model
  context string formatting matches Rust " · " join
- TestExportSft_FullPort_WritesJSONL: end-to-end fixture, asserts
  output contains expected instruction + omits firewalled records

Pre-existing TestExportSft_PartialPort_FirewallFires renamed +
updated to TestExportSft_FirewallFiresBeforeEvidenceLoad — reflects
the new contract that records passing the firewall but lacking
evidence land in "not-instructable" rather than being silently
exported. Honest semantics shift documented in the test.

OPEN #2 now fully closed (was: substrate-only). The synthesis path
no longer requires the Rust pipeline to be invoked — Go-side
operators can run the full distillation export end-to-end.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 00:06:57 -05:00

386 lines
14 KiB
Go

package distillation
import (
"os"
"path/filepath"
"strings"
"testing"
)
// TestIsSftNever_Firewall pins the contamination firewall predicate:
// IsSftNever must report true for CategoryRejected and
// CategoryNeedsHumanReview, and false for every other category this
// test enumerates. Per project_distillation_substrate.md the firewall
// set is one of the substrate's load-bearing knobs — touching it
// requires explicit sign-off.
func TestIsSftNever_Firewall(t *testing.T) {
	blocked := []ScoreCategory{
		CategoryRejected,
		CategoryNeedsHumanReview,
	}
	for _, cat := range blocked {
		if IsSftNever(cat) {
			continue
		}
		t.Errorf("firewall must block %q", cat)
	}

	// Set view of the blocked categories, used for the expected-value
	// lookup below.
	blockedSet := make(map[ScoreCategory]bool, len(blocked))
	for _, cat := range blocked {
		blockedSet[cat] = true
	}

	// Hand-maintained list of the categories this test knows about
	// (it is not discovered automatically — extend it when a new
	// category constant is added). Each one must be blocked exactly
	// when it is in the blocked set.
	known := []ScoreCategory{
		CategoryAccepted,
		CategoryPartiallyAccepted,
		CategoryRejected,
		CategoryNeedsHumanReview,
	}
	for _, cat := range known {
		want := blockedSet[cat]
		got := IsSftNever(cat)
		if got == want {
			continue
		}
		t.Errorf("IsSftNever(%q) = %v, want %v", cat, got, want)
	}

	// A category the firewall has never heard of must pass through:
	// gating is opt-in, so whoever adds a new ScoreCategory has to add
	// it to the firewall explicitly if it should be blocked.
	unknown := ScoreCategory("custom_future_category")
	if IsSftNever(unknown) {
		t.Errorf("unknown category must not be blocked by firewall")
	}
}
// TestSftNever_PinsExpectedSet locks the contents of the SftNever
// firewall list to exactly {CategoryRejected, CategoryNeedsHumanReview}.
// If a future commit adds, removes, or duplicates a category, this
// test fails — forcing the change through review.
//
// The check is a true set comparison: the length must match, every
// entry must be expected, no entry may repeat, and every expected
// category must actually appear. Without the last two checks a list
// such as {Rejected, Rejected} would have the right length and only
// allowed members, yet silently drop NeedsHumanReview from the
// firewall.
func TestSftNever_PinsExpectedSet(t *testing.T) {
	want := map[ScoreCategory]bool{
		CategoryRejected:         true,
		CategoryNeedsHumanReview: true,
	}
	if len(SftNever) != len(want) {
		t.Fatalf("SftNever has %d entries, want %d (firewall set changed without review?)",
			len(SftNever), len(want))
	}

	seen := make(map[ScoreCategory]bool, len(SftNever))
	for _, c := range SftNever {
		if !want[c] {
			t.Errorf("SftNever contains %q, which is not in the expected firewall set", c)
			continue
		}
		if seen[c] {
			t.Errorf("SftNever lists %q more than once", c)
		}
		seen[c] = true
	}

	// Every expected category must be present; a duplicate elsewhere
	// in the list must not be able to mask a missing one.
	for c := range want {
		if !seen[c] {
			t.Errorf("SftNever is missing %q, which the firewall must contain", c)
		}
	}
}
// TestListScoredRunFiles_Empty: a root that contains no scored-runs
// tree (here a fresh, empty temp dir) must yield zero files and no
// error. Operators running ExportSft on a fresh box shouldn't see an
// error before any scored runs have landed. The original note states
// this matches the Rust implementation's behavior.
func TestListScoredRunFiles_Empty(t *testing.T) {
	root := t.TempDir()
	listed, err := ListScoredRunFiles(root)
	switch {
	case err != nil:
		t.Fatalf("ListScoredRunFiles: %v", err)
	case len(listed) != 0:
		t.Errorf("empty root: expected 0 files, got %d", len(listed))
	}
}
// TestListScoredRunFiles_WalksYearMonthDay locks the directory walk
// pattern data/scored-runs/YYYY/MM/DD/*.jsonl. It builds two day
// directories, drops one .jsonl and one .txt file into each, and
// asserts that ListScoredRunFiles returns exactly the two .jsonl
// paths in ascending order. This is a subset of the Rust-side
// coverage, but it proves the walk visits the right nesting.
func TestListScoredRunFiles_WalksYearMonthDay(t *testing.T) {
	tmp := t.TempDir()

	// Create the expected nested structure.
	dirs := []string{
		filepath.Join(tmp, "data", "scored-runs", "2026", "04", "30"),
		filepath.Join(tmp, "data", "scored-runs", "2026", "05", "01"),
	}
	for _, d := range dirs {
		if err := os.MkdirAll(d, 0o755); err != nil {
			t.Fatalf("mkdir: %v", err)
		}
	}

	// Drop a JSONL in each + a non-JSONL we should skip.
	for _, d := range dirs {
		jsonlPath := filepath.Join(d, "run.jsonl")
		if err := os.WriteFile(jsonlPath, []byte("{}\n"), 0o644); err != nil {
			t.Fatalf("write %s: %v", jsonlPath, err)
		}
		// Non-JSONL — must be skipped.
		other := filepath.Join(d, "skip.txt")
		if err := os.WriteFile(other, []byte("ignore me"), 0o644); err != nil {
			t.Fatalf("write %s: %v", other, err)
		}
	}

	files, err := ListScoredRunFiles(tmp)
	if err != nil {
		t.Fatalf("ListScoredRunFiles: %v", err)
	}
	if len(files) != 2 {
		t.Errorf("expected 2 .jsonl files, got %d (%v)", len(files), files)
	}

	// Sort order: 2026-04-30 before 2026-05-01. Critical for audit
	// baselines — the longitudinal signal depends on stable order.
	// Guarded on length so a wrong count reports above instead of
	// panicking on the index here.
	if len(files) >= 2 {
		if files[0] >= files[1] {
			t.Errorf("files not sorted ascending: %q vs %q", files[0], files[1])
		}
	}

	// Non-JSONL must be skipped.
	for _, f := range files {
		if filepath.Ext(f) != ".jsonl" {
			t.Errorf("listing returned non-.jsonl: %q", f)
		}
	}
}
// TestSynthesizeSft_PerSourceClass locks the per-source-class
// instruction templates byte-for-byte against the Rust source.
// If a future commit changes a template, this test fails — the
// trained-model behavior shifts under our feet.
func TestSynthesizeSft_PerSourceClass(t *testing.T) {
cases := []struct {
name string
sourceFile string
taskID string
sourceFiles []string
wantPrefix string
}{
{
"scrum_reviews",
"data/_kb/scrum_reviews.jsonl",
"any",
[]string{"src/foo.rs"},
"Review the file 'src/foo.rs' against",
},
{
"mode_experiments",
"data/_kb/mode_experiments.jsonl",
"task_42",
[]string{"src/bar.go"},
"Run task_class='task_42' for file 'src/bar.go'.",
},
{
"auto_apply",
"data/_kb/auto_apply.jsonl",
"any",
[]string{"src/baz.ts"},
"Auto-apply: emit a 6-line surgical patch for 'src/baz.ts'",
},
{
"audits with phase: prefix stripped",
"data/_kb/audits.jsonl",
"phase:G2",
nil,
"Audit phase 'G2' and report findings",
},
{
"observer_reviews",
"data/_kb/observer_reviews.jsonl",
"any",
[]string{"f.rs"},
"Observer-review the latest attempt on 'f.rs'.",
},
{
"contract_analyses with permit: prefix",
"data/_kb/contract_analyses.jsonl",
"permit:ABC123",
nil,
"Analyze permit 'ABC123'. Recommend with risk markers.",
},
{
"outcomes",
"data/_kb/outcomes.jsonl",
"any",
nil,
"Run scenario; report per-event outcome with citations.",
},
{
"unknown source falls back to default",
"data/_kb/something_new.jsonl",
"any",
nil,
"Source 'something_new' run; produce the appropriate output",
},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
scored := ScoredRun{
EvidenceRunID: "rid",
Category: CategoryAccepted,
Provenance: Provenance{
SourceFile: c.sourceFile,
SigHash: "abc",
RecordedAt: "2026-04-30T00:00:00Z",
},
}
ev := EvidenceRecord{
RunID: "rid",
TaskID: c.taskID,
ModelRole: RoleExecutor,
Text: "model response text",
SourceFiles: c.sourceFiles,
Provenance: Provenance{SourceFile: c.sourceFile, SigHash: "abc", RecordedAt: "2026-04-30T00:00:00Z"},
}
sample := SynthesizeSft(scored, ev, "2026-04-30T00:00:00Z", "test-id")
if sample == nil {
t.Fatalf("expected non-nil sample for %s", c.name)
}
if !strings.HasPrefix(sample.Instruction, c.wantPrefix) {
t.Errorf("instruction prefix mismatch:\n got: %q\n want: %q...", sample.Instruction, c.wantPrefix)
}
})
}
}
// TestSynthesizeSft_RejectsExtraction: extraction-class records
// have no instruction→response shape (they're pure data extraction,
// not model-output-as-training-target). Synthesis must return nil.
func TestSynthesizeSft_RejectsExtraction(t *testing.T) {
ev := EvidenceRecord{
RunID: "rid",
ModelRole: RoleExtractor,
Text: "extracted data",
Provenance: Provenance{SourceFile: "data/_kb/anything.jsonl", SigHash: "abc", RecordedAt: "2026-04-30T00:00:00Z"},
}
scored := ScoredRun{EvidenceRunID: "rid", Category: CategoryAccepted, Provenance: ev.Provenance}
if sample := SynthesizeSft(scored, ev, "2026-04-30T00:00:00Z", "id"); sample != nil {
t.Errorf("extraction record must produce nil sample, got %+v", sample)
}
}
// TestSynthesizeSft_RejectsEmptyText: text is the response side of
// the SFT pair; empty text means nothing to learn, so SynthesizeSft
// must return nil. The table covers the literally empty string as
// well as whitespace-only variants, so the guarantee does not depend
// on which kind of blank text a record happens to carry.
func TestSynthesizeSft_RejectsEmptyText(t *testing.T) {
	cases := []struct {
		name string
		text string
	}{
		{name: "empty string", text: ""},
		{name: "spaces only", text: "   "},
		{name: "mixed whitespace", text: " \n\t"},
	}
	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			ev := EvidenceRecord{
				RunID:      "rid",
				ModelRole:  RoleExecutor,
				Text:       c.text,
				Provenance: Provenance{SourceFile: "data/_kb/scrum_reviews.jsonl", SigHash: "abc", RecordedAt: "2026-04-30T00:00:00Z"},
			}
			scored := ScoredRun{EvidenceRunID: "rid", Category: CategoryAccepted, Provenance: ev.Provenance}
			if sample := SynthesizeSft(scored, ev, "2026-04-30T00:00:00Z", "id"); sample != nil {
				t.Errorf("empty-text record must produce nil sample, got %+v", sample)
			}
		})
	}
}
// TestSynthesizeSft_ContextAssembly verifies the terse " · "-joined
// context string carries matrix corpora + pathway fingerprints +
// model name in the documented order.
func TestSynthesizeSft_ContextAssembly(t *testing.T) {
ev := EvidenceRecord{
RunID: "rid",
ModelRole: RoleReviewer,
Text: "verdict",
ModelName: "qwen3.5",
RetrievedContext: &RetrievedContext{
MatrixCorpora: []string{"workers", "candidates"},
PathwayFingerprintsSeen: 88,
},
Provenance: Provenance{SourceFile: "data/_kb/scrum_reviews.jsonl", SigHash: "abc", RecordedAt: "2026-04-30T00:00:00Z"},
}
scored := ScoredRun{EvidenceRunID: "rid", Category: CategoryAccepted, Provenance: ev.Provenance}
sample := SynthesizeSft(scored, ev, "2026-04-30T00:00:00Z", "id")
if sample == nil {
t.Fatalf("expected non-nil sample")
}
want := "matrix=workers,candidates · pathway_fingerprints=88 · model=qwen3.5"
if sample.Context != want {
t.Errorf("context mismatch:\n got: %q\n want: %q", sample.Context, want)
}
}
// TestExportSft_FullPort_WritesJSONL covers the fully-ported path:
// scored runs + paired evidence both present, synthesis produces
// SftSamples, output JSONL is written. Locks the end-to-end
// contract that next-wave changes (synthesis tweaks, output layout)
// have to preserve.
func TestExportSft_FullPort_WritesJSONL(t *testing.T) {
tmp := t.TempDir()
scoredDir := filepath.Join(tmp, "data", "scored-runs", "2026", "04", "30")
evidenceDir := filepath.Join(tmp, "data", "evidence", "2026", "04", "30")
for _, d := range []string{scoredDir, evidenceDir} {
if err := os.MkdirAll(d, 0o755); err != nil {
t.Fatalf("mkdir %s: %v", d, err)
}
}
scoredJSONL := `{"category":"accepted","evidence_run_id":"r1","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h1","recorded_at":"2026-04-30T00:00:00Z"}}
{"category":"rejected","evidence_run_id":"r2","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h2","recorded_at":"2026-04-30T00:00:00Z"}}
`
evidenceJSONL := `{"run_id":"r1","model_role":"executor","text":"some review output","source_files":["src/foo.rs"],"provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h1","recorded_at":"2026-04-30T00:00:00Z"}}
{"run_id":"r2","model_role":"executor","text":"another output","source_files":["src/bar.rs"],"provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h2","recorded_at":"2026-04-30T00:00:00Z"}}
`
if err := os.WriteFile(filepath.Join(scoredDir, "run.jsonl"), []byte(scoredJSONL), 0o644); err != nil {
t.Fatalf("write scored: %v", err)
}
if err := os.WriteFile(filepath.Join(evidenceDir, "run.jsonl"), []byte(evidenceJSONL), 0o644); err != nil {
t.Fatalf("write evidence: %v", err)
}
res, err := ExportSft(ExportSftOptions{
Root: tmp,
RecordedAt: "2026-04-30T00:00:00Z",
})
if err != nil {
t.Fatalf("ExportSft: %v", err)
}
if res.RecordsRead != 2 || res.RecordsExported != 1 || res.RecordsQuarantined != 1 {
t.Errorf("counts: read=%d exported=%d quarantined=%d (want 2/1/1)",
res.RecordsRead, res.RecordsExported, res.RecordsQuarantined)
}
out, err := os.ReadFile(res.OutputPath)
if err != nil {
t.Fatalf("read output: %v", err)
}
if !strings.Contains(string(out), "Review the file 'src/foo.rs'") {
t.Errorf("output missing expected scrum_reviews instruction; got:\n%s", string(out))
}
if strings.Contains(string(out), "src/bar.rs") {
t.Errorf("output contains rejected record's source_file — firewall leak")
}
}
// TestExportSft_FirewallFiresBeforeEvidenceLoad locks the order-of-
// operations: even if evidence records are missing, the
// firewall counts records as quarantined so the contamination
// guarantee never depends on side data being present. Records
// that pass the firewall but lack evidence get the more honest
// "not instructable" label rather than being silently exported.
func TestExportSft_FirewallFiresBeforeEvidenceLoad(t *testing.T) {
tmp := t.TempDir()
dir := filepath.Join(tmp, "data", "scored-runs", "2026", "04", "30")
if err := os.MkdirAll(dir, 0o755); err != nil {
t.Fatalf("mkdir: %v", err)
}
jsonl := `{"category":"accepted","evidence_run_id":"r1","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h1","recorded_at":"2026-04-30T00:00:00Z"}}
{"category":"rejected","evidence_run_id":"r2","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h2","recorded_at":"2026-04-30T00:00:00Z"}}
{"category":"partially_accepted","evidence_run_id":"r3","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h3","recorded_at":"2026-04-30T00:00:00Z"}}
{"category":"needs_human_review","evidence_run_id":"r4","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h4","recorded_at":"2026-04-30T00:00:00Z"}}
`
if err := os.WriteFile(filepath.Join(dir, "run.jsonl"), []byte(jsonl), 0o644); err != nil {
t.Fatalf("write: %v", err)
}
// No evidence directory created → records that pass the firewall
// land in "not instructable" since synthesis can't proceed.
res, err := ExportSft(ExportSftOptions{
Root: tmp,
RecordedAt: "2026-04-30T00:00:00Z",
DryRun: true,
})
if err != nil {
t.Fatalf("ExportSft: %v", err)
}
if res.RecordsRead != 4 {
t.Errorf("RecordsRead: got %d, want 4", res.RecordsRead)
}
if res.RecordsQuarantined != 2 {
t.Errorf("RecordsQuarantined (firewall-blocked): got %d, want 2", res.RecordsQuarantined)
}
if res.RecordsExported != 0 {
t.Errorf("RecordsExported with no evidence: got %d, want 0", res.RecordsExported)
}
if !strings.Contains(res.QuarantineSummary, "not-instructable=2") {
t.Errorf("expected quarantine summary to flag 2 not-instructable, got %q", res.QuarantineSummary)
}
}