Follow-up to b216b7e (which shipped the SFT export substrate). This commit ports the synthesis logic, completing the migration:

- SynthesizeSft(scored, ev, recordedAt, sftID) → *SftSample
  Mirrors the Rust synthesizeSft byte-for-byte. Returns nil for extraction-class records + empty-text records (same skip semantics as Rust).
- LoadEvidenceByRunID(scoredPath, cache) reads the paired evidence JSONL (path derived by /scored-runs/ → /evidence/ replacement). Per-call cache so multiple scored-runs files in the same dir don't reload the same evidence.
- buildInstruction maps source_file stem → per-class instruction template. All 8 templates (scrum_reviews, mode_experiments, auto_apply, audits, observer_reviews, contract_analyses, outcomes, default) match Rust output exactly so a/b validation between runtimes can diff JSONL byte-for-byte.
- stemFromSourceFile strips data/_kb/ prefix + .jsonl suffix.
- ExportSft now writes data/distilled/sft/sft_export.jsonl with the synthesized samples (DryRun=true skips file write).

Per-class templates verified by 8-case sub-test:

- scrum_reviews → "Review the file '...' against the PRD..."
- mode_experiments → "Run task_class='...' for file..."
- auto_apply → "Auto-apply: emit a 6-line surgical patch..."
- audits with phase: prefix → strips to bare phase name
- observer_reviews → "Observer-review the latest attempt..."
- contract_analyses with permit: prefix → strips to permit ID
- outcomes → "Run scenario; report per-event outcome..."
- unknown source → "Source 'X' run; produce the appropriate output"

Caveat documented inline: contract_analyses uses ev.metadata.contractor in Rust to produce "Analyze contractor 'X' for permit 'Y'" when present. Go's EvidenceRecord doesn't carry a free-form metadata bag yet, so we always emit the no-contractor form. Operators needing contractor-aware instructions can extend EvidenceRecord with an explicit Metadata field (separate ADR).
Test additions (5 new):

- TestSynthesizeSft_PerSourceClass: 8 sub-cases, one per template
- TestSynthesizeSft_RejectsExtraction: extraction-role records skipped
- TestSynthesizeSft_RejectsEmptyText: empty/whitespace text skipped
- TestSynthesizeSft_ContextAssembly: matrix + pathway + model context string formatting matches Rust " · " join
- TestExportSft_FullPort_WritesJSONL: end-to-end fixture, asserts output contains expected instruction + omits firewalled records

Pre-existing TestExportSft_PartialPort_FirewallFires renamed + updated to TestExportSft_FirewallFiresBeforeEvidenceLoad — reflects the new contract that records passing the firewall but lacking evidence land in "not-instructable" rather than being silently exported. Honest semantics shift documented in the test.

OPEN #2 now fully closed (was: substrate-only). The synthesis path no longer requires the Rust pipeline to be invoked — Go-side operators can run the full distillation export end-to-end.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
386 lines
14 KiB
Go
386 lines
14 KiB
Go
package distillation
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"testing"
|
|
)
|
|
|
|
// TestIsSftNever_Firewall locks the contamination firewall set:
|
|
// the predicate fires for "rejected" and "needs_human_review" and
|
|
// no others. Per project_distillation_substrate.md: this is one of
|
|
// the substrate's load-bearing knobs — touching the firewall set
|
|
// requires explicit sign-off.
|
|
func TestIsSftNever_Firewall(t *testing.T) {
|
|
mustBlock := []ScoreCategory{
|
|
CategoryRejected,
|
|
CategoryNeedsHumanReview,
|
|
}
|
|
for _, c := range mustBlock {
|
|
if !IsSftNever(c) {
|
|
t.Errorf("firewall must block %q", c)
|
|
}
|
|
}
|
|
// Anything else should NOT be blocked. Read every category
|
|
// constant in this package and assert non-blocked unless it's
|
|
// in mustBlock.
|
|
allKnown := []ScoreCategory{
|
|
CategoryAccepted,
|
|
CategoryPartiallyAccepted,
|
|
CategoryRejected,
|
|
CategoryNeedsHumanReview,
|
|
}
|
|
for _, c := range allKnown {
|
|
shouldBlock := false
|
|
for _, b := range mustBlock {
|
|
if c == b {
|
|
shouldBlock = true
|
|
break
|
|
}
|
|
}
|
|
if got := IsSftNever(c); got != shouldBlock {
|
|
t.Errorf("IsSftNever(%q) = %v, want %v", c, got, shouldBlock)
|
|
}
|
|
}
|
|
// Unknown category is NOT blocked — that's the safe default
|
|
// (operators bumping ScoreCategory enum should explicitly add
|
|
// to firewall if they want it gated).
|
|
if IsSftNever(ScoreCategory("custom_future_category")) {
|
|
t.Errorf("unknown category must not be blocked by firewall")
|
|
}
|
|
}
|
|
|
|
// TestSftNever_PinsExpectedSet locks the firewall slice contents.
|
|
// If a future commit adds or removes categories from SftNever, this
|
|
// test fails — forcing the change through review.
|
|
func TestSftNever_PinsExpectedSet(t *testing.T) {
|
|
want := map[ScoreCategory]bool{
|
|
CategoryRejected: true,
|
|
CategoryNeedsHumanReview: true,
|
|
}
|
|
if len(SftNever) != len(want) {
|
|
t.Fatalf("SftNever has %d entries, want %d (firewall set changed without review?)",
|
|
len(SftNever), len(want))
|
|
}
|
|
for _, c := range SftNever {
|
|
if !want[c] {
|
|
t.Errorf("SftNever contains %q, which is not in the expected firewall set", c)
|
|
}
|
|
}
|
|
}
|
|
|
|
// TestListScoredRunFiles_Empty: missing root → no files, no error.
|
|
// Matches Rust behavior; operators running ExportSft on a fresh box
|
|
// shouldn't see an error before any scored runs have landed.
|
|
func TestListScoredRunFiles_Empty(t *testing.T) {
|
|
tmp := t.TempDir()
|
|
files, err := ListScoredRunFiles(tmp)
|
|
if err != nil {
|
|
t.Fatalf("ListScoredRunFiles: %v", err)
|
|
}
|
|
if len(files) != 0 {
|
|
t.Errorf("empty root: expected 0 files, got %d", len(files))
|
|
}
|
|
}
|
|
|
|
// TestListScoredRunFiles_WalksYearMonthDay locks the directory walk
|
|
// pattern: data/scored-runs/YYYY/MM/DD/*.jsonl. Subset of full
|
|
// Rust-side test coverage but proves the walk visits the right
|
|
// nesting.
|
|
func TestListScoredRunFiles_WalksYearMonthDay(t *testing.T) {
|
|
tmp := t.TempDir()
|
|
// Create the expected nested structure.
|
|
dirs := []string{
|
|
filepath.Join(tmp, "data", "scored-runs", "2026", "04", "30"),
|
|
filepath.Join(tmp, "data", "scored-runs", "2026", "05", "01"),
|
|
}
|
|
for _, d := range dirs {
|
|
if err := os.MkdirAll(d, 0o755); err != nil {
|
|
t.Fatalf("mkdir: %v", err)
|
|
}
|
|
}
|
|
// Drop a JSONL in each + a non-JSONL we should skip.
|
|
for i, d := range dirs {
|
|
jsonlPath := filepath.Join(d, "run.jsonl")
|
|
if err := os.WriteFile(jsonlPath, []byte("{}\n"), 0o644); err != nil {
|
|
t.Fatalf("write %s: %v", jsonlPath, err)
|
|
}
|
|
// Non-JSONL — must be skipped.
|
|
other := filepath.Join(d, "skip.txt")
|
|
if err := os.WriteFile(other, []byte("ignore me"), 0o644); err != nil {
|
|
t.Fatalf("write %s: %v", other, err)
|
|
}
|
|
_ = i
|
|
}
|
|
files, err := ListScoredRunFiles(tmp)
|
|
if err != nil {
|
|
t.Fatalf("ListScoredRunFiles: %v", err)
|
|
}
|
|
if len(files) != 2 {
|
|
t.Errorf("expected 2 .jsonl files, got %d (%v)", len(files), files)
|
|
}
|
|
// Sort order: 2026-04-30 before 2026-05-01. Critical for audit
|
|
// baselines — the longitudinal signal depends on stable order.
|
|
if len(files) >= 2 {
|
|
if files[0] >= files[1] {
|
|
t.Errorf("files not sorted ascending: %q vs %q", files[0], files[1])
|
|
}
|
|
}
|
|
// Non-JSONL must be skipped.
|
|
for _, f := range files {
|
|
if filepath.Ext(f) != ".jsonl" {
|
|
t.Errorf("listing returned non-.jsonl: %q", f)
|
|
}
|
|
}
|
|
}
|
|
|
|
// TestSynthesizeSft_PerSourceClass locks the per-source-class
|
|
// instruction templates byte-for-byte against the Rust source.
|
|
// If a future commit changes a template, this test fails — the
|
|
// trained-model behavior shifts under our feet.
|
|
func TestSynthesizeSft_PerSourceClass(t *testing.T) {
|
|
cases := []struct {
|
|
name string
|
|
sourceFile string
|
|
taskID string
|
|
sourceFiles []string
|
|
wantPrefix string
|
|
}{
|
|
{
|
|
"scrum_reviews",
|
|
"data/_kb/scrum_reviews.jsonl",
|
|
"any",
|
|
[]string{"src/foo.rs"},
|
|
"Review the file 'src/foo.rs' against",
|
|
},
|
|
{
|
|
"mode_experiments",
|
|
"data/_kb/mode_experiments.jsonl",
|
|
"task_42",
|
|
[]string{"src/bar.go"},
|
|
"Run task_class='task_42' for file 'src/bar.go'.",
|
|
},
|
|
{
|
|
"auto_apply",
|
|
"data/_kb/auto_apply.jsonl",
|
|
"any",
|
|
[]string{"src/baz.ts"},
|
|
"Auto-apply: emit a 6-line surgical patch for 'src/baz.ts'",
|
|
},
|
|
{
|
|
"audits with phase: prefix stripped",
|
|
"data/_kb/audits.jsonl",
|
|
"phase:G2",
|
|
nil,
|
|
"Audit phase 'G2' and report findings",
|
|
},
|
|
{
|
|
"observer_reviews",
|
|
"data/_kb/observer_reviews.jsonl",
|
|
"any",
|
|
[]string{"f.rs"},
|
|
"Observer-review the latest attempt on 'f.rs'.",
|
|
},
|
|
{
|
|
"contract_analyses with permit: prefix",
|
|
"data/_kb/contract_analyses.jsonl",
|
|
"permit:ABC123",
|
|
nil,
|
|
"Analyze permit 'ABC123'. Recommend with risk markers.",
|
|
},
|
|
{
|
|
"outcomes",
|
|
"data/_kb/outcomes.jsonl",
|
|
"any",
|
|
nil,
|
|
"Run scenario; report per-event outcome with citations.",
|
|
},
|
|
{
|
|
"unknown source falls back to default",
|
|
"data/_kb/something_new.jsonl",
|
|
"any",
|
|
nil,
|
|
"Source 'something_new' run; produce the appropriate output",
|
|
},
|
|
}
|
|
for _, c := range cases {
|
|
t.Run(c.name, func(t *testing.T) {
|
|
scored := ScoredRun{
|
|
EvidenceRunID: "rid",
|
|
Category: CategoryAccepted,
|
|
Provenance: Provenance{
|
|
SourceFile: c.sourceFile,
|
|
SigHash: "abc",
|
|
RecordedAt: "2026-04-30T00:00:00Z",
|
|
},
|
|
}
|
|
ev := EvidenceRecord{
|
|
RunID: "rid",
|
|
TaskID: c.taskID,
|
|
ModelRole: RoleExecutor,
|
|
Text: "model response text",
|
|
SourceFiles: c.sourceFiles,
|
|
Provenance: Provenance{SourceFile: c.sourceFile, SigHash: "abc", RecordedAt: "2026-04-30T00:00:00Z"},
|
|
}
|
|
sample := SynthesizeSft(scored, ev, "2026-04-30T00:00:00Z", "test-id")
|
|
if sample == nil {
|
|
t.Fatalf("expected non-nil sample for %s", c.name)
|
|
}
|
|
if !strings.HasPrefix(sample.Instruction, c.wantPrefix) {
|
|
t.Errorf("instruction prefix mismatch:\n got: %q\n want: %q...", sample.Instruction, c.wantPrefix)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestSynthesizeSft_RejectsExtraction: extraction-class records
|
|
// have no instruction→response shape (they're pure data extraction,
|
|
// not model-output-as-training-target). Synthesis must return nil.
|
|
func TestSynthesizeSft_RejectsExtraction(t *testing.T) {
|
|
ev := EvidenceRecord{
|
|
RunID: "rid",
|
|
ModelRole: RoleExtractor,
|
|
Text: "extracted data",
|
|
Provenance: Provenance{SourceFile: "data/_kb/anything.jsonl", SigHash: "abc", RecordedAt: "2026-04-30T00:00:00Z"},
|
|
}
|
|
scored := ScoredRun{EvidenceRunID: "rid", Category: CategoryAccepted, Provenance: ev.Provenance}
|
|
if sample := SynthesizeSft(scored, ev, "2026-04-30T00:00:00Z", "id"); sample != nil {
|
|
t.Errorf("extraction record must produce nil sample, got %+v", sample)
|
|
}
|
|
}
|
|
|
|
// TestSynthesizeSft_RejectsEmptyText: text is the response side of
|
|
// the SFT pair; empty text means nothing to learn.
|
|
func TestSynthesizeSft_RejectsEmptyText(t *testing.T) {
|
|
ev := EvidenceRecord{
|
|
RunID: "rid",
|
|
ModelRole: RoleExecutor,
|
|
Text: " \n\t",
|
|
Provenance: Provenance{SourceFile: "data/_kb/scrum_reviews.jsonl", SigHash: "abc", RecordedAt: "2026-04-30T00:00:00Z"},
|
|
}
|
|
scored := ScoredRun{EvidenceRunID: "rid", Category: CategoryAccepted, Provenance: ev.Provenance}
|
|
if sample := SynthesizeSft(scored, ev, "2026-04-30T00:00:00Z", "id"); sample != nil {
|
|
t.Errorf("empty-text record must produce nil sample, got %+v", sample)
|
|
}
|
|
}
|
|
|
|
// TestSynthesizeSft_ContextAssembly verifies the terse " · "-joined
|
|
// context string carries matrix corpora + pathway fingerprints +
|
|
// model name in the documented order.
|
|
func TestSynthesizeSft_ContextAssembly(t *testing.T) {
|
|
ev := EvidenceRecord{
|
|
RunID: "rid",
|
|
ModelRole: RoleReviewer,
|
|
Text: "verdict",
|
|
ModelName: "qwen3.5",
|
|
RetrievedContext: &RetrievedContext{
|
|
MatrixCorpora: []string{"workers", "candidates"},
|
|
PathwayFingerprintsSeen: 88,
|
|
},
|
|
Provenance: Provenance{SourceFile: "data/_kb/scrum_reviews.jsonl", SigHash: "abc", RecordedAt: "2026-04-30T00:00:00Z"},
|
|
}
|
|
scored := ScoredRun{EvidenceRunID: "rid", Category: CategoryAccepted, Provenance: ev.Provenance}
|
|
sample := SynthesizeSft(scored, ev, "2026-04-30T00:00:00Z", "id")
|
|
if sample == nil {
|
|
t.Fatalf("expected non-nil sample")
|
|
}
|
|
want := "matrix=workers,candidates · pathway_fingerprints=88 · model=qwen3.5"
|
|
if sample.Context != want {
|
|
t.Errorf("context mismatch:\n got: %q\n want: %q", sample.Context, want)
|
|
}
|
|
}
|
|
|
|
// TestExportSft_FullPort_WritesJSONL covers the fully-ported path:
|
|
// scored runs + paired evidence both present, synthesis produces
|
|
// SftSamples, output JSONL is written. Locks the end-to-end
|
|
// contract that next-wave changes (synthesis tweaks, output layout)
|
|
// have to preserve.
|
|
func TestExportSft_FullPort_WritesJSONL(t *testing.T) {
|
|
tmp := t.TempDir()
|
|
scoredDir := filepath.Join(tmp, "data", "scored-runs", "2026", "04", "30")
|
|
evidenceDir := filepath.Join(tmp, "data", "evidence", "2026", "04", "30")
|
|
for _, d := range []string{scoredDir, evidenceDir} {
|
|
if err := os.MkdirAll(d, 0o755); err != nil {
|
|
t.Fatalf("mkdir %s: %v", d, err)
|
|
}
|
|
}
|
|
scoredJSONL := `{"category":"accepted","evidence_run_id":"r1","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h1","recorded_at":"2026-04-30T00:00:00Z"}}
|
|
{"category":"rejected","evidence_run_id":"r2","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h2","recorded_at":"2026-04-30T00:00:00Z"}}
|
|
`
|
|
evidenceJSONL := `{"run_id":"r1","model_role":"executor","text":"some review output","source_files":["src/foo.rs"],"provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h1","recorded_at":"2026-04-30T00:00:00Z"}}
|
|
{"run_id":"r2","model_role":"executor","text":"another output","source_files":["src/bar.rs"],"provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h2","recorded_at":"2026-04-30T00:00:00Z"}}
|
|
`
|
|
if err := os.WriteFile(filepath.Join(scoredDir, "run.jsonl"), []byte(scoredJSONL), 0o644); err != nil {
|
|
t.Fatalf("write scored: %v", err)
|
|
}
|
|
if err := os.WriteFile(filepath.Join(evidenceDir, "run.jsonl"), []byte(evidenceJSONL), 0o644); err != nil {
|
|
t.Fatalf("write evidence: %v", err)
|
|
}
|
|
res, err := ExportSft(ExportSftOptions{
|
|
Root: tmp,
|
|
RecordedAt: "2026-04-30T00:00:00Z",
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("ExportSft: %v", err)
|
|
}
|
|
if res.RecordsRead != 2 || res.RecordsExported != 1 || res.RecordsQuarantined != 1 {
|
|
t.Errorf("counts: read=%d exported=%d quarantined=%d (want 2/1/1)",
|
|
res.RecordsRead, res.RecordsExported, res.RecordsQuarantined)
|
|
}
|
|
out, err := os.ReadFile(res.OutputPath)
|
|
if err != nil {
|
|
t.Fatalf("read output: %v", err)
|
|
}
|
|
if !strings.Contains(string(out), "Review the file 'src/foo.rs'") {
|
|
t.Errorf("output missing expected scrum_reviews instruction; got:\n%s", string(out))
|
|
}
|
|
if strings.Contains(string(out), "src/bar.rs") {
|
|
t.Errorf("output contains rejected record's source_file — firewall leak")
|
|
}
|
|
}
|
|
|
|
// TestExportSft_FirewallFiresBeforeEvidenceLoad locks the order-of-
|
|
// operations: even if evidence records are missing, the
|
|
// firewall counts records as quarantined so the contamination
|
|
// guarantee never depends on side data being present. Records
|
|
// that pass the firewall but lack evidence get the more honest
|
|
// "not instructable" label rather than being silently exported.
|
|
func TestExportSft_FirewallFiresBeforeEvidenceLoad(t *testing.T) {
|
|
tmp := t.TempDir()
|
|
dir := filepath.Join(tmp, "data", "scored-runs", "2026", "04", "30")
|
|
if err := os.MkdirAll(dir, 0o755); err != nil {
|
|
t.Fatalf("mkdir: %v", err)
|
|
}
|
|
jsonl := `{"category":"accepted","evidence_run_id":"r1","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h1","recorded_at":"2026-04-30T00:00:00Z"}}
|
|
{"category":"rejected","evidence_run_id":"r2","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h2","recorded_at":"2026-04-30T00:00:00Z"}}
|
|
{"category":"partially_accepted","evidence_run_id":"r3","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h3","recorded_at":"2026-04-30T00:00:00Z"}}
|
|
{"category":"needs_human_review","evidence_run_id":"r4","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h4","recorded_at":"2026-04-30T00:00:00Z"}}
|
|
`
|
|
if err := os.WriteFile(filepath.Join(dir, "run.jsonl"), []byte(jsonl), 0o644); err != nil {
|
|
t.Fatalf("write: %v", err)
|
|
}
|
|
// No evidence directory created → records that pass the firewall
|
|
// land in "not instructable" since synthesis can't proceed.
|
|
res, err := ExportSft(ExportSftOptions{
|
|
Root: tmp,
|
|
RecordedAt: "2026-04-30T00:00:00Z",
|
|
DryRun: true,
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("ExportSft: %v", err)
|
|
}
|
|
if res.RecordsRead != 4 {
|
|
t.Errorf("RecordsRead: got %d, want 4", res.RecordsRead)
|
|
}
|
|
if res.RecordsQuarantined != 2 {
|
|
t.Errorf("RecordsQuarantined (firewall-blocked): got %d, want 2", res.RecordsQuarantined)
|
|
}
|
|
if res.RecordsExported != 0 {
|
|
t.Errorf("RecordsExported with no evidence: got %d, want 0", res.RecordsExported)
|
|
}
|
|
if !strings.Contains(res.QuarantineSummary, "not-instructable=2") {
|
|
t.Errorf("expected quarantine summary to flag 2 not-instructable, got %q", res.QuarantineSummary)
|
|
}
|
|
}
|