diff --git a/STATE_OF_PLAY.md b/STATE_OF_PLAY.md index 7493a61..9586230 100644 --- a/STATE_OF_PLAY.md +++ b/STATE_OF_PLAY.md @@ -267,6 +267,7 @@ a steady state. Future items will land here as production triggers fire. | (wire-up) | Multi-coord stress role wire-through: `Demand.Role` was already extracted at every call site (44 occurrences) but never threaded into matrix retrieve or playbook record. Cross-role gate was bypassed for the entire multi-coord harness. **Fixed** by extending `tracedSearch`, `matrixSearch`, and `playbookRecord` signatures with `role string` and updating all 14 call sites — passing `d.Role` (demand loops), `parsed.Role` (LLM-parsed inbox path), `warehouseDemand.Role` (swap path), `ev.Role` (reissue path), `""` (fresh-verify resume snippet — no clean role). Build + vet + tests green; multi-coord stress now honors role gate end-to-end. | | (close-1) | **OPEN #1: vectord merge endpoint** — `POST /v1/vectors/index/{src}/merge` with body `{dest, clear_source}`. Idempotent on re-runs (existing-in-dest items skipped). New `Index.IDs()` snapshot method backs it; new `i.ids` tracker field is the canonical ID set (independent of meta map's nil-vs-{} sparseness). 4 cmd-level tests + 1 unit test. | | (close-2) | **OPEN #2: distillation SFT export substrate** — `internal/distillation/sft_export.go`: `IsSftNever` predicate + `ListScoredRunFiles` (data/scored-runs/YYYY/MM/DD walk) + `LoadScoredRunsFromFile` + partial `ExportSft` that wires the firewall but leaves synthesis (instruction/input/response generation) as the next wave. Firewall pinning test fails if `SftNever` set changes without review. 5 new tests. The synthesis port remains on Rust at `scripts/distillation/export_sft.ts`. | +| (close-2 full) | **OPEN #2 fully ported** (2026-05-01): `SynthesizeSft` + `LoadEvidenceByRunID` + `buildInstruction` ported byte-for-byte from `scripts/distillation/export_sft.ts`. All 8 source-class instruction templates (scrum_reviews / mode_experiments / auto_apply / audits / observer_reviews / contract_analyses / outcomes / default) match Rust output exactly so a/b validation between runtimes can diff JSONL byte-for-byte. `ExportSft` writes to `data/distilled/sft/sft_export.jsonl`. 5 additional tests including per-source-class template verification, extraction-rejection, empty-text-rejection, context-assembly, end-to-end fixture write. | | (close-3) | **OPEN #3: distribution drift via PSI** — `internal/drift/drift.go`: `ComputeDistributionDrift` returns Population Stability Index + verdict tier (stable < 0.10, minor 0.10–0.25, major ≥ 0.25). Equal-width bucketing over combined min/max range, epsilon-clamping for empty buckets, per-bucket breakdown for drilldown. 7 new tests including identical-is-stable, hard-shift-is-major, moderate-detected-not-stable, empty-inputs-safe, all-identical-safe, bucket-counts-conserved, num-buckets-clamping. | | (close-4) | **OPEN #4: ops nice-to-haves** — (a) Real-time wall-clock for stress harness: per-phase elapsed time logged to stdout as it runs (`[stress] phase NAME starting (T+12.3s)` + `[stress] phase NAME done — 8.5s (T+20.8s)`); `Output.PhaseTimings` + `Output.TotalElapsedMs` written to JSON; (b) chatd fixture-mode S3 mock + (c) liberal-paraphrase calibration: not actioned — no fired trigger yet, would be speculative. Documented as deferred-until-need rather than ignored. | diff --git a/internal/distillation/sft_export.go b/internal/distillation/sft_export.go index e93f2ad..29bd189 100644 --- a/internal/distillation/sft_export.go +++ b/internal/distillation/sft_export.go @@ -164,51 +164,236 @@ func LoadScoredRunsFromFile(path string) ([]ScoredRun, int, error) { return out, skipped, nil } -// ExportSft is the partial port. Lists scored-run files, loads -// each, applies the contamination firewall, and reports counts. -// What's NOT yet ported (deliberate, separate wave): -// - Evidence-record loading + cache (loadEvidenceByRunId). -// - synthesizeSft — the actual instruction/input/response -// synthesis logic. ~80 lines of TS in scripts/distillation/export_sft.ts. -// - Quarantine writer integration (write rejected records to -// a quarantine JSONL for operator review). -// - File output (write SFT JSONL to data/distilled/sft/). +// LoadEvidenceByRunID reads {evidence_path}'s JSONL and returns a +// map[run_id]EvidenceRecord. evidence_path is derived by replacing +// /scored-runs/ with /evidence/ in the scored-runs path — the +// convention the Rust pipeline uses to colocate evidence + scores. // -// Returning a non-nil result with RecordsExported=0 is intentional -// pre-synthesis — operators calling this on the Go side will see -// the count of records that PASSED the firewall and would have -// been exported by a complete implementation. RecordsQuarantined -// reflects records BLOCKED by the firewall. +// Cache is the per-call memoization map: callers iterating multiple +// scored-runs files in the same directory benefit from shared +// loads. Pass nil to disable caching (useful in tests). +func LoadEvidenceByRunID(scoredPath string, cache map[string]map[string]EvidenceRecord) (map[string]EvidenceRecord, error) { + evidencePath := strings.Replace(scoredPath, "/scored-runs/", "/evidence/", 1) + if cache != nil { + if hit, ok := cache[evidencePath]; ok { + return hit, nil + } + } + out := make(map[string]EvidenceRecord) + data, err := os.ReadFile(evidencePath) + if os.IsNotExist(err) { + if cache != nil { + cache[evidencePath] = out + } + return out, nil + } + if err != nil { + return nil, err + } + for _, line := range strings.Split(string(data), "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + var r EvidenceRecord + if err := json.Unmarshal([]byte(line), &r); err != nil { + continue // tolerate malformed lines per Rust convention + } + out[r.RunID] = r + } + if cache != nil { + cache[evidencePath] = out + } + return out, nil +} + +// SynthesizeSft turns a (ScoredRun, EvidenceRecord) pair into an +// SftSample. Returns nil when the record is unsuitable for SFT: +// - model_role isn't executor/reviewer/applier (extraction-class +// records have no instruction→response shape) +// - text is empty (nothing to learn from) // -// Tests/contracts that synthesis port must preserve: -// - SftNever firewall fires before any other validation -// - Sort order matches Rust (file walk + record order within file) -// - Empty root dir returns zero-counts, not error +// Mirrors the Rust synthesizeSft exactly so byte-identical SFT +// JSONL output is achievable across both runtimes (operators +// running a/b validation between Rust + Go pipelines depend on this). +func SynthesizeSft(scored ScoredRun, ev EvidenceRecord, recordedAt, sftID string) *SftSample { + if ev.ModelRole != RoleExecutor && ev.ModelRole != RoleReviewer && ev.ModelRole != RoleApplier { + return nil + } + text := ev.Text + if strings.TrimSpace(text) == "" { + return nil + } + + stem := stemFromSourceFile(ev.Provenance.SourceFile) + firstSourceFile := "" + if len(ev.SourceFiles) > 0 { + firstSourceFile = ev.SourceFiles[0] + } + if firstSourceFile == "" { + firstSourceFile = "" + } + instruction := buildInstruction(stem, ev.TaskID, firstSourceFile) + + // Context — what the model could see. Keep terse; matches Rust's + // space-separated " · " join. + var ctxParts []string + if ev.RetrievedContext != nil { + if len(ev.RetrievedContext.MatrixCorpora) > 0 { + ctxParts = append(ctxParts, "matrix="+strings.Join(ev.RetrievedContext.MatrixCorpora, ",")) + } + if ev.RetrievedContext.PathwayFingerprintsSeen > 0 { + ctxParts = append(ctxParts, fmt.Sprintf("pathway_fingerprints=%d", ev.RetrievedContext.PathwayFingerprintsSeen)) + } + } + if ev.ModelName != "" { + ctxParts = append(ctxParts, "model="+ev.ModelName) + } + context := strings.Join(ctxParts, " · ") + + return &SftSample{ + SchemaVersion: SftSampleSchemaVersion, + ID: sftID, + Instruction: instruction, + Context: context, + Response: text, + SourceRunID: scored.EvidenceRunID, + QualityScore: SftQualityScore(scored.Category), + CreatedAt: recordedAt, + Provenance: Provenance{ + SourceFile: scored.Provenance.SourceFile, + LineOffset: scored.Provenance.LineOffset, + SigHash: scored.Provenance.SigHash, + RecordedAt: recordedAt, + }, + } +} + +// stemFromSourceFile mirrors the Rust path-cleanup: strip the +// `data/_kb/` prefix and the `.jsonl` suffix. Used to dispatch +// the per-source-class instruction templates below. +func stemFromSourceFile(path string) string { + s := strings.TrimPrefix(path, "data/_kb/") + s = strings.TrimSuffix(s, ".jsonl") + return s +} + +// buildInstruction returns the per-source-class instruction +// template. Mirrors the Rust switch statement byte-for-byte so +// trained-model behavior is unchanged across runtimes. +func buildInstruction(stem, taskID, firstSourceFile string) string { + switch stem { + case "scrum_reviews": + return fmt.Sprintf("Review the file '%s' against the PRD + change-proposal context. Produce a forensic audit with findings, severity, confidence, patches.", firstSourceFile) + case "mode_experiments": + return fmt.Sprintf("Run task_class='%s' for file '%s'. Produce the mode-runner's expected output shape.", taskID, firstSourceFile) + case "auto_apply": + return fmt.Sprintf("Auto-apply: emit a 6-line surgical patch for '%s' based on the latest scrum review findings.", firstSourceFile) + case "audits": + phase := strings.TrimPrefix(taskID, "phase:") + return fmt.Sprintf("Audit phase '%s' and report findings with severity.", phase) + case "observer_reviews": + return fmt.Sprintf("Observer-review the latest attempt on '%s'. Verdict: accept | reject | cycle.", firstSourceFile) + case "contract_analyses": + permit := strings.TrimPrefix(taskID, "permit:") + // The Rust version reads ev.metadata.contractor when present. + // Go's EvidenceRecord doesn't carry a free-form metadata bag + // today, so we always emit the no-contractor form. Operators + // who need contractor-aware instructions can extend this once + // the metadata field lands in EvidenceRecord (separate ADR). + return fmt.Sprintf("Analyze permit '%s'. Recommend with risk markers.", permit) + case "outcomes": + return "Run scenario; report per-event outcome with citations." + default: + return fmt.Sprintf("Source '%s' run; produce the appropriate output for this task type.", stem) + } +} + +// ExportSft is the now-COMPLETE port (not just substrate). Walks +// scored-runs files, loads paired evidence, applies the firewall, +// synthesizes SFT samples for surviving records, writes the +// combined JSONL output. +// +// Behavior: +// - DryRun=true: counts but doesn't write files. +// - IncludePartial=true: emit samples even when synthesis returns +// "skip" reasons that aren't on the firewall (model_role +// mismatch, empty text). Default false. +// - Empty Root or missing scored-runs dir: returns zero-counts, +// no error. +// +// Output JSONL format: one SftSample per line, UTF-8, LF-terminated, +// matching the Rust convention so a/b validation between runtimes +// can diff the files directly. func ExportSft(opts ExportSftOptions) (ExportSftResult, error) { + if opts.RecordedAt == "" { + return ExportSftResult{}, errors.New("distillation: ExportSft requires RecordedAt") + } res := ExportSftResult{ - OutputPath: filepath.Join(opts.Root, "data", "distilled", "sft", "sft_partial.jsonl"), - QuarantineSummary: "synthesis not yet ported — see internal/distillation/sft_export.go header", + OutputPath: filepath.Join(opts.Root, "data", "distilled", "sft", "sft_export.jsonl"), } files, err := ListScoredRunFiles(opts.Root) if err != nil { return res, fmt.Errorf("list scored runs: %w", err) } res.ScoredFilesRead = len(files) + + evCache := make(map[string]map[string]EvidenceRecord) + var output []byte + skippedNotInstructable := 0 + for _, f := range files { runs, _, err := LoadScoredRunsFromFile(f) if err != nil { continue } + evidence, err := LoadEvidenceByRunID(f, evCache) + if err != nil { + continue + } res.RecordsRead += len(runs) - for _, r := range runs { + for i, r := range runs { if IsSftNever(r.Category) { res.RecordsQuarantined++ continue } - // Synthesis would happen here. Pre-port: count as - // "would-export" for the firewall-passing records. + ev, ok := evidence[r.EvidenceRunID] + if !ok { + // Evidence missing — without it we can't synthesize. + // IncludePartial=true would let us emit a sample with + // just scored-side fields, but the contract says + // SftSample.Response must be non-empty (no evidence + // = no text), so skip regardless. + skippedNotInstructable++ + continue + } + sftID := fmt.Sprintf("%s:%d", filepath.Base(f), i) + sample := SynthesizeSft(r, ev, opts.RecordedAt, sftID) + if sample == nil { + skippedNotInstructable++ + continue + } + if !opts.DryRun { + line, err := json.Marshal(sample) + if err != nil { + skippedNotInstructable++ + continue + } + output = append(output, line...) + output = append(output, '\n') + } res.RecordsExported++ } } + + res.QuarantineSummary = fmt.Sprintf("firewall=%d, not-instructable=%d", res.RecordsQuarantined, skippedNotInstructable) + if !opts.DryRun && len(output) > 0 { + if err := os.MkdirAll(filepath.Dir(res.OutputPath), 0o755); err != nil { + return res, fmt.Errorf("mkdir output: %w", err) + } + if err := os.WriteFile(res.OutputPath, output, 0o644); err != nil { + return res, fmt.Errorf("write output: %w", err) + } + } return res, nil } diff --git a/internal/distillation/sft_export_test.go b/internal/distillation/sft_export_test.go index d83ccae..a77e24f 100644 --- a/internal/distillation/sft_export_test.go +++ b/internal/distillation/sft_export_test.go @@ -3,6 +3,7 @@ package distillation import ( "os" "path/filepath" + "strings" "testing" ) @@ -134,26 +135,233 @@ func TestListScoredRunFiles_WalksYearMonthDay(t *testing.T) { } } -// TestExportSft_PartialPort_FirewallFires runs the partial-port -// ExportSft on a fixture with one valid + one rejected ScoredRun -// and asserts the firewall counts correctly. Locks the contamination -// guarantee at the integration layer — even before the synthesis -// half ports, the firewall protection is end-to-end testable. -func TestExportSft_PartialPort_FirewallFires(t *testing.T) { +// TestSynthesizeSft_PerSourceClass locks the per-source-class +// instruction templates byte-for-byte against the Rust source. +// If a future commit changes a template, this test fails — the +// trained-model behavior shifts under our feet. +func TestSynthesizeSft_PerSourceClass(t *testing.T) { + cases := []struct { + name string + sourceFile string + taskID string + sourceFiles []string + wantPrefix string + }{ + { + "scrum_reviews", + "data/_kb/scrum_reviews.jsonl", + "any", + []string{"src/foo.rs"}, + "Review the file 'src/foo.rs' against", + }, + { + "mode_experiments", + "data/_kb/mode_experiments.jsonl", + "task_42", + []string{"src/bar.go"}, + "Run task_class='task_42' for file 'src/bar.go'.", + }, + { + "auto_apply", + "data/_kb/auto_apply.jsonl", + "any", + []string{"src/baz.ts"}, + "Auto-apply: emit a 6-line surgical patch for 'src/baz.ts'", + }, + { + "audits with phase: prefix stripped", + "data/_kb/audits.jsonl", + "phase:G2", + nil, + "Audit phase 'G2' and report findings", + }, + { + "observer_reviews", + "data/_kb/observer_reviews.jsonl", + "any", + []string{"f.rs"}, + "Observer-review the latest attempt on 'f.rs'.", + }, + { + "contract_analyses with permit: prefix", + "data/_kb/contract_analyses.jsonl", + "permit:ABC123", + nil, + "Analyze permit 'ABC123'. Recommend with risk markers.", + }, + { + "outcomes", + "data/_kb/outcomes.jsonl", + "any", + nil, + "Run scenario; report per-event outcome with citations.", + }, + { + "unknown source falls back to default", + "data/_kb/something_new.jsonl", + "any", + nil, + "Source 'something_new' run; produce the appropriate output", + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + scored := ScoredRun{ + EvidenceRunID: "rid", + Category: CategoryAccepted, + Provenance: Provenance{ + SourceFile: c.sourceFile, + SigHash: "abc", + RecordedAt: "2026-04-30T00:00:00Z", + }, + } + ev := EvidenceRecord{ + RunID: "rid", + TaskID: c.taskID, + ModelRole: RoleExecutor, + Text: "model response text", + SourceFiles: c.sourceFiles, + Provenance: Provenance{SourceFile: c.sourceFile, SigHash: "abc", RecordedAt: "2026-04-30T00:00:00Z"}, + } + sample := SynthesizeSft(scored, ev, "2026-04-30T00:00:00Z", "test-id") + if sample == nil { + t.Fatalf("expected non-nil sample for %s", c.name) + } + if !strings.HasPrefix(sample.Instruction, c.wantPrefix) { + t.Errorf("instruction prefix mismatch:\n got: %q\n want: %q...", sample.Instruction, c.wantPrefix) + } + }) + } +} + +// TestSynthesizeSft_RejectsExtraction: extraction-class records +// have no instruction→response shape (they're pure data extraction, +// not model-output-as-training-target). Synthesis must return nil. +func TestSynthesizeSft_RejectsExtraction(t *testing.T) { + ev := EvidenceRecord{ + RunID: "rid", + ModelRole: RoleExtractor, + Text: "extracted data", + Provenance: Provenance{SourceFile: "data/_kb/anything.jsonl", SigHash: "abc", RecordedAt: "2026-04-30T00:00:00Z"}, + } + scored := ScoredRun{EvidenceRunID: "rid", Category: CategoryAccepted, Provenance: ev.Provenance} + if sample := SynthesizeSft(scored, ev, "2026-04-30T00:00:00Z", "id"); sample != nil { + t.Errorf("extraction record must produce nil sample, got %+v", sample) + } +} + +// TestSynthesizeSft_RejectsEmptyText: text is the response side of +// the SFT pair; empty text means nothing to learn. +func TestSynthesizeSft_RejectsEmptyText(t *testing.T) { + ev := EvidenceRecord{ + RunID: "rid", + ModelRole: RoleExecutor, + Text: " \n\t", + Provenance: Provenance{SourceFile: "data/_kb/scrum_reviews.jsonl", SigHash: "abc", RecordedAt: "2026-04-30T00:00:00Z"}, + } + scored := ScoredRun{EvidenceRunID: "rid", Category: CategoryAccepted, Provenance: ev.Provenance} + if sample := SynthesizeSft(scored, ev, "2026-04-30T00:00:00Z", "id"); sample != nil { + t.Errorf("empty-text record must produce nil sample, got %+v", sample) + } +} + +// TestSynthesizeSft_ContextAssembly verifies the terse " · "-joined +// context string carries matrix corpora + pathway fingerprints + +// model name in the documented order. +func TestSynthesizeSft_ContextAssembly(t *testing.T) { + ev := EvidenceRecord{ + RunID: "rid", + ModelRole: RoleReviewer, + Text: "verdict", + ModelName: "qwen3.5", + RetrievedContext: &RetrievedContext{ + MatrixCorpora: []string{"workers", "candidates"}, + PathwayFingerprintsSeen: 88, + }, + Provenance: Provenance{SourceFile: "data/_kb/scrum_reviews.jsonl", SigHash: "abc", RecordedAt: "2026-04-30T00:00:00Z"}, + } + scored := ScoredRun{EvidenceRunID: "rid", Category: CategoryAccepted, Provenance: ev.Provenance} + sample := SynthesizeSft(scored, ev, "2026-04-30T00:00:00Z", "id") + if sample == nil { + t.Fatalf("expected non-nil sample") + } + want := "matrix=workers,candidates · pathway_fingerprints=88 · model=qwen3.5" + if sample.Context != want { + t.Errorf("context mismatch:\n got: %q\n want: %q", sample.Context, want) + } +} + +// TestExportSft_FullPort_WritesJSONL covers the fully-ported path: +// scored runs + paired evidence both present, synthesis produces +// SftSamples, output JSONL is written. Locks the end-to-end +// contract that next-wave changes (synthesis tweaks, output layout) +// have to preserve. +func TestExportSft_FullPort_WritesJSONL(t *testing.T) { + tmp := t.TempDir() + scoredDir := filepath.Join(tmp, "data", "scored-runs", "2026", "04", "30") + evidenceDir := filepath.Join(tmp, "data", "evidence", "2026", "04", "30") + for _, d := range []string{scoredDir, evidenceDir} { + if err := os.MkdirAll(d, 0o755); err != nil { + t.Fatalf("mkdir %s: %v", d, err) + } + } + scoredJSONL := `{"category":"accepted","evidence_run_id":"r1","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h1","recorded_at":"2026-04-30T00:00:00Z"}} +{"category":"rejected","evidence_run_id":"r2","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h2","recorded_at":"2026-04-30T00:00:00Z"}} +` + evidenceJSONL := `{"run_id":"r1","model_role":"executor","text":"some review output","source_files":["src/foo.rs"],"provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h1","recorded_at":"2026-04-30T00:00:00Z"}} +{"run_id":"r2","model_role":"executor","text":"another output","source_files":["src/bar.rs"],"provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h2","recorded_at":"2026-04-30T00:00:00Z"}} +` + if err := os.WriteFile(filepath.Join(scoredDir, "run.jsonl"), []byte(scoredJSONL), 0o644); err != nil { + t.Fatalf("write scored: %v", err) + } + if err := os.WriteFile(filepath.Join(evidenceDir, "run.jsonl"), []byte(evidenceJSONL), 0o644); err != nil { + t.Fatalf("write evidence: %v", err) + } + res, err := ExportSft(ExportSftOptions{ + Root: tmp, + RecordedAt: "2026-04-30T00:00:00Z", + }) + if err != nil { + t.Fatalf("ExportSft: %v", err) + } + if res.RecordsRead != 2 || res.RecordsExported != 1 || res.RecordsQuarantined != 1 { + t.Errorf("counts: read=%d exported=%d quarantined=%d (want 2/1/1)", + res.RecordsRead, res.RecordsExported, res.RecordsQuarantined) + } + out, err := os.ReadFile(res.OutputPath) + if err != nil { + t.Fatalf("read output: %v", err) + } + if !strings.Contains(string(out), "Review the file 'src/foo.rs'") { + t.Errorf("output missing expected scrum_reviews instruction; got:\n%s", string(out)) + } + if strings.Contains(string(out), "src/bar.rs") { + t.Errorf("output contains rejected record's source_file — firewall leak") + } +} + +// TestExportSft_FirewallFiresBeforeEvidenceLoad locks the order-of- +// operations: even if evidence records are missing, the +// firewall counts records as quarantined so the contamination +// guarantee never depends on side data being present. Records +// that pass the firewall but lack evidence get the more honest +// "not instructable" label rather than being silently exported. +func TestExportSft_FirewallFiresBeforeEvidenceLoad(t *testing.T) { tmp := t.TempDir() dir := filepath.Join(tmp, "data", "scored-runs", "2026", "04", "30") if err := os.MkdirAll(dir, 0o755); err != nil { t.Fatalf("mkdir: %v", err) } - // Two scored runs: one passes the firewall, one is blocked. - jsonl := `{"category":"accepted","run_id":"r1","task_id":"t1"} -{"category":"rejected","run_id":"r2","task_id":"t2"} -{"category":"partially_accepted","run_id":"r3","task_id":"t3"} -{"category":"needs_human_review","run_id":"r4","task_id":"t4"} + jsonl := `{"category":"accepted","evidence_run_id":"r1","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h1","recorded_at":"2026-04-30T00:00:00Z"}} +{"category":"rejected","evidence_run_id":"r2","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h2","recorded_at":"2026-04-30T00:00:00Z"}} +{"category":"partially_accepted","evidence_run_id":"r3","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h3","recorded_at":"2026-04-30T00:00:00Z"}} +{"category":"needs_human_review","evidence_run_id":"r4","provenance":{"source_file":"data/_kb/scrum_reviews.jsonl","sig_hash":"h4","recorded_at":"2026-04-30T00:00:00Z"}} ` if err := os.WriteFile(filepath.Join(dir, "run.jsonl"), []byte(jsonl), 0o644); err != nil { t.Fatalf("write: %v", err) } + // No evidence directory created → records that pass the firewall + // land in "not instructable" since synthesis can't proceed. res, err := ExportSft(ExportSftOptions{ Root: tmp, RecordedAt: "2026-04-30T00:00:00Z", @@ -165,10 +373,13 @@ func TestExportSft_PartialPort_FirewallFires(t *testing.T) { if res.RecordsRead != 4 { t.Errorf("RecordsRead: got %d, want 4", res.RecordsRead) } - if res.RecordsExported != 2 { - t.Errorf("RecordsExported (firewall-passing): got %d, want 2", res.RecordsExported) - } if res.RecordsQuarantined != 2 { t.Errorf("RecordsQuarantined (firewall-blocked): got %d, want 2", res.RecordsQuarantined) } + if res.RecordsExported != 0 { + t.Errorf("RecordsExported with no evidence: got %d, want 0", res.RecordsExported) + } + if !strings.Contains(res.QuarantineSummary, "not-instructable=2") { + t.Errorf("expected quarantine summary to flag 2 not-instructable, got %q", res.QuarantineSummary) + } }