root b03521a506 validator: port FillValidator + EmailValidator from Rust validator crate
Per architecture_comparison.md, this is a universal win for the Go side: it
ports the Rust crates/validator/src/staffing/ module to internal/validator/.
It supplies the production safety net the Go side was missing — FillValidator
catches phantom worker IDs plus status/blacklist/geo/role mismatches;
EmailValidator catches SSN-shaped PII, salary disclosure, and a wrong-target
name in email/SMS drafts.

Files:
- types.go: Artifact (FillProposal | EmailDraft), Validator interface,
  WorkerLookup interface, ValidationError + Finding + Severity
- lookup.go: InMemoryWorkerLookup with case-insensitive ID lookup
- fill.go: FillValidator — schema → completeness → cross-roster
  (phantom ID / status / blacklist / geo / role)
- email.go: EmailValidator — schema → length → PII (SSN + salary)
  → worker-name consistency
- fill_test.go + email_test.go: 24 tests covering happy path +
  every error variant + the load-bearing edge cases (phone-pattern
  not flagged as SSN, flanking-digit guard rejects extended
  numeric runs)

Validator names match Rust (staffing.fill / staffing.email) so
cross-runtime audit logs share the same identifier. PII scanners
(containsSSNPattern, containsSalaryDisclosure) ported byte-for-byte
so a draft flagged by one runtime is flagged by the other.

Caveat: the Rust validator crate also has parquet_lookup.rs (loads
workers_500k.parquet at startup) and playbook.rs (additional
checks). Those weren't ported in this wave — only the two
load-bearing validators that were named in the comparison doc.

Closes one of the two universal-win items for Go side. The other
(materializer port) remains deferred — it's a bigger surface change
and depends on transforms.ts source-class adapters.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 04:49:55 -05:00

271 lines
7.3 KiB
Go

package validator
import (
"fmt"
"strings"
"time"
)
// EmailValidator validates outbound email/SMS drafts. It is the Go port
// of Rust's EmailValidator (`crates/validator/src/staffing/email.rs`).
//
// Validate runs these checks in order and stops at the first hard failure:
//
//   - Schema: the draft's `to` and `body` entries are present and are strings.
//   - Length: an SMS body (`kind == "sms"`) is at most smsMaxChars and a
//     subject, when present, is at most emailSubjectMaxChars. Both are
//     compared against len(), i.e. the UTF-8 byte length.
//   - PII absence: neither an SSN-shaped sequence nor a salary disclosure
//     appears in subject + body.
//   - Worker-name consistency: when `_context.candidate_id` is set, the
//     body should mention the worker's first name (reported as a warning
//     finding, not an error).
//
// PII detection is std-only — no regex dependency. Two scanners:
//
//   - SSN shape: NNN-NN-NNNN with flanking digit/dash guards, so phone
//     numbers like NNN-NNN-NNNN don't false-positive.
//   - Salary disclosure: a compensation keyword near a `$<digit>` substring.
//
// Both are intended to mirror the Rust implementation byte-for-byte so
// cross-runtime audit logs agree on which messages get flagged.
type EmailValidator struct {
// workers resolves a candidate ID to a roster entry; it is consulted only
// by the worker-name consistency check.
workers WorkerLookup
}
// NewEmailValidator returns an EmailValidator backed by the supplied
// worker lookup.
//
// The lookup is only consulted when a draft carries a non-empty
// _context.candidate_id. Callers that never set it (e.g. generic
// broadcast templates) can pass NewInMemoryWorkerLookup(nil).
func NewEmailValidator(workers WorkerLookup) *EmailValidator {
	ev := new(EmailValidator)
	ev.workers = workers
	return ev
}
// Name implements Validator. It returns the stable identifier recorded in
// audit trails and receipts; the value matches the Rust validator's
// "staffing.email" so both runtimes log the same name.
func (v *EmailValidator) Name() string {
	const validatorName = "staffing.email"
	return validatorName
}
// Channel-shape limits, kept equal to the limits used by the Rust
// implementation. Validate compares them against len() of the string,
// which is the UTF-8 byte length.
const (
// smsMaxChars is the largest SMS body Validate accepts.
smsMaxChars = 160
// emailSubjectMaxChars is the largest subject line Validate accepts.
emailSubjectMaxChars = 78
)
// Validate implements Validator for email/SMS drafts.
//
// Checks run in a fixed order — schema → length → PII → worker-name
// consistency — and the first hard failure is returned as a
// *ValidationError together with a zero Report. A missing first name in
// the body is not a failure: it is reported as a warning Finding on the
// returned Report. On success the Report also carries the elapsed time
// as computed by elapsed.
//
// NOTE(review): the length limits are compared against len(), i.e. the
// UTF-8 byte length, although the messages say "chars". Confirm which
// unit the Rust implementation uses before changing this.
func (v *EmailValidator) Validate(artifact Artifact) (Report, error) {
	begin := time.Now()

	// Every hard failure has the same shape: zero Report plus the error.
	reject := func(verr *ValidationError) (Report, error) {
		return Report{}, verr
	}

	draft := artifact.EmailDraft
	if draft == nil {
		return reject(&ValidationError{
			Kind:   ErrSchema,
			Field:  "artifact",
			Reason: fmt.Sprintf("EmailValidator expects EmailDraft, got %s", artifact.Kind()),
		})
	}

	// Schema: `to` and `body` must both be present as strings. `to` is
	// reported first when both are bad.
	_, toOK := draft["to"].(string)
	body, bodyOK := draft["body"].(string)
	switch {
	case !toOK:
		return reject(&ValidationError{
			Kind:   ErrSchema,
			Field:  "to",
			Reason: "missing or not a string",
		})
	case !bodyOK:
		return reject(&ValidationError{
			Kind:   ErrSchema,
			Field:  "body",
			Reason: "missing or not a string",
		})
	}

	// Length: the body limit applies to SMS drafts only; the subject
	// limit applies whenever a string subject is present. A missing or
	// non-string subject reads as "" and therefore always passes.
	channel, _ := draft["kind"].(string)
	if size := len(body); channel == "sms" && size > smsMaxChars {
		return reject(&ValidationError{
			Kind:   ErrCompleteness,
			Reason: fmt.Sprintf("SMS body is %d chars, max %d", size, smsMaxChars),
		})
	}
	subject, _ := draft["subject"].(string)
	if size := len(subject); size > emailSubjectMaxChars {
		return reject(&ValidationError{
			Kind:   ErrCompleteness,
			Reason: fmt.Sprintf("email subject is %d chars, max %d", size, emailSubjectMaxChars),
		})
	}

	// PII: scan subject and body as one text. The SSN check takes
	// precedence over the salary check.
	scanned := subject + " " + body
	switch {
	case containsSSNPattern(scanned):
		return reject(&ValidationError{
			Kind:   ErrPolicy,
			Reason: "body contains an SSN-shaped sequence (NNN-NN-NNNN); strip before send",
		})
	case containsSalaryDisclosure(scanned):
		return reject(&ValidationError{
			Kind:   ErrPolicy,
			Reason: "body discloses salary/compensation amount; staffing PII rule says strip before send",
		})
	}

	// Worker-name consistency: only when the draft names a candidate.
	// Indexing a nil map is safe, so an absent _context simply yields an
	// empty candidate ID and the check is skipped.
	var findings []Finding
	meta, _ := draft["_context"].(map[string]any)
	if cid, _ := meta["candidate_id"].(string); cid != "" {
		worker, found := v.workers.Find(cid)
		if !found {
			return reject(&ValidationError{
				Kind:   ErrConsistency,
				Reason: fmt.Sprintf("_context.candidate_id %q not found in worker roster", cid),
			})
		}

		// Drafts addressed to someone other than the contracted worker
		// are a recurring LLM mistake, so the body is expected to contain
		// the worker's first name (case-insensitive substring match).
		if nameParts := strings.Fields(worker.Name); len(nameParts) > 0 {
			firstName := nameParts[0]
			if !strings.Contains(strings.ToLower(body), strings.ToLower(firstName)) {
				findings = append(findings, Finding{
					Field:    "body",
					Severity: SeverityWarning,
					Message:  fmt.Sprintf("body doesn't mention worker first name %q (candidate_id %q)", firstName, cid),
				})
			}
		}
	}

	return Report{
		Findings:  findings,
		ElapsedMs: elapsed(begin),
	}, nil
}
// ── PII scanners — std-only, mirror Rust byte-for-byte ──────────
// containsSSNPattern reports whether s contains a U.S. SSN-shaped
// sequence: three ASCII digits, a dash, two digits, a dash, four digits
// (NNN-NN-NNNN).
//
// A match is discarded when the byte immediately before or after it is
// an ASCII digit or a dash, because the sequence is then part of a
// longer numeric run and probably not an SSN. A phone number such as
// NNN-NNN-NNNN never matches the shape in the first place. The scan is
// byte-wise, so only ASCII digits and '-' count.
//
// Critical: this fires on PII in real-world drafts. Don't relax the
// flanking guards without a regression test covering both directions
// (a real SSN must fire, a phone-style NNN-NNN-NNNN must not).
func containsSSNPattern(s string) bool {
	// 'd' marks a digit slot; every other byte must match literally.
	const shape = "ddd-dd-dddd"

	// partOfRun reports whether c would extend a numeric run.
	partOfRun := func(c byte) bool { return c == '-' || isAsciiDigit(c) }

	for end := len(shape); end <= len(s); end++ {
		start := end - len(shape)

		matched := true
		for k := 0; k < len(shape) && matched; k++ {
			c := s[start+k]
			if shape[k] == 'd' {
				matched = isAsciiDigit(c)
			} else {
				matched = c == shape[k]
			}
		}
		if !matched {
			continue
		}

		// Flanking guards: skip candidates embedded in a longer run.
		if start > 0 && partOfRun(s[start-1]) {
			continue
		}
		if end < len(s) && partOfRun(s[end]) {
			continue
		}
		return true
	}
	return false
}
// isAsciiDigit reports whether b is an ASCII decimal digit ('0' through '9').
func isAsciiDigit(b byte) bool {
	return '0' <= b && b <= '9'
}
// containsSalaryDisclosure reports whether s looks like it discloses a
// salary or compensation amount: one of the keywords "salary",
// "compensation", "pay rate", "bill rate" or "hourly rate" occurring
// within 40 bytes of a '$' that is immediately followed by an ASCII
// digit. Matching is case-insensitive, and the distance is measured
// between the first byte of the keyword and the '$'.
//
// Coarse on purpose — better to false-positive on a legit phrase like
// "discuss your hourly rate of $30/hr" than to miss a real disclosure.
// Operators tuning this should add tests, not loosen the check.
func containsSalaryDisclosure(s string) bool {
	const window = 40
	text := strings.ToLower(s)

	// Offsets of every "$<digit>" in the lowered text. Without one there
	// is nothing to disclose, whatever keywords are present.
	var amounts []int
	for i := 0; i+1 < len(text); i++ {
		if text[i] == '$' && isAsciiDigit(text[i+1]) {
			amounts = append(amounts, i)
		}
	}
	if len(amounts) == 0 {
		return false
	}

	// Walk every occurrence of every keyword and test it against each
	// amount offset; the first pair inside the window decides.
	for _, kw := range [...]string{"salary", "compensation", "pay rate", "bill rate", "hourly rate"} {
		from := 0
		for {
			rel := strings.Index(text[from:], kw)
			if rel < 0 {
				break
			}
			at := from + rel
			for _, amt := range amounts {
				if absDiff(at, amt) <= window {
					return true
				}
			}
			from = at + len(kw)
		}
	}
	return false
}
// absDiff returns the distance between a and b, computed as the larger
// value minus the smaller one.
func absDiff(a, b int) int {
	lo, hi := a, b
	if lo > hi {
		lo, hi = hi, lo
	}
	return hi - lo
}