Per architecture_comparison.md (universal win for the Go side): ports the Rust crates/validator/src/staffing/ to internal/validator/.

This is the production safety net Go was missing — FillValidator catches phantom worker IDs plus status/blacklist/geo/role mismatches; EmailValidator catches SSN-shaped PII, salary disclosure, and wrong-target names in email/SMS drafts.

Files:
- types.go: Artifact (FillProposal | EmailDraft), Validator interface, WorkerLookup interface, ValidationError + Finding + Severity
- lookup.go: InMemoryWorkerLookup with case-insensitive ID lookup
- fill.go: FillValidator — schema → completeness → cross-roster (phantom ID / status / blacklist / geo / role)
- email.go: EmailValidator — schema → length → PII (SSN + salary) → worker-name consistency
- fill_test.go + email_test.go: 24 tests covering the happy path, every error variant, and the load-bearing edge cases (a phone pattern is not flagged as an SSN; the flanking-digit guard rejects extended numeric runs)

Validator names match Rust (staffing.fill / staffing.email) so cross-runtime audit logs share the same identifier. The PII scanners (containsSSNPattern, containsSalaryDisclosure) are ported byte-for-byte so a draft flagged by one runtime is flagged by the other.

Caveat: the Rust validator crate also has parquet_lookup.rs (loads workers_500k.parquet at startup) and playbook.rs (additional checks). Those weren't ported in this wave — only the two load-bearing validators that were named in the comparison doc.

Closes one of the two universal-win items for the Go side. The other (materializer port) remains deferred — it's a bigger surface change and depends on the transforms.ts source-class adapters.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
271 lines
7.3 KiB
Go
271 lines
7.3 KiB
Go
package validator
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// EmailValidator is the Go port of Rust's EmailValidator. Per
|
|
// `crates/validator/src/staffing/email.rs`:
|
|
//
|
|
// - Schema (TO/BODY fields present)
|
|
// - Length (SMS ≤ 160 chars; email subject ≤ 78 chars)
|
|
// - PII absence (no SSN-shape / salary leakage)
|
|
// - Worker-name consistency (body mentions worker first name)
|
|
//
|
|
// PII detection is std-only — no regex dependency. Two scanners:
|
|
// - SSN-shape: NNN-NN-NNNN with run-of-digits guards (so phone
|
|
// numbers like NNN-NNN-NNNN don't false-positive).
|
|
// - Salary disclosure: keywords near a `$amount` substring.
|
|
//
|
|
// Both mirror Rust byte-for-byte so cross-runtime audit logs
|
|
// agree on which messages get flagged.
|
|
type EmailValidator struct {
|
|
workers WorkerLookup
|
|
}
|
|
|
|
// NewEmailValidator constructs an EmailValidator with the given
|
|
// lookup. If you don't need the name-consistency check (e.g.
|
|
// generic broadcast templates), pass NewInMemoryWorkerLookup(nil)
|
|
// — the validator skips the worker check when _context.candidate_id
|
|
// is absent.
|
|
func NewEmailValidator(workers WorkerLookup) *EmailValidator {
|
|
return &EmailValidator{workers: workers}
|
|
}
|
|
|
|
// Name satisfies Validator. Stable string used for audit trail /
|
|
// receipts. Matches Rust output "staffing.email".
|
|
func (v *EmailValidator) Name() string { return "staffing.email" }
|
|
|
|
// Channel-shape limits. The values are intended to match the Rust
// validator exactly.
const (
	// emailSubjectMaxChars is the longest email subject accepted.
	emailSubjectMaxChars = 78
	// smsMaxChars is the longest SMS body accepted.
	smsMaxChars = 160
)
|
|
|
|
// Validate implements Validator. Order: schema → length →
|
|
// PII → worker-name consistency.
|
|
func (v *EmailValidator) Validate(artifact Artifact) (Report, error) {
|
|
started := time.Now()
|
|
value := artifact.EmailDraft
|
|
if value == nil {
|
|
return Report{}, &ValidationError{
|
|
Kind: ErrSchema,
|
|
Field: "artifact",
|
|
Reason: fmt.Sprintf("EmailValidator expects EmailDraft, got %s", artifact.Kind()),
|
|
}
|
|
}
|
|
|
|
// ── Schema (`to` + `body` required) ──
|
|
if _, ok := value["to"].(string); !ok {
|
|
return Report{}, &ValidationError{
|
|
Kind: ErrSchema,
|
|
Field: "to",
|
|
Reason: "missing or not a string",
|
|
}
|
|
}
|
|
body, ok := value["body"].(string)
|
|
if !ok {
|
|
return Report{}, &ValidationError{
|
|
Kind: ErrSchema,
|
|
Field: "body",
|
|
Reason: "missing or not a string",
|
|
}
|
|
}
|
|
|
|
// ── Length checks ──
|
|
isSMS := false
|
|
if k, ok := value["kind"].(string); ok && k == "sms" {
|
|
isSMS = true
|
|
}
|
|
if isSMS && len(body) > smsMaxChars {
|
|
return Report{}, &ValidationError{
|
|
Kind: ErrCompleteness,
|
|
Reason: fmt.Sprintf("SMS body is %d chars, max %d",
|
|
len(body), smsMaxChars),
|
|
}
|
|
}
|
|
if subject, ok := value["subject"].(string); ok && len(subject) > emailSubjectMaxChars {
|
|
return Report{}, &ValidationError{
|
|
Kind: ErrCompleteness,
|
|
Reason: fmt.Sprintf("email subject is %d chars, max %d",
|
|
len(subject), emailSubjectMaxChars),
|
|
}
|
|
}
|
|
|
|
// ── PII scan over subject + body combined ──
|
|
var subjectStr string
|
|
if s, ok := value["subject"].(string); ok {
|
|
subjectStr = s
|
|
}
|
|
scanned := subjectStr + " " + body
|
|
if containsSSNPattern(scanned) {
|
|
return Report{}, &ValidationError{
|
|
Kind: ErrPolicy,
|
|
Reason: "body contains an SSN-shaped sequence (NNN-NN-NNNN); strip before send",
|
|
}
|
|
}
|
|
if containsSalaryDisclosure(scanned) {
|
|
return Report{}, &ValidationError{
|
|
Kind: ErrPolicy,
|
|
Reason: "body discloses salary/compensation amount; staffing PII rule says strip before send",
|
|
}
|
|
}
|
|
|
|
// ── Worker-name consistency ──
|
|
var findings []Finding
|
|
if ctx, ok := value["_context"].(map[string]any); ok {
|
|
if cid, ok := ctx["candidate_id"].(string); ok && cid != "" {
|
|
worker, found := v.workers.Find(cid)
|
|
if !found {
|
|
return Report{}, &ValidationError{
|
|
Kind: ErrConsistency,
|
|
Reason: fmt.Sprintf("_context.candidate_id %q not found in worker roster", cid),
|
|
}
|
|
}
|
|
// Body should mention the worker's name (or at least
|
|
// their first name) — drafts that address a different
|
|
// person than the contracted worker are a recurring
|
|
// LLM mistake.
|
|
first := strings.Fields(worker.Name)
|
|
firstLower := ""
|
|
if len(first) > 0 {
|
|
firstLower = strings.ToLower(first[0])
|
|
}
|
|
bodyLower := strings.ToLower(body)
|
|
if firstLower != "" && !strings.Contains(bodyLower, firstLower) {
|
|
findings = append(findings, Finding{
|
|
Field: "body",
|
|
Severity: SeverityWarning,
|
|
Message: fmt.Sprintf(
|
|
"body doesn't mention worker first name %q (candidate_id %q)",
|
|
first[0], cid,
|
|
),
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
return Report{
|
|
Findings: findings,
|
|
ElapsedMs: elapsed(started),
|
|
}, nil
|
|
}
|
|
|
|
// ── PII scanners — std-only, mirror Rust byte-for-byte ──────────
|
|
|
|
// containsSSNPattern detects an SSN-shaped sequence: 3 digits, dash,
|
|
// 2 digits, dash, 4 digits. Runs-of-digits guards: rejects sequences
|
|
// flanked by digit/dash (so phone-area-code-like NNN-NNN-NNNN isn't
|
|
// flagged). Tight false-positive surface: specifically the
|
|
// NNN-NN-NNNN shape used by U.S. SSNs.
|
|
//
|
|
// Critical: this fires on PII in real-world drafts. Don't relax the
|
|
// flanking guards without a regression test that exercises both
|
|
// cases (an actual SSN should fire, a phone-NNN-NNN-NNNN should not).
|
|
func containsSSNPattern(s string) bool {
|
|
bytes := []byte(s)
|
|
if len(bytes) < 11 {
|
|
return false
|
|
}
|
|
for i := 0; i+11 <= len(bytes); i++ {
|
|
win := bytes[i : i+11]
|
|
shape := true
|
|
for j := 0; j < 11; j++ {
|
|
switch j {
|
|
case 0, 1, 2, 4, 5, 7, 8, 9, 10:
|
|
if !isAsciiDigit(win[j]) {
|
|
shape = false
|
|
}
|
|
case 3, 6:
|
|
if win[j] != '-' {
|
|
shape = false
|
|
}
|
|
}
|
|
if !shape {
|
|
break
|
|
}
|
|
}
|
|
if !shape {
|
|
continue
|
|
}
|
|
// Reject if previous byte is digit or dash — we're
|
|
// inside a longer numeric run, probably not an SSN.
|
|
if i > 0 {
|
|
prev := bytes[i-1]
|
|
if isAsciiDigit(prev) || prev == '-' {
|
|
continue
|
|
}
|
|
}
|
|
// Reject if next byte is digit or dash — same reason.
|
|
if i+11 < len(bytes) {
|
|
next := bytes[i+11]
|
|
if isAsciiDigit(next) || next == '-' {
|
|
continue
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// isAsciiDigit reports whether b is one of the ASCII bytes '0' through
// '9'.
func isAsciiDigit(b byte) bool {
	return '0' <= b && b <= '9'
}
|
|
|
|
// containsSalaryDisclosure detects salary/compensation disclosure:
|
|
// the keywords "salary", "compensation", "pay rate", "bill rate",
|
|
// "hourly rate" appearing within ~40 chars of a `$NNN+` substring.
|
|
//
|
|
// Coarse on purpose — better to false-positive on a legit phrase
|
|
// like "discuss your hourly rate of $30/hr" than to miss a real
|
|
// disclosure. Operators tuning this should add tests, not loosen
|
|
// the check.
|
|
func containsSalaryDisclosure(s string) bool {
|
|
lower := strings.ToLower(s)
|
|
keywords := []string{"salary", "compensation", "pay rate", "bill rate", "hourly rate"}
|
|
|
|
var keywordPositions []int
|
|
for _, kw := range keywords {
|
|
start := 0
|
|
for {
|
|
idx := strings.Index(lower[start:], kw)
|
|
if idx < 0 {
|
|
break
|
|
}
|
|
abs := start + idx
|
|
keywordPositions = append(keywordPositions, abs)
|
|
start = abs + len(kw)
|
|
}
|
|
}
|
|
if len(keywordPositions) == 0 {
|
|
return false
|
|
}
|
|
|
|
var dollarPositions []int
|
|
bytes := []byte(lower)
|
|
for i := 0; i+1 < len(bytes); i++ {
|
|
if bytes[i] == '$' && isAsciiDigit(bytes[i+1]) {
|
|
dollarPositions = append(dollarPositions, i)
|
|
}
|
|
}
|
|
if len(dollarPositions) == 0 {
|
|
return false
|
|
}
|
|
|
|
for _, kp := range keywordPositions {
|
|
for _, dp := range dollarPositions {
|
|
if absDiff(kp, dp) <= 40 {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// absDiff returns the distance between a and b, i.e. the larger value
// minus the smaller one.
func absDiff(a, b int) int {
	lo, hi := a, b
	if lo > hi {
		lo, hi = hi, lo
	}
	return hi - lo
}
|