// golangLAKEHOUSE/internal/validator/email.go

package validator

import (
	"fmt"
	"strings"
	"time"
)

// EmailValidator is the Go port of Rust's EmailValidator. Per
// `crates/validator/src/staffing/email.rs` it checks, in order:
//
//   - Schema (`to` and `body` fields present and strings)
//   - Length (SMS body ≤ 160; email subject ≤ 78 — both measured
//     with len(), i.e. in bytes)
//   - PII absence (no SSN-shape / salary leakage in subject or body)
//   - Worker-name consistency (body mentions worker first name)
//
// PII detection is std-only — no regex dependency. Two scanners:
//   - SSN-shape: NNN-NN-NNNN with run-of-digits guards (so phone
//     numbers like NNN-NNN-NNNN don't false-positive).
//   - Salary disclosure: keywords near a `$amount` substring.
//
// Both are intended to mirror Rust byte-for-byte so cross-runtime
// audit logs agree on which messages get flagged.
type EmailValidator struct {
	// workers resolves a draft's _context.candidate_id to a worker
	// record; it is consulted only by the name-consistency check.
	workers WorkerLookup
}

// NewEmailValidator returns an EmailValidator that resolves candidate
// IDs through workers. The lookup is stored as given and is only
// consulted when a draft carries a non-empty _context.candidate_id,
// so callers that don't need the name-consistency check (e.g. generic
// broadcast templates) can pass NewInMemoryWorkerLookup(nil).
func NewEmailValidator(workers WorkerLookup) *EmailValidator {
	ev := new(EmailValidator)
	ev.workers = workers
	return ev
}

// Name satisfies Validator. It returns the stable identifier recorded
// in audit trails / receipts; the value matches the Rust output
// "staffing.email".
func (v *EmailValidator) Name() string {
	const validatorName = "staffing.email"
	return validatorName
}

// Channel-shape limits — match Rust exactly. Validate compares both
// against len() of the corresponding field.
const (
	// Upper bound on the body of a draft whose kind is "sms".
	smsMaxChars          = 160
	// Upper bound on the subject line, when one is present.
	emailSubjectMaxChars = 78
)

// Validate implements Validator for email/SMS drafts. Checks run in a
// fixed order — schema → length → PII → worker-name consistency — and
// the first hard failure is returned as a *ValidationError alongside a
// zero Report. A draft that passes yields a Report whose only possible
// finding is a SeverityWarning that the body never mentions the
// worker's first name.
//
// NOTE(review): both length limits are compared against len(), which
// counts bytes rather than characters, so multi-byte UTF-8 text hits
// the limits earlier than the "chars" wording in the messages
// suggests — confirm this matches the Rust implementation.
func (v *EmailValidator) Validate(artifact Artifact) (Report, error) {
	started := time.Now()

	draft := artifact.EmailDraft
	if draft == nil {
		return Report{}, &ValidationError{
			Kind:   ErrSchema,
			Field:  "artifact",
			Reason: fmt.Sprintf("EmailValidator expects EmailDraft, got %s", artifact.Kind()),
		}
	}

	// ── Schema: `to` and `body` must both be strings ──
	if _, isString := draft["to"].(string); !isString {
		return Report{}, &ValidationError{
			Kind:   ErrSchema,
			Field:  "to",
			Reason: "missing or not a string",
		}
	}
	body, isString := draft["body"].(string)
	if !isString {
		return Report{}, &ValidationError{
			Kind:   ErrSchema,
			Field:  "body",
			Reason: "missing or not a string",
		}
	}

	// Optional fields: an absent or non-string value degrades to "",
	// which can neither exceed a limit nor select the SMS rule.
	subject, _ := draft["subject"].(string)
	kind, _ := draft["kind"].(string)

	// ── Length checks ──
	if kind == "sms" && len(body) > smsMaxChars {
		return Report{}, &ValidationError{
			Kind: ErrCompleteness,
			Reason: fmt.Sprintf("SMS body is %d chars, max %d",
				len(body), smsMaxChars),
		}
	}
	if len(subject) > emailSubjectMaxChars {
		return Report{}, &ValidationError{
			Kind: ErrCompleteness,
			Reason: fmt.Sprintf("email subject is %d chars, max %d",
				len(subject), emailSubjectMaxChars),
		}
	}

	// ── PII scan over subject + body combined ──
	// The SSN check takes precedence over the salary check.
	scanned := subject + " " + body
	switch {
	case containsSSNPattern(scanned):
		return Report{}, &ValidationError{
			Kind:   ErrPolicy,
			Reason: "body contains an SSN-shaped sequence (NNN-NN-NNNN); strip before send",
		}
	case containsSalaryDisclosure(scanned):
		return Report{}, &ValidationError{
			Kind:   ErrPolicy,
			Reason: "body discloses salary/compensation amount; staffing PII rule says strip before send",
		}
	}

	// ── Worker-name consistency ──
	// Reading from a nil map is safe, so a missing or mistyped
	// _context simply produces an empty candidate ID and the whole
	// check is skipped.
	var findings []Finding
	ctx, _ := draft["_context"].(map[string]any)
	if cid, _ := ctx["candidate_id"].(string); cid != "" {
		worker, known := v.workers.Find(cid)
		if !known {
			return Report{}, &ValidationError{
				Kind:   ErrConsistency,
				Reason: fmt.Sprintf("_context.candidate_id %q not found in worker roster", cid),
			}
		}
		// Drafts addressed to someone other than the contracted
		// worker are a recurring LLM mistake, so warn when the
		// worker's first name never appears in the body. The match
		// is a case-insensitive substring search.
		if nameParts := strings.Fields(worker.Name); len(nameParts) > 0 {
			firstName := nameParts[0]
			mentioned := strings.Contains(strings.ToLower(body), strings.ToLower(firstName))
			if !mentioned {
				findings = append(findings, Finding{
					Field:    "body",
					Severity: SeverityWarning,
					Message: fmt.Sprintf(
						"body doesn't mention worker first name %q (candidate_id %q)",
						firstName, cid,
					),
				})
			}
		}
	}

	return Report{
		Findings:  findings,
		ElapsedMs: elapsed(started),
	}, nil
}

// ── PII scanners — std-only, mirror Rust byte-for-byte ──────────

// containsSSNPattern reports whether s contains a U.S.-SSN-shaped
// byte sequence: three ASCII digits, a dash, two digits, a dash, four
// digits (NNN-NN-NNNN).
//
// A shape match is discarded when the byte immediately before or after
// it is itself an ASCII digit or a dash, because the match is then
// part of a longer numeric run rather than a standalone SSN. Phone
// numbers written NNN-NNN-NNNN never match the shape at all.
//
// Critical: this fires on PII in real-world drafts. Don't relax the
// flanking guards without a regression test that exercises both
// cases (an actual SSN should fire, a phone-NNN-NNN-NNNN should not).
func containsSSNPattern(s string) bool {
	// 'd' marks a digit slot; '-' must match literally.
	const template = "ddd-dd-dddd"
	const width = len(template)

	// extendsRun reports whether c would make a match part of a
	// longer digit/dash run.
	extendsRun := func(c byte) bool {
		return c == '-' || isAsciiDigit(c)
	}

candidates:
	for start := 0; start+width <= len(s); start++ {
		for off := 0; off < width; off++ {
			c := s[start+off]
			if template[off] == '-' {
				if c != '-' {
					continue candidates
				}
			} else if !isAsciiDigit(c) {
				continue candidates
			}
		}

		end := start + width
		if start > 0 && extendsRun(s[start-1]) {
			continue
		}
		if end < len(s) && extendsRun(s[end]) {
			continue
		}
		return true
	}
	return false
}

// isAsciiDigit reports whether b is one of the ASCII bytes '0'..'9'.
func isAsciiDigit(b byte) bool {
	return '0' <= b && b <= '9'
}

// containsSalaryDisclosure reports whether s appears to disclose a
// salary/compensation figure: one of the keywords "salary",
// "compensation", "pay rate", "bill rate" or "hourly rate" starts
// within 40 bytes (either direction) of a '$' that is immediately
// followed by an ASCII digit. Matching is case-insensitive; all
// offsets are taken in the lowercased copy of s.
//
// Coarse on purpose — better to false-positive on a legit phrase
// like "discuss your hourly rate of $30/hr" than to miss a real
// disclosure. Operators tuning this should add tests, not loosen
// the check.
func containsSalaryDisclosure(s string) bool {
	const proximity = 40
	text := strings.ToLower(s)

	// Offsets of every "$<digit>" occurrence.
	var amounts []int
	for i := 0; i+1 < len(text); i++ {
		if text[i] == '$' && isAsciiDigit(text[i+1]) {
			amounts = append(amounts, i)
		}
	}
	if len(amounts) == 0 {
		return false
	}

	for _, kw := range [...]string{"salary", "compensation", "pay rate", "bill rate", "hourly rate"} {
		from := 0
		for {
			rel := strings.Index(text[from:], kw)
			if rel < 0 {
				break
			}
			at := from + rel
			for _, amt := range amounts {
				if absDiff(at, amt) <= proximity {
					return true
				}
			}
			// Resume the search just past this occurrence.
			from = at + len(kw)
		}
	}
	return false
}

// absDiff returns the distance between a and b, computed as the
// larger value minus the smaller one.
func absDiff(a, b int) int {
	lo, hi := a, b
	if lo > hi {
		lo, hi = hi, lo
	}
	return hi - lo
}