From b03521a506a4276c54ebc62a95beba5f3c6592ea Mon Sep 17 00:00:00 2001 From: root Date: Fri, 1 May 2026 04:49:55 -0500 Subject: [PATCH] validator: port FillValidator + EmailValidator from Rust validator crate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per architecture_comparison.md universal-win for Go side: ports the Rust crates/validator/src/staffing/ to internal/validator/. Production safety net Go was missing — FillValidator catches phantom worker IDs + status/blacklist/geo/role mismatches; EmailValidator catches SSN-shape PII + salary disclosure + wrong-target name in email/SMS drafts. Files: - types.go: Artifact (FillProposal | EmailDraft), Validator interface, WorkerLookup interface, ValidationError + Finding + Severity - lookup.go: InMemoryWorkerLookup with case-insensitive ID lookup - fill.go: FillValidator — schema → completeness → cross-roster (phantom ID / status / blacklist / geo / role) - email.go: EmailValidator — schema → length → PII (SSN + salary) → worker-name consistency - fill_test.go + email_test.go: 30 tests covering happy path + every error variant + the load-bearing edge cases (phone-pattern not flagged as SSN, flanking-digit guard rejects extended numeric runs) Validator names match Rust (staffing.fill / staffing.email) so cross-runtime audit logs share the same identifier. PII scanners (containsSSNPattern, containsSalaryDisclosure) ported byte-for-byte so a draft flagged by one runtime is flagged by the other. Caveat: the Rust validator crate also has parquet_lookup.rs (loads workers_500k.parquet at startup) and playbook.rs (additional checks). Those weren't ported in this wave — only the two load-bearing validators that were named in the comparison doc. Closes one of the two universal-win items for Go side. The other (materializer port) remains deferred — it's a bigger surface change and depends on transforms.ts source-class adapters. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/validator/email.go | 270 ++++++++++++++++++++++++++++++ internal/validator/email_test.go | 220 +++++++++++++++++++++++++ internal/validator/fill.go | 274 +++++++++++++++++++++++++++++++ internal/validator/fill_test.go | 226 +++++++++++++++++++++++++ internal/validator/lookup.go | 56 +++++++ internal/validator/types.go | 144 ++++++++++++++++ 6 files changed, 1190 insertions(+) create mode 100644 internal/validator/email.go create mode 100644 internal/validator/email_test.go create mode 100644 internal/validator/fill.go create mode 100644 internal/validator/fill_test.go create mode 100644 internal/validator/lookup.go create mode 100644 internal/validator/types.go diff --git a/internal/validator/email.go b/internal/validator/email.go new file mode 100644 index 0000000..dd28928 --- /dev/null +++ b/internal/validator/email.go @@ -0,0 +1,270 @@ +package validator + +import ( + "fmt" + "strings" + "time" +) + +// EmailValidator is the Go port of Rust's EmailValidator. Per +// `crates/validator/src/staffing/email.rs`: +// +// - Schema (TO/BODY fields present) +// - Length (SMS ≤ 160 chars; email subject ≤ 78 chars) +// - PII absence (no SSN-shape / salary leakage) +// - Worker-name consistency (body mentions worker first name) +// +// PII detection is std-only — no regex dependency. Two scanners: +// - SSN-shape: NNN-NN-NNNN with run-of-digits guards (so phone +// numbers like NNN-NNN-NNNN don't false-positive). +// - Salary disclosure: keywords near a `$amount` substring. +// +// Both mirror Rust byte-for-byte so cross-runtime audit logs +// agree on which messages get flagged. +type EmailValidator struct { + workers WorkerLookup +} + +// NewEmailValidator constructs an EmailValidator with the given +// lookup. If you don't need the name-consistency check (e.g. +// generic broadcast templates), pass NewInMemoryWorkerLookup(nil) +// — the validator skips the worker check when _context.candidate_id +// is absent. 
+func NewEmailValidator(workers WorkerLookup) *EmailValidator { + return &EmailValidator{workers: workers} +} + +// Name satisfies Validator. Stable string used for audit trail / +// receipts. Matches Rust output "staffing.email". +func (v *EmailValidator) Name() string { return "staffing.email" } + +// Channel-shape limits — match Rust exactly. +const ( + smsMaxChars = 160 + emailSubjectMaxChars = 78 +) + +// Validate implements Validator. Order: schema → length → +// PII → worker-name consistency. +func (v *EmailValidator) Validate(artifact Artifact) (Report, error) { + started := time.Now() + value := artifact.EmailDraft + if value == nil { + return Report{}, &ValidationError{ + Kind: ErrSchema, + Field: "artifact", + Reason: fmt.Sprintf("EmailValidator expects EmailDraft, got %s", artifact.Kind()), + } + } + + // ── Schema (`to` + `body` required) ── + if _, ok := value["to"].(string); !ok { + return Report{}, &ValidationError{ + Kind: ErrSchema, + Field: "to", + Reason: "missing or not a string", + } + } + body, ok := value["body"].(string) + if !ok { + return Report{}, &ValidationError{ + Kind: ErrSchema, + Field: "body", + Reason: "missing or not a string", + } + } + + // ── Length checks ── + isSMS := false + if k, ok := value["kind"].(string); ok && k == "sms" { + isSMS = true + } + if isSMS && len(body) > smsMaxChars { + return Report{}, &ValidationError{ + Kind: ErrCompleteness, + Reason: fmt.Sprintf("SMS body is %d chars, max %d", + len(body), smsMaxChars), + } + } + if subject, ok := value["subject"].(string); ok && len(subject) > emailSubjectMaxChars { + return Report{}, &ValidationError{ + Kind: ErrCompleteness, + Reason: fmt.Sprintf("email subject is %d chars, max %d", + len(subject), emailSubjectMaxChars), + } + } + + // ── PII scan over subject + body combined ── + var subjectStr string + if s, ok := value["subject"].(string); ok { + subjectStr = s + } + scanned := subjectStr + " " + body + if containsSSNPattern(scanned) { + return Report{}, 
&ValidationError{ + Kind: ErrPolicy, + Reason: "body contains an SSN-shaped sequence (NNN-NN-NNNN); strip before send", + } + } + if containsSalaryDisclosure(scanned) { + return Report{}, &ValidationError{ + Kind: ErrPolicy, + Reason: "body discloses salary/compensation amount; staffing PII rule says strip before send", + } + } + + // ── Worker-name consistency ── + var findings []Finding + if ctx, ok := value["_context"].(map[string]any); ok { + if cid, ok := ctx["candidate_id"].(string); ok && cid != "" { + worker, found := v.workers.Find(cid) + if !found { + return Report{}, &ValidationError{ + Kind: ErrConsistency, + Reason: fmt.Sprintf("_context.candidate_id %q not found in worker roster", cid), + } + } + // Body should mention the worker's name (or at least + // their first name) — drafts that address a different + // person than the contracted worker are a recurring + // LLM mistake. + first := strings.Fields(worker.Name) + firstLower := "" + if len(first) > 0 { + firstLower = strings.ToLower(first[0]) + } + bodyLower := strings.ToLower(body) + if firstLower != "" && !strings.Contains(bodyLower, firstLower) { + findings = append(findings, Finding{ + Field: "body", + Severity: SeverityWarning, + Message: fmt.Sprintf( + "body doesn't mention worker first name %q (candidate_id %q)", + first[0], cid, + ), + }) + } + } + } + + return Report{ + Findings: findings, + ElapsedMs: elapsed(started), + }, nil +} + +// ── PII scanners — std-only, mirror Rust byte-for-byte ────────── + +// containsSSNPattern detects an SSN-shaped sequence: 3 digits, dash, +// 2 digits, dash, 4 digits. Runs-of-digits guards: rejects sequences +// flanked by digit/dash (so phone-area-code-like NNN-NNN-NNNN isn't +// flagged). Tight false-positive surface: specifically the +// NNN-NN-NNNN shape used by U.S. SSNs. +// +// Critical: this fires on PII in real-world drafts. 
// Don't relax the flanking guards without a regression test that
// exercises both cases (an actual SSN should fire, a
// phone-NNN-NNN-NNNN should not).
func containsSSNPattern(s string) bool {
	const width = 11 // len("NNN-NN-NNNN")

	// inRun reports whether b would extend a candidate into a longer
	// digit/dash run, which disqualifies it as a standalone SSN.
	inRun := func(b byte) bool { return b == '-' || isAsciiDigit(b) }

	// Slide an 11-byte window over s; strings shorter than the window
	// never enter the loop.
	for lo, hi := 0, width; hi <= len(s); lo, hi = lo+1, hi+1 {
		// Shape: dashes at offsets 3 and 6, ASCII digits elsewhere.
		shaped := true
		for k := 0; k < width && shaped; k++ {
			if k == 3 || k == 6 {
				shaped = s[lo+k] == '-'
			} else {
				shaped = isAsciiDigit(s[lo+k])
			}
		}
		if !shaped {
			continue
		}
		// Flanking guards: a digit or dash immediately before or after
		// the window means we are inside a longer numeric run.
		if lo > 0 && inRun(s[lo-1]) {
			continue
		}
		if hi < len(s) && inRun(s[hi]) {
			continue
		}
		return true
	}
	return false
}

// isAsciiDigit reports whether b is one of the bytes '0' through '9'.
func isAsciiDigit(b byte) bool { return '0' <= b && b <= '9' }

// containsSalaryDisclosure detects salary/compensation disclosure:
// the keywords "salary", "compensation", "pay rate", "bill rate",
// "hourly rate" appearing within ~40 chars of a `$NNN+` substring.
//
// Coarse on purpose — better to false-positive on a legit phrase
// like "discuss your hourly rate of $30/hr" than to miss a real
// disclosure. Operators tuning this should add tests, not loosen
// the check.
+func containsSalaryDisclosure(s string) bool { + lower := strings.ToLower(s) + keywords := []string{"salary", "compensation", "pay rate", "bill rate", "hourly rate"} + + var keywordPositions []int + for _, kw := range keywords { + start := 0 + for { + idx := strings.Index(lower[start:], kw) + if idx < 0 { + break + } + abs := start + idx + keywordPositions = append(keywordPositions, abs) + start = abs + len(kw) + } + } + if len(keywordPositions) == 0 { + return false + } + + var dollarPositions []int + bytes := []byte(lower) + for i := 0; i+1 < len(bytes); i++ { + if bytes[i] == '$' && isAsciiDigit(bytes[i+1]) { + dollarPositions = append(dollarPositions, i) + } + } + if len(dollarPositions) == 0 { + return false + } + + for _, kp := range keywordPositions { + for _, dp := range dollarPositions { + if absDiff(kp, dp) <= 40 { + return true + } + } + } + return false +} + +func absDiff(a, b int) int { + if a > b { + return a - b + } + return b - a +} diff --git a/internal/validator/email_test.go b/internal/validator/email_test.go new file mode 100644 index 0000000..7a74744 --- /dev/null +++ b/internal/validator/email_test.go @@ -0,0 +1,220 @@ +package validator + +import "testing" + +// ── Schema ── + +func TestEmail_WrongArtifactType_FailsSchema(t *testing.T) { + v := NewEmailValidator(mkLookup()) + _, err := v.Validate(Artifact{FillProposal: map[string]any{}}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrSchema { + t.Errorf("expected schema error on wrong artifact, got %+v", ve) + } +} + +func TestEmail_MissingTo_FailsSchema(t *testing.T) { + v := NewEmailValidator(mkLookup()) + _, err := v.Validate(Artifact{EmailDraft: map[string]any{"body": "hi"}}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrSchema || ve.Field != "to" { + t.Errorf("expected schema/to error, got %+v", ve) + } +} + +func TestEmail_MissingBody_FailsSchema(t *testing.T) { + v := NewEmailValidator(mkLookup()) + _, err := v.Validate(Artifact{EmailDraft: 
map[string]any{"to": "a@b.com"}}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrSchema || ve.Field != "body" { + t.Errorf("expected schema/body error, got %+v", ve) + } +} + +// ── Length limits ── + +func TestEmail_LongSMS_FailsCompleteness(t *testing.T) { + v := NewEmailValidator(mkLookup()) + body := make([]byte, 200) + for i := range body { + body[i] = 'x' + } + _, err := v.Validate(Artifact{EmailDraft: map[string]any{ + "to": "+15555550123", + "body": string(body), + "kind": "sms", + }}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrCompleteness { + t.Errorf("expected completeness error on long SMS, got %+v", ve) + } +} + +func TestEmail_LongSubject_FailsCompleteness(t *testing.T) { + v := NewEmailValidator(mkLookup()) + subject := make([]byte, 100) + for i := range subject { + subject[i] = 'x' + } + _, err := v.Validate(Artifact{EmailDraft: map[string]any{ + "to": "a@b.com", + "body": "hi", + "subject": string(subject), + }}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrCompleteness { + t.Errorf("expected completeness error on long subject, got %+v", ve) + } +} + +// ── PII: SSN ── + +func TestEmail_SSNInBody_FailsPolicy(t *testing.T) { + v := NewEmailValidator(mkLookup()) + _, err := v.Validate(Artifact{EmailDraft: map[string]any{ + "to": "a@b.com", + "body": "Their SSN is 123-45-6789, please file accordingly.", + }}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrPolicy { + t.Errorf("expected policy error on SSN, got %+v", ve) + } +} + +func TestEmail_PhonePatternNotFlaggedAsSSN(t *testing.T) { + // NNN-NNN-NNNN (phone) must NOT trigger the NNN-NN-NNNN check. + // Critical false-positive case from Rust phone-pattern test. 
+ v := NewEmailValidator(mkLookup()) + _, err := v.Validate(Artifact{EmailDraft: map[string]any{ + "to": "a@b.com", + "body": "Call me at 555-123-4567 to confirm.", + }}) + if err != nil { + t.Errorf("phone pattern should NOT trigger SSN policy, got %v", err) + } +} + +func TestEmail_SSNInsideLongerNumericRun_NotFlagged(t *testing.T) { + // 1234-56-78901 has the right shape pattern at offset 0 but + // flanking digits → not an SSN. Mirrors Rust's flanking-digit + // guard test. + v := NewEmailValidator(mkLookup()) + _, err := v.Validate(Artifact{EmailDraft: map[string]any{ + "to": "a@b.com", + "body": "ID 1234-56-78901 is the new format.", + }}) + if err != nil { + t.Errorf("flanking-digit guard should reject this, got %v", err) + } +} + +// ── PII: salary ── + +func TestEmail_SalaryDisclosure_FailsPolicy(t *testing.T) { + v := NewEmailValidator(mkLookup()) + _, err := v.Validate(Artifact{EmailDraft: map[string]any{ + "to": "a@b.com", + "body": "Their salary is $45000 — please confirm before sending offer.", + }}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrPolicy { + t.Errorf("expected policy error on salary disclosure, got %+v", ve) + } +} + +func TestEmail_HourlyRateDisclosure_FailsPolicy(t *testing.T) { + v := NewEmailValidator(mkLookup()) + _, err := v.Validate(Artifact{EmailDraft: map[string]any{ + "to": "a@b.com", + "body": "Discuss your hourly rate of $30 with the client when you arrive.", + }}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrPolicy { + t.Errorf("expected policy error on hourly rate, got %+v", ve) + } +} + +func TestEmail_DollarFar_NotFlagged(t *testing.T) { + // $ amount > 40 chars from the keyword → not flagged. + v := NewEmailValidator(mkLookup()) + body := "We're paid by salary, but the parking validation costs " + + "about three more sentences worth of text appearing in between, " + + "and then much later at $50 the trip is too expensive." 
+ _, err := v.Validate(Artifact{EmailDraft: map[string]any{ + "to": "a@b.com", "body": body, + }}) + if err != nil { + t.Errorf("salary keyword far from $ amount should not flag, got %v", err) + } +} + +// ── Worker-name consistency ── + +func TestEmail_NameMissingFromBody_EmitsWarning(t *testing.T) { + v := NewEmailValidator(mkLookup(mkWorker("w1", "Alice Smith", "active", "Toledo", "OH", "Welder"))) + report, err := v.Validate(Artifact{EmailDraft: map[string]any{ + "to": "a@b.com", + "body": "Hello, please confirm your shift tomorrow.", + "_context": map[string]any{"candidate_id": "w1"}, + }}) + if err != nil { + t.Fatalf("name mismatch should NOT error (warning only), got %v", err) + } + if len(report.Findings) != 1 || report.Findings[0].Severity != SeverityWarning { + t.Errorf("expected 1 warning finding, got %v", report.Findings) + } +} + +func TestEmail_NameInBody_NoFinding(t *testing.T) { + v := NewEmailValidator(mkLookup(mkWorker("w1", "Alice Smith", "active", "Toledo", "OH", "Welder"))) + report, err := v.Validate(Artifact{EmailDraft: map[string]any{ + "to": "a@b.com", + "body": "Hi Alice, please confirm tomorrow.", + "_context": map[string]any{"candidate_id": "w1"}, + }}) + if err != nil { + t.Fatalf("expected pass, got %v", err) + } + if len(report.Findings) != 0 { + t.Errorf("expected zero findings, got %v", report.Findings) + } +} + +func TestEmail_PhantomCandidateID_FailsConsistency(t *testing.T) { + v := NewEmailValidator(mkLookup()) + _, err := v.Validate(Artifact{EmailDraft: map[string]any{ + "to": "a@b.com", + "body": "Hi Alice", + "_context": map[string]any{"candidate_id": "phantom"}, + }}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrConsistency { + t.Errorf("expected consistency error on phantom ID, got %+v", ve) + } +} + +// ── Happy path ── + +func TestEmail_WellFormed_Passes(t *testing.T) { + v := NewEmailValidator(mkLookup()) + report, err := v.Validate(Artifact{EmailDraft: map[string]any{ + "to": "alice@example.com", + 
"subject": "Shift confirmation", + "body": "Please confirm your shift starts at 9am tomorrow.", + }}) + if err != nil { + t.Errorf("well-formed email should pass, got %v", err) + } + if len(report.Findings) != 0 { + t.Errorf("expected zero findings, got %v", report.Findings) + } +} + +// ── Validator name is stable ── + +func TestEmail_NameMatchesRust(t *testing.T) { + v := NewEmailValidator(mkLookup()) + if v.Name() != "staffing.email" { + t.Errorf("name should match Rust 'staffing.email', got %q", v.Name()) + } +} diff --git a/internal/validator/fill.go b/internal/validator/fill.go new file mode 100644 index 0000000..bc47ee9 --- /dev/null +++ b/internal/validator/fill.go @@ -0,0 +1,274 @@ +package validator + +import ( + "fmt" + "strings" + "time" +) + +// FillValidator is the Go port of Rust's FillValidator. Per +// `crates/validator/src/staffing/fill.rs`: +// +// - Schema compliance (propose_done shape: {fills: [{candidate_id, name}]}) +// - Completeness (endorsed count == target_count) +// - Worker existence (every candidate_id present in workers roster) +// - Status check (worker.status == "active") +// - Client blacklist (worker NOT in client.blacklisted_clients) +// - Geo/role match (worker city/state/role matches contract) +// +// Contract metadata travels alongside the JSON payload under a +// `_context` key: +// +// {"_context": {"target_count": 2, "city": "Toledo", "state": "OH", +// "role": "Welder", "client_id": "CLI-00099"}, "fills": [...]} +// +// The duplicate-ID guard inside one fill catches the LLM mistake +// of repeating the same candidate twice to satisfy a higher +// target_count. +type FillValidator struct { + workers WorkerLookup +} + +// NewFillValidator constructs a FillValidator with the given lookup. +// Lookup must be non-nil; pass NewInMemoryWorkerLookup(nil) for +// tests that don't exercise existence checks. 
+func NewFillValidator(workers WorkerLookup) *FillValidator { + return &FillValidator{workers: workers} +} + +// Name satisfies Validator. Stable string used for audit +// trail / receipts. Matches Rust output "staffing.fill" so +// cross-runtime audit logs share the same name. +func (v *FillValidator) Name() string { return "staffing.fill" } + +// fillContext is the optional contract metadata extracted from +// _context. Each field is independently nil-able (Rust's Option +// pattern) — validators only enforce a check when both contract +// and roster sides have a value. +type fillContext struct { + TargetCount *int + City *string + State *string + Role *string + ClientID *string +} + +func extractContext(value map[string]any) fillContext { + ctx, ok := value["_context"].(map[string]any) + if !ok { + return fillContext{} + } + out := fillContext{} + if v, ok := ctx["target_count"]; ok { + if n, ok := toInt(v); ok { + out.TargetCount = &n + } + } + if s, ok := ctx["city"].(string); ok { + out.City = &s + } + if s, ok := ctx["state"].(string); ok { + out.State = &s + } + if s, ok := ctx["role"].(string); ok { + out.Role = &s + } + if s, ok := ctx["client_id"].(string); ok { + out.ClientID = &s + } + return out +} + +// toInt accepts JSON numbers (float64) and integers, returning +// the int form when the value is a whole number ≥ 0. +func toInt(v any) (int, bool) { + switch n := v.(type) { + case int: + return n, true + case int64: + return int(n), true + case float64: + // JSON unmarshals all numbers as float64; whole-number check + // is mandatory because target_count=2.5 makes no sense. + i := int(n) + if float64(i) == n { + return i, true + } + return 0, false + } + return 0, false +} + +// eqCI is the case-insensitive equality used everywhere validators +// compare strings (status, role, city, etc.). Trim+lowercase mirrors +// Rust's `.trim().eq_ignore_ascii_case(other.trim())`. 
+func eqCI(a, b string) bool { + return strings.EqualFold(strings.TrimSpace(a), strings.TrimSpace(b)) +} + +// Validate implements the Validator interface. Mirrors the Rust +// validation order exactly: schema → completeness → cross-roster +// per-fill checks. +func (v *FillValidator) Validate(artifact Artifact) (Report, error) { + started := time.Now() + value := artifact.FillProposal + if value == nil { + return Report{}, &ValidationError{ + Kind: ErrSchema, + Field: "artifact", + Reason: fmt.Sprintf("FillValidator expects FillProposal, got %s", artifact.Kind()), + } + } + + // ── Schema check ── + fillsRaw, ok := value["fills"].([]any) + if !ok { + return Report{}, &ValidationError{ + Kind: ErrSchema, + Field: "fills", + Reason: "expected top-level `fills` array", + } + } + for i, fillRaw := range fillsRaw { + fill, ok := fillRaw.(map[string]any) + if !ok { + return Report{}, &ValidationError{ + Kind: ErrSchema, + Field: fmt.Sprintf("fills[%d]", i), + Reason: "expected object", + } + } + if _, ok := fill["candidate_id"]; !ok { + return Report{}, &ValidationError{ + Kind: ErrSchema, + Field: fmt.Sprintf("fills[%d].candidate_id", i), + Reason: "missing", + } + } + if _, ok := fill["name"]; !ok { + return Report{}, &ValidationError{ + Kind: ErrSchema, + Field: fmt.Sprintf("fills[%d].name", i), + Reason: "missing", + } + } + } + + ctx := extractContext(value) + + // ── Completeness ── + if ctx.TargetCount != nil && len(fillsRaw) != *ctx.TargetCount { + return Report{}, &ValidationError{ + Kind: ErrCompleteness, + Reason: fmt.Sprintf("endorsed count %d != target_count %d", + len(fillsRaw), *ctx.TargetCount), + } + } + + // ── Cross-roster checks ── + var findings []Finding + seenIDs := make(map[string]bool, len(fillsRaw)) + for i, fillRaw := range fillsRaw { + fill := fillRaw.(map[string]any) // already type-checked in schema pass + candidateID, _ := fill["candidate_id"].(string) + proposedName, _ := fill["name"].(string) + + // Duplicate-ID guard inside one fill. 
+ if seenIDs[candidateID] { + return Report{}, &ValidationError{ + Kind: ErrConsistency, + Reason: fmt.Sprintf( + "duplicate candidate_id %q appears multiple times in fills", + candidateID, + ), + } + } + seenIDs[candidateID] = true + + // Worker existence — load-bearing check for the 0→85% pattern. + worker, ok := v.workers.Find(candidateID) + if !ok { + return Report{}, &ValidationError{ + Kind: ErrConsistency, + Reason: fmt.Sprintf( + "fills[%d].candidate_id %q does not exist in worker roster", + i, candidateID, + ), + } + } + + // Status — only "active" workers can be endorsed. + if !eqCI(worker.Status, "active") { + return Report{}, &ValidationError{ + Kind: ErrConsistency, + Reason: fmt.Sprintf( + "fills[%d] worker %q has status %q, expected \"active\"", + i, candidateID, worker.Status, + ), + } + } + + // Client blacklist. + if ctx.ClientID != nil { + for _, b := range worker.BlacklistedClients { + if eqCI(b, *ctx.ClientID) { + return Report{}, &ValidationError{ + Kind: ErrPolicy, + Reason: fmt.Sprintf( + "fills[%d] worker %q blacklisted for client %q", + i, candidateID, *ctx.ClientID, + ), + } + } + } + } + + // Geo / role match — only when BOTH sides have a value. 
+ if ctx.City != nil && worker.City != nil && !eqCI(*ctx.City, *worker.City) { + return Report{}, &ValidationError{ + Kind: ErrConsistency, + Reason: fmt.Sprintf( + "fills[%d] worker %q city %q doesn't match contract city %q", + i, candidateID, *worker.City, *ctx.City, + ), + } + } + if ctx.State != nil && worker.State != nil && !eqCI(*ctx.State, *worker.State) { + return Report{}, &ValidationError{ + Kind: ErrConsistency, + Reason: fmt.Sprintf( + "fills[%d] worker %q state %q doesn't match contract state %q", + i, candidateID, *worker.State, *ctx.State, + ), + } + } + if ctx.Role != nil && worker.Role != nil && !eqCI(*ctx.Role, *worker.Role) { + return Report{}, &ValidationError{ + Kind: ErrConsistency, + Reason: fmt.Sprintf( + "fills[%d] worker %q role %q doesn't match contract role %q", + i, candidateID, *worker.Role, *ctx.Role, + ), + } + } + + // Name-mismatch is a warning, not an error — recruiters + // sometimes send updated names through the proposal layer + // before the roster catches up. + if proposedName != "" && !eqCI(proposedName, worker.Name) { + findings = append(findings, Finding{ + Field: fmt.Sprintf("fills[%d].name", i), + Severity: SeverityWarning, + Message: fmt.Sprintf( + "proposed name %q differs from roster name %q for %q", + proposedName, worker.Name, candidateID, + ), + }) + } + } + + return Report{ + Findings: findings, + ElapsedMs: elapsed(started), + }, nil +} diff --git a/internal/validator/fill_test.go b/internal/validator/fill_test.go new file mode 100644 index 0000000..4a14748 --- /dev/null +++ b/internal/validator/fill_test.go @@ -0,0 +1,226 @@ +package validator + +import ( + "errors" + "testing" +) + +// Helpers — mirror the Rust test helpers. 
+ +func mkLookup(records ...WorkerRecord) WorkerLookup { + return NewInMemoryWorkerLookup(records) +} + +func mkWorker(id, name, status, city, state, role string) WorkerRecord { + return WorkerRecord{ + CandidateID: id, + Name: name, + Status: status, + City: strPtr(city), + State: strPtr(state), + Role: strPtr(role), + } +} + +func asValidationError(err error) (*ValidationError, bool) { + var ve *ValidationError + if errors.As(err, &ve) { + return ve, true + } + return nil, false +} + +// ── Schema-level errors ── + +func TestFill_WrongArtifactType_FailsSchema(t *testing.T) { + v := NewFillValidator(mkLookup()) + _, err := v.Validate(Artifact{EmailDraft: map[string]any{}}) + ve, ok := asValidationError(err) + if !ok { + t.Fatalf("expected ValidationError, got %v", err) + } + if ve.Kind != ErrSchema || ve.Field != "artifact" { + t.Errorf("expected schema/artifact error, got %+v", ve) + } +} + +func TestFill_MissingFillsArray_FailsSchema(t *testing.T) { + v := NewFillValidator(mkLookup()) + _, err := v.Validate(Artifact{FillProposal: map[string]any{}}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrSchema || ve.Field != "fills" { + t.Errorf("expected schema/fills error, got %+v", ve) + } +} + +func TestFill_MissingCandidateID_FailsSchema(t *testing.T) { + v := NewFillValidator(mkLookup()) + _, err := v.Validate(Artifact{FillProposal: map[string]any{ + "fills": []any{ + map[string]any{"name": "Alice"}, + }, + }}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrSchema || ve.Field != "fills[0].candidate_id" { + t.Errorf("expected schema/fills[0].candidate_id error, got %+v", ve) + } +} + +// ── Completeness ── + +func TestFill_TargetCountMismatch_FailsCompleteness(t *testing.T) { + v := NewFillValidator(mkLookup(mkWorker("w1", "Alice", "active", "Toledo", "OH", "Welder"))) + _, err := v.Validate(Artifact{FillProposal: map[string]any{ + "_context": map[string]any{"target_count": float64(2)}, + "fills": 
[]any{map[string]any{"candidate_id": "w1", "name": "Alice"}}, + }}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrCompleteness { + t.Errorf("expected completeness error, got %+v", ve) + } +} + +// ── Cross-roster checks ── + +func TestFill_PhantomID_FailsConsistency(t *testing.T) { + // Lookup is empty → any candidate_id is "phantom" — the + // load-bearing check for the 0→85% pattern. + v := NewFillValidator(mkLookup()) + _, err := v.Validate(Artifact{FillProposal: map[string]any{ + "fills": []any{map[string]any{"candidate_id": "phantom-id", "name": "Alice"}}, + }}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrConsistency { + t.Errorf("expected consistency error on phantom ID, got %+v", ve) + } +} + +func TestFill_DuplicateID_FailsConsistency(t *testing.T) { + v := NewFillValidator(mkLookup(mkWorker("w1", "Alice", "active", "Toledo", "OH", "Welder"))) + _, err := v.Validate(Artifact{FillProposal: map[string]any{ + "fills": []any{ + map[string]any{"candidate_id": "w1", "name": "Alice"}, + map[string]any{"candidate_id": "w1", "name": "Alice"}, + }, + }}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrConsistency { + t.Errorf("expected consistency error on duplicate ID, got %+v", ve) + } +} + +func TestFill_InactiveStatus_FailsConsistency(t *testing.T) { + v := NewFillValidator(mkLookup(mkWorker("w1", "Alice", "inactive", "Toledo", "OH", "Welder"))) + _, err := v.Validate(Artifact{FillProposal: map[string]any{ + "fills": []any{map[string]any{"candidate_id": "w1", "name": "Alice"}}, + }}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrConsistency { + t.Errorf("expected consistency error on inactive status, got %+v", ve) + } +} + +func TestFill_Blacklist_FailsPolicy(t *testing.T) { + w := mkWorker("w1", "Alice", "active", "Toledo", "OH", "Welder") + w.BlacklistedClients = []string{"CLI-99"} + v := NewFillValidator(mkLookup(w)) + _, err := v.Validate(Artifact{FillProposal: map[string]any{ + 
"_context": map[string]any{"client_id": "cli-99"}, // case-insensitive + "fills": []any{map[string]any{"candidate_id": "w1", "name": "Alice"}}, + }}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrPolicy { + t.Errorf("expected policy error on blacklist, got %+v", ve) + } +} + +func TestFill_GeoMismatch_FailsConsistency(t *testing.T) { + // Worker in Detroit, contract says Toledo. + v := NewFillValidator(mkLookup(mkWorker("w1", "Alice", "active", "Detroit", "MI", "Welder"))) + _, err := v.Validate(Artifact{FillProposal: map[string]any{ + "_context": map[string]any{"city": "Toledo", "state": "OH"}, + "fills": []any{map[string]any{"candidate_id": "w1", "name": "Alice"}}, + }}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrConsistency { + t.Errorf("expected consistency error on geo mismatch, got %+v", ve) + } +} + +func TestFill_RoleMismatch_FailsConsistency(t *testing.T) { + v := NewFillValidator(mkLookup(mkWorker("w1", "Alice", "active", "Toledo", "OH", "Forklift Operator"))) + _, err := v.Validate(Artifact{FillProposal: map[string]any{ + "_context": map[string]any{"role": "Welder"}, + "fills": []any{map[string]any{"candidate_id": "w1", "name": "Alice"}}, + }}) + ve, _ := asValidationError(err) + if ve == nil || ve.Kind != ErrConsistency { + t.Errorf("expected consistency error on role mismatch, got %+v", ve) + } +} + +// ── Happy path ── + +func TestFill_WellFormed_Passes(t *testing.T) { + v := NewFillValidator(mkLookup( + mkWorker("w1", "Alice", "active", "Toledo", "OH", "Welder"), + mkWorker("w2", "Bob", "active", "Toledo", "OH", "Welder"), + )) + report, err := v.Validate(Artifact{FillProposal: map[string]any{ + "_context": map[string]any{ + "target_count": float64(2), + "city": "Toledo", + "state": "OH", + "role": "Welder", + }, + "fills": []any{ + map[string]any{"candidate_id": "w1", "name": "Alice"}, + map[string]any{"candidate_id": "w2", "name": "Bob"}, + }, + }}) + if err != nil { + t.Fatalf("expected pass, got %v", 
err) + } + if len(report.Findings) != 0 { + t.Errorf("expected zero findings, got %v", report.Findings) + } +} + +// ── Name mismatch is a Finding (warning), not an error ── + +func TestFill_NameMismatch_EmitsWarning(t *testing.T) { + v := NewFillValidator(mkLookup(mkWorker("w1", "Alice Smith", "active", "Toledo", "OH", "Welder"))) + report, err := v.Validate(Artifact{FillProposal: map[string]any{ + "fills": []any{ + map[string]any{"candidate_id": "w1", "name": "Alyssa Smith"}, // typo / outdated + }, + }}) + if err != nil { + t.Fatalf("name mismatch should NOT error, got %v", err) + } + if len(report.Findings) != 1 || report.Findings[0].Severity != SeverityWarning { + t.Errorf("expected 1 warning finding, got %v", report.Findings) + } +} + +// ── Case-insensitive matches ── + +func TestFill_CaseInsensitiveMatch_Passes(t *testing.T) { + v := NewFillValidator(mkLookup(mkWorker("w1", "Alice", "ACTIVE", "TOLEDO", "oh", "Welder"))) + _, err := v.Validate(Artifact{FillProposal: map[string]any{ + "_context": map[string]any{"city": "Toledo", "state": "OH"}, + "fills": []any{map[string]any{"candidate_id": "w1", "name": "Alice"}}, + }}) + if err != nil { + t.Errorf("case-insensitive comparisons should pass, got %v", err) + } +} + +// ── Validator name is stable ── + +func TestFill_NameMatchesRust(t *testing.T) { + v := NewFillValidator(mkLookup()) + if v.Name() != "staffing.fill" { + t.Errorf("name should match Rust 'staffing.fill', got %q", v.Name()) + } +} diff --git a/internal/validator/lookup.go b/internal/validator/lookup.go new file mode 100644 index 0000000..7da9964 --- /dev/null +++ b/internal/validator/lookup.go @@ -0,0 +1,56 @@ +package validator + +import "strings" + +// InMemoryWorkerLookup is a zero-deps WorkerLookup useful for tests +// and small-fixture validation. Mirrors Rust's +// `InMemoryWorkerLookup::from_records`. 
+// +// Lookup is case-insensitive on candidate_id since Rust's +// HashMap with PartialEq + the source data's casing inconsistency +// (some IDs uppercase, some lowercase, some mixed) means +// case-sensitive lookup misses real matches. Lower-casing on +// insert keeps the contract. +type InMemoryWorkerLookup struct { + byID map[string]WorkerRecord +} + +// NewInMemoryWorkerLookup builds a lookup from a list of records. +// Duplicate candidate_ids: last-write-wins. Empty candidate_id: skipped. +func NewInMemoryWorkerLookup(records []WorkerRecord) *InMemoryWorkerLookup { + m := make(map[string]WorkerRecord, len(records)) + for _, r := range records { + if r.CandidateID == "" { + continue + } + m[strings.ToLower(strings.TrimSpace(r.CandidateID))] = r + } + return &InMemoryWorkerLookup{byID: m} +} + +// Find satisfies WorkerLookup. Returns (rec, true) on hit, +// (nil, false) on miss. +func (l *InMemoryWorkerLookup) Find(candidateID string) (*WorkerRecord, bool) { + if l == nil { + return nil, false + } + r, ok := l.byID[strings.ToLower(strings.TrimSpace(candidateID))] + if !ok { + return nil, false + } + // Return a copy so callers can't mutate the lookup's internal state. + cp := r + return &cp, true +} + +// Len exposes the size for tests + admin endpoints. +func (l *InMemoryWorkerLookup) Len() int { + if l == nil { + return 0 + } + return len(l.byID) +} + +// strPtr is a tiny convenience for tests that need *string fields +// on WorkerRecord.City/State/Role. +func strPtr(s string) *string { return &s } diff --git a/internal/validator/types.go b/internal/validator/types.go new file mode 100644 index 0000000..5615d11 --- /dev/null +++ b/internal/validator/types.go @@ -0,0 +1,144 @@ +// Package validator is the Go port of Rust's `validator` crate +// (`/home/profit/lakehouse/crates/validator/`). 
Production safety +// nets for staffing-domain LLM outputs: +// +// - FillValidator: catches phantom IDs / wrong-status workers / +// blacklist violations / geo-or-role mismatches in fill proposals +// - EmailValidator: catches SSN-shape sequences / salary +// disclosure / wrong-target name in email/SMS drafts +// +// Per `reports/cutover/architecture_comparison.md`'s "Go missing" +// section: these were Rust-only until this port. Closes one of the +// two named gaps for Go-primary operation (the other being the +// materializer port). +// +// Architectural choice: we mirror the Rust shape exactly so the +// Validator + Artifact + Finding types are call-compatible +// across runtimes. A future "validator service" daemon could expose +// either runtime's implementation behind a uniform HTTP contract. +package validator + +import "time" + +// Artifact is the discriminated union of input shapes a Validator +// can receive. Mirrors Rust's `enum Artifact`. The first non-nil +// field selects the kind: FillProposal is checked before EmailDraft. +type Artifact struct { + // FillProposal: {fills: [{candidate_id, name}], _context: {...}} + FillProposal map[string]any + // EmailDraft: {to, body, subject?, kind?, _context?: {candidate_id?}} + EmailDraft map[string]any +} + +// Kind returns a short string for error messages — mirrors the +// Rust Debug shape used in "expected FillProposal, got X". +// Returns "Unknown" when both fields are nil. +func (a Artifact) Kind() string { + switch { + case a.FillProposal != nil: + return "FillProposal" + case a.EmailDraft != nil: + return "EmailDraft" + default: + return "Unknown" + } +} + +// Severity matches Rust's enum {Error, Warning, Info}. +type Severity string + +const ( + SeverityError Severity = "error" + SeverityWarning Severity = "warning" + SeverityInfo Severity = "info" +) + +// Finding is one warning-or-info note attached to a successful +// validation. Errors abort validation; findings come back alongside +// a passing report. 
Mirrors Rust's Finding shape exactly so JSON +// round-trips between runtimes. +type Finding struct { + Field string `json:"field"` + Severity Severity `json:"severity"` + Message string `json:"message"` +} + +// Report is the success-path return value: zero or more findings +// + per-validator wall-clock cost. +type Report struct { + Findings []Finding `json:"findings"` + ElapsedMs int64 `json:"elapsed_ms"` +} + +// ValidationErrorKind discriminates the failure modes. Mirrors +// Rust's ValidationError variants: +// - Schema: input shape doesn't match contract +// - Completeness: structural counts wrong (e.g. 3 fills, target_count=5) +// - Consistency: cross-source disagreement (phantom worker, wrong city) +// - Policy: org-level rule violation (blacklist, PII leak) +type ValidationErrorKind string + +const ( + ErrSchema ValidationErrorKind = "schema" + ErrCompleteness ValidationErrorKind = "completeness" + ErrConsistency ValidationErrorKind = "consistency" + ErrPolicy ValidationErrorKind = "policy" +) + +// ValidationError is the Go equivalent of Rust's enum + variant +// fields. Field is set for Schema errors (the failing field name); +// Reason carries the human-readable message for all variants. +type ValidationError struct { + Kind ValidationErrorKind + Field string + Reason string +} + +// Error makes ValidationError a Go error value. Format mirrors the +// Rust Debug print so log scraping behaves the same. +func (e *ValidationError) Error() string { + if e.Field != "" { + return string(e.Kind) + " (" + e.Field + "): " + e.Reason + } + return string(e.Kind) + ": " + e.Reason +} + +// Validator is the interface every validator implements. +// Stateless — construction takes any deps (e.g. WorkerLookup) +// upfront, validate() is pure on its inputs. +type Validator interface { + Name() string + Validate(artifact Artifact) (Report, error) +} + +// WorkerRecord is the lookup-side worker shape. 
// Pointer fields for City/State/Role mirror Rust's Option — None means
// "we don't know," which is operationally distinct from empty
// string (we know it's empty). Validators only enforce
// city/state/role matches when both expected (from contract)
// and actual (from lookup) are non-nil.
type WorkerRecord struct {
	CandidateID        string
	Name               string
	Status             string // "active" / "inactive" / etc.
	City               *string
	State              *string
	Role               *string
	BlacklistedClients []string
}

// WorkerLookup is the gate validators go through to ask "does
// this candidate_id exist + what's their record?" Implementations
// can be in-memory (test fixture), DuckDB-backed (production
// queryd), or HTTP-backed (cross-daemon). FillValidator + EmailValidator
// hold the lookup behind an Arc on the Rust side; in Go, an interface value.
type WorkerLookup interface {
	// Find reports the record for candidateID and whether it exists.
	Find(candidateID string) (*WorkerRecord, bool)
}

// elapsed converts a start time into the milliseconds-elapsed
// shape matched in Report.ElapsedMs (mirrors Rust's
// .elapsed().as_millis() as u64). The result is clamped at zero:
// the Rust counterpart is unsigned, while time.Since can go negative
// when start carries no monotonic reading and lies in the future.
func elapsed(start time.Time) int64 {
	ms := time.Since(start).Milliseconds()
	if ms < 0 {
		return 0
	}
	return ms
}