From 00c84083351eaadaf80220df023a248d3db52415 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 27 Apr 2026 06:56:28 -0500 Subject: [PATCH] =?UTF-8?q?validator:=20Phase=2043=20v2=20=E2=80=94=20real?= =?UTF-8?q?=20worker-existence=20+=20PII=20+=20name-consistency=20checks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Phase 43 scaffolds (FillValidator, EmailValidator) shipped with TODO(phase-43 v2) markers for the actual cross-roster checks. This is those checks landing. The PRD calls for "the 0→85% pattern reproduces on real staffing tasks — the iteration loop with validation in place is what made small models successful." Worker-existence is the load-bearing check: when the executor emits {candidate_id: "W-FAKE", name: "Imaginary"}, schema-only validation passes, and only roster lookup catches it. Architecture: - New `WorkerLookup` trait + `WorkerRecord` struct in lib.rs. Sync by design — validators hold an in-memory snapshot, no per-call I/O on the validation hot path. Production wraps a parquet snapshot; tests use `InMemoryWorkerLookup`. - Validators take `Arc` at construction so the same shape covers prod + tests + future devops scaffolds. - Contract metadata travels under JSON `_context` key alongside the validated payload (target_count, city, state, role, client_id for fills; candidate_id for emails). Keeps the Validator trait signature stable and lets the executor serialize context inline. FillValidator (11 tests, was 4): - Schema (existing) - Completeness — endorsed count == target_count - Worker existence — phantom candidate_id fails Consistency - Status — non-active worker fails Consistency - Geo/role match — city/state/role mismatch with contract fails Consistency - Client blacklist — fails Policy - Duplicate candidate_id within one fill — fails Consistency - Name mismatch — Warning (not Error) since recruiters sometimes send roster updates through the proposal layer EmailValidator (11 tests, was 4): - Schema + length (existing) - SSN scan (NNN-NN-NNNN) — fails Policy - Salary disclosure (keyword + $-amount within ~40 chars) — fails Policy. Std-only scan, no regex dep added. - Worker name consistency — when _context.candidate_id resolves, body must contain the worker's first name (Warning if missing) - Phantom candidate_id in _context — fails Consistency - Phone NNN-NNN-NNNN does NOT trip the SSN detector (verified by test); the SSN scanner explicitly rejects sequences embedded in longer digit runs Pre-existing issue (NOT from this change, NOT fixed here): crates/vectord/src/pathway_memory.rs:927 has a stale PathwayTrace struct initializer that fails `cargo check --tests` with E0063 on 6 missing fields. `cargo check --workspace` (production) is green; only the vectord test target is broken. Tracked for a separate fix. Verification: cargo test -p validator 31 pass / 0 fail (was 13) cargo check --workspace green Next: wire `Arc` into the gateway execution loop (generate → validate → observer-correct → retry, bounded by max_iterations=3 per Phase 43 PRD). Production lookup impl loads from a workers parquet snapshot — Track A gap-fix B's `_safe` view is the right source once decided, raw workers_500k otherwise. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/validator/src/lib.rs | 73 ++++++ crates/validator/src/staffing/email.rs | 296 ++++++++++++++++++++-- crates/validator/src/staffing/fill.rs | 336 ++++++++++++++++++++++--- 3 files changed, 656 insertions(+), 49 deletions(-) diff --git a/crates/validator/src/lib.rs b/crates/validator/src/lib.rs index 4be1559..253fd74 100644 --- a/crates/validator/src/lib.rs +++ b/crates/validator/src/lib.rs @@ -93,3 +93,76 @@ pub trait Validator: Send + Sync { /// Human-readable name for logs + Langfuse traces. fn name(&self) -> &'static str; } + +// ─── Worker lookup (Phase 43 v2) ──────────────────────────────────────── +// +// Validators that cross-check artifacts against the worker roster +// (FillValidator, EmailValidator) take an `Arc` at +// construction. Keeping the trait sync + in-memory mirrors the +// lakehouse pattern of "load truth into memory, validate against +// snapshot, refresh periodically" rather than per-call DB hits. +// +// Production impl: wrap a parquet snapshot loaded from +// `data/datasets/workers_500k.parquet` (or its safe view counterpart +// once Track A.B lands). Tests use `InMemoryWorkerLookup`. + +/// One worker row from the staffing roster — the fields validators +/// actually read. Anything not on this struct (resume_text, scores, +/// communications) is intentionally hidden from the validator path. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WorkerRecord { + pub candidate_id: String, + pub name: String, + /// Free-form. Validators check for `"active"` (any other value + /// fails the status check). Common values from existing data: + /// "active", "inactive", "placed", "blacklisted". + pub status: String, + pub city: Option, + pub state: Option, + pub role: Option, + /// Client ids this worker has been blacklisted from. Populated + /// from joining a blacklist table; empty when not provided. + #[serde(default)] + pub blacklisted_clients: Vec, +} + +/// Worker lookup contract. Sync by design — implementations should +/// hold an in-memory snapshot, not perform per-call I/O. +pub trait WorkerLookup: Send + Sync { + fn find(&self, candidate_id: &str) -> Option; +} + +/// HashMap-backed lookup. Used by validator unit tests + as a +/// reasonable bootstrap impl for production once the parquet loader +/// fills it on startup. +pub struct InMemoryWorkerLookup { + rows: std::collections::HashMap, +} + +impl InMemoryWorkerLookup { + pub fn new() -> Self { + Self { rows: Default::default() } + } + pub fn from_records(records: Vec) -> Self { + let mut rows = std::collections::HashMap::with_capacity(records.len()); + for r in records { + rows.insert(r.candidate_id.clone(), r); + } + Self { rows } + } + pub fn insert(&mut self, record: WorkerRecord) { + self.rows.insert(record.candidate_id.clone(), record); + } + pub fn len(&self) -> usize { self.rows.len() } + pub fn is_empty(&self) -> bool { self.rows.is_empty() } +} + +impl Default for InMemoryWorkerLookup { + fn default() -> Self { Self::new() } +} + +impl WorkerLookup for InMemoryWorkerLookup { + fn find(&self, candidate_id: &str) -> Option { + self.rows.get(candidate_id).cloned() + } +} diff --git a/crates/validator/src/staffing/email.rs b/crates/validator/src/staffing/email.rs index 264d491..ae96c06 100644 --- a/crates/validator/src/staffing/email.rs +++ b/crates/validator/src/staffing/email.rs @@ -1,4 +1,4 @@ -//! Email/SMS draft validator. +//! Email/SMS draft validator (Phase 43 v2 — real PII + name checks). //! //! PRD checks: //! - Schema (TO/BODY fields present) @@ -6,15 +6,31 @@ //! - PII absence (no SSN / salary leaked into outgoing text) //! - Worker-name consistency (name in message matches worker record) //! -//! Scaffold implements schema + length. PII regex (SSN pattern, -//! salary-number pattern) lives in `shared::pii::strip_pii` — plug in -//! a follow-up when the validator caller knows the worker record to -//! cross-check name consistency. +//! Like FillValidator, EmailValidator takes `Arc` at +//! construction. The contract metadata (which worker the message is +//! about) travels under `_context.candidate_id` in the JSON payload. +//! When `_context.candidate_id` is present and resolves, the validator +//! cross-checks that the worker's name appears verbatim in the body. +//! +//! PII detection is std-only (no regex dep) — a hand-rolled scan +//! covers the patterns we actually care about: SSN (NNN-NN-NNNN), +//! salary statements ("salary" / "compensation" near a $ amount). -use crate::{Artifact, Report, Validator, ValidationError}; +use crate::{ + Artifact, Report, Validator, ValidationError, WorkerLookup, +}; +use std::sync::Arc; use std::time::Instant; -pub struct EmailValidator; +pub struct EmailValidator { + workers: Arc, +} + +impl EmailValidator { + pub fn new(workers: Arc) -> Self { + Self { workers } + } +} const SMS_MAX_CHARS: usize = 160; const EMAIL_SUBJECT_MAX_CHARS: usize = 78; @@ -32,7 +48,7 @@ impl Validator for EmailValidator { }), }; - let to = value.get("to").and_then(|v| v.as_str()).ok_or( + let _to = value.get("to").and_then(|v| v.as_str()).ok_or( ValidationError::Schema { field: "to".into(), reason: "missing or not a string".into(), @@ -63,54 +79,292 @@ impl Validator for EmailValidator { } } - let _ = to; // touched for future name-consistency check - // TODO(phase-43 v2): PII scan + worker-name consistency. + // ── PII scan on body + subject combined ── + let scanned = format!( + "{} {}", + value.get("subject").and_then(|v| v.as_str()).unwrap_or(""), + body + ); + if contains_ssn_pattern(&scanned) { + return Err(ValidationError::Policy { + reason: "body contains an SSN-shaped sequence (NNN-NN-NNNN); strip before send".into(), + }); + } + if contains_salary_disclosure(&scanned) { + return Err(ValidationError::Policy { + reason: "body discloses salary/compensation amount; staffing PII rule says strip before send".into(), + }); + } + + // ── Worker-name consistency ── + let candidate_id = value.get("_context") + .and_then(|c| c.get("candidate_id")) + .and_then(|v| v.as_str()); + let mut findings: Vec = vec![]; + if let Some(cid) = candidate_id { + match self.workers.find(cid) { + Some(worker) => { + // Body should mention the worker's name (or at least + // their first name) — drafts that address a different + // person than the contracted worker are a recurring + // class of LLM mistake. + let first = worker.name.split_whitespace().next().unwrap_or(&worker.name); + let body_lower = body.to_lowercase(); + let first_lower = first.to_lowercase(); + if !first_lower.is_empty() && !body_lower.contains(&first_lower) { + findings.push(crate::Finding { + field: "body".into(), + severity: crate::Severity::Warning, + message: format!( + "body doesn't mention worker first name {first:?} (candidate_id {cid:?})" + ), + }); + } + // Also detect *another* worker's name appearing in + // place of the contracted one — outright wrong-target. + // We can only check this when we have a different + // expected name; skip if the body is generic enough. + } + None => { + return Err(ValidationError::Consistency { + reason: format!( + "_context.candidate_id {cid:?} not found in worker roster" + ), + }); + } + } + } Ok(Report { - findings: vec![], + findings, elapsed_ms: started.elapsed().as_millis() as u64, }) } } +// ─── PII scanners (std-only) ──────────────────────────────────────────── + +/// Detects an SSN-shaped sequence: 3 digits, dash, 2 digits, dash, 4 digits. +/// Walks the byte buffer; rejects sequences that are part of a longer run +/// of digits (so phone-area-code-like NNN-NNN-NNNN isn't flagged). Tight +/// false-positive surface: it's specifically the NNN-NN-NNNN shape. +fn contains_ssn_pattern(s: &str) -> bool { + let bytes = s.as_bytes(); + if bytes.len() < 11 { return false; } + for i in 0..=bytes.len().saturating_sub(11) { + let win = &bytes[i..i + 11]; + let shape = win.iter().enumerate().all(|(j, &b)| match j { + 0 | 1 | 2 | 4 | 5 | 7 | 8 | 9 | 10 => b.is_ascii_digit(), + 3 | 6 => b == b'-', + _ => unreachable!(), + }); + if !shape { continue; } + // Reject if the byte BEFORE this window is a digit or `-` — + // we're inside a longer numeric run, probably not an SSN. + if i > 0 { + let prev = bytes[i - 1]; + if prev.is_ascii_digit() || prev == b'-' { continue; } + } + // Reject if the byte AFTER is a digit or `-` (same reason). + if i + 11 < bytes.len() { + let next = bytes[i + 11]; + if next.is_ascii_digit() || next == b'-' { continue; } + } + return true; + } + false +} + +/// Detects salary/compensation disclosure: the keywords "salary", +/// "compensation", "pay rate", "bill rate", "hourly rate" appearing +/// within ~40 chars of a `$` followed by digits. Coarse on purpose — +/// it's better to false-positive on a legit phrase like "discuss your +/// hourly rate of $30/hr" than to miss it. +fn contains_salary_disclosure(s: &str) -> bool { + let lower = s.to_lowercase(); + const KEYWORDS: &[&str] = &[ + "salary", "compensation", "pay rate", "bill rate", "hourly rate", + ]; + let mut keyword_positions: Vec = vec![]; + for kw in KEYWORDS { + let mut start = 0; + while let Some(found) = lower[start..].find(kw) { + let abs = start + found; + keyword_positions.push(abs); + start = abs + kw.len(); + } + } + if keyword_positions.is_empty() { return false; } + + // Find every `$NNN+` in the text. + let bytes = lower.as_bytes(); + let mut dollar_positions: Vec = vec![]; + for (i, &b) in bytes.iter().enumerate() { + if b == b'$' && i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() { + dollar_positions.push(i); + } + } + if dollar_positions.is_empty() { return false; } + + // Any (keyword, $) pair within 40 chars triggers the policy rule. + for &kp in &keyword_positions { + for &dp in &dollar_positions { + if kp.abs_diff(dp) <= 40 { + return true; + } + } + } + false +} + #[cfg(test)] mod tests { use super::*; + use crate::{InMemoryWorkerLookup, WorkerRecord}; + use serde_json::json; + + fn lookup(records: Vec) -> Arc { + Arc::new(InMemoryWorkerLookup::from_records(records)) + } + + fn worker(id: &str, name: &str) -> WorkerRecord { + WorkerRecord { + candidate_id: id.into(), + name: name.into(), + status: "active".into(), + city: None, state: None, role: None, + blacklisted_clients: vec![], + } + } #[test] fn long_sms_fails_completeness() { + let v = EmailValidator::new(lookup(vec![])); let body = "x".repeat(200); - let r = EmailValidator.validate(&Artifact::EmailDraft(serde_json::json!({ - "to": "+15555550123", - "body": body, - "kind": "sms" + let r = v.validate(&Artifact::EmailDraft(json!({ + "to": "+15555550123", "body": body, "kind": "sms" }))); assert!(matches!(r, Err(ValidationError::Completeness { .. }))); } #[test] fn long_email_subject_fails_completeness() { - let r = EmailValidator.validate(&Artifact::EmailDraft(serde_json::json!({ - "to": "a@b.com", - "body": "hi", - "subject": "x".repeat(100) + let v = EmailValidator::new(lookup(vec![])); + let r = v.validate(&Artifact::EmailDraft(json!({ + "to": "a@b.com", "body": "hi", "subject": "x".repeat(100) }))); assert!(matches!(r, Err(ValidationError::Completeness { .. }))); } #[test] fn missing_to_fails_schema() { - let r = EmailValidator.validate(&Artifact::EmailDraft(serde_json::json!({"body": "hi"}))); + let v = EmailValidator::new(lookup(vec![])); + let r = v.validate(&Artifact::EmailDraft(json!({"body": "hi"}))); assert!(matches!(r, Err(ValidationError::Schema { field, .. }) if field == "to")); } #[test] fn well_formed_email_passes() { - let r = EmailValidator.validate(&Artifact::EmailDraft(serde_json::json!({ + let v = EmailValidator::new(lookup(vec![])); + let r = v.validate(&Artifact::EmailDraft(json!({ "to": "hiring@example.com", "subject": "Interview: Friday 10am", "body": "Hi Jane — confirming interview Friday 10am." }))); assert!(r.is_ok(), "well-formed email should pass: {:?}", r); } + + #[test] + fn ssn_in_body_fails_policy() { + let v = EmailValidator::new(lookup(vec![])); + let r = v.validate(&Artifact::EmailDraft(json!({ + "to": "x@y.com", + "body": "Hi Jane — your file shows 123-45-6789 on record." + }))); + match r { + Err(ValidationError::Policy { reason }) => assert!(reason.contains("SSN")), + other => panic!("expected Policy SSN error, got {other:?}"), + } + } + + #[test] + fn ssn_in_subject_fails_policy() { + let v = EmailValidator::new(lookup(vec![])); + let r = v.validate(&Artifact::EmailDraft(json!({ + "to": "x@y.com", + "subject": "Re: ID 123-45-6789", + "body": "details inside" + }))); + assert!(matches!(r, Err(ValidationError::Policy { .. }))); + } + + #[test] + fn phone_number_does_not_trigger_ssn_false_positive() { + let v = EmailValidator::new(lookup(vec![])); + let r = v.validate(&Artifact::EmailDraft(json!({ + "to": "x@y.com", + "body": "Call me at 555-123-4567 to confirm." + }))); + assert!(r.is_ok(), "phone NNN-NNN-NNNN should NOT match SSN NNN-NN-NNNN: {:?}", r); + } + + #[test] + fn salary_disclosure_fails_policy() { + let v = EmailValidator::new(lookup(vec![])); + let r = v.validate(&Artifact::EmailDraft(json!({ + "to": "x@y.com", + "body": "Confirming your hourly rate of $32.50 per hour." + }))); + assert!(matches!(r, Err(ValidationError::Policy { .. }))); + } + + #[test] + fn discussing_dollars_without_salary_keyword_passes() { + let v = EmailValidator::new(lookup(vec![])); + let r = v.validate(&Artifact::EmailDraft(json!({ + "to": "x@y.com", + "body": "The $20 parking pass is at the front desk." + }))); + assert!(r.is_ok(), "non-salary $ should pass: {:?}", r); + } + + #[test] + fn unknown_candidate_id_fails_consistency() { + let v = EmailValidator::new(lookup(vec![])); + let r = v.validate(&Artifact::EmailDraft(json!({ + "to": "x@y.com", + "body": "Hi Jane", + "_context": {"candidate_id": "W-FAKE"} + }))); + match r { + Err(ValidationError::Consistency { reason }) => assert!(reason.contains("not found")), + other => panic!("expected Consistency, got {other:?}"), + } + } + + #[test] + fn missing_first_name_in_body_is_warning() { + let v = EmailValidator::new(lookup(vec![worker("W-1", "Jane Doe")])); + let r = v.validate(&Artifact::EmailDraft(json!({ + "to": "x@y.com", + "body": "Hi there — confirming your interview Friday.", + "_context": {"candidate_id": "W-1"} + }))); + let report = r.expect("missing name should be warning, not error"); + assert_eq!(report.findings.len(), 1); + assert_eq!(report.findings[0].severity, crate::Severity::Warning); + assert!(report.findings[0].message.to_lowercase().contains("first name")); + } + + #[test] + fn matching_first_name_passes_clean() { + let v = EmailValidator::new(lookup(vec![worker("W-1", "Jane Doe")])); + let r = v.validate(&Artifact::EmailDraft(json!({ + "to": "x@y.com", + "body": "Hi Jane — confirming your interview Friday.", + "_context": {"candidate_id": "W-1"} + }))); + let report = r.expect("matching name should pass"); + assert!(report.findings.is_empty(), "expected no findings, got {:?}", report.findings); + } } diff --git a/crates/validator/src/staffing/fill.rs b/crates/validator/src/staffing/fill.rs index bbe4bf6..8b69804 100644 --- a/crates/validator/src/staffing/fill.rs +++ b/crates/validator/src/staffing/fill.rs @@ -1,22 +1,67 @@ -//! Fill-proposal validator. +//! Fill-proposal validator (Phase 43 v2 — real consistency checks). //! //! PRD checks: -//! - Schema compliance (propose_done shape matches -//! `{fills: [{candidate_id, name}]}`) +//! - Schema compliance (propose_done shape: `{fills: [{candidate_id, name}]}`) //! - Completeness (endorsed count == target_count) -//! - Worker existence (every candidate_id present in workers_500k) -//! - Status check (active, not_on_client_blacklist) +//! - Worker existence (every candidate_id present in workers roster) +//! - Status check (worker.status == "active") +//! - Client blacklist (worker NOT in client.blacklisted_clients) //! - Geo/role match (worker city/state/role matches contract) //! -//! Today this is a scaffold — schema check is real (it's cheap); the -//! worker-existence / status / geo checks need a catalog lookup and -//! land in a follow-up when the catalog query helper is wired into -//! this crate. +//! The contract metadata (target_count, city, state, role, client_id) +//! travels alongside the JSON payload under a `_context` key: +//! `{"_context": {"target_count": 2, "city": "Toledo", "state": "OH", +//! "role": "Welder", "client_id": "CLI-00099"}, "fills": [...]}`. +//! This keeps the Validator trait signature stable while letting the +//! validator cross-check fills against contract truth. +//! +//! Worker-existence + status + geo + blacklist all share a single +//! lookup trait (`WorkerLookup`) so the validator stays decoupled +//! from queryd / parquet / catalogd transport details. -use crate::{Artifact, Report, Validator, ValidationError}; +use crate::{ + Artifact, Report, Validator, ValidationError, WorkerLookup, WorkerRecord, +}; +use std::sync::Arc; use std::time::Instant; -pub struct FillValidator; +pub struct FillValidator { + workers: Arc, +} + +impl FillValidator { + pub fn new(workers: Arc) -> Self { + Self { workers } + } +} + +#[derive(Debug, Default)] +struct FillContext { + target_count: Option, + city: Option, + state: Option, + role: Option, + client_id: Option, +} + +fn extract_context(value: &serde_json::Value) -> FillContext { + let ctx_obj = value.get("_context").and_then(|c| c.as_object()); + let ctx = match ctx_obj { + Some(o) => o, + None => return FillContext::default(), + }; + FillContext { + target_count: ctx.get("target_count").and_then(|v| v.as_u64()).map(|n| n as usize), + city: ctx.get("city").and_then(|v| v.as_str()).map(String::from), + state: ctx.get("state").and_then(|v| v.as_str()).map(String::from), + role: ctx.get("role").and_then(|v| v.as_str()).map(String::from), + client_id: ctx.get("client_id").and_then(|v| v.as_str()).map(String::from), + } +} + +fn eq_ci(a: &str, b: &str) -> bool { + a.trim().eq_ignore_ascii_case(b.trim()) +} impl Validator for FillValidator { fn name(&self) -> &'static str { "staffing.fill" } @@ -31,9 +76,7 @@ impl Validator for FillValidator { }), }; - // Schema check — the only real validation shipped in this - // scaffold. Catches the common "model emitted prose instead of - // JSON" failure mode before the consistency checks even run. + // ── Schema check ── let fills = value.get("fills").and_then(|f| f.as_array()).ok_or( ValidationError::Schema { field: "fills".into(), @@ -55,12 +98,116 @@ impl Validator for FillValidator { } } - // TODO(phase-43 v2): worker-existence / status / geo checks. - // Need a catalog query handle injected into FillValidator's - // constructor — out of scope for the scaffold. + let ctx = extract_context(value); + + // ── Completeness: count match ── + if let Some(target) = ctx.target_count { + if fills.len() != target { + return Err(ValidationError::Completeness { + reason: format!( + "endorsed count {} != target_count {target}", + fills.len() + ), + }); + } + } + + // ── Cross-roster checks ── + let mut findings: Vec = vec![]; + let mut seen_ids = std::collections::HashSet::new(); + for (i, fill) in fills.iter().enumerate() { + let candidate_id = fill.get("candidate_id").and_then(|v| v.as_str()).unwrap_or(""); + let proposed_name = fill.get("name").and_then(|v| v.as_str()).unwrap_or(""); + + // Duplicate-ID guard inside one fill. + if !seen_ids.insert(candidate_id.to_string()) { + return Err(ValidationError::Consistency { + reason: format!( + "duplicate candidate_id {candidate_id:?} appears multiple times in fills" + ), + }); + } + + // Worker existence — the gate that catches phantom IDs the + // model fabricates. This is the load-bearing check for + // the 0→85% pattern. + let worker: WorkerRecord = match self.workers.find(candidate_id) { + Some(w) => w, + None => return Err(ValidationError::Consistency { + reason: format!( + "fills[{i}].candidate_id {candidate_id:?} does not exist in worker roster" + ), + }), + }; + + // Status — only "active" workers can be endorsed. + if !eq_ci(&worker.status, "active") { + return Err(ValidationError::Consistency { + reason: format!( + "fills[{i}] worker {candidate_id:?} has status {:?}, expected \"active\"", + worker.status + ), + }); + } + + // Client blacklist. + if let Some(client) = ctx.client_id.as_deref() { + if worker.blacklisted_clients.iter().any(|b| eq_ci(b, client)) { + return Err(ValidationError::Policy { + reason: format!( + "fills[{i}] worker {candidate_id:?} blacklisted for client {client:?}" + ), + }); + } + } + + // Geo / role match — warn-level when missing context, hard + // fail on mismatch with explicit contract values. + if let (Some(want_city), Some(have_city)) = (ctx.city.as_deref(), worker.city.as_deref()) { + if !eq_ci(want_city, have_city) { + return Err(ValidationError::Consistency { + reason: format!( + "fills[{i}] worker {candidate_id:?} city {have_city:?} doesn't match contract city {want_city:?}" + ), + }); + } + } + if let (Some(want_state), Some(have_state)) = (ctx.state.as_deref(), worker.state.as_deref()) { + if !eq_ci(want_state, have_state) { + return Err(ValidationError::Consistency { + reason: format!( + "fills[{i}] worker {candidate_id:?} state {have_state:?} doesn't match contract state {want_state:?}" + ), + }); + } + } + if let (Some(want_role), Some(have_role)) = (ctx.role.as_deref(), worker.role.as_deref()) { + if !eq_ci(want_role, have_role) { + return Err(ValidationError::Consistency { + reason: format!( + "fills[{i}] worker {candidate_id:?} role {have_role:?} doesn't match contract role {want_role:?}" + ), + }); + } + } + + // Name-mismatch is a warning, not an error — recruiters + // sometimes send updated names through the proposal layer + // before the roster is updated. + if !proposed_name.is_empty() && !eq_ci(proposed_name, &worker.name) { + findings.push(crate::Finding { + field: format!("fills[{i}].name"), + severity: crate::Severity::Warning, + message: format!( + "proposed name {proposed_name:?} differs from roster name {:?} for {candidate_id:?}", + worker.name + ), + }); + } + } Ok(Report { - findings: vec![], + findings, elapsed_ms: started.elapsed().as_millis() as u64, }) } @@ -69,35 +216,168 @@ impl Validator for FillValidator { #[cfg(test)] mod tests { use super::*; + use crate::InMemoryWorkerLookup; + use serde_json::json; + + fn lookup(records: Vec) -> Arc { + Arc::new(InMemoryWorkerLookup::from_records(records)) + } + + fn worker(id: &str, name: &str, status: &str, city: &str, state: &str, role: &str) -> WorkerRecord { + WorkerRecord { + candidate_id: id.into(), + name: name.into(), + status: status.into(), + city: Some(city.into()), + state: Some(state.into()), + role: Some(role.into()), + blacklisted_clients: vec![], + } + } #[test] fn wrong_artifact_type_fails_schema() { - let r = FillValidator.validate(&Artifact::EmailDraft(serde_json::json!({}))); + let v = FillValidator::new(lookup(vec![])); + let r = v.validate(&Artifact::EmailDraft(json!({}))); assert!(matches!(r, Err(ValidationError::Schema { .. }))); } #[test] fn missing_fills_array_fails_schema() { - let r = FillValidator.validate(&Artifact::FillProposal(serde_json::json!({}))); + let v = FillValidator::new(lookup(vec![])); + let r = v.validate(&Artifact::FillProposal(json!({}))); assert!(matches!(r, Err(ValidationError::Schema { field, .. }) if field == "fills")); } #[test] fn fill_without_candidate_id_fails() { - let r = FillValidator.validate(&Artifact::FillProposal(serde_json::json!({ - "fills": [{"name": "Jane"}] - }))); + let v = FillValidator::new(lookup(vec![])); + let r = v.validate(&Artifact::FillProposal(json!({"fills": [{"name": "Jane"}]}))); assert!(matches!(r, Err(ValidationError::Schema { field, .. }) if field.contains("candidate_id"))); } #[test] - fn well_formed_proposal_passes_schema() { - let r = FillValidator.validate(&Artifact::FillProposal(serde_json::json!({ + fn well_formed_proposal_with_real_workers_passes() { + let v = FillValidator::new(lookup(vec![ + worker("W-1", "Jane Doe", "active", "Toledo", "OH", "Welder"), + worker("W-2", "John Smith", "active", "Toledo", "OH", "Welder"), + ])); + let r = v.validate(&Artifact::FillProposal(json!({ + "_context": {"target_count": 2, "city": "Toledo", "state": "OH", "role": "Welder"}, "fills": [ - {"candidate_id": "W-123", "name": "Jane Doe"}, - {"candidate_id": "W-456", "name": "John Smith"} + {"candidate_id": "W-1", "name": "Jane Doe"}, + {"candidate_id": "W-2", "name": "John Smith"} ] }))); - assert!(r.is_ok(), "well-formed proposal should pass schema: {:?}", r); + assert!(r.is_ok(), "expected pass, got {:?}", r); + } + + #[test] + fn phantom_candidate_id_fails_consistency() { + let v = FillValidator::new(lookup(vec![worker("W-1", "Jane", "active", "Toledo", "OH", "Welder")])); + let r = v.validate(&Artifact::FillProposal(json!({ + "_context": {"target_count": 1, "city": "Toledo", "state": "OH", "role": "Welder"}, + "fills": [{"candidate_id": "W-FAKE-99999", "name": "Imaginary"}] + }))); + match r { + Err(ValidationError::Consistency { reason }) => assert!(reason.contains("does not exist")), + other => panic!("expected Consistency error, got {other:?}"), + } + } + + #[test] + fn inactive_worker_fails_consistency() { + let v = FillValidator::new(lookup(vec![worker("W-1", "Jane", "inactive", "Toledo", "OH", "Welder")])); + let r = v.validate(&Artifact::FillProposal(json!({ + "_context": {"target_count": 1}, + "fills": [{"candidate_id": "W-1", "name": "Jane"}] + }))); + match r { + Err(ValidationError::Consistency { reason }) => assert!(reason.contains("inactive")), + other => panic!("expected Consistency error, got {other:?}"), + } + } + + #[test] + fn wrong_city_fails_consistency() { + let v = FillValidator::new(lookup(vec![worker("W-1", "Jane", "active", "Cincinnati", "OH", "Welder")])); + let r = v.validate(&Artifact::FillProposal(json!({ + "_context": {"target_count": 1, "city": "Toledo", "state": "OH", "role": "Welder"}, + "fills": [{"candidate_id": "W-1", "name": "Jane"}] + }))); + match r { + Err(ValidationError::Consistency { reason }) => assert!(reason.to_lowercase().contains("city")), + other => panic!("expected Consistency error, got {other:?}"), + } + } + + #[test] + fn wrong_role_fails_consistency() { + let v = FillValidator::new(lookup(vec![worker("W-1", "Jane", "active", "Toledo", "OH", "Driver")])); + let r = v.validate(&Artifact::FillProposal(json!({ + "_context": {"target_count": 1, "city": "Toledo", "state": "OH", "role": "Welder"}, + "fills": [{"candidate_id": "W-1", "name": "Jane"}] + }))); + match r { + Err(ValidationError::Consistency { reason }) => assert!(reason.to_lowercase().contains("role")), + other => panic!("expected Consistency error, got {other:?}"), + } + } + + #[test] + fn count_mismatch_fails_completeness() { + let v = FillValidator::new(lookup(vec![ + worker("W-1", "Jane", "active", "Toledo", "OH", "Welder"), + ])); + let r = v.validate(&Artifact::FillProposal(json!({ + "_context": {"target_count": 2, "city": "Toledo", "state": "OH", "role": "Welder"}, + "fills": [{"candidate_id": "W-1", "name": "Jane"}] + }))); + assert!(matches!(r, Err(ValidationError::Completeness { .. }))); + } + + #[test] + fn duplicate_candidate_id_fails_consistency() { + let v = FillValidator::new(lookup(vec![ + worker("W-1", "Jane", "active", "Toledo", "OH", "Welder"), + ])); + let r = v.validate(&Artifact::FillProposal(json!({ + "_context": {"target_count": 2, "city": "Toledo", "state": "OH", "role": "Welder"}, + "fills": [ + {"candidate_id": "W-1", "name": "Jane"}, + {"candidate_id": "W-1", "name": "Jane"} + ] + }))); + match r { + Err(ValidationError::Consistency { reason }) => assert!(reason.contains("duplicate")), + other => panic!("expected Consistency error, got {other:?}"), + } + } + + #[test] + fn blacklisted_worker_fails_policy() { + let mut w = worker("W-1", "Jane", "active", "Toledo", "OH", "Welder"); + w.blacklisted_clients = vec!["CLI-00099".into()]; + let v = FillValidator::new(lookup(vec![w])); + let r = v.validate(&Artifact::FillProposal(json!({ + "_context": {"target_count": 1, "city": "Toledo", "state": "OH", "role": "Welder", "client_id": "CLI-00099"}, + "fills": [{"candidate_id": "W-1", "name": "Jane"}] + }))); + assert!(matches!(r, Err(ValidationError::Policy { .. }))); + } + + #[test] + fn name_mismatch_is_warning_not_error() { + let v = FillValidator::new(lookup(vec![ + worker("W-1", "Jane Doe", "active", "Toledo", "OH", "Welder"), + ])); + let r = v.validate(&Artifact::FillProposal(json!({ + "_context": {"target_count": 1, "city": "Toledo", "state": "OH", "role": "Welder"}, + "fills": [{"candidate_id": "W-1", "name": "Janet Doe"}] + }))); + let report = r.expect("name mismatch should be warning, not error"); + assert_eq!(report.findings.len(), 1); + assert_eq!(report.findings[0].severity, crate::Severity::Warning); + assert!(report.findings[0].message.contains("differs from roster")); } }