From 3708e6abf1fd008049be6b968327314363ffa2cd Mon Sep 17 00:00:00 2001 From: root Date: Sun, 3 May 2026 05:05:12 -0500 Subject: [PATCH] biometric endpoint: scrum-driven hardening MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per 2026-05-03 phase_1_6_gate_3a scrum (10 findings, 0 convergent location-wise but opus + kimi flagged the same audit-failure issue). Convergent + load-bearing fix: Audit-write failure was silently swallowed (returned 200 with empty hmac) after photo + manifest persisted. For BIPA defensibility this is wrong — a successful response without an audit row is exactly the silent-failure mode the spec exists to prevent. Now: full transactional rollback. If audit append fails after photo + manifest commit, we remove the photo AND revert the manifest to its pre-upload state, then return 500 with error="audit_write_failed". Other real fixes: Orphan-file leak (opus WARN): if put_subject fails AFTER the photo is written, the file would orphan on disk with no manifest pointer. Now removes the photo on manifest-update failure, before returning 500. Content-Type parameter handling (opus WARN): real-world clients send `image/jpeg; charset=binary` etc. Parser now strips parameters per RFC 9110 §8.3 and matches case-insensitively. New regression test content_type_with_parameters_accepted exercises both. data_path doc/code mismatch (opus WARN): doc said "relative to the configured biometric storage root" but code stored absolute path. Now stores relative — operators reading the manifest reconstruct the absolute path with their own storage_root, manifests are portable across deployments. Tests updated. Timestamp-nanosecond collision (kimi WARN): added 8-char uuid suffix to filename. Sub-microsecond cadence collision was implausible but defense-in-depth is cheap. Dead code (opus + kimi INFO): removed unused require_legal_auth function (process_upload reimplements the auth check inline) and the `let _ = ConsentStatus::Given;` no-op type-shape reference. Skipped (acceptable in v1): - qwen BLOCK on image format validation: spec explicitly says "we trust the caller; malformed images fail downstream when deepface runs in Gate 3b". Documented in the file's module doc-comment. - qwen WARN on directory create-then-chmod race: brief window between create_dir_all and set_permissions. Mitigation would require libc-level umask manipulation; accepted as v1 scope. - qwen INFO on constant_time_eq duplication: comment explains the cross-import boundary; acceptable short-term per the reviewer. Tests: 11 unit tests pass (added content_type_with_parameters_accepted). Live verification post-restart: - Content-Type with `; charset=binary` accepted ✓ - data_path returned as relative `WORKER-2/_.jpg` ✓ - Chain verified end-to-end (3 rows: validator + 2 biometric) ✓ - Cross-runtime parity probe still 6/6 byte-identical ✓ Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/catalogd/src/biometric_endpoint.rs | 163 +++++++++++++++------- 1 file changed, 114 insertions(+), 49 deletions(-) diff --git a/crates/catalogd/src/biometric_endpoint.rs b/crates/catalogd/src/biometric_endpoint.rs index bd7653a..4dff5df 100644 --- a/crates/catalogd/src/biometric_endpoint.rs +++ b/crates/catalogd/src/biometric_endpoint.rs @@ -47,7 +47,7 @@ use axum::{ }; use serde::Serialize; use sha2::{Digest, Sha256}; -use shared::types::{AuditAccessor, BiometricCollection, ConsentStatus, SubjectAuditRow}; +use shared::types::{AuditAccessor, BiometricCollection, SubjectAuditRow}; use std::path::PathBuf; use std::sync::Arc; @@ -148,33 +148,19 @@ fn constant_time_eq(a: &[u8], b: &[u8]) -> bool { diff == 0 } -fn require_legal_auth( - state: &BiometricEndpointState, - headers: &HeaderMap, -) -> Result<(), (StatusCode, &'static str)> { - let configured = state.legal_token.as_ref().ok_or(( - StatusCode::SERVICE_UNAVAILABLE, - "biometric endpoint disabled — no legal token configured", - ))?; - let provided = headers.get(LEGAL_TOKEN_HEADER).ok_or(( - StatusCode::UNAUTHORIZED, - "missing X-Lakehouse-Legal-Token header", - ))?; - let provided = provided.to_str().map_err(|_| ( - StatusCode::UNAUTHORIZED, - "X-Lakehouse-Legal-Token contains non-ASCII characters", - ))?; - if !constant_time_eq(provided.as_bytes(), configured.as_bytes()) { - return Err((StatusCode::UNAUTHORIZED, "X-Lakehouse-Legal-Token mismatch")); - } - Ok(()) -} - /// Map Content-Type header to the file extension used in the /// quarantined path. Unknown types are rejected (415) so we don't /// silently store HEIC / WebP / SVG without an explicit decision. +/// +/// The header value MAY include parameters per RFC 9110 §8.3 +/// (`image/jpeg; charset=binary`, `image/jpeg; boundary=...`); we +/// strip parameters before matching on the bare media type. Caught +/// 2026-05-03 opus scrum WARN on biometric_endpoint.rs:189. fn extension_from_content_type(ct: Option<&str>) -> Result<&'static str, (StatusCode, &'static str)> { - match ct { + let bare = ct + .and_then(|s| s.split(';').next()) + .map(|s| s.trim().to_ascii_lowercase()); + match bare.as_deref() { Some("image/jpeg") | Some("image/jpg") => Ok("jpg"), Some("image/png") => Ok("png"), _ => Err(( @@ -293,7 +279,9 @@ pub async fn process_upload( error: "unsupported_media_type", detail: m.into(), consent_status: None, }))?; - let mut manifest = state.registry.get_subject(candidate_id).await.ok_or(( + // We hold the original manifest for rollback. Cloning here so the + // mutated copy and the original are separable references. + let original_manifest = state.registry.get_subject(candidate_id).await.ok_or(( StatusCode::NOT_FOUND, ErrorResponse { error: "subject_not_found", @@ -301,6 +289,7 @@ pub async fn process_upload( consent_status: None, }, ))?; + let mut manifest = original_manifest.clone(); use shared::types::{BiometricConsentStatus, SubjectStatus}; if manifest.consent.biometric.status != BiometricConsentStatus::Given { @@ -339,7 +328,13 @@ pub async fn process_upload( use std::os::unix::fs::PermissionsExt; let _ = std::fs::set_permissions(&subject_dir, std::fs::Permissions::from_mode(0o700)); } - let filename = format!("{}.{}", collected_at.timestamp_nanos_opt().unwrap_or(0), ext); + // Filename: timestamp-nanos + 4-byte random suffix to defeat the + // (very unlikely) two-uploads-in-the-same-nanosecond collision the + // 2026-05-03 kimi scrum WARN flagged. uuid::Uuid::new_v4 → use + // first 8 hex chars (32 bits of entropy is enough — collision over + // a per-subject namespace at sub-microsecond cadence is implausible). + let suffix: String = uuid::Uuid::new_v4().simple().to_string().chars().take(8).collect(); + let filename = format!("{}_{}.{}", collected_at.timestamp_nanos_opt().unwrap_or(0), suffix, ext); let abs_path = subject_dir.join(&filename); tokio::fs::write(&abs_path, body).await.map_err(|e| ( StatusCode::INTERNAL_SERVER_ERROR, @@ -350,7 +345,15 @@ pub async fn process_upload( use std::os::unix::fs::PermissionsExt; let _ = std::fs::set_permissions(&abs_path, std::fs::Permissions::from_mode(0o600)); } - let stored_path = abs_path.to_string_lossy().to_string(); + // Store the path relative to storage_root in the manifest (matches + // the BiometricCollection doc-comment: "relative to the configured + // biometric storage root"). Caught 2026-05-03 opus scrum WARN on + // shared/types.rs::BiometricCollection. Operators reading the + // manifest can rebuild the absolute path with their own storage_root. + let stored_path = abs_path + .strip_prefix(&state.storage_root) + .map(|p| p.to_string_lossy().to_string()) + .unwrap_or_else(|_| abs_path.to_string_lossy().to_string()); manifest.biometric_collection = Some(BiometricCollection { data_path: stored_path.clone(), @@ -360,10 +363,32 @@ pub async fn process_upload( classifications: None, }); manifest.updated_at = collected_at; - state.registry.put_subject(manifest.clone()).await.map_err(|e| ( - StatusCode::INTERNAL_SERVER_ERROR, - ErrorResponse { error: "manifest_update_failed", detail: e, consent_status: None }, - ))?; + + // Transactional commit (Caught 2026-05-03 opus BLOCK + kimi WARN): + // + // The three side-effects MUST be all-or-nothing for BIPA defensibility: + // - photo file on disk + // - manifest reflects the collection + // - audit row is appended to the chain + // + // If any later step fails, prior steps are rolled back. The rollback + // path uses best-effort cleanup — every revert error is logged but + // does not propagate (the user-facing failure is the original cause). + // + // Order: photo (already written above) → manifest → audit. We commit + // manifest BEFORE audit because the audit row references the + // manifest's chain root; rolling back a committed audit row would + // require chain rewriting (which we explicitly forbid). + if let Err(e) = state.registry.put_subject(manifest.clone()).await { + // Manifest write failed — clean up the photo and bail. + if let Err(rm_err) = tokio::fs::remove_file(&abs_path).await { + tracing::error!("rollback failed: could not remove orphan photo {}: {rm_err}", stored_path); + } + return Err(( + StatusCode::INTERNAL_SERVER_ERROR, + ErrorResponse { error: "manifest_update_failed", detail: e, consent_status: None }, + )); + } let row = SubjectAuditRow { schema: "subject_audit.v1".into(), @@ -383,16 +408,30 @@ pub async fn process_upload( let audit_row_hmac = match state.writer.append(row).await { Ok(h) => h, Err(e) => { - // Photo is on disk + manifest updated. Audit failure is - // real but operator can investigate; return 200 with the - // audit hmac empty rather than 500 (which would imply - // nothing was persisted). - tracing::error!("biometric upload audit row failed for {candidate_id}: {e}"); - String::new() + // Audit-write failed AFTER photo + manifest committed. + // Roll back BOTH: remove the photo and revert the manifest + // to its pre-upload state. Returning 500 is the BIPA- + // defensible answer — a successful response without an + // audit row is exactly the silent-failure mode the spec + // exists to prevent. + tracing::error!("audit row failed for {candidate_id}: {e}; rolling back"); + if let Err(rm_err) = tokio::fs::remove_file(&abs_path).await { + tracing::error!("rollback: could not remove photo {}: {rm_err}", stored_path); + } + if let Err(re_err) = state.registry.put_subject(original_manifest).await { + tracing::error!("rollback: could not revert manifest for {candidate_id}: {re_err}"); + } + return Err(( + StatusCode::INTERNAL_SERVER_ERROR, + ErrorResponse { + error: "audit_write_failed", + detail: format!("audit row failed; photo + manifest rolled back: {e}"), + consent_status: None, + }, + )); } }; - let _ = ConsentStatus::Given; // Type-shape reference. Ok(UploadResponse { schema: RESPONSE_SCHEMA, candidate_id: candidate_id.to_string(), @@ -558,14 +597,17 @@ mod tests { .await.unwrap(); assert_eq!(resp.schema, "biometric_photo_response.v1"); assert_eq!(resp.candidate_id, "WORKER-4"); - assert!(resp.data_path.starts_with(storage_root.to_str().unwrap()), - "data_path {} should be under storage root {:?}", resp.data_path, storage_root); + // data_path is relative to storage_root per BiometricCollection doc. + assert!(!resp.data_path.starts_with('/'), "data_path should be relative, got: {}", resp.data_path); + assert!(resp.data_path.starts_with("WORKER-4/"), + "data_path {} should start with subject directory", resp.data_path); assert_eq!(resp.template_hash.len(), 64); assert_eq!(resp.consent_version_hash, "consent-v1-hash"); assert!(!resp.audit_row_hmac.is_empty()); - // File on disk at quarantined path with correct bytes. - let on_disk = tokio::fs::read(&resp.data_path).await.unwrap(); + // File at storage_root.join(data_path) has the bytes we sent. + let abs = storage_root.join(&resp.data_path); + let on_disk = tokio::fs::read(&abs).await.unwrap(); assert_eq!(on_disk, jpeg_bytes()); // Manifest reflects the new collection record. @@ -598,8 +640,8 @@ mod tests { #[tokio::test] async fn quarantine_path_is_not_data_headshots() { - // Defensive structural test: make sure the quarantined path - // does NOT live under data/headshots/. Any future change that + // Defensive structural test: make sure the storage_root does + // NOT live under data/headshots/. Any future change that // accidentally pointed real-photo storage at the synthetic // headshot dir would conflate the two surfaces — exactly the // confusion Phase 1.6 Gate 3 exists to prevent. @@ -607,10 +649,33 @@ mod tests { let _ = state.registry.put_subject(fixture_manifest("WORKER-Q", BiometricConsentStatus::Given, SubjectStatus::Active)).await; let resp = process_upload(&state, "WORKER-Q", Some(TEST_TOKEN), Some("image/jpeg"), "", "", &jpeg_bytes()) .await.unwrap(); - assert!(!resp.data_path.contains("/headshots/"), - "quarantine path {} must not be under /headshots/ (synthetic-only surface)", - resp.data_path); - assert!(resp.data_path.contains("/biometric/uploads/"), - "quarantine path {} should be under /biometric/uploads/", resp.data_path); + let storage = state.storage_root.to_string_lossy().to_string(); + assert!(!storage.contains("/headshots/"), + "storage_root {} must not be under /headshots/ (synthetic-only surface)", storage); + assert!(storage.contains("/biometric/uploads"), + "storage_root {} should be under /biometric/uploads", storage); + // data_path is the relative form: /.. + assert!(resp.data_path.starts_with("WORKER-Q/"), + "data_path {} should be subject-relative", resp.data_path); + } + + #[tokio::test] + async fn content_type_with_parameters_accepted() { + // Real-world clients send `image/jpeg; charset=binary` etc. + // Caught 2026-05-03 opus scrum WARN; regression test ensures + // the bare media type is matched after stripping parameters. + let state = fixture_state("ct_with_params").await; + let _ = state.registry.put_subject(fixture_manifest("WORKER-CT", BiometricConsentStatus::Given, SubjectStatus::Active)).await; + let resp = process_upload( + &state, "WORKER-CT", Some(TEST_TOKEN), + Some("image/jpeg; charset=binary"), "", "", &jpeg_bytes(), + ).await.unwrap(); + assert_eq!(resp.candidate_id, "WORKER-CT"); + // Also case-insensitive matching: "Image/JPEG" should work too. + let resp2 = process_upload( + &state, "WORKER-CT", Some(TEST_TOKEN), + Some("Image/JPEG"), "", "", &jpeg_bytes(), + ).await.unwrap(); + assert_eq!(resp2.candidate_id, "WORKER-CT"); } }