diff --git a/crates/vectord/src/service.rs b/crates/vectord/src/service.rs index 2898a0f..3f422de 100644 --- a/crates/vectord/src/service.rs +++ b/crates/vectord/src/service.rs @@ -1932,18 +1932,18 @@ fn default_subvectors() -> u32 { 48 } // 768/48 = 16 dims per subvector /// search returned 500 + leaked the lakehouse data path AND the /// .cargo/registry path with crate versions. fn sanitize_lance_err(err: String, index_name: &str) -> (StatusCode, String) { - // 404 detection — narrowed per the 2026-05-02 scrum (opus WARN at - // service.rs:1908). The previous `lower.contains("not found")` heuristic - // was too broad: Lance/Arrow surface many non-dataset-missing errors - // containing "not found" (e.g. "column not found", "field not found in - // schema"). Those are real 500s, not lookup misses. Match dataset-shape - // phrasing — Lance's actual format is "Dataset at path X was not found" - // or "no such file or directory". Excludes "column not found" and - // "field not found in schema" which are valid 500s. + // 404 detection — narrowed across two 2026-05-02→03 scrum waves. + // First wave (opus WARN service.rs:1908): the original `lower.contains + // ("not found")` was too broad — caught "column not found" / + // "field not found in schema" which are real 500s. Second wave (opus + // WARN service.rs:1949): the looser `mentions_path_missing` branch I + // added would 404 on a registry-file error like "/root/.cargo/.../x.rs: + // no such file or directory" because it triggers without dataset + // context. Drop the standalone path-missing branch; require dataset + // context AND a missing-shape phrase. Lance's actual error format + // ("Dataset at path X was not found") satisfies this. let lower = err.to_lowercase(); let mentions_dataset = lower.contains("dataset"); - let mentions_path_missing = lower.contains("no such file or directory") - || lower.contains("does not exist"); let lance_dataset_missing = mentions_dataset && ( lower.contains("not found") || lower.contains("does not exist") ); @@ -1951,7 +1951,7 @@ fn sanitize_lance_err(err: String, index_name: &str) -> (StatusCode, String) { let column_or_field = lower.contains("column not found") || lower.contains("field not found") || lower.contains("schema not found"); - let is_not_found = (lance_dataset_missing || mentions_path_missing) && !column_or_field; + let is_not_found = lance_dataset_missing && !column_or_field; if is_not_found { return (StatusCode::NOT_FOUND, format!("lance dataset not found: {index_name}")); } @@ -2009,9 +2009,15 @@ fn redact_paths(s: &str) -> String { let b = bytes[i - 1]; !(b.is_ascii_alphanumeric() || b == b'_' || b == b'.' || b == b'-') } + // Walk by byte index but slice the original &str when emitting, never + // cast bytes to char (that would corrupt multi-byte UTF-8 — opus WARN + // at service.rs:2018 from the 2026-05-03 re-scrum). Path prefixes are + // pure ASCII so byte-level matching is sound; what matters is that + // we emit non-matched stretches as &str slices, not byte-by-byte. let bytes = s.as_bytes(); let mut out = String::with_capacity(s.len()); let mut i = 0; + let mut copy_start = 0usize; // start of an in-progress unmatched run while i < bytes.len() { let mut matched_len: Option = None; // Try absolute prefixes first (always allowed). @@ -2036,20 +2042,42 @@ fn redact_paths(s: &str) -> String { } } if let Some(prefix_len) = matched_len { + // Flush any pending unmatched run as a UTF-8-safe slice. + if copy_start < i { + out.push_str(&s[copy_start..i]); + } out.push_str("[REDACTED]"); + // Skip past the prefix and the path body (until terminator). let mut j = i + prefix_len; while j < bytes.len() && !is_path_term(bytes[j]) { j += 1; } i = j; + copy_start = i; } else { - out.push(bytes[i] as char); - i += 1; + // Advance one CHAR (not one byte) so multi-byte UTF-8 sequences + // stay intact in the eventual slice. Look up the next char + // boundary using the public API. + i += utf8_char_len(bytes, i); } } + if copy_start < bytes.len() { + out.push_str(&s[copy_start..]); + } out } +/// Length in bytes of the UTF-8 character starting at byte `i`. Bytes are +/// guaranteed to be a valid UTF-8 sequence start (callers ensure that). +fn utf8_char_len(bytes: &[u8], i: usize) -> usize { + let b = bytes[i]; + if b < 0x80 { 1 } + else if b < 0xC0 { 1 } // continuation byte — defensive, shouldn't start here + else if b < 0xE0 { 2 } + else if b < 0xF0 { 3 } + else { 4 } +} + #[cfg(test)] mod sanitize_tests { use super::*; @@ -2147,6 +2175,40 @@ mod sanitize_tests { assert_eq!(status, StatusCode::INTERNAL_SERVER_ERROR, "{err}"); } } + + #[test] + fn is_not_found_does_not_match_unrelated_path_missing() { + // Regression: opus WARN at service.rs:1949 from the 2026-05-03 + // re-scrum. A registry-file error from inside a Lance internal + // module should NOT be coerced to 404 just because it contains + // "no such file or directory" — it's a real 500. + let (status, _) = sanitize_lance_err( + "/root/.cargo/registry/src/index.crates.io-foo/lance-table-4.0.0/src/io/commit.rs: no such file or directory".into(), + "test_idx", + ); + assert_eq!(status, StatusCode::INTERNAL_SERVER_ERROR); + // (And the path is still redacted in the message.) + let (_, msg) = sanitize_lance_err( + "/root/.cargo/registry/src/lance-foo/x.rs: no such file or directory".into(), + "test_idx", + ); + assert!(!msg.contains("/root/.cargo"), "path leak: {msg}"); + } + + #[test] + fn redact_preserves_multibyte_utf8() { + // Regression: opus WARN at service.rs:2018 from the 2026-05-03 + // re-scrum. Old impl did `out.push(bytes[i] as char)` which + // corrupted multi-byte UTF-8 (e.g. a path containing user-supplied + // names with non-ASCII characters) into Latin-1 mojibake. + let input = "Failed to open /home/profit/工作/data — café not found"; + let out = redact_paths(input); + // The path is redacted... + assert!(!out.contains("/home/profit"), "path leak: {out}"); + // ...AND the multi-byte characters elsewhere are preserved verbatim. + assert!(out.contains("café"), "lost UTF-8: {out}"); + assert!(out.contains("not found"), "lost trailing context: {out}"); + } } /// Build the IVF_PQ index on the Lance dataset.