sanitize: drop over-broad path-missing branch + UTF-8-safe redaction

Re-scrum of yesterday's sanitizer fix surfaced 2 more real bugs in the
fix itself (opus, both WARN, neither caught by kimi/qwen):

W1 (service.rs:1949) — `mentions_path_missing` standalone branch was
too aggressive. A registry-internal error like "/root/.cargo/.../x.rs:
no such file or directory" would 404 because it triggers without
dataset context. That's a real 500. Dropped the standalone branch;
require dataset context AND missing-shape phrase. Lance's actual
"Dataset at path X was not found" still satisfies it.

W2 (service.rs:2018) — `out.push(bytes[i] as char)` corrupted
multi-byte UTF-8 by casting raw bytes to char (only sound for ASCII
< 128). A path containing user-supplied non-ASCII names produced
Latin-1 mojibake. Rewrote redact_paths to track byte indices and
emit unmatched runs as &str slices via push_str(&s[range]) — preserves
multi-byte sequences verbatim. Step advance is now per-char, not
per-byte, via small utf8_char_len helper.

Two new regression tests:
- is_not_found_does_not_match_unrelated_path_missing
- redact_preserves_multibyte_utf8 (uses 工作 + café in input)

12/12 sanitize tests PASS. Smoke 10/10 PASS. Loop closure for opus
re-scrum on the 2026-05-02 fix bundle.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
root 2026-05-03 00:15:23 -05:00
parent ac7c996596
commit e9d17f7d5a

View File

@ -1932,18 +1932,18 @@ fn default_subvectors() -> u32 { 48 } // 768/48 = 16 dims per subvector
/// search returned 500 + leaked the lakehouse data path AND the
/// .cargo/registry path with crate versions.
fn sanitize_lance_err(err: String, index_name: &str) -> (StatusCode, String) {
// 404 detection — narrowed per the 2026-05-02 scrum (opus WARN at
// service.rs:1908). The previous `lower.contains("not found")` heuristic
// was too broad: Lance/Arrow surface many non-dataset-missing errors
// containing "not found" (e.g. "column not found", "field not found in
// schema"). Those are real 500s, not lookup misses. Match dataset-shape
// phrasing — Lance's actual format is "Dataset at path X was not found"
// or "no such file or directory". Excludes "column not found" and
// "field not found in schema" which are valid 500s.
// 404 detection — narrowed across two 2026-05-02→03 scrum waves.
// First wave (opus WARN service.rs:1908): the original `lower.contains
// ("not found")` was too broad — caught "column not found" /
// "field not found in schema" which are real 500s. Second wave (opus
// WARN service.rs:1949): the looser `mentions_path_missing` branch I
// added would 404 on a registry-file error like "/root/.cargo/.../x.rs:
// no such file or directory" because it triggers without dataset
// context. Drop the standalone path-missing branch; require dataset
// context AND a missing-shape phrase. Lance's actual error format
// ("Dataset at path X was not found") satisfies this.
let lower = err.to_lowercase();
let mentions_dataset = lower.contains("dataset");
let mentions_path_missing = lower.contains("no such file or directory")
|| lower.contains("does not exist");
let lance_dataset_missing = mentions_dataset && (
lower.contains("not found") || lower.contains("does not exist")
);
@ -1951,7 +1951,7 @@ fn sanitize_lance_err(err: String, index_name: &str) -> (StatusCode, String) {
let column_or_field = lower.contains("column not found")
|| lower.contains("field not found")
|| lower.contains("schema not found");
let is_not_found = (lance_dataset_missing || mentions_path_missing) && !column_or_field;
let is_not_found = lance_dataset_missing && !column_or_field;
if is_not_found {
return (StatusCode::NOT_FOUND, format!("lance dataset not found: {index_name}"));
}
@ -2009,9 +2009,15 @@ fn redact_paths(s: &str) -> String {
let b = bytes[i - 1];
!(b.is_ascii_alphanumeric() || b == b'_' || b == b'.' || b == b'-')
}
// Walk by byte index but slice the original &str when emitting, never
// cast bytes to char (that would corrupt multi-byte UTF-8 — opus WARN
// at service.rs:2018 from the 2026-05-03 re-scrum). Path prefixes are
// pure ASCII so byte-level matching is sound; what matters is that
// we emit non-matched stretches as &str slices, not byte-by-byte.
let bytes = s.as_bytes();
let mut out = String::with_capacity(s.len());
let mut i = 0;
let mut copy_start = 0usize; // start of an in-progress unmatched run
while i < bytes.len() {
let mut matched_len: Option<usize> = None;
// Try absolute prefixes first (always allowed).
@ -2036,20 +2042,42 @@ fn redact_paths(s: &str) -> String {
}
}
if let Some(prefix_len) = matched_len {
// Flush any pending unmatched run as a UTF-8-safe slice.
if copy_start < i {
out.push_str(&s[copy_start..i]);
}
out.push_str("[REDACTED]");
// Skip past the prefix and the path body (until terminator).
let mut j = i + prefix_len;
while j < bytes.len() && !is_path_term(bytes[j]) {
j += 1;
}
i = j;
copy_start = i;
} else {
out.push(bytes[i] as char);
i += 1;
// Advance one CHAR (not one byte) so multi-byte UTF-8 sequences
// stay intact in the eventual slice. Look up the next char
// boundary using the public API.
i += utf8_char_len(bytes, i);
}
}
if copy_start < bytes.len() {
out.push_str(&s[copy_start..]);
}
out
}
/// Length in bytes of the UTF-8 character starting at byte `i`. Bytes are
/// guaranteed to be a valid UTF-8 sequence start (callers ensure that).
fn utf8_char_len(bytes: &[u8], i: usize) -> usize {
let b = bytes[i];
if b < 0x80 { 1 }
else if b < 0xC0 { 1 } // continuation byte — defensive, shouldn't start here
else if b < 0xE0 { 2 }
else if b < 0xF0 { 3 }
else { 4 }
}
#[cfg(test)]
mod sanitize_tests {
use super::*;
@ -2147,6 +2175,40 @@ mod sanitize_tests {
assert_eq!(status, StatusCode::INTERNAL_SERVER_ERROR, "{err}");
}
}
#[test]
fn is_not_found_does_not_match_unrelated_path_missing() {
// Regression: opus WARN at service.rs:1949 from the 2026-05-03
// re-scrum. A registry-file error from inside a Lance internal
// module should NOT be coerced to 404 just because it contains
// "no such file or directory" — it's a real 500.
let (status, _) = sanitize_lance_err(
"/root/.cargo/registry/src/index.crates.io-foo/lance-table-4.0.0/src/io/commit.rs: no such file or directory".into(),
"test_idx",
);
assert_eq!(status, StatusCode::INTERNAL_SERVER_ERROR);
// (And the path is still redacted in the message.)
let (_, msg) = sanitize_lance_err(
"/root/.cargo/registry/src/lance-foo/x.rs: no such file or directory".into(),
"test_idx",
);
assert!(!msg.contains("/root/.cargo"), "path leak: {msg}");
}
#[test]
fn redact_preserves_multibyte_utf8() {
// Regression: opus WARN at service.rs:2018 from the 2026-05-03
// re-scrum. Old impl did `out.push(bytes[i] as char)` which
// corrupted multi-byte UTF-8 (e.g. a path containing user-supplied
// names with non-ASCII characters) into Latin-1 mojibake.
let input = "Failed to open /home/profit/工作/data — café not found";
let out = redact_paths(input);
// The path is redacted...
assert!(!out.contains("/home/profit"), "path leak: {out}");
// ...AND the multi-byte characters elsewhere are preserved verbatim.
assert!(out.contains("café"), "lost UTF-8: {out}");
assert!(out.contains("not found"), "lost trailing context: {out}");
}
}
/// Build the IVF_PQ index on the Lance dataset.