From 7bb66f08c3265678ae30e5035643cefcb97516e7 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 2 May 2026 23:34:54 -0500 Subject: [PATCH] lance: scrum-driven sanitizer + smoke-gate fixes (opus 2026-05-02 BLOCK) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cross-lineage scrum on the lance wave (4 bundles, 33 distinct findings) surfaced 1 real BLOCK and 2 real WARNs from opus that the kimi/qwen lineages missed. Per feedback_cross_lineage_review.md, opus is the load-bearing reviewer; cross-lineage convergence is noise unless verified. BLOCK fix — sanitize_lance_err path-stripping was unsound: err.split("/home/").next().unwrap_or(&err) returns Some("") when err STARTS with "/home/", erasing the entire message. Replaced truncation with redact_paths() — a hand-rolled scanner that walks the input once, replacing path-shaped substrings with [REDACTED] while preserving surrounding error context. Catches: - absolute paths under /root/.cargo, /home, /var, /tmp, /etc, /usr, /opt - relative variants (Lance occasionally strips leading slash — observed live "Dataset at path home/profit/lakehouse/data/lance/x was not found") - multiple occurrences in one error - preserves quote/comma/whitespace terminators WARN fix #1 — is_not_found heuristic was too broad: lower.contains("not found") caught real 500s like "column not found", "field not found in schema". Narrowed to require dataset-shape phrasing AND exclude the column/field/schema patterns explicitly. WARN fix #2 — lance_smoke.sh `grep -qvE` was an unsound regression gate. bash -c "echo '$BODY' | grep -qvE 'pat'" With -v -q, exits 0 if ANY line lacks the pattern — so a multi-line body with one leak line + any clean line FALSE-PASSES. Replaced with the correct "pattern absent" form: `! grep -qE 'pat'`. Also expanded the pattern set (added /var/, /tmp/) since the scrum surfaced these as additional leak vectors. Also unblocks pre-existing pathway_memory test compile error (stale PathwayTrace init missing 6 Mem0-versioning fields added in 6ac7f61). Tests filled in with sensible defaults — needed to run sanitize_tests. 10/10 new sanitize tests pass. Smoke 9/9 PASS against rebuilt+restarted gateway. Live missing-index probe now returns: "lance dataset not found: no-such-11205" + HTTP 404 (was: leaked absolute paths + HTTP 500 → leaked absolute and relative paths post-first-fix → clean message + 404 now.) Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/vectord/src/pathway_memory.rs | 10 +- crates/vectord/src/service.rs | 224 +++++++++++++++++++++++++-- scripts/lance_smoke.sh | 9 +- 3 files changed, 227 insertions(+), 16 deletions(-) diff --git a/crates/vectord/src/pathway_memory.rs b/crates/vectord/src/pathway_memory.rs index 603dfa4..52ed646 100644 --- a/crates/vectord/src/pathway_memory.rs +++ b/crates/vectord/src/pathway_memory.rs @@ -925,7 +925,7 @@ mod tests { reject_reason: None, }]; let mut trace = PathwayTrace { - pathway_id, + pathway_id: pathway_id.clone(), task_class: "scrum_review".into(), file_path: format!("crates/{id_tag}/src/x.rs"), signal_class: Some("CONVERGING".into()), @@ -954,6 +954,14 @@ mod tests { replay_count: replays, replays_succeeded: succ, retired: false, + // Versioning fields added by Mem0 wave (commit 6ac7f61) — defaults + // mirror "this trace is the live head with no parent/successor". + trace_uid: format!("test-{pathway_id}"), + version: 1, + parent_trace_uid: None, + superseded_at: None, + superseded_by_trace_uid: None, + retirement_reason: None, }; trace.pathway_vec = build_pathway_vec(&trace); trace diff --git a/crates/vectord/src/service.rs b/crates/vectord/src/service.rs index cbb4275..2898a0f 100644 --- a/crates/vectord/src/service.rs +++ b/crates/vectord/src/service.rs @@ -1932,21 +1932,221 @@ fn default_subvectors() -> u32 { 48 } // 768/48 = 16 dims per subvector /// search returned 500 + leaked the lakehouse data path AND the /// .cargo/registry path with crate versions. fn sanitize_lance_err(err: String, index_name: &str) -> (StatusCode, String) { + // 404 detection — narrowed per the 2026-05-02 scrum (opus WARN at + // service.rs:1908). The previous `lower.contains("not found")` heuristic + // was too broad: Lance/Arrow surface many non-dataset-missing errors + // containing "not found" (e.g. "column not found", "field not found in + // schema"). Those are real 500s, not lookup misses. Match dataset-shape + // phrasing — Lance's actual format is "Dataset at path X was not found" + // or "no such file or directory". Excludes "column not found" and + // "field not found in schema" which are valid 500s. let lower = err.to_lowercase(); - let is_not_found = lower.contains("not found") || lower.contains("no such file"); - let msg = if is_not_found { - format!("lance dataset not found: {index_name}") + let mentions_dataset = lower.contains("dataset"); + let mentions_path_missing = lower.contains("no such file or directory") + || lower.contains("does not exist"); + let lance_dataset_missing = mentions_dataset && ( + lower.contains("not found") || lower.contains("does not exist") + ); + // Excluded shapes — these contain "not found" but are real 500s. + let column_or_field = lower.contains("column not found") + || lower.contains("field not found") + || lower.contains("schema not found"); + let is_not_found = (lance_dataset_missing || mentions_path_missing) && !column_or_field; + if is_not_found { + return (StatusCode::NOT_FOUND, format!("lance dataset not found: {index_name}")); + } + + // Path redaction — replace path-shaped substrings with [REDACTED] + // rather than truncating, per opus BLOCK at service.rs:1914 from the + // 2026-05-02 scrum. The previous `err.split("/home/").next()` returned + // Some("") when the error string STARTED with "/home/", erasing the + // entire message and falling back to a generic "lance backend error" + // that lost all real error context. Replacing keeps the structural + // error (the "what failed") while stripping the location. + let cleaned = redact_paths(&err) + .trim_end_matches([',', ' ', '\n', '\t']) + .to_string(); + let msg = if cleaned.is_empty() { + format!("lance backend error on {index_name}") } else { - // Generic 500 with the Lance error body trimmed of paths. - let cleaned = err - .split("/root/.cargo/").next().unwrap_or(&err) - .split("/home/").next().unwrap_or(&err) - .trim_end_matches([',', ' ', '\n', '\t']) - .to_string(); - if cleaned.is_empty() { format!("lance backend error on {index_name}") } else { cleaned } + cleaned }; - let status = if is_not_found { StatusCode::NOT_FOUND } else { StatusCode::INTERNAL_SERVER_ERROR }; - (status, msg) + (StatusCode::INTERNAL_SERVER_ERROR, msg) +} + +/// Replace absolute-path substrings (under known leak-prone roots) with +/// "[REDACTED]". Walks the input once, identifying path-shaped runs that +/// start with one of the configured prefixes and continue until a +/// path-terminating character (whitespace, quote, comma, paren, EOL). +/// +/// Linear time, no regex dep. Catches multi-occurrence cases that +/// `String::split(p).next()` lost. The path-redaction surface intentionally +/// includes /var, /tmp, /etc, /usr, /opt in addition to /home and +/// /root/.cargo because Lance/Arrow errors surface system paths in +/// addition to project paths. +fn redact_paths(s: &str) -> String { + // Two prefix sets: + // - ABSOLUTE: paths starting with '/' (always safe to redact) + // - RELATIVE: same path bodies but without leading '/' (Lance occasionally + // strips the leading slash when echoing dataset paths back, observed + // live 2026-05-02 — "Dataset at path home/profit/lakehouse/data/lance/x + // was not found"). Match these only when preceded by a non-alpha char + // (start of string, space, colon, etc.) so we don't redact innocent + // tokens like "homecoming" or "etcetera". + const ABSOLUTE: &[&str] = &[ + "/root/.cargo", "/home", "/var", "/tmp", "/etc", "/usr", "/opt", + ]; + const RELATIVE: &[&str] = &[ + "root/.cargo", "home/", "var/", "tmp/", "etc/", "usr/", "opt/", + ]; + fn is_path_term(b: u8) -> bool { + matches!(b, b' ' | b'\t' | b'\n' | b'\r' | b'"' | b'\'' | b',' | b')' | b']' | b'}') + } + fn is_word_boundary_before(bytes: &[u8], i: usize) -> bool { + // True if byte at i-1 is non-alphanumeric (so this position starts + // a fresh token). True at start-of-input. + if i == 0 { return true; } + let b = bytes[i - 1]; + !(b.is_ascii_alphanumeric() || b == b'_' || b == b'.' || b == b'-') + } + let bytes = s.as_bytes(); + let mut out = String::with_capacity(s.len()); + let mut i = 0; + while i < bytes.len() { + let mut matched_len: Option = None; + // Try absolute prefixes first (always allowed). + for p in ABSOLUTE { + let pb = p.as_bytes(); + if i + pb.len() <= bytes.len() && &bytes[i..i + pb.len()] == pb { + let after = i + pb.len(); + if after == bytes.len() || bytes[after] == b'/' || is_path_term(bytes[after]) { + matched_len = Some(pb.len()); + break; + } + } + } + // Then relative prefixes — only at word boundaries. + if matched_len.is_none() && is_word_boundary_before(bytes, i) { + for p in RELATIVE { + let pb = p.as_bytes(); + if i + pb.len() <= bytes.len() && &bytes[i..i + pb.len()] == pb { + matched_len = Some(pb.len()); + break; + } + } + } + if let Some(prefix_len) = matched_len { + out.push_str("[REDACTED]"); + let mut j = i + prefix_len; + while j < bytes.len() && !is_path_term(bytes[j]) { + j += 1; + } + i = j; + } else { + out.push(bytes[i] as char); + i += 1; + } + } + out +} + +#[cfg(test)] +mod sanitize_tests { + use super::*; + + #[test] + fn redact_path_at_offset_zero() { + // Regression: opus BLOCK 2026-05-02. Old impl returned Some("") + // when err started with "/home/", erasing the whole message. + let out = redact_paths("/home/profit/lakehouse/data/lance not a directory"); + assert_eq!(out, "[REDACTED] not a directory"); + } + + #[test] + fn redact_keeps_pre_and_post_text() { + let out = redact_paths("failed to open /home/profit/lakehouse/data/x for read: ENOENT"); + assert_eq!(out, "failed to open [REDACTED] for read: ENOENT"); + } + + #[test] + fn redact_multiple_paths() { + let out = redact_paths("at /root/.cargo/registry/src/index.crates.io-foo/lance-table-4.0.0/src/io/commit.rs:364:26 from /home/profit/lakehouse"); + assert!(!out.contains("/root/.cargo")); + assert!(!out.contains("/home/")); + assert!(out.contains("[REDACTED]")); + } + + #[test] + fn redact_preserves_quote_terminator() { + let out = redact_paths("{\"path\":\"/home/profit/x\",\"err\":\"bad\"}"); + assert_eq!(out, "{\"path\":\"[REDACTED]\",\"err\":\"bad\"}"); + } + + #[test] + fn is_not_found_narrow_dataset_only() { + // Regression: opus WARN 2026-05-02. Old impl 404'd on any "not + // found" — including legitimate column/field-not-found 500s. + let (status, _) = sanitize_lance_err( + "column not found: vector".into(), "test_idx", + ); + assert_eq!(status, StatusCode::INTERNAL_SERVER_ERROR); + + let (status, _) = sanitize_lance_err( + "dataset not found at /home/profit/lakehouse/data/lance/missing".into(), "test_idx", + ); + assert_eq!(status, StatusCode::NOT_FOUND); + } + + #[test] + fn redact_does_not_match_prefix_substring() { + // /etcetera should NOT trigger /etc redaction. + let out = redact_paths("etcetera and /etcd"); + assert_eq!(out, "etcetera and /etcd"); + } + + #[test] + fn redact_relative_paths_lance_emits() { + // 2026-05-02: live missing-index probe surfaced Lance error of the + // form "Dataset at path home/profit/lakehouse/data/lance/x was not + // found" — leading slash stripped. Need to redact the relative form + // when preceded by a word boundary. + let out = redact_paths("Dataset at path home/profit/lakehouse/data/lance/x was not found"); + assert!(!out.contains("home/profit"), "should redact: {out}"); + assert!(out.contains("Dataset at path")); + assert!(out.contains("was not found")); + } + + #[test] + fn redact_does_not_eat_innocent_prefix_words() { + // "homecoming" must NOT trigger "home/" redaction. "Etcetera" must + // NOT trigger "etc/" redaction. The word-boundary guard handles this. + let out = redact_paths("homecoming etcetera vary tmpfile"); + assert_eq!(out, "homecoming etcetera vary tmpfile"); + } + + #[test] + fn is_not_found_lance_actual_phrasing() { + // Lance's actual error format observed live: "Dataset at path X was + // not found: Not found: ...". Must 404, not 500. + let (status, _) = sanitize_lance_err( + "Dataset at path home/profit/lakehouse/data/lance/x was not found".into(), + "x", + ); + assert_eq!(status, StatusCode::NOT_FOUND); + } + + #[test] + fn is_not_found_excludes_column_field_schema() { + // Real 500s with the "not found" phrase that aren't dataset-missing. + for err in [ + "column not found: vector", + "field not found in schema: doc_id", + "schema not found for dataset xyz", + ] { + let (status, _) = sanitize_lance_err(err.into(), "test_idx"); + assert_eq!(status, StatusCode::INTERNAL_SERVER_ERROR, "{err}"); + } + } } /// Build the IVF_PQ index on the Lance dataset. diff --git a/scripts/lance_smoke.sh b/scripts/lance_smoke.sh index 83e1e85..404eb1f 100755 --- a/scripts/lance_smoke.sh +++ b/scripts/lance_smoke.sh @@ -60,9 +60,12 @@ STATUS=$(curl -sS -m 5 -o /tmp/lance_smoke_500.json -w '%{http_code}' \ BODY=$(cat /tmp/lance_smoke_500.json) PROBE "search/ → 404 (was 500 pre-2026-05-02)" \ test "$STATUS" = "404" -# The sanitizer fix specifically: no /home/ or /root/.cargo/ in body. +# Assert "pattern absent" — `! grep -qE` (NOT `grep -qvE` which is unsound: +# -v -q exits 0 if ANY line lacks the pattern, so a multi-line body containing +# both a leak line AND any clean line would false-PASS. Caught 2026-05-02 by +# opus scrum on the lance backend wave.) PROBE "search/ body sanitized — no filesystem leak" \ - bash -c "echo '$BODY' | grep -qvE '/home/|/root/\.cargo/'" + bash -c "! echo '$BODY' | grep -qE '/home/|/root/\.cargo/|/var/|/tmp/'" # ── 5. build_index on missing dataset also sanitized ──────────── STATUS=$(curl -sS -m 5 -o /tmp/lance_smoke_idx.json -w '%{http_code}' \ @@ -71,7 +74,7 @@ STATUS=$(curl -sS -m 5 -o /tmp/lance_smoke_idx.json -w '%{http_code}' \ -d '{}') BODY=$(cat /tmp/lance_smoke_idx.json) PROBE "index/ body sanitized" \ - bash -c "echo '$BODY' | grep -qvE '/home/|/root/\.cargo/'" + bash -c "! echo '$BODY' | grep -qE '/home/|/root/\.cargo/|/var/|/tmp/'" # ── 6. append validates input shape (rejects empty rows array) ── STATUS=$(curl -sS -m 5 -o /dev/null -w '%{http_code}' \