/// Round `idx` up to the nearest UTF-8 char boundary in `s`.
///
/// Slicing a `str` (`&s[a..b]`) panics when `a` or `b` lands in the middle of
/// a multi-byte character, and text imported from Postgres/user data routinely
/// contains such characters (`\u{202f}` narrow no-break space, en/em dashes,
/// ...). Any byte offset produced by arithmetic must therefore be snapped to a
/// boundary before it is used as a slice bound.
///
/// Returns:
/// - `idx` itself when it already sits on a char boundary,
/// - otherwise the byte offset where the next character starts,
/// - `s.len()` when `idx` is at or beyond the end of `s` (out-of-range
///   offsets are clamped rather than causing a panic).
fn ceil_char_boundary(s: &str, idx: usize) -> usize {
    // `s.len()` is always a char boundary, so it serves as the fallback both
    // for an out-of-range `idx` (the range is empty) and for an offset inside
    // the final character (no boundary exists before the end).
    (idx..s.len())
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(s.len())
}
+ let max_chunks = text.len(); + let mut iterations = 0; - // Try to break at a sentence or paragraph boundary - let chunk_text = &text[start..end]; + while start < text.len() { + iterations += 1; + if iterations > max_chunks { + tracing::warn!( + "chunker: exceeded safety cap ({}) on doc '{}' of length {} — stopping", + max_chunks, doc_id, text.len(), + ); + break; + } + + // UTF-8 safe upper bound: move forward to a char boundary. + let end = ceil_char_boundary(text, (start + chunk_size).min(text.len())); + + let chunk_text_slice = &text[start..end]; let actual_end = if end < text.len() { - // Look for last sentence boundary in the chunk - if let Some(pos) = chunk_text.rfind(". ") { + // Look for last sentence boundary inside the slice. + if let Some(pos) = chunk_text_slice.rfind(". ") { start + pos + 2 - } else if let Some(pos) = chunk_text.rfind('\n') { + } else if let Some(pos) = chunk_text_slice.rfind('\n') { + start + pos + 1 + } else if let Some(pos) = chunk_text_slice.rfind(' ') { start + pos + 1 } else { - // Fall back to word boundary - if let Some(pos) = chunk_text.rfind(' ') { - start + pos + 1 - } else { - end - } + end } } else { end }; + // All the rfind'd positions are at char boundaries (they match + // ASCII byte patterns), so actual_end should already be safe. + // Defensive belt-and-suspenders: round up anyway. + let actual_end = ceil_char_boundary(text, actual_end); let chunk = text[start..actual_end].trim(); if !chunk.is_empty() { @@ -78,11 +113,24 @@ pub fn chunk_text( idx += 1; } - // Advance with overlap + // Advance with overlap — must strictly progress to avoid infinite + // loops on degenerate inputs (e.g. chunk_size ≤ overlap, or + // boundary-finding returning a position at `start`). 
if actual_end >= text.len() { break; } - start = if actual_end > overlap { actual_end - overlap } else { actual_end }; + let tentative = if actual_end > overlap { actual_end - overlap } else { actual_end }; + let next_start = ceil_char_boundary(text, tentative); + if next_start <= start { + // No progress — force forward one char to guarantee termination. + let mut forced = start + 1; + while forced < text.len() && !text.is_char_boundary(forced) { + forced += 1; + } + start = forced; + } else { + start = next_start; + } } chunks @@ -132,4 +180,45 @@ mod tests { assert_eq!(chunks[0].source, "candidates"); assert_eq!(chunks[0].doc_id, "CAND-001"); } + + #[test] + fn handles_multibyte_utf8_at_chunk_boundary() { + // Regression: multi-byte unicode (narrow no-break space \u{202f}, + // 3 bytes) landing inside a chunk-size window used to panic with + // "byte index N is not a char boundary". Found on pg-imported + // data that contained editorial unicode punctuation. + let text = "**Claim 1**\u{202f}\u{2013} *Each large language model has its own personality \ + and way of interacting.*\u{202f}\u{2013}\u{202f}Verdict:\u{202f}VERIFIED. \ + Reasoning: widely accepted observation that different models exhibit distinct response \ + styles stemming from variations in training data.".repeat(5); + let chunks = chunk_text(&text, "test", "unicode-1", 500, 50); + // Must not panic, must produce at least one chunk. + assert!(!chunks.is_empty()); + // All chunks must be valid UTF-8 substrings of the original. + for chunk in &chunks { + assert!(!chunk.text.is_empty()); + assert!(text.contains(&chunk.text)); + } + } + + #[test] + fn no_infinite_loop_on_no_spaces() { + // Long text with no spaces, sentences, or newlines — chunker must + // still advance via the hard `end` fallback. 
+ let text = "a".repeat(5000); + let chunks = chunk_text(&text, "test", "noSpace", 500, 50); + assert!(!chunks.is_empty()); + // Expected ~5000/450 chunks (with overlap=50 → step=450) + assert!(chunks.len() < 50, "chunk count {} seems excessive", chunks.len()); + } + + #[test] + fn no_infinite_loop_on_degenerate_params() { + // chunk_size equals overlap — previously would have made zero + // forward progress. Our safety forced-advance must catch this. + let text = "a b c d e f g h i j k l m n o p q r s t u v w x y z".repeat(20); + let chunks = chunk_text(&text, "test", "degen", 100, 100); + assert!(!chunks.is_empty()); + assert!(chunks.len() <= text.len(), "runaway chunk count {}", chunks.len()); + } }