/// Text chunking strategies for embedding. /// Chunks need to be small enough for the embedding model (typically <512 tokens) /// but large enough to carry meaning. /// A chunk of text with metadata pointing back to its source. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] pub struct TextChunk { /// Source identifier (dataset name, filename, etc.) pub source: String, /// Row or document ID within the source pub doc_id: String, /// Chunk index within the document (0, 1, 2, ...) pub chunk_idx: u32, /// The actual text content pub text: String, } /// Round `idx` UP to the nearest UTF-8 char boundary in `s`. /// Needed because `&str[a..b]` panics if `a` or `b` lands mid-character — /// and any text imported from Postgres/user data will contain multi-byte /// chars like `\u{202f}` (narrow no-break space), em-dashes, etc. fn ceil_char_boundary(s: &str, mut idx: usize) -> usize { if idx >= s.len() { return s.len(); } while idx <= s.len() && !s.is_char_boundary(idx) { idx += 1; } idx } /// Split text into overlapping chunks. /// - `chunk_size`: target bytes per chunk (UTF-8 boundary-respecting) /// - `overlap`: bytes of overlap between consecutive chunks /// /// Safety contract: /// - Never slices at a byte that isn't a UTF-8 char boundary /// - Always advances forward (monotonic `start`) — no infinite loops on /// pathological inputs /// - Bounded iteration — caps total chunks at `text.len()` to prevent /// runaway allocation on degenerate text/parameter combinations pub fn chunk_text( text: &str, source: &str, doc_id: &str, chunk_size: usize, overlap: usize, ) -> Vec { let text = text.trim(); if text.is_empty() { return vec![]; } // Short text — single chunk if text.len() <= chunk_size { return vec![TextChunk { source: source.to_string(), doc_id: doc_id.to_string(), chunk_idx: 0, text: text.to_string(), }]; } let mut chunks = Vec::new(); let mut start = 0; let mut idx = 0u32; // Safety cap: a well-formed chunk_size advances `start` by at least // `chunk_size - overlap` each iteration. Even in degenerate cases we // should never emit more chunks than there are bytes of text. let max_chunks = text.len(); let mut iterations = 0; while start < text.len() { iterations += 1; if iterations > max_chunks { tracing::warn!( "chunker: exceeded safety cap ({}) on doc '{}' of length {} — stopping", max_chunks, doc_id, text.len(), ); break; } // UTF-8 safe upper bound: move forward to a char boundary. let end = ceil_char_boundary(text, (start + chunk_size).min(text.len())); let chunk_text_slice = &text[start..end]; let actual_end = if end < text.len() { // Look for last sentence boundary inside the slice. if let Some(pos) = chunk_text_slice.rfind(". ") { start + pos + 2 } else if let Some(pos) = chunk_text_slice.rfind('\n') { start + pos + 1 } else if let Some(pos) = chunk_text_slice.rfind(' ') { start + pos + 1 } else { end } } else { end }; // All the rfind'd positions are at char boundaries (they match // ASCII byte patterns), so actual_end should already be safe. // Defensive belt-and-suspenders: round up anyway. let actual_end = ceil_char_boundary(text, actual_end); let chunk = text[start..actual_end].trim(); if !chunk.is_empty() { chunks.push(TextChunk { source: source.to_string(), doc_id: doc_id.to_string(), chunk_idx: idx, text: chunk.to_string(), }); idx += 1; } // Advance with overlap — must strictly progress to avoid infinite // loops on degenerate inputs (e.g. chunk_size ≤ overlap, or // boundary-finding returning a position at `start`). if actual_end >= text.len() { break; } let tentative = if actual_end > overlap { actual_end - overlap } else { actual_end }; let next_start = ceil_char_boundary(text, tentative); if next_start <= start { // No progress — force forward one char to guarantee termination. let mut forced = start + 1; while forced < text.len() && !text.is_char_boundary(forced) { forced += 1; } start = forced; } else { start = next_start; } } chunks } /// Chunk a dataset's text column. Returns all chunks from all rows. pub fn chunk_column( source: &str, doc_ids: &[String], texts: &[String], chunk_size: usize, overlap: usize, ) -> Vec { let mut all_chunks = Vec::new(); for (doc_id, text) in doc_ids.iter().zip(texts.iter()) { let chunks = chunk_text(text, source, doc_id, chunk_size, overlap); all_chunks.extend(chunks); } all_chunks } #[cfg(test)] mod tests { use super::*; #[test] fn short_text_single_chunk() { let chunks = chunk_text("Hello world", "test", "1", 500, 50); assert_eq!(chunks.len(), 1); assert_eq!(chunks[0].text, "Hello world"); } #[test] fn long_text_multiple_chunks() { let text = "First sentence. Second sentence. Third sentence. Fourth sentence. Fifth sentence. Sixth sentence. Seventh sentence. Eighth sentence."; let chunks = chunk_text(text, "test", "1", 50, 10); assert!(chunks.len() > 1); // All text should be covered for chunk in &chunks { assert!(!chunk.text.is_empty()); } } #[test] fn chunk_preserves_source() { let chunks = chunk_text("Some text here", "candidates", "CAND-001", 500, 50); assert_eq!(chunks[0].source, "candidates"); assert_eq!(chunks[0].doc_id, "CAND-001"); } #[test] fn handles_multibyte_utf8_at_chunk_boundary() { // Regression: multi-byte unicode (narrow no-break space \u{202f}, // 3 bytes) landing inside a chunk-size window used to panic with // "byte index N is not a char boundary". Found on pg-imported // data that contained editorial unicode punctuation. let text = "**Claim 1**\u{202f}\u{2013} *Each large language model has its own personality \ and way of interacting.*\u{202f}\u{2013}\u{202f}Verdict:\u{202f}VERIFIED. \ Reasoning: widely accepted observation that different models exhibit distinct response \ styles stemming from variations in training data.".repeat(5); let chunks = chunk_text(&text, "test", "unicode-1", 500, 50); // Must not panic, must produce at least one chunk. assert!(!chunks.is_empty()); // All chunks must be valid UTF-8 substrings of the original. for chunk in &chunks { assert!(!chunk.text.is_empty()); assert!(text.contains(&chunk.text)); } } #[test] fn no_infinite_loop_on_no_spaces() { // Long text with no spaces, sentences, or newlines — chunker must // still advance via the hard `end` fallback. let text = "a".repeat(5000); let chunks = chunk_text(&text, "test", "noSpace", 500, 50); assert!(!chunks.is_empty()); // Expected ~5000/450 chunks (with overlap=50 → step=450) assert!(chunks.len() < 50, "chunk count {} seems excessive", chunks.len()); } #[test] fn no_infinite_loop_on_degenerate_params() { // chunk_size equals overlap — previously would have made zero // forward progress. Our safety forced-advance must catch this. let text = "a b c d e f g h i j k l m n o p q r s t u v w x y z".repeat(20); let chunks = chunk_text(&text, "test", "degen", 100, 100); assert!(!chunks.is_empty()); assert!(chunks.len() <= text.len(), "runaway chunk count {}", chunks.len()); } }