The chunker's &text[start..end] slice could land inside a multi-byte
UTF-8 character (e.g. narrow no-break space \u{202f}, em-dashes, smart
quotes — universal in pg-imported editorial data). Rust panics on
non-boundary string slicing. In the refresh path that panic is caught
by tokio's task machinery, yet the net effect is linear memory growth at
~540MB/s until the process is OOM-killed past 120GB.
Root cause: chunk boundaries computed by byte arithmetic without
checking is_char_boundary(). The existing "look for last sentence / \n
/ space" logic finds ASCII-safe positions, but the *primary* `end`
calculation `(start + chunk_size).min(text.len())` lands wherever.
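A minimal sketch of the failure mode (hypothetical text and sizes, not the
production data; the string fragment mirrors the regression-test input):
byte arithmetic that ignores char boundaries panics the moment the computed
end lands inside a multi-byte character.

    fn main() {
        let text = "Verdict:\u{202f}VERIFIED"; // \u{202f} = 3 bytes, offsets 8..11
        let (start, chunk_size) = (0usize, 9usize);
        let end = (start + chunk_size).min(text.len()); // = 9, mid-character
        assert!(!text.is_char_boundary(end));
        let _chunk = &text[start..end]; // panics: "byte index 9 is not a char boundary"
    }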
Fix:
- ceil_char_boundary(s, idx) — forward-scan to the nearest valid
UTF-8 char boundary. Used at end, actual_end, and next_start; its
rounding behavior is sketched below.
- Iteration cap — break if iterations exceed text.len(). Any
non-progressing loop dies safely instead of burning memory.
- Forced forward advance — if overlap + boundary math produce a
next_start <= start, force +1 char to guarantee termination.
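For reference, a sketch of the intended rounding behavior (hypothetical
input string; it exercises the ceil_char_boundary helper defined in the
file below and is not one of the regression tests listed further down):

    #[test]
    fn ceil_char_boundary_rounds_forward() {
        let text = "Verdict:\u{202f}VERIFIED";                 // \u{202f} occupies bytes 8..11
        assert_eq!(ceil_char_boundary(text, 9), 11);           // mid-character: round forward
        assert_eq!(ceil_char_boundary(text, 8), 8);            // already a boundary: unchanged
        assert_eq!(ceil_char_boundary(text, 999), text.len()); // past the end: clamp to len
    }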
Reproduced on kb_team_runs (585 pg-imported prompts with editorial
unicode): the previous run grew memory linearly to 124GB over 240s
before being OOM-killed. Same request after the fix: peaks at <100MB,
completes in
~4m42s to produce 12,693 embeddings. /vectors/search returns
relevant results.
Regression tests added:
- handles_multibyte_utf8_at_chunk_boundary — exact \u{202f} repro
- no_infinite_loop_on_no_spaces — 5KB text, no whitespace
- no_infinite_loop_on_degenerate_params — chunk_size == overlap
Surfaced by Phase C, but pre-existed as a latent bug since Phase 7.
Any Ollama-targeted RAG corpus with non-ASCII content would have hit
this as soon as any single document grew past ~13KB.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
//! Text chunking strategies for embedding.
//! Chunks need to be small enough for the embedding model (typically <512 tokens)
//! but large enough to carry meaning.

/// A chunk of text with metadata pointing back to its source.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct TextChunk {
    /// Source identifier (dataset name, filename, etc.)
    pub source: String,
    /// Row or document ID within the source
    pub doc_id: String,
    /// Chunk index within the document (0, 1, 2, ...)
    pub chunk_idx: u32,
    /// The actual text content
    pub text: String,
}

/// Round `idx` UP to the nearest UTF-8 char boundary in `s`.
/// Needed because `&str[a..b]` panics if `a` or `b` lands mid-character —
/// and any text imported from Postgres/user data will contain multi-byte
/// chars like `\u{202f}` (narrow no-break space), em-dashes, etc.
fn ceil_char_boundary(s: &str, mut idx: usize) -> usize {
    if idx >= s.len() { return s.len(); }
    while idx <= s.len() && !s.is_char_boundary(idx) {
        idx += 1;
    }
    idx
}

/// Split text into overlapping chunks.
/// - `chunk_size`: target bytes per chunk (UTF-8 boundary-respecting)
/// - `overlap`: bytes of overlap between consecutive chunks
///
/// Safety contract:
/// - Never slices at a byte that isn't a UTF-8 char boundary
/// - Always advances forward (monotonic `start`) — no infinite loops on
///   pathological inputs
/// - Bounded iteration — caps total chunks at `text.len()` to prevent
///   runaway allocation on degenerate text/parameter combinations
pub fn chunk_text(
    text: &str,
    source: &str,
    doc_id: &str,
    chunk_size: usize,
    overlap: usize,
) -> Vec<TextChunk> {
    let text = text.trim();
    if text.is_empty() {
        return vec![];
    }

    // Short text — single chunk
    if text.len() <= chunk_size {
        return vec![TextChunk {
            source: source.to_string(),
            doc_id: doc_id.to_string(),
            chunk_idx: 0,
            text: text.to_string(),
        }];
    }

    let mut chunks = Vec::new();
    let mut start = 0;
    let mut idx = 0u32;

    // Safety cap: a well-formed chunk_size advances `start` by at least
    // `chunk_size - overlap` each iteration. Even in degenerate cases we
    // should never emit more chunks than there are bytes of text.
    let max_chunks = text.len();
    let mut iterations = 0;

    while start < text.len() {
        iterations += 1;
        if iterations > max_chunks {
            tracing::warn!(
                "chunker: exceeded safety cap ({}) on doc '{}' of length {} — stopping",
                max_chunks, doc_id, text.len(),
            );
            break;
        }

        // UTF-8 safe upper bound: move forward to a char boundary.
        let end = ceil_char_boundary(text, (start + chunk_size).min(text.len()));

        let chunk_text_slice = &text[start..end];
        let actual_end = if end < text.len() {
            // Look for last sentence boundary inside the slice.
            if let Some(pos) = chunk_text_slice.rfind(". ") {
                start + pos + 2
            } else if let Some(pos) = chunk_text_slice.rfind('\n') {
                start + pos + 1
            } else if let Some(pos) = chunk_text_slice.rfind(' ') {
                start + pos + 1
            } else {
                end
            }
        } else {
            end
        };
        // All the rfind'd positions are at char boundaries (they match
        // ASCII byte patterns), so actual_end should already be safe.
        // Defensive belt-and-suspenders: round up anyway.
        let actual_end = ceil_char_boundary(text, actual_end);

        let chunk = text[start..actual_end].trim();
        if !chunk.is_empty() {
            chunks.push(TextChunk {
                source: source.to_string(),
                doc_id: doc_id.to_string(),
                chunk_idx: idx,
                text: chunk.to_string(),
            });
            idx += 1;
        }

        // Advance with overlap — must strictly progress to avoid infinite
        // loops on degenerate inputs (e.g. chunk_size ≤ overlap, or
        // boundary-finding returning a position at `start`).
        if actual_end >= text.len() {
            break;
        }
        let tentative = if actual_end > overlap { actual_end - overlap } else { actual_end };
        let next_start = ceil_char_boundary(text, tentative);
        if next_start <= start {
            // No progress — force forward one char to guarantee termination.
            let mut forced = start + 1;
            while forced < text.len() && !text.is_char_boundary(forced) {
                forced += 1;
            }
            start = forced;
        } else {
            start = next_start;
        }
    }

    chunks
}

/// Chunk a dataset's text column. Returns all chunks from all rows.
pub fn chunk_column(
    source: &str,
    doc_ids: &[String],
    texts: &[String],
    chunk_size: usize,
    overlap: usize,
) -> Vec<TextChunk> {
    let mut all_chunks = Vec::new();
    for (doc_id, text) in doc_ids.iter().zip(texts.iter()) {
        let chunks = chunk_text(text, source, doc_id, chunk_size, overlap);
        all_chunks.extend(chunks);
    }
    all_chunks
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn short_text_single_chunk() {
        let chunks = chunk_text("Hello world", "test", "1", 500, 50);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "Hello world");
    }

    #[test]
    fn long_text_multiple_chunks() {
        let text = "First sentence. Second sentence. Third sentence. Fourth sentence. Fifth sentence. Sixth sentence. Seventh sentence. Eighth sentence.";
        let chunks = chunk_text(text, "test", "1", 50, 10);
        assert!(chunks.len() > 1);
        // All text should be covered
        for chunk in &chunks {
            assert!(!chunk.text.is_empty());
        }
    }

    #[test]
    fn chunk_preserves_source() {
        let chunks = chunk_text("Some text here", "candidates", "CAND-001", 500, 50);
        assert_eq!(chunks[0].source, "candidates");
        assert_eq!(chunks[0].doc_id, "CAND-001");
    }

    #[test]
    fn handles_multibyte_utf8_at_chunk_boundary() {
        // Regression: multi-byte unicode (narrow no-break space \u{202f},
        // 3 bytes) landing inside a chunk-size window used to panic with
        // "byte index N is not a char boundary". Found on pg-imported
        // data that contained editorial unicode punctuation.
        let text = "**Claim 1**\u{202f}\u{2013} *Each large language model has its own personality \
            and way of interacting.*\u{202f}\u{2013}\u{202f}Verdict:\u{202f}VERIFIED. \
            Reasoning: widely accepted observation that different models exhibit distinct response \
            styles stemming from variations in training data.".repeat(5);
        let chunks = chunk_text(&text, "test", "unicode-1", 500, 50);
        // Must not panic, must produce at least one chunk.
        assert!(!chunks.is_empty());
        // All chunks must be valid UTF-8 substrings of the original.
        for chunk in &chunks {
            assert!(!chunk.text.is_empty());
            assert!(text.contains(&chunk.text));
        }
    }

    #[test]
    fn no_infinite_loop_on_no_spaces() {
        // Long text with no spaces, sentences, or newlines — chunker must
        // still advance via the hard `end` fallback.
        let text = "a".repeat(5000);
        let chunks = chunk_text(&text, "test", "noSpace", 500, 50);
        assert!(!chunks.is_empty());
        // Expected ~5000/450 chunks (with overlap=50 → step=450)
        assert!(chunks.len() < 50, "chunk count {} seems excessive", chunks.len());
    }

    #[test]
    fn no_infinite_loop_on_degenerate_params() {
        // chunk_size equals overlap — previously would have made zero
        // forward progress. Our safety forced-advance must catch this.
        let text = "a b c d e f g h i j k l m n o p q r s t u v w x y z".repeat(20);
        let chunks = chunk_text(&text, "test", "degen", 100, 100);
        assert!(!chunks.is_empty());
        assert!(chunks.len() <= text.len(), "runaway chunk count {}", chunks.len());
    }
}