The chunker's &text[start..end] slice could land inside a multi-byte
UTF-8 character (e.g. narrow no-break space \u{202f}, em-dashes, smart
quotes — universal in pg-imported editorial data). Rust panics on
non-boundary string slicing. In the refresh path that panic is caught
by tokio's task machinery, yet the net effect is linear memory growth at
~540MB/s until the process is OOM-killed past 120GB.
Root cause: chunk boundaries computed by byte arithmetic without
checking is_char_boundary(). The existing "look for last sentence / \n
/ space" logic finds ASCII-safe positions, but the *primary* `end`
calculation `(start + chunk_size).min(text.len())` lands wherever.
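A minimal sketch of the failure mode (hypothetical text and sizes, not the
production data; the string fragment mirrors the regression-test input):
byte arithmetic that ignores char boundaries panics the moment the computed
end lands inside a multi-byte character.

    fn main() {
        let text = "Verdict:\u{202f}VERIFIED"; // \u{202f} = 3 bytes, offsets 8..11
        let (start, chunk_size) = (0usize, 9usize);
        let end = (start + chunk_size).min(text.len()); // = 9, mid-character
        assert!(!text.is_char_boundary(end));
        let _chunk = &text[start..end]; // panics: "byte index 9 is not a char boundary"
    }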
Fix:
- ceil_char_boundary(s, idx) — forward-scan to the nearest valid
UTF-8 char boundary. Used at end, actual_end, and next_start; its
rounding behavior is sketched below.
- Iteration cap — break if iterations exceed text.len(). Any
non-progressing loop dies safely instead of burning memory.
- Forced forward advance — if overlap + boundary math produce a
next_start <= start, force +1 char to guarantee termination.
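For reference, a sketch of the intended rounding behavior (hypothetical
input string; it exercises the ceil_char_boundary helper defined in the
file below and is not one of the regression tests listed further down):

    #[test]
    fn ceil_char_boundary_rounds_forward() {
        let text = "Verdict:\u{202f}VERIFIED";                 // \u{202f} occupies bytes 8..11
        assert_eq!(ceil_char_boundary(text, 9), 11);           // mid-character: round forward
        assert_eq!(ceil_char_boundary(text, 8), 8);            // already a boundary: unchanged
        assert_eq!(ceil_char_boundary(text, 999), text.len()); // past the end: clamp to len
    }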
Reproduced on kb_team_runs (585 pg-imported prompts with editorial
unicode): the previous run grew memory linearly to 124GB over 240s
before being OOM-killed. Same request after the fix: peaks at <100MB,
completes in
~4m42s to produce 12,693 embeddings. /vectors/search returns
relevant results.
Regression tests added:
- handles_multibyte_utf8_at_chunk_boundary — exact \u{202f} repro
- no_infinite_loop_on_no_spaces — 5KB text, no whitespace
- no_infinite_loop_on_degenerate_params — chunk_size == overlap
Surfaced by Phase C, but pre-existed as a latent bug since Phase 7.
Any Ollama-targeted RAG corpus with non-ASCII content would have hit
this as soon as any single document grew past ~13KB.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
//! Text chunking strategies for embedding.
//! Chunks need to be small enough for the embedding model (typically <512 tokens)
//! but large enough to carry meaning.

/// A chunk of text with metadata pointing back to its source.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct TextChunk {
    /// Source identifier (dataset name, filename, etc.)
    pub source: String,
    /// Row or document ID within the source
    pub doc_id: String,
    /// Chunk index within the document (0, 1, 2, ...)
    pub chunk_idx: u32,
    /// The actual text content
    pub text: String,
}

/// Round `idx` UP to the nearest UTF-8 char boundary in `s`.
/// Needed because `&str[a..b]` panics if `a` or `b` lands mid-character —
/// and any text imported from Postgres/user data will contain multi-byte
/// chars like `\u{202f}` (narrow no-break space), em-dashes, etc.
fn ceil_char_boundary(s: &str, mut idx: usize) -> usize {
    if idx >= s.len() { return s.len(); }
    while idx <= s.len() && !s.is_char_boundary(idx) {
        idx += 1;
    }
    idx
}

/// Split text into overlapping chunks.
/// - `chunk_size`: target bytes per chunk (UTF-8 boundary-respecting)
/// - `overlap`: bytes of overlap between consecutive chunks
///
/// Safety contract:
/// - Never slices at a byte that isn't a UTF-8 char boundary
/// - Always advances forward (monotonic `start`) — no infinite loops on
///   pathological inputs
/// - Bounded iteration — caps total chunks at `text.len()` to prevent
///   runaway allocation on degenerate text/parameter combinations
pub fn chunk_text(
    text: &str,
    source: &str,
    doc_id: &str,
    chunk_size: usize,
    overlap: usize,
) -> Vec<TextChunk> {
    let text = text.trim();
    if text.is_empty() {
        return vec![];
    }

    // Short text — single chunk
    if text.len() <= chunk_size {
        return vec![TextChunk {
            source: source.to_string(),
            doc_id: doc_id.to_string(),
            chunk_idx: 0,
            text: text.to_string(),
        }];
    }

    let mut chunks = Vec::new();
    let mut start = 0;
    let mut idx = 0u32;

    // Safety cap: a well-formed chunk_size advances `start` by at least
    // `chunk_size - overlap` each iteration. Even in degenerate cases we
    // should never emit more chunks than there are bytes of text.
    let max_chunks = text.len();
    let mut iterations = 0;

    while start < text.len() {
        iterations += 1;
        if iterations > max_chunks {
            tracing::warn!(
                "chunker: exceeded safety cap ({}) on doc '{}' of length {} — stopping",
                max_chunks, doc_id, text.len(),
            );
            break;
        }

        // UTF-8 safe upper bound: move forward to a char boundary.
        let end = ceil_char_boundary(text, (start + chunk_size).min(text.len()));

        let chunk_text_slice = &text[start..end];
        let actual_end = if end < text.len() {
            // Look for last sentence boundary inside the slice.
            if let Some(pos) = chunk_text_slice.rfind(". ") {
                start + pos + 2
            } else if let Some(pos) = chunk_text_slice.rfind('\n') {
                start + pos + 1
            } else if let Some(pos) = chunk_text_slice.rfind(' ') {
                start + pos + 1
            } else {
                end
            }
        } else {
            end
        };
        // All the rfind'd positions are at char boundaries (they match
        // ASCII byte patterns), so actual_end should already be safe.
        // Defensive belt-and-suspenders: round up anyway.
        let actual_end = ceil_char_boundary(text, actual_end);

        let chunk = text[start..actual_end].trim();
        if !chunk.is_empty() {
            chunks.push(TextChunk {
                source: source.to_string(),
                doc_id: doc_id.to_string(),
                chunk_idx: idx,
                text: chunk.to_string(),
            });
            idx += 1;
        }

        // Advance with overlap — must strictly progress to avoid infinite
        // loops on degenerate inputs (e.g. chunk_size ≤ overlap, or
        // boundary-finding returning a position at `start`).
        if actual_end >= text.len() {
            break;
        }
        let tentative = if actual_end > overlap { actual_end - overlap } else { actual_end };
        let next_start = ceil_char_boundary(text, tentative);
        if next_start <= start {
            // No progress — force forward one char to guarantee termination.
            let mut forced = start + 1;
            while forced < text.len() && !text.is_char_boundary(forced) {
                forced += 1;
            }
            start = forced;
        } else {
            start = next_start;
        }
    }

    chunks
}

/// Chunk a dataset's text column. Returns all chunks from all rows.
pub fn chunk_column(
    source: &str,
    doc_ids: &[String],
    texts: &[String],
    chunk_size: usize,
    overlap: usize,
) -> Vec<TextChunk> {
    let mut all_chunks = Vec::new();
    for (doc_id, text) in doc_ids.iter().zip(texts.iter()) {
        let chunks = chunk_text(text, source, doc_id, chunk_size, overlap);
        all_chunks.extend(chunks);
    }
    all_chunks
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn short_text_single_chunk() {
        let chunks = chunk_text("Hello world", "test", "1", 500, 50);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "Hello world");
    }

    #[test]
    fn long_text_multiple_chunks() {
        let text = "First sentence. Second sentence. Third sentence. Fourth sentence. Fifth sentence. Sixth sentence. Seventh sentence. Eighth sentence.";
        let chunks = chunk_text(text, "test", "1", 50, 10);
        assert!(chunks.len() > 1);
        // All text should be covered
        for chunk in &chunks {
            assert!(!chunk.text.is_empty());
        }
    }

    #[test]
    fn chunk_preserves_source() {
        let chunks = chunk_text("Some text here", "candidates", "CAND-001", 500, 50);
        assert_eq!(chunks[0].source, "candidates");
        assert_eq!(chunks[0].doc_id, "CAND-001");
    }

    #[test]
    fn handles_multibyte_utf8_at_chunk_boundary() {
        // Regression: multi-byte unicode (narrow no-break space \u{202f},
        // 3 bytes) landing inside a chunk-size window used to panic with
        // "byte index N is not a char boundary". Found on pg-imported
        // data that contained editorial unicode punctuation.
        let text = "**Claim 1**\u{202f}\u{2013} *Each large language model has its own personality \
            and way of interacting.*\u{202f}\u{2013}\u{202f}Verdict:\u{202f}VERIFIED. \
            Reasoning: widely accepted observation that different models exhibit distinct response \
            styles stemming from variations in training data.".repeat(5);
        let chunks = chunk_text(&text, "test", "unicode-1", 500, 50);
        // Must not panic, must produce at least one chunk.
        assert!(!chunks.is_empty());
        // All chunks must be valid UTF-8 substrings of the original.
        for chunk in &chunks {
            assert!(!chunk.text.is_empty());
            assert!(text.contains(&chunk.text));
        }
    }

    #[test]
    fn no_infinite_loop_on_no_spaces() {
        // Long text with no spaces, sentences, or newlines — chunker must
        // still advance via the hard `end` fallback.
        let text = "a".repeat(5000);
        let chunks = chunk_text(&text, "test", "noSpace", 500, 50);
        assert!(!chunks.is_empty());
        // Expected ~5000/450 chunks (with overlap=50 → step=450)
        assert!(chunks.len() < 50, "chunk count {} seems excessive", chunks.len());
    }

    #[test]
    fn no_infinite_loop_on_degenerate_params() {
        // chunk_size equals overlap — previously would have made zero
        // forward progress. Our safety forced-advance must catch this.
        let text = "a b c d e f g h i j k l m n o p q r s t u v w x y z".repeat(20);
        let chunks = chunk_text(&text, "test", "degen", 100, 100);
        assert!(!chunks.is_empty());
        assert!(chunks.len() <= text.len(), "runaway chunk count {}", chunks.len());
    }
}