diff --git a/crates/ingestd/src/pdf_ingest.rs b/crates/ingestd/src/pdf_ingest.rs
index c087e80..7ea7c11 100644
--- a/crates/ingestd/src/pdf_ingest.rs
+++ b/crates/ingestd/src/pdf_ingest.rs
@@ -4,8 +4,19 @@ use arrow::record_batch::RecordBatch;
 use std::sync::Arc;
 
 /// Extract text from a PDF file.
-/// Returns one row per page: (page_number, text_content).
-/// This handles text-based PDFs. Scanned/image PDFs need OCR (not implemented yet).
+///
+/// Two-tier approach:
+/// 1. **Text extraction (lopdf)** — fast, works on digital/native PDFs.
+///    Returns one row per page: (source_file, page_number, text_content).
+/// 2. **OCR fallback (Tesseract)** — if text extraction yields zero
+///    pages (common with scanned documents, image-only PDFs, or PDFs
+///    exported from scanners without an OCR layer), we extract embedded
+///    images from the PDF and run Tesseract on each. Falls back
+///    gracefully if Tesseract isn't installed (returns an error
+///    directing the operator to install it).
+///
+/// Schema is identical for both paths so downstream consumers (chunker,
+/// vectord, queryd) don't need to know how the text was produced.
 pub fn parse_pdf(content: &[u8], source_filename: &str) -> Result<(Arc<Schema>, Vec<RecordBatch>), String> {
     let doc = lopdf::Document::load_mem(content)
         .map_err(|e| format!("PDF load error: {e}"))?;
@@ -26,13 +37,35 @@ pub fn parse_pdf(content: &[u8], source_filename: &str) -> Result<(Arc<Schema>,
         }
     }
 
+    // Tier 2: OCR fallback for scanned pages.
     if page_numbers.is_empty() {
-        // PDF has no extractable text — likely scanned/image
-        return Err("PDF contains no extractable text (may be scanned/image — OCR not yet supported)".into());
+        tracing::info!(
+            "no extractable text in '{}' — attempting OCR on {} pages",
+            source_filename, pages.len(),
+        );
+        let ocr_results = ocr_pdf_images(&doc, source_filename)?;
+        if ocr_results.is_empty() {
+            return Err(
+                "PDF contains no extractable text and OCR produced no output: no embedded \
+                 images were found, or Tesseract failed on / recognized no text in them."
+                    .into(),
+            );
+        }
+        for (pg, text) in ocr_results {
+            page_numbers.push(pg);
+            page_texts.push(text);
+            sources.push(source_filename.to_string());
+        }
+        tracing::info!(
+            "OCR recovered {} pages from '{}'", page_numbers.len(), source_filename,
+        );
+    } else {
+        tracing::info!(
+            "extracted {} pages with text from PDF '{}'",
+            page_numbers.len(), source_filename,
+        );
     }
 
-    tracing::info!("extracted {} pages with text from PDF '{}'", page_numbers.len(), source_filename);
-
     let schema = Arc::new(Schema::new(vec![
         Field::new("source_file", DataType::Utf8, false),
         Field::new("page_number", DataType::Int32, false),
@@ -50,3 +83,290 @@ pub fn parse_pdf(content: &[u8], source_filename: &str) -> Result<(Arc<Schema>,
 
     Ok((schema, vec![batch]))
 }
+
+/// Extract embedded images from the PDF, run Tesseract on each, and
+/// collect the recognized text.
+///
+/// Walks every page, pulls the image XObjects reachable from the page's
+/// resources (`extract_page_images`), OCRs each image through a temp
+/// file (`ocr_image_bytes`), and joins the per-image text with newlines.
+/// A failure on one image is logged and skipped so a single unreadable
+/// image does not abort the whole document.
+///
+/// # Returns
+/// `(page_number, text)` pairs for pages that yielded non-empty OCR
+/// output. Pages with no images or no recognized text are omitted.
+///
+/// # Errors
+/// Returns `Err` if `tesseract` cannot be run from `PATH`.
+fn ocr_pdf_images(
+    doc: &lopdf::Document,
+    source_filename: &str,
+) -> Result<Vec<(i32, String)>, String> {
+    if !tesseract_available() {
+        return Err(
+            "tesseract not found on PATH — install with `apt install tesseract-ocr tesseract-ocr-eng`"
+                .into(),
+        );
+    }
+
+    let pages = doc.get_pages();
+    let mut results: Vec<(i32, String)> = Vec::new();
+
+    for (&page_num, &page_id) in &pages {
+        // The output schema stores page numbers as Int32; skip (and log)
+        // rather than wrap silently if a page number does not fit.
+        let page_no = match i32::try_from(page_num) {
+            Ok(n) => n,
+            Err(_) => {
+                tracing::warn!(
+                    "skipping OCR of '{}' page {}: page number exceeds i32",
+                    source_filename, page_num,
+                );
+                continue;
+            }
+        };
+
+        let images = extract_page_images(doc, page_id);
+        if images.is_empty() {
+            continue;
+        }
+
+        // OCR each image on this page and concatenate the text.
+        let mut page_text = String::new();
+        for (idx, img_bytes) in images.iter().enumerate() {
+            match ocr_image_bytes(img_bytes, page_num, idx) {
+                Ok(text) => {
+                    let text = text.trim();
+                    if text.is_empty() {
+                        continue; // empty OCR — skip
+                    }
+                    if !page_text.is_empty() {
+                        page_text.push('\n');
+                    }
+                    page_text.push_str(text);
+                }
+                Err(e) => {
+                    tracing::warn!(
+                        "OCR failed on '{}' page {} image {}: {e}",
+                        source_filename, page_num, idx,
+                    );
+                }
+            }
+        }
+        if !page_text.is_empty() {
+            results.push((page_no, page_text));
+        }
+    }
+    Ok(results)
+}
+
+/// Collect the bytes of every image XObject reachable from a page.
+///
+/// Resolves the page's /Resources → /XObject dictionary and returns the
+/// content of each stream whose /Subtype is /Image. /Resources is an
+/// inheritable page attribute, so when the page dictionary has none the
+/// lookup follows /Parent links up the page tree.
+///
+/// Any lookup failure (missing object, wrong type, no XObjects) yields an
+/// empty vector rather than an error.
+///
+/// NOTE(review): the bytes are whatever lopdf hands back — decoded content
+/// when `decompressed_content()` succeeds, the raw stream otherwise. Only
+/// payloads that are already a complete image file (e.g. JPEG data) are
+/// directly readable by Tesseract; decoded pixel data is not wrapped in an
+/// image container here. Confirm coverage against real scanned PDFs.
+fn extract_page_images(
+    doc: &lopdf::Document,
+    page_id: lopdf::ObjectId,
+) -> Vec<Vec<u8>> {
+    // Upper bound on /Parent hops; guards against cyclic page trees in
+    // malformed PDFs.
+    const MAX_PARENT_DEPTH: usize = 32;
+
+    let mut images = Vec::new();
+
+    let mut node = match doc.get_object(page_id).ok().and_then(|o| o.as_dict().ok()) {
+        Some(d) => d,
+        None => return images,
+    };
+
+    // Resolve /Resources, walking up /Parent when the page inherits it.
+    let mut resources = None;
+    for _ in 0..MAX_PARENT_DEPTH {
+        resources = node
+            .get(b"Resources")
+            .ok()
+            .and_then(|r| doc.dereference(r).ok())
+            .and_then(|(_, o)| o.as_dict().ok());
+        if resources.is_some() {
+            break;
+        }
+        node = match node
+            .get(b"Parent")
+            .ok()
+            .and_then(|p| doc.dereference(p).ok())
+            .and_then(|(_, o)| o.as_dict().ok())
+        {
+            Some(parent) => parent,
+            None => break,
+        };
+    }
+
+    // Without a resolvable /XObject dictionary there is nothing to extract.
+    let xobj_dict = match resources
+        .and_then(|res| res.get(b"XObject").ok())
+        .and_then(|x| doc.dereference(x).ok())
+        .and_then(|(_, o)| o.as_dict().ok())
+    {
+        Some(d) => d,
+        None => return images,
+    };
+
+    for (_name, obj_ref) in xobj_dict.iter() {
+        // Borrow the stream in place — cloning the object would copy the
+        // whole image payload just to inspect it.
+        let stream = match doc
+            .dereference(obj_ref)
+            .ok()
+            .and_then(|(_, o)| o.as_stream().ok())
+        {
+            Some(s) => s,
+            None => continue,
+        };
+        // Check /Subtype == /Image
+        let is_image = stream
+            .dict
+            .get(b"Subtype")
+            .ok()
+            .and_then(|s| s.as_name().ok())
+            .map(|n| n == b"Image")
+            .unwrap_or(false);
+        if !is_image {
+            continue;
+        }
+        // Prefer decoded content; fall back to the raw bytes when lopdf
+        // cannot decode the stream (some DCT streams are stored raw).
+        let content = match stream.decompressed_content() {
+            Ok(c) => c,
+            Err(_) => stream.content.clone(),
+        };
+        if !content.is_empty() {
+            images.push(content);
+        }
+    }
+    images
+}
+
+/// Run Tesseract on one image and return the recognized text.
+///
+/// Writes `img_bytes` to a uniquely named file in the system temp
+/// directory, shells out to `tesseract`, reads the `.txt` file it
+/// produces, and removes both temp files whatever the outcome.
+///
+/// `page_num` and `img_idx` only label the temp files.
+///
+/// # Errors
+/// Returns `Err` if the temp image cannot be created or written, if
+/// `tesseract` cannot be started or exits unsuccessfully (its stderr is
+/// included in the message), or if its output file cannot be read.
+fn ocr_image_bytes(
+    img_bytes: &[u8],
+    page_num: u32,
+    img_idx: usize,
+) -> Result<String, String> {
+    use std::io::Write;
+    use std::process::{Command, Stdio};
+    use std::sync::atomic::{AtomicU64, Ordering};
+
+    // Process-wide sequence number. Combined with the PID it keeps temp
+    // names unique, so concurrent ingests of different PDFs (which reuse
+    // the same page/image indices) cannot clobber each other's files.
+    static OCR_TMP_SEQ: AtomicU64 = AtomicU64::new(0);
+
+    // Detect image format from magic bytes to pick the right extension.
+    let ext = if img_bytes.starts_with(&[0xFF, 0xD8]) {
+        "jpg"
+    } else if img_bytes.starts_with(b"II") || img_bytes.starts_with(b"MM") {
+        "tiff"
+    } else {
+        // PNG, and also the fallback for unrecognized data: if Tesseract
+        // can't read the file it exits non-zero and we report the error.
+        "png"
+    };
+
+    let seq = OCR_TMP_SEQ.fetch_add(1, Ordering::Relaxed);
+    let stem = format!(
+        "lh_ocr_{}_{seq}_p{page_num}_i{img_idx}",
+        std::process::id(),
+    );
+    let tmp_dir = std::env::temp_dir();
+    let img_path = tmp_dir.join(format!("{stem}.{ext}"));
+    let out_base = tmp_dir.join(format!("{stem}_out"));
+    let out_path = tmp_dir.join(format!("{stem}_out.txt"));
+
+    // `create_new` fails rather than truncating (or following a link to)
+    // anything that already exists under this name in the shared temp dir.
+    let mut f = std::fs::OpenOptions::new()
+        .write(true)
+        .create_new(true)
+        .open(&img_path)
+        .map_err(|e| format!("create temp image: {e}"))?;
+    if let Err(e) = f.write_all(img_bytes) {
+        drop(f);
+        let _ = std::fs::remove_file(&img_path);
+        return Err(format!("write temp image: {e}"));
+    }
+    drop(f);
+
+    // Run Tesseract, capturing stderr so failures can be diagnosed.
+    let run = Command::new("tesseract")
+        .arg(&img_path)
+        .arg(&out_base)
+        .arg("--oem").arg("3") // default engine mode (based on what is available)
+        .arg("--psm").arg("6") // assume uniform block of text
+        .arg("-l").arg("eng")
+        .stdout(Stdio::null())
+        .stderr(Stdio::piped())
+        .output();
+
+    // Cleanup input regardless of outcome.
+    let _ = std::fs::remove_file(&img_path);
+
+    let output = match run {
+        Ok(o) => o,
+        Err(e) => {
+            let _ = std::fs::remove_file(&out_path);
+            return Err(format!("tesseract exec: {e}"));
+        }
+    };
+    if !output.status.success() {
+        let _ = std::fs::remove_file(&out_path);
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        return Err(format!(
+            "tesseract failed ({}): {}",
+            output.status,
+            stderr.trim(),
+        ));
+    }
+
+    // Surface read failures instead of reporting them as "no text".
+    let text = std::fs::read_to_string(&out_path)
+        .map_err(|e| format!("read tesseract output: {e}"));
+    let _ = std::fs::remove_file(&out_path);
+    text
+}
+
+/// Quick check: is `tesseract` on PATH?
+///
+/// Runs `tesseract --version` with its output discarded and returns
+/// `true` only if the process could be started and exited successfully.
+fn tesseract_available() -> bool {
+    std::process::Command::new("tesseract")
+        .arg("--version")
+        .stdout(std::process::Stdio::null())
+        .stderr(std::process::Stdio::null())
+        .status()
+        .map(|s| s.success())
+        .unwrap_or(false)
+}