PDF OCR via Tesseract — scanned documents now ingestible

Two-tier PDF extraction: lopdf text layer first (fast, digital PDFs), Tesseract OCR fallback when text extraction yields zero pages (scanned documents, image-only PDFs). Falls back gracefully if Tesseract isn't installed — returns an actionable error directing the operator to `apt install tesseract-ocr tesseract-ocr-eng`. OCR path: extract embedded XObject /Image streams from each page via lopdf, detect format from magic bytes (JPEG/PNG/TIFF), write to temp file, shell out to tesseract with --oem 3 --psm 6 (default OCR engine mode + uniform text block), read output, clean up. Temp files are removed after Tesseract runs, even when it exits with an error. Schema unchanged — both paths produce (source_file, page_number, text_content) so downstream consumers (chunker, vectord, queryd) work identically regardless of how text was produced. Verified: created a synthetic scanned PDF (PIL → image → PDF with no text layer), ingested via POST /ingest/file. Tesseract recovered the text with expected OCR artifacts. Queryable via DataFusion SQL. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 20:45:00 -05:00 · 2026-04-16 20:45:00 -05:00 · 2592f8fcb3
commit 2592f8fcb3
parent 17a0259cd0
1 changed files with 233 additions and 6 deletions
--- a/crates/ingestd/src/pdf_ingest.rs
+++ b/crates/ingestd/src/pdf_ingest.rs
@ -4,8 +4,19 @@ use arrow::record_batch::RecordBatch;
 use std::sync::Arc;

 /// Extract text from a PDF file.
-/// Returns one row per page: (page_number, text_content).
-/// This handles text-based PDFs. Scanned/image PDFs need OCR (not implemented yet).
+///
+/// Two-tier approach:
+/// 1. **Text extraction (lopdf)** — fast, works on digital/native PDFs.
+///    Returns one row per page: (source_file, page_number, text_content).
+/// 2. **OCR fallback (Tesseract)** — if text extraction yields zero
+///    pages (common with scanned documents, image-only PDFs, or PDFs
+///    exported from scanners without an OCR layer), we extract embedded
+///    images from the PDF and run Tesseract on each. Falls back
+///    gracefully if Tesseract isn't installed (returns an error
+///    directing the operator to install it).
+///
+/// Schema is identical for both paths so downstream consumers (chunker,
+/// vectord, queryd) don't need to know how the text was produced.
 pub fn parse_pdf(content: &[u8], source_filename: &str) -> Result<(Arc<Schema>, Vec<RecordBatch>), String> {
    let doc = lopdf::Document::load_mem(content)
        .map_err(|e| format!("PDF load error: {e}"))?;
@ -26,13 +37,35 @@ pub fn parse_pdf(content: &[u8], source_filename: &str) -> Result<(Arc<Schema>,
        }
    }

+    // Tier 2: OCR fallback for scanned pages.
    if page_numbers.is_empty() {
-        // PDF has no extractable text — likely scanned/image
-        return Err("PDF contains no extractable text (may be scanned/image — OCR not yet supported)".into());
+        tracing::info!(
+            "no extractable text in '{}' — attempting OCR on {} pages",
+            source_filename, pages.len(),
+        );
+        let ocr_results = ocr_pdf_images(&doc, source_filename)?;
+        if ocr_results.is_empty() {
+            return Err(
+                "PDF contains no extractable text and OCR produced no output. \
+                 Ensure tesseract is installed (`apt install tesseract-ocr tesseract-ocr-eng`)."
+                .into(),
+            );
+        }
+        for (pg, text) in ocr_results {
+            page_numbers.push(pg);
+            page_texts.push(text);
+            sources.push(source_filename.to_string());
+        }
+        tracing::info!(
+            "OCR recovered {} pages from '{}'", page_numbers.len(), source_filename,
+        );
+    } else {
+        tracing::info!(
+            "extracted {} pages with text from PDF '{}'",
+            page_numbers.len(), source_filename,
+        );
    }

-    tracing::info!("extracted {} pages with text from PDF '{}'", page_numbers.len(), source_filename);
-
    let schema = Arc::new(Schema::new(vec![
        Field::new("source_file", DataType::Utf8, false),
        Field::new("page_number", DataType::Int32, false),
@ -50,3 +83,197 @@ pub fn parse_pdf(content: &[u8], source_filename: &str) -> Result<(Arc<Schema>,

    Ok((schema, vec![batch]))
 }
+
+/// OCR every embedded image in the PDF and return the recovered text per page.
+///
+/// For each page yielded by `doc.get_pages()`, the page's image XObjects are
+/// collected with `extract_page_images`, each image is passed to
+/// `ocr_image_bytes`, and the trimmed, non-empty results are joined with
+/// newlines into one string for that page.
+///
+/// # Returns
+/// `(page_number, text)` pairs for the pages that produced non-empty OCR
+/// output. Pages with no images, or whose images yielded no text, are
+/// omitted, so the vector may be empty.
+///
+/// # Errors
+/// - `tesseract` cannot be run (see `tesseract_available`).
+/// - A page that has images carries a page number that does not fit in `i32`.
+///
+/// A failure on an individual image is not fatal: it is logged as a warning
+/// and the remaining images are still processed.
+fn ocr_pdf_images(
+    doc: &lopdf::Document,
+    source_filename: &str,
+) -> Result<Vec<(i32, String)>, String> {
+    if !tesseract_available() {
+        return Err(
+            "tesseract not found on PATH — install with `apt install tesseract-ocr tesseract-ocr-eng`"
+                .into(),
+        );
+    }
+
+    let mut results: Vec<(i32, String)> = Vec::new();
+
+    for (page_num, page_id) in doc.get_pages() {
+        let images = extract_page_images(doc, page_id);
+        if images.is_empty() {
+            continue;
+        }
+
+        // Checked conversion: a plain `as i32` would silently wrap an
+        // out-of-range page number to a negative value. Done before OCR so
+        // no work is wasted on a page that cannot be reported.
+        let page_number = i32::try_from(page_num).map_err(|_| {
+            format!("page number {page_num} in '{source_filename}' does not fit in a 32-bit signed integer")
+        })?;
+
+        // OCR each image on this page and concatenate the text.
+        let mut page_text = String::new();
+        for (idx, img_bytes) in images.iter().enumerate() {
+            match ocr_image_bytes(img_bytes, page_num, idx) {
+                Ok(text) => {
+                    let text = text.trim();
+                    if !text.is_empty() {
+                        if !page_text.is_empty() {
+                            page_text.push('\n');
+                        }
+                        page_text.push_str(text);
+                    }
+                }
+                // Best effort: one unreadable image must not discard the
+                // text recovered from the others.
+                Err(e) => {
+                    tracing::warn!(
+                        "OCR failed on '{}' page {} image {}: {e}",
+                        source_filename, page_num, idx,
+                    );
+                }
+            }
+        }
+        if !page_text.is_empty() {
+            results.push((page_number, page_text));
+        }
+    }
+    Ok(results)
+}
+
+/// Collect the bytes of every image XObject referenced directly by a page.
+///
+/// Looks up the page dictionary's own `/Resources` entry, then its
+/// `/XObject` dictionary, and keeps each entry that resolves to a stream
+/// whose `/Subtype` is `/Image`. Indirect references are followed with
+/// `doc.dereference`.
+///
+/// Returns one byte buffer per image. The result is empty when any step of
+/// the lookup fails or the page has no image XObjects; images with an empty
+/// payload are skipped.
+///
+/// Only this one level is searched: resources that are not present on the
+/// page dictionary itself, and images nested inside non-image XObjects, are
+/// not found.
+///
+/// NOTE(review): each buffer is whatever `decompressed_content()` returns,
+/// or the stored stream bytes when that call fails. Whether that is a
+/// complete image file (e.g. a JPEG) or bare pixel data depends on the
+/// stream's filter and on lopdf's behaviour for image streams — confirm
+/// against the lopdf version in use before relying on non-JPEG inputs.
+fn extract_page_images(
+    doc: &lopdf::Document,
+    page_id: lopdf::ObjectId,
+) -> Vec<Vec<u8>> {
+    let xobj_dict = match page_xobject_dict(doc, page_id) {
+        Some(d) => d,
+        None => return Vec::new(),
+    };
+
+    let mut images = Vec::new();
+    for (_name, obj_ref) in xobj_dict.iter() {
+        // Borrow the resolved object instead of cloning it: a clone would
+        // copy the entire image payload only to have it copied again below.
+        let stream = match doc
+            .dereference(obj_ref)
+            .ok()
+            .and_then(|(_, o)| o.as_stream().ok())
+        {
+            Some(s) => s,
+            None => continue,
+        };
+        // Check /Subtype == /Image
+        let is_image = stream.dict.get(b"Subtype")
+            .ok()
+            .and_then(|s| s.as_name().ok())
+            .map(|n| n == b"Image")
+            .unwrap_or(false);
+        if !is_image {
+            continue;
+        }
+        // Prefer the decoded payload; fall back to the bytes as stored in
+        // the file when lopdf cannot decode this stream.
+        let content = stream
+            .decompressed_content()
+            .unwrap_or_else(|_| stream.content.clone());
+        if !content.is_empty() {
+            images.push(content);
+        }
+    }
+    images
+}
+
+/// Resolve a page's `/Resources` → `/XObject` dictionary, or `None` when any
+/// step of the lookup is missing or has an unexpected type.
+fn page_xobject_dict(
+    doc: &lopdf::Document,
+    page_id: lopdf::ObjectId,
+) -> Option<&lopdf::Dictionary> {
+    let page_dict = doc.get_object(page_id).ok()?.as_dict().ok()?;
+    let (_, resources) = doc.dereference(page_dict.get(b"Resources").ok()?).ok()?;
+    let (_, xobjects) = doc
+        .dereference(resources.as_dict().ok()?.get(b"XObject").ok()?)
+        .ok()?;
+    xobjects.as_dict().ok()
+}
+
+/// Run Tesseract on one image and return the recognised text.
+///
+/// The bytes are written to a uniquely named file in the system temp
+/// directory, `tesseract` is run as a subprocess with that file as input and
+/// `<base>.txt` as output, and the output file is read back. Both temp files
+/// are removed before returning, on every exit path.
+///
+/// `page_num` and `img_idx` are used only to label the temp file names.
+///
+/// # Errors
+/// Returns a message when the temp image cannot be created or written, when
+/// `tesseract` cannot be started or exits unsuccessfully, or when its output
+/// file cannot be read.
+fn ocr_image_bytes(
+    img_bytes: &[u8],
+    page_num: u32,
+    img_idx: usize,
+) -> Result<String, String> {
+    use std::io::Write;
+    use std::process::{Command, Stdio};
+    use std::sync::atomic::{AtomicUsize, Ordering};
+
+    // Per-process call counter. Together with the process id it keeps temp
+    // names distinct across concurrent calls; page/image indices alone
+    // repeat for every document being ingested.
+    static NEXT_CALL_ID: AtomicUsize = AtomicUsize::new(0);
+
+    // Pick a file extension from the leading magic bytes. Anything
+    // unrecognised is written unchanged with a `.png` extension (no
+    // conversion is performed); if Tesseract cannot read it, the non-zero
+    // exit status is reported as an error below.
+    let ext = if img_bytes.starts_with(&[0xFF, 0xD8]) {
+        "jpg"
+    } else if img_bytes.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
+        "png"
+    } else if img_bytes.starts_with(b"II") || img_bytes.starts_with(b"MM") {
+        "tiff"
+    } else {
+        "png"
+    };
+
+    let tmp_dir = std::env::temp_dir();
+    let stem = format!(
+        "lh_ocr_{}_{}_p{page_num}_i{img_idx}",
+        std::process::id(),
+        NEXT_CALL_ID.fetch_add(1, Ordering::Relaxed),
+    );
+    let img_path = tmp_dir.join(format!("{stem}.{ext}"));
+    let out_base = tmp_dir.join(format!("{stem}_out"));
+    let out_path = tmp_dir.join(format!("{stem}_out.txt"));
+
+    // All fallible work lives in this closure so the cleanup after it runs
+    // no matter which step returned early.
+    let run = || -> Result<String, String> {
+        // `create_new` refuses to reuse an existing path, so a leftover
+        // file or symlink with the same name is never written through.
+        let mut f = std::fs::OpenOptions::new()
+            .write(true)
+            .create_new(true)
+            .open(&img_path)
+            .map_err(|e| format!("create temp image: {e}"))?;
+        f.write_all(img_bytes)
+            .map_err(|e| format!("write temp image: {e}"))?;
+        // Close the handle before Tesseract opens the file.
+        drop(f);
+
+        let status = Command::new("tesseract")
+            .arg(&img_path)
+            .arg(&out_base)
+            .arg("--oem").arg("3")         // default engine mode (based on what is available)
+            .arg("--psm").arg("6")         // assume uniform block of text
+            .arg("-l").arg("eng")
+            .stdout(Stdio::null())
+            .stderr(Stdio::null())
+            .status()
+            .map_err(|e| format!("tesseract exec: {e}"))?;
+
+        if !status.success() {
+            return Err(format!("tesseract exited with status {status}"));
+        }
+
+        // Surface a missing or unreadable output file instead of
+        // reporting it as "no text found".
+        std::fs::read_to_string(&out_path)
+            .map_err(|e| format!("read tesseract output: {e}"))
+    };
+
+    let result = run();
+
+    // Best-effort cleanup; either file may not exist if an early step failed.
+    let _ = std::fs::remove_file(&img_path);
+    let _ = std::fs::remove_file(&out_path);
+
+    result
+}
+
+/// Probe for a runnable `tesseract` binary by executing `tesseract --version`
+/// with its output discarded. Returns `true` only when the process could be
+/// started and exited successfully.
+fn tesseract_available() -> bool {
+    use std::process::{Command, Stdio};
+
+    let probe = Command::new("tesseract")
+        .arg("--version")
+        .stdout(Stdio::null())
+        .stderr(Stdio::null())
+        .status();
+
+    match probe {
+        Ok(exit) => exit.success(),
+        // Spawn failure (e.g. binary not found) counts as "not available".
+        Err(_) => false,
+    }
+}