# ipfs_datasets_py/examples/intermediate/07_pdf_processing.py at main · endomorphosis/ipfs_datasets_py · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
"""
PDF Processing - Extract Text with OCR Support

This example demonstrates how to process PDF files, including both text-based
PDFs and image-based PDFs that require OCR (Optical Character Recognition).

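Requirements:
    - pypdf or pymupdf: pip install pypdf pymupdf
    - pytesseract: pip install pytesseract (also needs the tesseract-ocr system package)
    - Optional: reportlab, used to generate the sample PDF: pip install reportlab
    - Optional: surya_ocr, easyocr for better OCR

Usage:
    python examples/intermediate/07_pdf_processing.py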
"""

import asyncio
import tempfile
from pathlib import Path


def _write_minimal_pdf(path, lines):
    """Write a single-page, text-only PDF using only the standard library.

    Args:
        path: Destination file path; any existing content is overwritten.
        lines: Iterable of ``(x, y, text)`` tuples. Coordinates are PDF
            points measured from the bottom-left corner of a US-Letter page.
    """
    def escape(text):
        # Backslashes and parentheses delimit PDF literal strings.
        return text.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")

    # Page content stream: one absolutely positioned text run per line.
    ops = ["BT", "/F1 12 Tf"]
    ops.extend(f"1 0 0 1 {x} {y} Tm ({escape(text)}) Tj" for x, y, text in lines)
    ops.append("ET")
    stream = "\n".join(ops).encode("latin-1", "replace")

    objects = [
        b"<< /Type /Catalog /Pages 2 0 R >>",
        b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
        b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
        b"/Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>",
        b"<< /Length %d >>\nstream\n" % len(stream) + stream + b"\nendstream",
        b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
    ]

    out = bytearray(b"%PDF-1.4\n")
    offsets = []
    for number, body in enumerate(objects, start=1):
        # The cross-reference table needs each object's byte offset.
        offsets.append(len(out))
        out += b"%d 0 obj\n" % number + body + b"\nendobj\n"

    xref_offset = len(out)
    out += b"xref\n0 %d\n" % (len(objects) + 1)
    # Every xref entry is exactly 20 bytes, including its line ending.
    out += b"0000000000 65535 f \n"
    for offset in offsets:
        out += b"%010d 00000 n \n" % offset
    out += b"trailer\n<< /Size %d /Root 1 0 R >>\n" % (len(objects) + 1)
    out += b"startxref\n%d\n%%%%EOF\n" % xref_offset

    with open(path, "wb") as handle:
        handle.write(bytes(out))


def create_sample_pdf():
    """Create a small text-based PDF in a temporary file for the demos.

    reportlab is used when it is installed. Otherwise an equivalent
    one-page PDF is written by hand, so the demos can still run.

    Returns:
        Path (str) of the generated ``.pdf`` file. The caller is responsible
        for deleting it.

    Raises:
        Exception: Whatever PDF generation raises; the temporary file is
            removed before the exception propagates.
    """
    import os

    sample_lines = (
        (100, 750, "Sample PDF Document"),
        (100, 730, "This is a text-based PDF created for testing."),
        (100, 710, "It contains multiple lines of text."),
        (100, 690, "PDF processing can extract this text easily."),
    )

    # delete=False: the file must outlive this function so callers can read it.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
    tmp_file.close()
    pdf_path = tmp_file.name

    try:
        try:
            from reportlab.pdfgen import canvas
            from reportlab.lib.pagesizes import letter
        except ImportError:
            print("⚠️  reportlab not installed, using alternative method")
            _write_minimal_pdf(pdf_path, sample_lines)
        else:
            c = canvas.Canvas(pdf_path, pagesize=letter)
            for x, y, text in sample_lines:
                c.drawString(x, y, text)
            c.showPage()
            c.save()
    except Exception:
        # Do not leave an orphaned temp file behind when generation fails.
        try:
            os.unlink(pdf_path)
        except OSError:
            pass
        raise

    return pdf_path


async def demo_basic_pdf_extraction():
    """Extract text from a generated text-based PDF and print a summary.

    Creates a sample PDF with ``create_sample_pdf()``, awaits
    ``PDFProcessor().process(path)`` and prints the page count taken from
    ``result.metadata['num_pages']``, the text length and the first 200
    characters. Errors are reported on stdout rather than raised. The
    sample file is removed whether or not processing succeeds.
    """
    import os

    print("\n" + "="*70)
    print("DEMO 1: Basic PDF Text Extraction")
    print("="*70)

    pdf_path = None  # set once the sample exists, so `finally` knows what to remove
    try:
        from ipfs_datasets_py.processors.specialized.pdf import PDFProcessor

        # Create sample PDF
        print("\n📝 Creating sample PDF...")
        pdf_path = create_sample_pdf()

        if not pdf_path:
            print("⚠️  Could not create sample PDF, skipping demo")
            return

        # Initialize processor
        print("\n🔍 Initializing PDF processor...")
        processor = PDFProcessor()

        # Extract text
        print("\n📄 Extracting text from PDF...")
        result = await processor.process(pdf_path)

        if result.success:
            print("✅ Extraction successful")
            print(f"   Pages: {result.metadata.get('num_pages', 0)}")
            print(f"   Text length: {len(result.text)} characters")
            print("\n   Preview:")
            print(f"   {result.text[:200]}...")
        else:
            print(f"❌ Extraction failed: {result.error}")

    except ImportError as e:
        print(f"\n❌ Missing dependencies: {e}")
        print("   Install with: pip install pypdf pymupdf")
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Cleanup must also run when processing raised, or the temp file leaks.
        if pdf_path:
            try:
                os.unlink(pdf_path)
            except OSError:
                pass  # best-effort: the file may already be gone


async def demo_pdf_metadata():
    """Print the common metadata fields of a generated sample PDF.

    Creates a sample PDF with ``create_sample_pdf()``, awaits
    ``PDFProcessor().process(path)`` and prints whichever of the common
    fields (title, author, page count, dates, ...) are present in
    ``result.metadata``. Errors are reported on stdout rather than raised.
    The sample file is removed whether or not processing succeeds.
    """
    import os

    print("\n" + "="*70)
    print("DEMO 2: PDF Metadata Extraction")
    print("="*70)

    pdf_path = None  # set once the sample exists, so `finally` knows what to remove
    try:
        from ipfs_datasets_py.processors.specialized.pdf import PDFProcessor

        pdf_path = create_sample_pdf()
        if not pdf_path:
            # Say why nothing follows the banner, as demo 1 does.
            print("⚠️  Could not create sample PDF, skipping demo")
            return

        processor = PDFProcessor()

        print("\n📊 Extracting PDF metadata...")
        result = await processor.process(pdf_path)

        if not result.success:
            print(f"❌ Extraction failed: {result.error}")
        elif not result.metadata:
            print("⚠️  No metadata returned")
        else:
            print("✅ Metadata extracted:")
            metadata = result.metadata

            # Common metadata fields
            fields = ['title', 'author', 'subject', 'creator', 'producer',
                      'num_pages', 'creation_date', 'modification_date']

            for field in fields:
                if field in metadata:
                    print(f"   {field}: {metadata[field]}")

    except ImportError as e:
        print(f"\n❌ Missing dependencies: {e}")
        print("   Install with: pip install pypdf pymupdf")
    except Exception as e:
        print(f"\n❌ Error: {e}")
    finally:
        # Cleanup must also run when processing raised, or the temp file leaks.
        if pdf_path:
            try:
                os.unlink(pdf_path)
            except OSError:
                pass  # best-effort: the file may already be gone


async def demo_ocr_processing():
    """Describe OCR for scanned PDFs and list the OCR engines by name.

    Purely informational: nothing is imported or processed, the text is
    only printed to stdout.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("DEMO 3: OCR Processing")
    print(rule)

    intro = (
        "\n📷 OCR Example",
        "   OCR processes image-based PDFs or scanned documents",
        "   Requires: pytesseract, tesseract-ocr system package",
    )
    for line in intro:
        print(line)

    # Illustrative usage (not executed; it needs a real image-based PDF):
    #
    #     try:
    #         from ipfs_datasets_py.processors.specialized.pdf import PDFProcessor
    #
    #         processor = PDFProcessor(ocr_engine="tesseract")
    #
    #         # Process image-based PDF
    #         result = await processor.process(
    #             "scanned_document.pdf",
    #             use_ocr=True
    #         )
    #
    #         if result.success:
    #             print(f"✅ OCR extraction: {len(result.text)} chars")
    #
    #     except Exception as e:
    #         print(f"❌ OCR error: {e}")

    engines = (
        ("tesseract", "Fast, good accuracy"),
        ("easyocr", "GPU-accelerated, multi-language"),
        ("surya_ocr", "High accuracy, modern architecture"),
        ("paddleocr", "Good for Asian languages"),
    )
    print("\n💡 Available OCR Engines:")
    for engine, summary in engines:
        print(f"   - {engine}: {summary}")


async def demo_multi_engine_ocr():
    """Print a sample multi-engine OCR configuration with fallback tips.

    The snippet is only printed for the reader; it is never executed.
    """
    snippet = '''
from ipfs_datasets_py.processors.specialized.pdf import PDFProcessor

# Configure multiple OCR engines
processor = PDFProcessor(
    ocr_engines=["surya", "tesseract", "easyocr"],  # Try in order
    fallback_on_error=True,                          # Fallback if one fails
    confidence_threshold=0.8                          # Min confidence
)

result = await processor.process("document.pdf", use_ocr=True)
    '''

    rule = "=" * 70
    print("\n" + rule)
    print("DEMO 4: Multi-Engine OCR")
    print(rule)

    print("\n🔧 Multi-Engine OCR Configuration")
    print("\n   The PDFProcessor supports multiple OCR engines with fallbacks:")
    print(snippet)

    tips = (
        "surya_ocr: Best quality, slower",
        "tesseract: Fast, good for English",
        "easyocr: Good balance, GPU support",
        "Use fallbacks for reliability",
    )
    print("\n💡 Tips:")
    for tip in tips:
        print(f"   - {tip}")


async def demo_pdf_structure_extraction():
    """Print a sample showing how to read structure data from a result.

    The snippet is only printed for the reader; it is never executed.
    """
    snippet = '''
from ipfs_datasets_py.processors.specialized.pdf import PDFProcessor

processor = PDFProcessor(extract_structure=True)
result = await processor.process("document.pdf")

if result.success:
    # Access structured data
    structure = result.metadata.get('structure', {})

    print(f"Headings: {len(structure.get('headings', []))}")
    print(f"Paragraphs: {len(structure.get('paragraphs', []))}")
    print(f"Tables: {len(structure.get('tables', []))}")
    print(f"Images: {len(structure.get('images', []))}")
    '''

    rule = "=" * 70
    for line in ("\n" + rule, "DEMO 5: PDF Structure Extraction", rule):
        print(line)

    print("\n📑 Structure Extraction")
    print("   Extract document structure beyond raw text:")
    print(snippet)


async def demo_batch_pdf_processing():
    """Print a sample showing batch processing of a directory of PDFs.

    The snippet is only printed for the reader; it is never executed.
    """
    snippet = '''
from ipfs_datasets_py.processors.specialized.pdf import PDFProcessor
from ipfs_datasets_py.processors.file_converter import BatchProcessor
from pathlib import Path

# Get all PDFs in directory
pdf_files = list(Path("documents/").glob("*.pdf"))

# Create batch processor
processor = PDFProcessor()
batch = BatchProcessor(max_concurrent=5)

# Process all PDFs
results = await batch.process_batch(
    files=pdf_files,
    processor=processor,
    use_ocr=False  # Set True for image PDFs
)

# Summarize results
successful = sum(1 for r in results if r.success)
print(f"Processed: {len(results)} PDFs")
print(f"Successful: {successful}")
print(f"Failed: {len(results) - successful}")
    '''

    rule = "=" * 70
    for line in ("\n" + rule, "DEMO 6: Batch PDF Processing", rule):
        print(line)

    print("\n📚 Batch Processing Example")
    print(snippet)


def show_tips():
    """Print the numbered list of practical PDF-processing tips.

    Each section is a heading followed by indented bullet lines.
    """
    sections = (
        ("1. Choosing OCR Engine:", (
            "Text PDFs: No OCR needed, fast extraction",
            "Scanned documents: Use OCR",
            "Mixed PDFs: Auto-detect text/image pages",
        )),
        ("2. OCR Performance:", (
            "tesseract: Fastest, good for English",
            "easyocr: GPU acceleration, multi-language",
            "surya: Best quality, slower",
            "Use appropriate DPI (300+ for quality)",
        )),
        ("3. Memory Management:", (
            "Large PDFs: Process page-by-page",
            "Batch processing: Limit concurrent jobs",
            "Clear caches between large batches",
        )),
        ("4. Quality Optimization:", (
            "Preprocessing: Deskew, denoise images",
            "Language hints: Improve accuracy",
            "Post-processing: Clean extracted text",
        )),
        ("5. Common Issues:", (
            "Encrypted PDFs: Decrypt first",
            "Complex layouts: May need manual review",
            "Low-quality scans: Enhance before OCR",
        )),
        ("6. System Requirements:", (
            "tesseract: apt install tesseract-ocr",
            "GPU: Improves easyocr/surya performance",
            "Memory: ~2GB per OCR process",
        )),
        ("7. Next Steps:", (
            "See 09_batch_processing.py for large-scale processing",
            "See 12_graphrag_basic.py for PDF-based RAG",
        )),
    )

    rule = "=" * 70
    print("\n" + rule)
    print("TIPS FOR PDF PROCESSING")
    print(rule)

    for heading, bullets in sections:
        # A blank line separates each section from the previous one.
        print("\n" + heading)
        for bullet in bullets:
            print("   - " + bullet)


async def main():
    """Run every PDF demo in order, then print the tips and a closing banner."""
    rule = "=" * 70
    print("\n" + rule)
    print("IPFS DATASETS PYTHON - PDF PROCESSING")
    print(rule)

    # Demos are awaited one at a time so their output stays in order.
    demos = (
        demo_basic_pdf_extraction,
        demo_pdf_metadata,
        demo_ocr_processing,
        demo_multi_engine_ocr,
        demo_pdf_structure_extraction,
        demo_batch_pdf_processing,
    )
    for demo in demos:
        await demo()

    show_tips()

    print("\n" + rule)
    print("✅ PDF PROCESSING EXAMPLES COMPLETE")
    print(rule)


# Run the demos only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())