Initialize pipelines for more doc types

aakankshaduggal · aakankshaduggal · commit 10ac813a9206 · 2025-01-20T14:37:49.000-05:00
Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
@@ -115,6 +115,7 @@ def _init_docling_converter(self):
         # Third Party
         from docling.document_converter import DocumentConverter, PdfFormatOption
         from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+        from docling.pipeline.simple_pipeline import SimplePipeline
 
         if self.docling_model_path is None:
             logger.info("Docling models not found on disk, downloading models...")
@@ -134,7 +135,14 @@ def _init_docling_converter(self):
 
         return DocumentConverter(
             format_options={
-                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
+                InputFormat.MD: None,  # Add support for markdown
+                InputFormat.DOCX: WordFormatOption(
+                    pipeline_cls=SimplePipeline
+                ),  # Add support for docx, default pipeline for Docx and HTML
+                InputFormat.IMAGE: None,  # Add support for images
+                InputFormat.HTML: None,  # Add support for html
+                InputFormat.PPTX: None,  # Add support for pptx
             }
         )
 
@@ -155,7 +163,9 @@ def chunk_documents(self) -> List:
         docling_json_paths = list(docling_artifacts_path.glob("*.json"))
         chunks = []
         for json_fp in docling_json_paths:
-            chunks.extend(self._process_parsed_docling_json(json_fp))
+            with json_fp.open("r", encoding="utf-8") as file:
+                data = json.load(file)
+                chunks.extend(self._process_parsed_docling_json(data))
 
         return chunks