@@ -115,6 +115,7 @@ def _init_docling_converter(self):
115
115
# Third Party
116
116
from docling .document_converter import DocumentConverter , PdfFormatOption
117
117
from docling .pipeline .standard_pdf_pipeline import StandardPdfPipeline
118
+ from docling .pipeline .simple_pipeline import SimplePipeline
118
119
119
120
if self .docling_model_path is None :
120
121
logger .info ("Docling models not found on disk, downloading models..." )
@@ -134,7 +135,14 @@ def _init_docling_converter(self):
134
135
135
136
return DocumentConverter (
136
137
format_options = {
137
- InputFormat .PDF : PdfFormatOption (pipeline_options = pipeline_options )
138
+ InputFormat .PDF : PdfFormatOption (pipeline_options = pipeline_options ),
139
+ InputFormat .MD : None , # Add support for markdown
140
+ InputFormat .DOCX : WordFormatOption (
141
+ pipeline_cls = SimplePipeline
142
+ ), # Add support for docx, default pipeline for Docx and HTML
143
+ InputFormat .IMAGE : None , # Add support for images
144
+ InputFormat .HTML : None , # Add support for html
145
+ InputFormat .PPTX : None , # Add support for pptx
138
146
}
139
147
)
140
148
@@ -155,7 +163,9 @@ def chunk_documents(self) -> List:
155
163
docling_json_paths = list (docling_artifacts_path .glob ("*.json" ))
156
164
chunks = []
157
165
for json_fp in docling_json_paths :
158
- chunks .extend (self ._process_parsed_docling_json (json_fp ))
166
+ with json_fp .open ("r" , encoding = "utf-8" ) as file :
167
+ data = json .load (file )
168
+ chunks .extend (self ._process_parsed_docling_json (data ))
159
169
160
170
return chunks
161
171
0 commit comments