Skip to content

Commit 10ac813

Browse files
Initialize pipelines for more doc types
Signed-off-by: Aakanksha Duggal <[email protected]>
1 parent 0b91a3f commit 10ac813

File tree

1 file changed

+12
-2
lines changed

1 file changed

+12
-2
lines changed

src/instructlab/sdg/utils/chunkers.py

+12 -2
Original file line number | Diff line number | Diff line change
@@ -115,6 +115,7 @@ def _init_docling_converter(self):
115115
# Third Party
116116
from docling.document_converter import DocumentConverter, PdfFormatOption
117117
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
118+
from docling.pipeline.simple_pipeline import SimplePipeline
118119

119120
if self.docling_model_path is None:
120121
logger.info("Docling models not found on disk, downloading models...")
@@ -134,7 +135,14 @@ def _init_docling_converter(self):
134135

135136
return DocumentConverter(
136137
format_options={
137-
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
138+
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
139+
InputFormat.MD: None, # Add support for markdown
140+
InputFormat.DOCX: WordFormatOption(
141+
pipeline_cls=SimplePipeline
142+
), # Add support for docx, default pipeline for Docx and HTML
143+
InputFormat.IMAGE: None, # Add support for images
144+
InputFormat.HTML: None, # Add support for html
145+
InputFormat.PPTX: None, # Add support for pptx
138146
}
139147
)
140148

@@ -155,7 +163,9 @@ def chunk_documents(self) -> List:
155163
docling_json_paths = list(docling_artifacts_path.glob("*.json"))
156164
chunks = []
157165
for json_fp in docling_json_paths:
158-
chunks.extend(self._process_parsed_docling_json(json_fp))
166+
with json_fp.open("r", encoding="utf-8") as file:
167+
data = json.load(file)
168+
chunks.extend(self._process_parsed_docling_json(data))
159169

160170
return chunks
161171

0 commit comments

Comments
 (0)