@@ -125,6 +125,65 @@ def process_and_store_pgvector(llamastack_base_url: str, input_dir: dsl.InputPat
125125 from docling_core .transforms .chunker .hybrid_chunker import HybridChunker
126126 from docling_core .types .doc .labels import DocItemLabel
127127
# Map a file's extension to the MIME type reported for it downstream.
def get_mime_type(file_path: str) -> str:
    """Return the MIME type implied by the extension of *file_path*.

    Only the final extension is considered (as split by
    ``os.path.splitext``) and the comparison is case-insensitive, so
    ``"Report.PDF"`` maps to ``application/pdf`` and ``"a.tar.gz"`` maps to
    ``application/gzip``.

    Args:
        file_path: Path or bare file name. The file is never opened; only
            the name is inspected.

    Returns:
        The MIME type string for a recognised extension, otherwise
        ``"application/octet-stream"``. That fallback also covers names
        with no extension and dot-files such as ``".txt"``, which
        ``os.path.splitext`` treats as having no extension.
    """
    # NOTE(review): the visible caller attaches this value to documents whose
    # content is extracted chunk text -- confirm that labelling such text
    # with the source file's type (e.g. application/pdf) is intended.
    fallback = 'application/octet-stream'

    # The table is kept local to the function so the helper stays
    # self-contained (presumably it is nested inside a pipeline component
    # function -- verify before hoisting it to module level).
    mime_types = {
        # Text files
        '.txt': 'text/plain',
        '.text': 'text/plain',
        '.log': 'text/plain',

        # HTML and web files
        '.html': 'text/html',
        '.htm': 'text/html',
        '.xhtml': 'application/xhtml+xml',
        '.css': 'text/css',
        '.js': 'text/javascript',

        # Document formats
        '.pdf': 'application/pdf',
        '.doc': 'application/msword',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.rtf': 'application/rtf',
        '.xls': 'application/vnd.ms-excel',
        '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        '.odt': 'application/vnd.oasis.opendocument.text',

        # Markup and data formats
        '.xml': 'application/xml',
        '.json': 'application/json',
        '.csv': 'text/csv',
        '.md': 'text/markdown',
        '.markdown': 'text/markdown',

        # Image formats
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.bmp': 'image/bmp',
        '.svg': 'image/svg+xml',
        '.webp': 'image/webp',

        # Archive formats
        '.zip': 'application/zip',
        '.tar': 'application/x-tar',
        '.gz': 'application/gzip',
    }

    # An empty extension needs no table entry: it misses the lookup and
    # receives the same fallback as any other unknown extension.
    _, file_extension = os.path.splitext(file_path)
    return mime_types.get(file_extension.lower(), fallback)
186+
128187 source = os .environ .get ('SOURCE' )
129188 name = os .environ .get ('NAME' )
130189 version = os .environ .get ('VERSION' )
@@ -201,11 +260,16 @@ def process_and_store_pgvector(llamastack_base_url: str, input_dir: dsl.InputPat
201260 chunks = chunker .chunk (docling_doc )
202261 chunk_count = 0
203262
263+ # Get the appropriate MIME type for this file
264+ mime_type = get_mime_type (file_path )
265+ print (f"Using MIME type { mime_type } for file { os .path .basename (file_path )} " )
266+
204267 for chunk in chunks :
205268 if any (
206269 c .label in [DocItemLabel .TEXT , DocItemLabel .PARAGRAPH , DocItemLabel .TABLE ,
207- DocItemLabel .TABLE_CELL , DocItemLabel .HEADER , DocItemLabel .FOOTER ,
208- DocItemLabel .TITLE , DocItemLabel .PICTURE_DESCRIPTION ]
270+ DocItemLabel .PAGE_HEADER , DocItemLabel .PAGE_FOOTER ,
271+ DocItemLabel .TITLE , DocItemLabel .PICTURE , DocItemLabel .CHART ,
272+ DocItemLabel .DOCUMENT_INDEX , DocItemLabel .SECTION_HEADER ]
209273 for c in chunk .meta .doc_items
210274 ):
211275 i += 1
@@ -214,7 +278,7 @@ def process_and_store_pgvector(llamastack_base_url: str, input_dir: dsl.InputPat
214278 LlamaStackDocument (
215279 document_id = f"doc-{ i } " ,
216280 content = chunk .text ,
217- mime_type = "text/plain" ,
281+ mime_type = mime_type ,
218282 metadata = {"source" : os .path .basename (file_path )},
219283 )
220284 )
0 commit comments