@@ -125,6 +125,65 @@ def process_and_store_pgvector(llamastack_base_url: str, input_dir: dsl.InputPat
125
125
from docling_core .transforms .chunker .hybrid_chunker import HybridChunker
126
126
from docling_core .types .doc .labels import DocItemLabel
127
127
128
# Extension -> MIME type lookup table. It is built once, next to the helper,
# so that resolving the type of each processed file does not re-create the
# dictionary. Keys are lowercase and include the leading dot.
_MIME_TYPES_BY_EXTENSION = {
    # Text files
    '.txt': 'text/plain',
    '.text': 'text/plain',
    '.log': 'text/plain',

    # HTML and web files
    '.html': 'text/html',
    '.htm': 'text/html',
    '.xhtml': 'application/xhtml+xml',
    '.css': 'text/css',
    '.js': 'text/javascript',

    # Document formats
    '.pdf': 'application/pdf',
    '.doc': 'application/msword',
    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    '.rtf': 'application/rtf',
    '.xls': 'application/vnd.ms-excel',
    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    '.ppt': 'application/vnd.ms-powerpoint',
    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    '.odt': 'application/vnd.oasis.opendocument.text',

    # Markup and data formats
    '.xml': 'application/xml',
    '.json': 'application/json',
    '.csv': 'text/csv',
    '.md': 'text/markdown',
    '.markdown': 'text/markdown',

    # Image formats
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.bmp': 'image/bmp',
    '.svg': 'image/svg+xml',
    '.webp': 'image/webp',

    # Archive formats
    '.zip': 'application/zip',
    '.tar': 'application/x-tar',
    '.gz': 'application/gzip',
}

# Returned when the extension is missing or not present in the table above.
_DEFAULT_MIME_TYPE = 'application/octet-stream'


def get_mime_type(file_path) -> str:
    """Return the MIME type implied by the extension of *file_path*.

    The extension is taken with ``os.path.splitext``, so only the last
    suffix counts (``archive.tar.gz`` is looked up as ``.gz``) and a name
    that consists solely of a leading-dot component (``.gitignore``) has
    no extension. Matching is case-insensitive.

    Args:
        file_path: Path or bare file name; the file is never opened.

    Returns:
        The mapped MIME type, or ``application/octet-stream`` when the
        extension is absent or unknown.
    """
    _, file_extension = os.path.splitext(file_path)
    return _MIME_TYPES_BY_EXTENSION.get(file_extension.lower(), _DEFAULT_MIME_TYPE)
186
+
128
187
source = os .environ .get ('SOURCE' )
129
188
name = os .environ .get ('NAME' )
130
189
version = os .environ .get ('VERSION' )
@@ -201,11 +260,16 @@ def process_and_store_pgvector(llamastack_base_url: str, input_dir: dsl.InputPat
201
260
chunks = chunker .chunk (docling_doc )
202
261
chunk_count = 0
203
262
263
+ # Get the appropriate MIME type for this file
264
+ mime_type = get_mime_type (file_path )
265
+ print (f"Using MIME type { mime_type } for file { os .path .basename (file_path )} " )
266
+
204
267
for chunk in chunks :
205
268
if any (
206
269
c .label in [DocItemLabel .TEXT , DocItemLabel .PARAGRAPH , DocItemLabel .TABLE ,
207
- DocItemLabel .TABLE_CELL , DocItemLabel .HEADER , DocItemLabel .FOOTER ,
208
- DocItemLabel .TITLE , DocItemLabel .PICTURE_DESCRIPTION ]
270
+ DocItemLabel .PAGE_HEADER , DocItemLabel .PAGE_FOOTER ,
271
+ DocItemLabel .TITLE , DocItemLabel .PICTURE , DocItemLabel .CHART ,
272
+ DocItemLabel .DOCUMENT_INDEX , DocItemLabel .SECTION_HEADER ]
209
273
for c in chunk .meta .doc_items
210
274
):
211
275
i += 1
@@ -214,7 +278,7 @@ def process_and_store_pgvector(llamastack_base_url: str, input_dir: dsl.InputPat
214
278
LlamaStackDocument (
215
279
document_id = f"doc-{ i } " ,
216
280
content = chunk .text ,
217
- mime_type = "text/plain" ,
281
+ mime_type = mime_type ,
218
282
metadata = {"source" : os .path .basename (file_path )},
219
283
)
220
284
)
0 commit comments