Skip to content

Commit d82a0a0

Browse files
committed
ADD mime type selection for llama stack document
Signed-off-by: acmenezes <[email protected]>
1 parent a5e14e2 commit d82a0a0

File tree

1 file changed

+67
-3
lines changed

1 file changed

+67
-3
lines changed

deploy/helm/ingestion-pipeline/files/ingestion.py

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,65 @@ def process_and_store_pgvector(llamastack_base_url: str, input_dir: dsl.InputPat
125125
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
126126
from docling_core.types.doc.labels import DocItemLabel
127127

128+
# Function to determine the correct MIME type based on file extension
129+
def get_mime_type(file_path):
    """Return the MIME type implied by the extension of *file_path*.

    Only the final extension is considered, and it is matched
    case-insensitively. A path with no extension, or with an extension that
    is not listed below, yields ``application/octet-stream``.
    """
    fallback = 'application/octet-stream'

    # Each MIME type paired with every extension that resolves to it.
    extensions_by_type = (
        # Text files
        ('text/plain', ('.txt', '.text', '.log')),

        # HTML and web files
        ('text/html', ('.html', '.htm')),
        ('application/xhtml+xml', ('.xhtml',)),
        ('text/css', ('.css',)),
        ('text/javascript', ('.js',)),

        # Document formats
        ('application/pdf', ('.pdf',)),
        ('application/msword', ('.doc',)),
        ('application/vnd.openxmlformats-officedocument.wordprocessingml.document', ('.docx',)),
        ('application/rtf', ('.rtf',)),
        ('application/vnd.ms-excel', ('.xls',)),
        ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', ('.xlsx',)),
        ('application/vnd.ms-powerpoint', ('.ppt',)),
        ('application/vnd.openxmlformats-officedocument.presentationml.presentation', ('.pptx',)),
        ('application/vnd.oasis.opendocument.text', ('.odt',)),

        # Markup and data formats
        ('application/xml', ('.xml',)),
        ('application/json', ('.json',)),
        ('text/csv', ('.csv',)),
        ('text/markdown', ('.md', '.markdown')),

        # Image formats
        ('image/jpeg', ('.jpg', '.jpeg')),
        ('image/png', ('.png',)),
        ('image/gif', ('.gif',)),
        ('image/bmp', ('.bmp',)),
        ('image/svg+xml', ('.svg',)),
        ('image/webp', ('.webp',)),

        # Archive formats
        ('application/zip', ('.zip',)),
        ('application/x-tar', ('.tar',)),
        ('application/gzip', ('.gz',)),
    )

    # Flatten the grouped table into a direct extension -> MIME type lookup.
    type_for_extension = {
        suffix: mime
        for mime, suffixes in extensions_by_type
        for suffix in suffixes
    }

    # splitext keeps the leading dot and returns '' when there is no
    # extension; both '' and unlisted extensions fall through to the default.
    suffix = os.path.splitext(file_path)[1].lower()
    return type_for_extension.get(suffix, fallback)
186+
128187
source = os.environ.get('SOURCE')
129188
name = os.environ.get('NAME')
130189
version = os.environ.get('VERSION')
@@ -201,11 +260,16 @@ def process_and_store_pgvector(llamastack_base_url: str, input_dir: dsl.InputPat
201260
chunks = chunker.chunk(docling_doc)
202261
chunk_count = 0
203262

263+
# Get the appropriate MIME type for this file
264+
mime_type = get_mime_type(file_path)
265+
print(f"Using MIME type {mime_type} for file {os.path.basename(file_path)}")
266+
204267
for chunk in chunks:
205268
if any(
206269
c.label in [DocItemLabel.TEXT, DocItemLabel.PARAGRAPH, DocItemLabel.TABLE,
207-
DocItemLabel.TABLE_CELL, DocItemLabel.HEADER, DocItemLabel.FOOTER,
208-
DocItemLabel.TITLE, DocItemLabel.PICTURE_DESCRIPTION]
270+
DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER,
271+
DocItemLabel.TITLE, DocItemLabel.PICTURE, DocItemLabel.CHART,
272+
DocItemLabel.DOCUMENT_INDEX, DocItemLabel.SECTION_HEADER]
209273
for c in chunk.meta.doc_items
210274
):
211275
i += 1
@@ -214,7 +278,7 @@ def process_and_store_pgvector(llamastack_base_url: str, input_dir: dsl.InputPat
214278
LlamaStackDocument(
215279
document_id=f"doc-{i}",
216280
content=chunk.text,
217-
mime_type="text/plain",
281+
mime_type=mime_type,
218282
metadata={"source": os.path.basename(file_path)},
219283
)
220284
)

0 commit comments

Comments
 (0)