Add MIME type selection for Llama Stack documents

acmenezes · acmenezes · commit d82a0a0486d9 · 2025-05-06T19:21:31.000-04:00
Signed-off-by: acmenezes <adcmenezes@gmail.com>
diff --git a/deploy/helm/ingestion-pipeline/files/ingestion.py b/deploy/helm/ingestion-pipeline/files/ingestion.py
@@ -125,6 +125,65 @@ def process_and_store_pgvector(llamastack_base_url: str, input_dir: dsl.InputPat
     from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
     from docling_core.types.doc.labels import DocItemLabel
 
+    # Map a path to a MIME type string using only its final, case-insensitive extension (file contents are never read)
+    def get_mime_type(file_path):
+        # splitext yields only the last suffix ("a.tar.gz" -> ".gz"); lowercase it so ".PDF" matches the ".pdf" key
+        _, file_extension = os.path.splitext(file_path)
+        ext = file_extension.lower()
+        
+        # Lookup table: lowercase extension (leading dot included) -> MIME type string
+        mime_types = {
+            # Text files
+            '.txt': 'text/plain',
+            '.text': 'text/plain',
+            '.log': 'text/plain',
+            
+            # HTML and web files
+            '.html': 'text/html',
+            '.htm': 'text/html',
+            '.xhtml': 'application/xhtml+xml',
+            '.css': 'text/css',
+            '.js': 'text/javascript',
+            
+            # Document formats
+            '.pdf': 'application/pdf',
+            '.doc': 'application/msword',
+            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+            '.rtf': 'application/rtf',
+            '.xls': 'application/vnd.ms-excel',
+            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+            '.ppt': 'application/vnd.ms-powerpoint',
+            '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+            '.odt': 'application/vnd.oasis.opendocument.text',
+            
+            # Markup and data formats
+            '.xml': 'application/xml',
+            '.json': 'application/json',
+            '.csv': 'text/csv',
+            '.md': 'text/markdown',
+            '.markdown': 'text/markdown',
+            
+            # Image formats
+            '.jpg': 'image/jpeg',
+            '.jpeg': 'image/jpeg',
+            '.png': 'image/png',
+            '.gif': 'image/gif',
+            '.bmp': 'image/bmp',
+            '.svg': 'image/svg+xml',
+            '.webp': 'image/webp',
+            
+            # Archive formats
+            '.zip': 'application/zip',
+            '.tar': 'application/x-tar',
+            '.gz': 'application/gzip',
+            
+            # Paths with no extension (splitext yields ''); same value as the .get() fallback below
+            '': 'application/octet-stream'
+        }
+        
+        # Any extension missing from the table falls back to the generic binary type
+        return mime_types.get(ext, 'application/octet-stream')
+    
     source = os.environ.get('SOURCE')
     name = os.environ.get('NAME')
     version = os.environ.get('VERSION')
@@ -201,11 +260,16 @@ def process_and_store_pgvector(llamastack_base_url: str, input_dir: dsl.InputPat
             chunks = chunker.chunk(docling_doc)
             chunk_count = 0
 
+            # Get the appropriate MIME type for this file
+            mime_type = get_mime_type(file_path)
+            print(f"Using MIME type {mime_type} for file {os.path.basename(file_path)}")
+
             for chunk in chunks:
                 if any(
                     c.label in [DocItemLabel.TEXT, DocItemLabel.PARAGRAPH, DocItemLabel.TABLE, 
-                               DocItemLabel.TABLE_CELL, DocItemLabel.HEADER, DocItemLabel.FOOTER, 
-                               DocItemLabel.TITLE, DocItemLabel.PICTURE_DESCRIPTION]
+                               DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER, 
+                               DocItemLabel.TITLE, DocItemLabel.PICTURE, DocItemLabel.CHART, 
+                               DocItemLabel.DOCUMENT_INDEX, DocItemLabel.SECTION_HEADER]
                     for c in chunk.meta.doc_items
                 ):
                     i += 1
@@ -214,7 +278,7 @@ def process_and_store_pgvector(llamastack_base_url: str, input_dir: dsl.InputPat
                         LlamaStackDocument(
                             document_id=f"doc-{i}",
                             content=chunk.text,
-                            mime_type="text/plain",
+                            mime_type=mime_type,
                             metadata={"source": os.path.basename(file_path)},
                         )
                     )