Merge branch 'main' into main

afourney · web-flow · commit 7a81cc86f38b · 2025-04-13T09:38:11.000-07:00
diff --git a/README.md b/README.md
@@ -46,7 +46,7 @@ To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively,
 ```bash
 git clone git@github.com:microsoft/markitdown.git
 cd markitdown
-pip install -e packages/markitdown[all]
+pip install -e 'packages/markitdown[all]'
 ```
 
 ## Usage
@@ -73,7 +73,7 @@ cat path-to-file.pdf | markitdown
 MarkItDown has optional dependencies for activating various file formats. Earlier in this document, we installed all optional dependencies with the `[all]` option. However, you can also install them individually for more control. For example:
 
 ```bash
-pip install markitdown[pdf, docx, pptx]
+pip install 'markitdown[pdf, docx, pptx]'
 ```
 
 will install only the dependencies for PDF, DOCX, and PPTX files.
diff --git a/packages/markitdown-mcp/README.md b/packages/markitdown-mcp/README.md
@@ -25,7 +25,7 @@ To run the MCP server, using STDIO (default) use the following command:
 markitdown-mcp
 ```
 
-To run the MCP server, ussing SSE use the following command:
+To run the MCP server, using SSE use the following command:
 
 ```bash	
 markitdown-mcp --sse --host 127.0.0.1 --port 3001
diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
@@ -41,6 +41,7 @@
     ZipConverter,
     EpubConverter,
     DocumentIntelligenceConverter,
+    CsvConverter,
 )
 
 from ._base_converter import DocumentConverter, DocumentConverterResult
@@ -194,6 +195,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(PdfConverter())
             self.register_converter(OutlookMsgConverter())
             self.register_converter(EpubConverter())
+            self.register_converter(CsvConverter())
 
             # Register Document Intelligence converter at the top of the stack if endpoint is provided
             docintel_endpoint = kwargs.get("docintel_endpoint")
diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -22,6 +22,7 @@
     DocumentIntelligenceFileType,
 )
 from ._epub_converter import EpubConverter
+from ._csv_converter import CsvConverter
 
 __all__ = [
     "PlainTextConverter",
@@ -43,4 +44,5 @@
     "DocumentIntelligenceConverter",
     "DocumentIntelligenceFileType",
     "EpubConverter",
+    "CsvConverter",
 ]
diff --git a/packages/markitdown/src/markitdown/converters/_csv_converter.py b/packages/markitdown/src/markitdown/converters/_csv_converter.py
@@ -0,0 +1,79 @@
+import sys
+import csv
+import io
+from typing import BinaryIO, Any
+from charset_normalizer import from_bytes
+from ._html_converter import HtmlConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+
+ACCEPTED_MIME_TYPE_PREFIXES = [  # prefix-matched against the lowercased mimetype
+    "text/csv",
+    "application/csv",
+]
+ACCEPTED_FILE_EXTENSIONS = [".csv"]  # compared with the lowercased file extension
+
+
+class CsvConverter(DocumentConverter):
+    """
+    Converts CSV files to Markdown tables.
+
+    The first CSV row is the header; later rows are padded or truncated to its
+    width. Cell text is escaped so "|" and line breaks cannot break the table.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """Return True for a ".csv" extension or a CSV mimetype prefix."""
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+        # str.startswith accepts a tuple, so every prefix is checked in one call
+        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        """Read the whole stream, parse it as CSV, and return a Markdown table."""
+        raw = file_stream.read()
+        if stream_info.charset:
+            content = raw.decode(stream_info.charset)
+        else:
+            # best() is None when no encoding fits; str(None) would emit "None"
+            match = from_bytes(raw).best()
+            content = raw.decode("utf-8", "replace") if match is None else str(match)
+
+        # Parse CSV content
+        rows = list(csv.reader(io.StringIO(content)))
+        if not rows:
+            return DocumentConverterResult(markdown="")
+
+        def cell(text: str) -> str:
+            # A table row must stay on one line, and a raw "|" would split a cell
+            return " ".join(text.splitlines()).replace("|", "\\|")
+
+        def line(cells) -> str:
+            return "| " + " | ".join(cells) + " |"
+
+        header = rows[0]
+        width = len(header)
+
+        # Header row, then the separator row
+        markdown_table = [line(map(cell, header)), line(["---"] * width)]
+        for row in rows[1:]:
+            # Pad short rows and truncate long ones to the header's width
+            fitted = (row + [""] * (width - len(row)))[:width]
+            markdown_table.append(line(map(cell, fitted)))
+
+        return DocumentConverterResult(markdown="\n".join(markdown_table))
diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -1,8 +1,7 @@
 import sys
 import re
 import os
-
-from typing import BinaryIO, Any, List
+from typing import BinaryIO, Any, List, Optional, Union
 from enum import Enum
 
 from ._html_converter import HtmlConverter
@@ -26,6 +25,28 @@
     # Preserve the error and stack trace for later
     _dependency_exc_info = sys.exc_info()
 
+    # Define these types for type hinting when the package is not available
+    class AzureKeyCredential:
+        pass
+
+    class TokenCredential:
+        pass
+
+    class DocumentIntelligenceClient:
+        pass
+
+    class AnalyzeDocumentRequest:
+        pass
+
+    class AnalyzeResult:
+        pass
+
+    class DocumentAnalysisFeature:
+        pass
+
+    class DefaultAzureCredential:
+        pass
+
 
 # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
 # This constant is a temporary fix until the bug is resolved.
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
@@ -144,10 +144,11 @@ class FileTestVector(object):
         charset="cp932",
         url=None,
         must_include=[
-            "名前,年齢,住所",
-            "佐藤太郎,30,東京",
-            "三木英子,25,大阪",
-            "髙橋淳,35,名古屋",
+            "| 名前 | 年齢 | 住所 |",
+            "| --- | --- | --- |",
+            "| 佐藤太郎 | 30 | 東京 |",
+            "| 三木英子 | 25 | 大阪 |",
+            "| 髙橋淳 | 35 | 名古屋 |",
         ],
         must_not_include=[],
     ),

Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,7 @@`
`22`	`22`	`DocumentIntelligenceFileType,`
`23`	`23`	`)`
`24`	`24`	`from ._epub_converter import EpubConverter`
	`25`	`+from ._csv_converter import CsvConverter`
`25`	`26`
`26`	`27`	`__all__ = [`
`27`	`28`	`"PlainTextConverter",`
`@@ -43,4 +44,5 @@`
`43`	`44`	`"DocumentIntelligenceConverter",`
`44`	`45`	`"DocumentIntelligenceFileType",`
`45`	`46`	`"EpubConverter",`
	`47`	`+ "CsvConverter",`
`46`	`48`	`]`