Skip to content

Commit 7a81cc8

Browse files
authored
Merge branch 'main' into main
2 parents e57a9fd + 041be54 commit 7a81cc8

File tree

7 files changed

+114
-9
lines changed

7 files changed

+114
-9
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively,
4646
```bash
4747
git clone git@github.com:microsoft/markitdown.git
4848
cd markitdown
49-
pip install -e packages/markitdown[all]
49+
pip install -e 'packages/markitdown[all]'
5050
```
5151

5252
## Usage
@@ -73,7 +73,7 @@ cat path-to-file.pdf | markitdown
7373
MarkItDown has optional dependencies for activating various file formats. Earlier in this document, we installed all optional dependencies with the `[all]` option. However, you can also install them individually for more control. For example:
7474

7575
```bash
76-
pip install markitdown[pdf, docx, pptx]
76+
pip install 'markitdown[pdf, docx, pptx]'
7777
```
7878

7979
will install only the dependencies for PDF, DOCX, and PPTX files.

packages/markitdown-mcp/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ To run the MCP server, ussing STDIO (default) use the following command:
2525
markitdown-mcp
2626
```
2727

28-
To run the MCP server, ussing SSE use the following command:
28+
To run the MCP server, using SSE use the following command:
2929

3030
```bash
3131
markitdown-mcp --sse --host 127.0.0.1 --port 3001

packages/markitdown/src/markitdown/_markitdown.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
ZipConverter,
4242
EpubConverter,
4343
DocumentIntelligenceConverter,
44+
CsvConverter,
4445
)
4546

4647
from ._base_converter import DocumentConverter, DocumentConverterResult
@@ -194,6 +195,7 @@ def enable_builtins(self, **kwargs) -> None:
194195
self.register_converter(PdfConverter())
195196
self.register_converter(OutlookMsgConverter())
196197
self.register_converter(EpubConverter())
198+
self.register_converter(CsvConverter())
197199

198200
# Register Document Intelligence converter at the top of the stack if endpoint is provided
199201
docintel_endpoint = kwargs.get("docintel_endpoint")

packages/markitdown/src/markitdown/converters/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
DocumentIntelligenceFileType,
2323
)
2424
from ._epub_converter import EpubConverter
25+
from ._csv_converter import CsvConverter
2526

2627
__all__ = [
2728
"PlainTextConverter",
@@ -43,4 +44,5 @@
4344
"DocumentIntelligenceConverter",
4445
"DocumentIntelligenceFileType",
4546
"EpubConverter",
47+
"CsvConverter",
4648
]
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import sys
2+
import csv
3+
import io
4+
from typing import BinaryIO, Any
5+
from charset_normalizer import from_bytes
6+
from ._html_converter import HtmlConverter
7+
from .._base_converter import DocumentConverter, DocumentConverterResult
8+
from .._stream_info import StreamInfo
9+
10+
ACCEPTED_MIME_TYPE_PREFIXES = [
11+
"text/csv",
12+
"application/csv",
13+
]
14+
ACCEPTED_FILE_EXTENSIONS = [".csv"]
15+
16+
17+
class CsvConverter(DocumentConverter):
    """
    Converts CSV files to Markdown tables.

    The first CSV record is used as the table header. Every following record
    is padded with empty cells or truncated so that it has exactly as many
    columns as the header.
    """

    def __init__(self):
        super().__init__()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return True when the stream's file extension is listed in
        ACCEPTED_FILE_EXTENSIONS or its MIME type starts with one of
        ACCEPTED_MIME_TYPE_PREFIXES (both compared case-insensitively).
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        # str.startswith accepts a tuple, so one call covers every prefix.
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Read the whole stream, parse it as CSV and render a Markdown table.

        The bytes are decoded with ``stream_info.charset`` when it is set;
        otherwise the encoding is guessed with charset_normalizer. An input
        that yields no CSV records produces an empty Markdown string.
        """
        # Read the file content once; both decoding paths need the same bytes.
        raw = file_stream.read()
        if stream_info.charset:
            content = raw.decode(stream_info.charset)
        else:
            best_match = from_bytes(raw).best()
            if best_match is not None:
                content = str(best_match)
            else:
                # best() returns None when no encoding fits; str(None) would
                # leak the literal text "None" into the generated table.
                content = raw.decode("utf-8", errors="replace")

        # Parse CSV content
        rows = list(csv.reader(io.StringIO(content)))
        if not rows:
            return DocumentConverterResult(markdown="")

        header = rows[0]
        width = len(header)

        # Header row, then the separator row that Markdown tables require.
        markdown_table = [
            self._format_row(header),
            self._format_row(["---"] * width),
        ]

        # Data rows: truncate extra columns, pad missing ones. A new list is
        # built so the parsed rows are not mutated. ([""] * n is [] for n <= 0.)
        for row in rows[1:]:
            cells = row[:width] + [""] * (width - len(row))
            markdown_table.append(self._format_row(cells))

        return DocumentConverterResult(markdown="\n".join(markdown_table))

    @staticmethod
    def _escape_cell(cell: str) -> str:
        """Make a CSV field safe to embed in a single Markdown table cell."""
        # Quoted CSV fields may contain line breaks; a raw newline would end
        # the table row, so normalise them and emit an inline <br> instead.
        cell = cell.replace("\r\n", "\n").replace("\r", "\n")
        # An unescaped pipe would be read as a column delimiter.
        return cell.replace("|", "\\|").replace("\n", "<br>")

    @classmethod
    def _format_row(cls, cells) -> str:
        """Render a list of cell strings as one Markdown table row."""
        return "| " + " | ".join(cls._escape_cell(cell) for cell in cells) + " |"

packages/markitdown/src/markitdown/converters/_doc_intel_converter.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
import sys
22
import re
33
import os
4-
5-
from typing import BinaryIO, Any, List
4+
from typing import BinaryIO, Any, List, Optional, Union
65
from enum import Enum
76

87
from ._html_converter import HtmlConverter
@@ -26,6 +25,28 @@
2625
# Preserve the error and stack trace for later
2726
_dependency_exc_info = sys.exc_info()
2827

28+
# Define these types for type hinting when the package is not available
29+
class AzureKeyCredential:
30+
pass
31+
32+
class TokenCredential:
33+
pass
34+
35+
class DocumentIntelligenceClient:
36+
pass
37+
38+
class AnalyzeDocumentRequest:
39+
pass
40+
41+
class AnalyzeResult:
42+
pass
43+
44+
class DocumentAnalysisFeature:
45+
pass
46+
47+
class DefaultAzureCredential:
48+
pass
49+
2950

3051
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
3152
# This constant is a temporary fix until the bug is resolved.

packages/markitdown/tests/_test_vectors.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -144,10 +144,11 @@ class FileTestVector(object):
144144
charset="cp932",
145145
url=None,
146146
must_include=[
147-
"名前,年齢,住所",
148-
"佐藤太郎,30,東京",
149-
"三木英子,25,大阪",
150-
"髙橋淳,35,名古屋",
147+
"| 名前 | 年齢 | 住所 |",
148+
"| --- | --- | --- |",
149+
"| 佐藤太郎 | 30 | 東京 |",
150+
"| 三木英子 | 25 | 大阪 |",
151+
"| 髙橋淳 | 35 | 名古屋 |",
151152
],
152153
must_not_include=[],
153154
),

0 commit comments

Comments
 (0)