Skip to content

Commit 716f74d

Browse files
authored
Consider anything with a charset as plain text-convertible. (#1142)
1 parent a93e056 commit 716f74d

File tree

1 file changed

+16
-7
lines changed

1 file changed

+16
-7
lines changed

packages/markitdown/src/markitdown/converters/_plain_text_converter.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,16 @@
1717
ACCEPTED_MIME_TYPE_PREFIXES = [
1818
"text/",
1919
"application/json",
20+
"application/markdown",
2021
]
2122

22-
# Mimetypes to ignore (commonly confused extensions)
23-
IGNORE_MIME_TYPE_PREFIXES = [
24-
"text/vnd.in3d.spot", # .spo which is confused with xls, doc, etc.
25-
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc.
23+
ACCEPTED_FILE_EXTENSIONS = [
24+
".txt",
25+
".text",
26+
".md",
27+
".markdown",
28+
".json",
29+
".jsonl",
2630
]
2731

2832

@@ -38,9 +42,14 @@ def accepts(
3842
mimetype = (stream_info.mimetype or "").lower()
3943
extension = (stream_info.extension or "").lower()
4044

41-
for prefix in IGNORE_MIME_TYPE_PREFIXES:
42-
if mimetype.startswith(prefix):
43-
return False
45+
# If we have a charset, we can safely assume it's text
46+
# With Magika in the earlier stages, this handles most cases
47+
if stream_info.charset is not None:
48+
return True
49+
50+
# Otherwise, check the mimetype and extension
51+
if extension in ACCEPTED_FILE_EXTENSIONS:
52+
return True
4453

4554
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
4655
if mimetype.startswith(prefix):

0 commit comments

Comments
 (0)