Skip to content

Commit cd6aa41

Browse files
authored
Adjust warning filters and update dependencies (#1143)
Adjusts warning filters to be more contextual. Updates dependencies for magika and youtube-transcript-api. Updates the version to 0.1.0a5 in __about__.py.
1 parent 716f74d commit cd6aa41

File tree

4 files changed

+19
-28
lines changed

4 files changed

+19
-28
lines changed

packages/markitdown/pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ dependencies = [
2727
"beautifulsoup4",
2828
"requests",
2929
"markdownify",
30-
"magika>=0.6.1rc3",
30+
"magika~=0.6.1",
3131
"charset-normalizer",
3232
]
3333

@@ -42,7 +42,7 @@ all = [
4242
"olefile",
4343
"pydub",
4444
"SpeechRecognition",
45-
"youtube-transcript-api",
45+
"youtube-transcript-api~=1.0.0",
4646
"azure-ai-documentintelligence",
4747
"azure-identity"
4848
]
__about__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# SPDX-FileCopyrightText: 2024-present Adam Fourney <[email protected]>
22
#
33
# SPDX-License-Identifier: MIT
4-
__version__ = "0.1.0a4"
4+
__version__ = "0.1.0a5"

packages/markitdown/src/markitdown/converters/_transcribe_audio.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,14 @@
77
# Save reporting of any exceptions for later
88
_dependency_exc_info = None
99
try:
10-
# Suppress some deprecation warnings from the speech_recognition library
10+
# Suppress some warnings on library import
1111
import warnings
1212

13-
warnings.filterwarnings(
14-
"ignore", category=DeprecationWarning, module="speech_recognition"
15-
)
16-
warnings.filterwarnings(
17-
"ignore",
18-
category=SyntaxWarning,
19-
module="pydub", # TODO: Migrate away from pydub
20-
)
21-
import speech_recognition as sr
22-
23-
import pydub
13+
with warnings.catch_warnings():
14+
warnings.filterwarnings("ignore", category=DeprecationWarning)
15+
warnings.filterwarnings("ignore", category=SyntaxWarning)
16+
import speech_recognition as sr
17+
import pydub
2418
except ImportError:
2519
# Preserve the error and stack trace for later
2620
_dependency_exc_info = sys.exc_info()

packages/markitdown/src/markitdown/converters/_youtube_converter.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,21 @@
44
import io
55
import re
66
import bs4
7-
import warnings
87
from typing import Any, BinaryIO, Optional, Dict, List, Union
98
from urllib.parse import parse_qs, urlparse, unquote
109

1110
from .._base_converter import DocumentConverter, DocumentConverterResult
1211
from .._stream_info import StreamInfo
13-
from ._markdownify import _CustomMarkdownify
1412

1513
# Optional YouTube transcription support
1614
try:
17-
warnings.filterwarnings(
18-
"ignore",
19-
category=SyntaxWarning,
20-
module="youtube_transcript_api", # Patch submitted to youtube-transcript-api
21-
)
22-
from youtube_transcript_api import YouTubeTranscriptApi
15+
# Suppress some warnings on library import
16+
import warnings
17+
18+
with warnings.catch_warnings():
19+
warnings.filterwarnings("ignore", category=SyntaxWarning)
20+
# Patch submitted upstream to fix the SyntaxWarning
21+
from youtube_transcript_api import YouTubeTranscriptApi
2322

2423
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
2524
except ModuleNotFoundError:
@@ -148,6 +147,7 @@ def convert(
148147
webpage_text += f"\n### Description\n{description}\n"
149148

150149
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
150+
ytt_api = YouTubeTranscriptApi()
151151
transcript_text = ""
152152
parsed_url = urlparse(stream_info.url) # type: ignore
153153
params = parse_qs(parsed_url.query) # type: ignore
@@ -159,19 +159,16 @@ def convert(
159159
)
160160
# Retry the transcript fetching operation
161161
transcript = self._retry_operation(
162-
lambda: YouTubeTranscriptApi.get_transcript(
162+
lambda: ytt_api.fetch(
163163
video_id, languages=youtube_transcript_languages
164164
),
165165
retries=3, # Retry 3 times
166166
delay=2, # 2 seconds delay between retries
167167
)
168168
if transcript:
169169
transcript_text = " ".join(
170-
[part["text"] for part in transcript]
170+
[part.text for part in transcript]
171171
) # type: ignore
172-
# Alternative formatting:
173-
# formatter = TextFormatter()
174-
# formatter.format_transcript(transcript)
175172
except Exception as e:
176173
print(f"Error fetching transcript: {e}")
177174
if transcript_text:

0 commit comments

Comments
 (0)