Adjust warning filters and update dependencies (#1143)

afourney · web-flow · commit cd6aa41361d4 · 2025-03-19T22:09:14.000-07:00
Adjusts warning filters to be more contextual
Updates dependencies for magika and youtube-transcript-api
Updates the version to 0.1.0a5 in __about__.py
diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
@@ -27,7 +27,7 @@ dependencies = [
   "beautifulsoup4",
   "requests",
   "markdownify",
-  "magika>=0.6.1rc3",
+  "magika~=0.6.1",
   "charset-normalizer",
 ]
 
@@ -42,7 +42,7 @@ all = [
   "olefile",
   "pydub",
   "SpeechRecognition",
-  "youtube-transcript-api",
+  "youtube-transcript-api~=1.0.0",
   "azure-ai-documentintelligence",
   "azure-identity"
 ]
diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.0a4"
+__version__ = "0.1.0a5"
diff --git a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py
@@ -7,20 +7,14 @@
 # Save reporting of any exceptions for later
 _dependency_exc_info = None
 try:
-    # Suppress some deprecation warnings from the speech_recognition library
+    # Suppress some warnings on library import
     import warnings
 
-    warnings.filterwarnings(
-        "ignore", category=DeprecationWarning, module="speech_recognition"
-    )
-    warnings.filterwarnings(
-        "ignore",
-        category=SyntaxWarning,
-        module="pydub",  # TODO: Migrate away from pydub
-    )
-    import speech_recognition as sr
-
-    import pydub
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=DeprecationWarning)
+        warnings.filterwarnings("ignore", category=SyntaxWarning)
+        import speech_recognition as sr
+        import pydub
 except ImportError:
     # Preserve the error and stack trace for later
     _dependency_exc_info = sys.exc_info()
diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@@ -4,22 +4,21 @@
 import io
 import re
 import bs4
-import warnings
 from typing import Any, BinaryIO, Optional, Dict, List, Union
 from urllib.parse import parse_qs, urlparse, unquote
 
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
-from ._markdownify import _CustomMarkdownify
 
 # Optional YouTube transcription support
 try:
-    warnings.filterwarnings(
-        "ignore",
-        category=SyntaxWarning,
-        module="youtube_transcript_api",  # Patch submitted to youtube-transcript-api
-    )
-    from youtube_transcript_api import YouTubeTranscriptApi
+    # Suppress some warnings on library import
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=SyntaxWarning)
+        # Patch submitted upstream to fix the SyntaxWarning
+        from youtube_transcript_api import YouTubeTranscriptApi
 
     IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
 except ModuleNotFoundError:
@@ -148,6 +147,7 @@ def convert(
             webpage_text += f"\n### Description\n{description}\n"
 
         if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
+            ytt_api = YouTubeTranscriptApi()
             transcript_text = ""
             parsed_url = urlparse(stream_info.url)  # type: ignore
             params = parse_qs(parsed_url.query)  # type: ignore
@@ -159,19 +159,16 @@ def convert(
                     )
                     # Retry the transcript fetching operation
                     transcript = self._retry_operation(
-                        lambda: YouTubeTranscriptApi.get_transcript(
+                        lambda: ytt_api.fetch(
                             video_id, languages=youtube_transcript_languages
                         ),
                         retries=3,  # Retry 3 times
                         delay=2,  # 2 seconds delay between retries
                     )
                     if transcript:
                         transcript_text = " ".join(
-                            [part["text"] for part in transcript]
+                            [part.text for part in transcript]
                         )  # type: ignore
-                    # Alternative formatting:
-                    # formatter = TextFormatter()
-                    # formatter.format_transcript(transcript)
                 except Exception as e:
                     print(f"Error fetching transcript: {e}")
             if transcript_text:

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`	`1`	`# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>`
`2`	`2`	`#`
`3`	`3`	`# SPDX-License-Identifier: MIT`
`4`		`-__version__ = "0.1.0a4"`
	`4`	`+__version__ = "0.1.0a5"`