-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: enhance YouTubeTranscripts component with Data output support (#6113)

* 📝 (youtube_transcripts.py): update description of YouTubeTranscriptsComponent to be more concise and accurate ✨ (youtube_transcripts.py): add new output option 'data_output' to provide transcript along with the source video URL 🔧 (youtube_transcripts.py): add method 'get_data_output' to handle the new 'data_output' output option and return a Data object with transcript, video URL, and error message
* [autofix.ci] apply automated fixes
* 📝 (youtube_transcripts.py): improve documentation for get_data_output method to provide a clear description of the returned data object and its contents 🐛 (youtube_transcripts.py): handle specific exceptions from the youtube_transcript_api library to provide more informative error messages and improve error handling in the get_data_output method
* [autofix.ci] apply automated fixes
* 🐛 (youtube_transcripts.py): handle case where no transcripts are found by updating the error message and returning a default data object 🔧 (youtube_transcripts.py): refactor get_data_output method to use a default data object and combine all transcript parts into a single continuous text
* [autofix.ci] apply automated fixes
* ✨ (test_youtube_transcript_component.py): Add unit tests for YouTubeTranscriptsComponent to test various functionalities such as component initialization, output generation, error handling, and setting translation languages.
* [autofix.ci] apply automated fixes
* ✅ (test_youtube_transcript_component.py): update file_names_mapping fixture to return a non-empty list to properly test different versions of file names mapping in the YouTube transcripts component
* [autofix.ci] apply automated fixes
* 📝 (test_youtube_transcript_component.py): Add docstrings and improve variable names for better readability and maintainability 🔧 (test_youtube_transcript_component.py): Refactor error handling in test methods to use descriptive error messages and improve code readability
* [autofix.ci] apply automated fixes

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
- Loading branch information
1 parent
17f1ecf
commit d98d377
Showing
4 changed files
with
200 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
Empty file.
167 changes: 167 additions & 0 deletions
167
src/backend/tests/unit/components/bundles/youtube/test_youtube_transcript_component.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
from unittest.mock import Mock, patch | ||
|
||
import pytest | ||
from langflow.components.youtube.youtube_transcripts import YouTubeTranscriptsComponent | ||
from langflow.schema import Data, DataFrame, Message | ||
from youtube_transcript_api import NoTranscriptFound, TranscriptsDisabled | ||
|
||
from tests.base import ComponentTestBaseWithoutClient | ||
|
||
|
||
class TestYouTubeTranscriptsComponent(ComponentTestBaseWithoutClient):
    """Unit tests for ``YouTubeTranscriptsComponent``.

    Every test that calls one of the component's output methods patches
    ``YoutubeLoader`` inside the component module and drives the result of
    ``YoutubeLoader.from_youtube_url(...).load()`` through the mock.
    """

    @pytest.fixture
    def component_class(self):
        """Return the component class to test."""
        return YouTubeTranscriptsComponent

    @pytest.fixture
    def default_kwargs(self):
        """Return the default kwargs for the component."""
        return {
            "url": "https://www.youtube.com/watch?v=test123",
            "chunk_size_seconds": 60,
            "translation": "",
        }

    @pytest.fixture
    def file_names_mapping(self):
        """Return the file names mapping for different versions."""
        return []

    @pytest.fixture
    def mock_transcript_data(self):
        """Return two mock transcript chunks whose metadata starts at 0 and 60 seconds."""
        return [
            Mock(page_content="First part of the transcript", metadata={"start_seconds": 0}),
            Mock(page_content="Second part of the transcript", metadata={"start_seconds": 60}),
        ]

    def test_basic_setup(self, component_class, default_kwargs):
        """Check that ``set_attributes`` exposes each default kwarg as an attribute."""
        component = component_class()
        component.set_attributes(default_kwargs)

        assert component.url == default_kwargs["url"]
        assert component.chunk_size_seconds == default_kwargs["chunk_size_seconds"]
        assert component.translation == default_kwargs["translation"]

    @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader")
    def test_get_dataframe_output_success(self, mock_loader, component_class, default_kwargs, mock_transcript_data):
        """Check the DataFrame output: one row per chunk with ``timestamp`` and ``text`` columns."""
        mock_loader.from_youtube_url.return_value.load.return_value = mock_transcript_data

        component = component_class()
        component.set_attributes(default_kwargs)
        result = component.get_dataframe_output()

        assert isinstance(result, DataFrame)
        assert len(result) == 2
        assert list(result.columns) == ["timestamp", "text"]
        # start_seconds 0 and 60 are expected to be rendered as MM:SS strings.
        assert result.iloc[0]["timestamp"] == "00:00"
        assert result.iloc[1]["timestamp"] == "01:00"
        assert result.iloc[0]["text"] == "First part of the transcript"

    @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader")
    def test_get_message_output_success(self, mock_loader, component_class, default_kwargs, mock_transcript_data):
        """Check the Message output: its text equals the first loaded chunk's content."""
        mock_loader.from_youtube_url.return_value.load.return_value = mock_transcript_data

        component = component_class()
        component.set_attributes(default_kwargs)
        result = component.get_message_output()

        assert isinstance(result, Message)
        assert result.text == "First part of the transcript"

    @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader")
    def test_get_data_output_success(self, mock_loader, component_class, default_kwargs, mock_transcript_data):
        """Check the Data output: source URL, space-joined transcript, and no ``error`` key."""
        mock_loader.from_youtube_url.return_value.load.return_value = mock_transcript_data

        component = component_class()
        component.set_attributes(default_kwargs)
        result = component.get_data_output()

        assert isinstance(result, Data)
        assert result.data["video_url"] == default_kwargs["url"]
        assert result.data["transcript"] == "First part of the transcript Second part of the transcript"
        assert "error" not in result.data

    @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader")
    def test_transcript_disabled_error(self, mock_loader, component_class, default_kwargs):
        """Check that all three outputs report an error when ``load`` raises ``TranscriptsDisabled``."""
        error_message = "Transcripts are disabled for this video"

        # An exception instance used as side_effect is raised on every call to
        # load(), whatever arguments the component passes.
        # NOTE(review): the string is passed as the exception's only constructor
        # argument; confirm TranscriptsDisabled expects a message there and not a video id.
        mock_loader.from_youtube_url.return_value.load.side_effect = TranscriptsDisabled(error_message)

        component = component_class()
        component.set_attributes(default_kwargs)

        # DataFrame output: a single row carrying the error text.
        df_result = component.get_dataframe_output()
        assert isinstance(df_result, DataFrame)
        assert len(df_result) == 1
        assert "error" in df_result.columns
        assert "Failed to get YouTube transcripts" in df_result["error"][0]

        # Message output: the error text is returned as the message body.
        msg_result = component.get_message_output()
        assert isinstance(msg_result, Message)
        assert "Failed to get YouTube transcripts" in msg_result.text

        # Data output: an "error" key is present and the transcript is empty.
        data_result = component.get_data_output()
        assert isinstance(data_result, Data)
        assert "error" in data_result.data
        assert data_result.data["transcript"] == ""

    @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader")
    def test_no_transcript_found_error(self, mock_loader, component_class, default_kwargs):
        """Check that the Data output reports an error when ``load`` raises ``NoTranscriptFound``."""
        video_id = "test123"
        requested_langs = ["en"]
        transcript_data = {"en": {"translationLanguages": []}}

        # An exception instance used as side_effect is raised on every call to
        # load(), whatever arguments the component passes.
        load_mock = mock_loader.from_youtube_url.return_value.load
        load_mock.side_effect = NoTranscriptFound(video_id, requested_langs, transcript_data)

        component = component_class()
        component.set_attributes(default_kwargs)

        data_result = component.get_data_output()
        assert isinstance(data_result, Data)
        assert "error" in data_result.data
        assert data_result.data["transcript"] == ""

    def test_translation_setting(self, component_class):
        """Check that ``translation`` reflects each value passed to ``set_attributes``."""
        component = component_class()
        test_cases = ["en", "es", "fr", ""]

        # The same component instance is reused, so each iteration also checks
        # that a later set_attributes call overrides the previous value.
        for lang in test_cases:
            component.set_attributes({"url": "https://youtube.com/watch?v=test", "translation": lang})
            assert component.translation == lang

    @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader")
    def test_empty_transcript_handling(self, mock_loader, component_class, default_kwargs):
        """Check the Data and DataFrame outputs when ``load`` returns an empty list."""
        mock_loader.from_youtube_url.return_value.load.return_value = []

        component = component_class()
        component.set_attributes(default_kwargs)

        # Data output: a fixed error message and an empty transcript.
        data_result = component.get_data_output()
        assert data_result.data["error"] == "No transcripts found."
        assert data_result.data["transcript"] == ""

        # DataFrame output: no rows.
        df_result = component.get_dataframe_output()
        assert len(df_result) == 0