Skip to content

Commit d98d377

Browse files
feat: enhance YouTubeTranscripts component with Data output support (#6113)
* 📝 (youtube_transcripts.py): update description of YouTubeTranscriptsComponent to be more concise and accurate
  ✨ (youtube_transcripts.py): add new output option 'data_output' to provide transcript along with the source video URL
  🔧 (youtube_transcripts.py): add method 'get_data_output' to handle the new 'data_output' output option and return a Data object with transcript, video URL, and error message
* [autofix.ci] apply automated fixes
* 📝 (youtube_transcripts.py): improve documentation for get_data_output method to provide a clear description of the returned data object and its contents
  🐛 (youtube_transcripts.py): handle specific exceptions from the youtube_transcript_api library to provide more informative error messages and improve error handling in the get_data_output method
* [autofix.ci] apply automated fixes
* 🐛 (youtube_transcripts.py): handle case where no transcripts are found by updating the error message and returning a default data object
  🔧 (youtube_transcripts.py): refactor get_data_output method to use a default data object and combine all transcript parts into a single continuous text
* [autofix.ci] apply automated fixes
* ✨ (test_youtube_transcript_component.py): Add unit tests for YouTubeTranscriptsComponent to test various functionalities such as component initialization, output generation, error handling, and setting translation languages.
* [autofix.ci] apply automated fixes
* ✅ (test_youtube_transcript_component.py): update file_names_mapping fixture to return a non-empty list to properly test different versions of file names mapping in the YouTube transcripts component
* [autofix.ci] apply automated fixes
* 📝 (test_youtube_transcript_component.py): Add docstrings and improve variable names for better readability and maintainability
  🔧 (test_youtube_transcript_component.py): Refactor error handling in test methods to use descriptive error messages and improve code readability
* [autofix.ci] apply automated fixes

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
1 parent 17f1ecf commit d98d377

File tree

4 files changed

+200
-2
lines changed

4 files changed

+200
-2
lines changed

src/backend/base/langflow/components/youtube/youtube_transcripts.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@
55

66
from langflow.custom import Component
77
from langflow.inputs import DropdownInput, IntInput, MultilineInput
8-
from langflow.schema import DataFrame, Message
8+
from langflow.schema import Data, DataFrame, Message
99
from langflow.template import Output
1010

1111

1212
class YouTubeTranscriptsComponent(Component):
1313
"""A component that extracts spoken content from YouTube videos as transcripts."""
1414

1515
display_name: str = "YouTube Transcripts"
16-
description: str = "Extracts spoken content from YouTube videos with both DataFrame and text output options."
16+
description: str = "Extracts spoken content from YouTube videos with multiple output options."
1717
icon: str = "YouTube"
1818
name = "YouTubeTranscripts"
1919

@@ -43,6 +43,7 @@ class YouTubeTranscriptsComponent(Component):
4343
outputs = [
4444
Output(name="dataframe", display_name="Chunks", method="get_dataframe_output"),
4545
Output(name="message", display_name="Transcript", method="get_message_output"),
46+
Output(name="data_output", display_name="Transcript + Source", method="get_data_output"),
4647
]
4748

4849
def _load_transcripts(self, *, as_chunks: bool = True):
@@ -68,6 +69,7 @@ def get_dataframe_output(self) -> DataFrame:
6869
start_seconds %= 60
6970
timestamp = f"{start_minutes:02d}:{start_seconds:02d}"
7071
data.append({"timestamp": timestamp, "text": doc.page_content})
72+
7173
return DataFrame(pd.DataFrame(data))
7274

7375
except (youtube_transcript_api.TranscriptsDisabled, youtube_transcript_api.NoTranscriptFound) as exc:
@@ -83,3 +85,32 @@ def get_message_output(self) -> Message:
8385
except (youtube_transcript_api.TranscriptsDisabled, youtube_transcript_api.NoTranscriptFound) as exc:
8486
error_msg = f"Failed to get YouTube transcripts: {exc!s}"
8587
return Message(text=error_msg)
88+
89+
def get_data_output(self) -> Data:
    """Return the whole transcript together with the URL it came from.

    The transcript is loaded un-chunked and its parts are joined with single
    spaces into one string.

    Returns:
        A Data object whose payload is one of:
        - success: {'transcript': <joined text>, 'video_url': <input URL>}
          (no 'error' key is included on this path)
        - failure: {'transcript': '', 'video_url': <input URL>, 'error': <message>}
          where the message is "No transcripts found." for an empty result, or
          the string form of the caught youtube_transcript_api exception.
    """
    source_url = self.url

    def _failure(reason: str) -> Data:
        # Every failure path shares the same payload shape: empty transcript plus a reason.
        return Data(data={"transcript": "", "video_url": source_url, "error": reason})

    try:
        documents = self._load_transcripts(as_chunks=False)
        if documents:
            # Stitch every returned part into one continuous text.
            joined_text = " ".join(part.page_content for part in documents)
            return Data(data={"transcript": joined_text, "video_url": self.url})
        return _failure("No transcripts found.")
    except (
        youtube_transcript_api.TranscriptsDisabled,
        youtube_transcript_api.NoTranscriptFound,
        youtube_transcript_api.CouldNotRetrieveTranscript,
    ) as exc:
        return _failure(str(exc))

src/backend/tests/unit/components/bundles/__init__.py

Whitespace-only changes.

src/backend/tests/unit/components/bundles/youtube/__init__.py

Whitespace-only changes.
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
from unittest.mock import Mock, patch
2+
3+
import pytest
4+
from langflow.components.youtube.youtube_transcripts import YouTubeTranscriptsComponent
5+
from langflow.schema import Data, DataFrame, Message
6+
from youtube_transcript_api import NoTranscriptFound, TranscriptsDisabled
7+
8+
from tests.base import ComponentTestBaseWithoutClient
9+
10+
11+
class TestYouTubeTranscriptsComponent(ComponentTestBaseWithoutClient):
    """Unit tests covering the DataFrame, Message and Data outputs of the component."""

    @staticmethod
    def _configured(component_class, attributes):
        """Instantiate the component and apply the given attributes to it."""
        instance = component_class()
        instance.set_attributes(attributes)
        return instance

    @pytest.fixture
    def component_class(self):
        """Provide the component class under test."""
        return YouTubeTranscriptsComponent

    @pytest.fixture
    def default_kwargs(self):
        """Provide the attribute values used by most tests."""
        return {
            "url": "https://www.youtube.com/watch?v=test123",
            "chunk_size_seconds": 60,
            "translation": "",
        }

    @pytest.fixture
    def file_names_mapping(self):
        """Provide the file names mapping for different versions."""
        return []

    @pytest.fixture
    def mock_transcript_data(self):
        """Provide two fake transcript documents, starting at 0s and 60s."""
        parts = (
            ("First part of the transcript", 0),
            ("Second part of the transcript", 60),
        )
        return [Mock(page_content=text, metadata={"start_seconds": start}) for text, start in parts]

    def test_basic_setup(self, component_class, default_kwargs):
        """Attributes passed to set_attributes are readable on the component."""
        component = self._configured(component_class, default_kwargs)
        for field in ("url", "chunk_size_seconds", "translation"):
            assert getattr(component, field) == default_kwargs[field]

    @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader")
    def test_get_dataframe_output_success(self, mock_loader, component_class, default_kwargs, mock_transcript_data):
        """A successful load yields a DataFrame with timestamp and text columns."""
        load = mock_loader.from_youtube_url.return_value.load
        load.return_value = mock_transcript_data

        frame = self._configured(component_class, default_kwargs).get_dataframe_output()

        assert isinstance(frame, DataFrame)
        assert len(frame) == 2
        assert list(frame.columns) == ["timestamp", "text"]
        first_row, second_row = frame.iloc[0], frame.iloc[1]
        assert first_row["timestamp"] == "00:00"
        assert second_row["timestamp"] == "01:00"
        assert first_row["text"] == "First part of the transcript"

    @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader")
    def test_get_message_output_success(self, mock_loader, component_class, default_kwargs, mock_transcript_data):
        """A successful load yields a Message carrying the transcript text."""
        load = mock_loader.from_youtube_url.return_value.load
        load.return_value = mock_transcript_data

        message = self._configured(component_class, default_kwargs).get_message_output()

        assert isinstance(message, Message)
        assert message.text == "First part of the transcript"

    @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader")
    def test_get_data_output_success(self, mock_loader, component_class, default_kwargs, mock_transcript_data):
        """A successful load yields a Data object with joined text and the source URL."""
        load = mock_loader.from_youtube_url.return_value.load
        load.return_value = mock_transcript_data

        payload = self._configured(component_class, default_kwargs).get_data_output()

        assert isinstance(payload, Data)
        assert payload.data["video_url"] == default_kwargs["url"]
        assert payload.data["transcript"] == "First part of the transcript Second part of the transcript"
        assert "error" not in payload.data

    @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader")
    def test_transcript_disabled_error(self, mock_loader, component_class, default_kwargs):
        """TranscriptsDisabled is reported through each of the three outputs."""
        error_message = "Transcripts are disabled for this video"

        # An exception instance used as side_effect is raised on every call to load().
        load = mock_loader.from_youtube_url.return_value.load
        load.side_effect = TranscriptsDisabled(error_message)

        component = self._configured(component_class, default_kwargs)

        # DataFrame output: a single row holding the error text.
        frame = component.get_dataframe_output()
        assert isinstance(frame, DataFrame)
        assert len(frame) == 1  # One row for error message
        assert "error" in frame.columns
        assert "Failed to get YouTube transcripts" in frame["error"][0]

        # Message output: the error text becomes the message body.
        message = component.get_message_output()
        assert isinstance(message, Message)
        assert "Failed to get YouTube transcripts" in message.text

        # Data output: empty transcript plus an 'error' entry.
        payload = component.get_data_output()
        assert isinstance(payload, Data)
        assert "error" in payload.data
        assert payload.data["transcript"] == ""

    @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader")
    def test_no_transcript_found_error(self, mock_loader, component_class, default_kwargs):
        """NoTranscriptFound is reported through the Data output."""
        video_id = "test123"
        requested_langs = ["en"]
        transcript_data = {"en": {"translationLanguages": []}}

        # An exception instance used as side_effect is raised when load() is called.
        load = mock_loader.from_youtube_url.return_value.load
        load.side_effect = NoTranscriptFound(video_id, requested_langs, transcript_data)

        payload = self._configured(component_class, default_kwargs).get_data_output()

        assert isinstance(payload, Data)
        assert "error" in payload.data
        assert payload.data["transcript"] == ""

    def test_translation_setting(self, component_class):
        """The translation attribute reflects whichever language was last set."""
        component = component_class()

        for language in ("en", "es", "fr", ""):
            component.set_attributes({"url": "https://youtube.com/watch?v=test", "translation": language})
            assert component.translation == language

    @patch("langflow.components.youtube.youtube_transcripts.YoutubeLoader")
    def test_empty_transcript_handling(self, mock_loader, component_class, default_kwargs):
        """An empty loader result gives an error payload and an empty DataFrame."""
        load = mock_loader.from_youtube_url.return_value.load
        load.return_value = []

        component = self._configured(component_class, default_kwargs)

        # Data output with empty transcript
        payload = component.get_data_output()
        assert payload.data["error"] == "No transcripts found."
        assert payload.data["transcript"] == ""

        # DataFrame output with empty transcript
        frame = component.get_dataframe_output()
        assert len(frame) == 0

0 commit comments

Comments
 (0)