forked from gkamradt/QuickAgent
-
Notifications
You must be signed in to change notification settings - Fork 6
/
dl_yt_subtitles.py
108 lines (84 loc) · 3.82 KB
/
dl_yt_subtitles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import yt_dlp
import re
def download_youtube_video_info(url):
# Configuration for yt-dlp
ydl_opts = {
'writesubtitles': True, # Download subtitles if available
'writeautomaticsub': True, # Download automatic subtitles if no others are available
'subtitleslangs': ['en'], # Preferred languages for the subtitles
'skip_download': True, # Skip downloading the video, only metadata and subtitles
'quiet': True, # Reduce output
'outtmpl': 'subtitles', # Filename for the subtitle file
'format': 'bestaudio/best' # Select the best available quality
}
# Using yt-dlp to extract information
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
result = ydl.extract_info(url, download=False) # Only download info, not the video
# Check if subtitles are available
subtitles = result.get('requested_subtitles')
subtitle_content = None
if subtitles:
language = list(subtitles.keys())[0]
subtitle_url = subtitles[language]['url']
subtitle_content = ydl.urlopen(subtitle_url).read().decode('utf-8')
# Compile data
video_data = {
'title': result.get('title'),
'description': result.get('description'),
'url': result.get('webpage_url'),
'subtitles': subtitle_content
}
return video_data
def extract_and_concat_subtitle_text(subtitle_text):
# Regex to find all occurrences of text within <c>...</c> tags
pattern = re.compile(r'<c>(.*?)<\/c>')
matches = pattern.findall(subtitle_text)
concatenated_text = "Video Title:" + extract_title(subtitle_text)
concatenated_text += "\nDescription: "+ extract_description(subtitle_text)
# Joining all the matched texts with a space
concatenated_text += "\nSubtitles: "''.join(matches)
return concatenated_text
def extract_title(text):
# Regex pattern to find text between 'title': ' and ', 'description
pattern = re.compile(r"'title':\s*'([^']*)',\s*'description")
# Searching the text for the pattern
match = pattern.search(text)
# If a match is found, return the captured group, otherwise return None
return match.group(1) if match else None
def extract_description(input_string):
# Regular expression to find text between ', 'description and ', 'url
pattern = re.compile(r", 'description': '(.*?)', 'url'")
match = pattern.search(input_string)
# If a match is found, return the matched text; otherwise, return None
if match:
return match.group(1)
else:
return None
def find_first_youtube_url(text):
# Pattern to match different YouTube URL formats
pattern = re.compile(
r'(https?://(?:www\.)?youtube\.com/watch\?v=[\w-]+'
r'|https?://youtu\.be/[\w-]+'
r'|https?://(?:www\.)?youtube\.com/embed/[\w-]+'
r'|https?://(?:www\.)?youtube\.com/channel/[\w-]+'
r'|https?://(?:www\.)?youtube\.com/user/[\w-]+'
r'|https?://music\.youtube\.com/watch\?v=[\w-]+'
r'|https?://tv\.youtube\.com)'
)
# Search for the first YouTube URL in the text
match = pattern.search(text)
if match:
return match.group(0) # Return the matched URL
return None # If no URL is found, return None
'''
# Example usage
example_text = "Check out this video at https://www.youtube.com/watch?v=dQw4w9WgXcQ and this one at https://youtu.be/NpEaa2P7qZI"
first_youtube_url = find_first_youtube_url(example_text)
print(first_youtube_url)
# Example of using the function
video_url = 'https://www.youtube.com/watch?v=o27GDttmHxY'
video_info = download_youtube_video_info(video_url)
print(video_info)
text= extract_and_concat_subtitle_text(str(video_info))
print(text)
'''