Skip to content

Commit 781cabd

Browse files
authored
Merge pull request #22 from Jsakkos/19-add-support-non-pgs-subtitles-ie-from-dvds
19 add support non pgs subtitles ie from dvds
2 parents 3d6401c + 15074eb commit 781cabd

File tree

4 files changed

+177
-41
lines changed

4 files changed

+177
-41
lines changed

mkv_episode_matcher/episode_matcher.py

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -74,28 +74,36 @@ def process_show(season=None, dry_run=False, get_subs=False):
7474
logger.warning(f"Season {season} has no .mkv files to process")
7575
return
7676

77+
season_paths_to_process = [season_path]
78+
else:
79+
# Process all valid seasons
80+
season_paths_to_process = valid_season_paths
81+
82+
# Process each season
83+
for season_path in season_paths_to_process:
84+
logger.info(f"Processing season path: {season_path}")
7785
mkv_files = [
7886
os.path.join(season_path, f)
7987
for f in os.listdir(season_path)
8088
if f.endswith(".mkv")
8189
]
82-
else:
83-
# Process all valid seasons
84-
for season_path in valid_season_paths:
85-
mkv_files = [
86-
os.path.join(season_path, f)
87-
for f in os.listdir(season_path)
88-
if f.endswith(".mkv")
89-
]
90-
# Filter out files that have already been processed
91-
for f in mkv_files:
92-
if check_filename(f):
93-
logger.info(f"Skipping {f}, already processed")
94-
mkv_files.remove(f)
95-
if len(mkv_files) == 0:
96-
logger.info("No new files to process")
97-
return
98-
convert_mkv_to_srt(season_path, mkv_files)
90+
91+
# Filter out files that have already been processed
92+
unprocessed_files = []
93+
for f in mkv_files:
94+
if check_filename(f):
95+
logger.info(f"Skipping {f}, already processed")
96+
else:
97+
unprocessed_files.append(f)
98+
99+
if not unprocessed_files:
100+
logger.info(f"No new files to process in {season_path}")
101+
continue
102+
103+
logger.info(f"Processing {len(unprocessed_files)} files in {season_path}")
104+
convert_mkv_to_srt(season_path, unprocessed_files)
105+
106+
# Process reference and SRT files after all seasons are converted
99107
reference_text_dict = process_reference_srt_files(show_name)
100108
srt_text_dict = process_srt_files(show_dir)
101109
compare_and_rename_files(srt_text_dict, reference_text_dict, dry_run=dry_run)

mkv_episode_matcher/mkv_to_srt.py

Lines changed: 150 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,28 @@
99
import re
1010
from concurrent.futures import ThreadPoolExecutor
1111
from datetime import datetime, timedelta
12-
12+
from pathlib import Path
1313
import pytesseract
1414
from imagemaker import make_image
1515
from loguru import logger
1616
from pgsreader import PGSReader
1717
from PIL import Image, ImageOps
18-
18+
from typing import Optional
1919
from mkv_episode_matcher.__main__ import CONFIG_FILE
2020
from mkv_episode_matcher.config import get_config
21+
def check_if_processed(filename: str) -> bool:
    """
    Report whether a filename already carries an episode tag.

    Args:
        filename (str): Filename to inspect.

    Returns:
        bool: True when the name contains an ``S<digits>E<digits>`` marker
        (upper-case, matched anywhere in the name), False otherwise.
    """
    import re

    # A search (not a full match) is used so the tag may sit anywhere
    # inside the filename, e.g. "Show - S01E02.mkv".
    return re.search(r"S\d+E\d+", filename) is not None
2134

2235

2336
def convert_mkv_to_sup(mkv_file, output_dir):
@@ -51,21 +64,23 @@ def convert_mkv_to_sup(mkv_file, output_dir):
5164

5265

5366
@logger.catch
54-
def perform_ocr(sup_file_path):
67+
def perform_ocr(sup_file_path: str) -> Optional[str]:
5568
"""
5669
Perform OCR on a .sup file and save the extracted text to a .srt file.
57-
58-
Args:
59-
sup_file_path (str): Path to the .sup file.
70+
Returns the path to the created SRT file.
6071
"""
61-
6272
# Get the base name of the .sup file without the extension
6373
base_name = os.path.splitext(os.path.basename(sup_file_path))[0]
6474
output_dir = os.path.dirname(sup_file_path)
6575
logger.info(f"Performing OCR on {sup_file_path}")
76+
6677
# Construct the output .srt file path
6778
srt_file = os.path.join(output_dir, f"{base_name}.srt")
6879

80+
if os.path.exists(srt_file):
81+
logger.info(f"SRT file {srt_file} already exists, skipping OCR")
82+
return srt_file
83+
6984
# Load a PGS/SUP file.
7085
pgs = PGSReader(sup_file_path)
7186

@@ -151,24 +166,137 @@ def perform_ocr(sup_file_path):
151166
logger.info(f"Saved to: {srt_file}")
152167

153168

154-
def convert_mkv_to_srt(season_path, mkv_files):
155-
"""
156-
Converts MKV files to SRT format.
169+
# def convert_mkv_to_srt(season_path, mkv_files):
170+
# """
171+
# Converts MKV files to SRT format.
157172

158-
Args:
159-
season_path (str): The path to the season directory.
160-
mkv_files (list): List of MKV files to convert.
173+
# Args:
174+
# season_path (str): The path to the season directory.
175+
# mkv_files (list): List of MKV files to convert.
161176

162-
Returns:
163-
None
177+
# Returns:
178+
# None
179+
# """
180+
# logger.info(f"Converting {len(mkv_files)} files to SRT")
181+
# output_dir = os.path.join(season_path, "ocr")
182+
# os.makedirs(output_dir, exist_ok=True)
183+
# sup_files = []
184+
# for mkv_file in mkv_files:
185+
# sup_file = convert_mkv_to_sup(mkv_file, output_dir)
186+
# sup_files.append(sup_file)
187+
# with ThreadPoolExecutor() as executor:
188+
# for sup_file in sup_files:
189+
# executor.submit(perform_ocr, sup_file)
190+
191+
192+
193+
def extract_subtitles(mkv_file: str, output_dir: str) -> Optional[str]:
    """
    Extract the preferred subtitle stream of an MKV file into ``output_dir``.

    The stream is chosen by ``detect_subtitle_type``. Text (``subrip``)
    streams are written directly as ``<stem>.srt``; every other detected
    type is stream-copied to ``<stem>.sup`` so it can be OCR'd afterwards.

    Args:
        mkv_file (str): Path to the source MKV file.
        output_dir (str): Directory that receives the extracted subtitle file.

    Returns:
        Optional[str]: Path of the extracted (or already existing) subtitle
        file, or None when no supported stream was found or ffmpeg failed.
    """
    subtitle_type, stream_index = detect_subtitle_type(mkv_file)
    if not subtitle_type:
        logger.error(f"No supported subtitle streams found in {mkv_file}")
        return None

    base_name = Path(mkv_file).stem

    if subtitle_type == "subrip":
        # For SRT subtitles, extract directly to .srt
        output_file = os.path.join(output_dir, f"{base_name}.srt")
        codec_args = []
    else:
        # For DVD or PGS subtitles, extract to SUP format first.
        # NOTE(review): confirm that ffmpeg can stream-copy dvd_subtitle
        # into a .sup container; the SUP format is normally PGS-only.
        output_file = os.path.join(output_dir, f"{base_name}.sup")
        codec_args = ["-c", "copy"]

    # Single existence check: reuse the result of a previous run.
    if os.path.exists(output_file):
        logger.info(f"Subtitle file {output_file} already exists, skipping extraction")
        return output_file

    cmd = [
        "ffmpeg", "-i", mkv_file,
        "-map", f"0:{stream_index}",
        *codec_args,
        output_file,
    ]

    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing/non-executable ffmpeg binary.
        logger.error(f"Error extracting subtitles: {e}")
        # ffmpeg may leave a truncated file behind; remove it so the next
        # run does not mistake it for a finished extraction.
        try:
            os.remove(output_file)
        except FileNotFoundError:
            pass
        except OSError as cleanup_error:
            logger.warning(
                f"Could not remove partial subtitle file {output_file}: {cleanup_error}"
            )
        return None

    logger.info(f"Extracted subtitles from {mkv_file} to {output_file}")
    return output_file
235+
236+
def convert_mkv_to_srt(season_path: str, mkv_files: list[str]) -> None:
    """
    Produce SRT subtitles for the not-yet-renamed MKV files of one season.

    Files whose basename already looks processed (per ``check_if_processed``)
    are skipped. For the rest, subtitles are extracted into
    ``<season_path>/ocr``; extracted files that are not ``.srt`` are passed
    to ``perform_ocr``.

    Args:
        season_path (str): Path to the season directory.
        mkv_files (list[str]): Paths of the MKV files to convert.
    """
    logger.info(f"Converting {len(mkv_files)} files to SRT")

    # Keep only files that have not been renamed to the SxxExx scheme yet
    pending = []
    for candidate in mkv_files:
        if not check_if_processed(os.path.basename(candidate)):
            pending.append(candidate)
        else:
            logger.info(f"Skipping {candidate} - already processed")

    if len(pending) == 0:
        logger.info("No new files to process")
        return

    # All extracted and OCR'd subtitles live in an "ocr" sub-directory
    output_dir = os.path.join(season_path, "ocr")
    os.makedirs(output_dir, exist_ok=True)

    for candidate in pending:
        extracted = extract_subtitles(candidate, output_dir)
        if not extracted:
            # Nothing usable was extracted for this file; move on
            continue

        if not extracted.endswith('.srt'):
            # SUP output (DVD or PGS) still needs OCR to become text
            ocr_result = perform_ocr(extracted)
            if ocr_result:
                logger.info(f"Created SRT from OCR: {ocr_result}")
        else:
            # Text subtitles were written straight into the OCR directory
            logger.info(f"Extracted SRT subtitle to {extracted}")
272+
def detect_subtitle_type(mkv_file: str) -> tuple[Optional[str], Optional[int]]:
    """
    Detect the type and index of the preferred subtitle stream in an MKV file.

    Runs ``ffmpeg -i`` on the file and parses the stream listing that ffmpeg
    prints to stderr. Only ``subrip``, ``dvd_subtitle`` and
    ``hdmv_pgs_subtitle`` streams are considered; when several formats are
    present the first stream of the highest-priority format is returned
    (SRT > DVD > PGS).

    Args:
        mkv_file (str): Path to the MKV file to inspect.

    Returns:
        tuple[Optional[str], Optional[int]]: ``(codec_name, stream_index)``
        of the selected stream, or ``(None, None)`` when no supported
        subtitle stream exists or ffmpeg could not be started.
    """
    # Matches e.g. "Stream #0:2(eng): Subtitle: subrip (default)",
    # "Stream #0:3: Subtitle: dvd_subtitle" and
    # "Stream #0:4[0x1200](eng): Subtitle: hdmv_pgs_subtitle".
    # The stream id and language tag are both optional.
    stream_pattern = re.compile(
        r"Stream #0:(\d+)(?:\[[^\]]*\])?(?:\([^)]*\))?: Subtitle: (\w+)"
    )
    # Prioritize subtitle formats: SRT > DVD > PGS
    format_priority = ("subrip", "dvd_subtitle", "hdmv_pgs_subtitle")

    cmd = ["ffmpeg", "-i", mkv_file]

    try:
        # No check=True: with no output file ffmpeg exits non-zero even
        # though it prints the stream listing we need. errors="replace"
        # keeps undecodable metadata bytes from aborting the scan.
        result = subprocess.run(
            cmd, capture_output=True, text=True, errors="replace"
        )
    except OSError as e:
        # Raised when the ffmpeg binary is missing or cannot be executed.
        logger.error(f"Error detecting subtitle type: {e}")
        return None, None

    # Remember the first stream index seen for each supported format.
    first_index_by_format = {}
    for line in result.stderr.splitlines():
        match = stream_pattern.search(line)
        if match is None:
            continue
        codec = match.group(2)
        if codec in format_priority:
            first_index_by_format.setdefault(codec, int(match.group(1)))

    for format_type in format_priority:
        if format_type in first_index_by_format:
            return format_type, first_index_by_format[format_type]

    return None, None

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = mkv_episode_matcher
3-
version = 0.1.13
3+
version = 0.2.0
44
author = Jonathan Sakkos
55
author_email = [email protected]
66
description = The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)