99import re
1010from concurrent .futures import ThreadPoolExecutor
1111from datetime import datetime , timedelta
12-
12+ from pathlib import Path
1313import pytesseract
1414from imagemaker import make_image
1515from loguru import logger
1616from pgsreader import PGSReader
1717from PIL import Image , ImageOps
18-
18+ from typing import Optional
1919from mkv_episode_matcher .__main__ import CONFIG_FILE
2020from mkv_episode_matcher .config import get_config
def check_if_processed(filename: str) -> bool:
    """
    Tell whether a filename already carries an ``SxxExx`` episode tag.

    The check is a case-sensitive search for ``S<digits>E<digits>`` anywhere
    in the name.

    Args:
        filename (str): Filename to inspect.

    Returns:
        bool: True when the tag is present, False otherwise.
    """
    return re.search(r"S\d+E\d+", filename) is not None
2134
2235
2336def convert_mkv_to_sup (mkv_file , output_dir ):
@@ -51,21 +64,23 @@ def convert_mkv_to_sup(mkv_file, output_dir):
5164
5265
5366@logger .catch
54- def perform_ocr (sup_file_path ) :
67+ def perform_ocr (sup_file_path : str ) -> Optional [ str ] :
5568 """
5669 Perform OCR on a .sup file and save the extracted text to a .srt file.
57-
58- Args:
59- sup_file_path (str): Path to the .sup file.
70+ Returns the path to the created SRT file.
6071 """
61-
6272 # Get the base name of the .sup file without the extension
6373 base_name = os .path .splitext (os .path .basename (sup_file_path ))[0 ]
6474 output_dir = os .path .dirname (sup_file_path )
6575 logger .info (f"Performing OCR on { sup_file_path } " )
76+
6677 # Construct the output .srt file path
6778 srt_file = os .path .join (output_dir , f"{ base_name } .srt" )
6879
80+ if os .path .exists (srt_file ):
81+ logger .info (f"SRT file { srt_file } already exists, skipping OCR" )
82+ return srt_file
83+
6984 # Load a PGS/SUP file.
7085 pgs = PGSReader (sup_file_path )
7186
@@ -151,24 +166,137 @@ def perform_ocr(sup_file_path):
151166 logger .info (f"Saved to: { srt_file } " )
152167
153168
154- def convert_mkv_to_srt (season_path , mkv_files ):
155- """
156- Converts MKV files to SRT format.
169+ # def convert_mkv_to_srt(season_path, mkv_files):
170+ # """
171+ # Converts MKV files to SRT format.
157172
158- Args:
159- season_path (str): The path to the season directory.
160- mkv_files (list): List of MKV files to convert.
173+ # Args:
174+ # season_path (str): The path to the season directory.
175+ # mkv_files (list): List of MKV files to convert.
161176
162- Returns:
163- None
177+ # Returns:
178+ # None
179+ # """
180+ # logger.info(f"Converting {len(mkv_files)} files to SRT")
181+ # output_dir = os.path.join(season_path, "ocr")
182+ # os.makedirs(output_dir, exist_ok=True)
183+ # sup_files = []
184+ # for mkv_file in mkv_files:
185+ # sup_file = convert_mkv_to_sup(mkv_file, output_dir)
186+ # sup_files.append(sup_file)
187+ # with ThreadPoolExecutor() as executor:
188+ # for sup_file in sup_files:
189+ # executor.submit(perform_ocr, sup_file)
190+
191+
192+
def extract_subtitles(mkv_file: str, output_dir: str) -> Optional[str]:
    """
    Extract subtitles from MKV file based on detected subtitle type.

    The stream chosen by ``detect_subtitle_type`` is written into
    ``output_dir`` as ``<mkv stem>.srt`` for ``subrip`` streams, or stream
    copied to ``<mkv stem>.sup`` for every other detected type.

    Args:
        mkv_file (str): Path to the MKV file to read.
        output_dir (str): Directory that receives the extracted file.

    Returns:
        Optional[str]: Path of the subtitle file (freshly extracted or
        already present), or None when no supported stream was detected or
        ffmpeg reported a failure.
    """
    subtitle_type, stream_index = detect_subtitle_type(mkv_file)
    if not subtitle_type:
        logger.error(f"No supported subtitle streams found in {mkv_file}")
        return None

    base_name = Path(mkv_file).stem

    if subtitle_type == 'subrip':
        # For SRT subtitles, extract directly to .srt
        output_file = os.path.join(output_dir, f"{base_name}.srt")
        codec_args = []
    else:
        # For DVD or PGS subtitles, extract to SUP format first
        # NOTE(review): confirm ffmpeg can stream-copy dvd_subtitle into a
        # .sup container; only the PGS case is obviously supported.
        output_file = os.path.join(output_dir, f"{base_name}.sup")
        codec_args = ["-c", "copy"]

    if os.path.exists(output_file):
        logger.info(f"Subtitle file {output_file} already exists, skipping extraction")
        return output_file

    cmd = [
        "ffmpeg", "-i", mkv_file,
        "-map", f"0:{stream_index}",
        *codec_args,
        output_file,
    ]

    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"Error extracting subtitles: {e}")
        # A failed run can leave a partial or empty file behind; remove it so
        # later runs do not mistake it for a finished extraction and skip.
        try:
            os.remove(output_file)
        except OSError:
            pass  # nothing was written, or it cannot be removed
        return None

    logger.info(f"Extracted subtitles from {mkv_file} to {output_file}")
    return output_file
235+
def convert_mkv_to_srt(season_path: str, mkv_files: list[str]) -> None:
    """
    Produce SRT subtitles for every not-yet-processed MKV file.

    Files whose basename passes ``check_if_processed`` are skipped. For the
    rest, subtitles are extracted into ``<season_path>/ocr``; anything that is
    not already an ``.srt`` file is handed to ``perform_ocr``.

    Args:
        season_path (str): The path to the season directory.
        mkv_files (list[str]): MKV files to convert.

    Returns:
        None
    """
    logger.info(f"Converting {len(mkv_files)} files to SRT")

    # Split off the files that still need work, logging each one we skip
    pending = []
    for candidate in mkv_files:
        if not check_if_processed(os.path.basename(candidate)):
            pending.append(candidate)
        else:
            logger.info(f"Skipping {candidate} - already processed")

    if not pending:
        logger.info("No new files to process")
        return

    # All extracted and OCR'd subtitles live in the season's "ocr" folder
    ocr_dir = os.path.join(season_path, "ocr")
    os.makedirs(ocr_dir, exist_ok=True)

    for candidate in pending:
        extracted = extract_subtitles(candidate, ocr_dir)
        if not extracted:
            continue

        if not extracted.endswith('.srt'):
            # SUP output (DVD or PGS) still has to go through OCR
            ocr_result = perform_ocr(extracted)
            if ocr_result:
                logger.info(f"Created SRT from OCR: {ocr_result}")
        else:
            # Text subtitles need no further work; they stay in the OCR folder
            logger.info(f"Extracted SRT subtitle to {extracted}")
271+
def _pick_subtitle_stream(ffmpeg_stderr: str) -> tuple[Optional[str], Optional[int]]:
    """Return the preferred (codec, stream index) found in ffmpeg's stream listing."""
    # Prioritize subtitle formats: SRT > DVD > PGS
    priority = ('subrip', 'dvd_subtitle', 'hdmv_pgs_subtitle')

    first_index = {}
    for line in ffmpeg_stderr.splitlines():
        # Only real stream lines count. Metadata lines that merely mention
        # "Subtitle" (e.g. a track title) are ignored, and the index is taken
        # from the digits after "#0:" so a missing "(lang)" tag cannot break it.
        match = re.search(r"Stream #0:(\d+).*?Subtitle:(.*)", line)
        if match is None:
            continue
        index = int(match.group(1))
        descriptor = match.group(2)
        for codec in priority:
            if codec in descriptor:
                # Keep the first stream seen for each codec
                first_index.setdefault(codec, index)
                break

    for codec in priority:
        if codec in first_index:
            return codec, first_index[codec]
    return None, None


def detect_subtitle_type(mkv_file: str) -> tuple[Optional[str], Optional[int]]:
    """
    Detect the type and index of subtitle streams in an MKV file.

    Runs ``ffmpeg -i <mkv_file>`` and scans its stderr for subtitle stream
    lines of input 0. When several supported streams exist, the first stream
    of the highest-priority codec wins (subrip, then dvd_subtitle, then
    hdmv_pgs_subtitle).

    Args:
        mkv_file (str): Path to the MKV file to inspect.

    Returns:
        tuple[Optional[str], Optional[int]]: ``(codec name, stream index)``,
        or ``(None, None)`` when no supported subtitle stream is listed or
        ffmpeg could not be run.
    """
    cmd = ["ffmpeg", "-i", mkv_file]

    try:
        # No output file is given; the call exists only to read the stream
        # listing on stderr, so the exit status is deliberately not checked.
        # errors="replace" keeps oddly encoded track metadata from raising.
        result = subprocess.run(
            cmd, capture_output=True, text=True, errors="replace"
        )
    except (OSError, subprocess.SubprocessError) as e:
        # e.g. the ffmpeg binary is missing or not executable
        logger.error(f"Error detecting subtitle type: {e}")
        return None, None

    return _pick_subtitle_stream(result.stderr)
0 commit comments