import glob
import re
import tempfile
from dataclasses import dataclass, field
from datetime import datetime, timedelta

import camelot
1112
12-
1313JSON_FOLDER_NAME = 'Academic_Cal-j'
1414
15+
@dataclass
class DataEntry:
    """One academic-calendar entry: an event and the date range it covers.

    The date fields default to the moment the instance is created. They use
    ``default_factory`` because a plain ``datetime.today()`` default would be
    evaluated only once, when the class is defined, and every later instance
    would silently share that stale import-time timestamp.
    """

    # Start of the date range of the event.
    start_date: datetime = field(default_factory=datetime.today)
    # End of the date range of the event.
    end_date: datetime = field(default_factory=datetime.today)
    # Free-text description of the event.
    event: str = ""
2021
21- #get the current working directory
22+
# get the current working directory
def cwd():
    """Return the process's current working directory as reported by ``os.getcwd()``."""
    working_dir = os.getcwd()
    return working_dir
2426
27+
def get_latest_calendar_name(today=None):
    """Build the file name of the academic-calendar PDF for the current session.

    Months before July are counted towards the session that started in the
    previous calendar year, so both August 2024 and March 2025 map to
    ``AcademicCalendar2024-25.pdf``.

    Args:
        today: Optional ``date``/``datetime`` to compute the name for.
            Defaults to ``datetime.today()``.

    Returns:
        The PDF file name, e.g. ``'AcademicCalendar2024-25.pdf'``.
    """
    # Take a single snapshot of the clock: reading the year and the month from
    # two separate today() calls can disagree if the year rolls over between them.
    if today is None:
        today = datetime.today()

    start_year = today.year if today.month >= 7 else today.year - 1

    # Two-digit suffix of the following year. The modulo is applied after the
    # increment so xx99 wraps to "00" instead of "100", and the suffix keeps a
    # leading zero (presumably the naming scheme is always two digits).
    end_suffix = (start_year + 1) % 100
    return f'AcademicCalendar{start_year}-{end_suffix:02d}.pdf'
3538
39+
def is_file_present(file):
    """Tell whether ``file`` exists under the current working directory.

    ``file`` is appended to the working directory with a ``/`` separator and
    the resulting path is probed both as-is and with a trailing slash.

    Returns:
        True if either form of the path exists, False otherwise.
    """
    candidate = cwd() + '/' + file
    return any(os.path.exists(path) for path in (candidate, candidate + '/'))
4246
47+
4348def delete_file (file ):
44- if (is_file_present (file )):
49+ if (is_file_present (file )):
4550 try :
46- print ("DELETING file " ,file )
47- if (os .path .isdir (file )):
51+ print ("DELETING file " , file )
52+ if (os .path .isdir (file )):
4853 shutil .rmtree (cwd () + '/' + file )
49- elif (os .path .isfile (file )):
54+ elif (os .path .isfile (file )):
5055 os .remove (file )
5156 else :
5257 raise Exception ("filename not valid" )
@@ -57,25 +62,26 @@ def delete_file(file):
5762 else :
5863 print (file , "File not present.." )
5964
60- #fetch the latest academic calendar from the iitkgp website
65+
# fetch the latest academic calendar from the iitkgp website
def get_latest_calendar():
    """Download the current academic-calendar PDF into the working directory.

    The response is fetched and checked *before* any local file is touched, so
    a failed download neither removes a previously downloaded calendar nor
    leaves an empty file or an HTML error page behind under the PDF name.

    Returns:
        True if the PDF was downloaded and is present on disk afterwards,
        False if the request failed (the error is printed) or the file is
        missing after writing.
    """
    filename = get_latest_calendar_name()
    url = 'https://www.iitkgp.ac.in/assets/pdf/' + filename

    try:
        # A timeout keeps a stalled connection from hanging the caller forever.
        response = requests.get(url, timeout=30)
        # Treat 4xx/5xx answers as failures instead of saving the error page.
        response.raise_for_status()
    except requests.RequestException as E:
        print(E)
        return False

    ## delete any old academic calendar pdf if it exists
    if is_file_present(filename):
        delete_file(filename)

    with open(filename, "wb") as file:
        file.write(response.content)

    return is_file_present(filename)
77-
78- def upzip_and_delete_zip (zip_file_name ,result_folder_name ):
82+
83+
84+ def upzip_and_delete_zip (zip_file_name , result_folder_name ):
7985 with ZipFile (zip_file_name ) as zip :
8086 try :
8187 zip .extractall (result_folder_name )
@@ -87,49 +93,67 @@ def upzip_and_delete_zip(zip_file_name,result_folder_name):
8793 delete_file (zip_file_name )
8894 return True
8995
96+
def export_json():
    """Extract all tables from the calendar PDF and unpack them as JSON files.

    Reads the PDF named by ``get_latest_calendar_name()`` with camelot, removes
    any existing ``JSON_FOLDER_NAME`` entry, exports the tables as a compressed
    JSON archive and unpacks that archive into ``JSON_FOLDER_NAME``.

    Returns:
        False if the export raised (the error is printed), True otherwise.
    """
    pdf_name = get_latest_calendar_name()

    ## [NOTE]
    ## Ignore the "read_pdf not found" warning.
    ## The camelot devs have mismatched the backend names, so "ghostscript"
    ## points to pdfium and vice versa: this really uses pdfium, but the
    ## backend name has to be "ghostscript".
    ## If this gets fixed upstream, the name needs to be changed back.

    ## This creates temporary files using `tempfile.mkdtemp()` in /tmp and does
    ## not clean them up until the program exits.
    tables = camelot.read_pdf(pdf_name, pages="all", backend="ghostscript")

    print("Checking for pre-existing folder")
    delete_file(JSON_FOLDER_NAME)

    json_target = JSON_FOLDER_NAME + '.json'
    zip_archive = JSON_FOLDER_NAME + '.zip'

    try:
        tables.export(json_target, f='json', compress=True)
    except Exception as err:
        print(err)
        return False

    # NOTE(review): the result of the unzip step is ignored, so True is
    # returned even if unpacking reports a failure -- confirm this is intended.
    upzip_and_delete_zip(zip_archive, JSON_FOLDER_NAME)
    return True
110119
120+
def get_json_files():
    """List the ``*.json`` files inside the exported JSON folder.

    Returns:
        The paths matched by ``glob`` (hidden files included) inside
        ``JSON_FOLDER_NAME`` under the working directory, or an empty list
        when that folder does not exist.
    """
    # Guard clause: nothing to list when the folder was never created.
    if not is_file_present(JSON_FOLDER_NAME):
        return []

    pattern = cwd() + '/' + JSON_FOLDER_NAME + '/*.json'
    return glob.glob(pattern, include_hidden=True)
118128
129+
def merge_json():
    """Merge every exported table JSON file into one list and save it.

    The files returned by ``get_json_files()`` are read in natural
    (number-aware) file-name order, because ``glob`` yields paths in arbitrary
    order and a plain string sort would put e.g. "page-10" before "page-2".
    The concatenated rows are also written to ``final.json`` in the working
    directory.

    Returns:
        The merged list of rows.
    """
    def natural_key(path):
        # re.split with a capturing group alternates text / digit chunks:
        # even positions are text, odd positions are digit runs, so keys of
        # different files always compare str-with-str and int-with-int.
        chunks = re.split(r'(\d+)', os.path.basename(path))
        return [int(chunk) if index % 2 else chunk
                for index, chunk in enumerate(chunks)]

    merged_data = []
    for file in sorted(get_json_files(), key=natural_key):
        with open(file, encoding="utf-8") as f:
            merged_data.extend(json.load(f))

    with open('final.json', "w", encoding="utf-8") as f:
        json.dump(merged_data, f, indent=4)

    return merged_data
130141
131- def get_academic_calendar () -> list [DataEntry ]:
132142
def clean_temp_files(base=None):
    """Remove leftover ``tempfile.mkdtemp()``-style directories.

    Only direct children of ``base`` whose name starts with ``tmp`` and is
    exactly 11 characters long (names shaped like ``tmpXXXXXXXX``) are
    considered, and only real directories are removed.

    WARNING: the match is purely name-based, so directories created by other
    programs with the same naming pattern are removed as well.

    Args:
        base: Directory to scan. Defaults to ``tempfile.gettempdir()``.
    """
    if base is None:
        base = tempfile.gettempdir()

    for filename in os.listdir(base):
        if not filename.startswith('tmp') or len(filename) != 11:
            continue
        fullpath = os.path.join(base, filename)
        # Regular temp files and symlinks can carry the same kind of name but
        # are not directories we may remove; skipping them avoids a spurious
        # rmtree error for every such entry.
        if os.path.islink(fullpath) or not os.path.isdir(fullpath):
            continue
        try:
            shutil.rmtree(fullpath)
        except OSError as E:
            # Best effort: report the failure and keep cleaning the rest.
            print(E)
154+
155+
156+ def get_academic_calendar () -> list [DataEntry ]:
133157 get_latest_calendar ()
134158 export_json ()
135159
@@ -168,28 +192,36 @@ def get_academic_calendar() -> list[DataEntry]:
168192 date_regex = re .compile (r'\d{2}.\d{2}.\d{4}' )
169193 maxLen = 1
170194 for date in all_dates :
171- if (len (date ) > 4 and date ['4' ] != '' ):
195+ if (len (date ) > 4 and date ['4' ] != '' ):
172196 entry = DataEntry ()
173- if (len (date ['1' ]) > 3 ):
174- entry .event += date ['1' ].replace ('\n ' ,'' )
175- entry .event += date ['2' ].replace ('\n ' ,'' )
197+ if (len (date ['1' ]) > 3 ):
198+ entry .event += date ['1' ].replace ('\n ' , '' )
199+ entry .event += date ['2' ].replace ('\n ' , '' )
176200
177- d = date ['3' ].replace ('\n ' ,' ' ).replace ('(AN)' ,'' ) + date ['4' ].replace ('\n ' ,' ' ).replace ('(AN)' ,'' )
201+ d = date ['3' ].replace ('\n ' , ' ' ).replace ('(AN)' , '' ) + date ['4' ].replace ('\n ' , ' ' ).replace ('(AN)' , '' )
178202 d = date_regex .findall (d )
179- if (maxLen < len (d )):
203+ if (maxLen < len (d )):
180204 maxLen = len (d )
181- if (len (d ) == 1 ):
182- entry .start_date = datetime .strptime (d [0 ],"%d.%m.%Y" )
183- entry .end_date = ( entry .start_date + timedelta (1 ) )
184- elif (len (d ) == 2 ):
185- entry .start_date = datetime .strptime (d [0 ],"%d.%m.%Y" )
186- entry .end_date = datetime .strptime (d [1 ],"%d.%m.%Y" )
205+ if (len (d ) == 1 ):
206+ entry .start_date = datetime .strptime (d [0 ], "%d.%m.%Y" )
207+ entry .end_date = (entry .start_date + timedelta (1 ))
208+ elif (len (d ) == 2 ):
209+ entry .start_date = datetime .strptime (d [0 ], "%d.%m.%Y" )
210+ entry .end_date = datetime .strptime (d [1 ], "%d.%m.%Y" )
187211 main_dates .append (entry )
188212 annual_convocation = str (date ['1' ]).strip ().lower ().split (" " )
189213 ## KGP hai .. cannot trust, they can even mess up the spellings of annual convocation
190214 ## this can just reduce the amount of places this will fail
191- if (len (annual_convocation ) == 2 and ("annual" in annual_convocation or "convocation" in annual_convocation )):
215+ if (len (annual_convocation ) == 2 and ("annual" in annual_convocation or "convocation" in annual_convocation )):
192216 break
193217
194- return main_dates
218+ ## This has to be done to remove temporary files created by camelot. These files are not automatically
219+ ## deleted until program exits
220+ ## This is not ideal, and might be dangerous (and invisible) if other programs are creating similar directories often
221+ ## Nothing else can be done without modifying `camelot`.
222+ try :
223+ clean_temp_files ()
224+ except Exception as E :
225+ print (E )
195226
227+ return main_dates
0 commit comments