-
Notifications
You must be signed in to change notification settings - Fork 1
/
MainBoilerPlate.py
41 lines (31 loc) · 1.71 KB
/
MainBoilerPlate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pdfcutter
import helper
import json #For writing PDF Link JSON File
import os #To check if PDF Link JSON File exists
# get_session is the main entry point: it parses a session into a dict of Senats/Bundesrats texts.
class MainExtractorMethod:
    """Base class that turns a session into its Senats/Bundesrats texts.

    Subclasses provide the PDF links via ``_get_pdf_urls``; the actual text
    extraction is delegated to the extractor class given to the constructor.
    """

    def __init__(self, textExtractorHolderSubclass):
        """Store the extractor class for later use.

        The class itself (not an instance) is taken because it is only
        instantiated in ``get_beschluesse_text``, where the filename and
        session it needs are available. Per the original note, this also
        avoids cyclic dependencies.
        """
        self.textExtractorHolderSubclass = textExtractorHolderSubclass

    def get_session(self, session):
        """Resolve the PDF for ``session`` and extract its texts.

        In:  session dict.
        Out: dict of "TOP: {'senat': senatsText, 'bundesrat': BRText}"
             entries, or None when the filename lookup raises KeyError.
        """
        url_by_session = dict(self._get_pdf_urls())

        # Create the PDF link JSON file only if it is not there yet. The path
        # is relative, so it resolves against the current working directory
        # (the original note says this ends up being the folder of the
        # overriding county implementation).
        url_json_path = "session_urls.json"
        if not os.path.exists(url_json_path):
            with open(url_json_path, 'w') as url_json:
                json.dump(url_by_session, url_json)

        try:
            pdf_filename = helper.get_session_pdf_filename(session, url_by_session)
        except KeyError:
            # Lookup failed for this session: give up quietly.
            return None
        else:
            # Kept outside the try so a KeyError raised during extraction
            # is not swallowed.
            return self.get_beschluesse_text(session, pdf_filename)

    def _get_pdf_urls(self):
        """Provide the {sessionNumberOfBR: PDFWebLink} entries.

        Very different for each county, so every subclass has to implement
        it anew; the base version always raises NotImplementedError.
        """
        raise NotImplementedError()

    def get_beschluesse_text(self, session, filename):
        """Run the configured extractor on ``filename`` for ``session``.

        Out: dict of "TOP: {'senat': senatsText, 'bundesrat': BRText}"
             entries, built from whatever the extractor's
             ``getSenatsAndBRTextsForAllSessionTOPs`` yields.
        """
        return dict(
            self.textExtractorHolderSubclass(
                filename, session
            ).getSenatsAndBRTextsForAllSessionTOPs()
        )