|
| 1 | +"""Scraper for the Texas Attorney General |
| 2 | +CourtID: texag |
| 3 | +Court Short Name: Texas Attorney General |
| 4 | +""" |
| 5 | + |
| 6 | +from juriscraper.OpinionSite import OpinionSite |
| 7 | +from juriscraper.AbstractSite import InsanityException |
| 8 | +from juriscraper.lib.string_utils import convert_date_string |
| 9 | + |
| 10 | + |
class Site(OpinionSite):
    """Scraper for the Texas Attorney General opinion index.

    The landing page links to per-administration resource pages.  Those
    pages have used several slightly different HTML layouts over the
    years, so the xpaths used to extract data are detected at parse time
    (see ``set_dynamic_resource_paths``) rather than hard coded.
    """

    def __init__(self, *args, **kwargs):
        super(Site, self).__init__(*args, **kwargs)
        self.court_id = self.__module__
        # Row of the landing page table whose link is followed by _download.
        self.target_index = 2
        # Placeholders; populated by set_dynamic_resource_paths() once the
        # resource page has been downloaded and its layout can be detected.
        self.url_path = False
        self.opinion_path = False
        self.section_path = False
        self.year_sub_path = False
        self.opinion_sub_path = False
        self.domain = 'https://texasattorneygeneral.gov'
        self.url = '%s/opinion/index-to-opinions' % self.domain
        self.back_scrape_iterable = range(2, 16)  # Hard coded for initial run
        # NOTE(review): select_sub_path is not referenced by any method of
        # this class; kept because it is a public attribute.
        self.select_sub_path = './/select/option[position()>1]'
        self.flat_list_path = '//a[contains(./text(), "See a flat listing of all opinions")]'
        # The %d is the table row index (see target_index above).
        self.target_sub_page_path_base = '//table/tbody/tr[%d]/td[2]//a/@href'
        self.target_sub_page_path = self.target_sub_page_path_base % self.target_index

    def _download(self, request_dict=None):
        """Follow top-most opinions urls on landing page to resource page

        :param request_dict: optional request options, forwarded to the
            parent ``_download`` and to every follow-up page fetch.
        :return: the html tree the ``_get_*`` methods should parse: the
            landing page itself in LOCAL mode, otherwise the resource page
            or, when the resource page links to one, its flat listing page.
        :raises InsanityException: if the landing page has no link at
            ``self.target_sub_page_path``.
        """
        # Avoid a mutable default argument shared across calls.
        if request_dict is None:
            request_dict = {}

        # Process landing page
        landing_html = super(Site, self)._download(request_dict)
        if self.method == 'LOCAL':
            # Example file should be direct resource page
            return landing_html

        # Load resource page
        sub_page_urls = landing_html.xpath(self.target_sub_page_path)
        if not sub_page_urls:
            raise InsanityException(
                'No link found on landing page for path: %s'
                % self.target_sub_page_path
            )
        resource_page_html = self._get_html_tree_by_url(sub_page_urls[0], request_dict)

        flat_list_link = resource_page_html.xpath(self.flat_list_path)
        if not flat_list_link:
            return resource_page_html

        # Load flat list page for older pages with bad js.  Use the same
        # request options as for the resource page fetch above.
        flat_list_url = flat_list_link[0].xpath('./@href')[0]
        return self._get_html_tree_by_url(flat_list_url, request_dict)

    def _get_case_dates(self):
        """All we have are years, so estimate middle most day of year

        Also detects the page layout (set_dynamic_resource_paths), which
        fills in the path attributes that the other ``_get_*`` methods read.
        Returns one date per opinion, grouped by year section.
        """
        self.set_dynamic_resource_paths()
        dates = []
        for section in self.html.xpath(self.section_path):
            year = section.xpath(self.year_sub_path)[0].text_content().strip()
            date = convert_date_string('July 2, %s' % year)
            # Every opinion listed inside this section shares the year.
            count = len(section.xpath(self.opinion_sub_path))
            dates.extend([date] * count)
        return dates

    def _get_case_names(self):
        """No case names available"""
        return ["Untitled Texas Attorney General Opinion"] * len(self.case_dates)

    def _get_download_urls(self):
        """Return the opinion urls, prefixing the domain where it is missing."""
        # Some listings provide direct links, others are relative
        return [self.domain + v if self.domain not in v else v
                for v in self.html.xpath(self.url_path)]

    def _get_docket_numbers(self):
        """Return the stripped text of each opinion element as its docket number."""
        return [option.text_content().strip()
                for option in self.html.xpath(self.opinion_path)]

    def _get_precedential_statuses(self):
        """Every opinion is reported as 'Published'."""
        return ['Published'] * len(self.case_dates)

    def _get_judges(self):
        """Return the name taken from the last breadcrumb item, once per opinion.

        The breadcrumb text is cut at the word 'Opinions' and the leading
        part is used as the judge name.
        """
        crumbs = self.html.xpath('//div[contains(@class, "breadcrumb")]//li')
        last_crumb = crumbs[-1].text_content().strip()
        # Strip again: splitting "<name> Opinions" leaves a trailing space.
        judge = last_crumb.split('Opinions')[0].strip()
        return [judge] * len(self.case_dates)

    def _get_date_filed_is_approximate(self):
        """All dates are estimates (see _get_case_dates)."""
        return [True] * len(self.case_dates)

    def _download_backwards(self, index):
        """Point the scraper at landing page table row `index` and download it."""
        self.target_index = index
        self.target_sub_page_path = self.target_sub_page_path_base % index
        self.html = self._download()

    # Across the whole history of the opinions, the court
    # has used various slightly different page html formats
    # The functions below are used to detect which format
    # the page is using, and set the path variables accordingly.

    def set_dynamic_resource_paths(self):
        """Detect the layout of self.html and store the matching xpaths.

        Order matters: return_url_path() reads self.opinion_path and
        return_year_sub_path() reads self.section_path.
        """
        self.opinion_path = self.return_opinion_path()
        # Same path made relative, for use against a section element.
        self.opinion_sub_path = '.%s' % self.opinion_path
        self.url_path = self.return_url_path()
        self.section_path = self.return_section_path()
        self.year_sub_path = self.return_year_sub_path()

    @staticmethod
    def _first_matching_path(node, paths, error_message):
        """Return the first xpath in `paths` that selects anything under `node`.

        :raises InsanityException: with `error_message` if none match.
        """
        for path in paths:
            if node.xpath(path):
                return path
        raise InsanityException(error_message)

    def return_section_path(self):
        """Return the xpath selecting the per-year sections of the page.

        :raises InsanityException: if no known layout matches.
        """
        paths = [
            '//div[contains(@class, "panel-default")]',
            '//td[contains(p/@class, "center")]',
            '//td[contains(p/@align, "center")]',
            '//td[contains(h2/@class, "center")]',
            '//div[contains(h3/@class, "center")]',
            '//div[contains(h3/@align, "center")]',
        ]
        return self._first_matching_path(
            self.html, paths, 'No recognized path to opinion sections')

    def return_year_sub_path(self):
        """Return the xpath, relative to a section, selecting its year label.

        Detection is done against the first section only.

        :raises InsanityException: if no known layout matches.
        """
        parent = self.html.xpath(self.section_path)[0]
        paths = [
            './div[contains(@class, "panel-heading")]/label',
            './p[contains(@class, "center")]/strong',
            './p[contains(@align, "center")]/font/b',
            './h2[contains(@class, "center")]',
            './h3[contains(@class, "center")]',
            './h3[contains(@align, "center")]',
        ]
        return self._first_matching_path(
            parent, paths, 'No recognized path to year string')

    def return_opinion_path(self):
        """Return the xpath selecting the individual opinion elements.

        :raises InsanityException: if no known layout matches.
        """
        paths = [
            '//select/option[contains(@value, ".pdf")]',
            '//ul/li/a[contains(@href, ".pdf")]',
        ]
        return self._first_matching_path(
            self.html, paths, 'No recognized path to opinion listings')

    def return_url_path(self):
        """Return the xpath selecting each opinion's url attribute.

        <option> elements carry the url in @value, <a> elements in @href.

        :raises InsanityException: if self.opinion_path is neither form.
        """
        if '/option' in self.opinion_path:
            return '%s/@value' % self.opinion_path
        elif '/li/a' in self.opinion_path:
            return '%s/@href' % self.opinion_path
        raise InsanityException('No recognized path to url')
0 commit comments