Skip to content

Commit 3ead220

Browse files
committed
Adding new texag scraper for Texas Attorney General. There is a lot of HTML variation across the history of opinions, so I've added multiple example files for coverage. A backscraper is included and should be run after deployment, taking around 2 minutes and yielding 18,377 cases. Relates to #168
1 parent c253180 commit 3ead220

File tree

6 files changed

+19880
-0
lines changed

6 files changed

+19880
-0
lines changed

juriscraper/opinions/united_states/state/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@
135135
'tennctapp',
136136
'tenncrimapp',
137137
'tex',
138+
'texag',
138139
'texapp_1',
139140
'texapp_2',
140141
'texapp_3',
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
"""Scraper for the Texas Attorney General
2+
CourtID: texag
3+
Court Short Name: Texas Attorney General
4+
"""
5+
6+
from juriscraper.OpinionSite import OpinionSite
7+
from juriscraper.AbstractSite import InsanityException
8+
from juriscraper.lib.string_utils import convert_date_string
9+
10+
11+
class Site(OpinionSite):
    """Scraper for opinions published by the Texas Attorney General.

    The landing page links to one resource page per listing; the resource
    pages use several different HTML layouts, so the XPaths used to extract
    data are detected per page by ``set_dynamic_resource_paths``.
    """

    def __init__(self, *args, **kwargs):
        super(Site, self).__init__(*args, **kwargs)
        self.court_id = self.__module__
        # Row of the landing-page table whose link is followed by _download.
        self.target_index = 2
        # The five paths below are placeholders until
        # set_dynamic_resource_paths() resolves them for the loaded page.
        self.url_path = False
        self.opinion_path = False
        self.section_path = False
        self.year_sub_path = False
        self.opinion_sub_path = False
        self.domain = 'https://texasattorneygeneral.gov'
        self.url = '%s/opinion/index-to-opinions' % self.domain
        self.back_scrape_iterable = range(2, 16)  # Hard coded for initial run
        self.select_sub_path = './/select/option[position()>1]'
        self.flat_list_path = '//a[contains(./text(), "See a flat listing of all opinions")]'
        self.target_sub_page_path_base = '//table/tbody/tr[%d]/td[2]//a/@href'
        self.target_sub_page_path = self.target_sub_page_path_base % self.target_index

    def _download(self, request_dict=None):
        """Follow top-most opinions urls on landing page to resource page

        :param request_dict: optional dict forwarded to the parent
            ``_download`` and to ``_get_html_tree_by_url``; a fresh empty
            dict is used when omitted.
        :return: the html tree of the page to be parsed.
        """
        # Avoid a shared mutable default argument.
        if request_dict is None:
            request_dict = {}
        # Process landing page
        landing_html = super(Site, self)._download(request_dict)
        if self.method == 'LOCAL':
            # Example file should be direct resource page
            return landing_html
        # Load resource page
        url = landing_html.xpath(self.target_sub_page_path)[0]
        resource_page_html = self._get_html_tree_by_url(url, request_dict)
        flat_list_link = resource_page_html.xpath(self.flat_list_path)
        if not flat_list_link:
            return resource_page_html
        # Load flat list page for older pages with bad js. The same
        # request_dict is forwarded so this fetch is made the same way as
        # the resource page fetch above.
        url = flat_list_link[0].xpath('./@href')[0]
        return self._get_html_tree_by_url(url, request_dict)

    def _get_case_dates(self):
        """All we have are years, so estimate middle most day of year

        Also resolves the page-specific XPaths (via
        ``set_dynamic_resource_paths``) that the other getters read.
        Returns one date per opinion, grouped by section.
        """
        self.set_dynamic_resource_paths()
        dates = []
        for section in self.html.xpath(self.section_path):
            year = section.xpath(self.year_sub_path)[0].text_content().strip()
            date = convert_date_string('July 2, %s' % year)
            # Every opinion listed in the section shares the section's year.
            count = len(section.xpath(self.opinion_sub_path))
            dates.extend([date] * count)
        return dates

    def _get_case_names(self):
        """No case names available"""
        # NOTE(review): relies on self.case_dates already being populated
        # by the time this getter runs -- confirm against the base class.
        return ["Untitled Texas Attorney General Opinion"] * len(self.case_dates)

    def _get_download_urls(self):
        """Return one url per opinion, prefixing the domain where missing."""
        # Some listings provide direct links, others are relative
        # NOTE(review): self.url_path is only set by
        # set_dynamic_resource_paths(), which _get_case_dates() calls.
        return [self.domain + v if self.domain not in v else v
                for v in self.html.xpath(self.url_path)]

    def _get_docket_numbers(self):
        """Return the stripped text of each opinion element."""
        return [option.text_content().strip()
                for option in self.html.xpath(self.opinion_path)]

    def _get_precedential_statuses(self):
        """Every opinion is reported as 'Published'."""
        return ['Published'] * len(self.case_dates)

    def _get_judges(self):
        """Use the text before 'Opinions' in the last breadcrumb item.

        The same value is returned for every opinion on the page.
        """
        crumbs = self.html.xpath('//div[contains(@class, "breadcrumb")]//li')
        breadcrumb = crumbs[-1].text_content().strip()
        # Strip so the whitespace that preceded 'Opinions' is not kept.
        judge = breadcrumb.split('Opinions')[0].strip()
        return [judge] * len(self.case_dates)

    def _get_date_filed_is_approximate(self):
        """Dates are estimated from the year only, so all are approximate."""
        return [True] * len(self.case_dates)

    def _download_backwards(self, index):
        """Load the resource page linked from landing-page table row *index*."""
        self.target_index = index
        self.target_sub_page_path = self.target_sub_page_path_base % index
        self.html = self._download()

    # Across the whole history of the opinions, the court
    # has used various slightly different page html formats.
    # The functions below are used to detect which format
    # the page is using, and set the path variables accordingly.

    def set_dynamic_resource_paths(self):
        """Detect the layout of self.html and set the XPath attributes."""
        # Order matters: opinion_sub_path and url_path derive from
        # opinion_path, and year_sub_path is probed inside section_path.
        self.opinion_path = self.return_opinion_path()
        self.opinion_sub_path = '.%s' % self.opinion_path
        self.url_path = self.return_url_path()
        self.section_path = self.return_section_path()
        self.year_sub_path = self.return_year_sub_path()

    @staticmethod
    def _first_matching_path(node, paths, error_message):
        """Return the first XPath in *paths* that matches under *node*.

        :raises InsanityException: with *error_message* if none match.
        """
        for path in paths:
            if node.xpath(path):
                return path
        raise InsanityException(error_message)

    def return_section_path(self):
        """Return the XPath selecting the per-year sections of the page.

        :raises InsanityException: if no known layout matches.
        """
        paths = [
            '//div[contains(@class, "panel-default")]',
            '//td[contains(p/@class, "center")]',
            '//td[contains(p/@align, "center")]',
            '//td[contains(h2/@class, "center")]',
            '//div[contains(h3/@class, "center")]',
            '//div[contains(h3/@align, "center")]',
        ]
        return self._first_matching_path(
            self.html, paths, 'No recognized path to opinion sections')

    def return_year_sub_path(self):
        """Return the XPath, relative to a section, of its year element.

        Only the first section is probed; self.section_path must be set.

        :raises InsanityException: if no known layout matches.
        """
        parent = self.html.xpath(self.section_path)[0]
        paths = [
            './div[contains(@class, "panel-heading")]/label',
            './p[contains(@class, "center")]/strong',
            './p[contains(@align, "center")]/font/b',
            './h2[contains(@class, "center")]',
            './h3[contains(@class, "center")]',
            './h3[contains(@align, "center")]',
        ]
        return self._first_matching_path(
            parent, paths, 'No recognized path to year string')

    def return_opinion_path(self):
        """Return the XPath selecting the opinion elements (pdf links).

        :raises InsanityException: if no known layout matches.
        """
        paths = [
            '//select/option[contains(@value, ".pdf")]',
            '//ul/li/a[contains(@href, ".pdf")]',
        ]
        return self._first_matching_path(
            self.html, paths, 'No recognized path to opinion listings')

    def return_url_path(self):
        """Return the XPath of the url attribute for self.opinion_path.

        ``<option>`` elements carry the url in ``@value``, list anchors in
        ``@href``.

        :raises InsanityException: if opinion_path is neither form.
        """
        if '/option' in self.opinion_path:
            return '%s/@value' % self.opinion_path
        elif '/li/a' in self.opinion_path:
            return '%s/@href' % self.opinion_path
        raise InsanityException('No recognized path to url')

0 commit comments

Comments
 (0)