Skip to content

Commit f5e36ae

Browse files
authored
Merge pull request #1360 from freelawproject/update-maine-scraper
fix(maine): Update maine
2 parents 3703d9f + 62261d7 commit f5e36ae

File tree

7 files changed

+339
-74
lines changed

7 files changed

+339
-74
lines changed

CHANGES.md

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Releases are also tagged in git, if that's helpful.
1212

1313
## Coming up
1414

15+
- Fix `me`: update the Maine scraper and add a backscraper
1516

1617
## Current
1718

juriscraper/opinions/united_states/state/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676
# that. It will also be worth considering whether the scraper itself
7777
# should be re-written to scrape CourtListener's API instead of the
7878
# court's website.
79-
# "me",
79+
"me",
8080
"mesuperct",
8181
"mich",
8282
"michctapp",

juriscraper/opinions/united_states/state/me.py

+113-36
Original file line numberDiff line numberDiff line change
@@ -13,48 +13,125 @@
1313
2022-01-06: This scraper is not maintained. Future work to gather this
1414
data should be done by scraping the CourtListener API
1515
https://www.courtlistener.com/api/rest/v3/clusters/?docket__court__id=me
16+
17+
2025-03-31: This scraper has been updated with a backscraper (flooie)
1618
"""
1719

18-
from lxml import html
20+
import re
21+
from datetime import date
22+
23+
from juriscraper.AbstractSite import logger
24+
from juriscraper.lib.judge_parsers import normalize_judge_names
25+
from juriscraper.lib.string_utils import convert_date_string, titlecase
26+
from juriscraper.OpinionSiteLinear import OpinionSiteLinear
1927

20-
from juriscraper.lib.string_utils import convert_date_string
21-
from juriscraper.OpinionSite import OpinionSite
2228

29+
class Site(OpinionSiteLinear):
30+
url_template = (
31+
"https://www.courts.maine.gov/courts/sjc/lawcourt/{}/index.html"
32+
)
33+
first_opinion_year = 2017
2334

24-
class Site(OpinionSite):
2535
def __init__(self, *args, **kwargs):
2636
super().__init__(*args, **kwargs)
2737
self.court_id = self.__module__
2838
self.url = "https://www.courts.maine.gov/courts/sjc/opinions.html"
29-
self.path_root = '//table[contains(.//th[1], "Opinion")]'
30-
31-
def _get_cell_path(self, cell_number: int, subpath: str = "") -> str:
32-
path = '//table[contains(.//th[1], "Opinion")]//td[%d]'
33-
return f"{path}/{subpath}" if subpath else path
34-
35-
def _get_download_urls(self):
36-
path = f"{self.path_root}//td[2]/a[1]/@href"
37-
return list(self.html.xpath(path))
38-
39-
def _get_case_names(self):
40-
case_names = []
41-
path = f"{self.path_root}//td[2]/a[1]"
42-
for e in self.html.xpath(path):
43-
s = html.tostring(e, method="text", encoding="unicode")
44-
case_names.append(s)
45-
return case_names
46-
47-
def _get_case_dates(self):
48-
dates = []
49-
path = f"{self.path_root}//td[3]"
50-
for cell in self.html.xpath(path):
51-
date_string = cell.text_content().replace("Aguust", "August")
52-
dates.append(convert_date_string(date_string))
53-
return dates
54-
55-
def _get_precedential_statuses(self):
56-
return ["Published"] * len(self.case_names)
57-
58-
def _get_citations(self):
59-
path = f"{self.path_root}//td[1]//text()"
60-
return list(self.html.xpath(path))
39+
self.path_root = '//table[contains(.//th[1], "Opinion")]//tr[td]'
40+
self.status = "Published"
41+
self.make_backscrape_iterable(kwargs)
42+
43+
def _process_html(self):
    """Turn each row of the opinions table into a case dictionary.

    Every row selected by ``self.path_root`` is unpacked into exactly three
    cells (citation, case name, date); one dictionary per row is appended
    to ``self.cases``.
    """
    for table_row in self.html.xpath(self.path_root):
        citation_cell, name_cell, date_cell = table_row.xpath("./td")

        # Correct the single known misspelling of "August" on the page.
        filed_on = date_cell.text_content().replace("Aguust", "August")

        title = titlecase(name_cell.text_content())
        # For revised opinions keep only the text before the "Revised" marker.
        before_marker, marker, _ = title.partition("Revised")
        if marker:
            title = before_marker.strip()

        citation = citation_cell.text_content()
        href = name_cell.xpath(".//a")[0].attrib["href"]
        record = {
            "citation": citation,
            "date": filed_on,
            "name": title,
            "url": href,
            "docket": "",
        }
        self.cases.append(record)
63+
64+
def extract_from_text(self, scraped_text: str) -> dict:
    """Extract docket, panel, argument date and author data from an opinion.

    Header labels ("Docket:", "Panel:", "Argued:", ...) are searched for in
    the first 500 characters only. The author is taken from the line that
    directly precedes the first numbered paragraph marker ("[¶1]").

    :param scraped_text: The text content of the opinion
    :return: A dictionary with "Opinion", "OpinionCluster" and "Docket"
        keys holding the extracted values; fields that were not found are
        empty strings (``per_curiam`` is False)
    """
    header_pattern = re.compile(
        r"(?P<label>Docket|On Briefs|Decided|Argued|Panel|Reporter of Decisions):\s*(?P<value>[^\n]+)"
    )
    # If a label repeats, the last occurrence wins.
    extracted = {
        match.group("label"): match.group("value").strip()
        for match in header_pattern.finditer(scraped_text[:500])
    }

    author_str = ""
    per_curiam = False
    # No ^/$ anchors are used, so re.MULTILINE is not needed here.
    author_match = re.search(
        r"(?P<author_str>.*)\n+\s*\[¶1\]", scraped_text
    )
    if author_match:
        # Strip first: extracted text may carry trailing spaces or "\r",
        # which would otherwise defeat the per curiam comparison below.
        candidate = author_match.group("author_str").strip()
        if candidate.upper() == "PER CURIAM":
            per_curiam = True
        else:
            author_str = candidate

    # "On Briefs" takes precedence over "Argued" when both are present.
    argued_raw = extracted.get("On Briefs", "") or extracted.get(
        "Argued", ""
    )
    date_argued_str = ""
    if argued_raw:
        date_argued_str = convert_date_string(argued_raw).strftime(
            "%Y-%m-%d"
        )

    return {
        "Opinion": {
            "author_str": normalize_judge_names(author_str),
            "per_curiam": per_curiam,
        },
        "OpinionCluster": {
            "judges": extracted.get("Panel", ""),
        },
        "Docket": {
            "date_argued": date_argued_str,
            "docket_number": extracted.get("Docket", ""),
        },
    }
116+
117+
def _download_backwards(self, year: int) -> None:
    """Download and parse the opinions index page for one year.

    :param year: The year substituted into ``self.url_template``
    :return: None; ``self.url`` and ``self.html`` are replaced and
        ``_process_html`` is run on the downloaded page
    """
    year_url = self.url_template.format(year)
    self.url = year_url
    logger.info("Backscraping for year %s %s", year, year_url)
    self.html = self._download()
    self._process_html()
122+
123+
def make_backscrape_iterable(self, kwargs: dict):
    """Build the range of years that the backscraper will visit.

    The end year is inclusive. Previously it was included only when it
    equalled the start year and excluded otherwise, so a default run never
    reached the most recent complete year.

    :param kwargs: The keyword arguments given to the constructor; may hold
        "backscrape_start" and "backscrape_end" years (int or numeric
        string). Missing or falsy values fall back to
        ``self.first_opinion_year`` and last calendar year respectively.
    :return: None; the result is stored in ``self.back_scrape_iterable``
    """
    start = int(kwargs.get("backscrape_start") or self.first_opinion_year)
    end = int(kwargs.get("backscrape_end") or date.today().year - 1)

    # +1 because range() excludes its stop value.
    self.back_scrape_iterable = range(start, end + 1)

tests/examples/opinions/united_states/me_example.compare.json

+16-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"precedential_statuses": "Published",
77
"blocked_statuses": false,
88
"date_filed_is_approximate": false,
9+
"docket_numbers": "",
910
"citations": "2014 ME 74",
1011
"case_name_shorts": ""
1112
},
@@ -16,6 +17,7 @@
1617
"precedential_statuses": "Published",
1718
"blocked_statuses": false,
1819
"date_filed_is_approximate": false,
20+
"docket_numbers": "",
1921
"citations": "2014 ME 75",
2022
"case_name_shorts": ""
2123
},
@@ -26,6 +28,7 @@
2628
"precedential_statuses": "Published",
2729
"blocked_statuses": false,
2830
"date_filed_is_approximate": false,
31+
"docket_numbers": "",
2932
"citations": "2014 ME 64",
3033
"case_name_shorts": ""
3134
},
@@ -36,6 +39,7 @@
3639
"precedential_statuses": "Published",
3740
"blocked_statuses": false,
3841
"date_filed_is_approximate": false,
42+
"docket_numbers": "",
3943
"citations": "2014 ME 63",
4044
"case_name_shorts": ""
4145
},
@@ -46,6 +50,7 @@
4650
"precedential_statuses": "Published",
4751
"blocked_statuses": false,
4852
"date_filed_is_approximate": false,
53+
"docket_numbers": "",
4954
"citations": "2014 ME 62",
5055
"case_name_shorts": ""
5156
},
@@ -56,6 +61,7 @@
5661
"precedential_statuses": "Published",
5762
"blocked_statuses": false,
5863
"date_filed_is_approximate": false,
64+
"docket_numbers": "",
5965
"citations": "2014 ME 52",
6066
"case_name_shorts": ""
6167
},
@@ -66,6 +72,7 @@
6672
"precedential_statuses": "Published",
6773
"blocked_statuses": false,
6874
"date_filed_is_approximate": false,
75+
"docket_numbers": "",
6976
"citations": "2014 ME 51",
7077
"case_name_shorts": ""
7178
},
@@ -76,6 +83,7 @@
7683
"precedential_statuses": "Published",
7784
"blocked_statuses": false,
7885
"date_filed_is_approximate": false,
86+
"docket_numbers": "",
7987
"citations": "2014 ME 35",
8088
"case_name_shorts": ""
8189
},
@@ -86,6 +94,7 @@
8694
"precedential_statuses": "Published",
8795
"blocked_statuses": false,
8896
"date_filed_is_approximate": false,
97+
"docket_numbers": "",
8998
"citations": "2014 ME 34",
9099
"case_name_shorts": ""
91100
},
@@ -96,6 +105,7 @@
96105
"precedential_statuses": "Published",
97106
"blocked_statuses": false,
98107
"date_filed_is_approximate": false,
108+
"docket_numbers": "",
99109
"citations": "2014 ME 36",
100110
"case_name_shorts": "Adoption of T.D."
101111
},
@@ -106,6 +116,7 @@
106116
"precedential_statuses": "Published",
107117
"blocked_statuses": false,
108118
"date_filed_is_approximate": false,
119+
"docket_numbers": "",
109120
"citations": "2014 ME 12",
110121
"case_name_shorts": ""
111122
},
@@ -116,6 +127,7 @@
116127
"precedential_statuses": "Published",
117128
"blocked_statuses": false,
118129
"date_filed_is_approximate": false,
130+
"docket_numbers": "",
119131
"citations": "2014 ME 13",
120132
"case_name_shorts": ""
121133
},
@@ -126,16 +138,18 @@
126138
"precedential_statuses": "Published",
127139
"blocked_statuses": false,
128140
"date_filed_is_approximate": false,
141+
"docket_numbers": "",
129142
"citations": "2014 ME 3",
130143
"case_name_shorts": ""
131144
},
132145
{
133146
"case_dates": "2014-01-09",
134-
"case_names": "In re Steven L.",
147+
"case_names": "In Re Steven L.",
135148
"download_urls": "tests/examples/opinions/united_states/lawcourt/2014/14me1sl.pdf",
136149
"precedential_statuses": "Published",
137150
"blocked_statuses": false,
138151
"date_filed_is_approximate": false,
152+
"docket_numbers": "",
139153
"citations": "2014 ME 1",
140154
"case_name_shorts": ""
141155
},
@@ -146,6 +160,7 @@
146160
"precedential_statuses": "Published",
147161
"blocked_statuses": false,
148162
"date_filed_is_approximate": false,
163+
"docket_numbers": "",
149164
"citations": "2014 ME 2",
150165
"case_name_shorts": ""
151166
}

0 commit comments

Comments
 (0)